# Source: visual-analytics/tweeter.py at master · sudeshjethoe/visual-analytics · GitHub
#!/usr/bin/env python3
"""
    Dump tweets in elasticsearch
"""
import argparse
import configparser
import csv
import os
import sys
import time
from datetime import datetime
from time import mktime
import IPython
from elasticsearch import Elasticsearch
from twarc import Twarc


# Module-level wiring: twarc credentials, target index name and the two
# API clients shared by every function below.
config = configparser.ConfigParser()
config.read('/home/svjethoe/.twarc')
index = 'visualanalytics'
es = Elasticsearch()

# Twarc takes the four credentials positionally, in exactly this order;
# all of them are read from the [main] section of the file loaded above.
t = Twarc(*(
    config.get('main', option)
    for option in (
        'consumer_key',
        'consumer_secret',
        'access_token',
        'access_token_secret',
    )
))


def readtokens(f='binance-subset.csv'):
    """
    Build the list of hashtags to search for from a CSV file.

    The second column of every row is taken and prefixed with ``#``.
    Rows that have fewer than two columns (blank lines included) are
    skipped instead of aborting the whole run with an ``IndexError``.

    :param f: path of the CSV file to read
    :returns: list of strings such as ``'#BTC'``, in file order
    :raises OSError: if the file cannot be opened
    """
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are handled by the reader rather than by open().
    with open(f, 'r', newline='') as csvfile:
        return ["#%s" % row[1]
                for row in csv.reader(csvfile)
                if len(row) > 1]


def create_index(index):
    """
    Create the Elasticsearch index *index* unless it already exists.

    The index is created with one shard, no replicas and a ``tweet``
    mapping that declares ``country``, ``text`` and ``url`` as text fields
    and ``timestamp`` as a date.

    :param index: name of the index to create
    """
    if es.indices.exists(index):
        # Nothing to do: leave an existing index untouched.
        return

    tweet_properties = {
        'country': {'type': 'text'},
        'timestamp': {'type': 'date'},
        'text': {'type': 'text'},
        'url': {'type': 'text'},
    }
    body = {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0,
        },
        'mappings': {
            'tweet': {'properties': tweet_properties},
        },
    }
    es.indices.create(index, body)


def post(tweet):
    """
    Index one tweet into Elasticsearch.

    The document is stored in the module-level ``index`` with doc type
    ``tweet`` and the tweet's own ``id`` as document id, so posting the
    same tweet again reuses that id.

    :param tweet: tweet as a dict; must contain ``id``, ``created_at``,
        ``user.location`` and ``entities.hashtags``
    :returns: the response returned by ``es.index``
    :raises KeyError: if one of the required fields is missing
    :raises ValueError: if ``created_at`` is not in the expected
        ``'%a %b %d %H:%M:%S +0000 %Y'`` format
    """
    # The first attached URL is optional: a tweet may have no 'urls' entry
    # or an empty list. Only the lookup failures are caught, so unrelated
    # errors (and Ctrl-C) are no longer swallowed.
    try:
        url = tweet['entities']['urls'][0]['url']
    except (KeyError, IndexError, TypeError):
        url = None

    # Re-format the creation time as an ISO-8601 style string for the
    # 'date' mapped timestamp field.
    created = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    ts = time.strftime('%Y-%m-%dT%H:%M:%S', created)

    # Plain indexing (tweet['full_text'] or ...) raised KeyError whenever
    # 'full_text' was absent, so the fallback was never reached. Look the
    # candidates up with .get, most complete text first.
    extended = tweet.get('extended_tweet') or {}
    text = (tweet.get('full_text')
            or extended.get('full_text')
            or tweet.get('text'))

    doc = {
        'url': url,
        'timestamp': ts,
        'type': 'tweet',
        # The free-form user location is what ends up in 'country'.
        'country': tweet['user']['location'],
        'tags': [x['text'].lower() for x in tweet['entities']['hashtags']],
        'text': text}

    res = es.index(
        index=index,
        doc_type='tweet',
        id=tweet['id'],
        body=doc)

    return res


def main():
    """
    Search Twitter for every hashtag from the token file and index the hits.

    The query is all hashtags joined with ``OR`` followed by a
    ``since:``/``until:`` date restriction; each tweet returned by the
    search is handed to ``post``.
    """
    hashtags = readtokens()
    create_index(index)

    since = "2018-02-13"
    until = "2018-02-14"

    # Default to an open-ended range starting at `since`, or at today's
    # date when no start is given; narrow it when both bounds are set.
    timerange = "since:%s" % (since or time.strftime("%Y-%m-%d"))
    if since and until:
        timerange = "since:%s until:%s" % (since, until)

    query = "%s %s" % (" OR ".join(hashtags), timerange)
    for found in t.search(query):
        post(found)


if __name__ == "__main__":
    main()


# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fdm=indent nocompatible