-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_interface.py
222 lines (167 loc) · 7.56 KB
/
twitter_interface.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
''' Class to handle of all of the twitter and twitter cache interfacing '''
from collections import defaultdict
import json
import time
import pandas as pd
import tweepy as tw
from config import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
class TwitterWrapper():
    '''Wrapper around the Twitter API (via tweepy) plus a local JSON cache.

    Tweets for a user are cached at data/<name>.json; callers normally go
    through load_tweets() / get_tweet_text(), which merge the cache with any
    newer tweets fetched from the user's timeline.
    '''

    def __init__(self, name):
        # `name` is accepted for backward compatibility; the user id and the
        # cache file name are actually established by the set_* methods below.
        self.tweets = []

    def _make_api(self):
        '''Build an authenticated, rate-limit-aware tweepy API handle.'''
        auth = tw.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return tw.API(auth, wait_on_rate_limit=True)

    def set_user_id(self, user_id):
        '''Use a numeric Twitter user id directly (no screen-name lookup).

        BUG FIX: the previous version ignored `user_id` and set name to "".
        Note the cache file name is still set separately by the caller.
        '''
        self.name = user_id

    def set_screen_name(self, input_screen_name):
        '''Resolve a screen name to its user id and point at its cache file.'''
        user = self._make_api().get_user(screen_name=input_screen_name)
        self.name = user.id
        self.file_name = "data/" + str(input_screen_name) + '.json'
        self.tweets = []

    def set_test_name(self, input_screen_name):
        '''Point at a local test fixture (data/<name>_1000.json); no API call.'''
        self.name = input_screen_name
        self.file_name = "data/" + str(input_screen_name) + '_1000.json'
        self.tweets = []

    def load_tweets(self, cache_only=True):
        '''Return the user's tweets, optionally refreshing from Twitter.

        When cache_only is False, fetch only tweets newer than the newest
        cached one, merge them with the cache, and re-save the merged list.
        '''
        old_tweets = self.load_tweets_from_file()
        if not cache_only:
            since = self.get_recent_tweet_id(old_tweets)
            new_tweets = self.load_timeline_tweets(since)
            self.tweets = self.join_tweets(old_tweets, new_tweets)
            self.save_tweets(self.tweets)
        return self.tweets

    def sanatize(self, tweets):
        '''Strip tweepy Status objects down to their raw JSON dicts.

        (Method name typo kept as-is for backward compatibility.)
        '''
        return [t._json for t in tweets]

    def load_timeline_tweets(self, since):
        '''Fetch the user timeline (Twitter caps it at ~3200 tweets), paging
        backwards 200 at a time, stopping at tweets with id <= `since`.
        '''
        api = self._make_api()
        alltweets = []
        # Initial request for the most recent tweets (200 is the API maximum).
        new_tweets = api.user_timeline(user_id=self.name, count=200,
                                       since_id=since, tweet_mode="extended")
        alltweets.extend(self.sanatize(new_tweets))
        # Keep paging until a request comes back empty.  max_id = oldest id
        # seen minus one prevents duplicate pages.  (The empty-fetch guard
        # also fixes an IndexError the old code hit on users with no tweets.)
        while len(new_tweets) > 0:
            oldest = alltweets[-1]["id"] - 1
            print("getting tweets before %s" % (oldest))
            # BUG FIX: the old code passed an unsupported `min_id` parameter,
            # which Twitter silently ignored, so `since` was never honoured;
            # `since_id` is the real lower-bound parameter.
            new_tweets = api.user_timeline(user_id=self.name, count=200,
                                           max_id=oldest, since_id=since,
                                           tweet_mode="extended")
            alltweets.extend(self.sanatize(new_tweets))
            print("...%s tweets downloaded so far" % (len(alltweets)))
        self.tweets = alltweets
        self.save_tweets(self.tweets)
        return self.tweets

    def get_tweet_text(self, cache_only=True):
        '''Return the cached tweets' text column as a pandas Series.'''
        self.load_tweets(cache_only)
        df = pd.read_json(self.file_name, orient='records')
        return df['text']

    def get_tweet_id_text(self, cache_only=True):
        '''Return a pandas DataFrame with 'id' and 'text' columns.'''
        self.load_tweets(cache_only)
        df = pd.read_json(self.file_name, orient='records')
        # Test fixtures (_1000.json) lack an 'id' column; synthesize one.
        if 'id' not in df.columns:
            df['id'] = range(1, len(df) + 1)
        return df[['id', 'text']]

    def load_tweets_from_file(self):
        '''Load the cached tweet list; a missing, unreadable, or corrupt
        cache (or an unset file_name) yields an empty list.'''
        try:
            with open(self.file_name, 'r') as infile:
                self.tweets = json.load(infile)
        except (AttributeError, OSError, ValueError):
            # AttributeError: file_name never set; OSError: file missing or
            # unreadable; ValueError covers json.JSONDecodeError.
            self.tweets = []
        return self.tweets

    def get_current_tweets(self, since_id=1, count=100):
        '''Fetch up to `count` timeline tweets newer than `since_id`,
        converting each to its raw JSON dict immediately.'''
        api = self._make_api()
        self.tweets = [tweet._json for tweet in
                       api.user_timeline(user_id=self.name, count=count,
                                         since_id=since_id)]
        return self.tweets

    def get_recent_tweet_id(self, tweets):
        '''Return the largest tweet id in `tweets`, or 1 when empty/None.'''
        since_id = 1
        for tweet in tweets or []:
            since_id = max(since_id, int(tweet['id_str']))
        return since_id

    def join_tweets(self, tweets1, tweets2):
        '''Union of two tweet lists, de-duplicated by tweet id; tweets1
        entries win.  Returns a NEW list (inputs are no longer mutated).'''
        seen = set()
        out_tweets = []
        for tweet in tweets1 or []:
            seen.add(int(tweet["id_str"]))
            out_tweets.append(tweet)
        for tweet in tweets2 or []:
            # tweet["id"] is the numeric form of id_str, so the set lookup
            # matches entries recorded from tweets1.
            if tweet["id"] not in seen:
                seen.add(int(tweet["id_str"]))
                out_tweets.append(tweet)
        return out_tweets

    def sort_tweets(self, tweets):
        '''Sort tweets newest-first by numeric tweet id.'''
        return sorted(tweets, key=lambda t: int(t["id_str"]), reverse=True)

    def save_tweets(self, tweets):
        '''Write a slimmed-down copy of `tweets` to the user's cache file.

        Only created_at/id/id_str plus a unified 'text' field are kept;
        extended-mode 'full_text' takes priority over classic 'text'.
        Retweets (text beginning with "RT") are excluded from the cache.
        '''
        outjson = []
        for item in tweets:
            outitem = {'text': ""}
            for field in ('created_at', 'id', 'id_str'):
                if field in item:
                    outitem[field] = item[field]
            # tweet_mode="extended" delivers 'full_text'; classic mode 'text'.
            if 'full_text' in item:
                outitem['text'] = item['full_text']
            elif 'text' in item:
                outitem['text'] = item['text']
            if not outitem['text'].startswith("RT"):
                outjson.append(outitem)
        with open(self.file_name, 'w') as outfile:
            json.dump(outjson, outfile)

    def more_timeline(self):
        '''Debug helper: print every timeline tweet reachable via cursor
        pagination.  Now rate-limit aware via _make_api().'''
        api = self._make_api()
        for i, status in enumerate(
                tw.Cursor(api.user_timeline, screen_name=self.name,
                          tweet_mode="extended").items()):
            print("{} {}".format(i, status.full_text))