forked from markjosims/aitabot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrapeComments.py
52 lines (47 loc) · 1.29 KB
/
ScrapeComments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Scrapes Reddit comments through the pushshift.io comment-search API.
import requests
import json
from time import time
def time_exec(f):
    """Decorator that prints how long each call to *f* takes.

    After the wrapped function returns, ``str(f)`` and the elapsed time in
    seconds (difference of two ``time()`` readings) are printed. The wrapped
    function's return value is passed back to the caller.

    Args:
        f: The callable to time.

    Returns:
        A wrapper that accepts the same arguments as *f* and returns
        whatever *f* returns.
    """
    # Local import so the decorator does not rely on a file-level import.
    from functools import wraps

    @wraps(f)  # keep f's __name__/__doc__ on the wrapper
    def g(*args, **kwargs):
        start = time()
        result = f(*args, **kwargs)
        end = time()
        print(str(f), end - start)
        # Propagate the result; previously it was silently discarded.
        return result

    return g
@time_exec
def main():
    """Collect comments page by page and write them to ``comments.json``.

    Pages are requested through ``fetch_comments``, starting from the
    ``after`` cursor ``"1356998400"`` (presumably a Unix epoch timestamp),
    until at least 20000 comments are held or 50 consecutive requests
    returned no data. The collected list is then dumped as indented JSON.
    """
    data = []
    after = "1356998400"
    no_data_ct = 0
    while len(data) < 20000 and no_data_ct < 50:
        print('Fetching comments...currently have', len(data))
        new_data, next_after = fetch_comments(after)
        if not new_data:
            # Keep the previous cursor: an empty page carries no cutoff, and
            # overwriting `after` with None would make the next request drop
            # the `after` filter instead of retrying from where we stopped.
            no_data_ct += 1
            continue
        no_data_ct = 0
        after = next_after
        data.extend(new_data)
    print(len(data))
    with open('comments.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
def fetch_comments(after):
    """Fetch one page of comments from the Pushshift comment-search endpoint.

    The request is restricted to subreddit ``amitheasshole`` and author
    ``Judgement_Bot_AITA``, asks for up to 1000 items, and only the
    ``body``, ``created_utc``, ``id`` and ``link_id`` fields.

    Args:
        after: Value sent as the ``after`` query parameter (the pagination
            cursor).

    Returns:
        A ``(data, cutoff)`` tuple. ``data`` is the list found under the
        response's ``data`` key and ``cutoff`` is the ``created_utc`` of its
        last item, to be used as the next ``after``. ``(None, None)`` is
        returned when the page is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    params = {
        'subreddit': 'amitheasshole',
        'fields': ['body', 'created_utc', 'id', 'link_id'],
        'size': 1000,
        'after': after,
        'author': "Judgement_Bot_AITA",
        # Ask for oldest-first explicitly so that the last item really is
        # the newest one and is a valid cursor for the next page.
        'sort': 'asc',
    }
    response = requests.get(
        "https://api.pushshift.io/reddit/comment/search",
        params=params,
        timeout=30,  # without a timeout a stalled connection hangs forever
    )
    # Surface HTTP errors (e.g. rate limiting) instead of failing later on
    # an unparsable or unexpected body.
    response.raise_for_status()
    data = response.json()['data']
    if not data:
        print("no more data")
        return None, None
    cutoff = data[-1]['created_utc']
    return data, cutoff
# Script entry point: start the scrape only when executed directly.
if __name__ == "__main__":
    main()