main.py
# Jake Macdonald
# 9/23/2022
# Produces JSON files describing every article authored by a chosen person.
# Retrieves each article's title, authors, URL, and abstract.
# utilities.util is assumed to provide GoogleSearch (the SerpAPI client),
# divide, ArtItem, getContent, authorFormat, and addToMaster.
from utilities.util import *
import json
import os
import random
import time
def main(start, numItems):
    params = {
        # Hard-coding an API key is bad practice, but the service is free.
        "api_key": "c801fb0ffe9a68445624b9e9c7bd2d0a84bbc0bd9db4506cf91f267b8b3f44f3",
        "engine": "google_scholar_author",
        "author_id": "G1CnZ38AAAAJ",  # Nathan's ID
        "hl": "en",
        "sort": "pubdate",
        "num": numItems,
        "start": str(start),
    }
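    # Assumption: GoogleSearch behaves like SerpAPI's Python client, where
    # "start" is the result offset and "num" the page size, e.g. start=20,
    # num=10 fetches the author's articles 21-30.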
    search = GoogleSearch(params)
    results = dict(search.get_dict())
    first = str(results.get('articles'))
    strArr = divide(first)
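    # divide() is assumed to split the stringified article list into one
    # chunk per article, with leftover junk in the first element (hence the
    # skip at index 0 below).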
    for i in range(len(strArr)):
        if i == 0:
            continue  # skip the leading junk chunk from divide()
        item = ArtItem()  # instantiate; assigning the bare class would share state across iterations
        item.setTit(strArr[i])
        item.setLink(strArr[i])
        item.setAuth(strArr[i])
        # Build a filesystem-safe name from the title (or the authors if the
        # title is empty): truncate to 15 characters, strip awkward
        # characters, and append the original length to reduce collisions.
        fName = item.title
        sizeOfName = len(fName)
        if not sizeOfName:
            fName = item.authors
            sizeOfName = len(fName)
        if sizeOfName > 15:
            fName = fName[:15]
        fName = fName.replace('"', "")
        fName = fName.replace('/', "")
        fName = fName.replace(' ', "")
        fName += str(sizeOfName)
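        # e.g. "Deep Learning for X" (19 chars) -> "DeepLearningf19"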
        # Skip articles that are already cached on disk.
        if os.path.exists('datCache/' + fName + '.json'):
            continue
        # Randomized delay, a uniform 10-20 seconds per request, to make the
        # crawl look less bot-like.
        baseTime = 10.0
        time.sleep(baseTime + (baseTime * random.random()))
        (item.abstract, item.authors, item.date) = getContent(str(item.link), str(item.authors))
        if item.abstract == 'found':  # sentinel from getContent: abort the run
            return 404
        if item.date == "ignore":  # sentinel from getContent: skip this article
            continue
        item.authors = authorFormat(item.authors)
        dictPort = {
            'title': item.title,
            'author': item.authors,
            'URL': item.link,
            'abstract': item.abstract,
            'issued': item.date
        }
        with open('datCache/' + fName + '.json', 'w') as json_file:
            json.dump(dictPort, json_file)
    return 1
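# Each run caches one JSON file per article in datCache/. Field names match
# dictPort above; the values here are purely illustrative:
# {
#     "title": "Example Article",
#     "author": "J. Doe and N. Smith",
#     "URL": "https://example.org/article",
#     "abstract": "...",
#     "issued": "2022"
# }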
# SerpAPI only allows 100 articles to be retrieved per search, so results
# are paged through in batches.
numItems = 10
numArt = 130
iterate = 0
recompile = True
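# 130 articles at 10 per request -> 13 requests, with offsets 0, 10, ..., 120.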
if __name__ == "__main__":
    try:
        while numArt > 0:
            if main(iterate * numItems, numItems) is not None:
                recompile = True
            numArt -= numItems
            iterate += 1
    finally:
        # Merge every cached JSON file into a single publications.js.
        if recompile:
            path = "datCache"
            dir_list = os.listdir(path)
            with open("output/publications.js", "w") as new:
                new.write("publications = [\n")
                for i in dir_list:
                    addToMaster(path + '/' + i, new)
                with open("output/oldPub.txt", "r") as back:
                    new.write(back.read())
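                    # oldPub.txt is assumed to hold previously compiled
                    # entries, including the closing "]" of the publications
                    # array, since it is never written here.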