-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_raw_refs.py
76 lines (56 loc) · 1.61 KB
/
parse_raw_refs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
'''
Parses raw reference strings into structured, normalized citation data
using a citation-parsing web API, and saves the result as JSON.
'''
import os
import requests
import util
import config as cfg
# Script config.
# Worker count handed to util.parallelize in parse_raw_refs: four times the
# project-wide cfg.n_threads (presumably because the work is network-bound
# rather than CPU-bound -- TODO confirm).
N_THREADS = cfg.n_threads*4
# Endpoint that parse_raw_ref POSTs each raw citation string to.
PARSING_API_URL = 'http://freecite.library.brown.edu/citations/create'
def normalize_fields(data):
    '''
    Fills in missing fields of a parsed citation and adds normalized copies.

    The dict is modified in place and also returned:
      - 'authors' defaults to [] and 'title' to '' when absent or None;
      - 'norm-authors' is the sorted list of distinct normalized authors;
      - 'norm-title' is the normalized title.
    '''
    raw_authors = data.get('authors')
    if raw_authors is None:
        raw_authors = data['authors'] = []
    # Going through a set drops authors that normalize to the same value.
    data['norm-authors'] = sorted(set(map(util.normalize_author, raw_authors)))

    raw_title = data.get('title')
    if raw_title is None:
        raw_title = data['title'] = ''
    data['norm-title'] = util.normalize_title(raw_title)

    return data
def parse_raw_ref(ref, timeout=60):
    '''
    Parses one raw citation string through the remote parsing API.

    Sends `ref` as the 'citation' form field of a POST to PARSING_API_URL,
    takes the first element of the JSON response and runs it through
    normalize_fields.

    Args:
        ref: raw citation string.
        timeout: seconds to wait for the server before giving up (handed to
            requests.post). Without it a stalled server would block the
            calling worker forever.

    Returns:
        The normalized citation dict, or an empty dict when the request
        fails or the response cannot be decoded/normalized. Failures are
        reported on stdout; return_code is None when no response arrived.
    '''
    status_code = None
    try:
        resp = requests.post(
            PARSING_API_URL,
            headers={'Accept': 'application/json'},
            data={'citation': ref},
            timeout=timeout,
        )
        status_code = resp.status_code
        return normalize_fields(resp.json()[0])
    # Best effort: one bad reference (network error, non-JSON body, empty
    # result list, ...) must not abort the whole run. Unlike a bare
    # `except:`, this still lets KeyboardInterrupt/SystemExit propagate.
    except Exception:
        print('ERROR with ref "{}", return_code = {}'.format(
            ref, status_code))
        return {}
def _parse_raw_refs(refs):
    '''
    Parses every raw citation in `refs` and returns the non-empty results.

    Each outcome is echoed to stdout; citations for which parse_raw_ref
    returns an empty (falsy) result are left out of the returned list.
    '''
    parsed_refs = []
    for raw in refs:
        parsed = parse_raw_ref(raw)
        print('parsed "{}" to "{}"'.format(raw, parsed))
        if not parsed:
            continue
        parsed_refs.append(parsed)
    return parsed_refs
def parse_raw_refs():
    '''
    Loads the raw references JSON, parses its values and saves the result.

    Input is read from cfg.paths['raw-papers-refs'] and output is written to
    cfg.paths['papers-refs'], keeping the keys of the input mapping.
    '''
    raw_refs = util.load_json(cfg.paths['raw-papers-refs'])

    # NOTE(review): pairing keys with results below assumes util.parallelize
    # yields one result per input value, in input order -- confirm in util.
    parsed_values = util.parallelize(
        _parse_raw_refs, list(raw_refs.values()), N_THREADS)
    parsed_refs = dict(zip(raw_refs.keys(), parsed_values))

    out_path = cfg.paths['papers-refs']
    util.save_json(out_path, parsed_refs)
    print('saved papers refs to "{}"'.format(out_path))
def main():
    '''
    Script entry point: runs the full parse-and-save pipeline.
    '''
    parse_raw_refs()
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()