-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre_proc_papers_metadata.py
121 lines (88 loc) · 2.86 KB
/
pre_proc_papers_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
'''
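This script pre-processes papers metadata exported from Zotero (RIS file).
It normalizes existing fields (default values, lowercase keys) and adds
derived ones: `uid`, `norm-title`, `norm-authors` and `pdf-path`.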
'''
import os
import RISparser as rp
from collections import defaultdict
import random
import hashlib
import util
import config as cfg
def load_ris(path):
    '''
    Read the RIS file at `path` and return its entries as a list.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so non-ASCII characters are read the same way on every
    system.
    NOTE(review): assumes the export was written as UTF-8 -- confirm if a
    different charset was selected when exporting.
    '''
    with open(path, encoding='utf-8') as f:
        # Materialize the entries while the file is still open, in case the
        # parser reads from the handle lazily.
        return list(rp.readris(f))
def _set_default_values(meta, keys, def_val=''):
for k in keys:
meta[k] = meta.get(k, def_val)
return meta
def set_default_values(metas):
    '''
    Give every metadata dict in `metas` the same set of keys.

    The union of the keys of all entries is collected first; each entry
    then receives the default value for the keys it lacks. Entries are
    modified in place and the same `metas` object is returned.
    '''
    # presumably util.flatten chains the per-entry key views into a single
    # iterable of keys -- verify against util
    all_keys = set(util.flatten(entry.keys() for entry in metas))
    for entry in metas:
        _set_default_values(entry, all_keys)
    return metas
def set_lowercase_keys(meta):
    '''
    Rename, in place, every key of `meta` to its lowercase form.

    Keys that are already equal to their lowercase form are left untouched.
    If lowering a key makes it equal to another existing key, the value of
    the renamed key overwrites the existing one. Returns `meta`.
    '''
    # Iterate over a snapshot of the keys: the dict is mutated in the loop.
    for key in list(meta):
        lowered = key.lower()
        # Compare with the lowered form rather than using str.islower():
        # islower() is False for keys without any cased character (e.g.
        # '123' or ''), which would re-assign the key to itself and then
        # delete it, silently dropping the entry.
        if lowered != key:
            meta[lowered] = meta.pop(key)
    return meta
def set_uid(meta):
    '''
    Store under `meta['uid']` a SHA-1 hex digest identifying the entry.

    The digest is computed over the string representation of the entry's
    (key, value) pairs sorted by key, so it does not depend on the order in
    which the keys were inserted. `meta` is modified in place and returned.
    '''
    canonical = str(sorted(meta.items()))
    digest = hashlib.sha1(canonical.encode('utf-8')).hexdigest()
    meta['uid'] = digest
    return meta
def set_local_pdf_path_field(meta):
    '''
    Set `meta['pdf-path']` to the local path of the entry's PDF attachment.

    The keys starting with 'file_attachments' are visited in sorted order;
    the first one whose value ends with '.pdf' (case-insensitive) is joined
    to the configured 'exported-metadata-dir' directory. When no attachment
    is a PDF the field is set to the empty string. Returns `meta`.
    '''
    attachment_keys = sorted(
        key for key in meta if key.startswith('file_attachments'))
    pdf_name = next(
        (meta[key] for key in attachment_keys
         if meta[key].lower().endswith('.pdf')),
        None)
    if pdf_name is None:
        meta['pdf-path'] = ''
    else:
        meta['pdf-path'] = os.path.join(
            cfg.paths['exported-metadata-dir'], pdf_name)
    return meta
def normalize_title(meta):
    '''
    Add `meta['norm-title']`, the result of `util.normalize_title` applied
    to `meta['title']`. `meta` is modified in place and returned.
    '''
    raw_title = meta['title']
    meta['norm-title'] = util.normalize_title(raw_title)
    return meta
def normalize_authors(meta):
    '''
    Add `meta['norm-authors']`: the sorted, duplicate-free list obtained by
    applying `util.normalize_author` to each item of `meta['authors']`.
    `meta` is modified in place and returned.
    '''
    unique_authors = set(map(util.normalize_author, meta['authors']))
    meta['norm-authors'] = sorted(unique_authors)
    return meta
def pre_proc_paper_meta(meta):
    '''
    Run all the single-entry pre-processing steps on `meta` and return it.

    The uid is computed first, so it only depends on the fields present
    when this function is called, not on the derived fields added by the
    later steps.
    '''
    steps = (
        set_uid,
        set_lowercase_keys,
        normalize_title,
        normalize_authors,
        set_local_pdf_path_field,
    )
    for step in steps:
        meta = step(meta)
    return meta
def mk_unique_norm_titles(metas):
    '''
    Make the 'norm-title' field unique across all entries of `metas`.

    The first entry with a given title keeps it; later entries with a title
    that is already in use are renamed to '<title>-<n>', with n starting at
    the occurrence count of that title (2 for the first duplicate) and a
    warning is printed. Entries are modified in place and `metas` is
    returned.
    '''
    occurrences = defaultdict(int)
    # Every title assigned so far, including renamed ones: a generated name
    # such as 'a-2' may clash with an entry whose own title is 'a-2'.
    taken = set()
    for meta in metas:
        base = meta['norm-title']
        occurrences[base] += 1
        title = base
        if base in taken:
            suffix = max(occurrences[base], 2)
            title = '{}-{}'.format(base, suffix)
            # Skip suffixes whose resulting title is already in use.
            while title in taken:
                suffix += 1
                title = '{}-{}'.format(base, suffix)
            # Keep the counter in sync so later duplicates continue from here.
            occurrences[base] = suffix
            print('WARNING: "{}" already exists, renaming to "{}"'.format(
                base, title))
            meta['norm-title'] = title
        taken.add(title)
    return metas
def _pre_proc_paper_metas(metas):
    '''
    Pre-process a list of raw metadata entries.

    Fills in missing keys, runs the single-entry steps on each entry and
    makes the normalized titles unique. Fails with an AssertionError if two
    entries end up with the same uid.
    '''
    metas = set_default_values(metas)
    processed = list(map(pre_proc_paper_meta, metas))
    metas = mk_unique_norm_titles(processed)
    uids = {entry['uid'] for entry in metas}
    assert len(metas) == len(uids)
    return metas
def pre_proc_papers_metas():
    '''
    Load the raw RIS metadata, pre-process it and save the result as JSON.

    The input path is `cfg.paths['raw-papers-metadata']` and the output
    path is `cfg.paths['papers-metadata']`.
    '''
    metas = _pre_proc_paper_metas(load_ris(cfg.paths['raw-papers-metadata']))
    out_path = cfg.paths['papers-metadata']
    util.save_json(out_path, metas)
    message = 'saved updated papers metadata to "{}"'.format(out_path)
    print(message)
def main():
    '''Script entry point: run the whole metadata pre-processing.'''
    return pre_proc_papers_metas()
# Run the pre-processing only when executed as a script, not on import.
if __name__ == '__main__':
    main()