-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_parser.py
77 lines (58 loc) · 2.1 KB
/
data_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from lxml import etree
# Schema for the search index: one document per bibliographic entry.
schema = Schema(
    title=TEXT(stored=True),
    authors=TEXT(stored=True),
    year=NUMERIC(stored=True),
    url=ID(unique=True, stored=True),
    type=TEXT(stored=True),
)

# Whoosh index directory. makedirs(exist_ok=True) avoids the check-then-create
# race of an exists()/mkdir() pair, where the directory could appear between
# the check and the creation and make mkdir() raise FileExistsError.
index_dir = "academic_papers_index"
os.makedirs(index_dir, exist_ok=True)

# NOTE: create_in() builds a new index in index_dir every time this module is
# imported or run; per the Whoosh documentation this clears any index already
# stored there -- confirm before relying on the index persisting across runs.
ix = create_in(index_dir, schema)
# function to index academic papers
def index_paper(title, authors, year, url, paper_type):
    """Add one paper to the module-level index ``ix`` and commit it.

    Args:
        title: Paper title, written to the ``title`` field.
        authors: Author names as a single string, written to ``authors``.
        year: Publication year, written to the numeric ``year`` field.
        url: Link to the paper, written to the ``url`` field.
        paper_type: Kind of entry, written to the ``type`` field.
    """
    # Using the writer as a context manager commits on a clean exit and
    # cancels on an exception, so a failing add_document() does not leave
    # the index's write lock held for the rest of the process.
    with ix.writer() as writer:
        writer.add_document(
            title=title,
            authors=authors,
            year=year,
            url=url,
            type=paper_type,
        )
# function to parse and index an entry
def parse_and_index_entry(entry):
    """Extract the bibliographic fields of one XML entry and index them.

    Args:
        entry: XML element for a single record (e.g. ``article``); its tag
            name is indexed as the paper type.

    Missing data is tolerated rather than raising: an absent title or an
    absent/non-integer year is passed on as ``None``, and an absent or empty
    URL becomes the placeholder "URL not available".
    """
    # A title may contain inline markup (<i>, <sub>, ...). itertext() gathers
    # the complete text, whereas findtext() only returns the text that
    # precedes the first child element.
    title_element = entry.find('title')
    if title_element is None:
        title = None
    else:
        title = "".join(title_element.itertext())

    # itertext() also yields an empty string (not None) for an empty
    # <author/>, so the join below cannot fail on a None item.
    authors = ["".join(author.itertext()) for author in entry.xpath('author')]

    try:
        year = int(entry.findtext('year'))
    except (TypeError, ValueError):
        # No <year> child (findtext returned None) or non-numeric text.
        # NOTE(review): None is handed to index_paper(); Whoosh is expected
        # to skip fields whose value is None -- confirm.
        year = None

    # Prefer the open-access electronic edition, then the generic <url>,
    # then a placeholder. `or` also skips elements that exist but are empty.
    url = (
        entry.findtext('ee[@type="oa"]')
        or entry.findtext('url')
        or "URL not available"
    )

    paper_type = entry.tag
    index_paper(title, ", ".join(authors), year, url, paper_type)
# function to fetch and index academic papers from a source
def fetch_and_index_papers(xml_file):
    """Parse an XML bibliography file and index every record found in it.

    The whole document is parsed into memory first, then each matching
    record element is handed to ``parse_and_index_entry``.

    Args:
        xml_file: Source passed to ``etree.parse`` (e.g. the path
            'dblp.xml').
    """
    # Validate against the DTD while parsing. No DTD path is passed to the
    # parser, so the DTD presumably comes from the document's own DOCTYPE
    # declaration (e.g. a dblp.dtd next to the XML file) -- confirm.
    # NOTE(review): no_network=False allows the parser to fetch external
    # resources over the network; consider the default (network disabled)
    # if the DTD is always available locally.
    parser = etree.XMLParser(dtd_validation=True, no_network=False)
    tree = etree.parse(xml_file, parser=parser)

    # Select all record elements (articles, books, inproceedings, etc.)
    # anywhere in the document.
    entries = tree.xpath(
        '//article | //book | //inproceedings | //www | //phdthesis'
        ' | //proceedings | //incollection | //mastersthesis'
    )

    for entry in entries:
        parse_and_index_entry(entry)
if __name__ == "__main__":
    # Script entry point: fetch and index the papers from dblp.xml on startup.
    fetch_and_index_papers("dblp.xml")