# Author: Evan Dobbs
# Description: A web scraper that begins on the "new" page of the quantum physics arXiv, goes through the abstracts of
# all new papers submitted, and finds keywords in them. In the future, this will be extended to actually accessing
# the pdf versions of the papers and navigating reference sections to build a knowledge base of relevant papers for
# various topics.
# Notes: While this currently parses only the abs pages, I would like to parse the pdfs of the actual papers as well
# for select papers that seem interesting based on the information extracted from their abstract pages. This is why
# there is a great deal of unused code for reading/parsing pdfs.
# Known issues:
# 1. Some abstracts contain LaTeX (to be expected). Such text retains its raw, uncompiled form, and so is
# unintelligible to the program at the moment. A possible solution is to compile the LaTeX before parsing the text
# (or simply strip it; see the strip_latex sketch after the imports below).
# Logic:
# 1. open chatbot
# 2. either ask for a summary of the day's new papers, the week's new papers, information on a specific topic, or a specific paper.
# 3. if compiling a list of papers related to a specific topic or paper, compile until some stopping criterion is reached.
# This criterion can be either length (unrealistic to read more than 5-10 papers), relevance (if papers stop being relevant),
# or processing time (if things are slow).
# 4. Maybe keep a list of topics of interest for specific users, and when searching papers (either regular or specific)
# always make a note of these papers and their related topics.
from urllib import request
from bs4 import BeautifulSoup
import requests
import re
#import pdfquery
import os
import PyPDF4 as pf
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from os.path import exists
# bs4 is used for navigating arXiv, PyPDF4 for navigating the actual papers in pdf form. Maybe look for an
# alternative to PyPDF4? Try pdfquery.
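# A minimal sketch for known issue 1 above: rather than compiling the LaTeX, the math could simply be stripped
# before tokenizing. strip_latex is illustrative only (it is not wired into the pipeline yet) and the regexes are
# an assumption about what typically appears in arXiv abstracts.
def strip_latex(text):
    # Drop $$...$$ and $...$ math spans, then bare \commands such as \alpha or \cite{...}.
    text = re.sub(r'\$\$.*?\$\$', ' ', text, flags=re.DOTALL)
    text = re.sub(r'\$[^$]*\$', ' ', text)
    text = re.sub(r'\\[A-Za-z]+(\{[^}]*\})?', ' ', text)
    return text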
def main():
    read_search('https://arxiv.org/list/quant-ph/new')
    kb = {}
    with open('urls.txt', 'r') as f:  # change to read/write from just read
        urls = f.read().splitlines()
    for doc in urls:
        # look for external urls in doc and add to urls.txt
        # print("doc url: ")
        # print(doc)  # dummy
        # read_pdf(doc)
        read_html(doc)
        with open(str(doc[27:] + '_clean.txt'), 'r', encoding="utf-8") as f:  # [27:] slices past every '.' in the url, so the filename has no stray periods
            text = f.read()
        kws = get_keywords(text)
        kb = build_kb(text, kws, kb)
    for kw in kb:
        print("KEYWORD " + kw + ": ")
        for sent in kb[kw]:
            print(sent)
    print("done\n\n\n")
    important_terms = ['protection', 'determinism', 'photons', 'geometric', 'algorithm', 'envelope', 'metasurfaces', 'photonic', 'wva', 'memory']
    for key in important_terms:
        print("KEYWORD: " + key)
        for sent in kb.get(key, []):  # .get() guards against hardcoded terms that never appeared as keywords
            print(sent)
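# Sketch for step 4 of the Logic notes above: the hardcoded important_terms list in main() could instead be read
# from a per-user topics file. load_user_topics is illustrative only and is not called anywhere; the 'topics.txt'
# filename is an assumption.
def load_user_topics(path='topics.txt'):
    if not exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip().lower() for line in f if line.strip()]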
# Not used for the web-scraping project.
def read_pdf(url):
    pdfurl = url[27:] + '.pdf'
    print(pdfurl)
    path = os.path.join(os.getcwd(), pdfurl)  # join with a separator so the existence check looks in the right place
    if not exists(path):
        with open(pdfurl, 'wb') as f:
            f.write(requests.get(url).content)
    with open(pdfurl, 'rb') as f:
        readpdf = pf.PdfFileReader(f)
        for page in range(readpdf.numPages):
            curr_page = readpdf.getPage(page)
            text = curr_page.extractText()
            # find external links and useful info
            links = get_links(text)
            print("links: ")
            for link in links:
                print(link)
            # keywords = get_keywords(text)
            # print('keywords: ')
            # for kw in keywords:
            #     print(kw)
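# Sketch tied to the longer-term goal in the Description above (following reference sections): of the links that
# get_links finds in a pdf, keep only those that point back to arXiv so they could be appended to urls.txt.
# arxiv_links_from_text is illustrative only and is not called anywhere.
def arxiv_links_from_text(text):
    links = get_links(text)
    return [link for link in links if 'arxiv.org/abs/' in link or 'arxiv.org/pdf/' in link]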
# Store raw text from pages in one file and cleaned text in another.
def read_html(url):
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'html.parser')  # name the parser explicitly to avoid bs4's parser-guessing warning
    thisurl = url[27:] + '_raw.txt'
    cleanurl = url[27:] + '_clean.txt'
    with open(thisurl, 'w', encoding="utf-8") as f:
        data = soup.find_all(string=True)  # string= replaces the deprecated text= argument
        result = filter(visible, data)
        temp_list = list(result)  # list from filter
        temp_str = ' '.join(temp_list)
        f.write(temp_str)  # write raw text
    # I know exactly what each page will look like, so I know what I want. No point doing this in another function
    # and parsing the raw text a second time.
    with open(cleanurl, 'w', encoding="utf-8") as f:
        in_abstract = False  # renamed from abs, which shadows the builtin
        for item in temp_list:
            if str(item).startswith('Abstract:'):
                in_abstract = True
            if str(item).startswith('\n'):
                in_abstract = False
            if in_abstract:
                # print(item)
                f.write(item)
# Finds keywords by term-frequency. Because of the small size of abstracts, many of the top 25 terms have a frequency
# of 1, leading to inconsistent output. Swap to tf-idf for chatbot project.
def get_keywords(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    words = [w for w in words if w.isalpha() and w not in stopwords.words('english')]
    num_words = len(words)
    words_set = set(words)
    num_unique_words = len(words_set)
    word_counts = [(w, words.count(w)) for w in words_set]
    word_counts.sort(reverse=True, key=lambda x: x[1])  # sort in descending order of frequency
    counter = 0
    kws = []
    for word, count in word_counts:
        print(word, count)
        kws.append(word)
        counter += 1
        if counter >= 25:
            break
    return kws
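# A possible tf-idf variant of get_keywords, sketched as a reference for the swap mentioned in the comment above.
# get_keywords_tfidf is illustrative only and is not called anywhere: it weights term frequency within one abstract
# by inverse document frequency across all abstracts, which should break the frequency-1 ties seen with plain counts.
def get_keywords_tfidf(texts, num_keywords=25):
    import math  # local import keeps the sketch self-contained
    # Tokenize and clean each abstract the same way get_keywords does.
    docs = []
    for text in texts:
        words = [w.lower() for w in word_tokenize(text)]
        docs.append([w for w in words if w.isalpha() and w not in stopwords.words('english')])
    # Document frequency: the number of abstracts each word appears in.
    df = {}
    for doc in docs:
        for w in set(doc):
            df[w] = df.get(w, 0) + 1
    # Score each word per abstract by tf * idf and keep the top num_keywords.
    keywords = []
    for doc in docs:
        if not doc:
            keywords.append([])
            continue
        scores = {}
        for w in set(doc):
            tf = doc.count(w) / len(doc)
            idf = math.log(len(docs) / df[w]) + 1
            scores[w] = tf * idf
        ranked = sorted(scores, key=scores.get, reverse=True)
        keywords.append(ranked[:num_keywords])
    return keywords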
# Given a list of keywords and raw text, build a dictionary of keywords and related sentences. Requires an existing
# kb to be passed in by reference so as to account for duplicated keywords between pages.
def build_kb(text, kws, kb):
    sents = sent_tokenize(text)
    for kw in kws:
        # print("KW: ")
        # print(kw)
        temp_list = []
        for s in sents:
            s = s.lower()
            if not s.find(kw) == -1:
                # print("sentence!!!")
                # print(s + "\n")
                # print(kw + "\n")
                temp_list.append(s)
            # else:
            #     print("not found!")
        if kw in kb:
            kb[kw] = kb[kw] + temp_list
        else:
            kb[kw] = temp_list
    return kb
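# A minimal sketch of step 2 of the Logic notes above: how a chatbot could answer a request for information on a
# specific topic from the kb built by build_kb. query_kb is illustrative only and is not called anywhere; the
# max_sentences cap stands in for the stopping criteria described in step 3.
def query_kb(kb, topic, max_sentences=10):
    topic = topic.lower()
    results = []
    for kw, sentences in kb.items():
        if topic in kw or kw in topic:
            results.extend(sentences)
        if len(results) >= max_sentences:
            break
    return results[:max_sentences]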
# Find URLs in a page. This is mostly used for finding links in pdfs, and is not used in the web-scraping project.
def get_links(text):
    rx = r"(https?://\S+)"
    url = re.findall(rx, text)
    return url
# Leftover pdfquery experiment (see the pdfquery note above the imports):
# pdf = pdfquery.PDFQuery('tbr.pdf')
# pdf.load()
# xmlurl = url + '_xml.txt'
# pdf.tree.write(xmlurl, pretty_print=True)
# os.remove('tbr.pdf')
# Find URLs in an html page. This is step 1 of the web-scraping project.
def read_search(url):
    # with request.urlopen(url) as f:
    #     raw = f.read().decode('utf8')
    #     soup = BeautifulSoup(raw)
    data = requests.get(url).text
    soup = BeautifulSoup(data, 'html.parser')  # name the parser explicitly to avoid bs4's parser-guessing warning
    with open('urls.txt', 'w') as f:
        counter = 0
        wanted = False
        for link in soup.find_all('a'):
            link_str = str(link.get('href'))
            # print(link_str)
            # if link_str.startswith("/pdf"):  # for now this section is hardcoded.
            if link_str.startswith("/abs"):
                # print("found something!")
                link_str = "https://arxiv.org" + link_str
                wanted = True
            if wanted:
                f.write(link_str + '\n')
                wanted = False
                counter += 1
            if counter > 100:  # cap the number of papers collected per run
                break
    # for script in soup(['script', 'style']):
    #     script.extract()
    # print(soup.get_text())
# Taken from the sample code on the Github:
# https://github.com/kjmazidi/NLP/blob/master/Xtra_Python_Material/Web_Scraping/3%20-%20Web%20crawler%20-%20almost.ipynb
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True
if __name__ == '__main__':
    main()