main.py
import requests
from bs4 import BeautifulSoup

# Oxylabs Real-Time Crawler credentials -- replace the placeholders with your own.
USERNAME = "USERNAME"
PASSWORD = "PASSWORD"


def get_html_for_page(url):
    """Fetch the rendered HTML of a page via the Oxylabs Real-Time API."""
    payload = {
        "url": url,
        "source": "google",
    }
    response = requests.post(
        "https://realtime.oxylabs.io/v1/queries",
        auth=(USERNAME, PASSWORD),
        json=payload,
    )
    response.raise_for_status()
    # The API response is JSON shaped like {"results": [{"content": "<html>..."}]}.
    return response.json()["results"][0]["content"]


def get_citations(article_id):
    """Scrape the citation formats listed for a given Scholar article."""
    url = f"https://scholar.google.com/scholar?q=info:{article_id}:scholar.google.com&output=cite"
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    data = []
    # Each table row holds one citation style (header cell) and its text.
    for citation in soup.find_all("tr"):
        title = citation.find("th", {"class": "gs_cith"}).get_text(strip=True)
        content = citation.find("div", {"class": "gs_citr"}).get_text(strip=True)
        data.append({
            "title": title,
            "content": content,
        })
    return data


def parse_data_from_article(article):
    """Extract the title, authors, URL, and citations from a single result."""
    title_elem = article.find("h3", {"class": "gs_rt"})
    title = title_elem.get_text()
    title_anchor_elem = article.select("a")[0]
    url = title_anchor_elem["href"]
    article_id = title_anchor_elem["id"]
    authors = article.find("div", {"class": "gs_a"}).get_text()
    return {
        "title": title,
        "authors": authors,
        "url": url,
        "citations": get_citations(article_id),
    }


def get_url_for_page(url, page_index):
    """Append the pagination offset to the base search URL."""
    return url + f"&start={page_index}"


def get_data_from_page(url):
    """Fetch one results page and parse every article entry on it."""
    html = get_html_for_page(url)
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("div", {"class": "gs_ri"})
    return [parse_data_from_article(article) for article in articles]


data = []
url = "https://scholar.google.com/scholar?q=global+warming+&hl=en&as_sdt=0,5"
NUM_OF_PAGES = 1
page_index = 0

for _ in range(NUM_OF_PAGES):
    page_url = get_url_for_page(url, page_index)
    entries = get_data_from_page(page_url)
    data.extend(entries)
    # Google Scholar shows 10 results per page, so the offset advances by 10.
    page_index += 10

print(data)
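
# Optional follow-up (a sketch, not part of the original script): persist the
# scraped entries to a JSON file instead of only printing them. The filename
# "scholar_results.json" is an arbitrary choice for this example; json is a
# standard-library module and would normally be imported at the top.
import json

with open("scholar_results.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)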