-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
83 lines (69 loc) · 2.25 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import bs4
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import configs as cfg
from utils.utils import save_as_pkl
def get_html(timeout: float = 30.0) -> BeautifulSoup:
    """Download the configured page and parse it into a BeautifulSoup tree.

    The address is read from ``cfg.PROJECT_SETTINGS.get('URL')``.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed with the ``lxml`` parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without an explicit timeout, requests may block indefinitely on an
    # unresponsive server.
    response = requests.get(cfg.PROJECT_SETTINGS.get('URL'), timeout=timeout)
    # Fail loudly rather than parsing an error page as if it were the data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')
def get_wikitable(soup: BeautifulSoup, table_class: str) -> bs4.element.Tag:
    """Look up a ``<table>`` element in *soup* by its CSS class.

    The search is delegated to ``soup.find`` and its result is handed back
    unchanged (per the BeautifulSoup documentation that is the first match,
    or ``None`` when nothing matches).
    """
    matched_table = soup.find('table', class_=table_class)
    return matched_table
def get_headers(table: bs4.element.Tag) -> tuple:
    """Extract column names and their descriptions from the header row.

    Only the first ``<tr>`` of *table* is inspected; every ``<th>`` in it
    contributes one entry to each returned list.

    Returns:
        A ``(attrs, attrs_desc)`` pair of equally long lists. ``attrs``
        holds the raw (unstripped) text of each header cell. ``attrs_desc``
        holds the ``title`` attribute of the cell's ``<span>``, or ``None``
        when the cell has no ``<span>`` or the span carries no ``title``.

    Raises:
        IndexError: If *table* contains no ``<tr>`` rows.
    """
    rows = table.find_all('tr')

    # table headers
    attrs = []
    # attribute descriptions
    attrs_desc = []

    for header in rows[0].find_all('th'):
        span = header.span
        # .get() instead of span['title']: a <span> without a title
        # attribute would otherwise abort the whole scrape with KeyError.
        attrs_desc.append(span.get('title') if span is not None else None)
        attrs.append(header.text)

    return (attrs, attrs_desc)
def process_attrs(attrs: list, attrs_desc: list) -> tuple:
    """Normalise header names and their descriptions.

    Every entry of *attrs* has surrounding whitespace removed, and every
    falsy entry of *attrs_desc* (for example ``None``) becomes an empty
    string. New lists are returned; the inputs are not modified.

    Returns:
        A ``(attrs, attrs_desc)`` tuple containing the cleaned lists.
    """
    cleaned_names = [name.strip() for name in attrs]
    cleaned_descriptions = [desc if desc else '' for desc in attrs_desc]
    return (cleaned_names, cleaned_descriptions)
def generate_dataframe(table: bs4.element.Tag, attrs: list) -> pd.DataFrame:
    """Build a DataFrame from the body rows of *table*.

    The first ``<tr>`` is skipped; each following ``<tr>`` becomes one
    DataFrame row with one value per ``<td>``:

    * the cell's stripped text when it is non-empty;
    * otherwise the stripped ``title`` attribute of the cell's ``<a>``;
    * ``np.nan`` when neither is available.

    Args:
        table: The table element to read.
        attrs: Column names for the resulting DataFrame.

    Returns:
        A DataFrame whose columns are *attrs*.
    """
    all_attributes = []

    for hero in table.find_all('tr')[1:]:
        attribute = []
        for cell in hero.find_all('td'):
            text = cell.text.strip()
            if text:
                attribute.append(text)
                continue

            # Cell has no visible text: fall back to the title of the
            # link inside the cell, if there is one.
            anchor = cell.a
            title = anchor.get('title') if anchor is not None else None
            # Keep a NaN placeholder for unusable cells so that the cells
            # after it stay under their own column instead of shifting left.
            attribute.append(
                title.strip() if isinstance(title, str) else np.nan
            )
        all_attributes.append(attribute)

    # Creating the dataframe
    return pd.DataFrame(data=all_attributes, columns=attrs)
def main() -> None:
    """Run the scraping pipeline from download to saved output.

    Fetches the page, locates the table with class ``wikitable``, reads and
    cleans its headers, builds the DataFrame, and finally passes the
    DataFrame and the header descriptions to ``save_as_pkl`` together with
    the ``DATA_DIR``/``DATA_FILE`` and ``DATA_DIR``/``DESC_FILE`` settings.
    """
    page = get_html()
    wikitable = get_wikitable(page, 'wikitable')

    # attributes and descriptions, first raw and then cleaned
    raw_attrs, raw_descriptions = get_headers(wikitable)
    attrs, attrs_desc = process_attrs(raw_attrs, raw_descriptions)

    hero_stats = generate_dataframe(wikitable, attrs)

    settings = cfg.PROJECT_SETTINGS

    stats_target = [settings.get('DATA_DIR'), settings.get('DATA_FILE')]
    save_as_pkl(hero_stats, stats_target)

    desc_target = [settings.get('DATA_DIR'), settings.get('DESC_FILE')]
    save_as_pkl(attrs_desc, desc_target)
# Driver code: run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()