-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_scraper.py
67 lines (47 loc) · 1.75 KB
/
test_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import urllib2
import sys
from bs4 import BeautifulSoup
import re
# Page and text used to "train" the scraper: the text is located on this page
# and the chain of tags leading to it is recorded as the path to replay.
_TRAINING_URL = "http://sfbay.craigslist.org/nby/muc/2985476465.html"
_TRAINING_PATTERN = re.compile("Blues/Country Bass Player available")


def _fetch_soup(opener, url):
    """Download *url* with *opener* and return it parsed as a BeautifulSoup tree."""
    response = opener.open(url)
    try:
        # Name the parser explicitly so the training page and the target page
        # are always parsed the same way, whatever parsers are installed.
        return BeautifulSoup(response.read(), "html.parser")
    finally:
        response.close()


def _record_path(soup, leaf):
    """Return the root-to-leaf path to *leaf* as (name, attrs, index) tuples.

    *index* is the position of the tag among all same-named tags found by
    ``parent.find_all(name)`` (a recursive search, in document order), which
    is the same lookup ``_follow_path`` performs when replaying the path.
    """
    path = []
    node = leaf
    while node is not None and node is not soup:
        same_name = node.parent.find_all(node.name)
        # Compare by identity: Beautiful Soup's ``==`` treats two tags with
        # the same markup as equal, which would pick the wrong position when
        # an identical-looking tag appears earlier in the document.
        index = next(i for i, tag in enumerate(same_name) if tag is node)
        path.append((node.name, node.attrs, index))
        node = node.parent
    path.reverse()
    return path


def _follow_path(soup, path):
    """Replay *path* on *soup*, reporting attribute matches; return the final tag."""
    current = soup
    for name, attrs, index in path:
        candidates = current.find_all(name)
        if index >= len(candidates):
            sys.exit("Path broke: needed <%s> number %d but only %d found"
                     % (name, index + 1, len(candidates)))
        element = candidates[index]
        if element.attrs == attrs:
            print("Attribute matched")
        else:
            print("Attribute did not match")
            print("%s %s" % (element.attrs, attrs))
        current = element
    return current


def main(argv=None):
    """Learn a tag path from the training page and replay it on another page.

    The training page is fetched, the tag containing the training text is
    located, and the chain of (tag name, attributes, position) steps from the
    document root down to that tag is recorded. The same chain is then
    followed on the page given on the command line and the tag reached is
    printed.

    argv -- argument list; defaults to ``sys.argv``. ``argv[1]`` is the URL
            of the page to scrape.

    Exits with a message if the URL argument is missing, if the training
    text is not found, or if the target page lacks a tag the path requires.
    """
    if argv is None:
        argv = sys.argv
    # Validate the arguments before any network traffic happens.
    if len(argv) < 2:
        sys.exit("usage: %s <url-of-page-to-scrape>"
                 % (argv[0] if argv else "test_scraper.py"))
    target_url = argv[1]

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    # Training: find the known text and record the path to its tag.
    training_soup = _fetch_soup(opener, _TRAINING_URL)
    text_node = training_soup.find(text=_TRAINING_PATTERN)
    if text_node is None:
        sys.exit("Training text not found at %s" % _TRAINING_URL)
    path = _record_path(training_soup, text_node.parent)

    # Matching: follow the recorded path on the requested page.
    target_soup = _fetch_soup(opener, target_url)
    print(_follow_path(target_soup, path))
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()