-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathnewegg.py
116 lines (109 loc) · 3.96 KB
/
newegg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import urllib
import urllib2
import re
import itertools
import traceback
import string
import BeautifulSoup
import cookielib
import time
def extract_text(t):
    """Flatten a BeautifulSoup node (or plain string) into its text.

    Falsy input and the literal en-dash placeholder collapse to the
    empty string; strings pass through unchanged; anything else is
    treated as iterable (a tag's children) and flattened recursively.
    """
    if not t:
        return ""
    if t == '–':
        return ''
    if isinstance(t, (unicode, str)):
        return t
    # A tag: concatenate the extracted text of every child node.
    pieces = [extract_text(child) for child in t]
    return "".join(pieces)
def extract_price(data):
    """Return the (upper-)median dollar amount found in *data*.

    Scans for '$'-prefixed numbers, strips thousands separators, and
    returns the median so a single outlier (e.g. a crossed-out "was"
    price) does not dominate.  Tokens that look price-like but do not
    parse (a stray "$." or "$,") are skipped rather than raising.

    Raises ValueError when no parseable price is present.
    """
    prices = []
    # Raw string: the regex contains a backslash escape for '$'.
    for token in re.findall(r'\$([,0-9.]+)', data):
        try:
            prices.append(float(token.replace(',', '')))
        except ValueError:
            # e.g. "$." or "$," page artifacts -- not real prices.
            continue
    if not prices:
        raise ValueError('no price in %r' % (data,))
    prices.sort()
    return prices[len(prices) // 2]
def get(N, *PropertyCodeValues):
res = []
page = 1
while True:
print "Page", page
data = {
'Submit': 'Property',
'bop': 'And',
'Pagesize': '100',
'Page': str(page),
}
if isinstance(N, dict):
data.update((k, str(v)) for k, v in N.iteritems())
else:
data['N'] = str(N)
data = data.items()
for PropertyCodeValue in PropertyCodeValues:
data.append(('PropertyCodeValue', PropertyCodeValue))
url = 'http://www.newegg.com/Product/ProductList.aspx?'+urllib.urlencode(data)
print url
req = urllib2.Request(url)
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
while True:
try:
text = opener.open(req).read()
text = opener.open(req).read()
except:
traceback.print_exc()
time.sleep(5)
else: break
dom = BeautifulSoup.BeautifulSoup(text)
pager = dom.find('li', {'name':'currentPage'})
if not pager:
print "lost pager"
break
pager = int(extract_text(pager))
if pager != page:
print "lost pager2"
break
for item in dom.findAll('div', {'class': 'itemCell itemCell-ProductList itemCell-ProductGridList'}):
dom_desc = item.find('span', {'class': 'itemDescription'})
title = dom_desc.contents[0]
link = dom_desc.parent['href']
rating = item.findAll('a', {'class':'itemRating'})
eggs = "0"
for pk in rating:
#print "pk: " + str(pk)
pks = pk.findAll('span', limit=1)
#print str(pk.findNextSibling(text))
for pks_i in pks:
eggs = str(pks_i['class'])[6]
if len(eggs) == 0:
eggs = "0"
votes = str(extract_text(rating))
if len(votes) > 8:
votes = votes[8:len(votes)-1]
else:
votes = "0"
#print "title: " + title
#print "link: " +link
#print "eggs: " + eggs + " votes: " + votes
try:
out = re.compile("[^0-9A-Za-z.]([0-9.]+)( )?([MGT])B").findall(title.upper())[0]
except IndexError:
print "invalid size", repr(title)
continue
size = float(out[0])
if out[2] == 'M':
size /= 1000.
elif out[2] == 'T':
size *= 1000.
try:
price = extract_price(extract_text(item))
except:
traceback.print_exc()
try:
price = float(extract_text(item.find('li', {'class':'price-current'})).split('$')[1].replace(',',''))
except:
traceback.print_exc()
try:
price = float(extract_text(item.find('li', {'class':'priceBefore'})).split('$')[1].replace(',',''))
except:
traceback.print_exc()
#print "invalid price", repr(extract_text(item.find('li', {'class':'priceFinal'}))), repr(extract_text(item.find('li', {'class':'priceList'})))
continue
yield size, price, title, link, votes, eggs
print "Page", page, "done"
page += 1