# dianpingContent_0.1.py -- forked from Andrew9Tech/Meituan_Dianping
#coding=utf-8
# Program: Dianping (dianping.com) merchant data scraper
# Version: 0.1
# Author: andrew9tech
# Date: 2015-6-13
# Language: Python 2.7
import urllib
import re
import os
import random
import time
import requests
from tempfile import TemporaryFile
from xlwt import Workbook
from xlutils.copy import copy
from xlrd import open_workbook
import sys
# Python 2 only: make UTF-8 the default codec for implicit str <-> unicode
# conversions, so the UTF-8 byte strings handled below can be mixed with
# unicode objects without explicit decoding at every call site.
reload(sys)
sys.setdefaultencoding('utf-8')

# Checkpoint file: its first line holds the spreadsheet row at which writing
# resumes. Opened read-only (the script never writes to it) and via `with`
# so the handle is released even if reading fails.
with open(u"h:\\大众点评\\shlogo.txt", "r") as f:
    Row = [line.strip('\n') for line in f]
if not Row:
    # Fail with an explicit message instead of a bare IndexError on Row[0].
    raise ValueError("shlogo.txt is empty: expected the start row on its first line")
row = int(Row[0])

# Deal ids to scrape, one per line.
with open(u"h:\\大众点评\\" + u"上海编号.txt", "r") as f:
    Num = [line.strip('\n') for line in f]
# NOTE(review): indentation was lost in this copy of the file; the count is
# assumed to be printed once, after the whole list has been read.
print(len(Num))
# Proxy pool as "host:port" strings. The original note reads "proxy IPs
# fetched in real time", but the list is static here. An entry is removed
# from the pool as soon as a request through it fails.
P = ['183.223.137.20:8123', '218.204.140.104:8118', '116.231.100.46:8090', '101.4.136.66:84']

START_INDEX = 8912   # position in Num at which this run starts
MAX_ATTEMPTS = 2     # proxies tried per deal before the deal is skipped
XLS_PATH = u"h:\\大众点评\\shanghai.xls"

# Request headers are identical for every deal, so build them once.
headers = {
    'Accept-Encoding': 'gzip,deflate',
    'User-Agent': 'Mozilla/3.1.0'
}

_TAG_PATTERN = r'</?[^>]+>'

# One match per shop; the ten capture groups are, in order: address,
# avgPrice, branchName, contactPhone, crossRoad, glat, glng, shopId,
# shopName, voteTotal.
_SHOP_PATTERN = r'{"address":"(.*?)","avgPrice":(.*?),"branchName":"(.*?)","businessHours".*?"contactPhone":"(.*?)","crossRoad":"(.*?)","dealGroupId".*?"glat":(.*?),"glng":(.*?),"power".*?"shopId":(.*?),"shopName":"(.*?)".*?"voteTotal":(.*?)}'


def _fetch_pages(url1, url2, request_headers):
    """Download url1 and url2 through one randomly chosen proxy from P.

    Returns a (content1, content2) tuple of response bodies.

    Raises LookupError when the pool is empty. Any error raised while
    requesting is re-raised after the proxy that was used has been removed
    from P.
    """
    if not P:
        # The original indexed/deleted from the empty list here, which
        # raised an uncaught IndexError and aborted the whole run.
        raise LookupError("proxy pool is empty")
    address = random.choice(P)
    proxies = {"http": 'http://' + str(address),
               "https": 'http://' + str(address)}
    print(proxies)
    try:
        response1 = requests.get(url1, headers=request_headers, proxies=proxies, timeout=10)
        content1 = response1.content
        response2 = requests.get(url2, headers=request_headers, proxies=proxies, timeout=10)
        content2 = response2.content
    except Exception:
        P.remove(address)
        raise
    return content1, content2


def _squeeze(text):
    """Return text with every space and newline removed."""
    return text.replace(' ', '').replace('\n', '')


def _clean_subtitle(raw):
    """Strip markup tags first, then spaces and newlines."""
    return _squeeze(re.sub(_TAG_PATTERN, '', raw))


def _clean_review_count(raw):
    """Strip spaces and newlines first, then markup tags."""
    return re.sub(_TAG_PATTERN, '', _squeeze(raw))


def _extract(label, pattern, html, clean=None):
    """Return the first capture of pattern in html, or -1 if it is absent.

    `clean`, when given, is applied to the captured text. A failure is
    reported as "<label>: <error>" and never propagates, so one missing
    field does not discard the rest of the deal.
    """
    try:
        value = re.findall(pattern, html, re.S)[0]
        if clean is not None:
            value = clean(value)
        return value
    except Exception as err:  # deliberately best-effort, as in the original
        print("%s: %s" % (label, err))
        return -1


def _parse_deal(html):
    """Return the eight deal fields scraped from the deal page, in column order."""
    return [
        _extract("Stores_Name", r'<h1 class="title">(.*?)</h1>', html, _squeeze),
        _extract("Sub_Title", r'<h2 class="sub-title">(.*?)</h2>', html, _clean_subtitle),
        _extract("Price_Display", r'<span class="price-display"><em>¥</em>(.*?)</span>', html),
        # e.g. <span class="price-discount">5<em>折</em></span>
        _extract("Price_Discount", r'<span class="price-discount">(.*?)<em>折</em></span>', html),
        # e.g. <span class="price-original">价值 <em>¥</em>70</span>
        _extract("Price_Original", r'<span class="price-original">价值 <em>¥</em>(.*?)</span>', html),
        # e.g. <span>已售<em class="J_current_join">651</em>份</span>
        _extract("J_current_join", r'<span>已售<em class="J_current_join">(.*?)</em>份</span>', html),
        # e.g. <span class="star-rate">4.5</span>
        _extract("Star_rate", r'<span class="star-rate">(.*?)</span>', html),
        _extract("Product_Evaluate_Num", r'<span class="star-rate">.*?</span>(.*?)</a>条团购评价', html, _clean_review_count),
    ]


def _parse_shops(payload):
    """Return one 10-tuple per shop found in the shop-detail response.

    Returns an empty list on failure, so the caller never sees an undefined
    or stale shop count (the original left Shop_Num unset in that case).
    """
    try:
        return re.findall(_SHOP_PATTERN, payload, re.S)
    except Exception as err:
        print(err)
        return []


def _append_rows(start_row, deal, shops):
    """Write one deal and its shops to the workbook; return the next free row.

    The deal fields go in the first columns of start_row. Each shop fills the
    columns after them, the first shop on start_row and each further shop on
    the row below. The workbook is re-read and saved on every call, so
    progress survives a crash.
    """
    wb = copy(open_workbook(XLS_PATH))
    ws = wb.get_sheet(0)
    for col, value in enumerate(deal):
        ws.write(start_row, col, str(value).decode('utf-8'))
    # NOTE(review): indentation was lost in this copy of the file; the row
    # counter is assumed to advance once per shop.
    next_row = start_row
    for shop in shops:
        for offset, value in enumerate(shop):
            ws.write(next_row, len(deal) + offset, value.decode('utf-8'))
        next_row += 1
    # A deal without shops must still consume its row; otherwise the next
    # deal is written to the same row and overwrites it.
    next_row = max(next_row, start_row + 1)
    wb.save(XLS_PATH)
    return next_row


for n in range(START_INDEX, len(Num)):
    num = Num[n]
    url1 = 'http://t.dianping.com/deal/' + str(num)
    # '&regionId' had been mangled into the single character '®' followed by
    # 'ionId', which dropped the parameter separator from the query string.
    url2 = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=' + str(num) + '&cityId=1&action=shops&page=1&regionId=0'
    print("url1:  %s" % url1)

    if not P:
        # Every proxy has failed; no further request can be made.
        print("proxy pool exhausted, stopping at index %d" % n)
        break
    print("Length of P: %d" % len(P))

    for _attempt in range(MAX_ATTEMPTS):
        try:
            shopcontent1, shopcontent2 = _fetch_pages(url1, url2, headers)
        except Exception:
            continue  # failed proxy already removed; try another one
        break
    else:
        print("url error!")
        continue

    try:
        TGinfo = _parse_deal(shopcontent1)
        Shop_info = _parse_shops(shopcontent2)
        row = _append_rows(row, TGinfo, Shop_info)
        print("%s Done!" % row)
    except Exception as e0:
        # Keep going with the next deal; a single bad page or a failed save
        # must not end the run.
        print(e0)