forked from loonslo/python-spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathip_pond.py
155 lines (135 loc) · 5.09 KB
/
ip_pond.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import time
from random import random
import requests
from scrapy.selector import Selector
import pymysql
# Default request headers: a desktop Chrome UA string so the proxy-list site
# serves the normal HTML page instead of blocking the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
}
# This is the local database.
# The table was created with columns: ip (varchar, PRIMARY KEY), port (varchar),
# proxy_type (varchar), speed (float).
# NOTE(review): credentials are hard-coded; connection is module-level and shared
# by every function/class below.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='jobs', charset='utf8')
cursor = conn.cursor()
# Random delay between requests to avoid getting the IP banned.
def rand_sleep_time(max_seconds=100):
    """Sleep for a random interval in [0, max_seconds) seconds.

    Used between scrape requests so the crawler does not hammer the site.

    Args:
        max_seconds: Upper bound (exclusive) of the delay in seconds.
            Defaults to 100 to match the original hard-coded behaviour.

    Returns:
        None (the return value of ``time.sleep``).
    """
    sleep_time = random() * max_seconds
    return time.sleep(sleep_time)
def update_ip_pond(pages=10):
    """Scrape proxy listings from xicidaili.com and upsert them into ``ip_pond``.

    Fetches the first *pages* pages of the "nn" (high-anonymity) listing,
    extracts (ip, port, proxy_type, speed) from each table row and inserts
    them, updating existing rows on primary-key (ip) collision.

    Args:
        pages: Number of listing pages to scrape. Defaults to 10, matching
            the original behaviour (the site has ~3637 pages in total).

    Side effects:
        Writes to the module-level DB connection; prints per-page progress;
        sleeps a random interval between pages via ``rand_sleep_time``.
    """
    for i in range(1, pages + 1):
        # timeout prevents a dead page from hanging the whole run forever
        resp = requests.get('https://www.xicidaili.com/nn/%s' % i,
                            headers=headers, timeout=10)
        if resp.status_code != 200:
            print('第%s页获取失败' % i)
            continue
        print('已获取第%s页内容' % i)
        selector = Selector(text=resp.text)
        all_items = selector.xpath('//*[@id="ip_list"]//tr')
        ip_list = []
        # Skip the header row; keep only rows that expose a speed tooltip.
        for item in all_items[1:]:
            speed_str = item.xpath('td[7]/div/@title').get()
            if speed_str:
                # title looks like "0.123秒" — keep the numeric part
                speed = float(speed_str.split('秒')[0])
                ip = item.xpath('td[2]/text()').get()
                port = item.xpath('td[3]/text()').get()
                proxy_type = item.xpath('td[6]/text()').get().lower()
                ip_list.append((ip, port, proxy_type, speed))
        # Parameterized upsert: lets the driver quote values safely instead of
        # interpolating them into the SQL string (the old .format() approach).
        cursor.executemany(
            "INSERT INTO ip_pond(ip,port,proxy_type,speed) VALUES (%s,%s,%s,%s) "
            "ON DUPLICATE KEY UPDATE port=VALUES(port),"
            "proxy_type=VALUES(proxy_type),speed=VALUES(speed)",
            ip_list
        )
        rand_sleep_time()
    conn.commit()
class GetIp(object):
    """Pick and validate proxies from the ``ip_pond`` table.

    All methods use the module-level ``conn``/``cursor``; invalid proxies
    are removed from the table as they are discovered.
    """

    def delete_ip(self, ip):
        """Remove *ip* from ip_pond (parameterized to avoid SQL injection).

        Returns:
            True always (kept for backward compatibility with callers).
        """
        cursor.execute("DELETE FROM ip_pond WHERE ip=%s", (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port, proxy_type):
        """Check whether the proxy works by fetching a test URL through it.

        Deletes the proxy from the table when the request fails or returns
        a non-2xx status.

        Returns:
            True if the proxy answered with a 2xx status, else False.
        """
        http_url = 'https://www.baidu.com'
        proxy_url = '{0}://{1}:{2}'.format(proxy_type, ip, port)
        # Map BOTH schemes to the proxy. The old code set only the scheme
        # matching proxy_type, so an http-type proxy was never used for the
        # https test URL and bad http proxies were never detected.
        proxy_dict = {
            'http': proxy_url,
            'https': proxy_url,
        }
        try:
            if proxy_type == 'http':
                response = requests.get(http_url, proxies=proxy_dict, timeout=10)
            else:
                # skip cert verification when tunnelling through https proxies
                response = requests.get(http_url, proxies=proxy_dict,
                                        verify=False, timeout=10)
        except Exception:
            print('invalid ip and port')
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print('effective ip')
                return True
            else:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        """Return a validated 'scheme://ip:port' chosen at random, or None
        when the table is empty. Recurses until a working proxy is found."""
        cursor.execute(
            "SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY RAND() LIMIT 1"
        )
        for ip_info in cursor.fetchall():
            ip, port, proxy_type = ip_info[0], ip_info[1], ip_info[2]
            if self.judge_ip(ip, port, proxy_type):
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            # bad proxy was deleted by judge_ip — try another one
            return self.get_random_ip()

    def get_optimum_ip(self):
        """Return a validated 'scheme://ip:port' with the lowest speed value,
        or None when the table is empty."""
        cursor.execute(
            "SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY speed LIMIT 1"
        )
        for ip_info in cursor.fetchall():
            ip, port, proxy_type = ip_info[0], ip_info[1], ip_info[2]
            if self.judge_ip(ip, port, proxy_type):
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            # the slow/broken proxy was just removed — retry with the next best
            return self.get_optimum_ip()

    def get_proxies(self):
        """Return a requests-style proxies dict, e.g. {'http': 'http://1.2.3.4:80'}."""
        ip = self.get_random_ip()
        print(ip)
        proxy_type = ip.split(':')[0]
        proxies = {
            proxy_type: ip
        }
        return proxies
if __name__ == '__main__':
    # Requests routed through an https-type proxy can be noticeably slower.
    sql = """
    SELECT * FROM ip_pond
    """
    # cursor.execute returns the affected/selected row count — zero (falsy)
    # means the pool is empty and must be (re)filled first.
    check_table = cursor.execute(sql)
    if not check_table:
        update_ip_pond()
    else:
        url = 'https://www.baidu.com'
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        }
        proxies = GetIp().get_proxies()
        res = requests.get(url=url, headers=headers, proxies=proxies)