-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmini_spider.py
266 lines (242 loc) · 7.56 KB
/
mini_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python
# -*- coding: utf-8 -*-
######################################################################
#
# Copyright (c) 2013 Baidu.com, Inc. All Rights Reserved.
#
######################################################################
"""
Python codemaster: an implementation of mini-spider.
Authors: zhangzhibiao01([email protected])
Date: 2014/02/18 14:24:30
"""
import BeautifulSoup
import chardet
import ConfigParser
import getopt
import logging
import os
import Queue
import re
import sys
import threading
import time
import urllib2
import urlparse
import threadPool
#import stacktracer
# Version number of this script; usage() prints it.
__version = 1.0
def usage():
    """
    Print the command line synopsis and the script version to stdout.

    The synopsis lists the flags main() actually passes to getopt
    ("hc:vt"): -c <conf> is required, -h/-v/-t are optional. The previous
    text advertised a -d flag that getopt rejects.
    """
    print('python mini_spider.py -c spider.conf [-h] [-v] [-t]\n'
          'version : %s' % __version)
class Node(object):
    """
    One entry of the crawl queue.

    @url, address to fetch, empty string by default.
    @depth, link distance carried with the url, 0 by default.
    """

    def __init__(self, url="", depth=0):
        self.url, self.depth = url, depth
class Spider(object):
    """
    Mini spider implementation.

    Reads its settings from a config file, seeds a url queue from the
    configured url list file and hands every queued url to a
    threadPool.ThreadPool job. Pages whose url matches the target_url
    pattern are written into the output directory.
    """

    # Anchors worth following: absolute http(s) links, root-relative links
    # and dot-relative links.
    _HREF_PATTERN = re.compile(r'^http|^/|^\.')

    def __init__(self, conf, logname='log'):
        """
        Constructor.
        @conf, path of the config file.
        @logname, name of the log file (created in the current working
                  directory), 'log' by default.
        """
        self.conf = conf
        self.log = logname
        self.thread_num = 0
        self.output = ""
        self.max_depth = 1
        self.interval = 1  # seconds
        self.timeout = 1  # seconds
        self.target_url = ""
        self.target_url_reg = ""
        # urls already handed to the thread pool
        self.crawled_sets = set()
        self.urlQueue = Queue.Queue()
        # set whenever new urls may have been queued
        self.event = threading.Event()
        # guards crawled_sets
        self.rlock = threading.RLock()
        # load log module
        logging.basicConfig(
            filename=os.path.join(os.getcwd(), self.log),
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s: %(message)s'
        )

    def readConf(self):
        """
        Read config, seed the url queue and prepare the output directory.
        Return value, True for success, False for fail.
        """
        # read conf
        try:
            cf = ConfigParser.ConfigParser()
            cf.read(self.conf)
            self.thread_num = int(cf.get('spider', 'thread_count'))
            self.output = cf.get('spider', 'output_directory')
            self.max_depth = int(cf.get('spider', 'max_depth'))
            self.interval = int(cf.get('spider', 'crawl_interval'))
            self.timeout = int(cf.get('spider', 'crawl_timeout'))
            self.target_url = cf.get('spider', 'target_url')
            # 'with' closes the seed file even if reading fails midway
            with open(cf.get('spider', 'url_list_file')) as seeds:
                for line in seeds:
                    if line.startswith('http'):
                        self.urlQueue.put(Node(url=line.strip(' /\n\r')))
            self.event.set()
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate; exc_info records why the config was rejected.
            logging.warning('read conf [%s] failed!', self.conf,
                            exc_info=True)
            return False
        # mkdir for output && log
        outputdir = os.path.join(os.getcwd(), self.output)
        try:
            if not os.path.exists(outputdir):
                os.mkdir(outputdir)
        except os.error:
            logging.warning('mkdir failed %s', outputdir)
            return False
        self.output = outputdir
        # an invalid pattern is a config failure, not a crash
        try:
            self.target_url_reg = re.compile(self.target_url)
        except re.error:
            logging.warning('invalid target_url pattern [%s]',
                            self.target_url)
            return False
        return True

    def save_if_need(self, url, content):
        """
        Save content into the output directory when url matches the
        target_url pattern. The file name is the url with every '/'
        replaced by '_'.
        """
        if self.target_url_reg.match(url):
            path = os.path.join(self.output, url.replace('/', '_'))
            # 'with' closes the file even when write() raises
            with open(path, 'w') as f:
                f.write('%s' % content)

    def parse_links(self, base_url, content):
        """
        Parse hyperlinks from a html page.
        @base_url, url of the page, used to resolve relative links.
        @content, html text of the page.
        Return value, list of absolute urls with surrounding '/' and
        spaces stripped.
        """
        alllinks = []
        links = BeautifulSoup.BeautifulSoup(content).findAll(
            'a', href=self._HREF_PATTERN)
        base_url = base_url.strip('/ ')
        for item in links:
            href = item['href']
            if not href.startswith('http'):
                href = urlparse.urljoin(base_url, href)
            alllinks.append(href.strip('/ '))
        return alllinks

    def crawl_work(self, node):
        """
        Crawler main work function, run by a thread pool worker.
        Fetch node.url, save the page if needed and queue the links found
        in it with depth node.depth + 1.
        Return value, True for success, None for fail.
        """
        try:
            logging.debug("url start[%s]", node.url)
            r = urllib2.urlopen(node.url, timeout=self.timeout)
            try:
                if r.getcode() != 200:
                    logging.warning('Crawl [%s] failed, ret code[%d]',
                                    node.url, r.getcode())
                    time.sleep(self.interval)
                    return None
                content = r.read()
            finally:
                # always release the connection
                r.close()
            logging.info('Crwaled url[%s], depth[%d], len [%d]',
                         node.url, node.depth, len(content))
            # save if necessary
            self.save_if_need(node.url, content)
            # parse more links; pages at max depth queue nothing, so the
            # parse is skipped for them
            if node.depth < self.max_depth:
                # parsing touches no shared state, keep it outside the lock
                found = self.parse_links(node.url, content)
                # 'with' releases the lock even if queueing raises;
                # a leaked lock would block the dispatcher forever
                with self.rlock:
                    for link in found:
                        if link not in self.crawled_sets:
                            self.urlQueue.put(Node(link, node.depth + 1))
            logging.debug("url done[%s]", node.url)
            self.event.set()
        except Exception:
            # report instead of silently dropping the url
            logging.warning('Crawl [%s] failed', node.url, exc_info=True)
            return None
        return True

    def start(self):
        """
        Object start function.
        Read the config, start the thread pool and dispatch queued urls
        until no new url shows up within timeout + 2 seconds.
        """
        # read config
        if not self.readConf():
            return None
        # thread pool
        self.threadpool = threadPool.ThreadPool(self.thread_num, self.timeout)
        self.threadpool.start_threads()
        # logging start
        logging.info('Begin crawling...')
        wait_secs = self.timeout + 2
        # main thread gets url nodes from urlQueue and puts them in the
        # thread pool's job queue
        while True:
            if self.urlQueue.empty():
                # no worker reported new urls in time: crawling is over
                if not self.event.wait(wait_secs):
                    break
                self.event.clear()
            try:
                item = self.urlQueue.get(timeout=wait_secs)
            except Queue.Empty:
                logging.debug("urlQueue empty")
                continue
            # Mark the url as crawled BEFORE dispatching it and skip urls
            # that were already dispatched, so a page reachable through
            # several links is fetched only once.
            with self.rlock:
                is_new = item.url not in self.crawled_sets
                if is_new:
                    self.crawled_sets.add(item.url)
            if is_new:
                self.threadpool.add_job(self.crawl_work, item)
            self.urlQueue.task_done()
        self.stop()
        logging.info('Done')

    def stop(self):
        """
        Stop all.
        """
        self.threadpool.stop_threads()
        # NOTE(review): join() waits until every queued url got a
        # task_done(); if a worker queued urls after the dispatch loop in
        # start() ended, this may block - confirm against threadPool.
        self.urlQueue.join()

    def test(self):
        """
        Self simple test function, prints the links found on a fixed page
        followed by their count.
        """
        url = 'http://www.baidu.com'
        links = self.parse_links(url, urllib2.urlopen(url).read())
        for i in links:
            print(i)
        print(len(links))
def main():
    """
    Main function.

    Command line flags:
      -c <file>  config file (required)
      -t         run the spider's self test instead of crawling
      -v         print the version
      -h         print usage and exit
    Exits with status 2 on a bad command line or a missing -c.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hc:vt")
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)
    conf_file = ""
    test_flag = False
    for o, a in opts:
        if o == "-v":
            # was 'print version', an undefined name (NameError)
            print(__version)
        elif o == "-h":
            usage()
            sys.exit()
        elif o == "-c":
            conf_file = a
        elif o == '-t':
            test_flag = True
        else:
            # explicit raise: an assert would be stripped under -O
            raise AssertionError("unhandled option")
    if not conf_file:
        usage()
        sys.exit(2)
    spider = Spider(conf_file)
    if test_flag:
        spider.test()
    else:
        spider.start()
# Script entry point: only crawl when executed directly, not on import.
if __name__ == '__main__':
    main()