easy_scrapy.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2019/3/14 13:39
# @Author  : way
# @Site    :
# @Describe: Create a scrapy spider: auto-generate the code files into the SP/spiders, SP/items and SP_JOBS directories

import os
import time

item = """#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : ${time}
# @Author : ${author}
from SP.items.items import *
from sqlalchemy.types import VARCHAR
class ${spidername}_list_Item(scrapy.Item):
# define table
tablename = '${spidername}_list'
tabledesc = '列表'
# define the fields for your item here like:
# 关系型数据库,可以自定义字段的类型、长度,默认 VARCHAR(length=255)
# colname = scrapy.Field({'idx': 1, 'comment': '名称', 'type': VARCHAR(255)})
# default column
detail_full_url = scrapy.Field({'idx': 100, 'comment': '详情链接'}) # 通用字段
pkey = scrapy.Field({'idx': 101, 'comment': 'md5(detail_full_url)'}) # 通用字段
pagenum = scrapy.Field({'idx': 102, 'comment': '页码'}) # 通用字段
class ${spidername}_detail_Item(scrapy.Item):
# define table
tablename = '${spidername}_detail'
tabledesc = '详情'
# define the fields for your item here like:
# 关系型数据库,可以自定义字段的类型、长度,默认 VARCHAR(length=255)
# colname = scrapy.Field({'idx': 1, 'comment': '名称', 'type': VARCHAR(255)})
# default column
fkey = scrapy.Field({'idx': 100, 'comment': '等于list.pkey'}) # 通用字段
pagenum = scrapy.Field({'idx': 101, 'comment': '页码'}) # 通用字段
class ${spidername}_file_Item(SPfileItem):
# define table
tablename = '${spidername}_file'
tabledesc = '附件'
"""
spider = """#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : ${time}
# @Author : ${author}
# @Describe : ${describe}
from bs4 import BeautifulSoup
from SP.spiders.SPRedisSpider import SPRedisSpider
from SP.items.${spidername}_items import *
from SP.utils.ctrl_redis import RedisCtrl
from SP.utils.base import md5, log, ScheduledRequest
from SP.utils.tool import get_file_type
class ${spidername}_Spider(SPRedisSpider):
name = '${spidername}'
redis_key = f'{name}:start_urls'
allowed_domains = []
custom_settings = {
'LOG_LEVEL': "INFO",
'LOG_FILE': log(name),
# 'CONCURRENT_REQUESTS': 5, # 控制并发数,默认16
# 'DOWNLOAD_DELAY': 3, # 控制下载延迟,默认0
'ITEM_PIPELINES': {
# 'SP.pipelines.pipelines_file.FilePipeline': 100, # 附件下载
# 'SP.pipelines.pipelines_clean.CleanPipeline': 101, # 字段清洗
# 'SP.pipelines.pipelines_datafile.DataFilePipeline': 109, # 写到数据文件
'SP.pipelines.pipelines_rdbm.RdbmPipeline': 200, # 关系型数据库
# 'SP.pipelines.pipelines_hbase.HbasePipeline': 201, # Hbase
# 'SP.pipelines.pipelines_mongodb.MongodbPipeline': 202, # Mongodb
# 'SP.pipelines.pipelines_kafka.KafkaPipeline': 203, # Kafka
# 'SP.pipelines.pipelines_elasticsearch.ElasticSearchPipeline': 204, # ElasticSearch
# 'SP.pipelines.pipelines_hdfs.HdfsPipeline': 205 # hdfs, hive
# 'SP.pipelines.pipelines_doris.DorisPipeline': 206 # doris
},
'DOWNLOADER_MIDDLEWARES': {
'SP.middlewares.SPMiddleWare.UserAgentMiddleWare': 100, # 随机 user-agent
# 'SP.middlewares.SPMiddleWare.HeadersMiddleWare': 101, # 定制 headers
# 'SP.middlewares.SPMiddleWare.ProxyMiddleWare': 102, # 使用代理ip
# 'SP.middlewares.SPMiddleWare.CookiesPoolMiddleWare': 103, # 使用 cookies 池 随机切换采集账号
# 'SP.middlewares.SPMiddleWare.RequestsMiddleWare': 104, # 使用 requests
# 'scrapy_splash.SplashCookiesMiddleware': 723, # 在meta中增加splash 需要启用3个中间件
# 'scrapy_splash.SplashMiddleware': 725, # 在meta中增加splash 需要启用3个中间件
# 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, # 在meta中增加splash 需要启用3个中间件
'SP.middlewares.SPMiddleWare.SizeRetryMiddleWare': 900 # 重试中间件,允许设置 MINSIZE(int),response.body 长度小于该值时,自动触发重试
},
}
def get_callback(self, callback):
# url去重设置:True 不去重 False 去重
callback_dt = {
'list': (self.list_parse, True),
'detail': (self.detail_parse, True),
}
return callback_dt.get(callback)
def list_parse(self, response):
rows = BeautifulSoup(response.text, 'lxml')
reqs = []
for row in rows:
detail_url = row.find('a').get('href')
list_item = ${spidername}_list_Item()
# save value for your item here like:
# list_item['title'] = row.find('a').text
# default column
list_item['detail_full_url'] = response.urljoin(detail_url)
list_item['pkey'] = md5(list_item['detail_full_url'])
list_item['pagenum'] = response.meta.get('pagenum')
yield list_item
req = ScheduledRequest(
url=list_item['detail_full_url'],
method='GET',
callback='detail',
body={}, # 如果是POST,在这边填写post字典
meta={
'fkey': list_item['pkey'],
'pagenum': list_item['pagenum'],
# 反爬相关的meta字典也填写这边,然后在spider中启用相应的中间件
# 'splash': {'wait': 2} # js加载、异步加载渲染
}
)
reqs.append(req)
# 将详情链接作为新的任务 推到redis
RedisCtrl().reqs_push(self.redis_key, reqs)
def detail_parse(self, response):
soup = BeautifulSoup(response.text, 'lxml')
detail_item = ${spidername}_detail_Item()
# save value for your item here like:
# detail_item['title'] = soup.find('h1').text
# default column
detail_item['fkey'] = response.meta.get('fkey')
detail_item['pagenum'] = response.meta.get('pagenum')
yield detail_item
file_item = ${spidername}_file_Item()
file_url = soup.find('your xpath').get('href')
# save value for your item here like:
# file_item['file_url'] = response.urljoin(file_url)
file_item['px'] = 1
file_item['file_url'] = response.urljoin(file_url)
file_item['file_name'] = ""
file_item['file_type'] = get_file_type(file_url, 'jpg')
# default column
file_item['fkey'] = response.meta.get('fkey')
file_item['pagenum'] = response.meta.get('pagenum')
yield file_item
"""
job = """#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : ${time}
# @Author : ${author}
import os
import sys
import getopt
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from SP_JOBS.job import *
from SP.spiders.${spidername} import ${spidername}_Spider
class ${spidername}_job(SPJob):
def __init__(self):
super().__init__(spider_name=${spidername}_Spider.name)
self.delete() # 如需去重、增量采集,请注释该行
self.headers = {
# 有反爬的话,可以在这边定制请求头
}
self.cookies = (
# 多账号采集的话,可以在这边定制多个 cookie string
)
@Job.push
def make_job(self, pages):
for pagenum in range(1, pages + 1):
url = ''
yield ScheduledRequest(
url=url, # 请求地址
method='GET', # 请求方式 GET/POST
callback='list', # 回调函数标识
body={}, # 如果是POST,在这边填写post字典
meta={
'pagenum': pagenum, # 页码
# 'payload': {}, # request payload 传输方式
# 'splash' : {'wait': 2} # js加载、异步加载渲染
}
)
if __name__ == "__main__":
# 采集页数
pages = 1
# 爬虫数量
num = 1
# 支持传参调用
opts, args = getopt.getopt(sys.argv[1:], "p:n:", ["pages=", "num="])
onlyjob = None
for op, value in opts:
if op in ("-p", "--pages"):
pages = int(value)
elif op in ("-n", "--num"):
num = int(value)
elif op in ("--onlyjob"):
onlyjob = value
# 执行采集
job = ${spidername}_job()
job.make_job(pages)
if not onlyjob:
job.crawl(num)
"""
job_patch = """#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : ${time}
# @Author : ${author}
import os
import sys
import getopt
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from SP_JOBS.job import *
from SP.spiders.${spidername} import ${spidername}_Spider
from SP.utils.tool import rdbm_session
class ${spidername}_job(SPJob):
def __init__(self):
super().__init__(spider_name=${spidername}_Spider.name)
self.delete() # 如需去重、增量采集,请注释该行
self.headers = {
# 有反爬的话,可以在这边定制请求头
}
self.cookies = (
# 多账号采集的话,可以在这边定制多个 cookie string
)
@Job.push
def make_list_job(self, pages):
sql = \"\"\"
select pagenum
from ${spidername}_list
group by pagenum
\"\"\"
with rdbm_session() as session:
rows = session.execute(sql).fetchall()
rows = [int(row[0]) for row in rows]
ret = list(set(range(1, pages + 1)) - set(rows)) # 未采集的页码
for pagenum in ret:
url = ''
yield ScheduledRequest(
url=url, # 请求地址
method='GET', # 请求方式 GET/POST
callback='list', # 回调函数标识
body={}, # 如果是POST,在这边填写post字典
meta={
'pagenum': pagenum, # 页码
# 'payload': {}, # request payload 传输方式
# 'splash' : {'wait': 2} # js加载、异步加载渲染
}
)
@Job.push
def make_detail_job(self):
sql = \"\"\"
select a.detail_full_url, a.pagenum, a.pkey
from ${spidername}_list a
left join ${spidername}_detail b on a.pkey = b.fkey
where b.keyid is null
\"\"\"
with rdbm_session() as session:
rows = session.execute(sql).fetchall()
for row in rows:
detail_full_url, pagenum, pkey = row
yield ScheduledRequest(
url=detail_full_url,
method='GET',
callback='detail',
body={}, # 如果是POST,在这边填写post字典
meta={
'pagenum': pagenum, # 页码
'fkey': pkey, # 外键
# 'payload': {}, # request payload 传输方式
# 'splash' : {'wait': 2} # js加载、异步加载渲染
}
)
if __name__ == "__main__":
# 采集页数
pages = 1
# 爬虫数量
num = 1
# 支持传参调用
opts, args = getopt.getopt(sys.argv[1:], "p:n:", ["pages=", "num="])
onlyjob = None
for op, value in opts:
if op in ("-p", "--pages"):
pages = int(value)
elif op in ("-n", "--num"):
num = int(value)
elif op in ("--onlyjob"):
onlyjob = value
# 执行采集
job = ${spidername}_job()
job.make_list_job(pages) # list 补爬
job.make_detail_job() # detail 补爬
if not onlyjob:
job.crawl(num)
"""
def spider_info(spidername):
    info = {}
    info['spider_path'] = os.path.join(os.getcwd(), 'SP', 'spiders', f'{spidername}.py')
    info['item_path'] = os.path.join(os.getcwd(), 'SP', 'items', f'{spidername}_items.py')
    info['job_path'] = os.path.join(os.getcwd(), 'SP_JOBS', f'{spidername}_job.py')
    info['job_patch'] = os.path.join(os.getcwd(), 'SP_JOBS', f'{spidername}_job_patch.py')
    return info

def delete_spider(spidername):
    info = spider_info(spidername)
    for path in info.values():
        if os.path.exists(path):
            os.remove(path)
            print(f"{path} deleted")

def open_in_pycharm(job_path, pycharm):
    if not pycharm:
        return
    if os.path.exists(pycharm):
        cmd = f'"{pycharm}" {job_path}'
        os.system(cmd)
        return
    print(f"PyCharm not found at {pycharm}, please set 'pycharm' to your PyCharm exe or shortcut")

def new(**kwargs):
    """
    Create a brand-new spider.
    :param kwargs: spidername, describe, author, item, spider, job, pycharm
    :return:
    """
    spidername = kwargs.get('spidername')
    describe = kwargs.get('describe')
    author = kwargs.get('author')
    item = kwargs.get('item')
    spider = kwargs.get('spider')
    job = kwargs.get('job')
    pycharm = kwargs.get('pycharm')
    if not spidername:
        raise NameError("spidername must not be empty, please check!")
    info = spider_info(spidername)
    if os.path.exists(info['job_path']):
        print(f"{spidername} already exists!")
        open_in_pycharm(info['job_path'], pycharm)  # open the job file automatically
        return
    for path in info.values():
        if os.path.exists(path):
            raise NameError(f"{path} already exists, please check!")
    replace_map = {
        '${spidername}': spidername,
        '${describe}': describe,
        '${author}': author,
        '${time}': time.strftime("%Y-%m-%d %H:%M", time.localtime()),
    }
    for key, val in replace_map.items():
        item = item.replace(key, val)
        spider = spider.replace(key, val)
        job = job.replace(key, val)
    path_map = {
        info['spider_path']: spider,
        info['item_path']: item,
        info['job_path']: job
    }
    # write the files
    for path, st in path_map.items():
        with open(path, 'w', encoding='utf-8') as f:
            f.write(st)
    pathmsg = '\n'.join(path_map.keys())
    msg = f"Spider created successfully, please go adjust the generated files:\n{pathmsg}"
    print(msg)
    open_in_pycharm(info['job_path'], pycharm)  # open the job file automatically
    # open_in_pycharm(spider_path)  # open the spider file automatically

def patch(**kwargs):
    """
    Create a patch (re-crawl) job.
    :param kwargs: spidername, author, job_patch, pycharm
    :return:
    """
    spidername = kwargs.get('spidername')
    author = kwargs.get('author')
    job_patch = kwargs.get('job_patch')
    pycharm = kwargs.get('pycharm')
    if not spidername:
        raise NameError("spidername must not be empty, please check!")
    job_path = os.path.join(os.getcwd(), 'SP_JOBS', f'{spidername}_job_patch.py')
    if os.path.exists(job_path):
        print(f"{job_path} already exists!")
        open_in_pycharm(job_path, pycharm)  # open the job file automatically
        return
    replace_map = {
        '${spidername}': spidername,
        '${author}': author,
        '${time}': time.strftime("%Y-%m-%d %H:%M", time.localtime()),
    }
    # write the file
    with open(job_path, 'w', encoding='utf-8') as f:
        for key, val in replace_map.items():
            job_patch = job_patch.replace(key, val)
        f.write(job_patch)
    msg = f"Patch job created successfully, please go adjust: {job_path}"
    print(msg)
    open_in_pycharm(job_path, pycharm)  # open the job file automatically
    # open_in_pycharm(spider_path)  # open the spider file automatically

if __name__ == "__main__":
# 【必填】爬虫名称
spidername = ''
# 【可选】爬虫简单描述
describe = ''
# 是否生成补爬job_patch文件, 默认 False
make_job_patch = False
# 删除爬虫的所有代码文件,有时候可能名字没取好,强迫症。。。
# delete_spider(spidername)
# 个人配置
author = 'way'
pycharm = r"C:\Users\Public\Desktop\PyCharm Community Edition 2019.3.1 x64.lnk"
# 若爬虫已存在,则打开对应job文件;若不存在,则自动创建并打开对应job文件
# ---------------------------------------------------------- # 新建爬虫
new(
spidername=spidername,
describe=describe,
author=author,
item=item,
spider=spider,
job=job,
pycharm=pycharm,
)
# ---------------------------------------------------------- # 新建补爬job
if make_job_patch:
patch(
spidername=spidername,
author=author,
job_patch=job_patch,
pycharm=pycharm,
)
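
# Example workflow (a sketch of the assumed usage; the 'demo' name, page count and
# spider count below are illustrative values only):
#   1. set spidername above (e.g. spidername = 'demo') and run this script:
#          python easy_scrapy.py
#      which writes SP/spiders/demo.py, SP/items/demo_items.py and SP_JOBS/demo_job.py
#   2. fill in the url and selectors in the generated files, then start the job:
#          python SP_JOBS/demo_job.py -p 10 -n 2
#      i.e. push 10 list pages of requests to redis and run 2 spiders; pass --onlyjob=1
#      to only push the requests without starting the spiders.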