# Haichuan.py: Scrapy spider for the "Process - Process Technology" board of the
# Haichuan forum (bbs.hcbbs.com).
import scrapy
from scrapy.http import Request

from Bysj_SE.items import HaichuanItem
from Bysj_SE.utils import common_params
from Bysj_SE.utils.haichuan_login_request import haichuan_login  # only needed for the optional login path below
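# `common_params.common_headers` is assumed to hold browser-like request headers
# shared across the project's spiders; a hypothetical shape (the real module may
# differ):
#
#     common_headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
#         "Accept-Language": "zh-CN,zh;q=0.9",
#     }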
# Earlier revisions subclassed CrawlSpider / scrapy-redis's RedisCrawlSpider (with
# a `redis_key` start-URL queue and LinkExtractor rules); the current version is a
# plain scrapy.Spider that paginates manually in parse().
class HaichuanSpider(scrapy.Spider):
    name = 'Haichuan'
    allowed_domains = ['bbs.hcbbs.com']
    start_urls = ['https://bbs.hcbbs.com/forum-337-1.html']
    def start_requests(self):  # spider entry point
        """Request the first page of the target board."""
        url = "https://bbs.hcbbs.com/forum-337-1.html"
        # The board can also be crawled as a logged-in user (credentials redacted):
        # cookies = haichuan_login("<username>", "<password>")
        # yield Request(url=url, headers=common_params.common_headers,
        #               cookies=cookies, callback=self.parse)
        yield Request(url=url, headers=common_params.common_headers, callback=self.parse)
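    # `haichuan_login` (referenced in the commented-out login path above) is assumed
    # to perform the forum login and return a cookie dict that scrapy.Request accepts
    # via its `cookies=` argument; the exact cookie keys depend on the forum's
    # Discuz!-style setup and are not shown here.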
    def parse(self, response):
        """
        Parse one listing page of the "Process - Process Technology" board of the
        Haichuan forum: follow every thread link, then every pagination link.
        """
        # Pagination links at the bottom of the board page
        page_urls = response.xpath('//*[@id="fd_page_bottom"]/div/a')
        # Thread links in the listing table (the title is re-extracted from the
        # thread page itself, so only the href is needed here)
        article_urls = response.xpath(
            '//table[@id="threadlisttableid"]//th/a[re:test(@href, "https://bbs.hcbbs.com/thread-")]')
        for article_url in article_urls:
            href = article_url.xpath('@href').extract_first()
            yield Request(url=href, callback=self.parse_article, headers=common_params.common_headers)
        for page_url in page_urls:
            next_url = page_url.xpath('@href').extract_first()
            yield Request(url=next_url, callback=self.parse)
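    # Note: every listing page re-yields all of its pagination links. Scrapy's
    # default request fingerprinting (RFPDupeFilter) drops the duplicate requests,
    # which is what keeps this mutual recursion from looping forever.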
    def parse_article(self, response):
        """Extract title, body, post time, view count and reply count from one thread page."""
        href = response.url
        title = response.xpath('//*[@id="thread_subject"]/text()').extract_first()
        # Post body: try the bare text nodes of the first .t_f block, then fall
        # back to text wrapped in <font>/<div> children for styled posts.
        article = response.xpath('//*[@class="t_f"]/text()').extract_first()
        if article is not None:
            article = article.strip().replace(' ', '').replace('\n', '').replace('\r', '')
        else:
            article = ""
        if article == "":
            str_list = response.xpath('//*[@class="t_f"]//font/text()').extract()
            str_list.extend(response.xpath('//*[@class="t_f"]/div/text()').extract())
            article = "".join(str_list)
        # Post time: some page variants put it in a <span title="...">, others as
        # plain text inside the <em>.
        # TODO: move the datetime parsing/formatting into items.py later.
        temp = response.xpath('//*[@id="postlist"]/div[1]//div[@class="authi"]/em/span').extract_first()
        if temp is None:
            date_time_str = response.xpath('//*[@id="postlist"]/div[1]//div[@class="authi"]/em/text()').extract_first()
        else:
            date_time_str = response.xpath('//*[@id="postlist"]/div[1]//div[@class="authi"]/em/span/@title').extract_first()
        # Intended normalization, kept for the TODO above (requires `import re`):
        # tu_date_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", date_time_str).groups()[0]
        tu_date_time = date_time_str
        # View count and reply count from the thread statistics cell
        check = response.xpath('//div[@id="postlist"]/table//td[1]//span[2]/text()').extract_first()
        reply = response.xpath('//div[@id="postlist"]/table//td[1]//span[5]/text()').extract_first()
        yield HaichuanItem(title=title, href=href, article=article,
                           date_time=tu_date_time, check=check, reply=reply)
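
# For reference, a minimal sketch of what Bysj_SE/items.py presumably declares,
# inferred only from the HaichuanItem(...) call above; the real module may differ
# (input/output processors, extra fields, etc.):
#
#     import scrapy
#
#     class HaichuanItem(scrapy.Item):
#         title = scrapy.Field()
#         href = scrapy.Field()
#         article = scrapy.Field()
#         date_time = scrapy.Field()
#         check = scrapy.Field()
#         reply = scrapy.Field()
#
# Run the spider from the project root and dump items to JSON lines with:
#
#     scrapy crawl Haichuan -o haichuan.jl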