forked from mokeyjay/Yandere-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.py
287 lines (259 loc) · 11.4 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/env python3
import time
import random
import threading
import Http
import Yandere
import Function
import concurrent.futures
import threading
def switch_convert(status):
    """Map a yes/no answer to the integer flag 1 or 0.

    Only the exact strings ``'y'`` and ``'Y'`` count as "yes". Every other
    input, including the empty string produced by just pressing Enter,
    counts as "no". Integers rather than booleans are returned on purpose;
    callers store the result as a 1/0 setting.
    """
    return 1 if status in ('y', 'Y') else 0
def compare(width, height):
    """Classify an image by orientation.

    Returns 1 when the image is wider than tall (landscape), 2 when it is
    taller than wide (portrait) and 3 otherwise (square). The codes line up
    with the non-zero values of the ``pic_type`` setting, so the result can
    be compared against it directly.
    """
    if width > height:
        return 1
    if width < height:
        return 2
    return 3
def input_settings(settings: dict):
    """Interactively prompt for crawl settings and store them in *settings*.

    The dict is updated in place and also returned. The size limits under
    ``settings['pic_size']`` are only overwritten when the user chooses to
    reset them; otherwise the values already present (loaded by the caller)
    are kept. ``settings['file_size']`` is only set when the file-size limit
    is enabled, and is stored in bytes.

    Raises:
        ValueError: if a numeric prompt is answered with a non-numeric value.
    """
    settings['start_page'] = int(input('开始页码:'))
    settings['stop_page'] = int(input('停止页码,为0时爬取至上次终止图片,非0时爬完此页即停止:'))
    settings['pic_type'] = int(input('图片比例,0=全部 1=横图 2=竖图 3=正方形:'))
    if switch_convert(input('选择是否重设图片尺寸限制条件。若重设,输入为0则不限制下载尺寸;若不重设,将从配置文件读取上次条件 (y/n)')):
        minimum = settings['pic_size']['min']
        maximum = settings['pic_size']['max']
        minimum['width'] = int(input('最小宽度:'))
        minimum['height'] = int(input('最小高度:'))
        # Aspect ratios are fractional by nature (e.g. 1.5 or 0.75); parsing
        # them with int() would reject such input with a ValueError.
        minimum['proportion'] = float(input('最小宽高比:'))
        maximum['width'] = int(input('最大宽度:'))
        maximum['height'] = int(input('最大高度:'))
        maximum['proportion'] = float(input('最大宽高比:'))
    settings['file_size_limit'] = switch_convert(input('限制图片体积?(y/n)'))
    if settings['file_size_limit']:
        # Accept fractional megabytes (e.g. 2.5) and store whole bytes.
        settings['file_size'] = int(float(input('最大文件体积,单位兆字节(MB):')) * 1048576)
    settings['folder_path'] = input('保存路径:')
    settings['tag_search'] = switch_convert(input('启用tag搜索? (y/n)'))
    settings['random_delay'] = switch_convert(input('是否启用下载延迟?(y/n)'))
    settings['safe_mode'] = switch_convert(input('是否过滤NSFW内容? (y/n)'))
    settings['status_check'] = switch_convert(input('不下载待审核图片?(y/n) ※ "待审核"状态多由低质量触发'))
    return settings
def judge(post, settings, discard_tags):
    """Decide whether *post* passes every configured download filter.

    Filters are applied in order: moderation status, rating, excluded tags,
    file size, orientation, aspect ratio and pixel dimensions. A limit of 0
    in ``settings['pic_size']['max']`` means "no upper limit".

    Args:
        post: mapping describing one post; the keys read here are ``id``,
            ``status``, ``flag_detail``, ``rating``, ``tags``, ``file_size``,
            ``preview_width``, ``preview_height``, ``width`` and ``height``.
            ``id`` is concatenated with strings, so it must already be a str.
        settings: the crawl settings mapping.
        discard_tags: iterable of tag names that disqualify a post.

    Returns:
        True when the post should be downloaded. False as soon as one filter
        rejects it; every rejection except the rating filter is logged.
    """
    # Moderation status. Only 'pending' is excluded (rather than requiring
    # 'active') because other status values exist.
    if settings['status_check'] and post['status'] == 'pending':
        # Do not assume flag details are present on every pending post.
        flag_detail = post.get('flag_detail') or {}
        add_log('{} is {},跳过。原因:{}'.format(post['id'], post['status'], flag_detail.get('reason')))
        return False

    # Rating: safe mode drops posts rated 'e'.
    if settings['safe_mode'] and post['rating'] == 'e':
        return False

    # Excluded tags. Empty entries (left over from splitting an empty answer)
    # are ignored so they can never match by accident.
    if settings['tag_search']:
        excluded = {tag for tag in discard_tags if tag}
        if excluded.intersection(post['tags'].strip(' ').split(' ')):
            add_log(post['id'] + ' 包含待排除tags,跳过')
            return False

    # File size.
    if settings['file_size_limit'] and post['file_size'] > settings['file_size']:
        add_log(post['id'] + ' 超过体积限制,跳过')
        return False

    # Orientation. The preview is a scaled-down copy, so judging by its
    # dimensions leaves a little slack compared with the original image.
    if settings['pic_type'] and settings['pic_type'] != compare(post['preview_width'], post['preview_height']):
        add_log(post['id'] + ' 比例不符,跳过')
        return False

    # Aspect ratio.
    proportion = post['preview_width'] / post['preview_height']
    pic_size = settings['pic_size']
    lowest = pic_size['min']
    highest = pic_size['max']
    if proportion < lowest['proportion'] or (highest['proportion'] and proportion > highest['proportion']):
        add_log(post['id'] + ' 宽高比不符,跳过')
        # The rejection used to be logged only; without this return the post
        # was downloaded anyway.
        return False

    # Pixel dimensions.
    width = post['width']
    height = post['height']
    if width < lowest['width'] or height < lowest['height']:
        add_log(post['id'] + ' 小于最小尺寸要求,跳过')
        return False
    if (highest['width'] and width > highest['width']) or (highest['height'] and height > highest['height']):
        add_log(post['id'] + ' 大于最大尺寸限制,跳过')
        return False

    # Every filter passed.
    return True
def download(post):
    """Download the file of *post* into the current run's output folder.

    The target file name is derived from the post's ``file_url``. If a file
    with that name is already present the download is skipped.

    Known limitation (from the original author): the file name depends on
    the post's tags, so if the tags were edited on the site between two runs
    the existence check misses and the same image is written again under a
    new name. Only the "crawl until the last stop position" mode is immune.

    Args:
        post: mapping with ``id`` (str), ``file_url``, ``file_size`` (bytes)
            and ``file_ext``.

    Returns:
        True when the file already existed and was skipped, otherwise None.
    """
    # Derive the local file name from the decoded URL.
    file_name = Function.rename(Http.decode(post['file_url']))
    if Function.exists(folder_path, file_name):
        add_log(post['id'] + ' 已存在,跳过')
        return True
    add_log('{} 开始下载p{} 大小{}M 类型{}'.format(time.strftime('%H:%M:%S'), post['id'], "%.2f" % (post['file_size'] / 1048576), post['file_ext']))
    # perf_counter() is monotonic, so the elapsed time cannot go negative when
    # the system clock is adjusted during the download.
    started = time.perf_counter()
    img = Http.get(post['file_url'], {'Host': 'files.yande.re', 'Referer': 'https://yande.re/post/show/' + post['id']})
    cost_time = time.perf_counter() - started
    # Clamp the divisor so a zero-length measurement cannot raise
    # ZeroDivisionError and abort before the image is written.
    speed = post['file_size'] / 1024 / max(cost_time, 1e-6)
    add_log('{}下载完毕,耗时{}s,平均速度{}k/s'.format(post['id'], "%.2f" % cost_time, "%.2f" % speed))
    Function.write(folder_path, file_name, img)
def add_log(content):
    """Show one log line to the user and append it to the run's log file.

    When the module-level ``mode`` is truthy the line is printed to the
    terminal; otherwise it is appended to the ``container`` widget, which is
    then scrolled to its end. In both cases the line is also appended to the
    log file straight away: the script has no error handling, so buffering
    log lines would lose them on a crash.
    """
    global mode, container, log_file_name, folder_path
    if not mode:
        # UI output.
        container.insert('end', content + '\n')
        container.see('end')
    else:
        # Terminal output.
        print(content)
    Function.add(folder_path, log_file_name, content + '\n')
def main(settings: dict, tags: str, discard_tags: str, output_container, output_mode: str):
    """Run one crawl: set up shared state, start the producer, consume posts.

    Args:
        settings: crawl settings mapping; ``settings['folder_path']`` is the
            base output directory.
        tags: search tags, passed through to the page fetcher.
        discard_tags: tags that disqualify a post, passed through to the
            filter.
        output_container: widget that receives log lines when *output_mode*
            is falsy.
        output_mode: truthy to log to the terminal, falsy to log to
            *output_container*. Only its truthiness is used, regardless of
            the annotation.
    """
    global end
    global data
    global mode
    global container
    global log_file_name
    global folder_path
    global lock
    # Shared state read by the producer thread, the consumer and the workers.
    end = False
    data = []
    mode = output_mode
    container = output_container
    log_file_name = 'log_{}.txt'.format(time.strftime('%H-%M-%S'))
    # Each run writes into a sub-folder named after today's date.
    folder_path = settings['folder_path'] + '/' + time.strftime('%Y%m%d')
    lock = threading.Condition()
    Function.create_folder(folder_path)
    # The producer thread starts itself on construction.
    get_data(settings, tags)
    # parallel_task() is a plain function that blocks until every queued post
    # has been handled and returns None, so there is nothing to join();
    # calling .join() on its result raised AttributeError.
    parallel_task(settings, discard_tags)
# NOTE (original author): this could also be done without the lock.
# Producer thread: fetches listing pages and appends their posts to the
# shared `data` queue.
class get_data(threading.Thread):
    """Producer thread that fills the shared ``data`` list page by page.

    A new page is fetched whenever fewer than 10 posts are queued. The thread
    stops when the module-level ``end`` flag is set, when a fetched page is
    empty, or when ``settings['stop_page']`` is non-zero and has been passed.
    It is a daemon thread and starts itself from ``__init__``.
    """

    def __init__(self, settings, tags):
        threading.Thread.__init__(self)
        self.daemon = True
        self.settings = settings
        self.tags = tags
        # Constructing an instance is enough to start fetching.
        self.start()

    def run(self):
        global end
        settings = self.settings
        tag_on = settings['tag_search']
        page = settings['start_page']
        stop_page = settings['stop_page']
        # Tag searches and plain crawls remember their stop position separately.
        id_key = 'tagSearch_last_stop_id' if tag_on else 'last_stop_id'
        last_stop_id = settings[id_key]
        tags = self.tags
        while True:
            # `with` releases the condition's lock on every exit path,
            # including break and continue.
            with lock:
                if end:
                    # Wake a consumer that may be waiting for more posts.
                    lock.notify_all()
                    break
                if len(data) >= 10:
                    # Enough work queued; sleep until the consumer takes some.
                    lock.wait()
                    continue
                if stop_page and page > stop_page:
                    end = True
                    # Without this wake-up a consumer already blocked in
                    # wait() would never see the flag and hang forever.
                    lock.notify_all()
                    break
                add_log('正在读取第{}页……'.format(str(page)))
                origin = Yandere.get_li(Yandere.get_json(page, tag_on, tags))
                if not len(origin):
                    end = True
                    add_log('所有页面读取完毕')
                    lock.notify_all()
                    break
                data.extend(origin)
                if page == settings['start_page']:
                    # Record the first post of the start page as the next
                    # run's stop position. The comparison covers the case
                    # where the start page is not page 1 and its posts are
                    # older than the stored position.
                    post = origin[0]
                    if post['id'] > last_stop_id:
                        settings[id_key] = post['id']
                        Function.write(settings['folder_path'], 'config.json', Yandere.return_json(settings), True)
                        # Restore the in-memory value: this run must still
                        # stop at the previous position.
                        settings[id_key] = last_stop_id
                page += 1
                lock.notify(1)
# Handles a single queued post; run by the thread-pool workers.
def process_task(settings, discard_tags, post):
    """Filter one post and download it when it passes.

    When no stop page is configured and the post's id is not newer than the
    stored last-stop id, the module-level ``end`` flag is set and the post is
    ignored. Otherwise ``post['id']`` is converted to ``str`` in place (the
    filter and the downloader concatenate it with strings), the post is
    judged and, if accepted, downloaded, optionally followed by a random
    pause.

    Args:
        settings: the crawl settings mapping.
        discard_tags: tags that disqualify a post.
        post: mapping describing the post; modified in place.
    """
    global end
    stop_page = settings['stop_page']
    delay_on = settings['random_delay']
    if settings['tag_search']:
        last_stop_id = settings['tagSearch_last_stop_id']
    else:
        last_stop_id = settings['last_stop_id']
    if post['id'] <= last_stop_id and not stop_page:
        # Reached the position where the previous crawl stopped.
        add_log('达到上次爬取终止位置')
        end = True
        return
    post['id'] = str(post['id'])
    try:
        if judge(post, settings, discard_tags):
            download(post)
            if delay_on:
                # Random pause between downloads.
                time.sleep(random.uniform(0.5, 10.0))
    except Exception as exc:
        # This runs inside a pool worker whose Future nobody inspects, so an
        # uncaught exception would disappear without a trace. Log it and let
        # the remaining posts proceed.
        add_log('{} 处理失败,跳过。原因:{!r}'.format(post['id'], exc))
# Consumer: hands queued posts to a thread pool.
def parallel_task(settings, discard_tags, max_workers=4):
    """Drain the shared ``data`` queue through a pool of worker threads.

    Blocks until the queue is empty and the module-level ``end`` flag is set,
    then waits for the submitted tasks to finish (the executor's context
    manager does that on exit).

    Args:
        settings: the crawl settings mapping, passed to each task.
        discard_tags: tags that disqualify a post, passed to each task.
        max_workers: upper bound on concurrently running tasks.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        while True:
            # `with` pairs every acquire with exactly one release. The old
            # acquire()/continue path skipped release(), so the condition's
            # re-entrant lock was re-acquired per item and never fully
            # released outside wait().
            with lock:
                if not data:
                    if end:
                        break
                    # Bounded wait: `end` can be set without a notification,
                    # so re-check it periodically instead of sleeping forever.
                    lock.wait(1.0)
                    continue
                post = data.pop(0)
                # Tell the producer that there is room for more posts.
                lock.notify(1)
            # Brief pacing between submissions, done outside the lock so the
            # producer is not held up by it.
            time.sleep(0.002)
            executor.submit(process_task, settings, discard_tags, post)
if __name__ == "__main__":
    # Load the stored settings; the user may replace them interactively.
    settings = Yandere.get_li(Function.read('config.json'))
    reuse_previous = switch_convert(input('使用上次设置? (y/n)'))
    if not reuse_previous:
        input_settings(settings)

    if not settings['tag_search']:
        tags = ''
        discard_tags = ''
    else:
        tags = input('tag搜索已启用,请输入要搜索的tags,多个tag以空格分隔:')
        discard_tags = input('要排除的tags,多个tag以空格分隔, 不排除则按回车跳过:')
        print('警告:改变tags后,爬取至上次停止图片时停止功能可能失效\n本次爬取图片标签:' + tags + '\n本次排除标签:' + discard_tags)
        # Turn the excluded tags into a list.
        discard_tags = discard_tags.strip(' ').split(' ')
        # Replace the readable space separators with '+': per the original
        # author, urllib cannot handle spaces and raises an error.
        tags = tags.replace(' ', '+')

    # Start crawling with terminal output (no UI container).
    main(settings, tags, discard_tags, '', True)