forked from Geralt-TYH/obsidian-zhihu-crawler
gui.py
#!/usr/bin/python3
import os
import random
import re
import sys
import time
from tkinter import *
from tkinter.scrolledtext import ScrolledText

from tqdm import tqdm

from main import (
    get_article_urls_in_collection,
    get_single_post_content,
    get_single_answer_content,
    markdownify,
)
from utils import filter_title_str
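
# Simple Tkinter front end for the Obsidian Zhihu crawler: the user pastes a
# Zhihu collection URL, and every answer/column post in that collection is
# saved as a Markdown file under ~/Downloads/剪藏 ("Clippings"), using the
# helpers imported from main.py and utils.py.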
class myStdout():  # Redirects stdout/stderr into the ScrolledText widget
    def __init__(self):
        # Keep backups of the original streams so they can be restored later
        self.stdoutbak = sys.stdout
        self.stderrbak = sys.stderr
        # Redirect both streams to this object
        sys.stdout = self
        sys.stderr = self

    def flush(self):
        pass

    def write(self, info):
        # "info" is whatever sys.stdout / sys.stderr receive (print already
        # sends its own newline, so none is added here)
        contents.insert(END, info)  # append at the end of the text widget
        contents.update()           # force a refresh so the new text shows immediately
        contents.see(END)           # keep the last line visible when the text overflows

    def restoreStd(self):
        # Restore the original standard streams
        sys.stdout = self.stdoutbak
        sys.stderr = self.stderrbak
def validate_url(url):
    # Accept zhihu.com URLs, with or without a scheme or "www." prefix
    url_regex = r'^((http|https):\/\/)?(www\.)?zhihu\.com.*'
    return re.match(url_regex, url) is not None
def clip():
    collection_url = entry_url.get()
    if not validate_url(collection_url):
        print("非知乎网址,其他形式不支持")  # "Not a Zhihu URL; other formats are not supported"
        return
    collection_id = collection_url.split('?')[0].split('/')[-1]
    urls, titles = get_article_urls_in_collection(collection_id)
    downloadDir = os.path.join(os.path.expanduser("~"), "Downloads", "剪藏")
    os.makedirs(downloadDir, exist_ok=True)
    for i in tqdm(range(len(urls))):
        # Pause between requests to avoid hammering Zhihu
        time.sleep(random.randint(1, 5))
        url = urls[i]
        title = titles[i]
        # Column posts ("zhuanlan") and answers need different scrapers
        if 'zhuanlan' in url:
            content = get_single_post_content(url)
        else:
            content = get_single_answer_content(url)
        md = markdownify(content, heading_style="ATX")
        with open(os.path.join(downloadDir, filter_title_str(title) + ".md"), "w", encoding='utf-8') as md_file:
            md_file.write(md)
    print("全部下载完毕")  # "All downloads finished"
mystd = myStdout()  # redirect stdout/stderr so all output lands in the GUI log

root = Tk()
root.geometry("400x450")
root.title("zhihu crawler")

frame = Frame(root)
frame.pack()

# Log area: shows progress messages and tqdm output
contents = ScrolledText(root)
contents.pack(side=BOTTOM, expand=True, fill=BOTH)

# URL input box (Entry has no "text" option, so none is passed)
entry_url = Entry(root)
entry_url.pack(side=LEFT, expand=True, fill=X)

btn = Button(root, text='开始剪藏', command=clip)  # "Start clipping"
btn.pack(side=RIGHT)

# Tkinter main event loop
root.mainloop()

# Restore the original stdout/stderr once the window is closed
mystd.restoreStd()
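
# A minimal headless sketch of the same clipping flow (assuming main.py keeps
# the signatures imported above), handy for testing without the Tkinter window:
#
#   urls, titles = get_article_urls_in_collection("<collection_id>")
#   html = get_single_post_content(urls[0])
#   print(markdownify(html, heading_style="ATX"))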