-
Notifications
You must be signed in to change notification settings - Fork 61
/
Copy pathtwitter-media-scraper.py
123 lines (105 loc) · 4.32 KB
/
twitter-media-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import winreg, traceback, requests, re, os, time, json
# Script version shown at startup.
version = '1.1'
# Requests proxy mapping; populated from the Windows registry by get_proxy().
proxy = None
# HTTP headers for Twitter API calls; populated by set_header().
headers = {}
# Endpoint that issues a guest token for unauthenticated API access.
host_url = 'https://api.twitter.com/1.1/guest/activate.json'
# Conversation timeline endpoint; '{}' is filled with the tweet (status) id.
api_url = 'https://api.twitter.com/2/timeline/conversation/{}.json?include_entities=false&include_user_entities=false&tweet_mode=extended'
# Hard-coded Bearer token sent with every API request.
# NOTE(review): presumably Twitter's public guest-session token — confirm it is still valid.
authorization = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
# Extracts the numeric status id from a tweet URL.
p_tw_link = re.compile(r'status/(\d+)')
# Extracts pbs.twimg.com media URLs from the stringified tweet JSON.
p_media_link = re.compile(r"(https://pbs.twimg.com/media/.+?)'")
# Shared HTTP session reused for all requests.
s = requests.Session()
def get_proxy():
    """Read the Windows system proxy settings from the registry.

    Populates the module-level ``proxy`` dict (passed to every request)
    when a proxy is enabled; leaves it as ``None`` otherwise.
    """
    global proxy
    # 'with' closes the registry handle deterministically instead of
    # relying on garbage collection.
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER,
                        r"SOFTWARE\Microsoft\Windows\CurrentVersion\Internet Settings") as key:
        proxy_enable, _ = winreg.QueryValueEx(key, "ProxyEnable")
        if proxy_enable:
            proxy_server, _ = winreg.QueryValueEx(key, "ProxyServer")
            # Both http and https traffic must go through the proxy via an
            # 'http://' proxy URL: an 'https://' scheme makes requests try
            # TLS to the proxy itself, which Windows system proxies
            # (plain HTTP CONNECT proxies) do not speak.
            proxy = {'http': 'http://' + proxy_server,
                     'https': 'http://' + proxy_server}
def set_header():
    """Fetch a guest token from Twitter and build the shared request headers.

    On failure, prints a report, waits for the user, and exits the script.
    """
    global headers
    headers = {'authorization': authorization}
    token_data = s.post(host_url, proxies=proxy, headers=headers).json()
    if 'guest_token' not in token_data:
        # No token in the response: report and bail out.
        print("guest_token获取失败, 请前往issue页反馈:\nhttps://github.com/mengzonefire/twitter-media-scraper/issues")
        input("\n按回车键退出程序\n")
        exit()
    headers = {'authorization': authorization,
               'x-guest-token': token_data['guest_token']}
def get_media_link(page_id):
    """Fetch tweet *page_id* and return its pbs.twimg.com media URLs.

    Returns a (possibly empty) list of links, or the string 'error' when
    the tweet is gone or the API response is unexpected.
    """
    raw = s.get(api_url.format(page_id), proxies=proxy, headers=headers).text
    if '"{}":'.format(page_id) not in raw:
        if 'Sorry, that page does not exist' in raw:
            print('提取失败: 该条推特已删除')
        else:
            # Unexpected payload: keep it on disk for debugging.
            print('提取失败: 接口访问错误, 请检查log文件, 并前往issue页反馈:\nhttps://github.com/mengzonefire/twitter-media-scraper/issues')
            write_log(page_id, raw)
        return 'error'
    tweet = json.loads(raw)['globalObjects']['tweets'][page_id]
    # Media URLs are scraped out of the repr of the tweet dict.
    return p_media_link.findall(str(tweet))
def download_media(links):
    """Download each media link into ./download/ at original resolution."""
    for url in links:
        name = url.replace('https://pbs.twimg.com/media/', '')
        print('正在下载: ' + name)
        # '?name=orig' requests the full-resolution variant.
        resp = s.get(url + '?name=orig', proxies=proxy, stream=True)
        with open('./download/' + name, 'wb') as out:
            for piece in resp.iter_content(chunk_size=1024):
                out.write(piece)
        # Brief pause between downloads to stay gentle on the CDN.
        time.sleep(1)
def start_crawl():
    """Interactively read tweet URLs, then extract and download their media.

    Accepts full twitter.com links and t.co short links; input ends on a
    blank line. Offers to run again when the batch is done.
    """
    print('输入链接(支持批量,一行一条,双击回车确认):')
    urls = []
    while True:
        line = input()
        if not line:
            break
        if '//t.co/' in line or '//twitter.com/' in line:
            urls.append(line)
    for url in urls:
        if '//t.co/' in url:
            # Short links must be resolved first; the status id lives in
            # the redirect target's HTML.
            match = p_tw_link.search(s.get(url, proxies=proxy).text)
            if not match:
                print('提取失败: 该条推特已删除')
                continue
            tweet_id = match.group(1)
        else:
            found = p_tw_link.findall(url)
            if not found:
                continue
            tweet_id = found[0]
        print('开始提取: ' + url)
        links = get_media_link(tweet_id)
        if not links:
            print('提取失败: 该条推特不包含媒体内容')
        elif links != 'error':
            download_media(links)
    if input('回车键退出, 输入任意内容继续提取\n'):
        start_crawl()
def write_log(page_id, page_content):
    """Save the raw API response for tweet *page_id* to ./log/<page_id>.txt.

    Used for post-mortem debugging when the conversation API returns an
    unexpected payload. Overwrites any previous log for the same id.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs('log', exist_ok=True)
    with open('./log/{}.txt'.format(page_id), 'w', encoding='utf-8') as f:
        f.write(page_content)
def main():
    """Entry point: prepare the download dir, set up proxy/auth, then crawl."""
    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs('download', exist_ok=True)
    get_proxy()
    set_header()
    start_crawl()
if __name__ == '__main__':
    print('version: {}'.format(version))
    # Retry loop: the original one-shot retry called main() inside the
    # except handler, so a second exception escaped uncaught. Looping
    # keeps every run protected by the same handler.
    while True:
        try:
            main()
            break  # normal completion
        except Exception as e:
            if 'WinError 10060' in str(e):
                # Typical symptom of an unreachable/misconfigured proxy.
                print('连接twitter.com超时,请检查系统代理')
            else:
                traceback.print_exc()
                print(e)
            if not input('回车键退出, 输入任意内容重置脚本\n'):
                break