-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler1.py
executable file
·51 lines (42 loc) · 1.3 KB
/
crawler1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
# encoding: utf-8
import os
import re
import requests
import time
from bs4 import BeautifulSoup
############################################### Get the current system time
def GetNowTime():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current = time.localtime()
    return time.strftime("%Y-%m-%d %H:%M:%S", current)
def getid(path='crawler_file.txt'):
    """Read the identifiers to crawl, one per line, from a text file.

    Args:
        path: File to read. Defaults to ``crawler_file.txt`` in the current
            working directory, which is the file this function has always
            read.

    Returns:
        A list with one entry per non-blank line, in file order, each with
        leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Plain read mode: the file is never written here, and 'r+' would fail
    # on a read-only file. 'utf-8-sig' drops a leading BOM if one is present.
    with open(path, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines: an empty ID would become a request with no term.
        return [entry for entry in stripped if entry]
def geturl(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up. Without it a stalled server would block the
            crawl indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses so an error page is never mistaken for
    # a real record by the caller.
    response.raise_for_status()
    return response.text
def main():
    """Download the MEDLINE text record for every ID and append it to a file.

    IDs come from ``getid()``. Each record is fetched with ``geturl()`` and
    appended to ``Medline_BPA.txt`` followed by a blank line. To stay gentle
    on the server the loop pauses for ten minutes after every 800 IDs and
    after any failed fetch or write; a failed ID is reported and skipped.
    """
    batch_size = 800      # pause once per this many IDs
    pause_seconds = 600   # length of each pause
    output_path = 'Medline_BPA.txt'

    # Counter starts at 100000 (a multiple of batch_size), so the first
    # throttle pause happens on the 800th ID.
    num_article = 100000
    for article_id in getid():
        # Each ID is counted exactly once, success or failure, so the
        # throttle check below can never be stepped over.
        num_article += 1
        if num_article % batch_size == 0:
            time.sleep(pause_seconds)
            print(GetNowTime())

        url = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=%s&report=medline&format=text' % article_id
        try:
            text = geturl(url)
            # 'with' guarantees the handle is closed even if a write fails.
            with open(output_path, 'a', encoding='utf-8-sig') as out:
                out.write(text)
                out.write('\n\n')
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C and
            # SystemExit still stop the crawl instead of triggering a sleep.
            print('Failed to fetch %s: %s' % (url, err))
            time.sleep(pause_seconds)
            print(GetNowTime())
        else:
            print('Succeed to write %s' % url)
            print(GetNowTime())


if __name__ == '__main__':
    main()