-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpubmed_grab.py
118 lines (83 loc) · 3.13 KB
/
pubmed_grab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from __future__ import print_function
from Bio import Entrez
from Bio import Medline
from Bio.Entrez import efetch, read
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import smtplib
import base64
from pandas import * # pretty printing 2d lists (a.k.a the id/abstract pairs)
# Task List for 7/13/2017
# add support for date-wise data grabs (from the previous week)
# add title column in email w/ instances of query highlighted
# add column w/ link to abstract
# separate results into two tables - one w/ query present in title, one w/out
fromaddr = "[email protected]"
toaddr = "[email protected]"
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['To'] = toaddr
Entrez.email = '[email protected]' # let NCBI know who you are
HEADER = '''
<html>
<head>
</head>
<body>
'''
FOOTER = '''
</body>
</html>
'''
def make_2d_list(row, col):
a = []
for row in xrange(row): a += [[0]*col]
return a
def search_by_string(query, max_res):
handle = Entrez.esearch(db="pubmed", term=query, retmax = max_res)
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]
handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",retmode="text")
records = Medline.parse(handle)
records = list(records) # makes it much easier, trust me
msg['Subject'] = query + "Query Results" # add today's date to the subject later
final_list = list()
# https://www.ncbi.nlm.nih.gov/pubmed/?term=PMID
for record in records:
final_list.append([record.get("PMID", "?"), record.get("TI", "?"), "https://www.ncbi.nlm.nih.gov/pubmed/?term=" +
record.get("PMID", "?")])
# print("PMID:", record.get("PMID", "?"))
# print("Abstract:", record.get("AB", "?"))
# print("") # order the abstracts by number of occurrences (add more metrics later)
print(DataFrame(final_list)) # this is really nice for data viz on the matrix
pandas.set_option('display.max_colwidth', -1)
df = pandas.DataFrame(final_list)
column_names = ["PMID", "Title", "Link to Abstract"]
df.columns = column_names
df = df.replace({query: '<b>' + query + '</b'}, regex=True)
with open('test.html', 'w') as f:
f.write(HEADER)
f.write(df.to_html(classes='df'))
f.write(FOOTER)
filename = 'test.html'
f = file(filename)
attachment = MIMEText(f.read(), 'html')
msg.attach(attachment)
server = smtplib.SMTP('smtp.gmail.com', 587) # I'm pretty sure 587 is the port?
server.ehlo()
server.starttls()
server.ehlo()
server.login("trivneel211", "inteli511") # i think this is pointless tbh but whatever
text = msg.as_string()
server.sendmail(fromaddr, toaddr, text)
def fetch_abstract(pmid): # not really being used at all, just a ref. function
handle = efetch(db='pubmed', id=pmid, retmode='xml')
xml_data = Entrez.read(handle)
print(xml_data)
try:
article = xml_data['MedlineCitation']['Article']
abstract = article['Abstract']
return abstract
except IndexError:
return None
search_by_string("BRAF", 500)