-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelpers.py
117 lines (93 loc) · 3.47 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import pandas as pd
from Bio import Entrez
# Contact e-mail configured on Biopython's Entrez module for the lookups below.
# NOTE(review): the value looks like a redacted placeholder rather than a real
# address -- confirm a valid contact address is set before running queries.
Entrez.email = '[email protected]'
def pmids_for_query(query, retmax=10000):
    """
    Return the PMIDs resulting from a PubMed query.

    Parameters
    ----------
    query : str
        Search term, passed to ``Entrez.esearch`` as ``term``.
    retmax : int, optional
        Maximum number of PMIDs to request. Defaults to 10000, the value
        that was previously hard-coded.

    Returns
    -------
    The ``'IdList'`` entry of the parsed esearch response.
    """
    handle = Entrez.esearch(db='pubmed', retmax=retmax, retmode='xml', term=query)
    try:
        search_results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing raises.
        handle.close()
    return search_results['IdList']
# Column order of the DataFrame returned by pubmed_articles_for_query.
_ARTICLE_COLUMNS = ('PMID', 'Journal', 'Title', 'Abstract', 'Year', 'doi', 'Types')

# Lookup failures that mean "this optional field is absent from the record".
_MISSING_FIELD_ERRORS = (KeyError, IndexError, TypeError)


def _extract_abstract(citation):
    """Return the first AbstractText entry of a citation, or None if absent."""
    # NOTE(review): only element [0] is kept, as before; if an abstract is
    # split into several AbstractText entries the rest are dropped -- confirm
    # this is intended.
    try:
        return citation['Article']['Abstract']['AbstractText'][0]
    except _MISSING_FIELD_ERRORS:
        return None


def _extract_year(citation):
    """Return the year as an int from PubDate, ArticleDate or DateRevised, else None."""
    lookups = (
        lambda c: c["Article"]["Journal"]["JournalIssue"]["PubDate"]["Year"],
        lambda c: c["Article"]["ArticleDate"][0]["Year"],
        lambda c: c["DateRevised"]["Year"],
    )
    for lookup in lookups:
        try:
            year = lookup(citation)
        except _MISSING_FIELD_ERRORS:
            continue
        # A non-numeric year still raises ValueError, as astype(int) did.
        return int(year)
    return None


def _extract_doi(article):
    """Return the ArticleIdList entry whose IdType is "doi" (last one wins), else None."""
    doi = None
    for entry in article['PubmedData']['ArticleIdList']:
        if entry.attributes["IdType"] == "doi":
            doi = entry
    return doi


def _extract_types(citation):
    """Return publication types as strings, skipping "Research Support" ones; None if absent."""
    try:
        names = [str(t) for t in citation['Article']['PublicationTypeList']]
    except _MISSING_FIELD_ERRORS:
        return None
    return [name for name in names if "Research Support" not in name]


def _extract_pmid(citation, position, pmids):
    """Return the record's own PMID, falling back to the searched ID at `position`."""
    try:
        return str(citation['PMID'])
    except _MISSING_FIELD_ERRORS:
        return pmids[position] if position < len(pmids) else None


def _article_record(article, position, pmids):
    """Flatten one fetched article into a dict keyed by _ARTICLE_COLUMNS."""
    # MedlineCitation contains most of the data of interest.
    citation = article['MedlineCitation']
    return {
        'PMID': _extract_pmid(citation, position, pmids),
        'Journal': citation['Article']['Journal']['Title'],
        'Title': citation['Article']['ArticleTitle'],
        'Abstract': _extract_abstract(citation),
        'Year': _extract_year(citation),
        'doi': _extract_doi(article),
        'Types': _extract_types(citation),
    }


def pubmed_articles_for_query(query):
    """
    Return a dataframe of articles resulting from a pubmed query.

    The PMIDs matching `query` are looked up with `pmids_for_query`, the
    corresponding records are fetched with ``Entrez.efetch`` and each entry
    of the response's ``'PubmedArticle'`` list becomes one row.

    Parameters
    ----------
    query : str
        PubMed search term.

    Returns
    -------
    pandas.DataFrame
        Columns ``PMID``, ``Journal``, ``Title``, ``Abstract``, ``Year``,
        ``doi`` and ``Types``. ``Year`` is an integer column; when some
        record has no year it is a nullable ``Int64`` column instead of
        raising. A query without results yields an empty frame with the
        same columns.

    Raises
    ------
    KeyError
        If a record lacks a mandatory field (journal title, article title
        or the ``PubmedData`` id list).
    """
    pmids = pmids_for_query(query)

    if pmids:
        handle = Entrez.efetch(db='pubmed', retmode='xml', id=pmids)
        try:
            fetched = Entrez.read(handle)
        finally:
            # Release the underlying connection even if parsing raises.
            handle.close()
        articles = fetched['PubmedArticle']
    else:
        # Nothing matched: skip the fetch instead of requesting zero ids.
        articles = []

    records = [
        _article_record(article, position, pmids)
        for position, article in enumerate(articles)
    ]
    df = pd.DataFrame(records, columns=list(_ARTICLE_COLUMNS))

    years = pd.Series([rec['Year'] for rec in records], index=df.index, dtype=object)
    if years.isna().any():
        # A plain int column cannot hold missing values; use the nullable dtype.
        df['Year'] = years.astype('Int64')
    else:
        df['Year'] = years.astype(int)
    return df
def avg_articles_per_year_last5(query, years=(2017, 2018, 2019, 2020, 2021)):
    """
    Return the average number of articles per year for `query`.

    The query is restricted to publication dates between the smallest and
    largest entry of `years` (an explicit ``min:max[dp]`` range), so the
    result no longer depends on when the function is run. The matching
    articles are then counted per year and the counts averaged.

    Parameters
    ----------
    query : str
        PubMed search term.
    years : iterable of int, optional
        Years to count. Defaults to 2017-2021, the years that were
        previously hard-coded.

    Returns
    -------
    float
        Mean of the per-year article counts, or ``np.nan`` if `years` is
        empty or the lookup fails. On failure the query and the error are
        printed; on success the mean is printed.
    """
    years = [int(year) for year in years]
    if not years:
        return np.nan

    # Explicit date-of-publication range covering exactly the counted years.
    query += ' AND %d:%d[dp]' % (min(years), max(years))
    try:
        data = pubmed_articles_for_query(query)
        counts = [int((data['Year'] == year).sum()) for year in years]
        mean_count = np.mean(counts)
        print(mean_count)
        return mean_count
    except Exception as e:
        # Best-effort helper: report the failing query and signal it with NaN.
        print('failed on: %s'%query)
        print(e)
        return np.nan