Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated researcher appyter with more accurate pubmed citation queries #899

Merged
merged 7 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion appyters/Researcher_Summary_Report_Appyter/appyter.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
"name": "Researcher_Summary_Report_Appyter",
"title": "Researcher Summary Report",
"version": "0.0.2",
"version": "0.1.0",
"description": "An appyter to summarize and display various research information output pertaining to the work of a given researcher.",
"image": "thumbnail.png",
"authors": [
Expand Down
4 changes: 3 additions & 1 deletion appyters/Researcher_Summary_Report_Appyter/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ wordcloud
scholarly
kaleido
beautifulsoup4
Pillow==9.5.0
Pillow==9.5.0
google-search-results
parsel
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"{% set researcher_name = StringField(\n",
" name='researcher_name', \n",
" label='Investigator Name', \n",
" default=\"Avi Ma'ayan\", \n",
" default=\"Alan Attie\", \n",
" description='Input full name of PI along with middle names/middle initials followed by spaces as leaving it out may change the results of certain information', \n",
" section='Data_Section1', \n",
" required = True\n",
Expand Down Expand Up @@ -108,6 +108,18 @@
"Entrez.tool = 'Demoscript'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ssl\n",
"ssl._create_default_https_context = ssl._create_unverified_context\n",
"from dotenv import load_dotenv\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -250,6 +262,56 @@
"open_alex_display = getting_information_from_openalex(name_of_researcher_first_and_last, output_folder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%appyter markdown\n",
"## Citations per year for {{researcher_name.raw_value}}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PUBMED CITATION INFORMATION"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Searching for Citation Information from Pubmed for {}\".format(name_of_researcher_first_and_last))\n",
"#Pubmed API calls takes the pubmed_name of Lastname, First Initials\n",
"pmid_citation_dict = query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last)\n",
"\n",
"if pmid_citation_dict != None and len(pmid_citation_dict)> 0:\n",
" display(MyMarkdown(\"## Citation Information (from Pubmed)\"))\n",
" year_keys = list(pmid_citation_dict.keys())\n",
" year_keys.sort()\n",
" pmid_citation_dict = {year:pmid_citation_dict[year] for year in year_keys}\n",
" fig = make_bar_plot(pmid_citation_dict,'Year', \"Citations\", f\"Citations per Year\", \"Sourced from Pubmed\")\n",
" fig_line = make_line_plot(pmid_citation_dict, 'Year', \"Citations\", f\"Cumulative Citations\", \"Sourced from Pubmed\")\n",
" fig.show()\n",
" fig.write_image(output_folder+'citations_bar_pubmed.png')\n",
" figure_counter = display_figure_labels(output_folder, figure_counter, \"Citations that are connected to the publications each year for {}.\".format(name_of_researcher_first_and_last), title = 'citations_bar_pubmed')\n",
"\n",
" fig_line.show()\n",
" fig_line.write_image(output_folder+'citations_line_graph_pubmed.png')\n",
" figure_counter = display_figure_labels(output_folder, figure_counter, \"The cumulative citations that are connected to the publications each year for {}\".format(name_of_researcher_first_and_last), title = 'citations_line_graph_pubmed')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SEMANTIC SCHOLAR CITATION INFORMATION"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -264,8 +326,8 @@
" display(MyMarkdown(\"## Citation Information (from Semantic Scholar)\"))\n",
" display(MyMarkdown(\"The citation information counts is taken from the profile page of the given author name from [Semantic Scholar](https://www.semanticscholar.org/)\"))\n",
"\n",
" fig = make_bar_plot(semantic_citation_dict,'Year', \"Citations\", f\"Citations per Year\", \"Sourced from Semantic Scholar\")\n",
" fig_line = make_line_plot(semantic_citation_dict, 'Year', \"Citations\", f\"Cumulative Citations\", \"Sourced from Semantic Scholar\")\n",
" fig = make_bar_plot(semantic_citation_dict,'Year of Publication', \"Total Times Cited\", f\"Total Current Citations for Publications' Year\", \"Sourced from Semantic Scholar\")\n",
" fig_line = make_line_plot(semantic_citation_dict, 'Year of Publication', \"Total Cited\", f\"Cumulative Citations\", \"Sourced from Semantic Scholar\")\n",
" fig.show()\n",
" fig.write_image(output_folder+'citations_bar_semantic.png')\n",
" figure_counter = display_figure_labels(output_folder, figure_counter, \"Citations that are connected to the publications each year for {}.\".format(name_of_researcher_first_and_last), title = 'citations_bar_semantic')\n",
Expand Down Expand Up @@ -534,7 +596,8 @@
"outputs": [],
"source": [
"GENESHOT_URL = 'https://maayanlab.cloud/geneshot/api/search'\n",
"payload = {\"rif\": \"autorif\", \"term\": pubmed_name}\n",
"geneshot_name = re.sub('[^A-Za-z0-9 ]+', '', pubmed_name)\n",
"payload = {\"rif\": \"autorif\", \"term\": geneshot_name}\n",
"response = requests.post(GENESHOT_URL, json=payload)\n",
"gene_count = {}\n",
"# Setting up the url parameters to go to actual gene shot link\n",
Expand Down Expand Up @@ -753,7 +816,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.9.4"
},
"vscode": {
"interpreter": {
Expand Down
47 changes: 46 additions & 1 deletion appyters/Researcher_Summary_Report_Appyter/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,4 +544,49 @@ def display_summary_text_from_openalex_png(institution = '', interests = [], h_i
plt.axis('off')
plt.show()
return display_list

from Bio import Entrez
# Set the module-level Bio.Entrez `email` and `tool` attributes before any
# Entrez call is made by the functions defined below.
Entrez.email = '[email protected]'
Entrez.tool = 'Demoscript'
# urlencode is used to build the query string of the PubMed search link.
from urllib.parse import urlencode
def _medline_year_and_pmid(medline_record):
    """Return ``(publication_year, pmid)`` parsed from one MEDLINE text record.

    The ``DP`` (date of publication) and ``PMID`` tags are matched at the start
    of a line with any amount of padding around the dash, so both ``DP - 2020``
    and the four-character padded form ``DP  - 2020`` are accepted.

    Returns ``None`` when either field is missing or the date does not start
    with a four-digit year.
    """
    # Local import: the module's top-level import block is not guaranteed to
    # provide ``re``.
    import re

    year_match = re.search(r"^DP[ \t]*-[ \t]*(\d{4})", medline_record, re.MULTILINE)
    pmid_match = re.search(r"^PMID[ \t]*-[ \t]*(\d+)", medline_record, re.MULTILINE)
    if year_match is None or pmid_match is None:
        return None
    return int(year_match.group(1)), pmid_match.group(1)


def _tally_citing_article_years(pmid, year_counts):
    """Add one count to ``year_counts[year]`` per PMC article linked to ``pmid``.

    Uses ``Entrez.elink`` with ``linkname="pubmed_pmc_refs"`` to list the linked
    PMC records, then ``Entrez.esummary`` to read each record's ``PubDate``.
    Records whose ``PubDate`` is missing or does not start with four digits are
    ignored. Counts added before an exception is raised are kept.
    """
    link_handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc_refs")
    try:
        link_sets = Entrez.read(link_handle)
    finally:
        link_handle.close()

    if len(link_sets[0]["LinkSetDb"]) == 0:
        # No linked PMC records for this publication.
        return

    citing_ids = [link['Id'] for link in link_sets[0]["LinkSetDb"][0]["Link"]]
    summary_handle = Entrez.esummary(db="pmc", id=','.join(citing_ids), retmode="xml")
    try:
        for summary in Entrez.parse(summary_handle):
            if 'PubDate' in summary and summary['PubDate'][:4].isdigit():
                year_counts[int(summary['PubDate'][:4])] += 1
    finally:
        summary_handle.close()


def query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last):
    """Count, per year, the PMC articles linked as citing a researcher's papers.

    Args:
        pubmed_name: Search term passed to ``Entrez.esearch`` (and shown in the
            generated PubMed link).
        name_of_researcher_first_and_last: Display name used only in the
            Markdown link text.

    Returns:
        A ``defaultdict(int)`` mapping the publication year of each citing
        article (first four digits of its ``PubDate``) to the number of such
        articles. Empty when the search returns no PMIDs or no citation data
        could be collected.

    Side effects:
        Prints progress messages and displays a Markdown link to the PubMed
        query. Issues one ``elink`` (and possibly one ``esummary``) request per
        publication, so the call can take minutes for prolific authors.
    """
    params = {
        'term': "{}".format(pubmed_name)
    }
    pmid_citation_dict = defaultdict(int)

    # Get the PMIDs of the researcher's publications.
    search_handle = Entrez.esearch(db="PubMed", term=pubmed_name, retmax="5000")
    try:
        search_result = Entrez.read(search_handle)
    finally:
        search_handle.close()
    identifiers = search_result['IdList']
    if not identifiers:
        print("This name does not have any publications to search for citations returned from PubMed.")
        return pmid_citation_dict

    print("This may take over a minute or two.")
    display(MyMarkdown("### Link to [PubMed Query](https://pubmed.ncbi.nlm.nih.gov/?{}) for {}".format(urlencode(params), name_of_researcher_first_and_last)))

    # Fetch all publication records as MEDLINE text; records are separated by
    # blank lines.
    fetch_handle = Entrez.efetch(db="pubmed", id=identifiers, rettype="medline", retmode="text")
    try:
        publications = fetch_handle.read().split("\n\n")
    finally:
        fetch_handle.close()

    failed_lookups = 0
    for pub in publications:
        parsed = _medline_year_and_pmid(pub)
        if parsed is None:
            # Blank chunk or record without a usable DP/PMID field: skip it,
            # as the original implementation did.
            continue
        _, pmid = parsed
        try:
            _tally_citing_article_years(pmid, pmid_citation_dict)
        except Exception:
            # Best-effort: one failed lookup must not abort the whole report,
            # but KeyboardInterrupt/SystemExit are allowed to propagate.
            failed_lookups += 1

    if failed_lookups:
        print("Citation lookup failed for {} publication(s); they were skipped.".format(failed_lookups))
    # NOTE(review): this message is also reached when publications exist but no
    # citing articles were found; wording kept unchanged for compatibility.
    if len(pmid_citation_dict) == 0:
        print("This name does not have any publications to search for citations returned from PubMed.")
    return pmid_citation_dict
Loading