-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_ris.py
108 lines (89 loc) · 2.76 KB
/
generate_ris.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import argparse
import os
from pathlib import Path
from typing import Literal
import pandas as pd
from dotenv import load_dotenv
from eutils import QueryService # type: ignore
from metapub.pubmedarticle import PubMedArticle # type: ignore
# Load environment variables from a .env file so that os.getenv() can see
# NCBI_EMAIL and NCBI_API_KEY when get_dois() builds its QueryService.
load_dotenv()
def parse_excel_file(filepath: Path) -> set[int]:
    """
    Parse PMIDs from an Excel file.

    Reads the "Missing PMIDs" column of the sheet named "Missing". Blank
    cells in that column are ignored and duplicate PMIDs collapse into one
    entry.

    Parameters
    ----------
    filepath : Path
        Path to the Excel file containing PMIDs

    Returns
    -------
    set[int]
        Set of unique PMIDs from the Excel file

    Raises
    ------
    KeyError
        If the sheet has no "Missing PMIDs" column
    """
    df: pd.DataFrame = pd.read_excel(filepath, sheet_name="Missing")
    # Blank cells are read as NaN, which cannot be cast to int, so drop
    # them before converting instead of letting the cast raise.
    pmid_column = df["Missing PMIDs"].dropna()
    # tolist() yields plain Python ints rather than numpy integer scalars,
    # which matches the declared return type.
    return set(pmid_column.astype(int).tolist())
def get_dois(pmids: set[int]) -> pd.DataFrame:
    """
    Retrieve DOIs for given PMIDs using the NCBI API.

    One ``efetch`` request is issued per PMID against the ``pubmed``
    database, using the credentials found in the ``NCBI_EMAIL`` and
    ``NCBI_API_KEY`` environment variables.

    Parameters
    ----------
    pmids : set[int]
        Set of PMIDs

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns "PMID" and "DOI", one row per PMID in
        ascending PMID order. The "DOI" value is whatever
        ``PubMedArticle.doi`` reports (presumably ``None`` when the record
        has no DOI -- confirm against metapub). An empty input yields an
        empty DataFrame that still carries both columns.
    """
    columns = ["PMID", "DOI"]
    # Nothing to look up: skip building the API client and return a frame
    # with the expected schema so callers can rely on the column names.
    if not pmids:
        return pd.DataFrame(columns=columns)

    qs = QueryService(
        email=os.getenv("NCBI_EMAIL"), api_key=os.getenv("NCBI_API_KEY")
    )
    records: list[dict[str, str | int | None]] = []
    # Sets have no meaningful order; sorting keeps the output reproducible.
    for pmid in sorted(pmids):
        result = qs.efetch({"db": "pubmed", "id": pmid})
        pma: PubMedArticle = PubMedArticle(result)
        records.append({"PMID": pmid, "DOI": pma.doi})
    return pd.DataFrame(records, columns=columns)
def generate_ris_file(
    out_filepath: Path,
    to_ris: pd.DataFrame,
    write_type: Literal["a", "w"] = "w",
) -> None:
    """
    Generate an RIS file from a DataFrame of DOIs.

    Each row with a usable DOI becomes one journal-article record
    (``TY``/``DO``/``ER``). Records are separated by a single blank line.
    Rows whose DOI is missing (``None``/NaN) or blank are skipped, because
    a record without a DOI carries no information.

    Parameters
    ----------
    out_filepath : Path
        Path to the output RIS file
    to_ris : pd.DataFrame
        DataFrame containing DOIs to be written to the RIS file; only the
        "DOI" column is read
    write_type : Literal["a", "w"], optional
        Write type, either 'a' for append or 'w' for write (default is 'w')
    """
    # Collect usable DOIs first so the blank-line separators stay correct
    # even when some rows are skipped.
    dois: list[str] = []
    for _, row in to_ris.iterrows():
        doi = row["DOI"]
        if pd.isna(doi):
            continue
        doi_text = str(doi).strip()
        if doi_text:
            dois.append(doi_text)

    # RIS tags are laid out as two characters, two spaces, a hyphen and a
    # space ("TY  - "); strict importers reject other spacing.
    records = [f"TY  - JOUR\nDO  - {doi}\nER  - \n" for doi in dois]

    # The file is opened (and truncated in 'w' mode) even when there is
    # nothing to write, so an empty input still produces an output file.
    with open(out_filepath, write_type, encoding="utf-8") as file:
        file.write("\n".join(records))
def main():
    """
    Command-line entry point.

    Takes two positional arguments, the input Excel file and the output
    RIS file, then parses the PMIDs, looks up their DOIs and writes the
    RIS file.
    """
    cli = argparse.ArgumentParser(
        description="Generate RIS file from Excel file containing PMIDs"
    )
    # Both arguments are positional paths; register them in call order.
    positional_args = (
        ("excel_file", "Path to the Excel file containing PMIDs"),
        ("ris_file", "Path to the output RIS file"),
    )
    for arg_name, help_text in positional_args:
        cli.add_argument(arg_name, type=Path, help=help_text)
    options = cli.parse_args()

    missing_pmids = parse_excel_file(options.excel_file)
    doi_table = get_dois(missing_pmids)
    generate_ris_file(options.ris_file, doi_table)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()