From 8c5b20d5fb26f8a28db325116e119971dafe1efb Mon Sep 17 00:00:00 2001 From: George Doyle Date: Thu, 23 May 2024 12:45:28 +0100 Subject: [PATCH 1/5] Added okd_qc_commands.py script --- README.md | 21 +++++++ okd_qc_commands.py | 139 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 okd_qc_commands.py diff --git a/README.md b/README.md index 9349e18..2cc346f 100644 --- a/README.md +++ b/README.md @@ -62,3 +62,24 @@ If run with the name of a TSO related project in DNA nexus as an argument, this ```bash tso_upload.sh 002_240216_A01229_0290_AHNL5GDMXY_TSO24006``` The resulting command will be sent to std out with just the APP_ID and MOKAGUYS_AUTH_TOKEN needing to be added to each line, this can be done using find & replace. IMPORTANT: HD200 and NTC samples (HD200 or 00000_00000 in sample name) should have their lines removed manually as these should not be uploaded. + +## okd_qc_commands.py + +Automates the generation of DNAnexus MultiQC/FastQC commands for a given OncoDeep runfolder. +Must be run on a workstation where the authkey file is present. + +### Usage: +``` +python3 okd_qc_commands.py -p {dnanexus_project_id} -f {illumina_runfolder_name} +``` +### Arguments: +-p, --project The DNAnexus project ID for the run (e.g., project-XXXXXXXXXXX). +-f, --fastq_dir The name of the run folder (e.g., 240521_A01229_0331_AHWGJGDRX3). + +### Output: +Shell script file within the current working directory named according to the run. This can be run from any location on the genomics workstation. + +### Testing: +Concordance testing using diff commands performed against pre-existing manually generated multiqc_fastqc.sh scripts in the dx_run_commands directory. + +Only deviation was in run 240507_A01229_0324_AH5CYWDRX5, where a manual entry mistake was identified. 
"""
DNAnexus FastQC and MultiQC Command Generator

Created: 22/05/2024
Author: Bioinformatics @ Synnovis (Guy's & St. Thomas' NHS Foundation Trust)

This script automates the generation of DNAnexus MultiQC/FastQC commands for a given OncoDeep runfolder.
Must be run on a workstation where the authkey file is present.

Usage:
    python3 okd_qc_commands.py -p {dnanexus_project_id} -f {illumina_runfolder_name}

Arguments:
    -p, --project     The DNAnexus project ID for the run.
    -f, --fastq_dir   The name of the run folder (e.g., 240521_A01229_0331_AHWGJGDRX3).

Output:
    Generates a shell script with commands to run FastQC and MultiQC on OKD fastq files.
"""

import argparse
import os
import re

# Directory on the workstation under which runfolders are looked up
RUNFOLDER_ROOT = "/media/data3/share"
# File the DNAnexus auth token is read from
AUTH_TOKEN_PATH = "/usr/local/src/mokaguys/.dnanexus_auth_token"

# Shell line appended after a job submission so later jobs wait for it
_DEPENDS_LINE = "depends_list=\"${depends_list} -d ${jobid} \"\n"

# One FastQC job per R1/R2 pair
_FASTQC_COMMAND = (
    "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/fastqc_v1.4.0 --priority high -y "
    "--name {sample_name} -ireads={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/Data/Intensities/BaseCalls/{r1_file} "
    "-ireads={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/Data/Intensities/BaseCalls/{r2_file} "
    "--dest={provided_fastq_folder_name}:/ --brief --auth-token {auth_token})\n"
)

# A single MultiQC job that depends on every FastQC job
_MULTIQC_COMMAND = (
    "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/multiqc_v1.18.0 --priority high -y "
    "--instance-type mem1_ssd1_v2_x4 -iproject_for_multiqc={provided_fastq_folder_name} "
    "-icoverage_level=100 --project={project_id} $depends_list --brief --auth-token {auth_token})\n"
)

# Upload job; $jobid at this point in the shell script is the MultiQC job
_UPLOAD_MULTIQC_COMMAND = (
    "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/upload_multiqc_v1.4.0 --priority high -y "
    "--instance-type mem1_ssd1_v2_x2 -imultiqc_html=$jobid:multiqc_report -imultiqc_data_input=$jobid:multiqc "
    "-imultiqc_data_input={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/{fastq_folder_name}.illumina_lane_metrics "
    "--project={project_id} $depends_list --brief --auth-token {auth_token})\n"
)


def parse_arguments():
    '''Parse command line arguments (-p/--project and -f/--fastq_dir, both required)'''
    parser = argparse.ArgumentParser(description="Generate a shell script for running FastQC and MultiQC on OKD fastq files.")
    parser.add_argument("-p", "--project", required=True, help="The DNAnexus project ID for the run.")
    parser.add_argument("-f", "--fastq_dir", required=True, help="The name of the run folder (i.e., 240521_A01229_0331_AHWGJGDRX3).")
    return parser.parse_args()


def get_fastq_dir(fastq_dir, base_dir=RUNFOLDER_ROOT):
    '''Construct the directory path containing fastq files

    fastq_dir is the runfolder name; base_dir is the directory holding
    runfolders (defaults to the workstation share).
    '''
    return os.path.join(base_dir, fastq_dir, "Data/Intensities/BaseCalls")


def get_okd_id(fastq_dir):
    '''Extract the OKD ID (OKD followed by five digits) from the fastq file names

    Raises ValueError when no file name in fastq_dir contains an OKD ID.
    '''
    # Sorted so the result does not depend on the order os.listdir returns
    for filename in sorted(os.listdir(fastq_dir)):
        match = re.search(r'OKD\d{5}', filename)
        if match:
            return match.group()
    raise ValueError("No OKD ID found in fastq files.")


def get_fastq_files(fastq_dir):
    '''Create list of all OKD fastq files'''
    # Use startswith to exclude undetermined fastqs
    return [f for f in os.listdir(fastq_dir) if f.startswith("OKD") and f.endswith(".fastq.gz")]


def sort_fastq_files(fastq_files):
    '''Sort the fastq files based on sample number

    The sample number is the last "_S<digits>_" field of the file name.
    Files sharing a sample number are ordered by name so the result is
    deterministic. Raises ValueError for a name without that field.
    '''
    def sort_key(filename):
        # Only a whole "_S<n>_" field counts, and the last one wins, so digits
        # that follow an "S" inside the sample identifier are not picked up.
        sample_numbers = re.findall(r'_S(\d+)(?=_)', filename)
        if not sample_numbers:
            raise ValueError("No sample number (_S<n>_) found in fastq name: {}".format(filename))
        return int(sample_numbers[-1]), filename
    return sorted(fastq_files, key=sort_key)


def read_auth_token(token_path):
    '''Read DNAnexus auth token from file

    Raises ValueError when the file holds nothing but whitespace, because an
    empty token would otherwise only surface as failing dx commands later.
    '''
    with open(token_path, 'r') as token_file:
        token = token_file.read().strip()
    if not token:
        raise ValueError("Auth token file is empty: {}".format(token_path))
    return token


def generate_script(fastq_folder_name, okd_id, provided_fastq_folder_name, fastq_files, project_id, auth_token):
    '''Generate the shell script

    Writes <fastq_folder_name>_multiqc_fastqc.sh to the current working
    directory: one FastQC command per R1/R2 pair (in the order given),
    followed by the MultiQC and upload_multiqc commands.

    Returns the name of the script written.
    Raises ValueError when an R1 file has no matching R2 file in fastq_files.

    NOTE: the auth token is written into the script in plain text.
    '''
    available = set(fastq_files)
    lines = ["depends_list=''\n\n"]

    for r1_file in fastq_files:
        if "_R1_" not in r1_file:
            continue
        # Swap only the last "_R1_" so a sample name containing it is untouched
        head, _, tail = r1_file.rpartition("_R1_")
        r2_file = head + "_R2_" + tail
        if r2_file not in available:
            raise ValueError("No R2 fastq found for {} (expected {})".format(r1_file, r2_file))
        # Drop the trailing lane, read and "001.fastq.gz" fields
        sample_name = "_".join(r1_file.split("_")[:-3])

        lines.append(_FASTQC_COMMAND.format(
            sample_name=sample_name,
            provided_fastq_folder_name=provided_fastq_folder_name,
            fastq_folder_name=fastq_folder_name,
            okd_id=okd_id,
            r1_file=r1_file,
            r2_file=r2_file,
            auth_token=auth_token,
        ))
        lines.append(_DEPENDS_LINE)

    lines.append(_MULTIQC_COMMAND.format(
        provided_fastq_folder_name=provided_fastq_folder_name,
        project_id=project_id,
        auth_token=auth_token,
    ))
    lines.append(_DEPENDS_LINE)

    lines.append(_UPLOAD_MULTIQC_COMMAND.format(
        provided_fastq_folder_name=provided_fastq_folder_name,
        fastq_folder_name=fastq_folder_name,
        okd_id=okd_id,
        project_id=project_id,
        auth_token=auth_token,
    ))

    # Everything is validated and built before the file is created, so a
    # failure above never leaves a half-written script behind.
    script_filename = "{}_multiqc_fastqc.sh".format(fastq_folder_name)
    with open(script_filename, "w") as script_file:
        script_file.writelines(lines)

    print("Shell script generated successfully: {}".format(script_filename))
    return script_filename


def main():
    '''Build the FastQC/MultiQC shell script for the runfolder given on the command line'''
    args = parse_arguments()

    # Directory holding the fastq files for this run
    fastq_dir = get_fastq_dir(args.fastq_dir)

    okd_id = get_okd_id(fastq_dir)

    # Runfolder name: three levels up from Data/Intensities/BaseCalls
    fastq_folder_name = os.path.basename(os.path.normpath(os.path.join(fastq_dir, "..", "..", "..")))

    # Name used on the left of "<name>:/" in the generated dx commands
    provided_fastq_folder_name = "003_{}_{}".format(fastq_folder_name, okd_id)

    sorted_fastq_files = sort_fastq_files(get_fastq_files(fastq_dir))

    auth_token = read_auth_token(AUTH_TOKEN_PATH)

    generate_script(fastq_folder_name, okd_id, provided_fastq_folder_name, sorted_fastq_files, args.project, auth_token)


if __name__ == "__main__":
    main()
Data contains the following columns: gene_symbol, hgnc_id, panel_name, confidence_level and panel_id + +### Testing: +Spotcheck of output CSV to ensure both: + +- Results and confidence scores are reflected on the live PanelApp website +- All results pertain to signed-off panels \ No newline at end of file diff --git a/panel_gene_query.py b/panel_gene_query.py new file mode 100644 index 0000000..e69de29 diff --git a/panelapp_gene_data.csv b/panelapp_gene_data.csv new file mode 100644 index 0000000..262a164 --- /dev/null +++ b/panelapp_gene_data.csv @@ -0,0 +1,284 @@ +Gene Symbol,HGNC ID,Panel Name,Confidence Level,Panel ID +PKLR,HGNC:9020,Hereditary Erythrocytosis,1,157 +SLC30A10,HGNC:25355,Hereditary Erythrocytosis,3,157 +BPGM,HGNC:1093,Hereditary Erythrocytosis,3,157 +PIEZO1,HGNC:28993,Hereditary Erythrocytosis,2,157 +EGLN3,HGNC:14661,Hereditary Erythrocytosis,1,157 +VHL,HGNC:12687,Hereditary Erythrocytosis,3,157 +HBB,HGNC:4827,Hereditary Erythrocytosis,3,157 +HBA2,HGNC:4824,Hereditary Erythrocytosis,3,157 +EPAS1,HGNC:3374,Hereditary Erythrocytosis,3,157 +EGLN1,HGNC:1232,Hereditary Erythrocytosis,3,157 +SH2B3,HGNC:29605,Hereditary Erythrocytosis,2,157 +JAK2,HGNC:6192,Hereditary Erythrocytosis,2,157 +EPO,HGNC:3415,Hereditary Erythrocytosis,3,157 +EPOR,HGNC:3416,Hereditary Erythrocytosis,3,157 +EGLN2,HGNC:14660,Hereditary Erythrocytosis,1,157 +HBA1,HGNC:4823,Hereditary Erythrocytosis,3,157 +HIF1A,HGNC:4910,Hereditary Erythrocytosis,1,157 +ATM,HGNC:795,Brain cancer pertinent cancer susceptibility,3,166 +TP53,HGNC:11998,Brain cancer pertinent cancer susceptibility,3,166 +MSH2,HGNC:7325,Brain cancer pertinent cancer susceptibility,3,166 +MLH1,HGNC:7127,Brain cancer pertinent cancer susceptibility,3,166 +PMS2,HGNC:9122,Brain cancer pertinent cancer susceptibility,3,166 +MSH6,HGNC:7329,Brain cancer pertinent cancer susceptibility,3,166 +APC,HGNC:583,Brain cancer pertinent cancer susceptibility,3,166 +CDH1,HGNC:1748,Breast cancer pertinent cancer susceptibility,1,55 
+ATRIP,HGNC:33499,Breast cancer pertinent cancer susceptibility,2,55 +BRCA1,HGNC:1100,Breast cancer pertinent cancer susceptibility,3,55 +BRCA2,HGNC:1101,Breast cancer pertinent cancer susceptibility,3,55 +PTEN,HGNC:9588,Breast cancer pertinent cancer susceptibility,1,55 +PALB2,HGNC:26144,Breast cancer pertinent cancer susceptibility,3,55 +TP53,HGNC:11998,Breast cancer pertinent cancer susceptibility,3,55 +BRCA1,HGNC:1100,Ovarian cancer pertinent cancer susceptibility,3,117 +BRIP1,HGNC:20473,Ovarian cancer pertinent cancer susceptibility,3,117 +PMS2,HGNC:9122,Ovarian cancer pertinent cancer susceptibility,1,117 +RAD51D,HGNC:9823,Ovarian cancer pertinent cancer susceptibility,3,117 +MLH1,HGNC:7127,Ovarian cancer pertinent cancer susceptibility,3,117 +RAD51C,HGNC:9820,Ovarian cancer pertinent cancer susceptibility,3,117 +MSH6,HGNC:7329,Ovarian cancer pertinent cancer susceptibility,3,117 +BRCA2,HGNC:1101,Ovarian cancer pertinent cancer susceptibility,3,117 +MSH2,HGNC:7325,Ovarian cancer pertinent cancer susceptibility,3,117 +SMPX,HGNC:11122,Distal myopathies,3,235 +ADSSL1,HGNC:20093,Distal myopathies,3,235 +GIPC1,HGNC:1226,Distal myopathies,1,235 +MYOT,HGNC:12399,Distal myopathies,3,235 +CNBP,HGNC:13164,Distal myopathies,1,235 +TTN,HGNC:12403,Distal myopathies,3,235 +DMPK,HGNC:2933,Distal myopathies,1,235 +CRYAB,HGNC:2389,Distal myopathies,3,235 +LRIF1,HGNC:30299,Distal myopathies,2,235 +FLNC,HGNC:3756,Distal myopathies,3,235 +MYH7,HGNC:7577,Distal myopathies,3,235 +NEB,HGNC:7720,Distal myopathies,3,235 +HSPB8,HGNC:30171,Distal myopathies,3,235 +DNAJB6,HGNC:14888,Distal myopathies,3,235 +DUX4,HGNC:50800,Distal myopathies,1,235 +MATR3,HGNC:6912,Distal myopathies,3,235 +DYSF,HGNC:3097,Distal myopathies,3,235 +ACTA1,HGNC:129,Distal myopathies,3,235 +DNM2,HGNC:2974,Distal myopathies,3,235 +LDB3,HGNC:15710,Distal myopathies,3,235 +DMD,HGNC:2928,Distal myopathies,1,235 +DES,HGNC:2770,Distal myopathies,3,235 +FHL1,HGNC:3702,Distal myopathies,3,235 +SQSTM1,HGNC:11280,Distal 
myopathies,3,235 +TIA1,HGNC:11802,Distal myopathies,3,235 +HSPB1,HGNC:5246,Distal myopathies,3,235 +GNE,HGNC:23657,Distal myopathies,3,235 +BAG3,HGNC:939,Distal myopathies,3,235 +ANO5,HGNC:27337,Distal myopathies,3,235 +VCP,HGNC:12666,Distal myopathies,3,235 +KLHL9,HGNC:18732,Distal myopathies,1,235 +TTR,HGNC:12405,Hyperthyroidism,3,236 +ALB,HGNC:399,Hyperthyroidism,3,236 +SECISBP2,HGNC:30972,Hyperthyroidism,3,236 +THRB,HGNC:11799,Hyperthyroidism,3,236 +TSHR,HGNC:12373,Hyperthyroidism,3,236 +THRA,HGNC:11796,Hyperthyroidism,3,236 +TRU-TCA1-1,HGNC:12348,Hyperthyroidism,1,236 +SLC16A2,HGNC:10923,Hyperthyroidism,3,236 +ADCY3,HGNC:234,Severe early-onset obesity,2,130 +DYRK1B,HGNC:3092,Severe early-onset obesity,2,130 +PGM2L1,HGNC:20898,Severe early-onset obesity,3,130 +KIDINS220,HGNC:29508,Severe early-onset obesity,3,130 +CPE,HGNC:2303,Severe early-onset obesity,3,130 +GNAS,HGNC:4392,Severe early-onset obesity,3,130 +PHIP,HGNC:15673,Severe early-onset obesity,3,130 +TUB,HGNC:12406,Severe early-onset obesity,2,130 +CEP290,HGNC:29021,Severe early-onset obesity,2,130 +SDCCAG8,HGNC:10671,Severe early-onset obesity,3,130 +NTRK2,HGNC:8032,Severe early-onset obesity,3,130 +MYT1L,HGNC:7623,Severe early-onset obesity,3,130 +LEPR,HGNC:6554,Severe early-onset obesity,3,130 +CEP19,HGNC:28209,Severe early-onset obesity,3,130 +BBS10,HGNC:26291,Severe early-onset obesity,3,130 +ALMS1,HGNC:428,Severe early-onset obesity,3,130 +ARL6,HGNC:13210,Severe early-onset obesity,3,130 +BBS1,HGNC:966,Severe early-onset obesity,3,130 +BBS12,HGNC:26648,Severe early-onset obesity,3,130 +BBS2,HGNC:967,Severe early-onset obesity,3,130 +BBS4,HGNC:969,Severe early-onset obesity,3,130 +BBS5,HGNC:970,Severe early-onset obesity,3,130 +BBS7,HGNC:18758,Severe early-onset obesity,3,130 +BBS9,HGNC:30000,Severe early-onset obesity,3,130 +LEP,HGNC:6553,Severe early-onset obesity,3,130 +MC4R,HGNC:6932,Severe early-onset obesity,3,130 +MKKS,HGNC:7108,Severe early-onset obesity,3,130 +MKS1,HGNC:7121,Severe 
early-onset obesity,3,130 +PCSK1,HGNC:8743,Severe early-onset obesity,3,130 +PHF6,HGNC:18145,Severe early-onset obesity,3,130 +POMC,HGNC:9201,Severe early-onset obesity,3,130 +TTC8,HGNC:20087,Severe early-onset obesity,3,130 +VPS13B,HGNC:2183,Severe early-onset obesity,3,130 +SH2B1,HGNC:30417,Severe early-onset obesity,2,130 +SIM1,HGNC:10882,Severe early-onset obesity,3,130 +AKR1C2,HGNC:385,Severe early-onset obesity,1,130 +INPP5E,HGNC:21474,Severe early-onset obesity,2,130 +MAGEL2,HGNC:6814,Severe early-onset obesity,1,130 +MRAP2,HGNC:21232,Severe early-onset obesity,1,130 +NR0B2,HGNC:7961,Severe early-onset obesity,1,130 +PPARG,HGNC:9236,Severe early-onset obesity,1,130 +TRIM32,HGNC:16380,Severe early-onset obesity,1,130 +WDPCP,HGNC:28027,Severe early-onset obesity,1,130 +KSR2,HGNC:18610,Severe early-onset obesity,2,130 +KDM1A,HGNC:29079,Congenital adrenal hypoplasia,1,145 +CYP11B2,HGNC:2592,Congenital adrenal hypoplasia,2,145 +POLE,HGNC:9177,Congenital adrenal hypoplasia,3,145 +CYP11A1,HGNC:2590,Congenital adrenal hypoplasia,3,145 +AAAS,HGNC:13666,Congenital adrenal hypoplasia,3,145 +STAR,HGNC:11359,Congenital adrenal hypoplasia,3,145 +ABCD1,HGNC:61,Congenital adrenal hypoplasia,1,145 +NNT,HGNC:7863,Congenital adrenal hypoplasia,3,145 +POMC,HGNC:9201,Congenital adrenal hypoplasia,1,145 +TXNRD2,HGNC:18155,Congenital adrenal hypoplasia,1,145 +AIRE,HGNC:360,Congenital adrenal hypoplasia,3,145 +CYP17A1,HGNC:2593,Congenital adrenal hypoplasia,1,145 +SAMD9,HGNC:1348,Congenital adrenal hypoplasia,3,145 +NR0B1,HGNC:7960,Congenital adrenal hypoplasia,3,145 +CYP21A2,HGNC:2600,Congenital adrenal hypoplasia,1,145 +TBX19,HGNC:11596,Congenital adrenal hypoplasia,3,145 +MCM4,HGNC:6947,Congenital adrenal hypoplasia,2,145 +CDKN1C,HGNC:1786,Congenital adrenal hypoplasia,3,145 +HSD3B2,HGNC:5218,Congenital adrenal hypoplasia,1,145 +MC2R,HGNC:6930,Congenital adrenal hypoplasia,3,145 +MRAP,HGNC:1304,Congenital adrenal hypoplasia,3,145 +NR5A1,HGNC:7983,Congenital adrenal 
hypoplasia,3,145 +SGPL1,HGNC:10817,Congenital adrenal hypoplasia,3,145 +CBS,HGNC:1550,Cerebral vascular malformations,0,147 +COL5A1,HGNC:2209,Cerebral vascular malformations,2,147 +SMARCAL1,HGNC:11102,Cerebral vascular malformations,1,147 +ADA2,HGNC:1839,Cerebral vascular malformations,2,147 +NOTCH3,HGNC:7883,Cerebral vascular malformations,1,147 +PIK3CA,HGNC:8975,Cerebral vascular malformations,1,147 +ANGPTL6,HGNC:23140,Cerebral vascular malformations,3,147 +PAFAH1B1,HGNC:8574,Cerebral vascular malformations,1,147 +DCX,HGNC:2714,Cerebral vascular malformations,1,147 +CHD4,HGNC:1919,Cerebral vascular malformations,2,147 +SETD5,HGNC:25566,Cerebral vascular malformations,2,147 +DNA2,HGNC:2939,Cerebral vascular malformations,1,147 +CNOT3,HGNC:7879,Cerebral vascular malformations,2,147 +PKD1,HGNC:9008,Cerebral vascular malformations,2,147 +FLVCR2,HGNC:20105,Cerebral vascular malformations,2,147 +YY1AP1,HGNC:30935,Cerebral vascular malformations,3,147 +SLC2A10,HGNC:13444,Cerebral vascular malformations,3,147 +PDCD10,HGNC:8761,Cerebral vascular malformations,3,147 +COL3A1,HGNC:2201,Cerebral vascular malformations,3,147 +TUBB2A,HGNC:12412,Cerebral vascular malformations,1,147 +CCM2,HGNC:21708,Cerebral vascular malformations,3,147 +ACTA2,HGNC:130,Cerebral vascular malformations,3,147 +ANIB1,HGNC:17627,Cerebral vascular malformations,0,147 +ATR,HGNC:882,Cerebral vascular malformations,2,147 +MYMY3,HGNC:20769,Cerebral vascular malformations,0,147 +MYMY1,HGNC:16401,Cerebral vascular malformations,0,147 +ELN,HGNC:3327,Cerebral vascular malformations,1,147 +GUCY1A3,HGNC:4685,Cerebral vascular malformations,3,147 +HBB,HGNC:4827,Cerebral vascular malformations,2,147 +NF1,HGNC:7765,Cerebral vascular malformations,2,147 +PKD2,HGNC:9009,Cerebral vascular malformations,2,147 +SMAD4,HGNC:6770,Cerebral vascular malformations,3,147 +SMAD3,HGNC:6769,Cerebral vascular malformations,1,147 +THSD1,HGNC:17754,Cerebral vascular malformations,2,147 +ABCC6,HGNC:57,Cerebral vascular 
malformations,1,147 +SMAD9,HGNC:6774,Cerebral vascular malformations,2,147 +ATP7A,HGNC:869,Cerebral vascular malformations,1,147 +BRCC3,HGNC:24185,Cerebral vascular malformations,1,147 +COL4A1,HGNC:2202,Cerebral vascular malformations,1,147 +COL4A2,HGNC:2203,Cerebral vascular malformations,1,147 +FBN1,HGNC:3603,Cerebral vascular malformations,1,147 +GLMN,HGNC:14373,Cerebral vascular malformations,1,147 +GNAQ,HGNC:4390,Cerebral vascular malformations,1,147 +HTRA1,HGNC:9476,Cerebral vascular malformations,1,147 +IL6,HGNC:6018,Cerebral vascular malformations,1,147 +LARGE1,HGNC:6511,Cerebral vascular malformations,1,147 +MEF2C,HGNC:6996,Cerebral vascular malformations,1,147 +OPHN1,HGNC:8148,Cerebral vascular malformations,1,147 +POMGNT1,HGNC:19139,Cerebral vascular malformations,1,147 +POMT1,HGNC:9202,Cerebral vascular malformations,1,147 +RELN,HGNC:9957,Cerebral vascular malformations,1,147 +RTTN,HGNC:18654,Cerebral vascular malformations,1,147 +RNF213,HGNC:14539,Cerebral vascular malformations,3,147 +STAMBP,HGNC:16950,Cerebral vascular malformations,1,147 +TEK,HGNC:11724,Cerebral vascular malformations,1,147 +TUBA8,HGNC:12410,Cerebral vascular malformations,1,147 +TUBB,HGNC:20778,Cerebral vascular malformations,1,147 +ACVRL1,HGNC:175,Cerebral vascular malformations,3,147 +CBL,HGNC:1541,Cerebral vascular malformations,2,147 +ENG,HGNC:3349,Cerebral vascular malformations,3,147 +CEP152,HGNC:29298,Cerebral vascular malformations,2,147 +MYH11,HGNC:7569,Cerebral vascular malformations,2,147 +KRIT1,HGNC:1573,Cerebral vascular malformations,3,147 +PCNT,HGNC:16068,Cerebral vascular malformations,2,147 +ARX,HGNC:18060,Cerebral vascular malformations,1,147 +RASA1,HGNC:9871,Cerebral vascular malformations,3,147 +SAMHD1,HGNC:15925,Cerebral vascular malformations,3,147 +EPHB4,HGNC:3395,Cerebral vascular malformations,2,147 +MRVI1,HGNC:7237,Cerebral vascular malformations,2,147 +GDF2,HGNC:4217,Cerebral vascular malformations,2,147 +TGFBR2,HGNC:11773,Cerebral vascular 
malformations,1,147 +ACE,HGNC:2707,Cerebral vascular malformations,1,147 +ANTXR1,HGNC:21014,Cerebral vascular malformations,1,147 +CENPJ,HGNC:17272,Cerebral vascular malformations,1,147 +CEP63,HGNC:25815,Cerebral vascular malformations,1,147 +CTSA,HGNC:9251,Cerebral vascular malformations,1,147 +FLT4,HGNC:3767,Cerebral vascular malformations,1,147 +FOXF1,HGNC:3809,Cerebral vascular malformations,1,147 +GLA,HGNC:4296,Cerebral vascular malformations,1,147 +HLA-DQB1,HGNC:4944,Cerebral vascular malformations,1,147 +HLA-DRB1,HGNC:4948,Cerebral vascular malformations,1,147 +KDR,HGNC:6307,Cerebral vascular malformations,1,147 +LAMB1,HGNC:6486,Cerebral vascular malformations,1,147 +LAMC3,HGNC:6494,Cerebral vascular malformations,1,147 +NDE1,HGNC:17619,Cerebral vascular malformations,1,147 +NIN,HGNC:14906,Cerebral vascular malformations,1,147 +OCLN,HGNC:8104,Cerebral vascular malformations,1,147 +PIK3R2,HGNC:8980,Cerebral vascular malformations,1,147 +PTEN,HGNC:9588,Cerebral vascular malformations,1,147 +RBBP8,HGNC:9891,Cerebral vascular malformations,1,147 +SRPX2,HGNC:30668,Cerebral vascular malformations,1,147 +TUBA1A,HGNC:20766,Cerebral vascular malformations,1,147 +TUBB2B,HGNC:30829,Cerebral vascular malformations,1,147 +TUBB3,HGNC:20772,Cerebral vascular malformations,1,147 +TUBG1,HGNC:12417,Cerebral vascular malformations,1,147 +JAG1,HGNC:6188,Cerebral vascular malformations,1,147 +TGFB2,HGNC:11768,Cerebral vascular malformations,1,147 +TGFBR1,HGNC:11772,Cerebral vascular malformations,1,147 +ADGRG1,HGNC:4512,Cerebral vascular malformations,1,147 +CRB1,HGNC:2343,Cerebral vascular malformations,1,147 +HLA-B,HGNC:4932,Cerebral vascular malformations,1,147 +POMT2,HGNC:19743,Cerebral vascular malformations,1,147 +TMEM5,HGNC:13530,Cerebral vascular malformations,1,147 +TRAIP,HGNC:30764,Cerebral vascular malformations,1,147 +VLDLR,HGNC:12698,Cerebral vascular malformations,1,147 +WDR62,HGNC:24502,Cerebral vascular malformations,1,147 +RNF113A,HGNC:12974,"Xeroderma 
pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +GTF2E2,HGNC:4651,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +CARS,HGNC:1493,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +TARS,HGNC:11572,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 +MARS,HGNC:6898,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",1,77 +AARS,HGNC:20,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 +POLH,HGNC:9181,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC1,HGNC:3433,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +MRE11,HGNC:7230,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 +ERCC5,HGNC:3437,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +XPA,HGNC:12814,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC3,HGNC:3435,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC4,HGNC:3436,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC8,HGNC:3439,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC2,HGNC:3434,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +GTF2H5,HGNC:21157,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +DDB2,HGNC:2718,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +ERCC6,HGNC:3438,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +XPC,HGNC:12816,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +DDB1,HGNC:2717,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",1,77 +MPLKIP,HGNC:16002,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 +APOB,HGNC:603,Familial chylomicronaemia syndrome (FCS),1,527 +APOA5,HGNC:17288,Familial chylomicronaemia syndrome (FCS),3,527 +LPL,HGNC:6677,Familial chylomicronaemia syndrome 
'''
Created by the bioinformatics team @ Synnovis
Guy's & St. Thomas' NHS Trust

Simple script to generate CSV containing all current gene/panel relations using PanelApp API
Includes data only from signed-off panels
Stores gene_symbol, hgnc_id, panel[name], confidence_level, panel[id]

Usage: python3 panelapp_gene_query.py
panelapp_gene_data.csv will be saved in the current working directory
'''

import csv
import requests

GENES_URL = "https://panelapp.genomicsengland.co.uk/api/v1/genes/"
# Seconds to wait on any single API call; requests waits forever without one
REQUEST_TIMEOUT = 30
CSV_HEADER = ["Gene Symbol", "HGNC ID", "Panel Name", "Confidence Level", "Panel ID"]


def fetch_data(url, timeout=REQUEST_TIMEOUT):
    """Call the API and return the decoded JSON, or None on any failure.

    A non-200 status, a connection error or a timeout is reported on stdout
    and yields None so callers can decide how to react.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to retrieve data from the API. Error: {err}")
        return None
    if response.status_code == 200:
        return response.json()
    print(f"Failed to retrieve data from the API. Status code: {response.status_code}")
    return None


def check_panel_signed(panel_id, signed_panels_cache, invalid_panels_cache):
    """Check if the panel is signed off using the signed_off panels endpoint.

    Both caches are sets of panel ids that this function reads and updates,
    so each panel is looked up at most once.

    Returns True for a signed-off panel, False otherwise.
    Raises requests.HTTPError on a 429 or 5xx answer: those say nothing about
    the panel, and caching them as "not signed off" would silently drop every
    gene of that panel from the output.
    """
    # If panel_id already checked, skip check to avoid extra API calls
    if panel_id in signed_panels_cache:
        return True
    if panel_id in invalid_panels_cache:
        return False

    # New panel_id found, use API to check status and cache results
    url = f"https://panelapp.genomicsengland.co.uk/api/v1/panels/signedoff/{panel_id}"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    if response.status_code == 429 or response.status_code >= 500:
        # Rate limiting / server fault: fail loudly instead of caching a guess
        response.raise_for_status()

    if response.status_code == 200 and "detail" not in response.json():
        print(f"Panel {panel_id} is signed off. Caching as signed-off.")
        signed_panels_cache.add(panel_id)
        return True

    print(f"Panel {panel_id} is not a signed-off panel or does not exist. Caching as invalid.")
    invalid_panels_cache.add(panel_id)
    return False


def write_data_to_csv(data, file_path, signed_panels_cache, invalid_panels_cache, follow_pages=False):
    """Write the header and the rows of `data` to a new CSV at file_path.

    With follow_pages=True the remaining result pages (data["next"]) are
    fetched and written through the same file handle.

    Returns True when everything requested was written, False when a later
    page could not be fetched.
    """
    # utf-8 is set explicitly so the output does not depend on the locale
    with open(file_path, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(CSV_HEADER)
        write_rows(writer, data, signed_panels_cache, invalid_panels_cache)
        if follow_pages:
            return handle_pagination(data, writer, signed_panels_cache, invalid_panels_cache)
    return True


def write_rows(writer, data, signed_panels_cache, invalid_panels_cache):
    """Helper function to write one page of results as CSV rows.

    Only results whose panel passes check_panel_signed are written.
    """
    for result in data["results"]:
        gene_data = result["gene_data"]
        panel = result["panel"]
        panel_id = panel["id"]

        # Check if the panel is present in the signed-off panel list before adding to list
        if check_panel_signed(panel_id, signed_panels_cache, invalid_panels_cache):
            writer.writerow([
                gene_data["gene_symbol"],
                gene_data["hgnc_id"],
                panel["name"],
                result["confidence_level"],
                panel_id,
            ])


def handle_pagination(data, writer, signed_panels_cache, invalid_panels_cache):
    """Follow data["next"] links, writing each further page with write_rows.

    Returns True when the last page was reached, False when a page could not
    be fetched (rows written so far are kept).
    """
    while data["next"]:
        data = fetch_data(data["next"])
        if not data:
            return False
        write_rows(writer, data, signed_panels_cache, invalid_panels_cache)
    return True


def main():
    """Fetch every gene/panel relation and save the signed-off ones as CSV."""
    output_file = "panelapp_gene_data.csv"
    signed_panels_cache = set()
    invalid_panels_cache = set()

    # Initial data fetch
    initial_data = fetch_data(GENES_URL)
    if not initial_data:
        raise SystemExit("No data retrieved from the API; nothing was written.")

    # First page and all following pages go through a single open file
    complete = write_data_to_csv(
        initial_data, output_file, signed_panels_cache, invalid_panels_cache, follow_pages=True
    )
    if not complete:
        # Non-zero exit so an incomplete gene list is never mistaken for a full one
        raise SystemExit(f"A page could not be fetched; {output_file} is INCOMPLETE.")

    print(f"Gene data saved to {output_file}")


if __name__ == "__main__":
    main()
-JAK2,HGNC:6192,Hereditary Erythrocytosis,2,157 -EPO,HGNC:3415,Hereditary Erythrocytosis,3,157 -EPOR,HGNC:3416,Hereditary Erythrocytosis,3,157 -EGLN2,HGNC:14660,Hereditary Erythrocytosis,1,157 -HBA1,HGNC:4823,Hereditary Erythrocytosis,3,157 -HIF1A,HGNC:4910,Hereditary Erythrocytosis,1,157 -ATM,HGNC:795,Brain cancer pertinent cancer susceptibility,3,166 -TP53,HGNC:11998,Brain cancer pertinent cancer susceptibility,3,166 -MSH2,HGNC:7325,Brain cancer pertinent cancer susceptibility,3,166 -MLH1,HGNC:7127,Brain cancer pertinent cancer susceptibility,3,166 -PMS2,HGNC:9122,Brain cancer pertinent cancer susceptibility,3,166 -MSH6,HGNC:7329,Brain cancer pertinent cancer susceptibility,3,166 -APC,HGNC:583,Brain cancer pertinent cancer susceptibility,3,166 -CDH1,HGNC:1748,Breast cancer pertinent cancer susceptibility,1,55 -ATRIP,HGNC:33499,Breast cancer pertinent cancer susceptibility,2,55 -BRCA1,HGNC:1100,Breast cancer pertinent cancer susceptibility,3,55 -BRCA2,HGNC:1101,Breast cancer pertinent cancer susceptibility,3,55 -PTEN,HGNC:9588,Breast cancer pertinent cancer susceptibility,1,55 -PALB2,HGNC:26144,Breast cancer pertinent cancer susceptibility,3,55 -TP53,HGNC:11998,Breast cancer pertinent cancer susceptibility,3,55 -BRCA1,HGNC:1100,Ovarian cancer pertinent cancer susceptibility,3,117 -BRIP1,HGNC:20473,Ovarian cancer pertinent cancer susceptibility,3,117 -PMS2,HGNC:9122,Ovarian cancer pertinent cancer susceptibility,1,117 -RAD51D,HGNC:9823,Ovarian cancer pertinent cancer susceptibility,3,117 -MLH1,HGNC:7127,Ovarian cancer pertinent cancer susceptibility,3,117 -RAD51C,HGNC:9820,Ovarian cancer pertinent cancer susceptibility,3,117 -MSH6,HGNC:7329,Ovarian cancer pertinent cancer susceptibility,3,117 -BRCA2,HGNC:1101,Ovarian cancer pertinent cancer susceptibility,3,117 -MSH2,HGNC:7325,Ovarian cancer pertinent cancer susceptibility,3,117 -SMPX,HGNC:11122,Distal myopathies,3,235 -ADSSL1,HGNC:20093,Distal myopathies,3,235 -GIPC1,HGNC:1226,Distal myopathies,1,235 
-MYOT,HGNC:12399,Distal myopathies,3,235 -CNBP,HGNC:13164,Distal myopathies,1,235 -TTN,HGNC:12403,Distal myopathies,3,235 -DMPK,HGNC:2933,Distal myopathies,1,235 -CRYAB,HGNC:2389,Distal myopathies,3,235 -LRIF1,HGNC:30299,Distal myopathies,2,235 -FLNC,HGNC:3756,Distal myopathies,3,235 -MYH7,HGNC:7577,Distal myopathies,3,235 -NEB,HGNC:7720,Distal myopathies,3,235 -HSPB8,HGNC:30171,Distal myopathies,3,235 -DNAJB6,HGNC:14888,Distal myopathies,3,235 -DUX4,HGNC:50800,Distal myopathies,1,235 -MATR3,HGNC:6912,Distal myopathies,3,235 -DYSF,HGNC:3097,Distal myopathies,3,235 -ACTA1,HGNC:129,Distal myopathies,3,235 -DNM2,HGNC:2974,Distal myopathies,3,235 -LDB3,HGNC:15710,Distal myopathies,3,235 -DMD,HGNC:2928,Distal myopathies,1,235 -DES,HGNC:2770,Distal myopathies,3,235 -FHL1,HGNC:3702,Distal myopathies,3,235 -SQSTM1,HGNC:11280,Distal myopathies,3,235 -TIA1,HGNC:11802,Distal myopathies,3,235 -HSPB1,HGNC:5246,Distal myopathies,3,235 -GNE,HGNC:23657,Distal myopathies,3,235 -BAG3,HGNC:939,Distal myopathies,3,235 -ANO5,HGNC:27337,Distal myopathies,3,235 -VCP,HGNC:12666,Distal myopathies,3,235 -KLHL9,HGNC:18732,Distal myopathies,1,235 -TTR,HGNC:12405,Hyperthyroidism,3,236 -ALB,HGNC:399,Hyperthyroidism,3,236 -SECISBP2,HGNC:30972,Hyperthyroidism,3,236 -THRB,HGNC:11799,Hyperthyroidism,3,236 -TSHR,HGNC:12373,Hyperthyroidism,3,236 -THRA,HGNC:11796,Hyperthyroidism,3,236 -TRU-TCA1-1,HGNC:12348,Hyperthyroidism,1,236 -SLC16A2,HGNC:10923,Hyperthyroidism,3,236 -ADCY3,HGNC:234,Severe early-onset obesity,2,130 -DYRK1B,HGNC:3092,Severe early-onset obesity,2,130 -PGM2L1,HGNC:20898,Severe early-onset obesity,3,130 -KIDINS220,HGNC:29508,Severe early-onset obesity,3,130 -CPE,HGNC:2303,Severe early-onset obesity,3,130 -GNAS,HGNC:4392,Severe early-onset obesity,3,130 -PHIP,HGNC:15673,Severe early-onset obesity,3,130 -TUB,HGNC:12406,Severe early-onset obesity,2,130 -CEP290,HGNC:29021,Severe early-onset obesity,2,130 -SDCCAG8,HGNC:10671,Severe early-onset obesity,3,130 -NTRK2,HGNC:8032,Severe 
early-onset obesity,3,130 -MYT1L,HGNC:7623,Severe early-onset obesity,3,130 -LEPR,HGNC:6554,Severe early-onset obesity,3,130 -CEP19,HGNC:28209,Severe early-onset obesity,3,130 -BBS10,HGNC:26291,Severe early-onset obesity,3,130 -ALMS1,HGNC:428,Severe early-onset obesity,3,130 -ARL6,HGNC:13210,Severe early-onset obesity,3,130 -BBS1,HGNC:966,Severe early-onset obesity,3,130 -BBS12,HGNC:26648,Severe early-onset obesity,3,130 -BBS2,HGNC:967,Severe early-onset obesity,3,130 -BBS4,HGNC:969,Severe early-onset obesity,3,130 -BBS5,HGNC:970,Severe early-onset obesity,3,130 -BBS7,HGNC:18758,Severe early-onset obesity,3,130 -BBS9,HGNC:30000,Severe early-onset obesity,3,130 -LEP,HGNC:6553,Severe early-onset obesity,3,130 -MC4R,HGNC:6932,Severe early-onset obesity,3,130 -MKKS,HGNC:7108,Severe early-onset obesity,3,130 -MKS1,HGNC:7121,Severe early-onset obesity,3,130 -PCSK1,HGNC:8743,Severe early-onset obesity,3,130 -PHF6,HGNC:18145,Severe early-onset obesity,3,130 -POMC,HGNC:9201,Severe early-onset obesity,3,130 -TTC8,HGNC:20087,Severe early-onset obesity,3,130 -VPS13B,HGNC:2183,Severe early-onset obesity,3,130 -SH2B1,HGNC:30417,Severe early-onset obesity,2,130 -SIM1,HGNC:10882,Severe early-onset obesity,3,130 -AKR1C2,HGNC:385,Severe early-onset obesity,1,130 -INPP5E,HGNC:21474,Severe early-onset obesity,2,130 -MAGEL2,HGNC:6814,Severe early-onset obesity,1,130 -MRAP2,HGNC:21232,Severe early-onset obesity,1,130 -NR0B2,HGNC:7961,Severe early-onset obesity,1,130 -PPARG,HGNC:9236,Severe early-onset obesity,1,130 -TRIM32,HGNC:16380,Severe early-onset obesity,1,130 -WDPCP,HGNC:28027,Severe early-onset obesity,1,130 -KSR2,HGNC:18610,Severe early-onset obesity,2,130 -KDM1A,HGNC:29079,Congenital adrenal hypoplasia,1,145 -CYP11B2,HGNC:2592,Congenital adrenal hypoplasia,2,145 -POLE,HGNC:9177,Congenital adrenal hypoplasia,3,145 -CYP11A1,HGNC:2590,Congenital adrenal hypoplasia,3,145 -AAAS,HGNC:13666,Congenital adrenal hypoplasia,3,145 -STAR,HGNC:11359,Congenital adrenal hypoplasia,3,145 
-ABCD1,HGNC:61,Congenital adrenal hypoplasia,1,145 -NNT,HGNC:7863,Congenital adrenal hypoplasia,3,145 -POMC,HGNC:9201,Congenital adrenal hypoplasia,1,145 -TXNRD2,HGNC:18155,Congenital adrenal hypoplasia,1,145 -AIRE,HGNC:360,Congenital adrenal hypoplasia,3,145 -CYP17A1,HGNC:2593,Congenital adrenal hypoplasia,1,145 -SAMD9,HGNC:1348,Congenital adrenal hypoplasia,3,145 -NR0B1,HGNC:7960,Congenital adrenal hypoplasia,3,145 -CYP21A2,HGNC:2600,Congenital adrenal hypoplasia,1,145 -TBX19,HGNC:11596,Congenital adrenal hypoplasia,3,145 -MCM4,HGNC:6947,Congenital adrenal hypoplasia,2,145 -CDKN1C,HGNC:1786,Congenital adrenal hypoplasia,3,145 -HSD3B2,HGNC:5218,Congenital adrenal hypoplasia,1,145 -MC2R,HGNC:6930,Congenital adrenal hypoplasia,3,145 -MRAP,HGNC:1304,Congenital adrenal hypoplasia,3,145 -NR5A1,HGNC:7983,Congenital adrenal hypoplasia,3,145 -SGPL1,HGNC:10817,Congenital adrenal hypoplasia,3,145 -CBS,HGNC:1550,Cerebral vascular malformations,0,147 -COL5A1,HGNC:2209,Cerebral vascular malformations,2,147 -SMARCAL1,HGNC:11102,Cerebral vascular malformations,1,147 -ADA2,HGNC:1839,Cerebral vascular malformations,2,147 -NOTCH3,HGNC:7883,Cerebral vascular malformations,1,147 -PIK3CA,HGNC:8975,Cerebral vascular malformations,1,147 -ANGPTL6,HGNC:23140,Cerebral vascular malformations,3,147 -PAFAH1B1,HGNC:8574,Cerebral vascular malformations,1,147 -DCX,HGNC:2714,Cerebral vascular malformations,1,147 -CHD4,HGNC:1919,Cerebral vascular malformations,2,147 -SETD5,HGNC:25566,Cerebral vascular malformations,2,147 -DNA2,HGNC:2939,Cerebral vascular malformations,1,147 -CNOT3,HGNC:7879,Cerebral vascular malformations,2,147 -PKD1,HGNC:9008,Cerebral vascular malformations,2,147 -FLVCR2,HGNC:20105,Cerebral vascular malformations,2,147 -YY1AP1,HGNC:30935,Cerebral vascular malformations,3,147 -SLC2A10,HGNC:13444,Cerebral vascular malformations,3,147 -PDCD10,HGNC:8761,Cerebral vascular malformations,3,147 -COL3A1,HGNC:2201,Cerebral vascular malformations,3,147 -TUBB2A,HGNC:12412,Cerebral vascular 
malformations,1,147 -CCM2,HGNC:21708,Cerebral vascular malformations,3,147 -ACTA2,HGNC:130,Cerebral vascular malformations,3,147 -ANIB1,HGNC:17627,Cerebral vascular malformations,0,147 -ATR,HGNC:882,Cerebral vascular malformations,2,147 -MYMY3,HGNC:20769,Cerebral vascular malformations,0,147 -MYMY1,HGNC:16401,Cerebral vascular malformations,0,147 -ELN,HGNC:3327,Cerebral vascular malformations,1,147 -GUCY1A3,HGNC:4685,Cerebral vascular malformations,3,147 -HBB,HGNC:4827,Cerebral vascular malformations,2,147 -NF1,HGNC:7765,Cerebral vascular malformations,2,147 -PKD2,HGNC:9009,Cerebral vascular malformations,2,147 -SMAD4,HGNC:6770,Cerebral vascular malformations,3,147 -SMAD3,HGNC:6769,Cerebral vascular malformations,1,147 -THSD1,HGNC:17754,Cerebral vascular malformations,2,147 -ABCC6,HGNC:57,Cerebral vascular malformations,1,147 -SMAD9,HGNC:6774,Cerebral vascular malformations,2,147 -ATP7A,HGNC:869,Cerebral vascular malformations,1,147 -BRCC3,HGNC:24185,Cerebral vascular malformations,1,147 -COL4A1,HGNC:2202,Cerebral vascular malformations,1,147 -COL4A2,HGNC:2203,Cerebral vascular malformations,1,147 -FBN1,HGNC:3603,Cerebral vascular malformations,1,147 -GLMN,HGNC:14373,Cerebral vascular malformations,1,147 -GNAQ,HGNC:4390,Cerebral vascular malformations,1,147 -HTRA1,HGNC:9476,Cerebral vascular malformations,1,147 -IL6,HGNC:6018,Cerebral vascular malformations,1,147 -LARGE1,HGNC:6511,Cerebral vascular malformations,1,147 -MEF2C,HGNC:6996,Cerebral vascular malformations,1,147 -OPHN1,HGNC:8148,Cerebral vascular malformations,1,147 -POMGNT1,HGNC:19139,Cerebral vascular malformations,1,147 -POMT1,HGNC:9202,Cerebral vascular malformations,1,147 -RELN,HGNC:9957,Cerebral vascular malformations,1,147 -RTTN,HGNC:18654,Cerebral vascular malformations,1,147 -RNF213,HGNC:14539,Cerebral vascular malformations,3,147 -STAMBP,HGNC:16950,Cerebral vascular malformations,1,147 -TEK,HGNC:11724,Cerebral vascular malformations,1,147 -TUBA8,HGNC:12410,Cerebral vascular malformations,1,147 
-TUBB,HGNC:20778,Cerebral vascular malformations,1,147 -ACVRL1,HGNC:175,Cerebral vascular malformations,3,147 -CBL,HGNC:1541,Cerebral vascular malformations,2,147 -ENG,HGNC:3349,Cerebral vascular malformations,3,147 -CEP152,HGNC:29298,Cerebral vascular malformations,2,147 -MYH11,HGNC:7569,Cerebral vascular malformations,2,147 -KRIT1,HGNC:1573,Cerebral vascular malformations,3,147 -PCNT,HGNC:16068,Cerebral vascular malformations,2,147 -ARX,HGNC:18060,Cerebral vascular malformations,1,147 -RASA1,HGNC:9871,Cerebral vascular malformations,3,147 -SAMHD1,HGNC:15925,Cerebral vascular malformations,3,147 -EPHB4,HGNC:3395,Cerebral vascular malformations,2,147 -MRVI1,HGNC:7237,Cerebral vascular malformations,2,147 -GDF2,HGNC:4217,Cerebral vascular malformations,2,147 -TGFBR2,HGNC:11773,Cerebral vascular malformations,1,147 -ACE,HGNC:2707,Cerebral vascular malformations,1,147 -ANTXR1,HGNC:21014,Cerebral vascular malformations,1,147 -CENPJ,HGNC:17272,Cerebral vascular malformations,1,147 -CEP63,HGNC:25815,Cerebral vascular malformations,1,147 -CTSA,HGNC:9251,Cerebral vascular malformations,1,147 -FLT4,HGNC:3767,Cerebral vascular malformations,1,147 -FOXF1,HGNC:3809,Cerebral vascular malformations,1,147 -GLA,HGNC:4296,Cerebral vascular malformations,1,147 -HLA-DQB1,HGNC:4944,Cerebral vascular malformations,1,147 -HLA-DRB1,HGNC:4948,Cerebral vascular malformations,1,147 -KDR,HGNC:6307,Cerebral vascular malformations,1,147 -LAMB1,HGNC:6486,Cerebral vascular malformations,1,147 -LAMC3,HGNC:6494,Cerebral vascular malformations,1,147 -NDE1,HGNC:17619,Cerebral vascular malformations,1,147 -NIN,HGNC:14906,Cerebral vascular malformations,1,147 -OCLN,HGNC:8104,Cerebral vascular malformations,1,147 -PIK3R2,HGNC:8980,Cerebral vascular malformations,1,147 -PTEN,HGNC:9588,Cerebral vascular malformations,1,147 -RBBP8,HGNC:9891,Cerebral vascular malformations,1,147 -SRPX2,HGNC:30668,Cerebral vascular malformations,1,147 -TUBA1A,HGNC:20766,Cerebral vascular malformations,1,147 
-TUBB2B,HGNC:30829,Cerebral vascular malformations,1,147 -TUBB3,HGNC:20772,Cerebral vascular malformations,1,147 -TUBG1,HGNC:12417,Cerebral vascular malformations,1,147 -JAG1,HGNC:6188,Cerebral vascular malformations,1,147 -TGFB2,HGNC:11768,Cerebral vascular malformations,1,147 -TGFBR1,HGNC:11772,Cerebral vascular malformations,1,147 -ADGRG1,HGNC:4512,Cerebral vascular malformations,1,147 -CRB1,HGNC:2343,Cerebral vascular malformations,1,147 -HLA-B,HGNC:4932,Cerebral vascular malformations,1,147 -POMT2,HGNC:19743,Cerebral vascular malformations,1,147 -TMEM5,HGNC:13530,Cerebral vascular malformations,1,147 -TRAIP,HGNC:30764,Cerebral vascular malformations,1,147 -VLDLR,HGNC:12698,Cerebral vascular malformations,1,147 -WDR62,HGNC:24502,Cerebral vascular malformations,1,147 -RNF113A,HGNC:12974,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -GTF2E2,HGNC:4651,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -CARS,HGNC:1493,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -TARS,HGNC:11572,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 -MARS,HGNC:6898,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",1,77 -AARS,HGNC:20,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 -POLH,HGNC:9181,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -ERCC1,HGNC:3433,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -MRE11,HGNC:7230,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",2,77 -ERCC5,HGNC:3437,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -XPA,HGNC:12814,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -ERCC3,HGNC:3435,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -ERCC4,HGNC:3436,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -ERCC8,HGNC:3439,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne 
syndrome",3,77 -ERCC2,HGNC:3434,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -GTF2H5,HGNC:21157,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -DDB2,HGNC:2718,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -ERCC6,HGNC:3438,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -XPC,HGNC:12816,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -DDB1,HGNC:2717,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",1,77 -MPLKIP,HGNC:16002,"Xeroderma pigmentosum, Trichothiodystrophy or Cockayne syndrome",3,77 -APOB,HGNC:603,Familial chylomicronaemia syndrome (FCS),1,527 -APOA5,HGNC:17288,Familial chylomicronaemia syndrome (FCS),3,527 -LPL,HGNC:6677,Familial chylomicronaemia syndrome (FCS),3,527 -CREB3L3,HGNC:18855,Familial chylomicronaemia syndrome (FCS),3,527 -APOC2,HGNC:609,Familial chylomicronaemia syndrome (FCS),3,527 -APOE,HGNC:613,Familial chylomicronaemia syndrome (FCS),3,527 -GPD1,HGNC:4455,Familial chylomicronaemia syndrome (FCS),3,527 -GPIHBP1,HGNC:24945,Familial chylomicronaemia syndrome (FCS),3,527 -LMF1,HGNC:14154,Familial chylomicronaemia syndrome (FCS),3,527 -LIPI,HGNC:18821,Familial chylomicronaemia syndrome (FCS),1,527 From de227051b6ad5b474b85cb3094e0c005b2e47373 Mon Sep 17 00:00:00 2001 From: George Doyle Date: Mon, 17 Jun 2024 15:35:47 +0100 Subject: [PATCH 4/5] Added panelapp_gene_query script --- panelapp_gene_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/panelapp_gene_query.py b/panelapp_gene_query.py index 1e0da4f..584089a 100644 --- a/panelapp_gene_query.py +++ b/panelapp_gene_query.py @@ -4,7 +4,7 @@ Simple script to generate CSV containing all current gene/panel relations using PanelApp API Includes data only from signed-off panels -Stores gene_symbol, hgnc_id, panel[name], confidence_level +Stores gene_symbol, hgnc_id, panel[name], confidence_level and panel_id Usage: python3 
panelapp_gene_query.py panelapp_gene_data.csv will be saved in the current working directory From 31f8b84e4a0b5eb95668cc88be21d06913c7b6ff Mon Sep 17 00:00:00 2001 From: George Doyle Date: Mon, 24 Jun 2024 10:41:16 +0100 Subject: [PATCH 5/5] Temp rm oncodeep command script, added settings/.gitignore and updated README --- .gitignore | 1 + README.md | 21 ------- okd_qc_commands.py | 139 -------------------------------------------- panel_gene_query.py | 0 settings.json | 37 ++++++++++++ 5 files changed, 38 insertions(+), 160 deletions(-) create mode 100644 .gitignore delete mode 100644 okd_qc_commands.py delete mode 100644 panel_gene_query.py create mode 100644 settings.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7598d60 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +okd_qc_commands.py \ No newline at end of file diff --git a/README.md b/README.md index 8bca7d7..bcdfd76 100644 --- a/README.md +++ b/README.md @@ -63,27 +63,6 @@ If run with the name of a TSO related project in DNA nexus as an argument, this The resulting command will be sent to std out with just the APP_ID and MOKAGUYS_AUTH_TOKEN needing to be added to each line, this can be done using find & replace. IMPORTANT: HD200 and NTC samples (HD200 or 00000_00000 in sample name) should have their lines removed manually as these should not be uploaded. -## okd_qc_commands.py - -Automates the generation of DNAnexus MultiQC/FastQC commands for a given OncoDeep runfolder. -Must be ran on workstation where authkey file is present. - -### Usage: -``` -python3 qcgen.py -p {dnanexus_project_id} -f {illumina_runfolder_name} -``` -### Arguments: --p, --project The DNAnexus project ID for the run (i.e, project-XXXXXXXXXXX). --f, --fastq_dir The name of the run folder (i.e., 240521_A01229_0331_AHWGJGDRX3). - -### Output: -Shell script file within the current working directory named according to the run. 
This can be run from any location on the genomics workstation - -### Testing: -Concordance testing using diff commands performed against pre-existing manually generated multiqc_fastqc.sh scripts in the dx_run_commands directory. - -Only deviation was in run 240507_A01229_0324_AH5CYWDRX5, where a manual entry mistake was identified. - ## panelapp_gene_query.py Generates a list of all signed-off gene/panel relations using the PanelApp API. diff --git a/okd_qc_commands.py b/okd_qc_commands.py deleted file mode 100644 index a55aa2a..0000000 --- a/okd_qc_commands.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -DNAnexus FastQC and MultiQC Command Generator - -Created: 22/05/2024 -Author: Bioinformatics @ Synnovis (Guy's & St. Thomas' NHS Foundation Trust) - -This script automates the generation of DNAnexus MultiQC/FastQC commands for a given OncoDeep runfolder. -Must be ran on workstation where authkey file is present. - -Usage: - python3 qcgen.py -p {dnanexus_project_id} -f {illumina_runfolder_name} - -Arguments: - -p, --project The DNAnexus project ID for the run. - -f, --fastq_dir The name of the run folder (i.e., 240521_A01229_0331_AHWGJGDRX3). - -Output: - Generates a shell script with commands to run FastQC and MultiQC on OKD fastq files. 
-""" - -import os -import argparse -import re - -def parse_arguments(): - '''Parse command line arguments''' - parser = argparse.ArgumentParser(description="Generate a shell script for running FastQC and MultiQC on OKD fastq files.") - parser.add_argument("-p", "--project", required=True, help="The DNAnexus project ID for the run.") - parser.add_argument("-f", "--fastq_dir", required=True, help="The name of the run folder (i.e., 240521_A01229_0331_AHWGJGDRX3).") - return parser.parse_args() - -def get_fastq_dir(fastq_dir): - '''Construct the directory path containing fastq files''' - return os.path.join("/media/data3/share", fastq_dir, "Data/Intensities/BaseCalls") - -def get_okd_id(fastq_dir): - '''Extract the OKD ID''' - fastq_files = os.listdir(fastq_dir) - for file in fastq_files: - # Regex to extract OKD_XXXXX ID - match = re.search(r'OKD\d{5}', file) - if match: - return match.group() - raise ValueError("No OKD ID found in fastq files.") - -def get_fastq_files(fastq_dir): - '''Create list of all OKD fastq files''' - # Use startswith to exclude undetermined fastqs - return [f for f in os.listdir(fastq_dir) if f.startswith("OKD") and f.endswith(".fastq.gz")] - -def sort_fastq_files(fastq_files): - '''Sort the fastq files based on sample number''' - def sort_key(filename): - # Get the RX sample ID from the end of the fastq name - sample_name = "_".join(filename.split("_")[:-2]) - # Remove letter to leave int for sorting - sample_number = int(re.search(r'S(\d+)', sample_name).group(1)) - return sample_number - return sorted(fastq_files, key=sort_key) - -def read_auth_token(token_path): - '''Read DNAnexus auth token from file''' - with open(token_path, 'r') as token_file: - return token_file.read().strip() - -def generate_script(fastq_folder_name, okd_id, provided_fastq_folder_name, fastq_files, project_id, auth_token): - '''Generate the shell script''' - script_filename = "{}_multiqc_fastqc.sh".format(fastq_folder_name) - with open(script_filename, "w") as 
script_file: - # Create and open the output shell script file - script_file.write("depends_list=''\n\n") - for r1_file in fastq_files: - if "_R1_" in r1_file: - r2_file = r1_file.replace("_R1_", "_R2_") - sample_name = "_".join(r1_file.split("_")[:-3]) - - # Write the FastQC command for each pair of fastq files - script_file.write( - "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/fastqc_v1.4.0 --priority high -y " - "--name {sample_name} -ireads={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/Data/Intensities/BaseCalls/{r1_file} " - "-ireads={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/Data/Intensities/BaseCalls/{r2_file} " - "--dest={provided_fastq_folder_name}:/ --brief --auth-token {auth_token})\n".format( - sample_name=sample_name, provided_fastq_folder_name=provided_fastq_folder_name, fastq_folder_name=fastq_folder_name, r1_file=r1_file, r2_file=r2_file, okd_id=okd_id, auth_token=auth_token - ) - ) - script_file.write("depends_list=\"${depends_list} -d ${jobid} \"\n") - - # Write the MultiQC command - script_file.write( - "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/multiqc_v1.18.0 --priority high -y " - "--instance-type mem1_ssd1_v2_x4 -iproject_for_multiqc={provided_fastq_folder_name} " - "-icoverage_level=100 --project={project_id} $depends_list --brief --auth-token {auth_token})\n".format( - provided_fastq_folder_name=provided_fastq_folder_name, project_id=project_id, auth_token=auth_token - ) - ) - script_file.write("depends_list=\"${depends_list} -d ${jobid} \"\n") - - # Write the upload_multiqc command - script_file.write( - "jobid=$(dx run project-ByfFPz00jy1fk6PjpZ95F27J:/Apps/upload_multiqc_v1.4.0 --priority high -y " - "--instance-type mem1_ssd1_v2_x2 -imultiqc_html=$jobid:multiqc_report -imultiqc_data_input=$jobid:multiqc " - "-imultiqc_data_input={provided_fastq_folder_name}:/{fastq_folder_name}_{okd_id}/{fastq_folder_name}.illumina_lane_metrics " - "--project={project_id} $depends_list --brief 
--auth-token {auth_token})\n".format( - provided_fastq_folder_name=provided_fastq_folder_name, fastq_folder_name=fastq_folder_name, okd_id=okd_id, project_id=project_id, auth_token=auth_token - ) - ) - - print("Shell script generated successfully: {}".format(script_filename)) - -def main(): - # Parse command line arguments - args = parse_arguments() - - # Construct the directory path for fastq files - fastq_dir = get_fastq_dir(args.fastq_dir) - - # Extract the OKD ID from fastq files - okd_id = get_okd_id(fastq_dir) - - # Get the fastq folder name - fastq_folder_name = os.path.basename(os.path.normpath(os.path.join(fastq_dir, "..", "..", ".."))) - - # Create the provided fastq folder name - provided_fastq_folder_name = "003_{}_{}".format(fastq_folder_name, okd_id) - - # Get the list of fastq files - fastq_files = get_fastq_files(fastq_dir) - - # Sort the fastq files - sorted_fastq_files = sort_fastq_files(fastq_files) - - # Read the DNAnexus auth token from file - auth_token = read_auth_token("/usr/local/src/mokaguys/.dnanexus_auth_token") - - # Generate the shell script with FastQC and MultiQC commands - generate_script(fastq_folder_name, okd_id, provided_fastq_folder_name, sorted_fastq_files, args.project, auth_token) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/panel_gene_query.py b/panel_gene_query.py deleted file mode 100644 index e69de29..0000000 diff --git a/settings.json b/settings.json new file mode 100644 index 0000000..c49072e --- /dev/null +++ b/settings.json @@ -0,0 +1,37 @@ +{ + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.envFile": "${workspaceFolder}/.venv", + "python.analysis.extraPaths": [ + ], + "editor.formatOnSaveMode": "file", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + } + }, + "isort.args": [ + "--profile", + "black" + ], + "flake8.args": [ + "--max-line-length=120" + ], + "pylint.args": [ + "--max-line-length=120" + ], + "black-formatter.args": [ + "--line-length", + "120" + ], + "python.analysis.typeCheckingMode": "basic" +} \ No newline at end of file