diff --git a/.vscode/settings.json b/.vscode/settings.json index aaafda97f..ecc456889 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -25,5 +25,8 @@ "python.testing.pytestEnabled": true, "mypy-type-checker.severity": { "error": "Information" - } + }, + "yaml.extension.recommendations": false, + "workbench.remoteIndicator.showExtensionRecommendations": false, + "extensions.ignoreRecommendations": true } diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index c8f67d418..c61863b86 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -37,13 +37,13 @@ gnomad_public_bucket: gs://gcp-public-data--gnomad/release/ ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht liftover_ht_path: ${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht -# variant_annotation +# GnomAD variant set: gnomad_genomes_path: ${datasets.gnomad_public_bucket}4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/ # Others chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain -vep_consequences: ${datasets.static_assets}/vep_consequences.tsv +vep_consequences: ${datasets.static_assets}/variant_consequence_to_score.tsv anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed javierre: ${datasets.static_assets}/javierre_2016_preprocessed jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv @@ -55,7 +55,7 @@ finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv # Dev output datasets -variant_annotation: ${datasets.outputs}/variant_annotation +gnomad_variants: 
${datasets.outputs}/gnomad_variants study_locus: ${datasets.outputs}/study_locus summary_statistics: ${datasets.outputs}/summary_statistics study_locus_overlap: ${datasets.outputs}/study_locus_overlap @@ -68,6 +68,8 @@ catalog_study_locus: ${datasets.study_locus}/catalog_study_locus from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats from_sumstats_pics: ${datasets.credible_set}/from_sumstats +vep_output_path: gs://genetics_etl_python_playground/vep/full_variant_index_vcf + # ETL output datasets: l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json l2g_model: ${datasets.release_folder}/locus_to_gene_model/classifier.skops @@ -78,4 +80,4 @@ study_index: ${datasets.release_folder}/study_index variant_index: ${datasets.release_folder}/variant_index credible_set: ${datasets.release_folder}/credible_set gene_index: ${datasets.release_folder}/gene_index -v2g: ${datasets.release_folder}/variant_to_gene +variant_to_gene: ${datasets.release_folder}/variant_to_gene diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml deleted file mode 100644 index 55a9503ce..000000000 --- a/config/step/ot_variant_annotation.yaml +++ /dev/null @@ -1,19 +0,0 @@ -defaults: - - variant_annotation - -variant_annotation_path: ${datasets.variant_annotation} -gnomad_genomes_path: ${datasets.gnomad_genomes_path} -chain_38_37: ${datasets.chain_38_37} -gnomad_variant_populations: - - afr # African-American - - amr # American Admixed/Latino - - ami # Amish ancestry - - asj # Ashkenazi Jewish - - eas # East Asian - - fin # Finnish - - nfe # Non-Finnish European - - mid # Middle Eastern - - sas # South Asian - - remaining # Other -# The version will of the gnomad will be inferred from ld_matrix_template and appended to the ld_index_out. 
-use_version_from_input: true diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml index 3834196b2..00b6b1602 100644 --- a/config/step/ot_variant_index.yaml +++ b/config/step/ot_variant_index.yaml @@ -1,6 +1,6 @@ defaults: - variant_index -variant_annotation_path: ${datasets.variant_annotation} -credible_set_path: ${datasets.credible_set} +vep_output_json_path: ${datasets.vep_output_path} +gnomad_variant_annotations_path: ${datasets.gnomad_variants} variant_index_path: ${datasets.variant_index} diff --git a/config/step/ot_variant_to_gene.yaml b/config/step/ot_variant_to_gene.yaml index 1ac6d2fbe..7187a0625 100644 --- a/config/step/ot_variant_to_gene.yaml +++ b/config/step/ot_variant_to_gene.yaml @@ -2,7 +2,6 @@ defaults: - variant_to_gene variant_index_path: ${datasets.variant_index} -variant_annotation_path: ${datasets.variant_annotation} gene_index_path: ${datasets.gene_index} vep_consequences_path: ${datasets.vep_consequences} liftover_chain_file_path: ${datasets.chain_37_38} @@ -11,4 +10,4 @@ interval_sources: javierre: ${datasets.javierre} jung: ${datasets.jung} thurman: ${datasets.thurman} -v2g_path: ${datasets.v2g} +v2g_path: ${datasets.variant_to_gene} diff --git a/docs/assets/imgs/ensembl_logo.png b/docs/assets/imgs/ensembl_logo.png new file mode 100644 index 000000000..83fd25751 Binary files /dev/null and b/docs/assets/imgs/ensembl_logo.png differ diff --git a/docs/python_api/datasets/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md deleted file mode 100644 index f82210b6e..000000000 --- a/docs/python_api/datasets/variant_annotation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Variant annotation ---- - -::: gentropy.dataset.variant_annotation.VariantAnnotation - -## Schema - ---8<-- "assets/schemas/variant_annotation.md" diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md index f05a2b834..e6e081b21 100644 --- 
a/docs/python_api/datasources/_datasources.md +++ b/docs/python_api/datasources/_datasources.md @@ -23,7 +23,8 @@ This section contains information about the data source harmonisation tools avai ## Variant annotation/validation 1. [GnomAD](gnomad/_gnomad.md) v4.0 -1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data) +2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data) +3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html) ## Linkage desiquilibrium diff --git a/docs/python_api/datasources/ensembl/_ensembl.md b/docs/python_api/datasources/ensembl/_ensembl.md new file mode 100644 index 000000000..56c49870a --- /dev/null +++ b/docs/python_api/datasources/ensembl/_ensembl.md @@ -0,0 +1,10 @@ +--- +title: Ensembl annotations +--- + +
+ +

Ensembl

+
+ +[Ensembl](https://www.ensembl.org/index.html) provides a diverse set of genetic data Gentropy takes advantage of including gene set, and variant annotations. diff --git a/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md new file mode 100644 index 000000000..c956ed3e1 --- /dev/null +++ b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md @@ -0,0 +1,5 @@ +--- +title: Variant effector parser +--- + +::: gentropy.datasource.ensembl.vep_parser.VariantEffectPredictorParser diff --git a/docs/python_api/steps/ld_index.md b/docs/python_api/steps/ld_index.md index bf8b9b58e..d8f61a528 100644 --- a/docs/python_api/steps/ld_index.md +++ b/docs/python_api/steps/ld_index.md @@ -1,5 +1,5 @@ --- -title: ld_index +title: GnomAD Linkage data ingestion --- -::: gentropy.ld_index.LDIndexStep +::: gentropy.gnomad_ingestion.LDIndexStep diff --git a/docs/python_api/steps/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md index e65a071b2..2b5582df4 100644 --- a/docs/python_api/steps/variant_annotation_step.md +++ b/docs/python_api/steps/variant_annotation_step.md @@ -1,5 +1,5 @@ --- -title: variant_annotation +title: GnomAD variant data ingestion --- -::: gentropy.variant_annotation.VariantAnnotationStep +::: gentropy.gnomad_ingestion.GnomadVariantIndexStep diff --git a/docs/python_api/steps/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md index 1a3e56af8..db1c1fd20 100644 --- a/docs/python_api/steps/variant_to_gene_step.md +++ b/docs/python_api/steps/variant_to_gene_step.md @@ -2,4 +2,4 @@ title: variant_to_gene --- -::: gentropy.v2g.V2GStep +::: gentropy.variant_to_gene.V2GStep diff --git a/poetry.lock b/poetry.lock index b181572d4..432c3d4cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6881,6 +6881,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/src/airflow/dags/gnomad_preprocess.py b/src/airflow/dags/gnomad_preprocess.py index 22e7fa056..03962bec5 100644 --- a/src/airflow/dags/gnomad_preprocess.py +++ b/src/airflow/dags/gnomad_preprocess.py @@ -1,4 +1,4 @@ -"""Airflow DAG for the Preprocess part of the pipeline.""" +"""Airflow DAG for the Preprocess GnomAD datasets - LD index and GnomAD variant set.""" from __future__ import annotations @@ -11,13 +11,13 @@ ALL_STEPS = [ "ot_ld_index", - "ot_variant_annotation", + "ot_gnomad_variants", ] with DAG( dag_id=Path(__file__).stem, - description="Open Targets Genetics — Preprocess", + description="Open Targets Genetics — GnomAD Preprocess", default_args=common.shared_dag_args, **common.shared_dag_kwargs, ): diff --git a/src/gentropy/assets/data/so_mappings.json b/src/gentropy/assets/data/so_mappings.json new file mode 100644 index 000000000..8a087b6f7 --- /dev/null +++ b/src/gentropy/assets/data/so_mappings.json @@ -0,0 +1,43 @@ +{ + "transcript_ablation": "SO_0001893", + "splice_acceptor_variant": 
"SO_0001574", + "splice_donor_variant": "SO_0001575", + "stop_gained": "SO_0001587", + "frameshift_variant": "SO_0001589", + "stop_lost": "SO_0001578", + "start_lost": "SO_0002012", + "transcript_amplification": "SO_0001889", + "feature_elongation": "SO_0001907", + "feature_truncation": "SO_0001906", + "inframe_insertion": "SO_0001821", + "inframe_deletion": "SO_0001822", + "missense_variant": "SO_0001583", + "protein_altering_variant": "SO_0001818", + "splice_donor_5th_base_variant": "SO_0001787", + "splice_region_variant": "SO_0001630", + "splice_donor_region_variant": "SO_0002170", + "splice_polypyrimidine_tract_variant": "SO_0002169", + "incomplete_terminal_codon_variant": "SO_0001626", + "start_retained_variant": "SO_0002019", + "stop_retained_variant": "SO_0001567", + "synonymous_variant": "SO_0001819", + "coding_sequence_variant": "SO_0001580", + "mature_miRNA_variant": "SO_0001620", + "5_prime_UTR_variant": "SO_0001623", + "3_prime_UTR_variant": "SO_0001624", + "non_coding_transcript_exon_variant": "SO_0001792", + "intron_variant": "SO_0001627", + "NMD_transcript_variant": "SO_0001621", + "non_coding_transcript_variant": "SO_0001619", + "coding_transcript_variant": "SO_0001968", + "upstream_gene_variant": "SO_0001631", + "downstream_gene_variant": "SO_0001632", + "TFBS_ablation": "SO_0001895", + "TFBS_amplification": "SO_0001892", + "TF_binding_site_variant": "SO_0001782", + "regulatory_region_ablation": "SO_0001894", + "regulatory_region_amplification": "SO_0001891", + "regulatory_region_variant": "SO_0001566", + "intergenic_variant": "SO_0001628", + "sequence_variant": "SO_0001060" +} diff --git a/src/gentropy/assets/schemas/variant_annotation.json b/src/gentropy/assets/schemas/variant_annotation.json deleted file mode 100644 index ab8767389..000000000 --- a/src/gentropy/assets/schemas/variant_annotation.json +++ /dev/null @@ -1,215 +0,0 @@ -{ - "type": "struct", - "fields": [ - { - "name": "variantId", - "type": "string", - "nullable": false, - 
"metadata": {} - }, - { - "name": "chromosome", - "type": "string", - "nullable": false, - "metadata": {} - }, - { - "name": "position", - "type": "integer", - "nullable": false, - "metadata": {} - }, - { - "name": "referenceAllele", - "type": "string", - "nullable": false, - "metadata": {} - }, - { - "name": "alternateAllele", - "type": "string", - "nullable": false, - "metadata": {} - }, - { - "name": "chromosomeB37", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "positionB37", - "type": "integer", - "nullable": true, - "metadata": {} - }, - { - "name": "alleleType", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "rsIds", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true - }, - "nullable": true, - "metadata": {} - }, - { - "name": "alleleFrequencies", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "populationName", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "alleleFrequency", - "type": "double", - "nullable": true, - "metadata": {} - } - ] - }, - "containsNull": true - }, - "nullable": false, - "metadata": {} - }, - { - "name": "inSilicoPredictors", - "nullable": false, - "metadata": {}, - "type": { - "type": "struct", - "fields": [ - { - "name": "cadd", - "nullable": true, - "metadata": {}, - "type": { - "type": "struct", - "fields": [ - { - "name": "raw", - "type": "float", - "nullable": true, - "metadata": {} - }, - { - "name": "phred", - "type": "float", - "nullable": true, - "metadata": {} - } - ] - } - }, - { - "name": "revelMax", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "spliceaiDsMax", - "type": "float", - "nullable": true, - "metadata": {} - }, - { - "name": "pangolinLargestDs", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "phylop", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "siftMax", 
- "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "polyphenMax", - "type": "double", - "nullable": true, - "metadata": {} - } - ] - } - }, - { - "name": "vep", - "type": { - "type": "struct", - "fields": [ - { - "name": "mostSevereConsequence", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "transcriptConsequences", - "type": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [ - { - "name": "aminoAcids", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "consequenceTerms", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true - }, - "nullable": true, - "metadata": {} - }, - { - "name": "geneId", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "lof", - "type": "string", - "nullable": true, - "metadata": {} - } - ] - }, - "containsNull": true - }, - "nullable": true, - "metadata": {} - } - ] - }, - "nullable": false, - "metadata": {} - } - ] -} diff --git a/src/gentropy/assets/schemas/variant_index.json b/src/gentropy/assets/schemas/variant_index.json index c6a3702c9..d9be4e50f 100644 --- a/src/gentropy/assets/schemas/variant_index.json +++ b/src/gentropy/assets/schemas/variant_index.json @@ -1,53 +1,188 @@ { - "type": "struct", "fields": [ { + "metadata": {}, "name": "variantId", - "type": "string", "nullable": false, - "metadata": {} + "type": "string" }, { + "metadata": {}, "name": "chromosome", - "type": "string", "nullable": false, - "metadata": {} + "type": "string" }, { + "metadata": {}, "name": "position", - "type": "integer", "nullable": false, - "metadata": {} + "type": "integer" }, { + "metadata": {}, "name": "referenceAllele", - "type": "string", "nullable": false, - "metadata": {} + "type": "string" }, { + "metadata": {}, "name": "alternateAllele", - "type": "string", "nullable": false, - "metadata": {} + "type": "string" }, { - "name": "chromosomeB37", - "type": "string", - "nullable": true, - 
"metadata": {} + "metadata": {}, + "name": "inSilicoPredictors", + "nullable": false, + "type": { + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "method", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "assessment", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "score", + "nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "assessmentFlag", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "targetId", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "type": "array" + } + }, + { + "metadata": {}, + "name": "mostSevereConsequenceId", + "nullable": false, + "type": "string" }, { - "name": "positionB37", - "type": "integer", + "metadata": {}, + "name": "transcriptConsequences", "nullable": true, - "metadata": {} + "type": { + "containsNull": false, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "variantFunctionalConsequenceIds", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "aminoAcidChange", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "uniprotAccessions", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "isEnsemblCanonical", + "nullable": false, + "type": "boolean" + }, + { + "metadata": {}, + "name": "codons", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "distance", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "targetId", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "impact", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "lofteePrediction", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "siftPrediction", + 
"nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "polyphenPrediction", + "nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "transcriptId", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "type": "array" + } }, { - "name": "alleleType", - "type": "string", - "nullable": false, - "metadata": {} + "metadata": {}, + "name": "rsIds", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } }, { "name": "alleleFrequencies", @@ -76,88 +211,31 @@ "metadata": {} }, { - "name": "inSilicoPredictors", - "nullable": false, "metadata": {}, + "name": "dbXrefs", + "nullable": false, "type": { - "type": "struct", - "fields": [ - { - "name": "cadd", - "nullable": true, - "metadata": {}, - "type": { - "type": "struct", - "fields": [ - { - "name": "raw", - "type": "float", - "nullable": true, - "metadata": {} - }, - { - "name": "phred", - "type": "float", - "nullable": true, - "metadata": {} - } - ] + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "source", + "nullable": true, + "type": "string" } - }, - { - "name": "revelMax", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "spliceaiDsMax", - "type": "float", - "nullable": true, - "metadata": {} - }, - { - "name": "pangolinLargestDs", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "phylop", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "siftMax", - "type": "double", - "nullable": true, - "metadata": {} - }, - { - "name": "polyphenMax", - "type": "double", - "nullable": true, - "metadata": {} - } - ] + ], + "type": "struct" + }, + "type": "array" } - }, - { - "name": "mostSevereConsequence", - "type": "string", - "nullable": true, - "metadata": {} - }, - { - "name": "rsIds", - "type": { - "type": "array", - 
"elementType": "string", - "containsNull": true - }, - "nullable": true, - "metadata": {} } - ] + ], + "type": "struct" } diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json new file mode 100644 index 000000000..ecad3ea1e --- /dev/null +++ b/src/gentropy/assets/schemas/vep_json_output.json @@ -0,0 +1,531 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "allele_string", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "assembly_name", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "intergenic_consequences", + "nullable": true, + "type": { + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "cadd_phred", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "cadd_raw", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "consequence_terms", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "impact", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "variant_allele", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "gene_id", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "type": "array" + } + }, + { + "metadata": {}, + "name": "colocated_variants", + "nullable": true, + "type": { + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "allele_string", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "clin_sig", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "clin_sig_allele", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "end", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "id", + 
"nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "phenotype_or_disease", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "pubmed", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "long", + "type": "array" + } + }, + { + "metadata": {}, + "name": "seq_region_name", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "start", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "strand", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "var_synonyms", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "ClinVar", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "LMDD", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "OIVD", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "OMIM", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "double", + "type": "array" + } + }, + { + "metadata": {}, + "name": "PharmGKB", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "PhenCode", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "UniProt", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "dbPEX", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + } + ], + "type": "struct" + } + } + ], + "type": "struct" + }, + "type": "array" + } + }, + { + "metadata": {}, + "name": "end", + 
"nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "input", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "most_severe_consequence", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "seq_region_name", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "start", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "strand", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "transcript_consequences", + "nullable": true, + "type": { + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "alphamissense", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "am_class", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "am_pathogenicity", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "amino_acids", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "cadd_phred", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "cadd_raw", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "canonical", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "cdna_end", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "cdna_start", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "cds_end", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "cds_start", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "codons", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "consequence_terms", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": 
{}, + "name": "distance", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "flags", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "gene_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "impact", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "lof", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "lof_filter", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "lof_flags", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "lof_info", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "polyphen_prediction", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "polyphen_score", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "protein_end", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "protein_start", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "sift_prediction", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "sift_score", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "strand", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "swissprot", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "transcript_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "trembl", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "uniparc", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "uniprot_isoform", + 
"nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "variant_allele", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "type": "array" + } + } + ], + "type": "struct" +} diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 4ebf4cbe8..f7ba73e98 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -180,7 +180,7 @@ class LDIndexConfig(StepConfig): ] ) use_version_from_input: bool = False - _target_: str = "gentropy.ld_index.LDIndexStep" + _target_: str = "gentropy.gnomad_ingestion.LDIndexStep" @dataclass @@ -302,8 +302,8 @@ class UkbPppEurConfig(StepConfig): @dataclass -class VariantAnnotationConfig(StepConfig): - """Variant annotation step configuration.""" +class GnomadVariantConfig(StepConfig): + """Gnomad variant ingestion step configuration.""" session: Any = field( default_factory=lambda: { @@ -312,7 +312,6 @@ class VariantAnnotationConfig(StepConfig): ) variant_annotation_path: str = MISSING gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/" - chain_38_37: str = "gs://hail-common/references/grch38_to_grch37.over.chain.gz" gnomad_variant_populations: list[str] = field( default_factory=lambda: [ "afr", # African-American @@ -328,16 +327,16 @@ class VariantAnnotationConfig(StepConfig): ] ) use_version_from_input: bool = False - _target_: str = "gentropy.variant_annotation.VariantAnnotationStep" + _target_: str = "gentropy.gnomad_ingestion.GnomadVariantIndexStep" @dataclass class VariantIndexConfig(StepConfig): """Variant index step configuration.""" - variant_annotation_path: str = MISSING - credible_set_path: str = MISSING + vep_output_json_path: str = MISSING variant_index_path: str = MISSING + gnomad_variant_annotations_path: str | None = None _target_: str = "gentropy.variant_index.VariantIndexStep" @@ -346,7 +345,6 @@ class VariantToGeneConfig(StepConfig): 
"""V2G step configuration.""" variant_index_path: str = MISSING - variant_annotation_path: str = MISSING gene_index_path: str = MISSING vep_consequences_path: str = MISSING liftover_chain_file_path: str = MISSING @@ -371,7 +369,7 @@ class VariantToGeneConfig(StepConfig): ) interval_sources: Dict[str, str] = field(default_factory=dict) v2g_path: str = MISSING - _target_: str = "gentropy.v2g.V2GStep" + _target_: str = "gentropy.variant_to_gene.V2GStep" @dataclass @@ -491,8 +489,8 @@ def register_config() -> None: ) cs.store(group="step", name="pics", node=PICSConfig) + cs.store(group="step", name="variant_annotation", node=GnomadVariantConfig) cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig) - cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig) cs.store(group="step", name="variant_index", node=VariantIndexConfig) cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig) cs.store( diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py index 4bafbc288..a9e24dcab 100644 --- a/src/gentropy/dataset/variant_index.py +++ b/src/gentropy/dataset/variant_index.py @@ -1,78 +1,295 @@ -"""Variant index dataset.""" +"""Dataset definition for variant annotation.""" + from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING -import pyspark.sql.functions as f +from pyspark.sql import functions as f from gentropy.common.schemas import parse_spark_schema -from gentropy.common.spark_helpers import nullify_empty_array +from gentropy.common.spark_helpers import ( + get_record_with_maximum_value, + normalise_column, +) from gentropy.dataset.dataset import Dataset -from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.v2g import V2G if TYPE_CHECKING: + from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType - from 
gentropy.dataset.variant_annotation import VariantAnnotation - @dataclass class VariantIndex(Dataset): - """Variant index dataset. - - Variant index dataset is the result of intersecting the variant annotation dataset with the variants with V2D available information. - """ + """Dataset for representing variants and methods applied on them.""" @classmethod def get_schema(cls: type[VariantIndex]) -> StructType: - """Provides the schema for the VariantIndex dataset. + """Provides the schema for the variant index dataset. Returns: StructType: Schema for the VariantIndex dataset """ return parse_spark_schema("variant_index.json") - @classmethod - def from_variant_annotation( - cls: type[VariantIndex], - variant_annotation: VariantAnnotation, - study_locus: StudyLocus, + def add_annotation( + self: VariantIndex, annotation_source: VariantIndex ) -> VariantIndex: - """Initialise VariantIndex from pre-existing variant annotation dataset. + """Import annotation from an other variant index dataset. + + At this point the annotation can be extended with extra cross-references, + in-silico predictions and allele frequencies. 
Args: - variant_annotation (VariantAnnotation): Variant annotation dataset - study_locus (StudyLocus): Study locus dataset with the variants to intersect with the variant annotation dataset + annotation_source (VariantIndex): Annotation to add to the dataset + + Returns: + VariantIndex: VariantIndex dataset with the annotation added + """ + # Columns in the source dataset: + variant_index_columns = [ + # Combining cross-references: + f.array_union(f.col("dbXrefs"), f.col("annotation_dbXrefs")) + if row == "dbXrefs" + # Combining in silico predictors: + else f.array_union( + f.col("inSilicoPredictors"), f.col("annotation_inSilicoPredictors") + ) + if row == "inSilicoPredictors" + # Combining allele frequencies: + else f.array_union( + f.col("alleleFrequencies"), f.col("annotation_alleleFrequencies") + ) + if row == "alleleFrequencies" + # Carrying over all other columns: + else row + for row in self.df.columns + ] + + # Rename columns in the annotation source to avoid conflicts: + annotation = annotation_source.df.select( + *[ + f.col(col).alias(f"annotation_{col}") if col != "variantId" else col + for col in annotation_source.df.columns + ] + ) + + # Join the annotation to the dataset: + return VariantIndex( + _df=( + annotation.join( + f.broadcast(self.df), on="variantId", how="right" + ).select(*variant_index_columns) + ), + _schema=self.schema, + ) + + def max_maf(self: VariantIndex) -> Column: + """Maximum minor allele frequency accross all populations assuming all variants biallelic. Returns: - VariantIndex: Variant index dataset + Column: Maximum minor allele frequency accross all populations. + + Raises: + ValueError: Allele frequencies are not present in the dataset. 
""" - unchanged_cols = [ + if "alleleFrequencies" not in self.df.columns: + raise ValueError("Allele frequencies are not present in the dataset.") + + return f.array_max( + f.transform( + self.df.alleleFrequencies, + lambda af: f.when( + af.alleleFrequency > 0.5, 1 - af.alleleFrequency + ).otherwise(af.alleleFrequency), + ) + ) + + def filter_by_variant(self: VariantIndex, df: DataFrame) -> VariantIndex: + """Filter variant annotation dataset by a variant dataframe. + + Args: + df (DataFrame): A dataframe of variants + + Returns: + VariantIndex: A filtered variant annotation dataset + """ + join_columns = ["variantId", "chromosome"] + + assert all( + col in df.columns for col in join_columns + ), "The variant dataframe must contain the columns 'variantId' and 'chromosome'." + + return VariantIndex( + _df=self._df.join( + f.broadcast(df.select(*join_columns).distinct()), + on=join_columns, + how="inner", + ), + _schema=self.schema, + ) + + def get_transcript_consequence_df( + self: VariantIndex, gene_index: GeneIndex | None = None + ) -> DataFrame: + """Dataframe of exploded transcript consequences. + + Optionally the trancript consequences can be reduced to the universe of a gene index. + + Args: + gene_index (GeneIndex | None): A gene index. Defaults to None. 
+ + Returns: + DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence + """ + # exploding the array removes records without VEP annotation + transript_consequences = self.df.withColumn( + "transcriptConsequence", f.explode("transcriptConsequences") + ).select( "variantId", "chromosome", "position", - "referenceAllele", - "alternateAllele", - "chromosomeB37", - "positionB37", - "alleleType", - "alleleFrequencies", - "inSilicoPredictors", - ] - va_slimmed = variant_annotation.filter_by_variant_df( - study_locus.unique_variants_in_locus() + "transcriptConsequence", + f.col("transcriptConsequence.targetId").alias("geneId"), ) - return cls( + if gene_index: + transript_consequences = transript_consequences.join( + f.broadcast(gene_index.df), + on=["chromosome", "geneId"], + ) + return transript_consequences + + def get_distance_to_tss( + self: VariantIndex, + gene_index: GeneIndex, + max_distance: int = 500_000, + ) -> V2G: + """Extracts variant to gene assignments for variants falling within a window of a gene's TSS. + + Args: + gene_index (GeneIndex): A gene index to filter by. + max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000. 
+ + Returns: + V2G: variant to gene assignments with their distance to the TSS + """ + return V2G( + _df=( + self.df.alias("variant") + .join( + f.broadcast(gene_index.locations_lut()).alias("gene"), + on=[ + f.col("variant.chromosome") == f.col("gene.chromosome"), + f.abs(f.col("variant.position") - f.col("gene.tss")) + <= max_distance, + ], + how="inner", + ) + .withColumn( + "distance", f.abs(f.col("variant.position") - f.col("gene.tss")) + ) + .withColumn( + "inverse_distance", + max_distance - f.col("distance"), + ) + .transform(lambda df: normalise_column(df, "inverse_distance", "score")) + .select( + "variantId", + f.col("variant.chromosome").alias("chromosome"), + "distance", + "geneId", + "score", + f.lit("distance").alias("datatypeId"), + f.lit("canonical_tss").alias("datasourceId"), + ) + ), + _schema=V2G.get_schema(), + ) + + def get_plof_v2g(self: VariantIndex, gene_index: GeneIndex) -> V2G: + """Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm. + + Optionally the trancript consequences can be reduced to the universe of a gene index. + + Args: + gene_index (GeneIndex): A gene index to filter by. 
+ + Returns: + V2G: variant to gene assignments from the LOFTEE algorithm + """ + return V2G( _df=( - va_slimmed.df.select( - *unchanged_cols, - f.col("vep.mostSevereConsequence").alias("mostSevereConsequence"), - # filters/rsid are arrays that can be empty, in this case we convert them to null - nullify_empty_array(f.col("rsIds")).alias("rsIds"), + self.get_transcript_consequence_df(gene_index) + .filter(f.col("transcriptConsequence.lofteePrediction").isNotNull()) + .withColumn( + "isHighQualityPlof", + f.when( + f.col("transcriptConsequence.lofteePrediction") == "HC", True + ).when( + f.col("transcriptConsequence.lofteePrediction") == "LC", False + ), + ) + .withColumn( + "score", + f.when(f.col("isHighQualityPlof"), 1.0).when( + ~f.col("isHighQualityPlof"), 0 + ), + ) + .select( + "variantId", + "chromosome", + "geneId", + "isHighQualityPlof", + f.col("score"), + f.lit("vep").alias("datatypeId"), + f.lit("loftee").alias("datasourceId"), + ) + ), + _schema=V2G.get_schema(), + ) + + def get_most_severe_transcript_consequence( + self: VariantIndex, + vep_consequences: DataFrame, + gene_index: GeneIndex, + ) -> V2G: + """Creates a dataset with variant to gene assignments based on VEP's predicted consequence of the transcript. + + Optionally the trancript consequences can be reduced to the universe of a gene index. + + Args: + vep_consequences (DataFrame): A dataframe of VEP consequences + gene_index (GeneIndex): A gene index to filter by. Defaults to None. 
+ + Returns: + V2G: High and medium severity variant to gene assignments + """ + return V2G( + _df=self.get_transcript_consequence_df(gene_index) + .select( + "variantId", + "chromosome", + f.col("transcriptConsequence.targetId").alias("geneId"), + f.explode( + "transcriptConsequence.variantFunctionalConsequenceIds" + ).alias("variantFunctionalConsequenceId"), + f.lit("vep").alias("datatypeId"), + f.lit("variantConsequence").alias("datasourceId"), + ) + .join( + f.broadcast(vep_consequences), + on="variantFunctionalConsequenceId", + how="inner", + ) + .drop("label") + .filter(f.col("score") != 0) + # A variant can have multiple predicted consequences on a transcript, the most severe one is selected + .transform( + lambda df: get_record_with_maximum_value( + df, ["variantId", "geneId"], "score" ) - .repartition(400, "chromosome") - .sortWithinPartitions("chromosome", "position") ), - _schema=cls.get_schema(), + _schema=V2G.get_schema(), ) diff --git a/src/gentropy/datasource/ensembl/__init__.py b/src/gentropy/datasource/ensembl/__init__.py new file mode 100644 index 000000000..e20ce9a95 --- /dev/null +++ b/src/gentropy/datasource/ensembl/__init__.py @@ -0,0 +1,3 @@ +"""Ensembl's Variant Effect Predictor datasource.""" + +from __future__ import annotations diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py new file mode 100644 index 000000000..fcade35b1 --- /dev/null +++ b/src/gentropy/datasource/ensembl/vep_parser.py @@ -0,0 +1,784 @@ +"""Generating variant index based on Esembl's Variant Effect Predictor output.""" + +from __future__ import annotations + +import importlib.resources as pkg_resources +import json +from itertools import chain +from typing import TYPE_CHECKING, Dict, List + +from pyspark.sql import SparkSession +from pyspark.sql import functions as f +from pyspark.sql import types as t + +from gentropy.assets import data +from gentropy.common.schemas import parse_spark_schema +from 
gentropy.common.spark_helpers import ( + enforce_schema, + order_array_of_structs_by_field, +) +from gentropy.dataset.variant_index import VariantIndex + +if TYPE_CHECKING: + from pyspark.sql import Column, DataFrame + + +class VariantEffectPredictorParser: + """Collection of methods to parse VEP output in json format.""" + + # Schema description of the dbXref object: + DBXREF_SCHEMA = t.ArrayType( + t.StructType( + [ + t.StructField("id", t.StringType(), True), + t.StructField("source", t.StringType(), True), + ] + ) + ) + + # Schema description of the in silico predictor object: + IN_SILICO_PREDICTOR_SCHEMA = t.StructType( + [ + t.StructField("method", t.StringType(), True), + t.StructField("assessment", t.StringType(), True), + t.StructField("score", t.FloatType(), True), + t.StructField("assessmentFlag", t.StringType(), True), + t.StructField("targetId", t.StringType(), True), + ] + ) + + # Schema for the allele frequency column: + ALLELE_FREQUENCY_SCHEMA = t.ArrayType( + t.StructType( + [ + t.StructField("populationName", t.StringType(), True), + t.StructField("alleleFrequency", t.DoubleType(), True), + ] + ), + False, + ) + + @staticmethod + def get_schema() -> t.StructType: + """Return the schema of the VEP output. + + Returns: + t.StructType: VEP output schema. + + Examples: + >>> type(VariantEffectPredictorParser.get_schema()) + + """ + return parse_spark_schema("vep_json_output.json") + + @classmethod + def extract_variant_index_from_vep( + cls: type[VariantEffectPredictorParser], + spark: SparkSession, + vep_output_path: str | list[str], + **kwargs: bool | float | int | str | None, + ) -> VariantIndex: + """Extract variant index from VEP output. + + Args: + spark (SparkSession): Spark session. + vep_output_path (str | list[str]): Path to the VEP output. + **kwargs (bool | float | int | str | None): Additional arguments to pass to spark.read.json. + + Returns: + VariantIndex: Variant index dataset. + + Raises: + ValueError: Failed reading file. 
+ ValueError: The dataset is empty. + """ + # To speed things up and simplify the json structure, read data following an expected schema: + vep_schema = cls.get_schema() + + try: + vep_data = spark.read.json(vep_output_path, schema=vep_schema, **kwargs) + except ValueError as e: + raise ValueError(f"Failed reading file: {vep_output_path}.") from e + + if vep_data.isEmpty(): + raise ValueError(f"The dataset is empty: {vep_output_path}") + + # Convert to VariantAnnotation dataset: + return VariantIndex( + _df=VariantEffectPredictorParser.process_vep_output(vep_data), + _schema=VariantIndex.get_schema(), + ) + + @staticmethod + def _extract_ensembl_xrefs(colocated_variants: Column) -> Column: + """Extract rs identifiers and build cross reference to Ensembl's variant page. + + Args: + colocated_variants (Column): Colocated variants field from VEP output. + + Returns: + Column: List of dbXrefs for rs identifiers. + """ + return VariantEffectPredictorParser._generate_dbxrefs( + VariantEffectPredictorParser._colocated_variants_to_rsids( + colocated_variants + ), + "ensembl_variation", + ) + + @enforce_schema(DBXREF_SCHEMA) + @staticmethod + def _generate_dbxrefs(ids: Column, source: str) -> Column: + """Convert a list of variant identifiers to dbXrefs given the source label. + + Identifiers are cast to strings, then Null values are filtered out of the id list. + + Args: + ids (Column): List of variant identifiers. + source (str): Source label for the dbXrefs. + + Returns: + Column: List of dbXrefs. + + Examples: + >>> ( + ... spark.createDataFrame([('rs12',),(None,)], ['test_id']) + ... .select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col')) + ... .show(truncate=False) + ... ) + +--------------------------+ + |col | + +--------------------------+ + |[{rs12, ensemblVariation}]| + |[] | + +--------------------------+ + + >>> ( + ... spark.createDataFrame([('rs12',),(None,)], ['test_id']) + ... 
.select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col')) + ... .first().col[0].asDict() + ... ) + {'id': 'rs12', 'source': 'ensemblVariation'} + """ + ids = f.filter(ids, lambda id: id.isNotNull()) + xref_column = f.transform( + ids, + lambda id: f.struct( + id.cast(t.StringType()).alias("id"), f.lit(source).alias("source") + ), + ) + + return f.when(xref_column.isNull(), f.array()).otherwise(xref_column) + + @staticmethod + def _colocated_variants_to_rsids(colocated_variants: Column) -> Column: + """Extract rs identifiers from the colocated variants VEP field. + + Args: + colocated_variants (Column): Colocated variants field from VEP output. + + Returns: + Column: List of rs identifiers. + + Examples: + >>> data = [('s1', 'rs1'),('s1', 'rs2'),('s1', 'rs3'),('s2', None),] + >>> ( + ... spark.createDataFrame(data, ['s','v']) + ... .groupBy('s') + ... .agg(f.collect_list(f.struct(f.col('v').alias('id'))).alias('cv')) + ... .select(VariantEffectPredictorParser._colocated_variants_to_rsids(f.col('cv')).alias('rsIds'),) + ... .show(truncate=False) + ... ) + +---------------+ + |rsIds | + +---------------+ + |[rs1, rs2, rs3]| + |[null] | + +---------------+ + + """ + return f.when( + colocated_variants.isNotNull(), + f.transform( + colocated_variants, lambda variant: variant.getItem("id").alias("id") + ), + ).alias("rsIds") + + @staticmethod + def _extract_omim_xrefs(colocated_variants: Column) -> Column: + """Build xdbRefs for OMIM identifiers. + + OMIM identifiers are extracted from the colocated variants field, casted to strings and formatted as dbXrefs. + + Args: + colocated_variants (Column): Colocated variants field from VEP output. + + Returns: + Column: List of dbXrefs for OMIM identifiers. + + Examples: + >>> data = [('234234.32', 'rs1', 's1',),(None, 'rs1', 's1',),] + >>> ( + ... spark.createDataFrame(data, ['id', 'rsid', 's']) + ... .groupBy('s') + ... 
.agg(f.collect_list(f.struct(f.struct(f.array(f.col('id')).alias('OMIM')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_omim_xrefs(f.col('cv')).alias('dbXrefs')) + ... .show(truncate=False) + ... ) + +-------------------+ + |dbXrefs | + +-------------------+ + |[{234234#32, omim}]| + +-------------------+ + + """ + variants_w_omim_ref = f.filter( + colocated_variants, + lambda variant: variant.getItem("var_synonyms").getItem("OMIM").isNotNull(), + ) + + omim_var_ids = f.transform( + variants_w_omim_ref, + lambda var: f.transform( + var.getItem("var_synonyms").getItem("OMIM"), + lambda var: f.regexp_replace(var.cast(t.StringType()), r"\.", r"#"), + ), + ) + + return VariantEffectPredictorParser._generate_dbxrefs( + f.flatten(omim_var_ids), "omim" + ) + + @staticmethod + def _extract_clinvar_xrefs(colocated_variants: Column) -> Column: + """Build xdbRefs for VCV ClinVar identifiers. + + ClinVar identifiers are extracted from the colocated variants field. + VCV-identifiers are filtered out to generate cross-references. + + Args: + colocated_variants (Column): Colocated variants field from VEP output. + + Returns: + Column: List of dbXrefs for VCV ClinVar identifiers. + + Examples: + >>> data = [('VCV2323,RCV3423', 'rs1', 's1',),(None, 'rs1', 's1',),] + >>> ( + ... spark.createDataFrame(data, ['id', 'rsid', 's']) + ... .groupBy('s') + ... .agg(f.collect_list(f.struct(f.struct(f.split(f.col('id'),',').alias('ClinVar')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_clinvar_xrefs(f.col('cv')).alias('dbXrefs')) + ... .show(truncate=False) + ... 
) + +--------------------+ + |dbXrefs | + +--------------------+ + |[{VCV2323, clinvar}]| + +--------------------+ + + """ + variants_w_clinvar_ref = f.filter( + colocated_variants, + lambda variant: variant.getItem("var_synonyms") + .getItem("ClinVar") + .isNotNull(), + ) + + clin_var_ids = f.transform( + variants_w_clinvar_ref, + lambda var: f.filter( + var.getItem("var_synonyms").getItem("ClinVar"), + lambda x: x.startswith("VCV"), + ), + ) + + return VariantEffectPredictorParser._generate_dbxrefs( + f.flatten(clin_var_ids), "clinvar" + ) + + @staticmethod + def _get_most_severe_transcript( + transcript_column_name: str, score_field_name: str + ) -> Column: + """Return transcript with the highest in silico predicted score. + + This method assumes the higher the score, the more severe the consequence of the variant is. + Hence, by selecting the transcript with the highest score, we are selecting the most severe consequence. + + Args: + transcript_column_name (str): Name of the column containing the list of transcripts. + score_field_name (str): Name of the field containing the severity score. + + Returns: + Column: Most severe transcript. + + Examples: + >>> data = [("v1", 0.2, 'transcript1'),("v1", None, 'transcript2'),("v1", 0.6, 'transcript3'),("v1", 0.4, 'transcript4'),] + >>> ( + ... spark.createDataFrame(data, ['v', 'score', 'transcriptId']) + ... .groupBy('v') + ... .agg(f.collect_list(f.struct(f.col('score'),f.col('transcriptId'))).alias('transcripts')) + ... .select(VariantEffectPredictorParser._get_most_severe_transcript('transcripts', 'score').alias('most_severe_transcript')) + ... .show(truncate=False) + ... ) + +----------------------+ + |most_severe_transcript| + +----------------------+ + |{0.6, transcript3} | + +----------------------+ + + """ + assert isinstance( + transcript_column_name, str + ), "transcript_column_name must be a string and not a column." 
+ # Order transcripts by severity score: + ordered_transcripts = order_array_of_structs_by_field( + transcript_column_name, score_field_name + ) + + # Drop transcripts with no severity score and return the most severe one: + return f.filter( + ordered_transcripts, + lambda transcript: transcript.getItem(score_field_name).isNotNull(), + )[0] + + @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA) + @staticmethod + def _get_max_alpha_missense(transcripts: Column) -> Column: + """Return the most severe alpha missense prediction from all transcripts. + + This function assumes one variant can fall onto only one gene with alpha-sense prediction available on the canonical transcript. + + Args: + transcripts (Column): List of transcripts. + + Returns: + Column: Most severe alpha missense prediction. + + Examples: + >>> data = [('v1', 0.4, 'assessment 1'), ('v1', None, None), ('v1', None, None),('v2', None, None),] + >>> ( + ... spark.createDataFrame(data, ['v', 'a', 'b']) + ... .groupBy('v') + ... .agg(f.collect_list(f.struct(f.struct( + ... f.col('a').alias('am_pathogenicity'), + ... f.col('b').alias('am_class')).alias('alphamissense'), + ... f.lit('gene1').alias('gene_id'))).alias('transcripts') + ... ) + ... .select(VariantEffectPredictorParser._get_max_alpha_missense(f.col('transcripts')).alias('am')) + ... .show(truncate=False) + ... 
) + +----------------------------------------------------+ + |am | + +----------------------------------------------------+ + |{max alpha missense, assessment 1, 0.4, null, gene1}| + |{max alpha missense, null, null, null, gene1} | + +----------------------------------------------------+ + + """ + return f.transform( + # Extract transcripts with alpha missense values: + f.filter( + transcripts, + lambda transcript: transcript.getItem("alphamissense").isNotNull(), + ), + # Extract alpha missense values: + lambda transcript: f.struct( + transcript.getItem("alphamissense") + .getItem("am_pathogenicity") + .cast(t.FloatType()) + .alias("score"), + transcript.getItem("alphamissense") + .getItem("am_class") + .alias("assessment"), + f.lit("max alpha missense").alias("method"), + transcript.getItem("gene_id").alias("targetId"), + ), + )[0] + + @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA) + @staticmethod + def _vep_in_silico_prediction_extractor( + transcript_column_name: str, + method_name: str, + score_column_name: str | None = None, + assessment_column_name: str | None = None, + assessment_flag_column_name: str | None = None, + ) -> Column: + """Extract in silico prediction from VEP output. + + Args: + transcript_column_name (str): Name of the column containing the list of transcripts. + method_name (str): Name of the in silico predictor. + score_column_name (str | None): Name of the column containing the score. + assessment_column_name (str | None): Name of the column containing the assessment. + assessment_flag_column_name (str | None): Name of the column containing the assessment flag. + + Returns: + Column: In silico predictor. 
+ """ + # Get highest score: + most_severe_transcript: Column = ( + # Getting the most severe transcript: + VariantEffectPredictorParser._get_most_severe_transcript( + transcript_column_name, score_column_name + ) + if score_column_name is not None + # If we don't have score, just pick one of the transcript where assessment is available: + else f.filter( + f.col(transcript_column_name), + lambda transcript: transcript.getItem( + assessment_column_name + ).isNotNull(), + ) + ) + + return f.when( + most_severe_transcript.isNotNull(), + f.struct( + # Adding method name: + f.lit(method_name).cast(t.StringType()).alias("method"), + # Adding assessment: + f.lit(None).cast(t.StringType()).alias("assessment") + if assessment_column_name is None + else most_severe_transcript.getField(assessment_column_name).alias( + "assessment" + ), + # Adding score: + f.lit(None).cast(t.FloatType()).alias("score") + if score_column_name is None + else most_severe_transcript.getField(score_column_name) + .cast(t.FloatType()) + .alias("score"), + # Adding assessment flag: + f.lit(None).cast(t.StringType()).alias("assessmentFlag") + if assessment_flag_column_name is None + else most_severe_transcript.getField(assessment_flag_column_name) + .cast(t.FloatType()) + .alias("assessmentFlag"), + # Adding target id if present: + most_severe_transcript.getItem("gene_id").alias("targetId"), + ), + ) + + @staticmethod + def _parser_amino_acid_change(amino_acids: Column, protein_end: Column) -> Column: + """Convert VEP amino acid change information to one letter code aa substitution code. + + The logic assumes that the amino acid change is given in the format "from/to" and the protein end is given also. + If any of the information is missing, the amino acid change will be set to None. + + Args: + amino_acids (Column): Amino acid change information. + protein_end (Column): Protein end information. + + Returns: + Column: Amino acid change in one letter code. 
+ + Examples: + >>> data = [('A/B', 1),('A/B', None),(None, 1),] + >>> ( + ... spark.createDataFrame(data, ['amino_acids', 'protein_end']) + ... .select(VariantEffectPredictorParser._parser_amino_acid_change(f.col('amino_acids'), f.col('protein_end')).alias('amino_acid_change')) + ... .show() + ... ) + +-----------------+ + |amino_acid_change| + +-----------------+ + | A1B| + | null| + | null| + +-----------------+ + + """ + return f.when( + amino_acids.isNotNull() & protein_end.isNotNull(), + f.concat( + f.split(amino_acids, r"\/")[0], + protein_end, + f.split(amino_acids, r"\/")[1], + ), + ).otherwise(f.lit(None)) + + @staticmethod + def _collect_uniprot_accessions(trembl: Column, swissprot: Column) -> Column: + """Flatten arrays containing Uniprot accessions. + + Args: + trembl (Column): List of TrEMBL protein accessions. + swissprot (Column): List of SwissProt protein accessions. + + Returns: + Column: List of unique Uniprot accessions extracted from swissprot and trembl arrays, splitted from version numbers. + + Examples: + >>> data = [('v1', ["sp_1"], ["tr_1"]), ('v1', None, None), ('v1', None, ["tr_2"]),] + >>> ( + ... spark.createDataFrame(data, ['v', 'sp', 'tr']) + ... .select(VariantEffectPredictorParser._collect_uniprot_accessions(f.col('sp'), f.col('tr')).alias('proteinIds')) + ... .show() + ... ) + +------------+ + | proteinIds| + +------------+ + |[sp_1, tr_1]| + | []| + | [tr_2]| + +------------+ + + """ + # Dropping Null values and flattening the arrays: + return f.filter( + f.array_distinct( + f.transform( + f.flatten( + f.filter( + f.array(trembl, swissprot), + lambda x: x.isNotNull(), + ) + ), + lambda x: f.split(x, r"\.")[0], + ) + ), + lambda x: x.isNotNull(), + ) + + @staticmethod + def _consequence_to_sequence_ontology( + col: Column, so_dict: Dict[str, str] + ) -> Column: + """Convert VEP consequence terms to sequence ontology identifiers. + + Missing consequence label will be converted to None, unmapped consequences will be mapped as None. 
+ + Args: + col (Column): Column containing VEP consequence terms. + so_dict (Dict[str, str]): Dictionary mapping VEP consequence terms to sequence ontology identifiers. + + Returns: + Column: Column containing sequence ontology identifiers. + + Examples: + >>> data = [('consequence_1',),('unmapped_consequence',),(None,)] + >>> m = {'consequence_1': 'SO:000000'} + >>> ( + ... spark.createDataFrame(data, ['label']) + ... .select('label',VariantEffectPredictorParser._consequence_to_sequence_ontology(f.col('label'),m).alias('id')) + ... .show() + ... ) + +--------------------+---------+ + | label| id| + +--------------------+---------+ + | consequence_1|SO:000000| + |unmapped_consequence| null| + | null| null| + +--------------------+---------+ + + """ + map_expr = f.create_map(*[f.lit(x) for x in chain(*so_dict.items())]) + + return map_expr[col].alias("ancestry") + + @staticmethod + def _parse_variant_location_id(vep_input_field: Column) -> List[Column]: + r"""Parse variant identifier, chromosome, position, reference allele and alternate allele from VEP input field. + + Args: + vep_input_field (Column): Column containing variant vcf string used as VEP input. + + Returns: + List[Column]: List of columns containing chromosome, position, reference allele and alternate allele. + """ + variant_fields = f.split(vep_input_field, r"\t") + return [ + f.concat_ws( + "_", + f.array( + variant_fields[0], + variant_fields[1], + variant_fields[3], + variant_fields[4], + ), + ).alias("variantId"), + variant_fields[0].cast(t.StringType()).alias("chromosome"), + variant_fields[1].cast(t.IntegerType()).alias("position"), + variant_fields[3].cast(t.StringType()).alias("referenceAllele"), + variant_fields[4].cast(t.StringType()).alias("alternateAllele"), + ] + + @classmethod + def process_vep_output(cls, vep_output: DataFrame) -> DataFrame: + """Process and format a VEP output in JSON format. + + Args: + vep_output (DataFrame): raw VEP output, read as spark DataFrame. 
+ + Returns: + DataFrame: processed data in the right shape. + """ + # Reading consequence to sequence ontology map: + sequence_ontology_map = json.loads( + pkg_resources.read_text(data, "so_mappings.json", encoding="utf-8") + ) + # Processing VEP output: + return ( + vep_output + # Dropping non-canonical transcript consequences: # TODO: parametrize this. + .withColumn( + "transcript_consequences", + f.filter( + f.col("transcript_consequences"), + lambda consequence: consequence.getItem("canonical") == 1, + ), + ) + .select( + # Parse id and chr:pos:alt:ref: + *cls._parse_variant_location_id(f.col("input")), + # Extracting corss_references from colocated variants: + cls._extract_ensembl_xrefs(f.col("colocated_variants")).alias( + "ensembl_xrefs" + ), + cls._extract_omim_xrefs(f.col("colocated_variants")).alias( + "omim_xrefs" + ), + cls._extract_clinvar_xrefs(f.col("colocated_variants")).alias( + "clinvar_xrefs" + ), + # Extracting in silico predictors + f.when( + # The following in-silico predictors are only available for variants with transcript consequences: + f.col("transcript_consequences").isNotNull(), + f.filter( + f.array( + # Extract CADD scores: + cls._vep_in_silico_prediction_extractor( + transcript_column_name="transcript_consequences", + method_name="phred scaled CADD", + score_column_name="cadd_phred", + ), + # Extract polyphen scores: + cls._vep_in_silico_prediction_extractor( + transcript_column_name="transcript_consequences", + method_name="polyphen", + score_column_name="polyphen_score", + assessment_column_name="polyphen_prediction", + ), + # Extract sift scores: + cls._vep_in_silico_prediction_extractor( + transcript_column_name="transcript_consequences", + method_name="sift", + score_column_name="sift_score", + assessment_column_name="sift_prediction", + ), + # Extract loftee scores: + cls._vep_in_silico_prediction_extractor( + method_name="loftee", + transcript_column_name="transcript_consequences", + score_column_name="lof", + 
assessment_column_name="lof", + assessment_flag_column_name="lof_filter", + ), + # Extract max alpha missense: + cls._get_max_alpha_missense( + f.col("transcript_consequences") + ), + ), + lambda predictor: predictor.isNotNull(), + ), + ) + .otherwise( + # Extract CADD scores from intergenic object: + f.array( + cls._vep_in_silico_prediction_extractor( + transcript_column_name="intergenic_consequences", + method_name="phred scaled CADD", + score_column_name="cadd_phred", + ), + ) + ) + .alias("inSilicoPredictors"), + # Convert consequence to SO: + cls._consequence_to_sequence_ontology( + f.col("most_severe_consequence"), sequence_ontology_map + ).alias("mostSevereConsequenceId"), + # Collect transcript consequence: + f.when( + f.col("transcript_consequences").isNotNull(), + f.transform( + f.col("transcript_consequences"), + lambda transcript: f.struct( + # Convert consequence terms to SO identifier: + f.transform( + transcript.consequence_terms, + lambda y: cls._consequence_to_sequence_ontology( + y, sequence_ontology_map + ), + ).alias("variantFunctionalConsequenceIds"), + # Format amino acid change: + cls._parser_amino_acid_change( + transcript.amino_acids, transcript.protein_end + ).alias("aminoAcidChange"), + # Extract and clean uniprot identifiers: + cls._collect_uniprot_accessions( + transcript.swissprot, + transcript.trembl, + ).alias("uniprotAccessions"), + # Add canonical flag: + f.when(transcript.canonical == 1, f.lit(True)) + .otherwise(f.lit(False)) + .alias("isEnsemblCanonical"), + # Extract other fields as is: + transcript.codons.alias("codons"), + transcript.distance.alias("distance"), + transcript.gene_id.alias("targetId"), + transcript.impact.alias("impact"), + transcript.transcript_id.alias("transcriptId"), + transcript.lof.cast(t.StringType()).alias( + "lofteePrediction" + ), + transcript.lof.cast(t.FloatType()).alias("siftPrediction"), + transcript.lof.cast(t.FloatType()).alias( + "polyphenPrediction" + ), + ), + ), + 
).alias("transcriptConsequences"), + # Extracting rsids: + cls._colocated_variants_to_rsids(f.col("colocated_variants")).alias( + "rsIds" + ), + # Adding empty array for allele frequencies - now this piece of data is not coming from the VEP data: + f.array().cast(cls.ALLELE_FREQUENCY_SCHEMA).alias("alleleFrequencies"), + ) + # Adding protvar xref for missense variants: # TODO: making an extendable list of consequences + .withColumn( + "protvar_xrefs", + f.when( + f.size( + f.filter( + f.col("transcriptConsequences"), + lambda x: f.array_contains( + x.variantFunctionalConsequenceIds, "SO_0001583" + ), + ) + ) + > 0, + cls._generate_dbxrefs(f.array(f.col("variantId")), "protvar"), + ), + ) + .withColumn( + "dbXrefs", + f.flatten( + f.filter( + f.array( + f.col("ensembl_xrefs"), + f.col("omim_xrefs"), + f.col("clinvar_xrefs"), + f.col("protvar_xrefs"), + ), + lambda x: x.isNotNull(), + ) + ), + ) + # Dropping intermediate xref columns: + .drop(*["ensembl_xrefs", "omim_xrefs", "clinvar_xrefs", "protvar_xrefs"]) + ) diff --git a/src/gentropy/datasource/gnomad/variants.py b/src/gentropy/datasource/gnomad/variants.py index fdc67a7cb..98e7013f5 100644 --- a/src/gentropy/datasource/gnomad/variants.py +++ b/src/gentropy/datasource/gnomad/variants.py @@ -7,8 +7,8 @@ import hail as hl from gentropy.common.types import VariantPopulation -from gentropy.config import VariantAnnotationConfig -from gentropy.dataset.variant_annotation import VariantAnnotation +from gentropy.config import GnomadVariantConfig +from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: pass @@ -19,26 +19,23 @@ class GnomADVariants: def __init__( self, - gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path, - chain_38_37: str = VariantAnnotationConfig().chain_38_37, + gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path, gnomad_variant_populations: list[ VariantPopulation | str - ] = VariantAnnotationConfig().gnomad_variant_populations, + ] = 
GnomadVariantConfig().gnomad_variant_populations, ): """Initialize. Args: gnomad_genomes_path (str): Path to gnomAD genomes hail table. - chain_38_37 (str): Path to GRCh38 to GRCh37 chain file. gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include. - All defaults are stored in VariantAnnotationConfig. + All defaults are stored in GnomadVariantConfig. """ self.gnomad_genomes_path = gnomad_genomes_path - self.chain_38_37 = chain_38_37 self.gnomad_variant_populations = gnomad_variant_populations - def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: + def as_variant_index(self: GnomADVariants) -> VariantIndex: """Generate variant annotation dataset from gnomAD. Some relevant modifications to the original dataset are: @@ -48,7 +45,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: 3. Field names are converted to camel case to follow the convention. Returns: - VariantAnnotation: Variant annotation dataset + VariantIndex: GnomAD variants dataset. 
""" # Load variants dataset ht = hl.read_table( @@ -56,19 +53,14 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: _load_refs=False, ) - # Liftover - grch37 = hl.get_reference("GRCh37") - grch38 = hl.get_reference("GRCh38") - grch38.add_liftover(self.chain_38_37, grch37) - # Drop non biallelic variants ht = ht.filter(ht.alleles.length() == 2) - # Liftover - ht = ht.annotate(locus_GRCh37=hl.liftover(ht.locus, "GRCh37")) + # Select relevant fields and nested records to create class - return VariantAnnotation( + return VariantIndex( _df=( ht.select( + # Extract mandatory fields: variantId=hl.str("_").join( [ ht.locus.contig.replace("chr", ""), @@ -79,12 +71,9 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: ), chromosome=ht.locus.contig.replace("chr", ""), position=ht.locus.position, - chromosomeB37=ht.locus_GRCh37.contig.replace("chr", ""), - positionB37=ht.locus_GRCh37.position, referenceAllele=ht.alleles[0], alternateAllele=ht.alleles[1], - rsIds=ht.rsid, - alleleType=ht.allele_info.allele_type, + # Extract allele frequencies from populations of interest: alleleFrequencies=hl.set( [f"{pop}_adj" for pop in self.gnomad_variant_populations] ).map( @@ -93,33 +82,46 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF, ) ), - vep=hl.struct( - mostSevereConsequence=ht.vep.most_severe_consequence, - transcriptConsequences=hl.map( - lambda x: hl.struct( - aminoAcids=x.amino_acids, - consequenceTerms=x.consequence_terms, - geneId=x.gene_id, - lof=x.lof, + # Extract most severe consequence: + mostSevereConsequence=ht.vep.most_severe_consequence, + # Extract in silico predictors: + inSilicoPredictors=hl.array( + [ + hl.struct( + method=hl.str("spliceai"), + assessment=hl.missing(hl.tstr), + score=hl.expr.functions.float32( + ht.in_silico_predictors.spliceai_ds_max + ), + assessmentFlag=hl.missing(hl.tstr), + targetId=hl.missing(hl.tstr), ), - # Only 
keeping canonical transcripts - ht.vep.transcript_consequences.filter( - lambda x: (x.canonical == 1) - & (x.gene_symbol_source == "HGNC") + hl.struct( + method=hl.str("pangolin"), + assessment=hl.missing(hl.tstr), + score=hl.expr.functions.float32( + ht.in_silico_predictors.pangolin_largest_ds + ), + assessmentFlag=hl.missing(hl.tstr), + targetId=hl.missing(hl.tstr), ), - ), + ] ), - inSilicoPredictors=hl.struct( - cadd=hl.struct( - phred=ht.in_silico_predictors.cadd.phred, - raw=ht.in_silico_predictors.cadd.raw_score, - ), - revelMax=ht.in_silico_predictors.revel_max, - spliceaiDsMax=ht.in_silico_predictors.spliceai_ds_max, - pangolinLargestDs=ht.in_silico_predictors.pangolin_largest_ds, - phylop=ht.in_silico_predictors.phylop, - siftMax=ht.in_silico_predictors.sift_max, - polyphenMax=ht.in_silico_predictors.polyphen_max, + # Extract cross references to GnomAD: + dbXrefs=hl.array( + [ + hl.struct( + id=hl.str("-").join( + [ + ht.locus.contig.replace("chr", ""), + hl.str(ht.locus.position), + ht.alleles[0], + ht.alleles[1], + ] + ), + source=hl.str("gnomad"), + ) + ] ), ) .key_by("chromosome", "position") @@ -127,5 +129,5 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: .select_globals() .to_spark(flatten=False) ), - _schema=VariantAnnotation.get_schema(), + _schema=VariantIndex.get_schema(), ) diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py index d6763cb35..b4e9a2d45 100644 --- a/src/gentropy/datasource/gwas_catalog/associations.py +++ b/src/gentropy/datasource/gwas_catalog/associations.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from pyspark.sql import Column, DataFrame - from gentropy.dataset.variant_annotation import VariantAnnotation + from gentropy.dataset.variant_index import VariantIndex @dataclass @@ -197,14 +197,14 @@ def _collect_rsids( return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele)) @staticmethod - def _map_to_variant_annotation_variants( - 
gwas_associations: DataFrame, variant_annotation: VariantAnnotation + def _map_variants_to_variant_index( + gwas_associations: DataFrame, variant_index: VariantIndex ) -> DataFrame: """Add variant metadata in associations. Args: - gwas_associations (DataFrame): raw GWAS Catalog associations - variant_annotation (VariantAnnotation): variant annotation dataset + gwas_associations (DataFrame): raw GWAS Catalog associations. + variant_index (VariantIndex): GnomAD variants dataset with allele frequencies. Returns: DataFrame: GWAS Catalog associations data including `variantId`, `referenceAllele`, @@ -228,7 +228,7 @@ def _map_to_variant_annotation_variants( ) # Subset of variant annotation required for GWAS Catalog annotations: - va_subset = variant_annotation.df.select( + va_subset = variant_index.df.select( "variantId", "chromosome", # Calculate the position in Ensembl coordinates for indels: @@ -241,7 +241,7 @@ def _map_to_variant_annotation_variants( "referenceAllele", "alternateAllele", "alleleFrequencies", - variant_annotation.max_maf().alias("maxMaf"), + variant_index.max_maf().alias("maxMaf"), ).join( f.broadcast( gwas_associations_subset.select( @@ -1035,7 +1035,7 @@ def _qc_palindromic_alleles( def from_source( cls: type[GWASCatalogCuratedAssociationsParser], gwas_associations: DataFrame, - variant_annotation: VariantAnnotation, + variant_index: VariantIndex, pvalue_threshold: float = WindowBasedClumpingStepConfig.gwas_significance, ) -> StudyLocusGWASCatalog: """Read GWASCatalog associations. It reads the GWASCatalog associations dataset and applies some pre-defined filters on the data: Args: - gwas_associations (DataFrame): GWAS Catalog raw associations dataset - variant_annotation (VariantAnnotation): Variant annotation dataset - pvalue_threshold (float): P-value threshold for flagging associations + gwas_associations (DataFrame): GWAS Catalog raw associations dataset. + variant_index (VariantIndex): Variant index dataset with available allele frequencies. 
+ pvalue_threshold (float): P-value threshold for flagging associations. Returns: StudyLocusGWASCatalog: GWASCatalogAssociations dataset @@ -1060,8 +1060,8 @@ def from_source( .transform( # Map/harmonise variants to variant annotation dataset: # This function adds columns: variantId, referenceAllele, alternateAllele, chromosome, position - lambda df: GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants( - df, variant_annotation + lambda df: GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index( + df, variant_index ) ) .withColumn( diff --git a/src/gentropy/datasource/ukbiobank/__init__.py b/src/gentropy/datasource/ukbiobank/__init__.py index 544779b18..910603703 100644 --- a/src/gentropy/datasource/ukbiobank/__init__.py +++ b/src/gentropy/datasource/ukbiobank/__init__.py @@ -1,3 +1,3 @@ -"""GWAS Catalog Data Source.""" +"""UK biobank.""" from __future__ import annotations diff --git a/src/gentropy/ld_index.py b/src/gentropy/gnomad_ingestion.py similarity index 54% rename from src/gentropy/ld_index.py rename to src/gentropy/gnomad_ingestion.py index 0cc00cf14..07b7fd58b 100644 --- a/src/gentropy/ld_index.py +++ b/src/gentropy/gnomad_ingestion.py @@ -1,14 +1,15 @@ -"""Step to dump a filtered version of a LD matrix (block matrix) as Parquet files.""" +"""Step to dump a filtered version of a LD matrix (block matrix) and GnomAD variants.""" from __future__ import annotations import hail as hl from gentropy.common.session import Session -from gentropy.common.types import LD_Population +from gentropy.common.types import LD_Population, VariantPopulation from gentropy.common.version_engine import VersionEngine -from gentropy.config import LDIndexConfig +from gentropy.config import GnomadVariantConfig, LDIndexConfig from gentropy.datasource.gnomad.ld import GnomADLDMatrix +from gentropy.datasource.gnomad.variants import GnomADVariants class LDIndexStep: @@ -69,3 +70,56 @@ def __init__( .parquet(ld_index_out) ) 
session.logger.info(ld_index_out) + + +class GnomadVariantIndexStep: + """A step to generate variant index dataset from gnomad data. + + Variant annotation step produces a dataset of the type `VariantIndex` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table. + This dataset is used to validate variants and as a source of annotation. + """ + + def __init__( + self, + session: Session, + gnomad_variant_output: str, + gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path, + gnomad_variant_populations: list[ + VariantPopulation | str + ] = GnomadVariantConfig().gnomad_variant_populations, + use_version_from_input: bool = GnomadVariantConfig().use_version_from_input, + ) -> None: + """Run Variant Annotation step. + + Args: + session (Session): Session object. + gnomad_variant_output (str): Path to resulting dataset. + gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`. + gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include. + use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output gnomad_variant_output. Defaults to False. + + In case use_version_from_input is set to True, + data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path. + All defaults are stored in the GnomadVariantConfig. + """ + # amend data source version to output path + if use_version_from_input: + gnomad_variant_output = VersionEngine("gnomad").amend_version( + gnomad_genomes_path, gnomad_variant_output + ) + + # Initialise hail session. + hl.init(sc=session.spark.sparkContext, log="/dev/null") + + # Parse variant info from source. + gnomad_variants = GnomADVariants( + gnomad_genomes_path=gnomad_genomes_path, + gnomad_variant_populations=gnomad_variant_populations, + ).as_variant_index() + + # Write data partitioned by chromosome and position. 
+ ( + gnomad_variants.df.write.mode(session.write_mode).parquet( + gnomad_variant_output + ) + ) diff --git a/src/gentropy/gwas_catalog_ingestion.py b/src/gentropy/gwas_catalog_ingestion.py index 6930a2df9..725f1ca4d 100644 --- a/src/gentropy/gwas_catalog_ingestion.py +++ b/src/gentropy/gwas_catalog_ingestion.py @@ -1,8 +1,9 @@ """Step to process GWAS Catalog associations and study table.""" + from __future__ import annotations from gentropy.common.session import Session -from gentropy.dataset.variant_annotation import VariantAnnotation +from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.gwas_catalog.associations import ( GWASCatalogCuratedAssociationsParser, ) @@ -26,7 +27,7 @@ def __init__( catalog_ancestry_files: list[str], catalog_sumstats_lut: str, catalog_associations_file: str, - variant_annotation_path: str, + gnomad_variant_path: str, catalog_studies_out: str, catalog_associations_out: str, gwas_catalog_study_curation_file: str | None = None, @@ -40,14 +41,14 @@ def __init__( catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog. catalog_sumstats_lut (str): GWAS Catalog summary statistics lookup table. catalog_associations_file (str): Raw GWAS catalog associations file. - variant_annotation_path (str): Input variant annotation path. + gnomad_variant_path (str): Path to GnomAD variants. catalog_studies_out (str): Output GWAS catalog studies path. catalog_associations_out (str): Output GWAS catalog associations path. gwas_catalog_study_curation_file (str | None): file of the curation table. Optional. 
inclusion_list_path (str | None): optional inclusion list (parquet) """ # Extract - va = VariantAnnotation.from_parquet(session, variant_annotation_path) + gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path) catalog_studies = session.spark.read.csv( list(catalog_study_files), sep="\t", header=True ) @@ -69,7 +70,9 @@ def __init__( StudyIndexGWASCatalogParser.from_source( catalog_studies, ancestry_lut, sumstats_lut ).annotate_from_study_curation(gwas_catalog_study_curation), - GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va), + GWASCatalogCuratedAssociationsParser.from_source( + catalog_associations, gnomad_variants + ), ) # if inclusion list is provided apply filter: diff --git a/src/gentropy/gwas_catalog_study_inclusion.py b/src/gentropy/gwas_catalog_study_inclusion.py index 872177601..f07f851a7 100644 --- a/src/gentropy/gwas_catalog_study_inclusion.py +++ b/src/gentropy/gwas_catalog_study_inclusion.py @@ -1,4 +1,5 @@ """Step to generate an GWAS Catalog study identifier inclusion and exclusion list.""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -6,7 +7,7 @@ from pyspark.sql import functions as f from gentropy.common.session import Session -from gentropy.dataset.variant_annotation import VariantAnnotation +from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.gwas_catalog.associations import ( GWASCatalogCuratedAssociationsParser, ) @@ -84,7 +85,7 @@ def process_harmonised_list(studies: list[str], session: Session) -> DataFrame: @staticmethod def get_gwas_catalog_study_index( session: Session, - variant_annotation_path: str, + gnomad_variant_path: str, catalog_study_files: list[str], catalog_ancestry_files: list[str], harmonised_study_file: str, @@ -95,7 +96,7 @@ def get_gwas_catalog_study_index( Args: session (Session): Session object. - variant_annotation_path (str): Input variant annotation path. + gnomad_variant_path (str): Path to GnomAD variant list. 
catalog_study_files (list[str]): List of raw GWAS catalog studies file. catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog. harmonised_study_file (str): GWAS Catalog summary statistics lookup table. @@ -106,7 +107,7 @@ def get_gwas_catalog_study_index( StudyIndexGWASCatalog: Completely processed and fully annotated study index. """ # Extract - va = VariantAnnotation.from_parquet(session, variant_annotation_path) + gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path) catalog_studies = session.spark.read.csv( list(catalog_study_files), sep="\t", header=True ) @@ -130,7 +131,9 @@ def get_gwas_catalog_study_index( ancestry_lut, sumstats_lut, ).annotate_from_study_curation(gwas_catalog_study_curation), - GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va), + GWASCatalogCuratedAssociationsParser.from_source( + catalog_associations, gnomad_variants + ), ) return study_index @@ -142,7 +145,7 @@ def __init__( catalog_ancestry_files: list[str], catalog_associations_file: str, gwas_catalog_study_curation_file: str, - variant_annotation_path: str, + gnomad_variant_path: str, harmonised_study_file: str, criteria: str, inclusion_list_path: str, @@ -151,12 +154,12 @@ def __init__( """Run step. Args: - session (Session): Session object. + session (Session): Session object. catalog_study_files (list[str]): List of raw GWAS catalog studies file. catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog. catalog_associations_file (str): Raw GWAS catalog associations file. gwas_catalog_study_curation_file (str): file of the curation table. Optional. - variant_annotation_path (str): Input variant annotation path. + gnomad_variant_path (str): Path to GnomAD variant list. harmonised_study_file (str): GWAS Catalog summary statistics lookup table. criteria (str): name of the filter set to be applied. inclusion_list_path (str): Output path for the inclusion list. 
@@ -165,7 +168,7 @@ def __init__( # Create study index: study_index = self.get_gwas_catalog_study_index( session, - variant_annotation_path, + gnomad_variant_path, catalog_study_files, catalog_ancestry_files, harmonised_study_file, diff --git a/src/gentropy/variant_annotation.py b/src/gentropy/variant_annotation.py deleted file mode 100644 index 355a4dfea..000000000 --- a/src/gentropy/variant_annotation.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Step to generate variant annotation dataset.""" - -from __future__ import annotations - -import hail as hl - -from gentropy.common.session import Session -from gentropy.common.types import VariantPopulation -from gentropy.common.version_engine import VersionEngine -from gentropy.config import VariantAnnotationConfig -from gentropy.datasource.gnomad.variants import GnomADVariants - - -class VariantAnnotationStep: - """Variant annotation step. - - Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table. - This dataset is used to validate variants and as a source of annotation. - """ - - def __init__( - self, - session: Session, - variant_annotation_path: str, - gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path, - gnomad_variant_populations: list[ - VariantPopulation | str - ] = VariantAnnotationConfig().gnomad_variant_populations, - chain_38_37: str = VariantAnnotationConfig().chain_38_37, - use_version_from_input: bool = VariantAnnotationConfig().use_version_from_input, - ) -> None: - """Run Variant Annotation step. - - Args: - session (Session): Session object. - variant_annotation_path (str): Variant annotation dataset path. - gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`. - gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include. 
- chain_38_37 (str): Path to GRCh38 to GRCh37 chain file for lifover. - use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False. - - In case use_version_from_input is set to True, - data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path. - All defaults are stored in the VariantAnnotationConfig. - """ - # amend data source version to output path - if use_version_from_input: - variant_annotation_path = VersionEngine("gnomad").amend_version( - gnomad_genomes_path, variant_annotation_path - ) - - # Initialise hail session. - hl.init(sc=session.spark.sparkContext, log="/dev/null") - # Run variant annotation. - variant_annotation = GnomADVariants( - gnomad_genomes_path=gnomad_genomes_path, - gnomad_variant_populations=gnomad_variant_populations, - chain_38_37=chain_38_37, - ).as_variant_annotation() - - # Write data partitioned by chromosome and position. - ( - variant_annotation.df.write.mode(session.write_mode).parquet( - variant_annotation_path - ) - ) diff --git a/src/gentropy/variant_index.py b/src/gentropy/variant_index.py index bdf838ce2..ba86c4602 100644 --- a/src/gentropy/variant_index.py +++ b/src/gentropy/variant_index.py @@ -1,44 +1,53 @@ -"""Step to generate variant index dataset.""" +"""Step to generate variant index dataset based on VEP output.""" + from __future__ import annotations from gentropy.common.session import Session -from gentropy.dataset.study_locus import StudyLocus -from gentropy.dataset.variant_annotation import VariantAnnotation from gentropy.dataset.variant_index import VariantIndex +from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser class VariantIndexStep: - """Run variant index step to only variants in study-locus sets. + """Generate variant index based on a VEP output in json format. 
- Using a `VariantAnnotation` dataset as a reference, this step creates and writes a dataset of the type `VariantIndex` that includes only variants that have disease-association data with a reduced set of annotations. + The variant index is a dataset that contains variant annotations extracted from VEP output. It is expected that all variants in the VEP output are present in the variant index. + There's an option to provide extra variant annotations to be added to the variant index eg. allele frequencies from GnomAD. """ def __init__( self: VariantIndexStep, session: Session, - variant_annotation_path: str, - credible_set_path: str, + vep_output_json_path: str, variant_index_path: str, + gnomad_variant_annotations_path: str | None = None, ) -> None: """Run VariantIndex step. Args: session (Session): Session object. - variant_annotation_path (str): Variant annotation dataset path. - credible_set_path (str): Credible set dataset path. - variant_index_path (str): Variant index dataset path. + vep_output_json_path (str): Variant effect predictor output path (in json format). + variant_index_path (str): Variant index dataset path to save resulting data. + gnomad_variant_annotations_path (str | None): Path to extra variant annotation dataset. 
""" - # Extract - va = VariantAnnotation.from_parquet(session, variant_annotation_path) - credible_set = StudyLocus.from_parquet( - session, credible_set_path, recursiveFileLookup=True + # Extract variant annotations from VEP output: + variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep( + session.spark, vep_output_json_path ) - # Transform - vi = VariantIndex.from_variant_annotation(va, credible_set) + # Process variant annotations if provided: + if gnomad_variant_annotations_path: + # Read variant annotations from parquet: + annotations = VariantIndex.from_parquet( + session=session, + path=gnomad_variant_annotations_path, + recursiveFileLookup=True, + ) + + # Update file with extra annotations: + variant_index = variant_index.add_annotation(annotations) ( - vi.df.write.partitionBy("chromosome") + variant_index.df.write.partitionBy("chromosome") .mode(session.write_mode) .parquet(variant_index_path) ) diff --git a/src/gentropy/v2g.py b/src/gentropy/variant_to_gene.py similarity index 81% rename from src/gentropy/v2g.py rename to src/gentropy/variant_to_gene.py index e98348a01..cf21053d7 100644 --- a/src/gentropy/v2g.py +++ b/src/gentropy/variant_to_gene.py @@ -1,16 +1,16 @@ """Step to generate variant annotation dataset.""" + from __future__ import annotations from functools import reduce -import pyspark.sql.functions as f +from pyspark.sql import functions as f from gentropy.common.Liftover import LiftOverSpark from gentropy.common.session import Session from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals from gentropy.dataset.v2g import V2G -from gentropy.dataset.variant_annotation import VariantAnnotation from gentropy.dataset.variant_index import VariantIndex @@ -26,7 +26,6 @@ class V2GStep: Attributes: session (Session): Session object. variant_index_path (str): Input variant index path. - variant_annotation_path (str): Input variant annotation path. 
gene_index_path (str): Input gene index path. vep_consequences_path (str): Input VEP consequences path. liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file. @@ -41,7 +40,6 @@ def __init__( self, session: Session, variant_index_path: str, - variant_annotation_path: str, gene_index_path: str, vep_consequences_path: str, liftover_chain_file_path: str, @@ -56,7 +54,6 @@ def __init__( Args: session (Session): Session object. variant_index_path (str): Input variant index path. - variant_annotation_path (str): Input variant annotation path. gene_index_path (str): Input gene index path. vep_consequences_path (str): Input VEP consequences path. liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file. @@ -69,16 +66,10 @@ def __init__( # Read gene_index = GeneIndex.from_parquet(session, gene_index_path) vi = VariantIndex.from_parquet(session, variant_index_path).persist() - va = VariantAnnotation.from_parquet(session, variant_annotation_path) + # Reading VEP consequence to score table and cast the score to the right type: vep_consequences = session.spark.read.csv( vep_consequences_path, sep="\t", header=True - ).select( - f.element_at(f.split("Accession", r"/"), -1).alias( - "variantFunctionalConsequenceId" - ), - f.col("Term").alias("label"), - f.col("v2g_score").cast("double").alias("score"), - ) + ).withColumn("score", f.col("score").cast("double")) # Transform lift = LiftOverSpark( @@ -90,10 +81,7 @@ def __init__( # Filter gene index by approved biotypes to define V2G gene universe list(approved_biotypes) ) - va_slimmed = va.filter_by_variant_df( - # Variant annotation reduced to the variant index to define V2G variant universe - vi.df - ).persist() + intervals = Intervals( _df=reduce( lambda x, y: x.unionByName(y, allowMissingColumns=True), @@ -108,9 +96,11 @@ def __init__( _schema=Intervals.get_schema(), ) v2g_datasets = [ - va_slimmed.get_distance_to_tss(gene_index_filtered, max_distance), - 
va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered), - va_slimmed.get_plof_v2g(gene_index_filtered), + vi.get_distance_to_tss(gene_index_filtered, max_distance), + vi.get_most_severe_transcript_consequence( + vep_consequences, gene_index_filtered + ), + vi.get_plof_v2g(gene_index_filtered), intervals.v2g(vi), ] v2g = V2G( diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index 62b873355..c35188466 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -23,7 +23,6 @@ from gentropy.dataset.study_locus_overlap import StudyLocusOverlap from gentropy.dataset.summary_statistics import SummaryStatistics from gentropy.dataset.v2g import V2G -from gentropy.dataset.variant_annotation import VariantAnnotation from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex @@ -276,45 +275,6 @@ def mock_v2g(spark: SparkSession) -> V2G: return V2G(_df=data_spec.build(), _schema=v2g_schema) -@pytest.fixture() -def mock_variant_annotation(spark: SparkSession) -> VariantAnnotation: - """Mock variant annotation.""" - va_schema = VariantAnnotation.get_schema() - - data_spec = ( - dg.DataGenerator( - spark, - rows=400, - partitions=4, - randomSeedMethod="hash_fieldname", - ) - .withSchema(va_schema) - .withColumnSpec("alleleType", percentNulls=0.1) - .withColumnSpec("chromosomeB37", percentNulls=0.1) - .withColumnSpec("positionB37", percentNulls=0.1) - # Nested column handling workaround - # https://github.com/databrickslabs/dbldatagen/issues/135 - # It's a workaround for nested column handling in dbldatagen. 
- .withColumnSpec( - "alleleFrequencies", - expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))', - percentNulls=0.1, - ) - .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1) - .withColumnSpec( - "vep", - expr='named_struct("mostSevereConsequence", cast(rand() as string), "transcriptConsequences", array(named_struct("aminoAcids", cast(rand() as string), "consequenceTerms", array(cast(rand() as string)), "geneId", cast(rand() as string), "lof", cast(rand() as string))))', - percentNulls=0.1, - ) - .withColumnSpec( - "inSilicoPredictors", - expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))', - percentNulls=0.1, - ) - ) - return VariantAnnotation(_df=data_spec.build(), _schema=va_schema) - - @pytest.fixture() def mock_variant_index(spark: SparkSession) -> VariantIndex: """Mock variant index.""" @@ -328,23 +288,65 @@ def mock_variant_index(spark: SparkSession) -> VariantIndex: randomSeedMethod="hash_fieldname", ) .withSchema(vi_schema) - .withColumnSpec("chromosomeB37", percentNulls=0.1) - .withColumnSpec("positionB37", percentNulls=0.1) - .withColumnSpec("mostSevereConsequence", percentNulls=0.1) + .withColumnSpec("mostSevereConsequenceId", percentNulls=0.1) # Nested column handling workaround # https://github.com/databrickslabs/dbldatagen/issues/135 # It's a workaround for nested column handling in dbldatagen. 
+ .withColumnSpec( + "inSilicoPredictors", + expr=""" + array( + named_struct( + "method", cast(rand() as string), + "assessment", cast(rand() as string), + "score", rand(), + "assessmentFlag", cast(rand() as string), + "targetId", cast(rand() as string) + ) + ) + """, + percentNulls=0.1, + ) .withColumnSpec( "alleleFrequencies", expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))', percentNulls=0.1, ) + .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1) .withColumnSpec( - "inSilicoPredictors", - expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))', + "transcriptConsequences", + expr=""" + array( + named_struct( + "variantFunctionalConsequenceIds", array(cast(rand() as string)), + "aminoAcidChange", cast(rand() as string), + "uniprotAccessions", array(cast(rand() as string)), + "isEnsemblCanonical", cast(rand() as boolean), + "codons", cast(rand() as string), + "distance", cast(rand() as long), + "targetId", cast(rand() as string), + "impact", cast(rand() as string), + "lofteePrediction", cast(rand() as string), + "siftPrediction", rand(), + "polyphenPrediction", rand(), + "transcriptId", cast(rand() as string) + ) + ) + """, + percentNulls=0.1, + ) + .withColumnSpec( + "dbXrefs", + expr=""" + array( + named_struct( + "id", cast(rand() as string), + "source", cast(rand() as string) + ) + ) + """, percentNulls=0.1, ) - .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1) ) return VariantIndex(_df=data_spec.build(), _schema=vi_schema) diff --git a/tests/gentropy/data_samples/finucane_PIPs.npy b/tests/gentropy/data_samples/finucane_PIPs.npy deleted file mode 100644 index 
2a93a5c71..000000000 Binary files a/tests/gentropy/data_samples/finucane_PIPs.npy and /dev/null differ diff --git a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz b/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz deleted file mode 100644 index 74c0844a4..000000000 Binary files a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz and /dev/null differ diff --git a/tests/gentropy/data_samples/vep_consequences_sample.tsv b/tests/gentropy/data_samples/vep_consequences_sample.tsv deleted file mode 100644 index 7b5de68b2..000000000 --- a/tests/gentropy/data_samples/vep_consequences_sample.tsv +++ /dev/null @@ -1,45 +0,0 @@ -Accession Term Description Display term IMPACT v2g_score eco_score -http://purl.obolibrary.org/obo/SO_0001893 transcript_ablation A feature ablation whereby the deleted region includes a transcript feature Transcript ablation HIGH 1 1 -http://identifiers.org/eco/cttv_mapping_pipeline cttv_mapping_pipeline 1 -http://purl.obolibrary.org/obo/ECO_0000205 curator_inference 1 -http://purl.obolibrary.org/obo/SO_0002165 trinucleotide_repeat_expansion 1 -http://purl.obolibrary.org/obo/SO_0001574 splice_acceptor_variant A splice variant that changes the 2 base region at the 3' end of an intron Splice acceptor variant HIGH 1 0.95 -http://purl.obolibrary.org/obo/SO_0001575 splice_donor_variant A splice variant that changes the 2 base region at the 5' end of an intron Splice donor variant HIGH 1 0.95 -http://purl.obolibrary.org/obo/SO_0001587 stop_gained A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript Stop gained HIGH 1 0.95 -http://purl.obolibrary.org/obo/SO_0001589 frameshift_variant A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three Frameshift variant HIGH 1 0.95 
-http://purl.obolibrary.org/obo/SO_0002012 start_lost A codon variant that changes at least one base of the canonical start codon Start lost HIGH 1 0.95 -http://purl.obolibrary.org/obo/SO_0001578 stop_lost A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript Stop lost HIGH 1 0.9 -http://purl.obolibrary.org/obo/SO_0001889 transcript_amplification A feature amplification of a region containing a transcript Transcript amplification HIGH 1 0.9 -http://purl.obolibrary.org/obo/SO_0001894 regulatory_region_ablation A feature ablation whereby the deleted region includes a regulatory region Regulatory region ablation MODERATE 0.66 0.9 -http://purl.obolibrary.org/obo/SO_0001583 missense_variant A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved Missense variant MODERATE 0.66 0.7 -http://purl.obolibrary.org/obo/SO_0001818 protein_altering_variant A sequence_variant which is predicted to change the protein encoded in the coding sequence Protein altering variant MODERATE 0.66 0.7 -http://purl.obolibrary.org/obo/SO_0001821 inframe_insertion An inframe non synonymous variant that inserts bases into in the coding sequence Inframe insertion MODERATE 0.66 0.7 -http://purl.obolibrary.org/obo/SO_0001822 inframe_deletion An inframe non synonymous variant that deletes bases from the coding sequence Inframe deletion MODERATE 0.66 0.7 -http://purl.obolibrary.org/obo/SO_0001582 initiator_codon_variant 0.7 -http://purl.obolibrary.org/obo/SO_0001630 splice_region_variant A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron Splice region variant LOW 0.33 0.65 -http://purl.obolibrary.org/obo/SO_0001626 incomplete_terminal_codon_variant A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed 
Incomplete terminal codon variant LOW 0.33 0.65 -http://purl.obolibrary.org/obo/SO_0001567 stop_retained_variant A sequence variant where at least one base in the terminator codon is changed, but the terminator remains Stop retained variant LOW 0.33 0.65 -http://purl.obolibrary.org/obo/SO_0001819 synonymous_variant A sequence variant where there is no resulting change to the encoded amino acid Synonymous variant LOW 0.33 0.65 -http://purl.obolibrary.org/obo/SO_0002019 start_retained_variant A sequence variant where at least one base in the start codon is changed, but the start remains Start retained variant LOW 0.33 0.65 -http://purl.obolibrary.org/obo/SO_0001619 non_coding_transcript_variant A transcript variant of a non coding RNA gene Non coding transcript variant MODIFIER 0 0.65 -http://purl.obolibrary.org/obo/SO_0001620 mature_miRNA_variant A transcript variant located with the sequence of the mature miRNA Mature miRNA variant MODIFIER 0 0.65 -http://purl.obolibrary.org/obo/SO_0001621 NMD_transcript_variant A variant in a transcript that is the target of NMD NMD transcript variant MODIFIER 0.1 0.65 -http://purl.obolibrary.org/obo/SO_0001623 5_prime_UTR_variant A UTR variant of the 5' UTR 5 prime UTR variant MODIFIER 0.1 0.65 -http://purl.obolibrary.org/obo/SO_0001624 3_prime_UTR_variant A UTR variant of the 3' UTR 3 prime UTR variant MODIFIER 0.1 0.65 -http://purl.obolibrary.org/obo/SO_0001627 intron_variant A transcript variant occurring within an intron Intron variant MODIFIER 0.1 0.65 -http://purl.obolibrary.org/obo/SO_0001792 non_coding_transcript_exon_variant A sequence variant that changes non-coding exon sequence in a non-coding transcript Non coding transcript exon variant MODIFIER 0 0.65 -http://purl.obolibrary.org/obo/SO_0001580 coding_sequence_variant A sequence variant that changes the coding sequence Coding sequence variant MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001566 regulatory_region_variant A sequence variant located within a 
regulatory region Regulatory region variant MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001631 upstream_gene_variant A sequence variant located 5' of a gene Upstream gene variant MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001632 downstream_gene_variant A sequence variant located 3' of a gene Downstream gene variant MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001782 TF_binding_site_variant A sequence variant located within a transcription factor binding site TF binding site variant MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001891 regulatory_region_amplification A feature amplification of a region containing a regulatory region Regulatory region amplification MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001892 TFBS_amplification A feature amplification of a region containing a transcription factor binding site TFBS amplification MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001895 TFBS_ablation A feature ablation whereby the deleted region includes a transcription factor binding site TFBS ablation MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001906 feature_truncation A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence Feature truncation MODIFIER 0 0.6 -http://purl.obolibrary.org/obo/SO_0001907 feature_elongation A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence Feature elongation MODIFIER 0 0.6 -http://targetvalidation.org/sequence/regulatory_nearest_gene_five_prime_end Regulatory nearest gene 5' end 0.6 -http://purl.obolibrary.org/obo/SO_0001628 intergenic_variant A sequence variant located in the intergenic region, between genes Intergenic variant MODIFIER 0 0.5 -http://purl.obolibrary.org/obo/SO_0001060 sequence_variant 0.5 -http://purl.obolibrary.org/obo/SO_0001825 conservative_inframe_deletion 0.5 -http://targetvalidation.org/sequence/nearest_gene_five_prime_end Nearest gene counting from 5' end 0.5 diff 
--git a/tests/gentropy/data_samples/vep_sample.jsonl b/tests/gentropy/data_samples/vep_sample.jsonl new file mode 100644 index 000000000..78a76f150 --- /dev/null +++ b/tests/gentropy/data_samples/vep_sample.jsonl @@ -0,0 +1,2 @@ +{"transcript_consequences":[{"gene_id":"ENSG00000168702","swissprot":["Q9NZR2.181"],"uniparc":["UPI00001B045B"],"transcript_id":"ENST00000389484","impact":"MODIFIER","canonical":1,"consequence_terms":["intron_variant"],"strand":-1,"variant_allele":"T"}],"assembly_name":"GRCh38","most_severe_consequence":"intron_variant","strand":1,"seq_region_name":"2","start":140699626,"end":140699625,"input":"2\t140699625\t2_140699625_G_GT\tG\tGT","allele_string":"-/T","id":"2_140699625_G_GT"} +{"transcript_consequences":[{"impact":"MODIFIER","consequence_terms":["upstream_gene_variant"],"distance":682,"trembl":["A0A2U3TZJ1.14"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000336451","strand":-1,"variant_allele":"T","gene_id":"ENSG00000155906","uniparc":["UPI000D18E792"]},{"lof_info":"INTRON_SIZE:8753","consequence_terms":["splice_donor_variant"],"impact":"HIGH","transcript_id":"ENST00000444024","cadd_phred":27.1,"cadd_raw":4.846757,"strand":-1,"variant_allele":"T","lof":"HC","canonical":1,"uniprot_isoform":["Q9NWS8-1"],"uniparc":["UPI00001AEAE1"],"swissprot":["Q9NWS8.145"],"gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","uniparc":["UPI000006DC2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"lof":"HC","variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000491268","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant"],"impact":"HIGH","lof_info":"INTRON_SIZE:8753"},{"variant_allele":"T","strand":-1,"uniparc":["UPI0002A12044"],"gene_id":"ENSG00000155906","consequence_terms":["intron_variant"],"impact":"MODIFIER","transcript_id":"ENST00000622845","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A087WXU0.51"]},{"transcript_id":"ENST00000643564","cadd_phred":27.1,"cadd_raw":4.846757,"con
sequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER","gene_id":"ENSG00000155906","cdna_start":578,"cdna_end":578,"strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644054","cadd_phred":27.1,"trembl":["A0A2R8YFC3.14"],"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI001B8998C5"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644711","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A2R8Y4J4.16"],"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI000D1907AF"],"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000645367","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"cadd_phred":27.1,"transcript_id":"ENST00000645895","cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","cdna_end":724,"cdna_start":724,"variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000645917","cadd_raw":4.846757,"consequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER"},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"cadd_raw":4.846757,"trembl":["A0A2R8Y4P5.16"],"cadd_phred":27.1,"transcript_id":"ENST00000646926","variant_allele":"T","strand":-1,"uniparc":["UPI001B89CA49"],"gene_id":"ENSG00000155906"},{"transcript_id":"ENST00000682004","cadd_phred":27.1,"cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","uniparc":["UPI001B88E93C"],"lof":"HC","stra
nd":-1,"variant_allele":"T","trembl":["A0A804HHY2.7"],"cadd_raw":4.846757,"transcript_id":"ENST00000682299","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"uniparc":["UPI001B8920B1"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T","trembl":["A0A804HLE1.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000682392","impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"]},{"cadd_raw":4.846757,"trembl":["A0A804HHW6.7"],"cadd_phred":27.1,"transcript_id":"ENST00000682641","impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753","gene_id":"ENSG00000155906","uniparc":["UPI001B893F2B"],"lof":"HC","variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000682760","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"variant_allele":"T","strand":-1,"impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000683439","gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","swissprot":["Q9NWS8.145"],"uniparc":["UPI00001AEAE1"],"lof":"HC","uniprot_isoform":["Q9NWS8-1"],"strand":-1,"variant_allele":"T","cadd_raw":4.846757,"transcript_id":"ENST00000683724","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000683740","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1},{"cadd_phred":27.1,"transcript_id":"ENST00000684301","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","gene_id":"ENSG00000155906","uniparc":["UPI000006D
C2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000684658","impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"strand":-1,"variant_allele":"T"},{"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1,"gene_id":"ENSG00000155906","cadd_phred":27.1,"transcript_id":"ENST00000684715","cadd_raw":4.846757},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"trembl":["A0A804HKF8.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000684765","strand":-1,"variant_allele":"T","uniparc":["UPI001B89CAE3"],"gene_id":"ENSG00000155906"}],"seq_region_name":"6","assembly_name":"GRCh38","end":151445307,"strand":1,"most_severe_consequence":"splice_donor_variant","start":151445307,"id":"6_151445307_C_T","colocated_variants":[{"clin_sig_allele":"T:pathogenic;T:likely_pathogenic","end":151445307,"var_synonyms":{"ClinVar":["RCV000032983","VCV000039764","RCV001814019"],"OMIM":[614917.0001]},"strand":1,"start":151445307,"allele_string":"C/T","clin_sig":["likely_pathogenic","pathogenic"],"seq_region_name":"6","pubmed":[23022099,18835491],"phenotype_or_disease":1,"id":"rs1562800908"}],"input":"6\t151445307\t6_151445307_C_T\tC\tT","allele_string":"C/T"} diff --git a/tests/gentropy/dataset/test_variant_annotation.py b/tests/gentropy/dataset/test_variant_annotation.py deleted file mode 100644 index 05fa0cd60..000000000 --- a/tests/gentropy/dataset/test_variant_annotation.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Tests variant annotation dataset.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from gentropy.dataset.gene_index import GeneIndex - -from gentropy.dataset.v2g import V2G -from gentropy.dataset.variant_annotation import VariantAnnotation - - 
-def test_variant_index_creation(mock_variant_annotation: VariantAnnotation) -> None: - """Test gene index creation with mock gene index.""" - assert isinstance(mock_variant_annotation, VariantAnnotation) - - -def test_get_plof_v2g( - mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex -) -> None: - """Test get_plof_v2g with mock variant annotation.""" - assert isinstance(mock_variant_annotation.get_plof_v2g(mock_gene_index), V2G) - - -def test_get_distance_to_tss( - mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex -) -> None: - """Test get_distance_to_tss with mock variant annotation.""" - assert isinstance(mock_variant_annotation.get_distance_to_tss(mock_gene_index), V2G) diff --git a/tests/gentropy/dataset/test_variant_index.py b/tests/gentropy/dataset/test_variant_index.py index a41db3031..4d97ef536 100644 --- a/tests/gentropy/dataset/test_variant_index.py +++ b/tests/gentropy/dataset/test_variant_index.py @@ -4,11 +4,12 @@ from typing import TYPE_CHECKING +from gentropy.dataset.gene_index import GeneIndex +from gentropy.dataset.v2g import V2G from gentropy.dataset.variant_index import VariantIndex if TYPE_CHECKING: - from gentropy.dataset.study_locus import StudyLocus - from gentropy.dataset.variant_annotation import VariantAnnotation + pass def test_variant_index_creation(mock_variant_index: VariantIndex) -> None: @@ -16,11 +17,15 @@ def test_variant_index_creation(mock_variant_index: VariantIndex) -> None: assert isinstance(mock_variant_index, VariantIndex) -def test_from_variant_annotation( - mock_variant_annotation: VariantAnnotation, mock_study_locus: StudyLocus +def test_get_plof_v2g( + mock_variant_index: VariantIndex, mock_gene_index: GeneIndex ) -> None: - """Test variant index creation from variant annotation.""" - variant_index = VariantIndex.from_variant_annotation( - mock_variant_annotation, mock_study_locus - ) - assert isinstance(variant_index, VariantIndex) + """Test get_plof_v2g with mock variant 
annotation.""" + assert isinstance(mock_variant_index.get_plof_v2g(mock_gene_index), V2G) + + +def test_get_distance_to_tss( + mock_variant_index: VariantIndex, mock_gene_index: GeneIndex +) -> None: + """Test get_distance_to_tss with mock variant annotation.""" + assert isinstance(mock_variant_index.get_distance_to_tss(mock_gene_index), V2G) diff --git a/tests/gentropy/datasource/ensembl/test_vep_variants.py b/tests/gentropy/datasource/ensembl/test_vep_variants.py new file mode 100644 index 000000000..1401f1b6c --- /dev/null +++ b/tests/gentropy/datasource/ensembl/test_vep_variants.py @@ -0,0 +1,145 @@ +"""Testing VEP parsing and variant index extraction.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from gentropy.dataset.variant_index import VariantIndex +from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser +from pyspark.sql import DataFrame +from pyspark.sql import functions as f + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + + +class TestVEPParserInSilicoExtractor: + """Testing the _vep_in_silico_prediction_extractor method of the VEP parser class. + + These tests assumes that the _get_most_severe_transcript() method works correctly, as it's not tested. + + The test cases try to cover the following scenarios: + - Transcripts with no assessments. + - Transcripts without assessments flag. + - Transcripts with no score. + - Testing cases, where no score column is provided. 
+ """ + + # Data prototype: + SAMPLE_DATA = [ + # Complete dataset: + ("v1", "deleterious", 0.1, "gene1", "flag"), + # No assessment: + ("v2", None, 0.1, "gene1", "flag"), + # No flag: + ("v3", "deleterious", 0.1, "gene1", None), + # No score: + ("v4", "deleterious", None, "gene1", "flag"), + ] + + SAMPLE_COLUMNS = ["variantId", "assessment", "score", "gene_id", "flag"] + + @pytest.fixture(autouse=True) + def _setup(self: TestVEPParserInSilicoExtractor, spark: SparkSession) -> None: + """Setup fixture.""" + parsed_df = ( + spark.createDataFrame(self.SAMPLE_DATA, self.SAMPLE_COLUMNS) + .groupBy("variantId") + .agg( + f.collect_list( + f.struct( + f.col("assessment").alias("assessment"), + f.col("score").alias("score"), + f.col("flag").alias("flag"), + f.col("gene_id").alias("gene_id"), + ) + ).alias("transcripts") + ) + .select( + "variantId", + VariantEffectPredictorParser._vep_in_silico_prediction_extractor( + "transcripts", "method_name", "score", "assessment", "flag" + ).alias("in_silico_predictions"), + ) + ).persist() + + self.df = parsed_df + + def test_in_silico_output_missing_value( + self: TestVEPParserInSilicoExtractor, + ) -> None: + """Test if the in silico output count is correct.""" + variant_with_missing_score = [ + x[0] for x in filter(lambda x: x[2] is None, self.SAMPLE_DATA) + ] + # Assert that the correct variants return null: + assert ( + [ + x["variantId"] + for x in self.df.filter( + f.col("in_silico_predictions").isNull() + ).collect() + ] + == variant_with_missing_score + ), "Not the right variants got nullified in-silico predictor object." + + +class TestVEPParser: + """Testing VEP parser class. + + Some of the input data: + - 6_151445307_C_T - complicated variant with numerous annotations. + - 2_140699625_G_GT - simple variant with no annotations whatsoever. 
+ """ + + SAMPLE_VEP_DATA_PATH = "tests/gentropy/data_samples/vep_sample.jsonl" + + @pytest.fixture(autouse=True) + def _setup(self: TestVEPParser, spark: SparkSession) -> None: + """Setup fixture.""" + self.raw_vep_output = spark.read.json( + self.SAMPLE_VEP_DATA_PATH, + schema=VariantEffectPredictorParser.get_schema(), + ) + self.processed_vep_output = VariantEffectPredictorParser.process_vep_output( + self.raw_vep_output + ) + + def test_extract_variant_index_from_vep( + self: TestVEPParser, spark: SparkSession + ) -> None: + """Test if the variant index can be extracted from the VEP output.""" + variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep( + spark, self.SAMPLE_VEP_DATA_PATH + ) + + assert isinstance( + variant_index, VariantIndex + ), "VariantIndex object not created." + + def test_process(self: TestVEPParser) -> None: + """Test process method.""" + df = VariantEffectPredictorParser.process_vep_output(self.raw_vep_output) + assert isinstance(df, DataFrame), "Processed VEP output is not a DataFrame." + assert df.count() > 0, "No variant data in processed VEP dataframe." + + def test_conversion(self: TestVEPParser) -> None: + """Test if processed data can be converted into a VariantIndex object.""" + variant_index = VariantIndex( + _df=self.processed_vep_output, + _schema=VariantIndex.get_schema(), + ) + + assert isinstance( + variant_index, VariantIndex + ), "VariantIndex object not created." + + def test_variant_count(self: TestVEPParser) -> None: + """Test if the number of variants is correct. + + It is expected that all rows from the parsed VEP output are present in the processed VEP output. + """ + assert ( + self.raw_vep_output.count() == self.processed_vep_output.count() + ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}." 
diff --git a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py index de70e8b63..e7067e3d9 100644 --- a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py +++ b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py @@ -2,7 +2,7 @@ from __future__ import annotations -from gentropy.dataset.variant_annotation import VariantAnnotation +from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.gwas_catalog.associations import ( GWASCatalogCuratedAssociationsParser, StudyLocusGWASCatalog, @@ -50,29 +50,29 @@ def test_qc_ambiguous_study( def test_study_locus_gwas_catalog_from_source( - mock_variant_annotation: VariantAnnotation, + mock_variant_index: VariantIndex, sample_gwas_catalog_associations: DataFrame, ) -> None: """Test study locus from gwas catalog mock data.""" assert isinstance( GWASCatalogCuratedAssociationsParser.from_source( - sample_gwas_catalog_associations, mock_variant_annotation + sample_gwas_catalog_associations, mock_variant_index ), StudyLocusGWASCatalog, ) -def test__map_to_variant_annotation_variants( +def test_map_variants_to_variant_index( sample_gwas_catalog_associations: DataFrame, - mock_variant_annotation: VariantAnnotation, + mock_variant_index: VariantIndex, ) -> None: """Test mapping to variant annotation variants.""" assert isinstance( - GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants( + GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index( sample_gwas_catalog_associations.withColumn( "studyLocusId", f.monotonically_increasing_id().cast(LongType()) ), - mock_variant_annotation, + mock_variant_index, ), DataFrame, ) diff --git a/tests/gentropy/test_schemas.py b/tests/gentropy/test_schemas.py index 417eb4657..630abd0ab 100644 --- a/tests/gentropy/test_schemas.py +++ b/tests/gentropy/test_schemas.py @@ -57,6 +57,9 @@ def 
test_schema_columns_camelcase(schema_json: str) -> None: Args: schema_json (str): schema filename """ + if schema_json == "vep_json_output.json": + pytest.skip("VEP schema is exempt from camelCase check.") + core_schema = json.loads(Path(SCHEMA_DIR, schema_json).read_text(encoding="utf-8")) schema = StructType.fromJson(core_schema) # Use a regular expression to check if the identifier is in camelCase