diff --git a/.vscode/settings.json b/.vscode/settings.json
index aaafda97f..ecc456889 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -25,5 +25,8 @@
"python.testing.pytestEnabled": true,
"mypy-type-checker.severity": {
"error": "Information"
- }
+ },
+ "yaml.extension.recommendations": false,
+ "workbench.remoteIndicator.showExtensionRecommendations": false,
+ "extensions.ignoreRecommendations": true
}
diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
index c8f67d418..c61863b86 100644
--- a/config/datasets/ot_gcp.yaml
+++ b/config/datasets/ot_gcp.yaml
@@ -37,13 +37,13 @@ gnomad_public_bucket: gs://gcp-public-data--gnomad/release/
ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm
ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht
liftover_ht_path: ${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht
-# variant_annotation
+# GnomAD variant set:
gnomad_genomes_path: ${datasets.gnomad_public_bucket}4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/
# Others
chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz
chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain
-vep_consequences: ${datasets.static_assets}/vep_consequences.tsv
+vep_consequences: ${datasets.static_assets}/variant_consequence_to_score.tsv
anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed
javierre: ${datasets.static_assets}/javierre_2016_preprocessed
jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv
@@ -55,7 +55,7 @@ finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r
finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv
# Dev output datasets
-variant_annotation: ${datasets.outputs}/variant_annotation
+gnomad_variants: ${datasets.outputs}/gnomad_variants
study_locus: ${datasets.outputs}/study_locus
summary_statistics: ${datasets.outputs}/summary_statistics
study_locus_overlap: ${datasets.outputs}/study_locus_overlap
@@ -68,6 +68,8 @@ catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
from_sumstats_pics: ${datasets.credible_set}/from_sumstats
+vep_output_path: gs://genetics_etl_python_playground/vep/full_variant_index_vcf
+
# ETL output datasets:
l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
l2g_model: ${datasets.release_folder}/locus_to_gene_model/classifier.skops
@@ -78,4 +80,4 @@ study_index: ${datasets.release_folder}/study_index
variant_index: ${datasets.release_folder}/variant_index
credible_set: ${datasets.release_folder}/credible_set
gene_index: ${datasets.release_folder}/gene_index
-v2g: ${datasets.release_folder}/variant_to_gene
+variant_to_gene: ${datasets.release_folder}/variant_to_gene
diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml
deleted file mode 100644
index 55a9503ce..000000000
--- a/config/step/ot_variant_annotation.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-defaults:
- - variant_annotation
-
-variant_annotation_path: ${datasets.variant_annotation}
-gnomad_genomes_path: ${datasets.gnomad_genomes_path}
-chain_38_37: ${datasets.chain_38_37}
-gnomad_variant_populations:
- - afr # African-American
- - amr # American Admixed/Latino
- - ami # Amish ancestry
- - asj # Ashkenazi Jewish
- - eas # East Asian
- - fin # Finnish
- - nfe # Non-Finnish European
- - mid # Middle Eastern
- - sas # South Asian
- - remaining # Other
-# The version will of the gnomad will be inferred from ld_matrix_template and appended to the ld_index_out.
-use_version_from_input: true
diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
index 3834196b2..00b6b1602 100644
--- a/config/step/ot_variant_index.yaml
+++ b/config/step/ot_variant_index.yaml
@@ -1,6 +1,6 @@
defaults:
- variant_index
-variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.credible_set}
+vep_output_json_path: ${datasets.vep_output_path}
+gnomad_variant_annotations_path: ${datasets.gnomad_variants}
variant_index_path: ${datasets.variant_index}
diff --git a/config/step/ot_variant_to_gene.yaml b/config/step/ot_variant_to_gene.yaml
index 1ac6d2fbe..7187a0625 100644
--- a/config/step/ot_variant_to_gene.yaml
+++ b/config/step/ot_variant_to_gene.yaml
@@ -2,7 +2,6 @@ defaults:
- variant_to_gene
variant_index_path: ${datasets.variant_index}
-variant_annotation_path: ${datasets.variant_annotation}
gene_index_path: ${datasets.gene_index}
vep_consequences_path: ${datasets.vep_consequences}
liftover_chain_file_path: ${datasets.chain_37_38}
@@ -11,4 +10,4 @@ interval_sources:
javierre: ${datasets.javierre}
jung: ${datasets.jung}
thurman: ${datasets.thurman}
-v2g_path: ${datasets.v2g}
+v2g_path: ${datasets.variant_to_gene}
diff --git a/docs/assets/imgs/ensembl_logo.png b/docs/assets/imgs/ensembl_logo.png
new file mode 100644
index 000000000..83fd25751
Binary files /dev/null and b/docs/assets/imgs/ensembl_logo.png differ
diff --git a/docs/python_api/datasets/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md
deleted file mode 100644
index f82210b6e..000000000
--- a/docs/python_api/datasets/variant_annotation.md
+++ /dev/null
@@ -1,9 +0,0 @@
----
-title: Variant annotation
----
-
-::: gentropy.dataset.variant_annotation.VariantAnnotation
-
-## Schema
-
---8<-- "assets/schemas/variant_annotation.md"
diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
index f05a2b834..e6e081b21 100644
--- a/docs/python_api/datasources/_datasources.md
+++ b/docs/python_api/datasources/_datasources.md
@@ -23,7 +23,8 @@ This section contains information about the data source harmonisation tools avai
## Variant annotation/validation
1. [GnomAD](gnomad/_gnomad.md) v4.0
-1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)
## Linkage desiquilibrium
diff --git a/docs/python_api/datasources/ensembl/_ensembl.md b/docs/python_api/datasources/ensembl/_ensembl.md
new file mode 100644
index 000000000..56c49870a
--- /dev/null
+++ b/docs/python_api/datasources/ensembl/_ensembl.md
@@ -0,0 +1,10 @@
+---
+title: Ensembl annotations
+---
+
+
+
+
+Ensembl
+
+
+[Ensembl](https://www.ensembl.org/index.html) provides a diverse set of genetic data that Gentropy takes advantage of, including gene sets and variant annotations.
diff --git a/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md
new file mode 100644
index 000000000..c956ed3e1
--- /dev/null
+++ b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md
@@ -0,0 +1,5 @@
+---
+title: Variant effect predictor parser
+---
+
+::: gentropy.datasource.ensembl.vep_parser.VariantEffectPredictorParser
diff --git a/docs/python_api/steps/ld_index.md b/docs/python_api/steps/ld_index.md
index bf8b9b58e..d8f61a528 100644
--- a/docs/python_api/steps/ld_index.md
+++ b/docs/python_api/steps/ld_index.md
@@ -1,5 +1,5 @@
---
-title: ld_index
+title: GnomAD Linkage data ingestion
---
-::: gentropy.ld_index.LDIndexStep
+::: gentropy.gnomad_ingestion.LDIndexStep
diff --git a/docs/python_api/steps/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md
index e65a071b2..2b5582df4 100644
--- a/docs/python_api/steps/variant_annotation_step.md
+++ b/docs/python_api/steps/variant_annotation_step.md
@@ -1,5 +1,5 @@
---
-title: variant_annotation
+title: GnomAD variant data ingestion
---
-::: gentropy.variant_annotation.VariantAnnotationStep
+::: gentropy.gnomad_ingestion.GnomadVariantIndexStep
diff --git a/docs/python_api/steps/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md
index 1a3e56af8..db1c1fd20 100644
--- a/docs/python_api/steps/variant_to_gene_step.md
+++ b/docs/python_api/steps/variant_to_gene_step.md
@@ -2,4 +2,4 @@
title: variant_to_gene
---
-::: gentropy.v2g.V2GStep
+::: gentropy.variant_to_gene.V2GStep
diff --git a/poetry.lock b/poetry.lock
index b181572d4..432c3d4cc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6881,6 +6881,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+ {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
diff --git a/src/airflow/dags/gnomad_preprocess.py b/src/airflow/dags/gnomad_preprocess.py
index 22e7fa056..03962bec5 100644
--- a/src/airflow/dags/gnomad_preprocess.py
+++ b/src/airflow/dags/gnomad_preprocess.py
@@ -1,4 +1,4 @@
-"""Airflow DAG for the Preprocess part of the pipeline."""
+"""Airflow DAG for the Preprocess GnomAD datasets - LD index and GnomAD variant set."""
from __future__ import annotations
@@ -11,13 +11,13 @@
ALL_STEPS = [
"ot_ld_index",
- "ot_variant_annotation",
+ "ot_gnomad_variants",
]
with DAG(
dag_id=Path(__file__).stem,
- description="Open Targets Genetics — Preprocess",
+ description="Open Targets Genetics — GnomAD Preprocess",
default_args=common.shared_dag_args,
**common.shared_dag_kwargs,
):
diff --git a/src/gentropy/assets/data/so_mappings.json b/src/gentropy/assets/data/so_mappings.json
new file mode 100644
index 000000000..8a087b6f7
--- /dev/null
+++ b/src/gentropy/assets/data/so_mappings.json
@@ -0,0 +1,43 @@
+{
+ "transcript_ablation": "SO_0001893",
+ "splice_acceptor_variant": "SO_0001574",
+ "splice_donor_variant": "SO_0001575",
+ "stop_gained": "SO_0001587",
+ "frameshift_variant": "SO_0001589",
+ "stop_lost": "SO_0001578",
+ "start_lost": "SO_0002012",
+ "transcript_amplification": "SO_0001889",
+ "feature_elongation": "SO_0001907",
+ "feature_truncation": "SO_0001906",
+ "inframe_insertion": "SO_0001821",
+ "inframe_deletion": "SO_0001822",
+ "missense_variant": "SO_0001583",
+ "protein_altering_variant": "SO_0001818",
+ "splice_donor_5th_base_variant": "SO_0001787",
+ "splice_region_variant": "SO_0001630",
+ "splice_donor_region_variant": "SO_0002170",
+ "splice_polypyrimidine_tract_variant": "SO_0002169",
+ "incomplete_terminal_codon_variant": "SO_0001626",
+ "start_retained_variant": "SO_0002019",
+ "stop_retained_variant": "SO_0001567",
+ "synonymous_variant": "SO_0001819",
+ "coding_sequence_variant": "SO_0001580",
+ "mature_miRNA_variant": "SO_0001620",
+ "5_prime_UTR_variant": "SO_0001623",
+ "3_prime_UTR_variant": "SO_0001624",
+ "non_coding_transcript_exon_variant": "SO_0001792",
+ "intron_variant": "SO_0001627",
+ "NMD_transcript_variant": "SO_0001621",
+ "non_coding_transcript_variant": "SO_0001619",
+ "coding_transcript_variant": "SO_0001968",
+ "upstream_gene_variant": "SO_0001631",
+ "downstream_gene_variant": "SO_0001632",
+ "TFBS_ablation": "SO_0001895",
+ "TFBS_amplification": "SO_0001892",
+ "TF_binding_site_variant": "SO_0001782",
+ "regulatory_region_ablation": "SO_0001894",
+ "regulatory_region_amplification": "SO_0001891",
+ "regulatory_region_variant": "SO_0001566",
+ "intergenic_variant": "SO_0001628",
+ "sequence_variant": "SO_0001060"
+}
diff --git a/src/gentropy/assets/schemas/variant_annotation.json b/src/gentropy/assets/schemas/variant_annotation.json
deleted file mode 100644
index ab8767389..000000000
--- a/src/gentropy/assets/schemas/variant_annotation.json
+++ /dev/null
@@ -1,215 +0,0 @@
-{
- "type": "struct",
- "fields": [
- {
- "name": "variantId",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "chromosome",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "position",
- "type": "integer",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "referenceAllele",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "alternateAllele",
- "type": "string",
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "chromosomeB37",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "positionB37",
- "type": "integer",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleType",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "rsIds",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleFrequencies",
- "type": {
- "type": "array",
- "elementType": {
- "type": "struct",
- "fields": [
- {
- "name": "populationName",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "alleleFrequency",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "containsNull": true
- },
- "nullable": false,
- "metadata": {}
- },
- {
- "name": "inSilicoPredictors",
- "nullable": false,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "cadd",
- "nullable": true,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "raw",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phred",
- "type": "float",
- "nullable": true,
- "metadata": {}
- }
- ]
- }
- },
- {
- "name": "revelMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "spliceaiDsMax",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "pangolinLargestDs",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phylop",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "siftMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "polyphenMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
- }
- },
- {
- "name": "vep",
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "mostSevereConsequence",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "transcriptConsequences",
- "type": {
- "type": "array",
- "elementType": {
- "type": "struct",
- "fields": [
- {
- "name": "aminoAcids",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "consequenceTerms",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "geneId",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "lof",
- "type": "string",
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
- }
- ]
- },
- "nullable": false,
- "metadata": {}
- }
- ]
-}
diff --git a/src/gentropy/assets/schemas/variant_index.json b/src/gentropy/assets/schemas/variant_index.json
index c6a3702c9..d9be4e50f 100644
--- a/src/gentropy/assets/schemas/variant_index.json
+++ b/src/gentropy/assets/schemas/variant_index.json
@@ -1,53 +1,188 @@
{
- "type": "struct",
"fields": [
{
+ "metadata": {},
"name": "variantId",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "chromosome",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "position",
- "type": "integer",
"nullable": false,
- "metadata": {}
+ "type": "integer"
},
{
+ "metadata": {},
"name": "referenceAllele",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
+ "metadata": {},
"name": "alternateAllele",
- "type": "string",
"nullable": false,
- "metadata": {}
+ "type": "string"
},
{
- "name": "chromosomeB37",
- "type": "string",
- "nullable": true,
- "metadata": {}
+ "metadata": {},
+ "name": "inSilicoPredictors",
+ "nullable": false,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "method",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "assessment",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "score",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "assessmentFlag",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "targetId",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "mostSevereConsequenceId",
+ "nullable": false,
+ "type": "string"
},
{
- "name": "positionB37",
- "type": "integer",
+ "metadata": {},
+ "name": "transcriptConsequences",
"nullable": true,
- "metadata": {}
+ "type": {
+ "containsNull": false,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "variantFunctionalConsequenceIds",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "aminoAcidChange",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "uniprotAccessions",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "isEnsemblCanonical",
+ "nullable": false,
+ "type": "boolean"
+ },
+ {
+ "metadata": {},
+ "name": "codons",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "distance",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "targetId",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lofteePrediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "siftPrediction",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "polyphenPrediction",
+ "nullable": true,
+ "type": "float"
+ },
+ {
+ "metadata": {},
+ "name": "transcriptId",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
},
{
- "name": "alleleType",
- "type": "string",
- "nullable": false,
- "metadata": {}
+ "metadata": {},
+ "name": "rsIds",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
},
{
"name": "alleleFrequencies",
@@ -76,88 +211,31 @@
"metadata": {}
},
{
- "name": "inSilicoPredictors",
- "nullable": false,
"metadata": {},
+ "name": "dbXrefs",
+ "nullable": false,
"type": {
- "type": "struct",
- "fields": [
- {
- "name": "cadd",
- "nullable": true,
- "metadata": {},
- "type": {
- "type": "struct",
- "fields": [
- {
- "name": "raw",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phred",
- "type": "float",
- "nullable": true,
- "metadata": {}
- }
- ]
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "source",
+ "nullable": true,
+ "type": "string"
}
- },
- {
- "name": "revelMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "spliceaiDsMax",
- "type": "float",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "pangolinLargestDs",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "phylop",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "siftMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "polyphenMax",
- "type": "double",
- "nullable": true,
- "metadata": {}
- }
- ]
+ ],
+ "type": "struct"
+ },
+ "type": "array"
}
- },
- {
- "name": "mostSevereConsequence",
- "type": "string",
- "nullable": true,
- "metadata": {}
- },
- {
- "name": "rsIds",
- "type": {
- "type": "array",
- "elementType": "string",
- "containsNull": true
- },
- "nullable": true,
- "metadata": {}
}
- ]
+ ],
+ "type": "struct"
}
diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json
new file mode 100644
index 000000000..ecad3ea1e
--- /dev/null
+++ b/src/gentropy/assets/schemas/vep_json_output.json
@@ -0,0 +1,531 @@
+{
+ "fields": [
+ {
+ "metadata": {},
+ "name": "allele_string",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "assembly_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "intergenic_consequences",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "cadd_phred",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_raw",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "consequence_terms",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "variant_allele",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "gene_id",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "colocated_variants",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "allele_string",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "clin_sig",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "clin_sig_allele",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "phenotype_or_disease",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "pubmed",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "long",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "seq_region_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "var_synonyms",
+ "nullable": true,
+ "type": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "ClinVar",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "LMDD",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "OIVD",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "OMIM",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "double",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "PharmGKB",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "PhenCode",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "UniProt",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "dbPEX",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ }
+ ],
+ "type": "struct"
+ }
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "input",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "most_severe_consequence",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "seq_region_name",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "transcript_consequences",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "alphamissense",
+ "nullable": true,
+ "type": {
+ "fields": [
+ {
+ "metadata": {},
+ "name": "am_class",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "am_pathogenicity",
+ "nullable": true,
+ "type": "double"
+ }
+ ],
+ "type": "struct"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "amino_acids",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_phred",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "cadd_raw",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "canonical",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cdna_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cdna_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cds_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "cds_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "codons",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "consequence_terms",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "distance",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "flags",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "gene_id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "impact",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_filter",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_flags",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "lof_info",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "polyphen_prediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "polyphen_score",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "protein_end",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "protein_start",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "sift_prediction",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "sift_score",
+ "nullable": true,
+ "type": "double"
+ },
+ {
+ "metadata": {},
+ "name": "strand",
+ "nullable": true,
+ "type": "long"
+ },
+ {
+ "metadata": {},
+ "name": "swissprot",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "transcript_id",
+ "nullable": true,
+ "type": "string"
+ },
+ {
+ "metadata": {},
+ "name": "trembl",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "uniparc",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "uniprot_isoform",
+ "nullable": true,
+ "type": {
+ "containsNull": true,
+ "elementType": "string",
+ "type": "array"
+ }
+ },
+ {
+ "metadata": {},
+ "name": "variant_allele",
+ "nullable": true,
+ "type": "string"
+ }
+ ],
+ "type": "struct"
+ },
+ "type": "array"
+ }
+ }
+ ],
+ "type": "struct"
+}
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 4ebf4cbe8..f7ba73e98 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -180,7 +180,7 @@ class LDIndexConfig(StepConfig):
]
)
use_version_from_input: bool = False
- _target_: str = "gentropy.ld_index.LDIndexStep"
+ _target_: str = "gentropy.gnomad_ingestion.LDIndexStep"
@dataclass
@@ -302,8 +302,8 @@ class UkbPppEurConfig(StepConfig):
@dataclass
-class VariantAnnotationConfig(StepConfig):
- """Variant annotation step configuration."""
+class GnomadVariantConfig(StepConfig):
+ """Gnomad variant ingestion step configuration."""
session: Any = field(
default_factory=lambda: {
@@ -312,7 +312,6 @@ class VariantAnnotationConfig(StepConfig):
)
variant_annotation_path: str = MISSING
gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/"
- chain_38_37: str = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
gnomad_variant_populations: list[str] = field(
default_factory=lambda: [
"afr", # African-American
@@ -328,16 +327,16 @@ class VariantAnnotationConfig(StepConfig):
]
)
use_version_from_input: bool = False
- _target_: str = "gentropy.variant_annotation.VariantAnnotationStep"
+ _target_: str = "gentropy.gnomad_ingestion.GnomadVariantIndexStep"
@dataclass
class VariantIndexConfig(StepConfig):
"""Variant index step configuration."""
- variant_annotation_path: str = MISSING
- credible_set_path: str = MISSING
+ vep_output_json_path: str = MISSING
variant_index_path: str = MISSING
+ gnomad_variant_annotations_path: str | None = None
_target_: str = "gentropy.variant_index.VariantIndexStep"
@@ -346,7 +345,6 @@ class VariantToGeneConfig(StepConfig):
"""V2G step configuration."""
variant_index_path: str = MISSING
- variant_annotation_path: str = MISSING
gene_index_path: str = MISSING
vep_consequences_path: str = MISSING
liftover_chain_file_path: str = MISSING
@@ -371,7 +369,7 @@ class VariantToGeneConfig(StepConfig):
)
interval_sources: Dict[str, str] = field(default_factory=dict)
v2g_path: str = MISSING
- _target_: str = "gentropy.v2g.V2GStep"
+ _target_: str = "gentropy.variant_to_gene.V2GStep"
@dataclass
@@ -491,8 +489,8 @@ def register_config() -> None:
)
cs.store(group="step", name="pics", node=PICSConfig)
+ cs.store(group="step", name="variant_annotation", node=GnomadVariantConfig)
cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig)
- cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig)
cs.store(group="step", name="variant_index", node=VariantIndexConfig)
cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig)
cs.store(
diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py
index 4bafbc288..a9e24dcab 100644
--- a/src/gentropy/dataset/variant_index.py
+++ b/src/gentropy/dataset/variant_index.py
@@ -1,78 +1,295 @@
-"""Variant index dataset."""
+"""Dataset definition for variant annotation."""
+
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
-import pyspark.sql.functions as f
+from pyspark.sql import functions as f
from gentropy.common.schemas import parse_spark_schema
-from gentropy.common.spark_helpers import nullify_empty_array
+from gentropy.common.spark_helpers import (
+ get_record_with_maximum_value,
+ normalise_column,
+)
from gentropy.dataset.dataset import Dataset
-from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.v2g import V2G
if TYPE_CHECKING:
+ from pyspark.sql import Column, DataFrame
from pyspark.sql.types import StructType
- from gentropy.dataset.variant_annotation import VariantAnnotation
-
@dataclass
class VariantIndex(Dataset):
- """Variant index dataset.
-
- Variant index dataset is the result of intersecting the variant annotation dataset with the variants with V2D available information.
- """
+ """Dataset for representing variants and methods applied on them."""
@classmethod
def get_schema(cls: type[VariantIndex]) -> StructType:
- """Provides the schema for the VariantIndex dataset.
+ """Provides the schema for the variant index dataset.
Returns:
StructType: Schema for the VariantIndex dataset
"""
return parse_spark_schema("variant_index.json")
- @classmethod
- def from_variant_annotation(
- cls: type[VariantIndex],
- variant_annotation: VariantAnnotation,
- study_locus: StudyLocus,
+ def add_annotation(
+ self: VariantIndex, annotation_source: VariantIndex
) -> VariantIndex:
- """Initialise VariantIndex from pre-existing variant annotation dataset.
+ """Import annotation from another variant index dataset.
+
+ At this point the annotation can be extended with extra cross-references,
+ in-silico predictions and allele frequencies.
Args:
- variant_annotation (VariantAnnotation): Variant annotation dataset
- study_locus (StudyLocus): Study locus dataset with the variants to intersect with the variant annotation dataset
+ annotation_source (VariantIndex): Annotation to add to the dataset
+
+ Returns:
+ VariantIndex: VariantIndex dataset with the annotation added
+ """
+ # Columns in the source dataset:
+ variant_index_columns = [
+ # Combining cross-references:
+ f.array_union(f.col("dbXrefs"), f.col("annotation_dbXrefs"))
+ if row == "dbXrefs"
+ # Combining in silico predictors:
+ else f.array_union(
+ f.col("inSilicoPredictors"), f.col("annotation_inSilicoPredictors")
+ )
+ if row == "inSilicoPredictors"
+ # Combining allele frequencies:
+ else f.array_union(
+ f.col("alleleFrequencies"), f.col("annotation_alleleFrequencies")
+ )
+ if row == "alleleFrequencies"
+ # Carrying over all other columns:
+ else row
+ for row in self.df.columns
+ ]
+
+ # Rename columns in the annotation source to avoid conflicts:
+ annotation = annotation_source.df.select(
+ *[
+ f.col(col).alias(f"annotation_{col}") if col != "variantId" else col
+ for col in annotation_source.df.columns
+ ]
+ )
+
+ # Join the annotation to the dataset:
+ return VariantIndex(
+ _df=(
+ annotation.join(
+ f.broadcast(self.df), on="variantId", how="right"
+ ).select(*variant_index_columns)
+ ),
+ _schema=self.schema,
+ )
+
+ def max_maf(self: VariantIndex) -> Column:
+ """Maximum minor allele frequency across all populations assuming all variants biallelic.
Returns:
- VariantIndex: Variant index dataset
+ Column: Maximum minor allele frequency across all populations.
+
+ Raises:
+ ValueError: Allele frequencies are not present in the dataset.
"""
- unchanged_cols = [
+ if "alleleFrequencies" not in self.df.columns:
+ raise ValueError("Allele frequencies are not present in the dataset.")
+
+ return f.array_max(
+ f.transform(
+ self.df.alleleFrequencies,
+ lambda af: f.when(
+ af.alleleFrequency > 0.5, 1 - af.alleleFrequency
+ ).otherwise(af.alleleFrequency),
+ )
+ )
+
+ def filter_by_variant(self: VariantIndex, df: DataFrame) -> VariantIndex:
+ """Filter variant annotation dataset by a variant dataframe.
+
+ Args:
+ df (DataFrame): A dataframe of variants
+
+ Returns:
+ VariantIndex: A filtered variant annotation dataset
+ """
+ join_columns = ["variantId", "chromosome"]
+
+ assert all(
+ col in df.columns for col in join_columns
+ ), "The variant dataframe must contain the columns 'variantId' and 'chromosome'."
+
+ return VariantIndex(
+ _df=self._df.join(
+ f.broadcast(df.select(*join_columns).distinct()),
+ on=join_columns,
+ how="inner",
+ ),
+ _schema=self.schema,
+ )
+
+ def get_transcript_consequence_df(
+ self: VariantIndex, gene_index: GeneIndex | None = None
+ ) -> DataFrame:
+ """Dataframe of exploded transcript consequences.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ gene_index (GeneIndex | None): A gene index. Defaults to None.
+
+ Returns:
+ DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence
+ """
+ # exploding the array removes records without VEP annotation
+ transript_consequences = self.df.withColumn(
+ "transcriptConsequence", f.explode("transcriptConsequences")
+ ).select(
"variantId",
"chromosome",
"position",
- "referenceAllele",
- "alternateAllele",
- "chromosomeB37",
- "positionB37",
- "alleleType",
- "alleleFrequencies",
- "inSilicoPredictors",
- ]
- va_slimmed = variant_annotation.filter_by_variant_df(
- study_locus.unique_variants_in_locus()
+ "transcriptConsequence",
+ f.col("transcriptConsequence.targetId").alias("geneId"),
)
- return cls(
+ if gene_index:
+ transript_consequences = transript_consequences.join(
+ f.broadcast(gene_index.df),
+ on=["chromosome", "geneId"],
+ )
+ return transript_consequences
+
+ def get_distance_to_tss(
+ self: VariantIndex,
+ gene_index: GeneIndex,
+ max_distance: int = 500_000,
+ ) -> V2G:
+ """Extracts variant to gene assignments for variants falling within a window of a gene's TSS.
+
+ Args:
+ gene_index (GeneIndex): A gene index to filter by.
+ max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.
+
+ Returns:
+ V2G: variant to gene assignments with their distance to the TSS
+ """
+ return V2G(
+ _df=(
+ self.df.alias("variant")
+ .join(
+ f.broadcast(gene_index.locations_lut()).alias("gene"),
+ on=[
+ f.col("variant.chromosome") == f.col("gene.chromosome"),
+ f.abs(f.col("variant.position") - f.col("gene.tss"))
+ <= max_distance,
+ ],
+ how="inner",
+ )
+ .withColumn(
+ "distance", f.abs(f.col("variant.position") - f.col("gene.tss"))
+ )
+ .withColumn(
+ "inverse_distance",
+ max_distance - f.col("distance"),
+ )
+ .transform(lambda df: normalise_column(df, "inverse_distance", "score"))
+ .select(
+ "variantId",
+ f.col("variant.chromosome").alias("chromosome"),
+ "distance",
+ "geneId",
+ "score",
+ f.lit("distance").alias("datatypeId"),
+ f.lit("canonical_tss").alias("datasourceId"),
+ )
+ ),
+ _schema=V2G.get_schema(),
+ )
+
+ def get_plof_v2g(self: VariantIndex, gene_index: GeneIndex) -> V2G:
+ """Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ gene_index (GeneIndex): A gene index to filter by.
+
+ Returns:
+ V2G: variant to gene assignments from the LOFTEE algorithm
+ """
+ return V2G(
_df=(
- va_slimmed.df.select(
- *unchanged_cols,
- f.col("vep.mostSevereConsequence").alias("mostSevereConsequence"),
- # filters/rsid are arrays that can be empty, in this case we convert them to null
- nullify_empty_array(f.col("rsIds")).alias("rsIds"),
+ self.get_transcript_consequence_df(gene_index)
+ .filter(f.col("transcriptConsequence.lofteePrediction").isNotNull())
+ .withColumn(
+ "isHighQualityPlof",
+ f.when(
+ f.col("transcriptConsequence.lofteePrediction") == "HC", True
+ ).when(
+ f.col("transcriptConsequence.lofteePrediction") == "LC", False
+ ),
+ )
+ .withColumn(
+ "score",
+ f.when(f.col("isHighQualityPlof"), 1.0).when(
+ ~f.col("isHighQualityPlof"), 0
+ ),
+ )
+ .select(
+ "variantId",
+ "chromosome",
+ "geneId",
+ "isHighQualityPlof",
+ f.col("score"),
+ f.lit("vep").alias("datatypeId"),
+ f.lit("loftee").alias("datasourceId"),
+ )
+ ),
+ _schema=V2G.get_schema(),
+ )
+
+ def get_most_severe_transcript_consequence(
+ self: VariantIndex,
+ vep_consequences: DataFrame,
+ gene_index: GeneIndex,
+ ) -> V2G:
+ """Creates a dataset with variant to gene assignments based on VEP's predicted consequence of the transcript.
+
+ Optionally the transcript consequences can be reduced to the universe of a gene index.
+
+ Args:
+ vep_consequences (DataFrame): A dataframe of VEP consequences
+ gene_index (GeneIndex): A gene index to filter by.
+
+ Returns:
+ V2G: High and medium severity variant to gene assignments
+ """
+ return V2G(
+ _df=self.get_transcript_consequence_df(gene_index)
+ .select(
+ "variantId",
+ "chromosome",
+ f.col("transcriptConsequence.targetId").alias("geneId"),
+ f.explode(
+ "transcriptConsequence.variantFunctionalConsequenceIds"
+ ).alias("variantFunctionalConsequenceId"),
+ f.lit("vep").alias("datatypeId"),
+ f.lit("variantConsequence").alias("datasourceId"),
+ )
+ .join(
+ f.broadcast(vep_consequences),
+ on="variantFunctionalConsequenceId",
+ how="inner",
+ )
+ .drop("label")
+ .filter(f.col("score") != 0)
+ # A variant can have multiple predicted consequences on a transcript, the most severe one is selected
+ .transform(
+ lambda df: get_record_with_maximum_value(
+ df, ["variantId", "geneId"], "score"
)
- .repartition(400, "chromosome")
- .sortWithinPartitions("chromosome", "position")
),
- _schema=cls.get_schema(),
+ _schema=V2G.get_schema(),
)
diff --git a/src/gentropy/datasource/ensembl/__init__.py b/src/gentropy/datasource/ensembl/__init__.py
new file mode 100644
index 000000000..e20ce9a95
--- /dev/null
+++ b/src/gentropy/datasource/ensembl/__init__.py
@@ -0,0 +1,3 @@
+"""Ensembl's Variant Effect Predictor datasource."""
+
+from __future__ import annotations
diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py
new file mode 100644
index 000000000..fcade35b1
--- /dev/null
+++ b/src/gentropy/datasource/ensembl/vep_parser.py
@@ -0,0 +1,784 @@
+"""Generating variant index based on Ensembl's Variant Effect Predictor output."""
+
+from __future__ import annotations
+
+import importlib.resources as pkg_resources
+import json
+from itertools import chain
+from typing import TYPE_CHECKING, Dict, List
+
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
+
+from gentropy.assets import data
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.common.spark_helpers import (
+ enforce_schema,
+ order_array_of_structs_by_field,
+)
+from gentropy.dataset.variant_index import VariantIndex
+
+if TYPE_CHECKING:
+ from pyspark.sql import Column, DataFrame
+
+
+class VariantEffectPredictorParser:
+ """Collection of methods to parse VEP output in json format."""
+
+ # Schema description of the dbXref object:
+ DBXREF_SCHEMA = t.ArrayType(
+ t.StructType(
+ [
+ t.StructField("id", t.StringType(), True),
+ t.StructField("source", t.StringType(), True),
+ ]
+ )
+ )
+
+ # Schema description of the in silico predictor object:
+ IN_SILICO_PREDICTOR_SCHEMA = t.StructType(
+ [
+ t.StructField("method", t.StringType(), True),
+ t.StructField("assessment", t.StringType(), True),
+ t.StructField("score", t.FloatType(), True),
+ t.StructField("assessmentFlag", t.StringType(), True),
+ t.StructField("targetId", t.StringType(), True),
+ ]
+ )
+
+ # Schema for the allele frequency column:
+ ALLELE_FREQUENCY_SCHEMA = t.ArrayType(
+ t.StructType(
+ [
+ t.StructField("populationName", t.StringType(), True),
+ t.StructField("alleleFrequency", t.DoubleType(), True),
+ ]
+ ),
+ False,
+ )
+
+ @staticmethod
+ def get_schema() -> t.StructType:
+ """Return the schema of the VEP output.
+
+ Returns:
+ t.StructType: VEP output schema.
+
+ Examples:
+ >>> type(VariantEffectPredictorParser.get_schema())
+
+ """
+ return parse_spark_schema("vep_json_output.json")
+
+ @classmethod
+ def extract_variant_index_from_vep(
+ cls: type[VariantEffectPredictorParser],
+ spark: SparkSession,
+ vep_output_path: str | list[str],
+ **kwargs: bool | float | int | str | None,
+ ) -> VariantIndex:
+ """Extract variant index from VEP output.
+
+ Args:
+ spark (SparkSession): Spark session.
+ vep_output_path (str | list[str]): Path to the VEP output.
+ **kwargs (bool | float | int | str | None): Additional arguments to pass to spark.read.json.
+
+ Returns:
+ VariantIndex: Variant index dataset.
+
+ Raises:
+ ValueError: Failed reading file.
+ ValueError: The dataset is empty.
+ """
+ # To speed things up and simplify the json structure, read data following an expected schema:
+ vep_schema = cls.get_schema()
+
+ try:
+ vep_data = spark.read.json(vep_output_path, schema=vep_schema, **kwargs)
+ except ValueError as e:
+ raise ValueError(f"Failed reading file: {vep_output_path}.") from e
+
+ if vep_data.isEmpty():
+ raise ValueError(f"The dataset is empty: {vep_output_path}")
+
+ # Convert to VariantAnnotation dataset:
+ return VariantIndex(
+ _df=VariantEffectPredictorParser.process_vep_output(vep_data),
+ _schema=VariantIndex.get_schema(),
+ )
+
+ @staticmethod
+ def _extract_ensembl_xrefs(colocated_variants: Column) -> Column:
+ """Extract rs identifiers and build cross reference to Ensembl's variant page.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for rs identifiers.
+ """
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ VariantEffectPredictorParser._colocated_variants_to_rsids(
+ colocated_variants
+ ),
+ "ensembl_variation",
+ )
+
+ @enforce_schema(DBXREF_SCHEMA)
+ @staticmethod
+ def _generate_dbxrefs(ids: Column, source: str) -> Column:
+ """Convert a list of variant identifiers to dbXrefs given the source label.
+
+ Identifiers are cast to strings, then Null values are filtered out of the id list.
+
+ Args:
+ ids (Column): List of variant identifiers.
+ source (str): Source label for the dbXrefs.
+
+ Returns:
+ Column: List of dbXrefs.
+
+ Examples:
+ >>> (
+ ... spark.createDataFrame([('rs12',),(None,)], ['test_id'])
+ ... .select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col'))
+ ... .show(truncate=False)
+ ... )
+ +--------------------------+
+ |col |
+ +--------------------------+
+ |[{rs12, ensemblVariation}]|
+ |[] |
+ +--------------------------+
+
+ >>> (
+ ... spark.createDataFrame([('rs12',),(None,)], ['test_id'])
+ ... .select(VariantEffectPredictorParser._generate_dbxrefs(f.array(f.col('test_id')), "ensemblVariation").alias('col'))
+ ... .first().col[0].asDict()
+ ... )
+ {'id': 'rs12', 'source': 'ensemblVariation'}
+ """
+ ids = f.filter(ids, lambda id: id.isNotNull())
+ xref_column = f.transform(
+ ids,
+ lambda id: f.struct(
+ id.cast(t.StringType()).alias("id"), f.lit(source).alias("source")
+ ),
+ )
+
+ return f.when(xref_column.isNull(), f.array()).otherwise(xref_column)
+
+ @staticmethod
+ def _colocated_variants_to_rsids(colocated_variants: Column) -> Column:
+ """Extract rs identifiers from the colocated variants VEP field.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of rs identifiers.
+
+ Examples:
+ >>> data = [('s1', 'rs1'),('s1', 'rs2'),('s1', 'rs3'),('s2', None),]
+ >>> (
+ ... spark.createDataFrame(data, ['s','v'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.col('v').alias('id'))).alias('cv'))
+ ... .select(VariantEffectPredictorParser._colocated_variants_to_rsids(f.col('cv')).alias('rsIds'),)
+ ... .show(truncate=False)
+ ... )
+ +---------------+
+ |rsIds |
+ +---------------+
+ |[rs1, rs2, rs3]|
+ |[null] |
+ +---------------+
+
+ """
+ return f.when(
+ colocated_variants.isNotNull(),
+ f.transform(
+ colocated_variants, lambda variant: variant.getItem("id").alias("id")
+ ),
+ ).alias("rsIds")
+
+ @staticmethod
+ def _extract_omim_xrefs(colocated_variants: Column) -> Column:
+ """Build dbXrefs for OMIM identifiers.
+
+ OMIM identifiers are extracted from the colocated variants field, cast to strings and formatted as dbXrefs.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for OMIM identifiers.
+
+ Examples:
+ >>> data = [('234234.32', 'rs1', 's1',),(None, 'rs1', 's1',),]
+ >>> (
+ ... spark.createDataFrame(data, ['id', 'rsid', 's'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.struct(f.array(f.col('id')).alias('OMIM')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_omim_xrefs(f.col('cv')).alias('dbXrefs'))
+ ... .show(truncate=False)
+ ... )
+ +-------------------+
+ |dbXrefs |
+ +-------------------+
+ |[{234234#32, omim}]|
+ +-------------------+
+
+ """
+ variants_w_omim_ref = f.filter(
+ colocated_variants,
+ lambda variant: variant.getItem("var_synonyms").getItem("OMIM").isNotNull(),
+ )
+
+ omim_var_ids = f.transform(
+ variants_w_omim_ref,
+ lambda var: f.transform(
+ var.getItem("var_synonyms").getItem("OMIM"),
+ lambda var: f.regexp_replace(var.cast(t.StringType()), r"\.", r"#"),
+ ),
+ )
+
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ f.flatten(omim_var_ids), "omim"
+ )
+
+ @staticmethod
+ def _extract_clinvar_xrefs(colocated_variants: Column) -> Column:
+ """Build dbXrefs for VCV ClinVar identifiers.
+
+ ClinVar identifiers are extracted from the colocated variants field.
+ VCV-identifiers are filtered out to generate cross-references.
+
+ Args:
+ colocated_variants (Column): Colocated variants field from VEP output.
+
+ Returns:
+ Column: List of dbXrefs for VCV ClinVar identifiers.
+
+ Examples:
+ >>> data = [('VCV2323,RCV3423', 'rs1', 's1',),(None, 'rs1', 's1',),]
+ >>> (
+ ... spark.createDataFrame(data, ['id', 'rsid', 's'])
+ ... .groupBy('s')
+ ... .agg(f.collect_list(f.struct(f.struct(f.split(f.col('id'),',').alias('ClinVar')).alias('var_synonyms'),f.col('rsid').alias('id'),),).alias('cv'),).select(VariantEffectPredictorParser._extract_clinvar_xrefs(f.col('cv')).alias('dbXrefs'))
+ ... .show(truncate=False)
+ ... )
+ +--------------------+
+ |dbXrefs |
+ +--------------------+
+ |[{VCV2323, clinvar}]|
+ +--------------------+
+
+ """
+ variants_w_clinvar_ref = f.filter(
+ colocated_variants,
+ lambda variant: variant.getItem("var_synonyms")
+ .getItem("ClinVar")
+ .isNotNull(),
+ )
+
+ clin_var_ids = f.transform(
+ variants_w_clinvar_ref,
+ lambda var: f.filter(
+ var.getItem("var_synonyms").getItem("ClinVar"),
+ lambda x: x.startswith("VCV"),
+ ),
+ )
+
+ return VariantEffectPredictorParser._generate_dbxrefs(
+ f.flatten(clin_var_ids), "clinvar"
+ )
+
+ @staticmethod
+ def _get_most_severe_transcript(
+ transcript_column_name: str, score_field_name: str
+ ) -> Column:
+ """Return transcript with the highest in silico predicted score.
+
+ This method assumes the higher the score, the more severe the consequence of the variant is.
+ Hence, by selecting the transcript with the highest score, we are selecting the most severe consequence.
+
+ Args:
+ transcript_column_name (str): Name of the column containing the list of transcripts.
+ score_field_name (str): Name of the field containing the severity score.
+
+ Returns:
+ Column: Most severe transcript.
+
+ Examples:
+ >>> data = [("v1", 0.2, 'transcript1'),("v1", None, 'transcript2'),("v1", 0.6, 'transcript3'),("v1", 0.4, 'transcript4'),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'score', 'transcriptId'])
+ ... .groupBy('v')
+ ... .agg(f.collect_list(f.struct(f.col('score'),f.col('transcriptId'))).alias('transcripts'))
+ ... .select(VariantEffectPredictorParser._get_most_severe_transcript('transcripts', 'score').alias('most_severe_transcript'))
+ ... .show(truncate=False)
+ ... )
+ +----------------------+
+ |most_severe_transcript|
+ +----------------------+
+ |{0.6, transcript3} |
+ +----------------------+
+
+ """
+ assert isinstance(
+ transcript_column_name, str
+ ), "transcript_column_name must be a string and not a column."
+ # Order transcripts by severity score:
+ ordered_transcripts = order_array_of_structs_by_field(
+ transcript_column_name, score_field_name
+ )
+
+ # Drop transcripts with no severity score and return the most severe one:
+ return f.filter(
+ ordered_transcripts,
+ lambda transcript: transcript.getItem(score_field_name).isNotNull(),
+ )[0]
+
+ @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+ @staticmethod
+ def _get_max_alpha_missense(transcripts: Column) -> Column:
+ """Return the most severe alpha missense prediction from all transcripts.
+
+ This function assumes one variant can fall onto only one gene with AlphaMissense prediction available on the canonical transcript.
+
+ Args:
+ transcripts (Column): List of transcripts.
+
+ Returns:
+ Column: Most severe alpha missense prediction.
+
+ Examples:
+ >>> data = [('v1', 0.4, 'assessment 1'), ('v1', None, None), ('v1', None, None),('v2', None, None),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'a', 'b'])
+ ... .groupBy('v')
+ ... .agg(f.collect_list(f.struct(f.struct(
+ ... f.col('a').alias('am_pathogenicity'),
+ ... f.col('b').alias('am_class')).alias('alphamissense'),
+ ... f.lit('gene1').alias('gene_id'))).alias('transcripts')
+ ... )
+ ... .select(VariantEffectPredictorParser._get_max_alpha_missense(f.col('transcripts')).alias('am'))
+ ... .show(truncate=False)
+ ... )
+ +----------------------------------------------------+
+ |am |
+ +----------------------------------------------------+
+ |{max alpha missense, assessment 1, 0.4, null, gene1}|
+ |{max alpha missense, null, null, null, gene1} |
+ +----------------------------------------------------+
+
+ """
+ return f.transform(
+ # Extract transcripts with alpha missense values:
+ f.filter(
+ transcripts,
+ lambda transcript: transcript.getItem("alphamissense").isNotNull(),
+ ),
+ # Extract alpha missense values:
+ lambda transcript: f.struct(
+ transcript.getItem("alphamissense")
+ .getItem("am_pathogenicity")
+ .cast(t.FloatType())
+ .alias("score"),
+ transcript.getItem("alphamissense")
+ .getItem("am_class")
+ .alias("assessment"),
+ f.lit("max alpha missense").alias("method"),
+ transcript.getItem("gene_id").alias("targetId"),
+ ),
+ )[0]
+
+ @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+ @staticmethod
+ def _vep_in_silico_prediction_extractor(
+ transcript_column_name: str,
+ method_name: str,
+ score_column_name: str | None = None,
+ assessment_column_name: str | None = None,
+ assessment_flag_column_name: str | None = None,
+ ) -> Column:
+ """Extract in silico prediction from VEP output.
+
+ Args:
+ transcript_column_name (str): Name of the column containing the list of transcripts.
+ method_name (str): Name of the in silico predictor.
+ score_column_name (str | None): Name of the column containing the score.
+ assessment_column_name (str | None): Name of the column containing the assessment.
+ assessment_flag_column_name (str | None): Name of the column containing the assessment flag.
+
+ Returns:
+ Column: In silico predictor.
+ """
+ # Get highest score:
+ most_severe_transcript: Column = (
+ # Getting the most severe transcript:
+ VariantEffectPredictorParser._get_most_severe_transcript(
+ transcript_column_name, score_column_name
+ )
+ if score_column_name is not None
+ # If we don't have score, just pick one of the transcript where assessment is available:
+ else f.filter(
+ f.col(transcript_column_name),
+ lambda transcript: transcript.getItem(
+ assessment_column_name
+ ).isNotNull(),
+ )
+ )
+
+ return f.when(
+ most_severe_transcript.isNotNull(),
+ f.struct(
+ # Adding method name:
+ f.lit(method_name).cast(t.StringType()).alias("method"),
+ # Adding assessment:
+ f.lit(None).cast(t.StringType()).alias("assessment")
+ if assessment_column_name is None
+ else most_severe_transcript.getField(assessment_column_name).alias(
+ "assessment"
+ ),
+ # Adding score:
+ f.lit(None).cast(t.FloatType()).alias("score")
+ if score_column_name is None
+ else most_severe_transcript.getField(score_column_name)
+ .cast(t.FloatType())
+ .alias("score"),
+ # Adding assessment flag:
+ f.lit(None).cast(t.StringType()).alias("assessmentFlag")
+ if assessment_flag_column_name is None
+ else most_severe_transcript.getField(assessment_flag_column_name)
+ .cast(t.FloatType())
+ .alias("assessmentFlag"),
+ # Adding target id if present:
+ most_severe_transcript.getItem("gene_id").alias("targetId"),
+ ),
+ )
+
+ @staticmethod
+ def _parser_amino_acid_change(amino_acids: Column, protein_end: Column) -> Column:
+ """Convert VEP amino acid change information to one letter code aa substitution code.
+
+ The logic assumes that the amino acid change is given in the format "from/to" and the protein end is given also.
+ If any of the information is missing, the amino acid change will be set to None.
+
+ Args:
+ amino_acids (Column): Amino acid change information.
+ protein_end (Column): Protein end information.
+
+ Returns:
+ Column: Amino acid change in one letter code.
+
+ Examples:
+ >>> data = [('A/B', 1),('A/B', None),(None, 1),]
+ >>> (
+ ... spark.createDataFrame(data, ['amino_acids', 'protein_end'])
+ ... .select(VariantEffectPredictorParser._parser_amino_acid_change(f.col('amino_acids'), f.col('protein_end')).alias('amino_acid_change'))
+ ... .show()
+ ... )
+ +-----------------+
+ |amino_acid_change|
+ +-----------------+
+ | A1B|
+ | null|
+ | null|
+ +-----------------+
+
+ """
+ return f.when(
+ amino_acids.isNotNull() & protein_end.isNotNull(),
+ f.concat(
+ f.split(amino_acids, r"\/")[0],
+ protein_end,
+ f.split(amino_acids, r"\/")[1],
+ ),
+ ).otherwise(f.lit(None))
+
+ @staticmethod
+ def _collect_uniprot_accessions(trembl: Column, swissprot: Column) -> Column:
+ """Flatten arrays containing Uniprot accessions.
+
+ Args:
+ trembl (Column): List of TrEMBL protein accessions.
+ swissprot (Column): List of SwissProt protein accessions.
+
+ Returns:
+ Column: List of unique Uniprot accessions extracted from swissprot and trembl arrays, split from version numbers.
+
+ Examples:
+ >>> data = [('v1', ["sp_1"], ["tr_1"]), ('v1', None, None), ('v1', None, ["tr_2"]),]
+ >>> (
+ ... spark.createDataFrame(data, ['v', 'sp', 'tr'])
+ ... .select(VariantEffectPredictorParser._collect_uniprot_accessions(f.col('sp'), f.col('tr')).alias('proteinIds'))
+ ... .show()
+ ... )
+ +------------+
+ | proteinIds|
+ +------------+
+ |[sp_1, tr_1]|
+ | []|
+ | [tr_2]|
+ +------------+
+
+ """
+ # Dropping Null values and flattening the arrays:
+ return f.filter(
+ f.array_distinct(
+ f.transform(
+ f.flatten(
+ f.filter(
+ f.array(trembl, swissprot),
+ lambda x: x.isNotNull(),
+ )
+ ),
+ lambda x: f.split(x, r"\.")[0],
+ )
+ ),
+ lambda x: x.isNotNull(),
+ )
+
+ @staticmethod
+ def _consequence_to_sequence_ontology(
+ col: Column, so_dict: Dict[str, str]
+ ) -> Column:
+ """Convert VEP consequence terms to sequence ontology identifiers.
+
+ Missing consequence label will be converted to None, unmapped consequences will be mapped as None.
+
+ Args:
+ col (Column): Column containing VEP consequence terms.
+ so_dict (Dict[str, str]): Dictionary mapping VEP consequence terms to sequence ontology identifiers.
+
+ Returns:
+ Column: Column containing sequence ontology identifiers.
+
+ Examples:
+ >>> data = [('consequence_1',),('unmapped_consequence',),(None,)]
+ >>> m = {'consequence_1': 'SO:000000'}
+ >>> (
+ ... spark.createDataFrame(data, ['label'])
+ ... .select('label',VariantEffectPredictorParser._consequence_to_sequence_ontology(f.col('label'),m).alias('id'))
+ ... .show()
+ ... )
+ +--------------------+---------+
+ | label| id|
+ +--------------------+---------+
+ | consequence_1|SO:000000|
+ |unmapped_consequence| null|
+ | null| null|
+ +--------------------+---------+
+
+ """
+ map_expr = f.create_map(*[f.lit(x) for x in chain(*so_dict.items())])
+
+ return map_expr[col].alias("ancestry")
+
+ @staticmethod
+ def _parse_variant_location_id(vep_input_field: Column) -> List[Column]:
+ r"""Parse variant identifier, chromosome, position, reference allele and alternate allele from VEP input field.
+
+ Args:
+ vep_input_field (Column): Column containing variant vcf string used as VEP input.
+
+ Returns:
+ List[Column]: List of columns containing chromosome, position, reference allele and alternate allele.
+ """
+ variant_fields = f.split(vep_input_field, r"\t")
+ return [
+ f.concat_ws(
+ "_",
+ f.array(
+ variant_fields[0],
+ variant_fields[1],
+ variant_fields[3],
+ variant_fields[4],
+ ),
+ ).alias("variantId"),
+ variant_fields[0].cast(t.StringType()).alias("chromosome"),
+ variant_fields[1].cast(t.IntegerType()).alias("position"),
+ variant_fields[3].cast(t.StringType()).alias("referenceAllele"),
+ variant_fields[4].cast(t.StringType()).alias("alternateAllele"),
+ ]
+
+ @classmethod
+ def process_vep_output(cls, vep_output: DataFrame) -> DataFrame:
+ """Process and format a VEP output in JSON format.
+
+ Args:
+ vep_output (DataFrame): raw VEP output, read as spark DataFrame.
+
+ Returns:
+ DataFrame: processed data in the right shape.
+ """
+ # Reading consequence to sequence ontology map:
+ sequence_ontology_map = json.loads(
+ pkg_resources.read_text(data, "so_mappings.json", encoding="utf-8")
+ )
+ # Processing VEP output:
+ return (
+ vep_output
+ # Dropping non-canonical transcript consequences: # TODO: parametrize this.
+ .withColumn(
+ "transcript_consequences",
+ f.filter(
+ f.col("transcript_consequences"),
+ lambda consequence: consequence.getItem("canonical") == 1,
+ ),
+ )
+ .select(
+ # Parse id and chr:pos:alt:ref:
+ *cls._parse_variant_location_id(f.col("input")),
+ # Extracting cross-references from colocated variants:
+ cls._extract_ensembl_xrefs(f.col("colocated_variants")).alias(
+ "ensembl_xrefs"
+ ),
+ cls._extract_omim_xrefs(f.col("colocated_variants")).alias(
+ "omim_xrefs"
+ ),
+ cls._extract_clinvar_xrefs(f.col("colocated_variants")).alias(
+ "clinvar_xrefs"
+ ),
+ # Extracting in silico predictors
+ f.when(
+ # The following in-silico predictors are only available for variants with transcript consequences:
+ f.col("transcript_consequences").isNotNull(),
+ f.filter(
+ f.array(
+ # Extract CADD scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="phred scaled CADD",
+ score_column_name="cadd_phred",
+ ),
+ # Extract polyphen scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="polyphen",
+ score_column_name="polyphen_score",
+ assessment_column_name="polyphen_prediction",
+ ),
+ # Extract sift scores:
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="transcript_consequences",
+ method_name="sift",
+ score_column_name="sift_score",
+ assessment_column_name="sift_prediction",
+ ),
+ # Extract loftee scores:
+ cls._vep_in_silico_prediction_extractor(
+ method_name="loftee",
+ transcript_column_name="transcript_consequences",
+ score_column_name="lof",
+ assessment_column_name="lof",
+ assessment_flag_column_name="lof_filter",
+ ),
+ # Extract max alpha missense:
+ cls._get_max_alpha_missense(
+ f.col("transcript_consequences")
+ ),
+ ),
+ lambda predictor: predictor.isNotNull(),
+ ),
+ )
+ .otherwise(
+ # Extract CADD scores from intergenic object:
+ f.array(
+ cls._vep_in_silico_prediction_extractor(
+ transcript_column_name="intergenic_consequences",
+ method_name="phred scaled CADD",
+ score_column_name="cadd_phred",
+ ),
+ )
+ )
+ .alias("inSilicoPredictors"),
+ # Convert consequence to SO:
+ cls._consequence_to_sequence_ontology(
+ f.col("most_severe_consequence"), sequence_ontology_map
+ ).alias("mostSevereConsequenceId"),
+ # Collect transcript consequence:
+ f.when(
+ f.col("transcript_consequences").isNotNull(),
+ f.transform(
+ f.col("transcript_consequences"),
+ lambda transcript: f.struct(
+ # Convert consequence terms to SO identifier:
+ f.transform(
+ transcript.consequence_terms,
+ lambda y: cls._consequence_to_sequence_ontology(
+ y, sequence_ontology_map
+ ),
+ ).alias("variantFunctionalConsequenceIds"),
+ # Format amino acid change:
+ cls._parser_amino_acid_change(
+ transcript.amino_acids, transcript.protein_end
+ ).alias("aminoAcidChange"),
+ # Extract and clean uniprot identifiers:
+ cls._collect_uniprot_accessions(
+ transcript.swissprot,
+ transcript.trembl,
+ ).alias("uniprotAccessions"),
+ # Add canonical flag:
+ f.when(transcript.canonical == 1, f.lit(True))
+ .otherwise(f.lit(False))
+ .alias("isEnsemblCanonical"),
+ # Extract other fields as is:
+ transcript.codons.alias("codons"),
+ transcript.distance.alias("distance"),
+ transcript.gene_id.alias("targetId"),
+ transcript.impact.alias("impact"),
+ transcript.transcript_id.alias("transcriptId"),
+ transcript.lof.cast(t.StringType()).alias(
+ "lofteePrediction"
+ ),
+ transcript.lof.cast(t.FloatType()).alias("siftPrediction"),
+ transcript.lof.cast(t.FloatType()).alias(
+ "polyphenPrediction"
+ ),
+ ),
+ ),
+ ).alias("transcriptConsequences"),
+ # Extracting rsids:
+ cls._colocated_variants_to_rsids(f.col("colocated_variants")).alias(
+ "rsIds"
+ ),
+ # Adding empty array for allele frequencies - now this piece of data is not coming from the VEP data:
+ f.array().cast(cls.ALLELE_FREQUENCY_SCHEMA).alias("alleleFrequencies"),
+ )
+ # Adding protvar xref for missense variants: # TODO: making and extendable list of consequences
+ .withColumn(
+ "protvar_xrefs",
+ f.when(
+ f.size(
+ f.filter(
+ f.col("transcriptConsequences"),
+ lambda x: f.array_contains(
+ x.variantFunctionalConsequenceIds, "SO_0001583"
+ ),
+ )
+ )
+ > 0,
+ cls._generate_dbxrefs(f.array(f.col("variantId")), "protvar"),
+ ),
+ )
+ .withColumn(
+ "dbXrefs",
+ f.flatten(
+ f.filter(
+ f.array(
+ f.col("ensembl_xrefs"),
+ f.col("omim_xrefs"),
+ f.col("clinvar_xrefs"),
+ f.col("protvar_xrefs"),
+ ),
+ lambda x: x.isNotNull(),
+ )
+ ),
+ )
+ # Dropping intermediate xref columns:
+ .drop(*["ensembl_xrefs", "omim_xrefs", "clinvar_xrefs", "protvar_xrefs"])
+ )
diff --git a/src/gentropy/datasource/gnomad/variants.py b/src/gentropy/datasource/gnomad/variants.py
index fdc67a7cb..98e7013f5 100644
--- a/src/gentropy/datasource/gnomad/variants.py
+++ b/src/gentropy/datasource/gnomad/variants.py
@@ -7,8 +7,8 @@
import hail as hl
from gentropy.common.types import VariantPopulation
-from gentropy.config import VariantAnnotationConfig
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.config import GnomadVariantConfig
+from gentropy.dataset.variant_index import VariantIndex
if TYPE_CHECKING:
pass
@@ -19,26 +19,23 @@ class GnomADVariants:
def __init__(
self,
- gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path,
- chain_38_37: str = VariantAnnotationConfig().chain_38_37,
+ gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
gnomad_variant_populations: list[
VariantPopulation | str
- ] = VariantAnnotationConfig().gnomad_variant_populations,
+ ] = GnomadVariantConfig().gnomad_variant_populations,
):
"""Initialize.
Args:
gnomad_genomes_path (str): Path to gnomAD genomes hail table.
- chain_38_37 (str): Path to GRCh38 to GRCh37 chain file.
gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
- All defaults are stored in VariantAnnotationConfig.
+ All defaults are stored in GnomadVariantConfig.
"""
self.gnomad_genomes_path = gnomad_genomes_path
- self.chain_38_37 = chain_38_37
self.gnomad_variant_populations = gnomad_variant_populations
- def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
+ def as_variant_index(self: GnomADVariants) -> VariantIndex:
"""Generate variant annotation dataset from gnomAD.
Some relevant modifications to the original dataset are:
@@ -48,7 +45,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
3. Field names are converted to camel case to follow the convention.
Returns:
- VariantAnnotation: Variant annotation dataset
+            VariantIndex: GnomAD variants dataset.
"""
# Load variants dataset
ht = hl.read_table(
@@ -56,19 +53,14 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
_load_refs=False,
)
- # Liftover
- grch37 = hl.get_reference("GRCh37")
- grch38 = hl.get_reference("GRCh38")
- grch38.add_liftover(self.chain_38_37, grch37)
-
# Drop non biallelic variants
ht = ht.filter(ht.alleles.length() == 2)
- # Liftover
- ht = ht.annotate(locus_GRCh37=hl.liftover(ht.locus, "GRCh37"))
+
# Select relevant fields and nested records to create class
- return VariantAnnotation(
+ return VariantIndex(
_df=(
ht.select(
+ # Extract mandatory fields:
variantId=hl.str("_").join(
[
ht.locus.contig.replace("chr", ""),
@@ -79,12 +71,9 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
),
chromosome=ht.locus.contig.replace("chr", ""),
position=ht.locus.position,
- chromosomeB37=ht.locus_GRCh37.contig.replace("chr", ""),
- positionB37=ht.locus_GRCh37.position,
referenceAllele=ht.alleles[0],
alternateAllele=ht.alleles[1],
- rsIds=ht.rsid,
- alleleType=ht.allele_info.allele_type,
+ # Extract allele frequencies from populations of interest:
alleleFrequencies=hl.set(
[f"{pop}_adj" for pop in self.gnomad_variant_populations]
).map(
@@ -93,33 +82,46 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF,
)
),
- vep=hl.struct(
- mostSevereConsequence=ht.vep.most_severe_consequence,
- transcriptConsequences=hl.map(
- lambda x: hl.struct(
- aminoAcids=x.amino_acids,
- consequenceTerms=x.consequence_terms,
- geneId=x.gene_id,
- lof=x.lof,
+ # Extract most severe consequence:
+ mostSevereConsequence=ht.vep.most_severe_consequence,
+ # Extract in silico predictors:
+ inSilicoPredictors=hl.array(
+ [
+ hl.struct(
+ method=hl.str("spliceai"),
+ assessment=hl.missing(hl.tstr),
+ score=hl.expr.functions.float32(
+ ht.in_silico_predictors.spliceai_ds_max
+ ),
+ assessmentFlag=hl.missing(hl.tstr),
+ targetId=hl.missing(hl.tstr),
),
- # Only keeping canonical transcripts
- ht.vep.transcript_consequences.filter(
- lambda x: (x.canonical == 1)
- & (x.gene_symbol_source == "HGNC")
+ hl.struct(
+ method=hl.str("pangolin"),
+ assessment=hl.missing(hl.tstr),
+ score=hl.expr.functions.float32(
+ ht.in_silico_predictors.pangolin_largest_ds
+ ),
+ assessmentFlag=hl.missing(hl.tstr),
+ targetId=hl.missing(hl.tstr),
),
- ),
+ ]
),
- inSilicoPredictors=hl.struct(
- cadd=hl.struct(
- phred=ht.in_silico_predictors.cadd.phred,
- raw=ht.in_silico_predictors.cadd.raw_score,
- ),
- revelMax=ht.in_silico_predictors.revel_max,
- spliceaiDsMax=ht.in_silico_predictors.spliceai_ds_max,
- pangolinLargestDs=ht.in_silico_predictors.pangolin_largest_ds,
- phylop=ht.in_silico_predictors.phylop,
- siftMax=ht.in_silico_predictors.sift_max,
- polyphenMax=ht.in_silico_predictors.polyphen_max,
+ # Extract cross references to GnomAD:
+ dbXrefs=hl.array(
+ [
+ hl.struct(
+ id=hl.str("-").join(
+ [
+ ht.locus.contig.replace("chr", ""),
+ hl.str(ht.locus.position),
+ ht.alleles[0],
+ ht.alleles[1],
+ ]
+ ),
+ source=hl.str("gnomad"),
+ )
+ ]
),
)
.key_by("chromosome", "position")
@@ -127,5 +129,5 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation:
.select_globals()
.to_spark(flatten=False)
),
- _schema=VariantAnnotation.get_schema(),
+ _schema=VariantIndex.get_schema(),
)
diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py
index d6763cb35..b4e9a2d45 100644
--- a/src/gentropy/datasource/gwas_catalog/associations.py
+++ b/src/gentropy/datasource/gwas_catalog/associations.py
@@ -26,7 +26,7 @@
if TYPE_CHECKING:
from pyspark.sql import Column, DataFrame
- from gentropy.dataset.variant_annotation import VariantAnnotation
+ from gentropy.dataset.variant_index import VariantIndex
@dataclass
@@ -197,14 +197,14 @@ def _collect_rsids(
return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele))
@staticmethod
- def _map_to_variant_annotation_variants(
- gwas_associations: DataFrame, variant_annotation: VariantAnnotation
+ def _map_variants_to_variant_index(
+ gwas_associations: DataFrame, variant_index: VariantIndex
) -> DataFrame:
"""Add variant metadata in associations.
Args:
- gwas_associations (DataFrame): raw GWAS Catalog associations
- variant_annotation (VariantAnnotation): variant annotation dataset
+ gwas_associations (DataFrame): raw GWAS Catalog associations.
+            variant_index (VariantIndex): GnomAD variants dataset with allele frequencies.
Returns:
DataFrame: GWAS Catalog associations data including `variantId`, `referenceAllele`,
@@ -228,7 +228,7 @@ def _map_to_variant_annotation_variants(
)
# Subset of variant annotation required for GWAS Catalog annotations:
- va_subset = variant_annotation.df.select(
+ va_subset = variant_index.df.select(
"variantId",
"chromosome",
# Calculate the position in Ensembl coordinates for indels:
@@ -241,7 +241,7 @@ def _map_to_variant_annotation_variants(
"referenceAllele",
"alternateAllele",
"alleleFrequencies",
- variant_annotation.max_maf().alias("maxMaf"),
+ variant_index.max_maf().alias("maxMaf"),
).join(
f.broadcast(
gwas_associations_subset.select(
@@ -1035,7 +1035,7 @@ def _qc_palindromic_alleles(
def from_source(
cls: type[GWASCatalogCuratedAssociationsParser],
gwas_associations: DataFrame,
- variant_annotation: VariantAnnotation,
+ variant_index: VariantIndex,
pvalue_threshold: float = WindowBasedClumpingStepConfig.gwas_significance,
) -> StudyLocusGWASCatalog:
"""Read GWASCatalog associations.
@@ -1044,9 +1044,9 @@ def from_source(
applies some pre-defined filters on the data:
Args:
- gwas_associations (DataFrame): GWAS Catalog raw associations dataset
- variant_annotation (VariantAnnotation): Variant annotation dataset
- pvalue_threshold (float): P-value threshold for flagging associations
+ gwas_associations (DataFrame): GWAS Catalog raw associations dataset.
+ variant_index (VariantIndex): Variant index dataset with available allele frequencies.
+ pvalue_threshold (float): P-value threshold for flagging associations.
Returns:
StudyLocusGWASCatalog: GWASCatalogAssociations dataset
@@ -1060,8 +1060,8 @@ def from_source(
.transform(
# Map/harmonise variants to variant annotation dataset:
# This function adds columns: variantId, referenceAllele, alternateAllele, chromosome, position
- lambda df: GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants(
- df, variant_annotation
+ lambda df: GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index(
+ df, variant_index
)
)
.withColumn(
diff --git a/src/gentropy/datasource/ukbiobank/__init__.py b/src/gentropy/datasource/ukbiobank/__init__.py
index 544779b18..910603703 100644
--- a/src/gentropy/datasource/ukbiobank/__init__.py
+++ b/src/gentropy/datasource/ukbiobank/__init__.py
@@ -1,3 +1,3 @@
-"""GWAS Catalog Data Source."""
+"""UK biobank."""
from __future__ import annotations
diff --git a/src/gentropy/ld_index.py b/src/gentropy/gnomad_ingestion.py
similarity index 54%
rename from src/gentropy/ld_index.py
rename to src/gentropy/gnomad_ingestion.py
index 0cc00cf14..07b7fd58b 100644
--- a/src/gentropy/ld_index.py
+++ b/src/gentropy/gnomad_ingestion.py
@@ -1,14 +1,15 @@
-"""Step to dump a filtered version of a LD matrix (block matrix) as Parquet files."""
+"""Step to dump a filtered version of a LD matrix (block matrix) and GnomAD variants."""
from __future__ import annotations
import hail as hl
from gentropy.common.session import Session
-from gentropy.common.types import LD_Population
+from gentropy.common.types import LD_Population, VariantPopulation
from gentropy.common.version_engine import VersionEngine
-from gentropy.config import LDIndexConfig
+from gentropy.config import GnomadVariantConfig, LDIndexConfig
from gentropy.datasource.gnomad.ld import GnomADLDMatrix
+from gentropy.datasource.gnomad.variants import GnomADVariants
class LDIndexStep:
@@ -69,3 +70,56 @@ def __init__(
.parquet(ld_index_out)
)
session.logger.info(ld_index_out)
+
+
+class GnomadVariantIndexStep:
+ """A step to generate variant index dataset from gnomad data.
+
+    Variant annotation step produces a dataset of the type `VariantIndex` derived from gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table.
+ This dataset is used to validate variants and as a source of annotation.
+ """
+
+ def __init__(
+ self,
+ session: Session,
+ gnomad_variant_output: str,
+ gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
+ gnomad_variant_populations: list[
+ VariantPopulation | str
+ ] = GnomadVariantConfig().gnomad_variant_populations,
+ use_version_from_input: bool = GnomadVariantConfig().use_version_from_input,
+ ) -> None:
+ """Run Variant Annotation step.
+
+ Args:
+ session (Session): Session object.
+ gnomad_variant_output (str): Path to resulting dataset.
+ gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
+ gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
+ use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output gnomad_variant_output. Defaults to False.
+
+ In case use_version_from_input is set to True,
+ data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path.
+ All defaults are stored in the GnomadVariantConfig.
+ """
+ # amend data source version to output path
+ if use_version_from_input:
+ gnomad_variant_output = VersionEngine("gnomad").amend_version(
+ gnomad_genomes_path, gnomad_variant_output
+ )
+
+ # Initialise hail session.
+ hl.init(sc=session.spark.sparkContext, log="/dev/null")
+
+ # Parse variant info from source.
+ gnomad_variants = GnomADVariants(
+ gnomad_genomes_path=gnomad_genomes_path,
+ gnomad_variant_populations=gnomad_variant_populations,
+ ).as_variant_index()
+
+ # Write data partitioned by chromosome and position.
+ (
+ gnomad_variants.df.write.mode(session.write_mode).parquet(
+ gnomad_variant_output
+ )
+ )
diff --git a/src/gentropy/gwas_catalog_ingestion.py b/src/gentropy/gwas_catalog_ingestion.py
index 6930a2df9..725f1ca4d 100644
--- a/src/gentropy/gwas_catalog_ingestion.py
+++ b/src/gentropy/gwas_catalog_ingestion.py
@@ -1,8 +1,9 @@
"""Step to process GWAS Catalog associations and study table."""
+
from __future__ import annotations
from gentropy.common.session import Session
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
)
@@ -26,7 +27,7 @@ def __init__(
catalog_ancestry_files: list[str],
catalog_sumstats_lut: str,
catalog_associations_file: str,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
catalog_studies_out: str,
catalog_associations_out: str,
gwas_catalog_study_curation_file: str | None = None,
@@ -40,14 +41,14 @@ def __init__(
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
catalog_sumstats_lut (str): GWAS Catalog summary statistics lookup table.
catalog_associations_file (str): Raw GWAS catalog associations file.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variants.
catalog_studies_out (str): Output GWAS catalog studies path.
catalog_associations_out (str): Output GWAS catalog associations path.
gwas_catalog_study_curation_file (str | None): file of the curation table. Optional.
inclusion_list_path (str | None): optional inclusion list (parquet)
"""
# Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path)
catalog_studies = session.spark.read.csv(
list(catalog_study_files), sep="\t", header=True
)
@@ -69,7 +70,9 @@ def __init__(
StudyIndexGWASCatalogParser.from_source(
catalog_studies, ancestry_lut, sumstats_lut
).annotate_from_study_curation(gwas_catalog_study_curation),
- GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va),
+ GWASCatalogCuratedAssociationsParser.from_source(
+ catalog_associations, gnomad_variants
+ ),
)
# if inclusion list is provided apply filter:
diff --git a/src/gentropy/gwas_catalog_study_inclusion.py b/src/gentropy/gwas_catalog_study_inclusion.py
index 872177601..f07f851a7 100644
--- a/src/gentropy/gwas_catalog_study_inclusion.py
+++ b/src/gentropy/gwas_catalog_study_inclusion.py
@@ -1,4 +1,5 @@
"""Step to generate an GWAS Catalog study identifier inclusion and exclusion list."""
+
from __future__ import annotations
from typing import TYPE_CHECKING
@@ -6,7 +7,7 @@
from pyspark.sql import functions as f
from gentropy.common.session import Session
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
)
@@ -84,7 +85,7 @@ def process_harmonised_list(studies: list[str], session: Session) -> DataFrame:
@staticmethod
def get_gwas_catalog_study_index(
session: Session,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
catalog_study_files: list[str],
catalog_ancestry_files: list[str],
harmonised_study_file: str,
@@ -95,7 +96,7 @@ def get_gwas_catalog_study_index(
Args:
session (Session): Session object.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variant list.
catalog_study_files (list[str]): List of raw GWAS catalog studies file.
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
harmonised_study_file (str): GWAS Catalog summary statistics lookup table.
@@ -106,7 +107,7 @@ def get_gwas_catalog_study_index(
StudyIndexGWASCatalog: Completely processed and fully annotated study index.
"""
# Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ gnomad_variants = VariantIndex.from_parquet(session, gnomad_variant_path)
catalog_studies = session.spark.read.csv(
list(catalog_study_files), sep="\t", header=True
)
@@ -130,7 +131,9 @@ def get_gwas_catalog_study_index(
ancestry_lut,
sumstats_lut,
).annotate_from_study_curation(gwas_catalog_study_curation),
- GWASCatalogCuratedAssociationsParser.from_source(catalog_associations, va),
+ GWASCatalogCuratedAssociationsParser.from_source(
+ catalog_associations, gnomad_variants
+ ),
)
return study_index
@@ -142,7 +145,7 @@ def __init__(
catalog_ancestry_files: list[str],
catalog_associations_file: str,
gwas_catalog_study_curation_file: str,
- variant_annotation_path: str,
+ gnomad_variant_path: str,
harmonised_study_file: str,
criteria: str,
inclusion_list_path: str,
@@ -151,12 +154,12 @@ def __init__(
"""Run step.
Args:
- session (Session): Session object.
+            session (Session): Session object.
catalog_study_files (list[str]): List of raw GWAS catalog studies file.
catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
catalog_associations_file (str): Raw GWAS catalog associations file.
gwas_catalog_study_curation_file (str): file of the curation table. Optional.
- variant_annotation_path (str): Input variant annotation path.
+ gnomad_variant_path (str): Path to GnomAD variant list.
harmonised_study_file (str): GWAS Catalog summary statistics lookup table.
criteria (str): name of the filter set to be applied.
inclusion_list_path (str): Output path for the inclusion list.
@@ -165,7 +168,7 @@ def __init__(
# Create study index:
study_index = self.get_gwas_catalog_study_index(
session,
- variant_annotation_path,
+ gnomad_variant_path,
catalog_study_files,
catalog_ancestry_files,
harmonised_study_file,
diff --git a/src/gentropy/variant_annotation.py b/src/gentropy/variant_annotation.py
deleted file mode 100644
index 355a4dfea..000000000
--- a/src/gentropy/variant_annotation.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""Step to generate variant annotation dataset."""
-
-from __future__ import annotations
-
-import hail as hl
-
-from gentropy.common.session import Session
-from gentropy.common.types import VariantPopulation
-from gentropy.common.version_engine import VersionEngine
-from gentropy.config import VariantAnnotationConfig
-from gentropy.datasource.gnomad.variants import GnomADVariants
-
-
-class VariantAnnotationStep:
- """Variant annotation step.
-
- Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table.
- This dataset is used to validate variants and as a source of annotation.
- """
-
- def __init__(
- self,
- session: Session,
- variant_annotation_path: str,
- gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path,
- gnomad_variant_populations: list[
- VariantPopulation | str
- ] = VariantAnnotationConfig().gnomad_variant_populations,
- chain_38_37: str = VariantAnnotationConfig().chain_38_37,
- use_version_from_input: bool = VariantAnnotationConfig().use_version_from_input,
- ) -> None:
- """Run Variant Annotation step.
-
- Args:
- session (Session): Session object.
- variant_annotation_path (str): Variant annotation dataset path.
- gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
- gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
- chain_38_37 (str): Path to GRCh38 to GRCh37 chain file for lifover.
- use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False.
-
- In case use_version_from_input is set to True,
- data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path.
- All defaults are stored in the VariantAnnotationConfig.
- """
- # amend data source version to output path
- if use_version_from_input:
- variant_annotation_path = VersionEngine("gnomad").amend_version(
- gnomad_genomes_path, variant_annotation_path
- )
-
- # Initialise hail session.
- hl.init(sc=session.spark.sparkContext, log="/dev/null")
- # Run variant annotation.
- variant_annotation = GnomADVariants(
- gnomad_genomes_path=gnomad_genomes_path,
- gnomad_variant_populations=gnomad_variant_populations,
- chain_38_37=chain_38_37,
- ).as_variant_annotation()
-
- # Write data partitioned by chromosome and position.
- (
- variant_annotation.df.write.mode(session.write_mode).parquet(
- variant_annotation_path
- )
- )
diff --git a/src/gentropy/variant_index.py b/src/gentropy/variant_index.py
index bdf838ce2..ba86c4602 100644
--- a/src/gentropy/variant_index.py
+++ b/src/gentropy/variant_index.py
@@ -1,44 +1,53 @@
-"""Step to generate variant index dataset."""
+"""Step to generate variant index dataset based on VEP output."""
+
from __future__ import annotations
from gentropy.common.session import Session
-from gentropy.dataset.study_locus import StudyLocus
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
+from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser
class VariantIndexStep:
- """Run variant index step to only variants in study-locus sets.
+ """Generate variant index based on a VEP output in json format.
- Using a `VariantAnnotation` dataset as a reference, this step creates and writes a dataset of the type `VariantIndex` that includes only variants that have disease-association data with a reduced set of annotations.
+ The variant index is a dataset that contains variant annotations extracted from VEP output. It is expected that all variants in the VEP output are present in the variant index.
+ There's an option to provide extra variant annotations to be added to the variant index eg. allele frequencies from GnomAD.
"""
def __init__(
self: VariantIndexStep,
session: Session,
- variant_annotation_path: str,
- credible_set_path: str,
+ vep_output_json_path: str,
variant_index_path: str,
+ gnomad_variant_annotations_path: str | None = None,
) -> None:
"""Run VariantIndex step.
Args:
session (Session): Session object.
- variant_annotation_path (str): Variant annotation dataset path.
- credible_set_path (str): Credible set dataset path.
- variant_index_path (str): Variant index dataset path.
+ vep_output_json_path (str): Variant effect predictor output path (in json format).
+ variant_index_path (str): Variant index dataset path to save resulting data.
+ gnomad_variant_annotations_path (str | None): Path to extra variant annotation dataset.
"""
- # Extract
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
- credible_set = StudyLocus.from_parquet(
- session, credible_set_path, recursiveFileLookup=True
+ # Extract variant annotations from VEP output:
+ variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep(
+ session.spark, vep_output_json_path
)
- # Transform
- vi = VariantIndex.from_variant_annotation(va, credible_set)
+ # Process variant annotations if provided:
+ if gnomad_variant_annotations_path:
+ # Read variant annotations from parquet:
+ annotations = VariantIndex.from_parquet(
+ session=session,
+ path=gnomad_variant_annotations_path,
+ recursiveFileLookup=True,
+ )
+
+ # Update file with extra annotations:
+ variant_index = variant_index.add_annotation(annotations)
(
- vi.df.write.partitionBy("chromosome")
+ variant_index.df.write.partitionBy("chromosome")
.mode(session.write_mode)
.parquet(variant_index_path)
)
diff --git a/src/gentropy/v2g.py b/src/gentropy/variant_to_gene.py
similarity index 81%
rename from src/gentropy/v2g.py
rename to src/gentropy/variant_to_gene.py
index e98348a01..cf21053d7 100644
--- a/src/gentropy/v2g.py
+++ b/src/gentropy/variant_to_gene.py
@@ -1,16 +1,16 @@
"""Step to generate variant annotation dataset."""
+
from __future__ import annotations
from functools import reduce
-import pyspark.sql.functions as f
+from pyspark.sql import functions as f
from gentropy.common.Liftover import LiftOverSpark
from gentropy.common.session import Session
from gentropy.dataset.gene_index import GeneIndex
from gentropy.dataset.intervals import Intervals
from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
@@ -26,7 +26,6 @@ class V2GStep:
Attributes:
session (Session): Session object.
variant_index_path (str): Input variant index path.
- variant_annotation_path (str): Input variant annotation path.
gene_index_path (str): Input gene index path.
vep_consequences_path (str): Input VEP consequences path.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
@@ -41,7 +40,6 @@ def __init__(
self,
session: Session,
variant_index_path: str,
- variant_annotation_path: str,
gene_index_path: str,
vep_consequences_path: str,
liftover_chain_file_path: str,
@@ -56,7 +54,6 @@ def __init__(
Args:
session (Session): Session object.
variant_index_path (str): Input variant index path.
- variant_annotation_path (str): Input variant annotation path.
gene_index_path (str): Input gene index path.
vep_consequences_path (str): Input VEP consequences path.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
@@ -69,16 +66,10 @@ def __init__(
# Read
gene_index = GeneIndex.from_parquet(session, gene_index_path)
vi = VariantIndex.from_parquet(session, variant_index_path).persist()
- va = VariantAnnotation.from_parquet(session, variant_annotation_path)
+ # Reading VEP consequence to score table and cast the score to the right type:
vep_consequences = session.spark.read.csv(
vep_consequences_path, sep="\t", header=True
- ).select(
- f.element_at(f.split("Accession", r"/"), -1).alias(
- "variantFunctionalConsequenceId"
- ),
- f.col("Term").alias("label"),
- f.col("v2g_score").cast("double").alias("score"),
- )
+ ).withColumn("score", f.col("score").cast("double"))
# Transform
lift = LiftOverSpark(
@@ -90,10 +81,7 @@ def __init__(
# Filter gene index by approved biotypes to define V2G gene universe
list(approved_biotypes)
)
- va_slimmed = va.filter_by_variant_df(
- # Variant annotation reduced to the variant index to define V2G variant universe
- vi.df
- ).persist()
+
intervals = Intervals(
_df=reduce(
lambda x, y: x.unionByName(y, allowMissingColumns=True),
@@ -108,9 +96,11 @@ def __init__(
_schema=Intervals.get_schema(),
)
v2g_datasets = [
- va_slimmed.get_distance_to_tss(gene_index_filtered, max_distance),
- va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),
- va_slimmed.get_plof_v2g(gene_index_filtered),
+ vi.get_distance_to_tss(gene_index_filtered, max_distance),
+ vi.get_most_severe_transcript_consequence(
+ vep_consequences, gene_index_filtered
+ ),
+ vi.get_plof_v2g(gene_index_filtered),
intervals.v2g(vi),
]
v2g = V2G(
diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
index 62b873355..c35188466 100644
--- a/tests/gentropy/conftest.py
+++ b/tests/gentropy/conftest.py
@@ -23,7 +23,6 @@
from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping
from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex
@@ -276,45 +275,6 @@ def mock_v2g(spark: SparkSession) -> V2G:
return V2G(_df=data_spec.build(), _schema=v2g_schema)
-@pytest.fixture()
-def mock_variant_annotation(spark: SparkSession) -> VariantAnnotation:
- """Mock variant annotation."""
- va_schema = VariantAnnotation.get_schema()
-
- data_spec = (
- dg.DataGenerator(
- spark,
- rows=400,
- partitions=4,
- randomSeedMethod="hash_fieldname",
- )
- .withSchema(va_schema)
- .withColumnSpec("alleleType", percentNulls=0.1)
- .withColumnSpec("chromosomeB37", percentNulls=0.1)
- .withColumnSpec("positionB37", percentNulls=0.1)
- # Nested column handling workaround
- # https://github.com/databrickslabs/dbldatagen/issues/135
- # It's a workaround for nested column handling in dbldatagen.
- .withColumnSpec(
- "alleleFrequencies",
- expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))',
- percentNulls=0.1,
- )
- .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
- .withColumnSpec(
- "vep",
- expr='named_struct("mostSevereConsequence", cast(rand() as string), "transcriptConsequences", array(named_struct("aminoAcids", cast(rand() as string), "consequenceTerms", array(cast(rand() as string)), "geneId", cast(rand() as string), "lof", cast(rand() as string))))',
- percentNulls=0.1,
- )
- .withColumnSpec(
- "inSilicoPredictors",
- expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))',
- percentNulls=0.1,
- )
- )
- return VariantAnnotation(_df=data_spec.build(), _schema=va_schema)
-
-
@pytest.fixture()
def mock_variant_index(spark: SparkSession) -> VariantIndex:
"""Mock variant index."""
@@ -328,23 +288,65 @@ def mock_variant_index(spark: SparkSession) -> VariantIndex:
randomSeedMethod="hash_fieldname",
)
.withSchema(vi_schema)
- .withColumnSpec("chromosomeB37", percentNulls=0.1)
- .withColumnSpec("positionB37", percentNulls=0.1)
- .withColumnSpec("mostSevereConsequence", percentNulls=0.1)
+ .withColumnSpec("mostSevereConsequenceId", percentNulls=0.1)
# Nested column handling workaround
# https://github.com/databrickslabs/dbldatagen/issues/135
# It's a workaround for nested column handling in dbldatagen.
+ .withColumnSpec(
+ "inSilicoPredictors",
+ expr="""
+ array(
+ named_struct(
+ "method", cast(rand() as string),
+ "assessment", cast(rand() as string),
+ "score", rand(),
+ "assessmentFlag", cast(rand() as string),
+ "targetId", cast(rand() as string)
+ )
+ )
+ """,
+ percentNulls=0.1,
+ )
.withColumnSpec(
"alleleFrequencies",
expr='array(named_struct("alleleFrequency", rand(), "populationName", cast(rand() as string)))',
percentNulls=0.1,
)
+ .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
.withColumnSpec(
- "inSilicoPredictors",
- expr='named_struct("cadd", named_struct("phred", cast(rand() as float), "raw_score", cast(rand() as float)), "revelMax", cast(rand() as double), "spliceaiDsMax", cast(rand() as float), "pangolinLargestDs", cast(rand() as double), "phylop", cast(rand() as double), "polyphenMax", cast(rand() as double), "siftMax", cast(rand() as double))',
+ "transcriptConsequences",
+ expr="""
+ array(
+ named_struct(
+ "variantFunctionalConsequenceIds", array(cast(rand() as string)),
+ "aminoAcidChange", cast(rand() as string),
+ "uniprotAccessions", array(cast(rand() as string)),
+ "isEnsemblCanonical", cast(rand() as boolean),
+ "codons", cast(rand() as string),
+ "distance", cast(rand() as long),
+ "targetId", cast(rand() as string),
+ "impact", cast(rand() as string),
+ "lofteePrediction", cast(rand() as string),
+ "siftPrediction", rand(),
+ "polyphenPrediction", rand(),
+ "transcriptId", cast(rand() as string)
+ )
+ )
+ """,
+ percentNulls=0.1,
+ )
+ .withColumnSpec(
+ "dbXrefs",
+ expr="""
+ array(
+ named_struct(
+ "id", cast(rand() as string),
+ "source", cast(rand() as string)
+ )
+ )
+ """,
percentNulls=0.1,
)
- .withColumnSpec("rsIds", expr="array(cast(rand() AS string))", percentNulls=0.1)
)
return VariantIndex(_df=data_spec.build(), _schema=vi_schema)
diff --git a/tests/gentropy/data_samples/finucane_PIPs.npy b/tests/gentropy/data_samples/finucane_PIPs.npy
deleted file mode 100644
index 2a93a5c71..000000000
Binary files a/tests/gentropy/data_samples/finucane_PIPs.npy and /dev/null differ
diff --git a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz b/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz
deleted file mode 100644
index 74c0844a4..000000000
Binary files a/tests/gentropy/data_samples/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz and /dev/null differ
diff --git a/tests/gentropy/data_samples/vep_consequences_sample.tsv b/tests/gentropy/data_samples/vep_consequences_sample.tsv
deleted file mode 100644
index 7b5de68b2..000000000
--- a/tests/gentropy/data_samples/vep_consequences_sample.tsv
+++ /dev/null
@@ -1,45 +0,0 @@
-Accession Term Description Display term IMPACT v2g_score eco_score
-http://purl.obolibrary.org/obo/SO_0001893 transcript_ablation A feature ablation whereby the deleted region includes a transcript feature Transcript ablation HIGH 1 1
-http://identifiers.org/eco/cttv_mapping_pipeline cttv_mapping_pipeline 1
-http://purl.obolibrary.org/obo/ECO_0000205 curator_inference 1
-http://purl.obolibrary.org/obo/SO_0002165 trinucleotide_repeat_expansion 1
-http://purl.obolibrary.org/obo/SO_0001574 splice_acceptor_variant A splice variant that changes the 2 base region at the 3' end of an intron Splice acceptor variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001575 splice_donor_variant A splice variant that changes the 2 base region at the 5' end of an intron Splice donor variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001587 stop_gained A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript Stop gained HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001589 frameshift_variant A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three Frameshift variant HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0002012 start_lost A codon variant that changes at least one base of the canonical start codon Start lost HIGH 1 0.95
-http://purl.obolibrary.org/obo/SO_0001578 stop_lost A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript Stop lost HIGH 1 0.9
-http://purl.obolibrary.org/obo/SO_0001889 transcript_amplification A feature amplification of a region containing a transcript Transcript amplification HIGH 1 0.9
-http://purl.obolibrary.org/obo/SO_0001894 regulatory_region_ablation A feature ablation whereby the deleted region includes a regulatory region Regulatory region ablation MODERATE 0.66 0.9
-http://purl.obolibrary.org/obo/SO_0001583 missense_variant A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved Missense variant MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001818 protein_altering_variant A sequence_variant which is predicted to change the protein encoded in the coding sequence Protein altering variant MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001821 inframe_insertion An inframe non synonymous variant that inserts bases into in the coding sequence Inframe insertion MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001822 inframe_deletion An inframe non synonymous variant that deletes bases from the coding sequence Inframe deletion MODERATE 0.66 0.7
-http://purl.obolibrary.org/obo/SO_0001582 initiator_codon_variant 0.7
-http://purl.obolibrary.org/obo/SO_0001630 splice_region_variant A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron Splice region variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001626 incomplete_terminal_codon_variant A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed Incomplete terminal codon variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001567 stop_retained_variant A sequence variant where at least one base in the terminator codon is changed, but the terminator remains Stop retained variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001819 synonymous_variant A sequence variant where there is no resulting change to the encoded amino acid Synonymous variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0002019 start_retained_variant A sequence variant where at least one base in the start codon is changed, but the start remains Start retained variant LOW 0.33 0.65
-http://purl.obolibrary.org/obo/SO_0001619 non_coding_transcript_variant A transcript variant of a non coding RNA gene Non coding transcript variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001620 mature_miRNA_variant A transcript variant located with the sequence of the mature miRNA Mature miRNA variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001621 NMD_transcript_variant A variant in a transcript that is the target of NMD NMD transcript variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001623 5_prime_UTR_variant A UTR variant of the 5' UTR 5 prime UTR variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001624 3_prime_UTR_variant A UTR variant of the 3' UTR 3 prime UTR variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001627 intron_variant A transcript variant occurring within an intron Intron variant MODIFIER 0.1 0.65
-http://purl.obolibrary.org/obo/SO_0001792 non_coding_transcript_exon_variant A sequence variant that changes non-coding exon sequence in a non-coding transcript Non coding transcript exon variant MODIFIER 0 0.65
-http://purl.obolibrary.org/obo/SO_0001580 coding_sequence_variant A sequence variant that changes the coding sequence Coding sequence variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001566 regulatory_region_variant A sequence variant located within a regulatory region Regulatory region variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001631 upstream_gene_variant A sequence variant located 5' of a gene Upstream gene variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001632 downstream_gene_variant A sequence variant located 3' of a gene Downstream gene variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001782 TF_binding_site_variant A sequence variant located within a transcription factor binding site TF binding site variant MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001891 regulatory_region_amplification A feature amplification of a region containing a regulatory region Regulatory region amplification MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001892 TFBS_amplification A feature amplification of a region containing a transcription factor binding site TFBS amplification MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001895 TFBS_ablation A feature ablation whereby the deleted region includes a transcription factor binding site TFBS ablation MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001906 feature_truncation A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence Feature truncation MODIFIER 0 0.6
-http://purl.obolibrary.org/obo/SO_0001907 feature_elongation A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence Feature elongation MODIFIER 0 0.6
-http://targetvalidation.org/sequence/regulatory_nearest_gene_five_prime_end Regulatory nearest gene 5' end 0.6
-http://purl.obolibrary.org/obo/SO_0001628 intergenic_variant A sequence variant located in the intergenic region, between genes Intergenic variant MODIFIER 0 0.5
-http://purl.obolibrary.org/obo/SO_0001060 sequence_variant 0.5
-http://purl.obolibrary.org/obo/SO_0001825 conservative_inframe_deletion 0.5
-http://targetvalidation.org/sequence/nearest_gene_five_prime_end Nearest gene counting from 5' end 0.5
diff --git a/tests/gentropy/data_samples/vep_sample.jsonl b/tests/gentropy/data_samples/vep_sample.jsonl
new file mode 100644
index 000000000..78a76f150
--- /dev/null
+++ b/tests/gentropy/data_samples/vep_sample.jsonl
@@ -0,0 +1,2 @@
+{"transcript_consequences":[{"gene_id":"ENSG00000168702","swissprot":["Q9NZR2.181"],"uniparc":["UPI00001B045B"],"transcript_id":"ENST00000389484","impact":"MODIFIER","canonical":1,"consequence_terms":["intron_variant"],"strand":-1,"variant_allele":"T"}],"assembly_name":"GRCh38","most_severe_consequence":"intron_variant","strand":1,"seq_region_name":"2","start":140699626,"end":140699625,"input":"2\t140699625\t2_140699625_G_GT\tG\tGT","allele_string":"-/T","id":"2_140699625_G_GT"}
+{"transcript_consequences":[{"impact":"MODIFIER","consequence_terms":["upstream_gene_variant"],"distance":682,"trembl":["A0A2U3TZJ1.14"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000336451","strand":-1,"variant_allele":"T","gene_id":"ENSG00000155906","uniparc":["UPI000D18E792"]},{"lof_info":"INTRON_SIZE:8753","consequence_terms":["splice_donor_variant"],"impact":"HIGH","transcript_id":"ENST00000444024","cadd_phred":27.1,"cadd_raw":4.846757,"strand":-1,"variant_allele":"T","lof":"HC","canonical":1,"uniprot_isoform":["Q9NWS8-1"],"uniparc":["UPI00001AEAE1"],"swissprot":["Q9NWS8.145"],"gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","uniparc":["UPI000006DC2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"lof":"HC","variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000491268","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant"],"impact":"HIGH","lof_info":"INTRON_SIZE:8753"},{"variant_allele":"T","strand":-1,"uniparc":["UPI0002A12044"],"gene_id":"ENSG00000155906","consequence_terms":["intron_variant"],"impact":"MODIFIER","transcript_id":"ENST00000622845","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A087WXU0.51"]},{"transcript_id":"ENST00000643564","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER","gene_id":"ENSG00000155906","cdna_start":578,"cdna_end":578,"strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644054","cadd_phred":27.1,"trembl":["A0A2R8YFC3.14"],"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI001B8998C5"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T"},{"transcript_id":"ENST00000644711","cadd_phred":27.1,"cadd_raw":4.846757,"trembl":["A0A2R8Y4J4.16"],"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","uniparc":["UPI000D1907AF"],"gene_id":"ENSG00000155906","variant_allele":"T","stran
d":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000645367","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"cadd_phred":27.1,"transcript_id":"ENST00000645895","cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","cdna_end":724,"cdna_start":724,"variant_allele":"T","strand":-1,"cadd_phred":27.1,"transcript_id":"ENST00000645917","cadd_raw":4.846757,"consequence_terms":["non_coding_transcript_exon_variant"],"impact":"MODIFIER"},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"cadd_raw":4.846757,"trembl":["A0A2R8Y4P5.16"],"cadd_phred":27.1,"transcript_id":"ENST00000646926","variant_allele":"T","strand":-1,"uniparc":["UPI001B89CA49"],"gene_id":"ENSG00000155906"},{"transcript_id":"ENST00000682004","cadd_phred":27.1,"cadd_raw":4.846757,"gene_id":"ENSG00000155906","variant_allele":"T","strand":-1,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH"},{"gene_id":"ENSG00000155906","uniparc":["UPI001B88E93C"],"lof":"HC","strand":-1,"variant_allele":"T","trembl":["A0A804HHY2.7"],"cadd_raw":4.846757,"transcript_id":"ENST00000682299","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"uniparc":["UPI001B8920B1"],"gene_id":"ENSG00000155906","strand":-1,"variant_allele":"T","trembl":["A0A804HLE1.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000682392","impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"]},{"cadd_raw":4.846757,"trembl":["A0A804HHW6.7"],"cadd_phred":27.1,"transcript_id":"ENST00000682641","impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753","gene_id":"ENSG00000155906","un
iparc":["UPI001B893F2B"],"lof":"HC","variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000682760","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","strand":-1,"variant_allele":"T"},{"variant_allele":"T","strand":-1,"impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000683439","gene_id":"ENSG00000155906"},{"gene_id":"ENSG00000155906","swissprot":["Q9NWS8.145"],"uniparc":["UPI00001AEAE1"],"lof":"HC","uniprot_isoform":["Q9NWS8-1"],"strand":-1,"variant_allele":"T","cadd_raw":4.846757,"transcript_id":"ENST00000683724","cadd_phred":27.1,"impact":"HIGH","consequence_terms":["splice_donor_variant"],"lof_info":"INTRON_SIZE:8753"},{"gene_id":"ENSG00000155906","transcript_id":"ENST00000683740","cadd_phred":27.1,"cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1},{"cadd_phred":27.1,"transcript_id":"ENST00000684301","cadd_raw":4.846757,"consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"impact":"HIGH","gene_id":"ENSG00000155906","uniparc":["UPI000006DC2F"],"swissprot":["Q9NWS8.145"],"uniprot_isoform":["Q9NWS8-3"],"variant_allele":"T","strand":-1},{"gene_id":"ENSG00000155906","cadd_raw":4.846757,"cadd_phred":27.1,"transcript_id":"ENST00000684658","impact":"HIGH","consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"strand":-1,"variant_allele":"T"},{"consequence_terms":["splice_donor_variant","non_coding_transcript_variant"],"impact":"HIGH","variant_allele":"T","strand":-1,"gene_id":"ENSG00000155906","cadd_phred":27.1,"transcript_id":"ENST00000684715","cadd_raw":4.846757},{"impact":"HIGH","consequence_terms":["splice_donor_variant","NMD_transcript_variant"],"trembl":["A0A804HKF8.7"],"cadd_raw":4.846757,"cadd_phred":27.1,"transcri
pt_id":"ENST00000684765","strand":-1,"variant_allele":"T","uniparc":["UPI001B89CAE3"],"gene_id":"ENSG00000155906"}],"seq_region_name":"6","assembly_name":"GRCh38","end":151445307,"strand":1,"most_severe_consequence":"splice_donor_variant","start":151445307,"id":"6_151445307_C_T","colocated_variants":[{"clin_sig_allele":"T:pathogenic;T:likely_pathogenic","end":151445307,"var_synonyms":{"ClinVar":["RCV000032983","VCV000039764","RCV001814019"],"OMIM":[614917.0001]},"strand":1,"start":151445307,"allele_string":"C/T","clin_sig":["likely_pathogenic","pathogenic"],"seq_region_name":"6","pubmed":[23022099,18835491],"phenotype_or_disease":1,"id":"rs1562800908"}],"input":"6\t151445307\t6_151445307_C_T\tC\tT","allele_string":"C/T"}
diff --git a/tests/gentropy/dataset/test_variant_annotation.py b/tests/gentropy/dataset/test_variant_annotation.py
deleted file mode 100644
index 05fa0cd60..000000000
--- a/tests/gentropy/dataset/test_variant_annotation.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""Tests variant annotation dataset."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from gentropy.dataset.gene_index import GeneIndex
-
-from gentropy.dataset.v2g import V2G
-from gentropy.dataset.variant_annotation import VariantAnnotation
-
-
-def test_variant_index_creation(mock_variant_annotation: VariantAnnotation) -> None:
- """Test gene index creation with mock gene index."""
- assert isinstance(mock_variant_annotation, VariantAnnotation)
-
-
-def test_get_plof_v2g(
- mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex
-) -> None:
- """Test get_plof_v2g with mock variant annotation."""
- assert isinstance(mock_variant_annotation.get_plof_v2g(mock_gene_index), V2G)
-
-
-def test_get_distance_to_tss(
- mock_variant_annotation: VariantAnnotation, mock_gene_index: GeneIndex
-) -> None:
- """Test get_distance_to_tss with mock variant annotation."""
- assert isinstance(mock_variant_annotation.get_distance_to_tss(mock_gene_index), V2G)
diff --git a/tests/gentropy/dataset/test_variant_index.py b/tests/gentropy/dataset/test_variant_index.py
index a41db3031..4d97ef536 100644
--- a/tests/gentropy/dataset/test_variant_index.py
+++ b/tests/gentropy/dataset/test_variant_index.py
@@ -4,11 +4,12 @@
from typing import TYPE_CHECKING
+from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.v2g import V2G
from gentropy.dataset.variant_index import VariantIndex
if TYPE_CHECKING:
- from gentropy.dataset.study_locus import StudyLocus
- from gentropy.dataset.variant_annotation import VariantAnnotation
+ pass
def test_variant_index_creation(mock_variant_index: VariantIndex) -> None:
@@ -16,11 +17,15 @@ def test_variant_index_creation(mock_variant_index: VariantIndex) -> None:
assert isinstance(mock_variant_index, VariantIndex)
-def test_from_variant_annotation(
- mock_variant_annotation: VariantAnnotation, mock_study_locus: StudyLocus
+def test_get_plof_v2g(
+ mock_variant_index: VariantIndex, mock_gene_index: GeneIndex
) -> None:
- """Test variant index creation from variant annotation."""
- variant_index = VariantIndex.from_variant_annotation(
- mock_variant_annotation, mock_study_locus
- )
- assert isinstance(variant_index, VariantIndex)
+ """Test get_plof_v2g with mock variant annotation."""
+ assert isinstance(mock_variant_index.get_plof_v2g(mock_gene_index), V2G)
+
+
+def test_get_distance_to_tss(
+ mock_variant_index: VariantIndex, mock_gene_index: GeneIndex
+) -> None:
+ """Test get_distance_to_tss with mock variant annotation."""
+ assert isinstance(mock_variant_index.get_distance_to_tss(mock_gene_index), V2G)
diff --git a/tests/gentropy/datasource/ensembl/test_vep_variants.py b/tests/gentropy/datasource/ensembl/test_vep_variants.py
new file mode 100644
index 000000000..1401f1b6c
--- /dev/null
+++ b/tests/gentropy/datasource/ensembl/test_vep_variants.py
@@ -0,0 +1,145 @@
+"""Testing VEP parsing and variant index extraction."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from gentropy.dataset.variant_index import VariantIndex
+from gentropy.datasource.ensembl.vep_parser import VariantEffectPredictorParser
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as f
+
+if TYPE_CHECKING:
+ from pyspark.sql import SparkSession
+
+
+class TestVEPParserInSilicoExtractor:
+ """Testing the _vep_in_silico_prediction_extractor method of the VEP parser class.
+
+ These tests assume that the _get_most_severe_transcript() method works correctly, as it's not tested.
+
+ The test cases try to cover the following scenarios:
+ - Transcripts with no assessments.
+ - Transcripts without assessments flag.
+ - Transcripts with no score.
+ - Testing cases, where no score column is provided.
+ """
+
+ # Data prototype:
+ SAMPLE_DATA = [
+ # Complete dataset:
+ ("v1", "deleterious", 0.1, "gene1", "flag"),
+ # No assessment:
+ ("v2", None, 0.1, "gene1", "flag"),
+ # No flag:
+ ("v3", "deleterious", 0.1, "gene1", None),
+ # No score:
+ ("v4", "deleterious", None, "gene1", "flag"),
+ ]
+
+ SAMPLE_COLUMNS = ["variantId", "assessment", "score", "gene_id", "flag"]
+
+ @pytest.fixture(autouse=True)
+ def _setup(self: TestVEPParserInSilicoExtractor, spark: SparkSession) -> None:
+ """Setup fixture."""
+ parsed_df = (
+ spark.createDataFrame(self.SAMPLE_DATA, self.SAMPLE_COLUMNS)
+ .groupBy("variantId")
+ .agg(
+ f.collect_list(
+ f.struct(
+ f.col("assessment").alias("assessment"),
+ f.col("score").alias("score"),
+ f.col("flag").alias("flag"),
+ f.col("gene_id").alias("gene_id"),
+ )
+ ).alias("transcripts")
+ )
+ .select(
+ "variantId",
+ VariantEffectPredictorParser._vep_in_silico_prediction_extractor(
+ "transcripts", "method_name", "score", "assessment", "flag"
+ ).alias("in_silico_predictions"),
+ )
+ ).persist()
+
+ self.df = parsed_df
+
+ def test_in_silico_output_missing_value(
+ self: TestVEPParserInSilicoExtractor,
+ ) -> None:
+ """Test if the in silico output count is correct."""
+ variant_with_missing_score = [
+ x[0] for x in filter(lambda x: x[2] is None, self.SAMPLE_DATA)
+ ]
+ # Assert that the correct variants return null:
+ assert (
+ [
+ x["variantId"]
+ for x in self.df.filter(
+ f.col("in_silico_predictions").isNull()
+ ).collect()
+ ]
+ == variant_with_missing_score
+ ), "Not the right variants got nullified in-silico predictor object."
+
+
+class TestVEPParser:
+ """Testing VEP parser class.
+
+ Some of the input data:
+ - 6_151445307_C_T - complicated variant with numerous annotations.
+ - 2_140699625_G_GT - simple variant with no annotations whatsoever.
+ """
+
+ SAMPLE_VEP_DATA_PATH = "tests/gentropy/data_samples/vep_sample.jsonl"
+
+ @pytest.fixture(autouse=True)
+ def _setup(self: TestVEPParser, spark: SparkSession) -> None:
+ """Setup fixture."""
+ self.raw_vep_output = spark.read.json(
+ self.SAMPLE_VEP_DATA_PATH,
+ schema=VariantEffectPredictorParser.get_schema(),
+ )
+ self.processed_vep_output = VariantEffectPredictorParser.process_vep_output(
+ self.raw_vep_output
+ )
+
+ def test_extract_variant_index_from_vep(
+ self: TestVEPParser, spark: SparkSession
+ ) -> None:
+ """Test if the variant index can be extracted from the VEP output."""
+ variant_index = VariantEffectPredictorParser.extract_variant_index_from_vep(
+ spark, self.SAMPLE_VEP_DATA_PATH
+ )
+
+ assert isinstance(
+ variant_index, VariantIndex
+ ), "VariantIndex object not created."
+
+ def test_process(self: TestVEPParser) -> None:
+ """Test process method."""
+ df = VariantEffectPredictorParser.process_vep_output(self.raw_vep_output)
+ assert isinstance(df, DataFrame), "Processed VEP output is not a DataFrame."
+ assert df.count() > 0, "No variant data in processed VEP dataframe."
+
+ def test_conversion(self: TestVEPParser) -> None:
+ """Test if processed data can be converted into a VariantIndex object."""
+ variant_index = VariantIndex(
+ _df=self.processed_vep_output,
+ _schema=VariantIndex.get_schema(),
+ )
+
+ assert isinstance(
+ variant_index, VariantIndex
+ ), "VariantIndex object not created."
+
+ def test_variant_count(self: TestVEPParser) -> None:
+ """Test if the number of variants is correct.
+
+ It is expected that all rows from the parsed VEP output are present in the processed VEP output.
+ """
+ assert (
+ self.raw_vep_output.count() == self.processed_vep_output.count()
+ ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
diff --git a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
index de70e8b63..e7067e3d9 100644
--- a/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
+++ b/tests/gentropy/datasource/gwas_catalog/test_gwas_catalog_associations.py
@@ -2,7 +2,7 @@
from __future__ import annotations
-from gentropy.dataset.variant_annotation import VariantAnnotation
+from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
StudyLocusGWASCatalog,
@@ -50,29 +50,29 @@ def test_qc_ambiguous_study(
def test_study_locus_gwas_catalog_from_source(
- mock_variant_annotation: VariantAnnotation,
+ mock_variant_index: VariantIndex,
sample_gwas_catalog_associations: DataFrame,
) -> None:
"""Test study locus from gwas catalog mock data."""
assert isinstance(
GWASCatalogCuratedAssociationsParser.from_source(
- sample_gwas_catalog_associations, mock_variant_annotation
+ sample_gwas_catalog_associations, mock_variant_index
),
StudyLocusGWASCatalog,
)
-def test__map_to_variant_annotation_variants(
+def test_map_variants_to_variant_index(
sample_gwas_catalog_associations: DataFrame,
- mock_variant_annotation: VariantAnnotation,
+ mock_variant_index: VariantIndex,
) -> None:
"""Test mapping to variant annotation variants."""
assert isinstance(
- GWASCatalogCuratedAssociationsParser._map_to_variant_annotation_variants(
+ GWASCatalogCuratedAssociationsParser._map_variants_to_variant_index(
sample_gwas_catalog_associations.withColumn(
"studyLocusId", f.monotonically_increasing_id().cast(LongType())
),
- mock_variant_annotation,
+ mock_variant_index,
),
DataFrame,
)
diff --git a/tests/gentropy/test_schemas.py b/tests/gentropy/test_schemas.py
index 417eb4657..630abd0ab 100644
--- a/tests/gentropy/test_schemas.py
+++ b/tests/gentropy/test_schemas.py
@@ -57,6 +57,9 @@ def test_schema_columns_camelcase(schema_json: str) -> None:
Args:
schema_json (str): schema filename
"""
+ if schema_json == "vep_json_output.json":
+ pytest.skip("VEP schema is exempt from camelCase check.")
+
core_schema = json.loads(Path(SCHEMA_DIR, schema_json).read_text(encoding="utf-8"))
schema = StructType.fromJson(core_schema)
# Use a regular expression to check if the identifier is in camelCase