opentargets · project-defiant · Jun 14, 2024 · May 20, 2024 · May 22, 2024 · May 28, 2024
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ src/airflow/logs/*
 !src/airflow/logs/.gitkeep
 site/
 .env
+.coverage*
diff --git a/Makefile b/Makefile
@@ -22,7 +22,7 @@ setup-dev: ## Setup development environment
 
 check: ## Lint and format code
 	@echo "Linting API..."
-	@poetry run ruff src/gentropy .
+	@poetry run ruff check src/gentropy .
 	@echo "Linting docstrings..."
 	@poetry run pydoclint --config=pyproject.toml src
 	@poetry run pydoclint --config=pyproject.toml --skip-checking-short-docstrings=true tests

diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
@@ -1,5 +1,5 @@
 # Release specific configuration:
-release_version: "24.03"
+release_version: "24.06"
 dev_version: XX.XX
 release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version}
 
@@ -8,6 +8,7 @@ static_assets: gs://genetics_etl_python_playground/static_assets
 outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version}
 
 ## Datasets:
+# GWAS
 gwas_catalog_dataset: gs://gwas_catalog_data
 # Ingestion input files:
 gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv
@@ -29,7 +30,18 @@ gwas_catalog_study_index: ${datasets.gwas_catalog_dataset}/study_index
 gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets
 gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets
 
-# Input datasets
+# GnomAD
+gnomad_public_bucket: gs://gcp-public-data--gnomad/release/
+# LD generation
+# Templates require a {POP} placeholder, which is expanded to match the per-population paths
+ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm
+ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht
+liftover_ht_path: ${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht
+# variant_annotation
+gnomad_genomes_path: ${datasets.gnomad_public_bucket}4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/
+
+# Others
+chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz
 chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain
 vep_consequences: ${datasets.static_assets}/vep_consequences.tsv
 anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed
@@ -49,7 +61,7 @@ summary_statistics: ${datasets.outputs}/summary_statistics
 study_locus_overlap: ${datasets.outputs}/study_locus_overlap
 susie_finemapping: ${datasets.outputs}/finngen_susie_finemapping
 
-ld_index: ${datasets.outputs}/ld_index
+ld_index: ${datasets.static_assets}/ld_index
 catalog_study_index: ${datasets.study_index}/catalog
 catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
 
@@ -60,6 +72,7 @@ from_sumstats_pics: ${datasets.credible_set}/from_sumstats
 l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
 l2g_model: ${datasets.release_folder}/locus_to_gene_model
 l2g_predictions: ${datasets.release_folder}/locus_to_gene_predictions
+l2g_feature_matrix: ${datasets.release_folder}/locus_to_gene_feature_matrix
 colocalisation: ${datasets.release_folder}/colocalisation
 study_index: ${datasets.release_folder}/study_index
 variant_index: ${datasets.release_folder}/variant_index

diff --git a/config/step/ot_ld_based_clumping.yaml b/config/step/ot_ld_based_clumping.yaml
@@ -1,7 +1,7 @@
 defaults:
   - ld_based_clumping
 
-ld_index_path: ${datasets.ld_index}
+ld_index_path: ${datasets.ld_index}/2.1.1
 study_locus_input_path: ???
 study_index_path: ???
 clumped_study_locus_output_path: ???
diff --git a/config/step/ot_ld_index.yaml b/config/step/ot_ld_index.yaml
@@ -2,3 +2,19 @@ defaults:
   - ld_index
 
 ld_index_out: ${datasets.ld_index}
+ld_matrix_template: ${datasets.ld_matrix_template}
+ld_index_raw_template: ${datasets.ld_index_raw_template}
+grch37_to_grch38_chain_path: ${datasets.chain_37_38}
+liftover_ht_path: ${datasets.liftover_ht_path}
+ld_populations:
+  - afr # African-American
+  - amr # American Admixed/Latino
+  - asj # Ashkenazi Jewish
+  - eas # East Asian
+  - est # Estonian
+  - fin # Finnish
+  - nfe # Non-Finnish European
+  - nwe # Northwestern European
+  - seu # Southeastern European
+# The gnomAD version will be inferred from ld_matrix_template and appended to ld_index_out.
+use_version_from_input: true
diff --git a/config/step/ot_locus_to_gene_predict.yaml b/config/step/ot_locus_to_gene_predict.yaml
@@ -4,6 +4,7 @@ defaults:
 run_mode: predict
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
+feature_matrix_path: ${datasets.l2g_feature_matrix}
 credible_set_path: ${datasets.credible_set}
 variant_gene_path: ${datasets.v2g}
 colocalisation_path: ${datasets.colocalisation}

diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml
@@ -2,3 +2,18 @@ defaults:
   - variant_annotation
 
 variant_annotation_path: ${datasets.variant_annotation}
+gnomad_genomes_path: ${datasets.gnomad_genomes_path}
+chain_38_37: ${datasets.chain_38_37}
+gnomad_variant_populations:
+  - afr # African-American
+  - amr # American Admixed/Latino
+  - ami # Amish ancestry
+  - asj # Ashkenazi Jewish
+  - eas # East Asian
+  - fin # Finnish
+  - nfe # Non-Finnish European
+  - mid # Middle Eastern
+  - sas # South Asian
+  - remaining # Other
+# The gnomAD version will be inferred from gnomad_genomes_path and appended to variant_annotation_path.
+use_version_from_input: true
diff --git a/config/step/ot_window_based_clumping.yaml b/config/step/ot_window_based_clumping.yaml
@@ -4,3 +4,4 @@ defaults:
 summary_statistics_input_path: ???
 study_locus_output_path: ???
 inclusion_list_path: ???
+gwas_significance: 1e-8
diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md
@@ -10,3 +10,4 @@ The overall architecture of the package distinguishes between:
 - [**Datasets**](datasets/_datasets.md): data model
 - [**Methods**](methods/_methods.md): statistical analysis tools
 - [**Steps**](steps/_steps.md): pipeline steps
+- [**Common**](common/_common.md): Common classes
diff --git a/docs/python_api/common/_common.md b/docs/python_api/common/_common.md
@@ -0,0 +1,8 @@
+---
+title: Common
+---
+
+Common utilities used in the gentropy package.
+
+- [**Version Engine**](version_engine.md): class to extract version from datasource input paths
+- [**Types**](types.md): Literal types used in gentropy
diff --git a/docs/python_api/common/types.md b/docs/python_api/common/types.md
@@ -0,0 +1,8 @@
+---
+title: Literal Types
+---
+
+:::gentropy.common.types
+:::gentropy.common.types.LD_Population
+:::gentropy.common.types.VariantPopulation
+:::gentropy.common.types.DataSourceType
diff --git a/docs/python_api/common/version_engine.md b/docs/python_api/common/version_engine.md
@@ -0,0 +1,12 @@
+---
+title: VersionEngine
+---
+
+**VersionEngine**:
+
+The version engine allows registering a datasource-specific version seeker class that retrieves the version of the datasource used as input to gentropy steps. It is currently implemented only for the GnomAD datasource.
+
+This class can then be used to automate the versioning of output directories.
+
+:::gentropy.common.version_engine.VersionEngine
+:::gentropy.common.version_engine.GnomADVersionSeeker
diff --git a/docs/python_api/datasets/l2g_prediction.md b/docs/python_api/datasets/l2g_prediction.md
@@ -6,4 +6,4 @@ title: L2G Prediction
 
 ## Schema
 
---8<-- "assets/schemas/l2g_prediction.md"
+--8<-- "assets/schemas/l2g_predictions.md"
diff --git a/docs/python_api/methods/clumping.md b/docs/python_api/methods/clumping.md
@@ -10,6 +10,7 @@ We have implemented two clumping methods:
 
 1. **Distance-based clumping:** Uses genomic window to clump the significant SNPs into one hit.
 2. **LD-based clumping:** Uses genomic window and LD to clump the significant SNPs into one hit.
+3. **Locus-breaker clumping:** Applies a distance cutoff between baseline significant SNPs. Returns the start and end position of the locus as well.
 
 The algorithmic logic is similar to classic clumping approaches from PLINK (Reference: [PLINK Clump Documentation](https://zzz.bwh.harvard.edu/plink/clump.shtml)). See details below:
 
@@ -20,3 +21,7 @@ The algorithmic logic is similar to classic clumping approaches from PLINK (Refe
 # LD-based clumping:
 
 ::: gentropy.method.clump.LDclumping
+
+# Locus-breaker clumping
+
+::: gentropy.method.locus_breaker_clumping.locus_breaker
diff --git a/docs/roadmap.md b/docs/roadmap.md
@@ -14,7 +14,7 @@ The Open Targets core team is working on refactoring Open Targets Genetics, aimi
 - Faster/robust addition of new datasets and datatypes
 - Reduce computational and financial cost
 
-See [here](https://github.com/opentargets/issues/issues?q=is%3Aissue+is%3Aopen+label%3AGenetics_ETL_refactoring) for a list of open issues for this project.
+See [here](https://github.com/opentargets/issues/issues?q=is%3Aissue+is%3Aopen+label%3Agentropy) for a list of open issues for this project.
 
 Schematic diagram representing the drafted process:
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,4 @@ src/airflow/logs/* @@
     !src/airflow/logs/.gitkeep
     site/
     .env
+    .coverage*
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,4 +6,4 @@ title: L2G Prediction

		## Schema

		--8<-- "assets/schemas/l2g_prediction.md"
		--8<-- "assets/schemas/l2g_predictions.md"