From 4bf218ff9c05963161aca353cff84040889de668 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Wed, 17 Jan 2024 16:31:01 +0000 Subject: [PATCH 01/12] docs: documentation link (#429) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0f530f498..849a8f6f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ packages = [{ include = "gentropy", from = "src" }] [tool.poetry.urls] "Bug Tracker" = "http://github.com/opentargets/issues" "Funding" = "https://www.opentargets.org" +"Documentation" = "https://opentargets.github.io/gentropy/" [tool.poetry.scripts] gentropy = "gentropy.cli:main" From 79d86c11bf63cc3f1e76c57f2f7661976c331c57 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Wed, 17 Jan 2024 16:33:03 +0000 Subject: [PATCH 02/12] docs: gentropy hero image (#430) --- README.md | 10 +- docs/assets/imgs/gentropy.svg | 292 ++++++++++++++++++++++++++++++++++ docs/index.md | 2 +- 3 files changed, 302 insertions(+), 2 deletions(-) create mode 100644 docs/assets/imgs/gentropy.svg diff --git a/README.md b/README.md index 8329c4c02..df661c60a 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,14 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) -# Genetics Portal Data Pipeline (experimental) +

+ +

+ - [Documentation](https://opentargets.github.io/gentropy/) diff --git a/docs/assets/imgs/gentropy.svg b/docs/assets/imgs/gentropy.svg new file mode 100644 index 000000000..9a0186638 --- /dev/null +++ b/docs/assets/imgs/gentropy.svg @@ -0,0 +1,292 @@ +[gentropy hero image: 292 lines of SVG vector markup omitted] diff --git a/docs/index.md b/docs/index.md index 2efc84814..8ef0073ad 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ hide: ---

- +

+ +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic leads. + +## Installation + +We recommend installing Open Targets Gentropy using PyPI: + +```bash +pip install gentropy +``` + +For alternative ways to install the package, visit the [Documentation](https://opentargets.github.io/gentropy/installation/). + +## References - [Documentation](https://opentargets.github.io/gentropy/) +- [Issue tracker](https://github.com/opentargets/issues/issues) + +## About Open Targets + +Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org). From d8aac4e0e1869cc21b4643554a963818bc6a44ab Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Wed, 17 Jan 2024 17:37:58 +0000 Subject: [PATCH 04/12] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 489c76e8c..c9bc9c5d9 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@

-Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic leads. +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. ## Installation From ac844d70bb5747d3674c8eba91f80831a12b20b0 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Wed, 17 Jan 2024 17:40:46 +0000 Subject: [PATCH 05/12] docs: update index.md --- docs/index.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index 8ef0073ad..c386e19dd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,16 +17,18 @@ hide: --- -[![status: experimental](https://github.com/GIScience/badges/raw/master/status/experimental.svg)](https://github.com/GIScience/badges#experimental) -![docs](https://github.com/opentargets/gentropy/actions/workflows/docs.yaml/badge.svg) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +[![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) [![codecov](https://codecov.io/gh/opentargets/gentropy/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/gentropy) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) --- -Ingestion and analysis of genetic and functional genomic data for the identification and prioritisation of drug targets. +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. -This project is still in experimental phase. Please refer to the [roadmap section](roadmap.md) for more information. -For all development information, including running the code, troubleshooting, or contributing, see the [development section](development/_development.md). +## About Open Targets + +Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org). 
+ From be46928d6104fa607eb6edab8b38d0411e43f296 Mon Sep 17 00:00:00 2001 From: Yakov Date: Thu, 18 Jan 2024 09:29:08 +0000 Subject: [PATCH 06/12] docs: release documentation preparation (#432) * docs: realease documentation preparation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/index.md | 2 -- docs/installation.md | 6 ++++- docs/python_api/_python_api.md | 5 ++++ docs/python_api/method/_method.md | 6 ++--- docs/python_api/method/clumping.md | 15 ++++++++++-- docs/python_api/method/l2g/_l2g.md | 23 +++++++++++++++++-- docs/python_api/method/pics.md | 8 +++++++ .../method/window_based_clumping.md | 5 ---- docs/python_api/step/_step.md | 2 +- 9 files changed, 55 insertions(+), 17 deletions(-) delete mode 100644 docs/python_api/method/window_based_clumping.md diff --git a/docs/index.md b/docs/index.md index c386e19dd..fb84698fb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,8 +27,6 @@ hide: Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. - ## About Open Targets Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org). - diff --git a/docs/installation.md b/docs/installation.md index bccb6684d..8e9c130bc 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -7,4 +7,8 @@ hide: # Installation -TBC +To install Open Targets Gentropy, we recommend using PyPI. You can install it using the following pip command: + +``` +pip install gentropy +``` diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md index e69de29bb..32ee92820 100644 --- a/docs/python_api/_python_api.md +++ b/docs/python_api/_python_api.md @@ -0,0 +1,5 @@ +--- +title: Open Targets Gentropy +--- + +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. diff --git a/docs/python_api/method/_method.md b/docs/python_api/method/_method.md index 94a1008c9..d6dc55ab4 100644 --- a/docs/python_api/method/_method.md +++ b/docs/python_api/method/_method.md @@ -1,7 +1,5 @@ --- -title: Method +title: Methods --- -# Method - -TBC +This section consists of all the methods available in the package. It provides detailed explanations and usage examples for each method. Developers can refer to this section to understand how to use the methods effectively in their code. The list of methods is constantly updated. 
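To make the installation instructions above easy to verify, here is a minimal post-install check. This is a sketch rather than part of the patched documentation: it assumes the Python 3.10 environment recommended later in this series and relies only on the `gentropy` console script declared in `pyproject.toml` in patch 01.

```bash
# Hypothetical post-install sanity check; not taken from the docs themselves.
pip install gentropy
python -c "import gentropy; print(gentropy.__name__)"   # the package imports cleanly
command -v gentropy                                      # console script from pyproject.toml is on PATH
```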
diff --git a/docs/python_api/method/clumping.md b/docs/python_api/method/clumping.md index ff996672f..6cc368013 100644 --- a/docs/python_api/method/clumping.md +++ b/docs/python_api/method/clumping.md @@ -4,8 +4,19 @@ title: Clumping # Clumping -Clumping is a commonly used post-processing method that allows for identification of independent association signals from GWAS summary statistics and curated associations. This process is critical because of the complex linkage disequilibrium (LD) structure in human populations, which can result in multiple statistically significant associations within the same genomic region. Clumping methods help reduce redundancy in GWAS results and ensure that each reported association represents an independent signal. +Clumping is a commonly used post-processing method that allows for the identification of independent association signals from GWAS summary statistics and curated associations. This process is critical because of the complex linkage disequilibrium (LD) structure in human populations, which can result in multiple statistically significant associations within the same genomic region. Clumping methods help reduce redundancy in GWAS results and ensure that each reported association represents an independent signal. -We have implemented 2 clumping methods: +We have implemented two clumping methods: + +1. **Distance-based clumping:** Uses genomic window to clump the significant SNPs into one hit. +2. **LD-based clumping:** Uses genomic window and LD to clump the significant SNPs into one hit. + +The algorithmic logic is similar to classic clumping approaches from PLINK (Reference: [PLINK Clump Documentation](https://zzz.bwh.harvard.edu/plink/clump.shtml)). See details below: + +# Distance-based clumping + +::: gentropy.method.window_based_clumping.WindowBasedClumping + +# LD-based clumping: ::: gentropy.method.clump.LDclumping diff --git a/docs/python_api/method/l2g/_l2g.md b/docs/python_api/method/l2g/_l2g.md index d62ab9588..fca3ba79d 100644 --- a/docs/python_api/method/l2g/_l2g.md +++ b/docs/python_api/method/l2g/_l2g.md @@ -1,5 +1,24 @@ --- -title: Locus to Gene (L2G) classifier +title: Locus to Gene (L2G) model --- -TBC +The **“locus-to-gene” (L2G)** model derives features to prioritize likely causal genes at each GWAS locus based on genetic and functional genomics features. The main categories of predictive features are: + +- **Distance:** (from credible set variants to gene) +- **Molecular QTL Colocalization** +- **Chromatin Interaction:** (e.g., promoter-capture Hi-C) +- **Variant Pathogenicity:** (from VEP) + +The L2G model is distinct from the variant-to-gene (V2G) pipeline in that it: + +- Uses a machine-learning model to learn the weights of each evidence source based on a gold standard of previously identified causal genes. +- Relies upon fine-mapping and colocalization data. + +Some of the predictive features weight variant-to-gene (or genomic region-to-gene) evidence based on the posterior probability that the variant is causal, determined through fine-mapping of the GWAS association. + +Details of the L2G model are provided in our Nature Genetics publication (ref - [Nature Genetics Publication](https://www.nature.com/articles/s41588-021-00945-5)): + +- **Title:** An open approach to systematically prioritize causal variants and genes at all published human GWAS trait-associated loci. +- **Authors:** Mountjoy, E., Schmidt, E.M., Carmona, M. et al. +- **Journal:** Nat Genet 53, 1527–1533 (2021). 
+- **DOI:** [10.1038/s41588-021-00945-5](https://doi.org/10.1038/s41588-021-00945-5) diff --git a/docs/python_api/method/pics.md b/docs/python_api/method/pics.md index f049ef91d..41de539a0 100644 --- a/docs/python_api/method/pics.md +++ b/docs/python_api/method/pics.md @@ -2,4 +2,12 @@ title: PICS --- +**PICS Overview:** + +PICS is a fine-mapping method designed to identify the most likely causal SNPs associated with a trait or disease within a genomic region. It leverages both haplotype information and the observed association patterns from genome-wide association studies (GWAS). + +Please refer to the original publication for in-depth details: [PICS Publication](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4336207/). + +We use PICS for both GWAS clumping results and GWAS curated studies. + :::gentropy.method.pics.PICS diff --git a/docs/python_api/method/window_based_clumping.md b/docs/python_api/method/window_based_clumping.md deleted file mode 100644 index ba8252774..000000000 --- a/docs/python_api/method/window_based_clumping.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Window-based clumping ---- - -:::gentropy.method.window_based_clumping.WindowBasedClumping diff --git a/docs/python_api/step/_step.md b/docs/python_api/step/_step.md index 987f31b91..e19fd5fcc 100644 --- a/docs/python_api/step/_step.md +++ b/docs/python_api/step/_step.md @@ -4,4 +4,4 @@ title: Step # Step -TBC +This section provides description for the `Step` class. Each `Step` uses its own set of Methods. From 8803387e3f643ab0aef63c2c82edd5ef757f0164 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Thu, 18 Jan 2024 11:17:17 +0000 Subject: [PATCH 07/12] docs: several enhancements on docs including index and installation (#433) --- README.md | 2 +- docs/development/airflow.md | 12 ++++-- docs/development/contributing.md | 2 +- docs/howto/run_step_using_config.md | 3 +- docs/index.md | 12 +++--- docs/installation.md | 22 ++++++++++- docs/python_api/_python_api.md | 9 ++++- docs/python_api/dataset/_dataset.md | 5 --- docs/python_api/datasets/_datasets.md | 7 ++++ .../{dataset => datasets}/colocalisation.md | 0 .../{dataset => datasets}/gene_index.md | 0 .../{dataset => datasets}/intervals.md | 0 .../{dataset => datasets}/l2g_feature.md | 0 .../l2g_feature_matrix.md | 0 .../l2g_gold_standard.md | 0 .../{dataset => datasets}/l2g_prediction.md | 0 .../{dataset => datasets}/ld_index.md | 0 .../{dataset => datasets}/study_index.md | 0 .../{dataset => datasets}/study_locus.md | 0 .../study_locus_overlap.md | 0 .../summary_statistics.md | 0 .../variant_annotation.md | 0 .../{dataset => datasets}/variant_index.md | 0 .../{dataset => datasets}/variant_to_gene.md | 0 docs/python_api/datasource/_datasource.md | 24 ------------ docs/python_api/datasources/_datasources.md | 37 +++++++++++++++++++ .../eqtl_catalogue/_eqtl_catalogue.md | 0 .../eqtl_catalogue/study_index.md | 0 .../eqtl_catalogue/summary_stats.md | 0 .../finngen/_finngen.md | 0 .../finngen/study_index.md | 0 .../gnomad/_gnomad.md | 0 .../gnomad/gnomad_ld.md | 0 .../gnomad/gnomad_variants.md | 0 .../gwas_catalog/_gwas_catalog.md | 0 .../gwas_catalog/associations.md | 0 .../gwas_catalog/study_index.md | 0 .../gwas_catalog/study_splitter.md | 0 .../gwas_catalog/summary_statistics.md | 0 .../intervals/_intervals.md | 0 .../intervals/andersson.md | 0 .../intervals/javierre.md | 0 .../intervals/jung.md | 0 .../intervals/thurman.md | 0 .../open_targets/_open_targets.md | 0 .../open_targets/l2g_gold_standard.md | 0 .../open_targets/target.md | 0 .../_method.md => 
methods/_methods.md} | 0 docs/python_api/{method => methods}/carma.md | 0 .../{method => methods}/clumping.md | 0 docs/python_api/{method => methods}/coloc.md | 0 .../python_api/{method => methods}/ecaviar.md | 0 .../{method => methods}/l2g/_l2g.md | 0 .../{method => methods}/l2g/evaluator.md | 0 .../l2g/feature_factory.md | 0 .../{method => methods}/l2g/model.md | 0 .../{method => methods}/l2g/trainer.md | 0 .../{method => methods}/ld_annotator.md | 0 docs/python_api/{method => methods}/pics.md | 0 docs/python_api/step/_step.md | 7 ---- docs/python_api/steps/_steps.md | 7 ++++ .../{step => steps}/colocalisation.md | 0 .../{step => steps}/eqtl_catalogue.md | 0 .../{step => steps}/finngen_studies.md | 0 .../finngen_sumstat_preprocess.md | 0 docs/python_api/{step => steps}/gene_index.md | 0 .../{step => steps}/gwas_catalog_curation.md | 0 .../{step => steps}/gwas_catalog_inclusion.md | 0 .../{step => steps}/gwas_catalog_ingestion.md | 0 .../gwas_catalog_sumstat_preprocess.md | 0 docs/python_api/{step => steps}/l2g.md | 0 docs/python_api/{step => steps}/ld_clump.md | 0 docs/python_api/{step => steps}/ld_index.md | 0 docs/python_api/{step => steps}/pics.md | 0 .../variant_annotation_step.md | 0 .../{step => steps}/variant_index_step.md | 0 .../{step => steps}/variant_to_gene_step.md | 0 .../{step => steps}/window_based_clumping.md | 0 mkdocs.yml | 2 +- pyproject.toml | 2 +- src/gentropy/dataset/dataset.py | 2 +- src/gentropy/dataset/study_locus_overlap.py | 1 + src/gentropy/ld_index.py | 1 + src/gentropy/method/colocalisation.py | 1 + src/gentropy/method/pics.py | 1 + 85 files changed, 104 insertions(+), 55 deletions(-) delete mode 100644 docs/python_api/dataset/_dataset.md create mode 100644 docs/python_api/datasets/_datasets.md rename docs/python_api/{dataset => datasets}/colocalisation.md (100%) rename docs/python_api/{dataset => datasets}/gene_index.md (100%) rename docs/python_api/{dataset => datasets}/intervals.md (100%) rename docs/python_api/{dataset => datasets}/l2g_feature.md (100%) rename docs/python_api/{dataset => datasets}/l2g_feature_matrix.md (100%) rename docs/python_api/{dataset => datasets}/l2g_gold_standard.md (100%) rename docs/python_api/{dataset => datasets}/l2g_prediction.md (100%) rename docs/python_api/{dataset => datasets}/ld_index.md (100%) rename docs/python_api/{dataset => datasets}/study_index.md (100%) rename docs/python_api/{dataset => datasets}/study_locus.md (100%) rename docs/python_api/{dataset => datasets}/study_locus_overlap.md (100%) rename docs/python_api/{dataset => datasets}/summary_statistics.md (100%) rename docs/python_api/{dataset => datasets}/variant_annotation.md (100%) rename docs/python_api/{dataset => datasets}/variant_index.md (100%) rename docs/python_api/{dataset => datasets}/variant_to_gene.md (100%) delete mode 100644 docs/python_api/datasource/_datasource.md create mode 100644 docs/python_api/datasources/_datasources.md rename docs/python_api/{datasource => datasources}/eqtl_catalogue/_eqtl_catalogue.md (100%) rename docs/python_api/{datasource => datasources}/eqtl_catalogue/study_index.md (100%) rename docs/python_api/{datasource => datasources}/eqtl_catalogue/summary_stats.md (100%) rename docs/python_api/{datasource => datasources}/finngen/_finngen.md (100%) rename docs/python_api/{datasource => datasources}/finngen/study_index.md (100%) rename docs/python_api/{datasource => datasources}/gnomad/_gnomad.md (100%) rename docs/python_api/{datasource => datasources}/gnomad/gnomad_ld.md (100%) rename docs/python_api/{datasource => 
datasources}/gnomad/gnomad_variants.md (100%) rename docs/python_api/{datasource => datasources}/gwas_catalog/_gwas_catalog.md (100%) rename docs/python_api/{datasource => datasources}/gwas_catalog/associations.md (100%) rename docs/python_api/{datasource => datasources}/gwas_catalog/study_index.md (100%) rename docs/python_api/{datasource => datasources}/gwas_catalog/study_splitter.md (100%) rename docs/python_api/{datasource => datasources}/gwas_catalog/summary_statistics.md (100%) rename docs/python_api/{datasource => datasources}/intervals/_intervals.md (100%) rename docs/python_api/{datasource => datasources}/intervals/andersson.md (100%) rename docs/python_api/{datasource => datasources}/intervals/javierre.md (100%) rename docs/python_api/{datasource => datasources}/intervals/jung.md (100%) rename docs/python_api/{datasource => datasources}/intervals/thurman.md (100%) rename docs/python_api/{datasource => datasources}/open_targets/_open_targets.md (100%) rename docs/python_api/{datasource => datasources}/open_targets/l2g_gold_standard.md (100%) rename docs/python_api/{datasource => datasources}/open_targets/target.md (100%) rename docs/python_api/{method/_method.md => methods/_methods.md} (100%) rename docs/python_api/{method => methods}/carma.md (100%) rename docs/python_api/{method => methods}/clumping.md (100%) rename docs/python_api/{method => methods}/coloc.md (100%) rename docs/python_api/{method => methods}/ecaviar.md (100%) rename docs/python_api/{method => methods}/l2g/_l2g.md (100%) rename docs/python_api/{method => methods}/l2g/evaluator.md (100%) rename docs/python_api/{method => methods}/l2g/feature_factory.md (100%) rename docs/python_api/{method => methods}/l2g/model.md (100%) rename docs/python_api/{method => methods}/l2g/trainer.md (100%) rename docs/python_api/{method => methods}/ld_annotator.md (100%) rename docs/python_api/{method => methods}/pics.md (100%) delete mode 100644 docs/python_api/step/_step.md create mode 100644 docs/python_api/steps/_steps.md rename docs/python_api/{step => steps}/colocalisation.md (100%) rename docs/python_api/{step => steps}/eqtl_catalogue.md (100%) rename docs/python_api/{step => steps}/finngen_studies.md (100%) rename docs/python_api/{step => steps}/finngen_sumstat_preprocess.md (100%) rename docs/python_api/{step => steps}/gene_index.md (100%) rename docs/python_api/{step => steps}/gwas_catalog_curation.md (100%) rename docs/python_api/{step => steps}/gwas_catalog_inclusion.md (100%) rename docs/python_api/{step => steps}/gwas_catalog_ingestion.md (100%) rename docs/python_api/{step => steps}/gwas_catalog_sumstat_preprocess.md (100%) rename docs/python_api/{step => steps}/l2g.md (100%) rename docs/python_api/{step => steps}/ld_clump.md (100%) rename docs/python_api/{step => steps}/ld_index.md (100%) rename docs/python_api/{step => steps}/pics.md (100%) rename docs/python_api/{step => steps}/variant_annotation_step.md (100%) rename docs/python_api/{step => steps}/variant_index_step.md (100%) rename docs/python_api/{step => steps}/variant_to_gene_step.md (100%) rename docs/python_api/{step => steps}/window_based_clumping.md (100%) diff --git a/README.md b/README.md index c9bc9c5d9..7d5a1d410 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy)

- +

Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. diff --git a/docs/development/airflow.md b/docs/development/airflow.md index b73ad614e..ff5f7906c 100644 --- a/docs/development/airflow.md +++ b/docs/development/airflow.md @@ -8,12 +8,14 @@ This section describes how to set up a local Airflow server which will orchestra - [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) !!! warning macOS Docker memory allocation -On macOS, the default amount of memory available for Docker might not be enough to get Airflow up and running. Allocate at least 4GB of memory for the Docker Engine (ideally 8GB). [More info](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#) + + On macOS, the default amount of memory available for Docker might not be enough to get Airflow up and running. Allocate at least 4GB of memory for the Docker Engine (ideally 8GB). [More info](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#) ## Configure Airflow access to Google Cloud Platform !!! warning Specifying Google Cloud parameters -Run the next two command with the appropriate Google Cloud project ID and service account name to ensure the correct Google default application credentials are set up. + + Run the next two command with the appropriate Google Cloud project ID and service account name to ensure the correct Google default application credentials are set up. Authenticate to Google Cloud: @@ -38,7 +40,8 @@ cd src/airflow ### Build Docker image !!! note Custom Docker image for Airflow -The custom Dockerfile built by the command below extends the official [Airflow Docker Compose YAML](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml). We add support for Google Cloud SDK, Google Dataproc operators, and access to GCP credentials. + + The custom Dockerfile built by the command below extends the official [Airflow Docker Compose YAML](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml). We add support for Google Cloud SDK, Google Dataproc operators, and access to GCP credentials. ```bash docker build . --tag extending_airflow:latest @@ -47,7 +50,8 @@ docker build . --tag extending_airflow:latest ### Set Airflow user ID !!! note Setting Airflow user ID -These commands allow Airflow running inside Docker to access the credentials file which was generated earlier. + + These commands allow Airflow running inside Docker to access the credentials file which was generated earlier. ```bash # If any user ID is already specified in .env, remove it. diff --git a/docs/development/contributing.md b/docs/development/contributing.md index 1b4e5451f..a12ac4951 100644 --- a/docs/development/contributing.md +++ b/docs/development/contributing.md @@ -8,7 +8,7 @@ title: Contributing guidelines The steps in this section only ever need to be done once on any particular system. -Google Cloud configuration: +For Google Cloud configuration: 1. Install Google Cloud SDK: https://cloud.google.com/sdk/docs/install. 
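For context, the Airflow hunk above refers to running "the next two commands" to set up Google default application credentials, but the commands themselves fall outside the diff context. A typical pairing would look roughly like the sketch below; this is an assumption for orientation only, the project ID is a placeholder, and the exact commands live in the unchanged part of airflow.md.

```bash
# Illustrative only: authenticate and create application-default credentials.
# <PROJECT_ID> is a placeholder, not a value from this repository.
gcloud auth login
gcloud config set project <PROJECT_ID>
gcloud auth application-default login
```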
diff --git a/docs/howto/run_step_using_config.md b/docs/howto/run_step_using_config.md index 926cdbf45..bbce9a195 100644 --- a/docs/howto/run_step_using_config.md +++ b/docs/howto/run_step_using_config.md @@ -7,7 +7,8 @@ Title: Run step using config It's possible to parametrise the functionality of a step using a YAML configuration file. This is useful when you want to run a step multiple times with different parameters or simply to avoid having to specify the same parameters every time you run a step. !!! info Configuration files using Hydra -The package uses [Hydra](https://hydra.cc) to handle configuration files. For more information, please visit the [Hydra documentation](https://hydra.cc/docs/intro/). + + The package uses [Hydra](https://hydra.cc) to handle configuration files. For more information, please visit the [Hydra documentation](https://hydra.cc/docs/intro/). To run a step using a configuration file, you need to create a configuration file in YAML format. diff --git a/docs/index.md b/docs/index.md index fb84698fb..31762c17e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,13 +1,13 @@ --- -title: Open Targets Genetics +title: Open Targets Gentropy hide: - navigation - toc --- -

- -

+
+ + ---- +
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) @@ -25,7 +25,7 @@ hide: --- -Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. This package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. ## About Open Targets diff --git a/docs/installation.md b/docs/installation.md index 8e9c130bc..07c5493f0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -7,8 +7,26 @@ hide: # Installation -To install Open Targets Gentropy, we recommend using PyPI. You can install it using the following pip command: +!!! note Python compatibility -``` + In the early stages of development, we are using Python version 3.10. We recommend using [pyenv](https://github.com/pyenv/pyenv) or similar tools to manage your local Python version. We intend to support more Python versions in the future. + +## Pypi + +We recommend installing Open Targets Gentropy using Pypi: + +```bash pip install gentropy ``` + +## Source + +Alternatively, you can install Open Targets Gentropy from source. Check the [contributing](development/contributing.md) section for more information. + +## Uninstall + +```bash +pip uninstall gentropy -y +``` + +For any issues with the installation, check the [troubleshooting section](development/troubleshooting.md). diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md index 32ee92820..02c1941af 100644 --- a/docs/python_api/_python_api.md +++ b/docs/python_api/_python_api.md @@ -1,5 +1,12 @@ --- -title: Open Targets Gentropy +title: Python API --- Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. + +The Overall architecture of the package distinguishes between: + +- [**Data Sources**](datasources/_datasources.md): data sources harmonisation tools +- [**Datasets**](datasets/_datasets.md): data model +- [**Methods**](methods/_methods.md): statistical analysis tools +- [**Steps**](steps/_steps.md): pipeline steps diff --git a/docs/python_api/dataset/_dataset.md b/docs/python_api/dataset/_dataset.md deleted file mode 100644 index 4f2fdb6f6..000000000 --- a/docs/python_api/dataset/_dataset.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Dataset ---- - -::: gentropy.dataset.dataset.Dataset diff --git a/docs/python_api/datasets/_datasets.md b/docs/python_api/datasets/_datasets.md new file mode 100644 index 000000000..058ab5fa2 --- /dev/null +++ b/docs/python_api/datasets/_datasets.md @@ -0,0 +1,7 @@ +--- +title: Datasets +--- + +The Dataset classes define the data model behind Open Targets Gentropy. Every class inherits from the `Dataset` class and contains a dataframe with a predefined schema that can be found in the respective classes. 
+ +::: gentropy.dataset.dataset.Dataset diff --git a/docs/python_api/dataset/colocalisation.md b/docs/python_api/datasets/colocalisation.md similarity index 100% rename from docs/python_api/dataset/colocalisation.md rename to docs/python_api/datasets/colocalisation.md diff --git a/docs/python_api/dataset/gene_index.md b/docs/python_api/datasets/gene_index.md similarity index 100% rename from docs/python_api/dataset/gene_index.md rename to docs/python_api/datasets/gene_index.md diff --git a/docs/python_api/dataset/intervals.md b/docs/python_api/datasets/intervals.md similarity index 100% rename from docs/python_api/dataset/intervals.md rename to docs/python_api/datasets/intervals.md diff --git a/docs/python_api/dataset/l2g_feature.md b/docs/python_api/datasets/l2g_feature.md similarity index 100% rename from docs/python_api/dataset/l2g_feature.md rename to docs/python_api/datasets/l2g_feature.md diff --git a/docs/python_api/dataset/l2g_feature_matrix.md b/docs/python_api/datasets/l2g_feature_matrix.md similarity index 100% rename from docs/python_api/dataset/l2g_feature_matrix.md rename to docs/python_api/datasets/l2g_feature_matrix.md diff --git a/docs/python_api/dataset/l2g_gold_standard.md b/docs/python_api/datasets/l2g_gold_standard.md similarity index 100% rename from docs/python_api/dataset/l2g_gold_standard.md rename to docs/python_api/datasets/l2g_gold_standard.md diff --git a/docs/python_api/dataset/l2g_prediction.md b/docs/python_api/datasets/l2g_prediction.md similarity index 100% rename from docs/python_api/dataset/l2g_prediction.md rename to docs/python_api/datasets/l2g_prediction.md diff --git a/docs/python_api/dataset/ld_index.md b/docs/python_api/datasets/ld_index.md similarity index 100% rename from docs/python_api/dataset/ld_index.md rename to docs/python_api/datasets/ld_index.md diff --git a/docs/python_api/dataset/study_index.md b/docs/python_api/datasets/study_index.md similarity index 100% rename from docs/python_api/dataset/study_index.md rename to docs/python_api/datasets/study_index.md diff --git a/docs/python_api/dataset/study_locus.md b/docs/python_api/datasets/study_locus.md similarity index 100% rename from docs/python_api/dataset/study_locus.md rename to docs/python_api/datasets/study_locus.md diff --git a/docs/python_api/dataset/study_locus_overlap.md b/docs/python_api/datasets/study_locus_overlap.md similarity index 100% rename from docs/python_api/dataset/study_locus_overlap.md rename to docs/python_api/datasets/study_locus_overlap.md diff --git a/docs/python_api/dataset/summary_statistics.md b/docs/python_api/datasets/summary_statistics.md similarity index 100% rename from docs/python_api/dataset/summary_statistics.md rename to docs/python_api/datasets/summary_statistics.md diff --git a/docs/python_api/dataset/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md similarity index 100% rename from docs/python_api/dataset/variant_annotation.md rename to docs/python_api/datasets/variant_annotation.md diff --git a/docs/python_api/dataset/variant_index.md b/docs/python_api/datasets/variant_index.md similarity index 100% rename from docs/python_api/dataset/variant_index.md rename to docs/python_api/datasets/variant_index.md diff --git a/docs/python_api/dataset/variant_to_gene.md b/docs/python_api/datasets/variant_to_gene.md similarity index 100% rename from docs/python_api/dataset/variant_to_gene.md rename to docs/python_api/datasets/variant_to_gene.md diff --git a/docs/python_api/datasource/_datasource.md 
b/docs/python_api/datasource/_datasource.md deleted file mode 100644 index 9fab444bf..000000000 --- a/docs/python_api/datasource/_datasource.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Data Source ---- - -# Data Source - -This section contains information about the data sources used in Open Targets Genetics. - -We use GnomAD v4.0 as a source for variant annotation and GnomAD v2.1.1 as a source for linkage disequilibrium (LD) information (described in the **GnomAD** section). - -We rely on Open Targets as a source for the list of targets and the Gold Standard training set (described in the **Open Targets** section). - -## Study Sources - -1. GWAS catalog -2. FinnGen - -## Molecular QTLs - -1. eQTL catalogue - -## Interaction / Interval-based Experiments - -We integrate a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications. For more detils see section **"Intervals"**. diff --git a/docs/python_api/datasources/_datasources.md new file mode 100644 index 000000000..c06bef177 --- /dev/null +++ b/docs/python_api/datasources/_datasources.md @@ -0,0 +1,37 @@ +--- +title: Data Sources +--- + +# Data Sources + +This section contains information about the data source harmonisation tools available in Open Targets Gentropy. + +## GWAS study sources + +1. [GWAS catalog](gwas_catalog/_gwas_catalog.md) (with or without full summary statistics) +1. [FinnGen](finngen/_finngen.md) + +## Molecular QTLs + +1. [GTEx (eQTL catalogue)](eqtl_catalogue/_eqtl_catalogue.md) + +## Interaction / Interval-based Experiments + +1. [Intervals](intervals/_intervals.md)-based datasets, informing about the relationships between genetic elements and their functional implications. + +## Variant annotation/validation + +1. [GnomAD](gnomad/_gnomad.md) v4.0 +1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data) + +## Linkage disequilibrium + +1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrices (7 ancestries) + +## Locus-to-gene gold standard + +1. [Open Targets training set](open_targets/l2g_gold_standard.md) + +## Gene annotation + +1.
[Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl) diff --git a/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md b/docs/python_api/datasources/eqtl_catalogue/_eqtl_catalogue.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md rename to docs/python_api/datasources/eqtl_catalogue/_eqtl_catalogue.md diff --git a/docs/python_api/datasource/eqtl_catalogue/study_index.md b/docs/python_api/datasources/eqtl_catalogue/study_index.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/study_index.md rename to docs/python_api/datasources/eqtl_catalogue/study_index.md diff --git a/docs/python_api/datasource/eqtl_catalogue/summary_stats.md b/docs/python_api/datasources/eqtl_catalogue/summary_stats.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/summary_stats.md rename to docs/python_api/datasources/eqtl_catalogue/summary_stats.md diff --git a/docs/python_api/datasource/finngen/_finngen.md b/docs/python_api/datasources/finngen/_finngen.md similarity index 100% rename from docs/python_api/datasource/finngen/_finngen.md rename to docs/python_api/datasources/finngen/_finngen.md diff --git a/docs/python_api/datasource/finngen/study_index.md b/docs/python_api/datasources/finngen/study_index.md similarity index 100% rename from docs/python_api/datasource/finngen/study_index.md rename to docs/python_api/datasources/finngen/study_index.md diff --git a/docs/python_api/datasource/gnomad/_gnomad.md b/docs/python_api/datasources/gnomad/_gnomad.md similarity index 100% rename from docs/python_api/datasource/gnomad/_gnomad.md rename to docs/python_api/datasources/gnomad/_gnomad.md diff --git a/docs/python_api/datasource/gnomad/gnomad_ld.md b/docs/python_api/datasources/gnomad/gnomad_ld.md similarity index 100% rename from docs/python_api/datasource/gnomad/gnomad_ld.md rename to docs/python_api/datasources/gnomad/gnomad_ld.md diff --git a/docs/python_api/datasource/gnomad/gnomad_variants.md b/docs/python_api/datasources/gnomad/gnomad_variants.md similarity index 100% rename from docs/python_api/datasource/gnomad/gnomad_variants.md rename to docs/python_api/datasources/gnomad/gnomad_variants.md diff --git a/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md b/docs/python_api/datasources/gwas_catalog/_gwas_catalog.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/_gwas_catalog.md rename to docs/python_api/datasources/gwas_catalog/_gwas_catalog.md diff --git a/docs/python_api/datasource/gwas_catalog/associations.md b/docs/python_api/datasources/gwas_catalog/associations.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/associations.md rename to docs/python_api/datasources/gwas_catalog/associations.md diff --git a/docs/python_api/datasource/gwas_catalog/study_index.md b/docs/python_api/datasources/gwas_catalog/study_index.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/study_index.md rename to docs/python_api/datasources/gwas_catalog/study_index.md diff --git a/docs/python_api/datasource/gwas_catalog/study_splitter.md b/docs/python_api/datasources/gwas_catalog/study_splitter.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/study_splitter.md rename to docs/python_api/datasources/gwas_catalog/study_splitter.md diff --git a/docs/python_api/datasource/gwas_catalog/summary_statistics.md b/docs/python_api/datasources/gwas_catalog/summary_statistics.md similarity index 
100% rename from docs/python_api/datasource/gwas_catalog/summary_statistics.md rename to docs/python_api/datasources/gwas_catalog/summary_statistics.md diff --git a/docs/python_api/datasource/intervals/_intervals.md b/docs/python_api/datasources/intervals/_intervals.md similarity index 100% rename from docs/python_api/datasource/intervals/_intervals.md rename to docs/python_api/datasources/intervals/_intervals.md diff --git a/docs/python_api/datasource/intervals/andersson.md b/docs/python_api/datasources/intervals/andersson.md similarity index 100% rename from docs/python_api/datasource/intervals/andersson.md rename to docs/python_api/datasources/intervals/andersson.md diff --git a/docs/python_api/datasource/intervals/javierre.md b/docs/python_api/datasources/intervals/javierre.md similarity index 100% rename from docs/python_api/datasource/intervals/javierre.md rename to docs/python_api/datasources/intervals/javierre.md diff --git a/docs/python_api/datasource/intervals/jung.md b/docs/python_api/datasources/intervals/jung.md similarity index 100% rename from docs/python_api/datasource/intervals/jung.md rename to docs/python_api/datasources/intervals/jung.md diff --git a/docs/python_api/datasource/intervals/thurman.md b/docs/python_api/datasources/intervals/thurman.md similarity index 100% rename from docs/python_api/datasource/intervals/thurman.md rename to docs/python_api/datasources/intervals/thurman.md diff --git a/docs/python_api/datasource/open_targets/_open_targets.md b/docs/python_api/datasources/open_targets/_open_targets.md similarity index 100% rename from docs/python_api/datasource/open_targets/_open_targets.md rename to docs/python_api/datasources/open_targets/_open_targets.md diff --git a/docs/python_api/datasource/open_targets/l2g_gold_standard.md b/docs/python_api/datasources/open_targets/l2g_gold_standard.md similarity index 100% rename from docs/python_api/datasource/open_targets/l2g_gold_standard.md rename to docs/python_api/datasources/open_targets/l2g_gold_standard.md diff --git a/docs/python_api/datasource/open_targets/target.md b/docs/python_api/datasources/open_targets/target.md similarity index 100% rename from docs/python_api/datasource/open_targets/target.md rename to docs/python_api/datasources/open_targets/target.md diff --git a/docs/python_api/method/_method.md b/docs/python_api/methods/_methods.md similarity index 100% rename from docs/python_api/method/_method.md rename to docs/python_api/methods/_methods.md diff --git a/docs/python_api/method/carma.md b/docs/python_api/methods/carma.md similarity index 100% rename from docs/python_api/method/carma.md rename to docs/python_api/methods/carma.md diff --git a/docs/python_api/method/clumping.md b/docs/python_api/methods/clumping.md similarity index 100% rename from docs/python_api/method/clumping.md rename to docs/python_api/methods/clumping.md diff --git a/docs/python_api/method/coloc.md b/docs/python_api/methods/coloc.md similarity index 100% rename from docs/python_api/method/coloc.md rename to docs/python_api/methods/coloc.md diff --git a/docs/python_api/method/ecaviar.md b/docs/python_api/methods/ecaviar.md similarity index 100% rename from docs/python_api/method/ecaviar.md rename to docs/python_api/methods/ecaviar.md diff --git a/docs/python_api/method/l2g/_l2g.md b/docs/python_api/methods/l2g/_l2g.md similarity index 100% rename from docs/python_api/method/l2g/_l2g.md rename to docs/python_api/methods/l2g/_l2g.md diff --git a/docs/python_api/method/l2g/evaluator.md 
b/docs/python_api/methods/l2g/evaluator.md similarity index 100% rename from docs/python_api/method/l2g/evaluator.md rename to docs/python_api/methods/l2g/evaluator.md diff --git a/docs/python_api/method/l2g/feature_factory.md b/docs/python_api/methods/l2g/feature_factory.md similarity index 100% rename from docs/python_api/method/l2g/feature_factory.md rename to docs/python_api/methods/l2g/feature_factory.md diff --git a/docs/python_api/method/l2g/model.md b/docs/python_api/methods/l2g/model.md similarity index 100% rename from docs/python_api/method/l2g/model.md rename to docs/python_api/methods/l2g/model.md diff --git a/docs/python_api/method/l2g/trainer.md b/docs/python_api/methods/l2g/trainer.md similarity index 100% rename from docs/python_api/method/l2g/trainer.md rename to docs/python_api/methods/l2g/trainer.md diff --git a/docs/python_api/method/ld_annotator.md b/docs/python_api/methods/ld_annotator.md similarity index 100% rename from docs/python_api/method/ld_annotator.md rename to docs/python_api/methods/ld_annotator.md diff --git a/docs/python_api/method/pics.md b/docs/python_api/methods/pics.md similarity index 100% rename from docs/python_api/method/pics.md rename to docs/python_api/methods/pics.md diff --git a/docs/python_api/step/_step.md b/docs/python_api/step/_step.md deleted file mode 100644 index e19fd5fcc..000000000 --- a/docs/python_api/step/_step.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Step ---- - -# Step - -This section provides description for the `Step` class. Each `Step` uses its own set of Methods. diff --git a/docs/python_api/steps/_steps.md b/docs/python_api/steps/_steps.md new file mode 100644 index 000000000..9c6d7679c --- /dev/null +++ b/docs/python_api/steps/_steps.md @@ -0,0 +1,7 @@ +--- +title: Step +--- + +# Step + +This section provides description for the `Step` class. Each `Step` uses its own set of Methods and Datasets and implements the logic necessary to read a set of inputs, perform the transformation and write the outputs. All steps are available through the command line interface when running the `gentropy` command. 
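To illustrate the last sentence above, invoking a step from the command line would look something like the sketch below. The Hydra-style `step=` override mirrors the `config/step/*.yaml` files elsewhere in this patch series, but the exact CLI grammar is an assumption rather than something documented here.

```bash
# Hypothetical CLI invocation of a single step; the step name and parameter keys
# are assumptions modelled on the Hydra config files in this patch series.
gentropy step=<step_name> step.<parameter>=<value>
```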
diff --git a/docs/python_api/step/colocalisation.md b/docs/python_api/steps/colocalisation.md similarity index 100% rename from docs/python_api/step/colocalisation.md rename to docs/python_api/steps/colocalisation.md diff --git a/docs/python_api/step/eqtl_catalogue.md b/docs/python_api/steps/eqtl_catalogue.md similarity index 100% rename from docs/python_api/step/eqtl_catalogue.md rename to docs/python_api/steps/eqtl_catalogue.md diff --git a/docs/python_api/step/finngen_studies.md b/docs/python_api/steps/finngen_studies.md similarity index 100% rename from docs/python_api/step/finngen_studies.md rename to docs/python_api/steps/finngen_studies.md diff --git a/docs/python_api/step/finngen_sumstat_preprocess.md b/docs/python_api/steps/finngen_sumstat_preprocess.md similarity index 100% rename from docs/python_api/step/finngen_sumstat_preprocess.md rename to docs/python_api/steps/finngen_sumstat_preprocess.md diff --git a/docs/python_api/step/gene_index.md b/docs/python_api/steps/gene_index.md similarity index 100% rename from docs/python_api/step/gene_index.md rename to docs/python_api/steps/gene_index.md diff --git a/docs/python_api/step/gwas_catalog_curation.md b/docs/python_api/steps/gwas_catalog_curation.md similarity index 100% rename from docs/python_api/step/gwas_catalog_curation.md rename to docs/python_api/steps/gwas_catalog_curation.md diff --git a/docs/python_api/step/gwas_catalog_inclusion.md b/docs/python_api/steps/gwas_catalog_inclusion.md similarity index 100% rename from docs/python_api/step/gwas_catalog_inclusion.md rename to docs/python_api/steps/gwas_catalog_inclusion.md diff --git a/docs/python_api/step/gwas_catalog_ingestion.md b/docs/python_api/steps/gwas_catalog_ingestion.md similarity index 100% rename from docs/python_api/step/gwas_catalog_ingestion.md rename to docs/python_api/steps/gwas_catalog_ingestion.md diff --git a/docs/python_api/step/gwas_catalog_sumstat_preprocess.md b/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md similarity index 100% rename from docs/python_api/step/gwas_catalog_sumstat_preprocess.md rename to docs/python_api/steps/gwas_catalog_sumstat_preprocess.md diff --git a/docs/python_api/step/l2g.md b/docs/python_api/steps/l2g.md similarity index 100% rename from docs/python_api/step/l2g.md rename to docs/python_api/steps/l2g.md diff --git a/docs/python_api/step/ld_clump.md b/docs/python_api/steps/ld_clump.md similarity index 100% rename from docs/python_api/step/ld_clump.md rename to docs/python_api/steps/ld_clump.md diff --git a/docs/python_api/step/ld_index.md b/docs/python_api/steps/ld_index.md similarity index 100% rename from docs/python_api/step/ld_index.md rename to docs/python_api/steps/ld_index.md diff --git a/docs/python_api/step/pics.md b/docs/python_api/steps/pics.md similarity index 100% rename from docs/python_api/step/pics.md rename to docs/python_api/steps/pics.md diff --git a/docs/python_api/step/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md similarity index 100% rename from docs/python_api/step/variant_annotation_step.md rename to docs/python_api/steps/variant_annotation_step.md diff --git a/docs/python_api/step/variant_index_step.md b/docs/python_api/steps/variant_index_step.md similarity index 100% rename from docs/python_api/step/variant_index_step.md rename to docs/python_api/steps/variant_index_step.md diff --git a/docs/python_api/step/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md similarity index 100% rename from 
docs/python_api/step/variant_to_gene_step.md rename to docs/python_api/steps/variant_to_gene_step.md diff --git a/docs/python_api/step/window_based_clumping.md b/docs/python_api/steps/window_based_clumping.md similarity index 100% rename from docs/python_api/step/window_based_clumping.md rename to docs/python_api/steps/window_based_clumping.md diff --git a/mkdocs.yml b/mkdocs.yml index ad928ec7b..d25076a73 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Open Targets Genetics +site_name: Open Targets Gentropy nav: - Home: index.md diff --git a/pyproject.toml b/pyproject.toml index 849a8f6f7..30713de46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "gentropy" # !! version is managed by semantic_release version = "0.0.0" -description = "Open targets Genetics Portal Python ETL" +description = "Open Targets python framework for post-GWAS analysis" authors = ["Open Targets core team"] license = "Apache-2.0" readme = "README.md" diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py index 5a282ce35..401c5d6c6 100644 --- a/src/gentropy/dataset/dataset.py +++ b/src/gentropy/dataset/dataset.py @@ -18,7 +18,7 @@ @dataclass class Dataset(ABC): - """Open Targets Genetics Dataset. + """Open Targets Gentropy Dataset. `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `schemas` module. """ diff --git a/src/gentropy/dataset/study_locus_overlap.py b/src/gentropy/dataset/study_locus_overlap.py index ee1e81b32..5f839bd9c 100644 --- a/src/gentropy/dataset/study_locus_overlap.py +++ b/src/gentropy/dataset/study_locus_overlap.py @@ -21,6 +21,7 @@ class StudyLocusOverlap(Dataset): This dataset captures pairs of overlapping `StudyLocus`: that is associations whose credible sets share at least one tagging variant. !!! note + This is a helpful dataset for other downstream analyses, such as colocalisation. This dataset will contain the overlapping signals between studyLocus associations once they have been clumped and fine-mapped. """ diff --git a/src/gentropy/ld_index.py b/src/gentropy/ld_index.py index dfdd90306..cb260977d 100644 --- a/src/gentropy/ld_index.py +++ b/src/gentropy/ld_index.py @@ -11,6 +11,7 @@ class LDIndexStep: """LD index step. !!! warning "This step is resource intensive" + Suggested params: high memory machine, 5TB of boot disk, no SSDs. """ diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py index b0699b88f..fd56398f0 100644 --- a/src/gentropy/method/colocalisation.py +++ b/src/gentropy/method/colocalisation.py @@ -100,6 +100,7 @@ class Coloc: | H4 | both traits are associated and share the same single causal variant | !!! warning "Bayes factors required" + Coloc requires the availability of Bayes factors (BF) for each variant in the credible set (`logBF` column). """ diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py index bcf17280a..e5ed5f2c6 100644 --- a/src/gentropy/method/pics.py +++ b/src/gentropy/method/pics.py @@ -185,6 +185,7 @@ def finemap( """Run PICS on a study locus. !!! info "Study locus needs to be LD annotated" + The study locus needs to be LD annotated before PICS can be calculated. 
Args: From 2d8b08bf6671210618f4cb027c2e805210bca795 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Thu, 18 Jan 2024 11:22:52 +0000 Subject: [PATCH 08/12] chore: tidying up gwas catalog ingestion and process configuration (#426) * chore: updaing configs to the propsed release folder structure * chore: configuration of GWAS Catalog ingestion clean up * chore: updatig more gwas catalog related configs * chore: finalising dags * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * chore: finalising dag config * refactor: reverting ad-hoc changes * chore: truning full DAG on * fix: docs updated so mkdocs won't fail --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- config/datasets/ot_gcp.yaml | 35 ++++-- config/step/ot_gwas_catalog_ingestion.yaml | 14 +-- .../step/ot_gwas_catalog_study_curation.yaml | 6 +- .../step/ot_gwas_catalog_study_inclusion.yaml | 8 +- config/step/ot_ld_based_clumping.yaml | 2 +- docs/python_api/steps/ld_clump.md | 2 +- .../dags/gwas_catalog_harmonisation.py | 21 ++-- src/airflow/dags/gwas_catalog_preprocess.py | 93 ++++++++++------ src/gentropy/config.py | 4 +- src/gentropy/ld_based_clumping.py | 2 +- utils/update_GWAS_Catalog_data.sh | 102 +++++++++++++----- 11 files changed, 193 insertions(+), 96 deletions(-) diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 8e588ec62..93630c3a3 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,10 +1,33 @@ # Release specific configuration: release_version: "24.01" +version: "XX.XX" release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.version} +## Datasets: +gwas_catalog_dataset: gs://gwas_catalog_data +# Ingestion input files: +gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv +gwas_catalog_studies: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_studies.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_studies.tsv +gwas_catalog_ancestries: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_ancestries.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_ancestries.tsv +gwas_catalog_sumstats_lut: ${datasets.gwas_catalog_dataset}/curated_inputs/harmonised_list.txt +gwas_catalog_study_curation: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_study_curation.tsv +# Harmonised summary statistics list: +gwas_catalog_summary_stats_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_harmonised_sumstats_list.txt +# Inclusion lists: +gwas_catalog_curated_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_curated_included_studies +gwas_catalog_summary_satistics_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_summary_statistics_included_studies +# Ingestion output folders: +gwas_catalog_study_index: ${datasets.gwas_catalog_dataset}/study_index +gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets +gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets + # Input datasets chain_37_38: ${datasets.inputs}/v2g_input/grch37_to_grch38.over.chain target_index: ${datasets.inputs}/v2g_input/targets_correct_tss @@ -13,16 
+36,6 @@ anderson: gs://genetics-portal-input/v2g_input/andersson2014/enhancer_tss_associ javierre: gs://genetics-portal-input/v2g_input/javierre_2016_preprocessed.parquet jung: gs://genetics-portal-raw/pchic_jung2019/jung2019_pchic_tableS3.csv thurman: gs://genetics-portal-input/v2g_input/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz -catalog_associations: ${datasets.inputs}/v2d/gwas_catalog_v1.0.2-associations_e110_r2023-12-21.tsv -catalog_studies: - # To get a complete representation of all GWAS Catalog studies, we need to - # ingest the list of unpublished studies from a different file. - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-studies-r2023-12-21.tsv -catalog_ancestries: - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-ancestries-r2023-12-21.tsv -catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-12-21.txt gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv @@ -37,7 +50,7 @@ study_locus_overlap: ${datasets.outputs}/study_locus_overlap ld_index: ${datasets.outputs}/ld_index catalog_study_index: ${datasets.study_index}/catalog catalog_study_locus: ${datasets.study_locus}/catalog_study_locus -gwas_catalog_study_curation: ${datasets.inputs}/v2d/GWAS_Catalog_study_curation.tsv + finngen_study_index: ${datasets.study_index}/finngen finngen_summary_stats: ${datasets.summary_statistics}/finngen from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats diff --git a/config/step/ot_gwas_catalog_ingestion.yaml b/config/step/ot_gwas_catalog_ingestion.yaml index 65606b7e4..fc82b82c2 100644 --- a/config/step/ot_gwas_catalog_ingestion.yaml +++ b/config/step/ot_gwas_catalog_ingestion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_ingestion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} variant_annotation_path: ${datasets.variant_annotation} -catalog_studies_out: ${datasets.catalog_study_index} -catalog_associations_out: ${datasets.catalog_study_locus} +catalog_studies_out: ${datasets.gwas_catalog_study_index} +catalog_associations_out: ${datasets.gwas_catalog_study_locus_folder}/gwas_catalog_curated_associations gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -inclusion_list_path: ??? 
+inclusion_list_path: ${datasets.gwas_catalog_curated_inclusion_list} diff --git a/config/step/ot_gwas_catalog_study_curation.yaml b/config/step/ot_gwas_catalog_study_curation.yaml index eb6c0ec78..77c1d7834 100644 --- a/config/step/ot_gwas_catalog_study_curation.yaml +++ b/config/step/ot_gwas_catalog_study_curation.yaml @@ -1,8 +1,8 @@ defaults: - gwas_catalog_study_curation -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} gwas_catalog_study_curation_out: ??? diff --git a/config/step/ot_gwas_catalog_study_inclusion.yaml b/config/step/ot_gwas_catalog_study_inclusion.yaml index 8a560127e..7f3bf80b3 100644 --- a/config/step/ot_gwas_catalog_study_inclusion.yaml +++ b/config/step/ot_gwas_catalog_study_inclusion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_study_inclusion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} variant_annotation_path: ${datasets.variant_annotation} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -harmonised_study_file: ??? +harmonised_study_file: ${datasets.gwas_catalog_summary_stats_list} criteria: ??? inclusion_list_path: ??? exclusion_list_path: ??? diff --git a/config/step/ot_ld_based_clumping.yaml b/config/step/ot_ld_based_clumping.yaml index d25ca84b7..d02c0acdd 100644 --- a/config/step/ot_ld_based_clumping.yaml +++ b/config/step/ot_ld_based_clumping.yaml @@ -1,7 +1,7 @@ defaults: - ld_based_clumping +ld_index_path: ${datasets.ld_index} study_locus_input_path: ??? -ld_index_path: ??? study_index_path: ??? clumped_study_locus_output_path: ??? 
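The `???` values left in these step configs are Hydra placeholders for mandatory fields that have to be supplied when the step is launched. As a minimal sketch of what that looks like on the command line (the `gs://my-bucket/...` paths are purely hypothetical; only the `gentropy step=<name> step.<param>=<value>` override syntax is taken from the CLI how-to guide), the LD-based clumping step configured above could be run with the remaining fields filled in as overrides:

```bash
# Minimal sketch: every path below is a placeholder, not a real bucket.
gentropy step=ld_based_clumping \
  step.study_locus_input_path=gs://my-bucket/study_locus_datasets/curated_associations \
  step.ld_index_path=gs://my-bucket/ld_index \
  step.study_index_path=gs://my-bucket/study_index \
  step.clumped_study_locus_output_path=gs://my-bucket/study_locus_datasets/curated_associations_ld_clumped
```

The Airflow DAGs further down in this patch pass the same kind of `step.*` overrides through `other_args` when they submit each step to Dataproc.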
diff --git a/docs/python_api/steps/ld_clump.md b/docs/python_api/steps/ld_clump.md index 75097bd8c..00fc8f2e4 100644 --- a/docs/python_api/steps/ld_clump.md +++ b/docs/python_api/steps/ld_clump.md @@ -2,4 +2,4 @@ title: LD-based clumping --- -::: gentropy.ld_based_clumping.LdBasedClumpingStep +::: gentropy.ld_based_clumping.LDBasedClumpingStep diff --git a/src/airflow/dags/gwas_catalog_harmonisation.py b/src/airflow/dags/gwas_catalog_harmonisation.py index 7e7790224..5713e223d 100644 --- a/src/airflow/dags/gwas_catalog_harmonisation.py +++ b/src/airflow/dags/gwas_catalog_harmonisation.py @@ -14,7 +14,9 @@ CLUSTER_NAME = "otg-gwascatalog-harmonisation" AUTOSCALING = "gwascatalog-harmonisation" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" +SUMMARY_STATS_BUCKET_NAME = "gwas_catalog_data" +RAW_SUMMARY_STATISTICS_PREFIX = "raw_summary_statistics" +HARMONISED_SUMMARY_STATISTICS_PREFIX = "harmonised_summary_statistics" with DAG( dag_id=Path(__file__).stem, @@ -26,14 +28,14 @@ list_inputs = GCSListObjectsOperator( task_id="list_raw_harmonised", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="raw-harmonised", + prefix=RAW_SUMMARY_STATISTICS_PREFIX, match_glob="**/*.h.tsv.gz", ) # List parquet files that have been previously processed list_outputs = GCSListObjectsOperator( task_id="list_harmonised_parquet", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + prefix=HARMONISED_SUMMARY_STATISTICS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,11 +61,15 @@ def create_to_do_list(**kwargs: Any) -> Any: print("Number of parquet files: ", len(parquets)) # noqa: T201 for path in raw_harmonised: match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + path, ) if match_result: study_id = match_result.group(2) - if f"harmonised/{study_id}.parquet/_SUCCESS" not in parquets: + if ( + f"{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet/_SUCCESS" + not in parquets + ): to_do_list.append(path) print("Number of jobs to submit: ", len(to_do_list)) # noqa: T201 ti.xcom_push(key="to_do_list", value=to_do_list) @@ -85,7 +91,8 @@ def submit_jobs(**kwargs: Any) -> None: time.sleep(60) input_path = todo[i] match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + input_path, ) if match_result: study_id = match_result.group(2) @@ -95,7 +102,7 @@ def submit_jobs(**kwargs: Any) -> None: step_id="ot_gwas_catalog_sumstat_preprocess", other_args=[ f"step.raw_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{input_path}", - f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/harmonised/{study_id}.parquet", + f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet", ], ) diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py index 33d4062a9..9df22586a 100644 --- a/src/airflow/dags/gwas_catalog_preprocess.py +++ b/src/airflow/dags/gwas_catalog_preprocess.py @@ -13,11 +13,44 @@ CLUSTER_NAME = "otg-preprocess-gwascatalog" AUTOSCALING = "otg-preprocess-gwascatalog" -RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" -RELEASEBUCKET_NAME = "genetics_etl_python_playground" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" -SUMSTATS = "gs://open-targets-gwas-summary-stats/harmonised" -MANIFESTS_PATH = 
f"{RELEASEBUCKET}/manifests/" +# Setting up bucket name and output object names: +GWAS_CATALOG_BUCKET_NAME = "gwas_catalog_data" +HARMONISED_SUMSTATS_PREFIX = "harmonised_summary_statistics" + +# Manifest paths: +MANIFESTS_PATH = f"gs://{GWAS_CATALOG_BUCKET_NAME}/manifests/" + +# The name of the manifest files have to be consistent with the config file: +HARMONISED_SUMSTATS_LIST_OBJECT_NAME = ( + "manifests/gwas_catalog_harmonised_sumstats_list.txt" +) +HARMONISED_SUMSTATS_LIST_FULL_NAME = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_LIST_OBJECT_NAME}" +) +CURATION_INCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curated_included_studies" +CURATION_EXCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curation_excluded_studies" +SUMMARY_STATISTICS_INCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_included_studies" +) +SUMMARY_STATISTICS_EXCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_excluded_studies" +) + +# Study index: +STUDY_INDEX = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_index" + +# Study loci: +CURATED_STUDY_LOCI = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations" +CURATED_LD_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations_ld_clumped" +WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped" +LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped" +# Credible sets: +CURATED_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated" +) +SUMMARY_STATISTICS_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats" +) def upload_harmonized_study_list( @@ -48,8 +81,8 @@ def upload_harmonized_study_list( # Getting list of folders (each a gwas study with summary statistics) list_harmonised_sumstats = GCSListObjectsOperator( task_id="list_harmonised_parquet", - bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + bucket=GWAS_CATALOG_BUCKET_NAME, + prefix=HARMONISED_SUMSTATS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,8 +92,8 @@ def upload_harmonized_study_list( python_callable=upload_harmonized_study_list, op_kwargs={ "concatenated_studies": '{{ "\n".join(ti.xcom_pull( key="return_value", task_ids="list_harmonised_parquet")) }}', - "bucket_name": RELEASEBUCKET_NAME, - "object_name": "output/python_etl/parquet/XX.XX/manifests/harmonised_sumstats.txt", + "bucket_name": GWAS_CATALOG_BUCKET_NAME, + "object_name": HARMONISED_SUMSTATS_LIST_OBJECT_NAME, }, ) @@ -73,9 +106,9 @@ def upload_harmonized_study_list( task_id="catalog_curation_inclusion_list", other_args=[ "step.criteria=curation", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_curation", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={CURATION_INCLUSION_NAME}", + f"step.exclusion_list_path={CURATION_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -84,7 +117,7 @@ def upload_harmonized_study_list( cluster_name=CLUSTER_NAME, step_id="ot_gwas_catalog_ingestion", task_id="ingest_curated_gwas_catalog_data", - other_args=[f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation"], + other_args=[f"step.inclusion_list_path={CURATION_INCLUSION_NAME}"], ) # Run LD-annotation and clumping on curated data: @@ -93,10 +126,9 
@@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_curation_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/catalog_curated", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", + f"step.study_locus_input_path={CURATED_STUDY_LOCI}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={CURATED_LD_CLUMPED}", ], ) @@ -106,8 +138,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_curation_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/catalog_curated", + f"step.study_locus_ld_annotated_in={CURATED_LD_CLUMPED}", + f"step.picsed_study_locus_out={CURATED_CREDIBLE_SETS}", ], ) @@ -130,9 +162,9 @@ def upload_harmonized_study_list( task_id="catalog_sumstats_inclusion_list", other_args=[ "step.criteria=summary_stats", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_sumstats", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.exclusion_list_path={SUMMARY_STATISTICS_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -142,9 +174,9 @@ def upload_harmonized_study_list( step_id="ot_window_based_clumping", task_id="catalog_sumstats_window_clumping", other_args=[ - f"step.summary_statistics_input_path={SUMSTATS}", - f"step.study_locus_output_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", + f"step.summary_statistics_input_path=gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_PREFIX}", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}", ], ) @@ -154,10 +186,9 @@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_sumstats_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", + f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={LD_BASED_CLUMPED}", ], ) @@ -167,8 +198,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_sumstats_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/from_sumstats/catalog", + f"step.study_locus_ld_annotated_in={LD_BASED_CLUMPED}", + f"step.picsed_study_locus_out={SUMMARY_STATISTICS_CREDIBLE_SETS}", ], ) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index ba213a349..ac9ce821d 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -78,7 +78,7 @@ class GWASCatalogStudyInclusionConfig(StepConfig): inclusion_list_path: str = MISSING exclusion_list_path: str = MISSING _target_: str = ( - 
"gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionStep" + "gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator" ) @@ -301,7 +301,7 @@ class WindowBasedClumpingStep(StepConfig): inclusion_list_path: str = MISSING locus_collect_distance: str | None = None - _target_: str = "gentropy.clump.WindowBasedClumpingStep" + _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" @dataclass diff --git a/src/gentropy/ld_based_clumping.py b/src/gentropy/ld_based_clumping.py index ea9646806..e6a477a89 100644 --- a/src/gentropy/ld_based_clumping.py +++ b/src/gentropy/ld_based_clumping.py @@ -7,7 +7,7 @@ from gentropy.dataset.study_locus import StudyLocus -class LdBasedClumpingStep: +class LDBasedClumpingStep: """Step to perform LD-based clumping on study locus dataset. As a first step, study locus is enriched with population specific linked-variants. diff --git a/utils/update_GWAS_Catalog_data.sh b/utils/update_GWAS_Catalog_data.sh index 98ac4cc36..1e380d30c 100755 --- a/utils/update_GWAS_Catalog_data.sh +++ b/utils/update_GWAS_Catalog_data.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash - # Function to get the most recent date: get_most_recent(){ cat $1 | perl -lane 'push @a, $_ if $_ =~ /^\d+$/; END {@a = sort { $a <=> $b} @a; print pop @a }' @@ -21,13 +20,47 @@ get_release_info(){ logging(){ log_prompt="[$(date "+%Y.%m.%d %H:%M")]" - echo "${log_prompt} $@" + echo "${log_prompt} $@" >> ${LOG_FILE} +} + +upload_file_to_gcp(){ + FILENAME=${1} + TARGET=${2} + # Test if file exists: + if [ ! -f ${FILENAME} ]; then + logging "File ${FILENAME} does not exist." + return + fi + + logging "Copying ${FILENAME} to GCP..." + gsutil -mq cp file://$(pwd)/${FILENAME} ${TARGET} + + # Test if file was successfully uploaded: + if [ $? -ne 0 ]; then + logging "File ${FILENAME} failed to upload." + fi } # Resources: export BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas export RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats -export GCP_TARGET=gs://genetics_etl_python_playground/input/v2d/ +export GCP_TARGET=gs://gwas_catalog_data +export LOG_FILE=gwas_catalog_data_update.log + +export GWAS_CATALOG_STUDY_CURATION_URL=https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv + +ASSOCIATION_FILE=gwas_catalog_associations_ontology_annotated.tsv +PUBLISHED_STUDIES_FILE=gwas_catalog_download_studies.tsv +PUBLISHED_ANCESTRIES_FILE=gwas_catalog_download_ancestries.tsv +UNPUBLISHED_STUDIES_FILE=gwas_catalog_unpublished_studies.tsv +UNPUBLISHED_ANCESTRIES_FILE=gwas_catalog_unpublished_ancestries.tsv +HARMONISED_LIST_FILE=harmonised_list.txt +GWAS_CATALOG_STUDY_CURATION_FILE=gwas_catalog_study_curation.tsv + +# Remove log file if exists: +if [ -f ${LOG_FILE} ]; then + rm -rf ${LOG_FILE} +fi logging "Extracing data from: ${BASE_URL}" logging "Release info fetched fom: ${RELEASE_INFO_URL}" @@ -47,36 +80,49 @@ RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY} logging "Datafiles are fetching from ${RELEASE_URL}" # Fetching files while assigning properly dated and annotated names: -wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv \ - -O gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv -O ${ASSOCIATION_FILE} +logging "File ${ASSOCIATION_FILE} saved." 
-wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt -O ${PUBLISHED_STUDIES_FILE} +logging "File ${PUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv -O ${UNPUBLISHED_STUDIES_FILE} +logging "File ${UNPUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt -O ${PUBLISHED_ANCESTRIES_FILE} +logging "File ${PUBLISHED_ANCESTRIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv -O ${UNPUBLISHED_ANCESTRIES_FILE} +logging "File ${UNPUBLISHED_ANCESTRIES_FILE} saved." +wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O ${HARMONISED_LIST_FILE} +logging "File ${HARMONISED_LIST_FILE} saved." -wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt -logging "File harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt saved." +wget -q ${GWAS_CATALOG_STUDY_CURATION_URL} -O ${GWAS_CATALOG_STUDY_CURATION_FILE} +logging "In-house GWAS Catalog study curation file fetched from GitHub." logging "Copying files to GCP..." -gsutil -mq cp file://$(pwd)/gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ - -logging "Done." + +upload_file_to_gcp ${ASSOCIATION_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${HARMONISED_LIST_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${GWAS_CATALOG_STUDY_CURATION_FILE} ${GCP_TARGET}/manifests/ + + +logging "Files successfully uploaded." +logging "Removing local files..." 
+rm ${ASSOCIATION_FILE} \ + ${PUBLISHED_STUDIES_FILE} \ + ${PUBLISHED_ANCESTRIES_FILE} \ + ${HARMONISED_LIST_FILE} \ + ${UNPUBLISHED_STUDIES_FILE} \ + ${UNPUBLISHED_ANCESTRIES_FILE} \ + ${GWAS_CATALOG_STUDY_CURATION_FILE} + +# Uploading log file to GCP manifest folder: +logging "Uploading log file to GCP manifest folder..." +upload_file_to_gcp ${LOG_FILE} ${GCP_TARGET}/manifests/ +cat $LOG_FILE From 094b05c3faa31d14e8a1749aed15344374fa6d2d Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Thu, 18 Jan 2024 13:35:25 +0000 Subject: [PATCH 09/12] docs: docs ammendments (#437) * fix: typo * docs: catalog with upper case, dont ask me why * docs: missing doc page * docs: missing link * docs: add doi * docs: add doi to docs * docs: name steps after their CLI steps --- README.md | 2 +- docs/howto/run_step_in_cli.md | 2 +- docs/index.md | 2 +- docs/python_api/_python_api.md | 2 +- docs/python_api/datasources/_datasources.md | 2 +- docs/python_api/datasources/finngen/summary_stats.md | 5 +++++ docs/python_api/steps/colocalisation.md | 2 +- docs/python_api/steps/eqtl_catalogue.md | 2 +- docs/python_api/steps/finngen_studies.md | 2 +- docs/python_api/steps/finngen_sumstat_preprocess.md | 2 +- docs/python_api/steps/gene_index.md | 2 +- docs/python_api/steps/gwas_catalog_curation.md | 2 +- docs/python_api/steps/gwas_catalog_inclusion.md | 2 +- docs/python_api/steps/gwas_catalog_ingestion.md | 2 +- docs/python_api/steps/gwas_catalog_sumstat_preprocess.md | 2 +- docs/python_api/steps/l2g.md | 2 +- docs/python_api/steps/ld_clump.md | 2 +- docs/python_api/steps/ld_index.md | 2 +- docs/python_api/steps/pics.md | 2 +- docs/python_api/steps/variant_annotation_step.md | 2 +- docs/python_api/steps/variant_index_step.md | 2 +- docs/python_api/steps/variant_to_gene_step.md | 2 +- docs/python_api/steps/window_based_clumping.md | 2 +- 23 files changed, 27 insertions(+), 22 deletions(-) create mode 100644 docs/python_api/datasources/finngen/summary_stats.md diff --git a/README.md b/README.md index 7d5a1d410..479620a3d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) [![codecov](https://codecov.io/gh/opentargets/gentropy/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/gentropy) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10527086.svg)](https://doi.org/10.5281/zenodo.10527086)

diff --git a/docs/howto/run_step_in_cli.md b/docs/howto/run_step_in_cli.md index 07ede526d..965c7e079 100644 --- a/docs/howto/run_step_in_cli.md +++ b/docs/howto/run_step_in_cli.md @@ -41,4 +41,4 @@ In most occassions, some mandatory values will be required to run the step. For gentropy step=gene_index step.target_path=/path/to/target step.gene_index_path=/path/to/gene_index ``` -You can find more about the available steps in the [documentation](../python_api/step/_step.md). +You can find more about the available steps in the [documentation](../python_api/steps/_steps.md). diff --git a/docs/index.md b/docs/index.md index 31762c17e..26070ab3d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ hide: [![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) [![codecov](https://codecov.io/gh/opentargets/gentropy/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/gentropy) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10527086.svg)](https://doi.org/10.5281/zenodo.10527086) --- diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md index 02c1941af..9952aa56f 100644 --- a/docs/python_api/_python_api.md +++ b/docs/python_api/_python_api.md @@ -4,7 +4,7 @@ title: Python API Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. -The Overall architecture of the package distinguishes between: +The overall architecture of the package distinguishes between: - [**Data Sources**](datasources/_datasources.md): data sources harmonisation tools - [**Datasets**](datasets/_datasets.md): data model diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md index c06bef177..036dbcb37 100644 --- a/docs/python_api/datasources/_datasources.md +++ b/docs/python_api/datasources/_datasources.md @@ -8,7 +8,7 @@ This section contains information about the data source harmonisation tools avai ## GWAS study sources -1. [GWAS catalog](gwas_catalog/_gwas_catalog.md) (with or without full summary statistics) +1. [GWAS Catalog](gwas_catalog/_gwas_catalog.md) (with or without full summary statistics) 1. 
[FinnGen](finngen/_finngen.md) ## Molecular QTLs diff --git a/docs/python_api/datasources/finngen/summary_stats.md b/docs/python_api/datasources/finngen/summary_stats.md new file mode 100644 index 000000000..ba6f6c263 --- /dev/null +++ b/docs/python_api/datasources/finngen/summary_stats.md @@ -0,0 +1,5 @@ +--- +title: Study Index +--- + +::: gentropy.datasource.finngen.summary_stats.FinnGenSummaryStats diff --git a/docs/python_api/steps/colocalisation.md b/docs/python_api/steps/colocalisation.md index 17a0dfd9a..76ecf7150 100644 --- a/docs/python_api/steps/colocalisation.md +++ b/docs/python_api/steps/colocalisation.md @@ -1,5 +1,5 @@ --- -title: Colocalisation +title: colocalisation --- ::: gentropy.colocalisation.ColocalisationStep diff --git a/docs/python_api/steps/eqtl_catalogue.md b/docs/python_api/steps/eqtl_catalogue.md index 32f2aa257..17ea6a6f6 100644 --- a/docs/python_api/steps/eqtl_catalogue.md +++ b/docs/python_api/steps/eqtl_catalogue.md @@ -1,5 +1,5 @@ --- -title: eQTL Catalogue +title: eqtl_catalogue --- ::: gentropy.eqtl_catalogue.EqtlCatalogueStep diff --git a/docs/python_api/steps/finngen_studies.md b/docs/python_api/steps/finngen_studies.md index 09da6f7c8..1ec4394f4 100644 --- a/docs/python_api/steps/finngen_studies.md +++ b/docs/python_api/steps/finngen_studies.md @@ -1,5 +1,5 @@ --- -title: FinnGen Studies +title: finngen_studies --- ::: gentropy.finngen_studies.FinnGenStudiesStep diff --git a/docs/python_api/steps/finngen_sumstat_preprocess.md b/docs/python_api/steps/finngen_sumstat_preprocess.md index 17b44b95e..57f27658e 100644 --- a/docs/python_api/steps/finngen_sumstat_preprocess.md +++ b/docs/python_api/steps/finngen_sumstat_preprocess.md @@ -1,5 +1,5 @@ --- -title: FinnGen Preprocess Summary Stats +title: finngen_sumstat_preprocess --- ::: gentropy.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep diff --git a/docs/python_api/steps/gene_index.md b/docs/python_api/steps/gene_index.md index bae77b5aa..a0808dcad 100644 --- a/docs/python_api/steps/gene_index.md +++ b/docs/python_api/steps/gene_index.md @@ -1,5 +1,5 @@ --- -title: Gene Index +title: gene_index --- ::: gentropy.gene_index.GeneIndexStep diff --git a/docs/python_api/steps/gwas_catalog_curation.md b/docs/python_api/steps/gwas_catalog_curation.md index 51aa72970..8512e39fc 100644 --- a/docs/python_api/steps/gwas_catalog_curation.md +++ b/docs/python_api/steps/gwas_catalog_curation.md @@ -1,5 +1,5 @@ --- -title: Apply in-house curation on GWAS Catalog studies +title: gwas_catalog_study_curation --- ::: gentropy.gwas_catalog_study_curation.GWASCatalogStudyCurationStep diff --git a/docs/python_api/steps/gwas_catalog_inclusion.md b/docs/python_api/steps/gwas_catalog_inclusion.md index 43abbb171..0a8c33b30 100644 --- a/docs/python_api/steps/gwas_catalog_inclusion.md +++ b/docs/python_api/steps/gwas_catalog_inclusion.md @@ -1,5 +1,5 @@ --- -title: Generate inclusion and exclusions lists for GWAS Catalog study ingestion. 
+title: gwas_catalog_study_curation --- ::: gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator diff --git a/docs/python_api/steps/gwas_catalog_ingestion.md b/docs/python_api/steps/gwas_catalog_ingestion.md index 17df73fec..69ea92479 100644 --- a/docs/python_api/steps/gwas_catalog_ingestion.md +++ b/docs/python_api/steps/gwas_catalog_ingestion.md @@ -1,5 +1,5 @@ --- -title: GWAS Catalog +title: gwas_catalog_ingestion --- ::: gentropy.gwas_catalog_ingestion.GWASCatalogIngestionStep diff --git a/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md b/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md index 39c64a882..3b0422050 100644 --- a/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md +++ b/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md @@ -1,5 +1,5 @@ --- -title: GWAS Catalog sumstat preprocess +title: gwas_catalog_sumstat_preprocess --- ::: gentropy.gwas_catalog_sumstat_preprocess.GWASCatalogSumstatsPreprocessStep diff --git a/docs/python_api/steps/l2g.md b/docs/python_api/steps/l2g.md index d2b9e290e..847569e36 100644 --- a/docs/python_api/steps/l2g.md +++ b/docs/python_api/steps/l2g.md @@ -1,5 +1,5 @@ --- -title: Locus-to-gene (L2G) +title: locus_to_gene --- ::: gentropy.l2g.LocusToGeneStep diff --git a/docs/python_api/steps/ld_clump.md b/docs/python_api/steps/ld_clump.md index 00fc8f2e4..fea44f807 100644 --- a/docs/python_api/steps/ld_clump.md +++ b/docs/python_api/steps/ld_clump.md @@ -1,5 +1,5 @@ --- -title: LD-based clumping +title: ld_based_clumping --- ::: gentropy.ld_based_clumping.LDBasedClumpingStep diff --git a/docs/python_api/steps/ld_index.md b/docs/python_api/steps/ld_index.md index eba826266..bf8b9b58e 100644 --- a/docs/python_api/steps/ld_index.md +++ b/docs/python_api/steps/ld_index.md @@ -1,5 +1,5 @@ --- -title: LD Index +title: ld_index --- ::: gentropy.ld_index.LDIndexStep diff --git a/docs/python_api/steps/pics.md b/docs/python_api/steps/pics.md index 5654489f6..aacc6fbaf 100644 --- a/docs/python_api/steps/pics.md +++ b/docs/python_api/steps/pics.md @@ -1,5 +1,5 @@ --- -title: PICS +title: pics --- ::: gentropy.pics.PICSStep diff --git a/docs/python_api/steps/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md index e8d7c2c3f..e65a071b2 100644 --- a/docs/python_api/steps/variant_annotation_step.md +++ b/docs/python_api/steps/variant_annotation_step.md @@ -1,5 +1,5 @@ --- -title: Variant Annotation +title: variant_annotation --- ::: gentropy.variant_annotation.VariantAnnotationStep diff --git a/docs/python_api/steps/variant_index_step.md b/docs/python_api/steps/variant_index_step.md index e38fd8206..8a36f097f 100644 --- a/docs/python_api/steps/variant_index_step.md +++ b/docs/python_api/steps/variant_index_step.md @@ -1,5 +1,5 @@ --- -title: Variant Index +title: variant_index --- ::: gentropy.variant_index.VariantIndexStep diff --git a/docs/python_api/steps/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md index 16db10e8e..1a3e56af8 100644 --- a/docs/python_api/steps/variant_to_gene_step.md +++ b/docs/python_api/steps/variant_to_gene_step.md @@ -1,5 +1,5 @@ --- -title: Variant-to-gene +title: variant_to_gene --- ::: gentropy.v2g.V2GStep diff --git a/docs/python_api/steps/window_based_clumping.md b/docs/python_api/steps/window_based_clumping.md index bbcd2c0d8..f33057c71 100644 --- a/docs/python_api/steps/window_based_clumping.md +++ b/docs/python_api/steps/window_based_clumping.md @@ -1,5 +1,5 @@ --- -title: Window-based clumping +title: window_based_clumping --- ::: 
gentropy.window_based_clumping.WindowBasedClumpingStep From 1e0912f94aaa634930a37afdd57ff319984df1ff Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Thu, 18 Jan 2024 13:48:38 +0000 Subject: [PATCH 10/12] fix: logo css --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 479620a3d..3a1971ef1 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10527086.svg)](https://doi.org/10.5281/zenodo.10527086)

- +

Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. From 1e13e093b29e8582805c18c4a1a6425977983fe2 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Thu, 18 Jan 2024 14:02:55 +0000 Subject: [PATCH 11/12] docs: fix step name (#438) --- docs/python_api/steps/gwas_catalog_inclusion.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/python_api/steps/gwas_catalog_inclusion.md b/docs/python_api/steps/gwas_catalog_inclusion.md index 0a8c33b30..e9ede6dd6 100644 --- a/docs/python_api/steps/gwas_catalog_inclusion.md +++ b/docs/python_api/steps/gwas_catalog_inclusion.md @@ -1,5 +1,5 @@ --- -title: gwas_catalog_study_curation +title: gwas_catalog_study_inclusion --- ::: gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator From 84f794d347171a13af48561423aec66ed638c856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Thu, 18 Jan 2024 14:31:21 +0000 Subject: [PATCH 12/12] chore: improvements to generate 2401 data release (#436) * fix(dag): remove ot_gwas_catalog ot_study_locus_overlap from etl dag steps * fix(dag): remove ot_gwas_catalog ot_study_locus_overlap from etl dag steps * fix: include gene_index as a release output * feat(l2g): split step into train and predict * chore(dag): add colocalisation step * feat(l2g): split step into train and predict * chore: rename ot_v2g config to unabbreviated name * chore(colocalisation): remove coloc parameters from config * fix(colocalisation): update credible set path in config * fix(l2g): remove overlaps from config * chore(dag): remove ukbiobank and eqtl from preprocess * fix(l2g): increase driver and executors memory * fix(l2g): make training dependencies optional * fix(l2g): drop studyType before creating gwas_study_locus * fix(l2g): convert features_list from config to list * fix(l2g): include mean vep features in feature_list * chore: change default driver node to n1-highmem-16 * revert(l2g): revert 4eafaf297b85067704c327b88d5fdc361c22e14f * chore: fetch etl inputs from new data structure * feat(study_locus): add and test filter_by_study_type * feat(study_locus): add and test filter_by_study_type * feat(l2g): limit l2g predictions to gwas-derived associations * fix: typo in test_filter_by_study_type --- config/datasets/ot_gcp.yaml | 25 ++++----- config/step/ot_colocalisation.yaml | 2 +- config/step/ot_locus_to_gene_predict.yaml | 10 ++++ ..._gene.yaml => ot_locus_to_gene_train.yaml} | 0 .../{ot_v2g.yaml => ot_variant_to_gene.yaml} | 0 src/airflow/dags/common_airflow.py | 2 +- src/airflow/dags/configs/dag.yaml | 16 +++--- src/airflow/dags/dag_preprocess.py | 2 - src/gentropy/config.py | 30 ++++++----- src/gentropy/dataset/l2g_feature_matrix.py | 16 +++--- src/gentropy/dataset/l2g_prediction.py | 22 ++++---- src/gentropy/dataset/study_locus.py | 29 +++++++++++ src/gentropy/l2g.py | 10 ++-- tests/dataset/test_study_locus.py | 52 +++++++++++++++++++ 14 files changed, 157 insertions(+), 59 deletions(-) create mode 100644 config/step/ot_locus_to_gene_predict.yaml rename config/step/{ot_locus_to_gene.yaml => ot_locus_to_gene_train.yaml} (100%) rename config/step/{ot_v2g.yaml => ot_variant_to_gene.yaml} (100%) diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 93630c3a3..fe526f906 100644 --- 
a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,10 +1,11 @@ # Release specific configuration: release_version: "24.01" -version: "XX.XX" +dev_version: XX.XX release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input -outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.version} +static_assets: gs://genetics_etl_python_playground/static_assetss +outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version} ## Datasets: gwas_catalog_dataset: gs://gwas_catalog_data @@ -29,19 +30,18 @@ gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_da gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets # Input datasets -chain_37_38: ${datasets.inputs}/v2g_input/grch37_to_grch38.over.chain -target_index: ${datasets.inputs}/v2g_input/targets_correct_tss -vep_consequences: gs://genetics-portal-data/lut/vep_consequences.tsv -anderson: gs://genetics-portal-input/v2g_input/andersson2014/enhancer_tss_associations.bed -javierre: gs://genetics-portal-input/v2g_input/javierre_2016_preprocessed.parquet -jung: gs://genetics-portal-raw/pchic_jung2019/jung2019_pchic_tableS3.csv -thurman: gs://genetics-portal-input/v2g_input/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz +chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain +vep_consequences: ${datasets.static_assets}/vep_consequences.tsv +anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed +javierre: ${datasets.static_assets}/javierre_2016_preprocessed +jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv +thurman: ${datasets.static_assets}/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz +target_index: ${datasets.release_folder}/targets # OTP 23.12 data -gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data +gene_interactions: ${datasets.release_folder}/interaction # OTP 23.12 data eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv -# Output datasets -gene_index: ${datasets.outputs}/gene_index +# Dev output datasets variant_annotation: ${datasets.outputs}/variant_annotation study_locus: ${datasets.outputs}/study_locus summary_statistics: ${datasets.outputs}/summary_statistics @@ -64,4 +64,5 @@ colocalisation: ${datasets.release_folder}/colocalisation study_index: ${datasets.release_folder}/study_index variant_index: ${datasets.release_folder}/variant_index credible_set: ${datasets.release_folder}/credible_set +gene_index: ${datasets.release_folder}/gene_index v2g: ${datasets.release_folder}/variant_to_gene diff --git a/config/step/ot_colocalisation.yaml b/config/step/ot_colocalisation.yaml index fbfe82989..4433595ef 100644 --- a/config/step/ot_colocalisation.yaml +++ b/config/step/ot_colocalisation.yaml @@ -1,6 +1,6 @@ defaults: - colocalisation -credible_set_path: ${datasets.study_locus} +credible_set_path: ${datasets.credible_set} study_index_path: ${datasets.study_index} coloc_path: ${datasets.colocalisation} diff --git a/config/step/ot_locus_to_gene_predict.yaml b/config/step/ot_locus_to_gene_predict.yaml new file mode 100644 index 000000000..bd5c31b25 --- /dev/null +++ b/config/step/ot_locus_to_gene_predict.yaml @@ -0,0 +1,10 @@ +defaults: + - locus_to_gene + +run_mode: predict +model_path: ${datasets.l2g_model} 
+predictions_path: ${datasets.l2g_predictions} +credible_set_path: ${datasets.credible_set} +variant_gene_path: ${datasets.v2g} +colocalisation_path: ${datasets.colocalisation} +study_index_path: ${datasets.study_index} diff --git a/config/step/ot_locus_to_gene.yaml b/config/step/ot_locus_to_gene_train.yaml similarity index 100% rename from config/step/ot_locus_to_gene.yaml rename to config/step/ot_locus_to_gene_train.yaml diff --git a/config/step/ot_v2g.yaml b/config/step/ot_variant_to_gene.yaml similarity index 100% rename from config/step/ot_v2g.yaml rename to config/step/ot_variant_to_gene.yaml diff --git a/src/airflow/dags/common_airflow.py b/src/airflow/dags/common_airflow.py index 9ed7a81b1..e3dc56ccb 100644 --- a/src/airflow/dags/common_airflow.py +++ b/src/airflow/dags/common_airflow.py @@ -61,7 +61,7 @@ def create_cluster( cluster_name: str, - master_machine_type: str = "n1-highmem-8", + master_machine_type: str = "n1-highmem-16", worker_machine_type: str = "n1-standard-16", num_workers: int = 2, num_preemptible_workers: int = 0, diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml index e1c9ea627..0b634caaa 100644 --- a/src/airflow/dags/configs/dag.yaml +++ b/src/airflow/dags/configs/dag.yaml @@ -1,18 +1,18 @@ - id: "ot_gene_index" -- id: "ot_gwas_catalog" - id: "ot_variant_index" - prerequisites: - - "ot_gwas_catalog" - id: "ot_variant_to_gene" prerequisites: - "ot_variant_index" - "ot_gene_index" -- id: "ot_study_locus_overlap" +- id: "ot_colocalisation" +- id: "ot_locus_to_gene_train" prerequisites: - - "ot_gwas_catalog" -- id: "ot_locus_to_gene" + - "ot_variant_index" + - "ot_variant_to_gene" + - "ot_colocalisation" +- id: "ot_locus_to_gene_predict" prerequisites: - - "ot_gwas_catalog" + - "ot_locus_to_gene_train" - "ot_variant_index" - "ot_variant_to_gene" - - "ot_study_locus_overlap" + - "ot_colocalisation" diff --git a/src/airflow/dags/dag_preprocess.py b/src/airflow/dags/dag_preprocess.py index 728d5932f..4439914c5 100644 --- a/src/airflow/dags/dag_preprocess.py +++ b/src/airflow/dags/dag_preprocess.py @@ -9,10 +9,8 @@ CLUSTER_NAME = "otg-preprocess" ALL_STEPS = [ - "ot_eqtl_catalogue", "ot_ld_index", "ot_variant_annotation", - "ot_ukbiobank", ] diff --git a/src/gentropy/config.py b/src/gentropy/config.py index ac9ce821d..af14a54bc 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -34,12 +34,9 @@ class StepConfig: class ColocalisationConfig(StepConfig): """Colocalisation step configuration.""" - study_locus_path: str = MISSING + credible_set_path: str = MISSING study_index_path: str = MISSING coloc_path: str = MISSING - priorc1: float = 1e-4 - priorc2: float = 1e-4 - priorc12: float = 1e-5 _target_: str = "gentropy.colocalisation.ColocalisationStep" @@ -167,7 +164,11 @@ class LocusToGeneConfig(StepConfig): session: Any = field( default_factory=lambda: { - "extended_spark_conf": {"spark.dynamicAllocation.enabled": "false"} + "extended_spark_conf": { + "spark.dynamicAllocation.enabled": "false", + "spark.driver.memory": "48g", + "spark.executor.memory": "48g", + } } ) run_mode: str = MISSING @@ -177,19 +178,22 @@ class LocusToGeneConfig(StepConfig): variant_gene_path: str = MISSING colocalisation_path: str = MISSING study_index_path: str = MISSING - study_locus_overlap_path: str = MISSING - gold_standard_curation_path: str = MISSING - gene_interactions_path: str = MISSING + gold_standard_curation_path: str | None = None + gene_interactions_path: str | None = None features_list: list[str] = field( default_factory=lambda: [ # 
average distance of all tagging variants to gene TSS "distanceTssMean", - # # minimum distance of all tagging variants to gene TSS + # minimum distance of all tagging variants to gene TSS "distanceTssMinimum", - # # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity + # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity "vepMaximumNeighborhood", - # # maximum vep consequence score of the locus 95% credible set split by gene + # maximum vep consequence score of the locus 95% credible set split by gene "vepMaximum", + # mean vep consequence score of the locus 95% credible set among all genes in the vicinity + "vepMeanNeighborhood", + # mean vep consequence score of the locus 95% credible set split by gene + "vepMean", # max clpp for each (study, locus, gene) aggregating over all eQTLs "eqtlColocClppMaximum", # max clpp for each (study, locus) aggregating over all eQTLs @@ -260,7 +264,7 @@ class VariantIndexConfig(StepConfig): @dataclass -class V2GConfig(StepConfig): +class VariantToGeneConfig(StepConfig): """V2G step configuration.""" variant_index_path: str = MISSING @@ -352,5 +356,5 @@ def register_config() -> None: cs.store(group="step", name="pics", node=PICSConfig) cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig) cs.store(group="step", name="variant_index", node=VariantIndexConfig) - cs.store(group="step", name="variant_to_gene", node=V2GConfig) + cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig) cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep) diff --git a/src/gentropy/dataset/l2g_feature_matrix.py b/src/gentropy/dataset/l2g_feature_matrix.py index c1a02b3b7..fa84499dc 100644 --- a/src/gentropy/dataset/l2g_feature_matrix.py +++ b/src/gentropy/dataset/l2g_feature_matrix.py @@ -76,15 +76,13 @@ def generate_features( raise ValueError("No features found") # raise error if the feature matrix is empty - if fm.limit(1).count() != 0: - return cls( - _df=convert_from_long_to_wide( - fm, ["studyLocusId", "geneId"], "featureName", "featureValue" - ), - _schema=cls.get_schema(), - features_list=features_list, - ) - raise ValueError("L2G Feature matrix is empty") + return cls( + _df=convert_from_long_to_wide( + fm, ["studyLocusId", "geneId"], "featureName", "featureValue" + ), + _schema=cls.get_schema(), + features_list=features_list, + ) @classmethod def get_schema(cls: type[L2GFeatureMatrix]) -> StructType: diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py index ddaa9d741..e24688da3 100644 --- a/src/gentropy/dataset/l2g_prediction.py +++ b/src/gentropy/dataset/l2g_prediction.py @@ -62,21 +62,23 @@ def from_credible_set( Returns: L2GPrediction: L2G dataset """ - gwas_study_locus = StudyLocus( - _df=study_locus.df.join( - study_index.study_type_lut().filter(f.col("studyType") == "gwas"), - on="studyId", - how="inner", - ), - _schema=StudyLocus.get_schema(), - ) fm = L2GFeatureMatrix.generate_features( features_list=features_list, - study_locus=gwas_study_locus, + study_locus=study_locus, study_index=study_index, variant_gene=v2g, colocalisation=coloc, ).fill_na() + + gwas_fm = L2GFeatureMatrix( + _df=( + fm.df.join( + study_locus.filter_by_study_type("gwas", study_index).df, + on="studyLocusId", + ) + ), + _schema=cls.get_schema(), + ) return L2GPrediction( # Load and apply fitted model _df=( @@ -84,7 +86,7 @@ def from_credible_set( model_path, features_list=features_list, ) - .predict(fm) 
+ .predict(gwas_fm) # the probability of the positive class is the second element inside the probability array # - this is selected as the L2G probability .select( diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 992ef304d..41c099959 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -228,6 +228,35 @@ def get_schema(cls: type[StudyLocus]) -> StructType: """ return parse_spark_schema("study_locus.json") + def filter_by_study_type( + self: StudyLocus, study_type: str, study_index: StudyIndex + ) -> StudyLocus: + """Creates a new StudyLocus dataset filtered by study type. + + Args: + study_type (str): Study type to filter for. Can be one of `gwas`, `eqtl`, `pqtl`, `eqtl`. + study_index (StudyIndex): Study index to resolve study types. + + Returns: + StudyLocus: Filtered study-locus dataset. + + Raises: + ValueError: If study type is not supported. + """ + if study_type not in ["gwas", "eqtl", "pqtl", "sqtl"]: + raise ValueError( + f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl." + ) + new_df = ( + self.df.join(study_index.study_type_lut(), on="studyId", how="inner") + .filter(f.col("studyType") == study_type) + .drop("studyType") + ) + return StudyLocus( + _df=new_df, + _schema=self._schema, + ) + def filter_credible_set( self: StudyLocus, credible_interval: CredibleInterval, diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index 4bbdaaea0..d00a91596 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -81,13 +81,17 @@ def __init__( "model_path and predictions_path must be set for predict mode." ) predictions = L2GPrediction.from_credible_set( - model_path, features_list, credible_set, studies, v2g, coloc + model_path, list(features_list), credible_set, studies, v2g, coloc ) predictions.df.write.mode(session.write_mode).parquet(predictions_path) session.logger.info(predictions_path) - elif run_mode == "train": + elif ( + run_mode == "train" + and gold_standard_curation_path + and gene_interactions_path + ): # Process gold standard and L2G features - gs_curation = session.spark.read.json(gold_standard_curation_path) + gs_curation = session.spark.read.json(gold_standard_curation_path).persist() interactions = session.spark.read.parquet(gene_interactions_path) study_locus_overlap = StudyLocus( # We just extract overlaps of associations in the gold standard. 
This parsing is a duplication of the one in the gold standard curation, diff --git a/tests/dataset/test_study_locus.py b/tests/dataset/test_study_locus.py index b01b4f63d..037ede068 100644 --- a/tests/dataset/test_study_locus.py +++ b/tests/dataset/test_study_locus.py @@ -161,6 +161,58 @@ def test_find_overlaps( ) +@pytest.mark.parametrize( + "study_type, expected_sl_count", [("gwas", 1), ("eqtl", 1), ("pqtl", 0)] +) +def test_filter_by_study_type( + spark: SparkSession, study_type: str, expected_sl_count: int +) -> None: + """Test filter by study type.""" + # Input data + sl = StudyLocus( + _df=spark.createDataFrame( + [ + { + # from gwas + "studyLocusId": 1, + "variantId": "lead1", + "studyId": "study1", + }, + { + # from eqtl + "studyLocusId": 2, + "variantId": "lead2", + "studyId": "study2", + }, + ], + StudyLocus.get_schema(), + ), + _schema=StudyLocus.get_schema(), + ) + studies = StudyIndex( + _df=spark.createDataFrame( + [ + { + "studyId": "study1", + "studyType": "gwas", + "traitFromSource": "trait1", + "projectId": "project1", + }, + { + "studyId": "study2", + "studyType": "eqtl", + "traitFromSource": "trait2", + "projectId": "project2", + }, + ] + ), + _schema=StudyIndex.get_schema(), + ) + + observed = sl.filter_by_study_type(study_type, studies) + assert observed.df.count() == expected_sl_count + + def test_filter_credible_set(mock_study_locus: StudyLocus) -> None: """Test credible interval filter.""" assert isinstance(