diff --git a/README.md b/README.md index 8329c4c02..3a1971ef1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,30 @@ -[![status: experimental](https://github.com/GIScience/badges/raw/master/status/experimental.svg)](https://github.com/GIScience/badges#experimental) -[![docs](https://github.com/opentargets/gentropy/actions/workflows/docs.yaml/badge.svg)](https://opentargets.github.io/gentropy/) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +[![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) [![codecov](https://codecov.io/gh/opentargets/gentropy/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/gentropy) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10527086.svg)](https://doi.org/10.5281/zenodo.10527086) -# Genetics Portal Data Pipeline (experimental) +
+ +
+ +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. + +## Installation + +We recommend installing Open Targets Gentropy using PyPI: + +```bash +pip install gentropy +``` + +For alternative ways to install the package, visit the [Documentation](https://opentargets.github.io/gentropy/installation/) + +## References - [Documentation](https://opentargets.github.io/gentropy/) +- [Issue tracker](https://github.com/opentargets/issues/issues) + +## About Open Targets + +Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org). 
diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 8e588ec62..fe526f906 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,34 +1,47 @@ # Release specific configuration: release_version: "24.01" +dev_version: XX.XX release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input -outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.version} +static_assets: gs://genetics_etl_python_playground/static_assets +outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version} + +## Datasets: +gwas_catalog_dataset: gs://gwas_catalog_data +# Ingestion input files: +gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv +gwas_catalog_studies: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_studies.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_studies.tsv +gwas_catalog_ancestries: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_ancestries.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_ancestries.tsv +gwas_catalog_sumstats_lut: ${datasets.gwas_catalog_dataset}/curated_inputs/harmonised_list.txt +gwas_catalog_study_curation: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_study_curation.tsv +# Harmonised summary statistics list: +gwas_catalog_summary_stats_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_harmonised_sumstats_list.txt +# Inclusion lists: +gwas_catalog_curated_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_curated_included_studies +gwas_catalog_summary_satistics_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_summary_statistics_included_studies # TODO(review): key name has a typo ("satistics"); rename together with all references to it +# Ingestion output folders: +gwas_catalog_study_index: 
${datasets.gwas_catalog_dataset}/study_index +gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets +gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets # Input datasets -chain_37_38: ${datasets.inputs}/v2g_input/grch37_to_grch38.over.chain -target_index: ${datasets.inputs}/v2g_input/targets_correct_tss -vep_consequences: gs://genetics-portal-data/lut/vep_consequences.tsv -anderson: gs://genetics-portal-input/v2g_input/andersson2014/enhancer_tss_associations.bed -javierre: gs://genetics-portal-input/v2g_input/javierre_2016_preprocessed.parquet -jung: gs://genetics-portal-raw/pchic_jung2019/jung2019_pchic_tableS3.csv -thurman: gs://genetics-portal-input/v2g_input/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz -catalog_associations: ${datasets.inputs}/v2d/gwas_catalog_v1.0.2-associations_e110_r2023-12-21.tsv -catalog_studies: - # To get a complete representation of all GWAS Catalog studies, we need to - # ingest the list of unpublished studies from a different file. 
- - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-studies-r2023-12-21.tsv -catalog_ancestries: - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-ancestries-r2023-12-21.tsv -catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-12-21.txt - -gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data +chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain +vep_consequences: ${datasets.static_assets}/vep_consequences.tsv +anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed +javierre: ${datasets.static_assets}/javierre_2016_preprocessed +jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv +thurman: ${datasets.static_assets}/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz +target_index: ${datasets.release_folder}/targets # OTP 23.12 data + +gene_interactions: ${datasets.release_folder}/interaction # OTP 23.12 data eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv -# Output datasets -gene_index: ${datasets.outputs}/gene_index +# Dev output datasets variant_annotation: ${datasets.outputs}/variant_annotation study_locus: ${datasets.outputs}/study_locus summary_statistics: ${datasets.outputs}/summary_statistics @@ -37,7 +50,7 @@ study_locus_overlap: ${datasets.outputs}/study_locus_overlap ld_index: ${datasets.outputs}/ld_index catalog_study_index: ${datasets.study_index}/catalog catalog_study_locus: ${datasets.study_locus}/catalog_study_locus -gwas_catalog_study_curation: ${datasets.inputs}/v2d/GWAS_Catalog_study_curation.tsv + finngen_study_index: ${datasets.study_index}/finngen finngen_summary_stats: ${datasets.summary_statistics}/finngen from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats @@ -51,4 +64,5 @@ colocalisation: 
${datasets.release_folder}/colocalisation study_index: ${datasets.release_folder}/study_index variant_index: ${datasets.release_folder}/variant_index credible_set: ${datasets.release_folder}/credible_set +gene_index: ${datasets.release_folder}/gene_index v2g: ${datasets.release_folder}/variant_to_gene diff --git a/config/step/ot_colocalisation.yaml b/config/step/ot_colocalisation.yaml index fbfe82989..4433595ef 100644 --- a/config/step/ot_colocalisation.yaml +++ b/config/step/ot_colocalisation.yaml @@ -1,6 +1,6 @@ defaults: - colocalisation -credible_set_path: ${datasets.study_locus} +credible_set_path: ${datasets.credible_set} study_index_path: ${datasets.study_index} coloc_path: ${datasets.colocalisation} diff --git a/config/step/ot_gwas_catalog_ingestion.yaml b/config/step/ot_gwas_catalog_ingestion.yaml index 65606b7e4..fc82b82c2 100644 --- a/config/step/ot_gwas_catalog_ingestion.yaml +++ b/config/step/ot_gwas_catalog_ingestion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_ingestion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} variant_annotation_path: ${datasets.variant_annotation} -catalog_studies_out: ${datasets.catalog_study_index} -catalog_associations_out: ${datasets.catalog_study_locus} +catalog_studies_out: ${datasets.gwas_catalog_study_index} +catalog_associations_out: ${datasets.gwas_catalog_study_locus_folder}/gwas_catalog_curated_associations gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -inclusion_list_path: ??? 
+inclusion_list_path: ${datasets.gwas_catalog_curated_inclusion_list} diff --git a/config/step/ot_gwas_catalog_study_curation.yaml b/config/step/ot_gwas_catalog_study_curation.yaml index eb6c0ec78..77c1d7834 100644 --- a/config/step/ot_gwas_catalog_study_curation.yaml +++ b/config/step/ot_gwas_catalog_study_curation.yaml @@ -1,8 +1,8 @@ defaults: - gwas_catalog_study_curation -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} gwas_catalog_study_curation_out: ??? diff --git a/config/step/ot_gwas_catalog_study_inclusion.yaml b/config/step/ot_gwas_catalog_study_inclusion.yaml index 8a560127e..7f3bf80b3 100644 --- a/config/step/ot_gwas_catalog_study_inclusion.yaml +++ b/config/step/ot_gwas_catalog_study_inclusion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_study_inclusion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} variant_annotation_path: ${datasets.variant_annotation} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -harmonised_study_file: ??? +harmonised_study_file: ${datasets.gwas_catalog_summary_stats_list} criteria: ??? inclusion_list_path: ??? exclusion_list_path: ??? 
diff --git a/config/step/ot_ld_based_clumping.yaml b/config/step/ot_ld_based_clumping.yaml index d25ca84b7..d02c0acdd 100644 --- a/config/step/ot_ld_based_clumping.yaml +++ b/config/step/ot_ld_based_clumping.yaml @@ -1,7 +1,7 @@ defaults: - ld_based_clumping +ld_index_path: ${datasets.ld_index} study_locus_input_path: ??? -ld_index_path: ??? study_index_path: ??? clumped_study_locus_output_path: ??? diff --git a/config/step/ot_locus_to_gene_predict.yaml b/config/step/ot_locus_to_gene_predict.yaml new file mode 100644 index 000000000..bd5c31b25 --- /dev/null +++ b/config/step/ot_locus_to_gene_predict.yaml @@ -0,0 +1,10 @@ +defaults: + - locus_to_gene + +run_mode: predict +model_path: ${datasets.l2g_model} +predictions_path: ${datasets.l2g_predictions} +credible_set_path: ${datasets.credible_set} +variant_gene_path: ${datasets.v2g} +colocalisation_path: ${datasets.colocalisation} +study_index_path: ${datasets.study_index} diff --git a/config/step/ot_locus_to_gene.yaml b/config/step/ot_locus_to_gene_train.yaml similarity index 100% rename from config/step/ot_locus_to_gene.yaml rename to config/step/ot_locus_to_gene_train.yaml diff --git a/config/step/ot_v2g.yaml b/config/step/ot_variant_to_gene.yaml similarity index 100% rename from config/step/ot_v2g.yaml rename to config/step/ot_variant_to_gene.yaml diff --git a/docs/assets/imgs/gentropy.svg b/docs/assets/imgs/gentropy.svg new file mode 100644 index 000000000..9a0186638 --- /dev/null +++ b/docs/assets/imgs/gentropy.svg @@ -0,0 +1,292 @@ + + + diff --git a/docs/development/airflow.md b/docs/development/airflow.md index b73ad614e..ff5f7906c 100644 --- a/docs/development/airflow.md +++ b/docs/development/airflow.md @@ -8,12 +8,14 @@ This section describes how to set up a local Airflow server which will orchestra - [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) !!! 
warning macOS Docker memory allocation -On macOS, the default amount of memory available for Docker might not be enough to get Airflow up and running. Allocate at least 4GB of memory for the Docker Engine (ideally 8GB). [More info](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#) + + On macOS, the default amount of memory available for Docker might not be enough to get Airflow up and running. Allocate at least 4GB of memory for the Docker Engine (ideally 8GB). [More info](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#) ## Configure Airflow access to Google Cloud Platform !!! warning Specifying Google Cloud parameters -Run the next two command with the appropriate Google Cloud project ID and service account name to ensure the correct Google default application credentials are set up. + + Run the next two command with the appropriate Google Cloud project ID and service account name to ensure the correct Google default application credentials are set up. Authenticate to Google Cloud: @@ -38,7 +40,8 @@ cd src/airflow ### Build Docker image !!! note Custom Docker image for Airflow -The custom Dockerfile built by the command below extends the official [Airflow Docker Compose YAML](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml). We add support for Google Cloud SDK, Google Dataproc operators, and access to GCP credentials. + + The custom Dockerfile built by the command below extends the official [Airflow Docker Compose YAML](https://airflow.apache.org/docs/apache-airflow/stable/docker-compose.yaml). We add support for Google Cloud SDK, Google Dataproc operators, and access to GCP credentials. ```bash docker build . --tag extending_airflow:latest @@ -47,7 +50,8 @@ docker build . --tag extending_airflow:latest ### Set Airflow user ID !!! 
note Setting Airflow user ID -These commands allow Airflow running inside Docker to access the credentials file which was generated earlier. + + These commands allow Airflow running inside Docker to access the credentials file which was generated earlier. ```bash # If any user ID is already specified in .env, remove it. diff --git a/docs/development/contributing.md b/docs/development/contributing.md index 1b4e5451f..a12ac4951 100644 --- a/docs/development/contributing.md +++ b/docs/development/contributing.md @@ -8,7 +8,7 @@ title: Contributing guidelines The steps in this section only ever need to be done once on any particular system. -Google Cloud configuration: +For Google Cloud configuration: 1. Install Google Cloud SDK: https://cloud.google.com/sdk/docs/install. diff --git a/docs/howto/run_step_in_cli.md b/docs/howto/run_step_in_cli.md index 07ede526d..965c7e079 100644 --- a/docs/howto/run_step_in_cli.md +++ b/docs/howto/run_step_in_cli.md @@ -41,4 +41,4 @@ In most occassions, some mandatory values will be required to run the step. For gentropy step=gene_index step.target_path=/path/to/target step.gene_index_path=/path/to/gene_index ``` -You can find more about the available steps in the [documentation](../python_api/step/_step.md). +You can find more about the available steps in the [documentation](../python_api/steps/_steps.md). diff --git a/docs/howto/run_step_using_config.md b/docs/howto/run_step_using_config.md index 926cdbf45..bbce9a195 100644 --- a/docs/howto/run_step_using_config.md +++ b/docs/howto/run_step_using_config.md @@ -7,7 +7,8 @@ Title: Run step using config It's possible to parametrise the functionality of a step using a YAML configuration file. This is useful when you want to run a step multiple times with different parameters or simply to avoid having to specify the same parameters every time you run a step. !!! info Configuration files using Hydra -The package uses [Hydra](https://hydra.cc) to handle configuration files. 
For more information, please visit the [Hydra documentation](https://hydra.cc/docs/intro/). + + The package uses [Hydra](https://hydra.cc) to handle configuration files. For more information, please visit the [Hydra documentation](https://hydra.cc/docs/intro/). To run a step using a configuration file, you need to create a configuration file in YAML format. diff --git a/docs/index.md b/docs/index.md index 2efc84814..26070ab3d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,13 +1,13 @@ --- -title: Open Targets Genetics +title: Open Targets Gentropy hide: - navigation - toc --- -- -
+ + + ---- + -[![status: experimental](https://github.com/GIScience/badges/raw/master/status/experimental.svg)](https://github.com/GIScience/badges#experimental) -![docs](https://github.com/opentargets/gentropy/actions/workflows/docs.yaml/badge.svg) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +[![image](https://github.com/opentargets/gentropy/actions/workflows/release.yaml/badge.svg)](https://opentargets.github.io/gentropy/) [![codecov](https://codecov.io/gh/opentargets/gentropy/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/gentropy) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/gentropy/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/gentropy) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10527086.svg)](https://doi.org/10.5281/zenodo.10527086) --- -Ingestion and analysis of genetic and functional genomic data for the identification and prioritisation of drug targets. +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. This package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. -This project is still in experimental phase. Please refer to the [roadmap section](roadmap.md) for more information. +## About Open Targets -For all development information, including running the code, troubleshooting, or contributing, see the [development section](development/_development.md). +Open Targets is a pre-competitive, public-private partnership that uses human genetics and genomics data to systematically identify and prioritise drug targets. 
Through large-scale genomic experiments and the development of innovative computational techniques, the partnership aims to help researchers select the best targets for the development of new therapies. For more information, visit the Open Targets [website](https://www.opentargets.org). diff --git a/docs/installation.md b/docs/installation.md index bccb6684d..07c5493f0 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -7,4 +7,26 @@ hide: # Installation -TBC +!!! note Python compatibility + + In the early stages of development, we are using Python version 3.10. We recommend using [pyenv](https://github.com/pyenv/pyenv) or similar tools to manage your local Python version. We intend to support more Python versions in the future. + +## PyPI + +We recommend installing Open Targets Gentropy using PyPI: + +```bash +pip install gentropy +``` + +## Source + +Alternatively, you can install Open Targets Gentropy from source. Check the [contributing](development/contributing.md) section for more information. + +## Uninstall + +```bash +pip uninstall gentropy -y +``` + +For any issues with the installation, check the [troubleshooting section](development/troubleshooting.md). diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md index e69de29bb..9952aa56f 100644 --- a/docs/python_api/_python_api.md +++ b/docs/python_api/_python_api.md @@ -0,0 +1,12 @@ +--- +title: Python API +--- + +Open Targets Gentropy is a Python package to facilitate the interpretation and analysis of GWAS and functional genomic studies for target identification. The package contains a toolkit for the harmonisation, statistical analysis and prioritisation of genetic signals to assist drug discovery. 
+ +The overall architecture of the package distinguishes between: + +- [**Data Sources**](datasources/_datasources.md): data sources harmonisation tools +- [**Datasets**](datasets/_datasets.md): data model +- [**Methods**](methods/_methods.md): statistical analysis tools +- [**Steps**](steps/_steps.md): pipeline steps diff --git a/docs/python_api/dataset/_dataset.md b/docs/python_api/dataset/_dataset.md deleted file mode 100644 index 4f2fdb6f6..000000000 --- a/docs/python_api/dataset/_dataset.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Dataset ---- - -::: gentropy.dataset.dataset.Dataset diff --git a/docs/python_api/datasets/_datasets.md b/docs/python_api/datasets/_datasets.md new file mode 100644 index 000000000..058ab5fa2 --- /dev/null +++ b/docs/python_api/datasets/_datasets.md @@ -0,0 +1,7 @@ +--- +title: Datasets +--- + +The Dataset classes define the data model behind Open Targets Gentropy. Every class inherits from the `Dataset` class and contains a dataframe with a predefined schema that can be found in the respective classes. 
+ +::: gentropy.dataset.dataset.Dataset diff --git a/docs/python_api/dataset/colocalisation.md b/docs/python_api/datasets/colocalisation.md similarity index 100% rename from docs/python_api/dataset/colocalisation.md rename to docs/python_api/datasets/colocalisation.md diff --git a/docs/python_api/dataset/gene_index.md b/docs/python_api/datasets/gene_index.md similarity index 100% rename from docs/python_api/dataset/gene_index.md rename to docs/python_api/datasets/gene_index.md diff --git a/docs/python_api/dataset/intervals.md b/docs/python_api/datasets/intervals.md similarity index 100% rename from docs/python_api/dataset/intervals.md rename to docs/python_api/datasets/intervals.md diff --git a/docs/python_api/dataset/l2g_feature.md b/docs/python_api/datasets/l2g_feature.md similarity index 100% rename from docs/python_api/dataset/l2g_feature.md rename to docs/python_api/datasets/l2g_feature.md diff --git a/docs/python_api/dataset/l2g_feature_matrix.md b/docs/python_api/datasets/l2g_feature_matrix.md similarity index 100% rename from docs/python_api/dataset/l2g_feature_matrix.md rename to docs/python_api/datasets/l2g_feature_matrix.md diff --git a/docs/python_api/dataset/l2g_gold_standard.md b/docs/python_api/datasets/l2g_gold_standard.md similarity index 100% rename from docs/python_api/dataset/l2g_gold_standard.md rename to docs/python_api/datasets/l2g_gold_standard.md diff --git a/docs/python_api/dataset/l2g_prediction.md b/docs/python_api/datasets/l2g_prediction.md similarity index 100% rename from docs/python_api/dataset/l2g_prediction.md rename to docs/python_api/datasets/l2g_prediction.md diff --git a/docs/python_api/dataset/ld_index.md b/docs/python_api/datasets/ld_index.md similarity index 100% rename from docs/python_api/dataset/ld_index.md rename to docs/python_api/datasets/ld_index.md diff --git a/docs/python_api/dataset/study_index.md b/docs/python_api/datasets/study_index.md similarity index 100% rename from docs/python_api/dataset/study_index.md 
rename to docs/python_api/datasets/study_index.md diff --git a/docs/python_api/dataset/study_locus.md b/docs/python_api/datasets/study_locus.md similarity index 100% rename from docs/python_api/dataset/study_locus.md rename to docs/python_api/datasets/study_locus.md diff --git a/docs/python_api/dataset/study_locus_overlap.md b/docs/python_api/datasets/study_locus_overlap.md similarity index 100% rename from docs/python_api/dataset/study_locus_overlap.md rename to docs/python_api/datasets/study_locus_overlap.md diff --git a/docs/python_api/dataset/summary_statistics.md b/docs/python_api/datasets/summary_statistics.md similarity index 100% rename from docs/python_api/dataset/summary_statistics.md rename to docs/python_api/datasets/summary_statistics.md diff --git a/docs/python_api/dataset/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md similarity index 100% rename from docs/python_api/dataset/variant_annotation.md rename to docs/python_api/datasets/variant_annotation.md diff --git a/docs/python_api/dataset/variant_index.md b/docs/python_api/datasets/variant_index.md similarity index 100% rename from docs/python_api/dataset/variant_index.md rename to docs/python_api/datasets/variant_index.md diff --git a/docs/python_api/dataset/variant_to_gene.md b/docs/python_api/datasets/variant_to_gene.md similarity index 100% rename from docs/python_api/dataset/variant_to_gene.md rename to docs/python_api/datasets/variant_to_gene.md diff --git a/docs/python_api/datasource/_datasource.md b/docs/python_api/datasource/_datasource.md deleted file mode 100644 index 9fab444bf..000000000 --- a/docs/python_api/datasource/_datasource.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Data Source ---- - -# Data Source - -This section contains information about the data sources used in Open Targets Genetics. 
- -We use GnomAD v4.0 as a source for variant annotation and GnomAD v2.1.1 as a source for linkage disequilibrium (LD) information (described in the **GnomAD** section). - -We rely on Open Targets as a source for the list of targets and the Gold Standard training set (described in the **Open Targets** section). - -## Study Sources - -1. GWAS catalog -2. FinnGen - -## Molecular QTLs - -1. eQTL catalogue - -## Interaction / Interval-based Experiments - -We integrate a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications. For more detils see section **"Intervals"**. diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md new file mode 100644 index 000000000..036dbcb37 --- /dev/null +++ b/docs/python_api/datasources/_datasources.md @@ -0,0 +1,37 @@ +--- +title: Data Sources +--- + +# Data Sources + +This section contains information about the data source harmonisation tools available in Open Targets Gentropy. + +## GWAS study sources + +1. [GWAS Catalog](gwas_catalog/_gwas_catalog.md) (with or without full summary statistics) +1. [FinnGen](finngen/_finngen.md) + +## Molecular QTLs + +1. [GTEx (eQTL catalogue)](eqtl_catalogue/_eqtl_catalogue.md) + +## Interaction / Interval-based Experiments + +1. [Intervals](intervals/_intervals.md)-based datasets, informing about the relationships between genetic elements and their functional implications. + +## Variant annotation/validation + +1. [GnomAD](gnomad/_gnomad.md) v4.0 +1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data) + +## Linkage disequilibrium + +1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrices (7 ancestries) + +## Locus-to-gene gold standard + +1. [Open Targets training set](open_targets/l2g_gold_standard.md) + +## Gene annotation + +1. 
[Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl) diff --git a/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md b/docs/python_api/datasources/eqtl_catalogue/_eqtl_catalogue.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md rename to docs/python_api/datasources/eqtl_catalogue/_eqtl_catalogue.md diff --git a/docs/python_api/datasource/eqtl_catalogue/study_index.md b/docs/python_api/datasources/eqtl_catalogue/study_index.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/study_index.md rename to docs/python_api/datasources/eqtl_catalogue/study_index.md diff --git a/docs/python_api/datasource/eqtl_catalogue/summary_stats.md b/docs/python_api/datasources/eqtl_catalogue/summary_stats.md similarity index 100% rename from docs/python_api/datasource/eqtl_catalogue/summary_stats.md rename to docs/python_api/datasources/eqtl_catalogue/summary_stats.md diff --git a/docs/python_api/datasource/finngen/_finngen.md b/docs/python_api/datasources/finngen/_finngen.md similarity index 100% rename from docs/python_api/datasource/finngen/_finngen.md rename to docs/python_api/datasources/finngen/_finngen.md diff --git a/docs/python_api/datasource/finngen/study_index.md b/docs/python_api/datasources/finngen/study_index.md similarity index 100% rename from docs/python_api/datasource/finngen/study_index.md rename to docs/python_api/datasources/finngen/study_index.md diff --git a/docs/python_api/datasources/finngen/summary_stats.md b/docs/python_api/datasources/finngen/summary_stats.md new file mode 100644 index 000000000..ba6f6c263 --- /dev/null +++ b/docs/python_api/datasources/finngen/summary_stats.md @@ -0,0 +1,5 @@ +--- +title: Summary Statistics +--- + +::: gentropy.datasource.finngen.summary_stats.FinnGenSummaryStats diff --git a/docs/python_api/datasource/gnomad/_gnomad.md b/docs/python_api/datasources/gnomad/_gnomad.md similarity index 100% rename from 
docs/python_api/datasource/gnomad/_gnomad.md rename to docs/python_api/datasources/gnomad/_gnomad.md diff --git a/docs/python_api/datasource/gnomad/gnomad_ld.md b/docs/python_api/datasources/gnomad/gnomad_ld.md similarity index 100% rename from docs/python_api/datasource/gnomad/gnomad_ld.md rename to docs/python_api/datasources/gnomad/gnomad_ld.md diff --git a/docs/python_api/datasource/gnomad/gnomad_variants.md b/docs/python_api/datasources/gnomad/gnomad_variants.md similarity index 100% rename from docs/python_api/datasource/gnomad/gnomad_variants.md rename to docs/python_api/datasources/gnomad/gnomad_variants.md diff --git a/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md b/docs/python_api/datasources/gwas_catalog/_gwas_catalog.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/_gwas_catalog.md rename to docs/python_api/datasources/gwas_catalog/_gwas_catalog.md diff --git a/docs/python_api/datasource/gwas_catalog/associations.md b/docs/python_api/datasources/gwas_catalog/associations.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/associations.md rename to docs/python_api/datasources/gwas_catalog/associations.md diff --git a/docs/python_api/datasource/gwas_catalog/study_index.md b/docs/python_api/datasources/gwas_catalog/study_index.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/study_index.md rename to docs/python_api/datasources/gwas_catalog/study_index.md diff --git a/docs/python_api/datasource/gwas_catalog/study_splitter.md b/docs/python_api/datasources/gwas_catalog/study_splitter.md similarity index 100% rename from docs/python_api/datasource/gwas_catalog/study_splitter.md rename to docs/python_api/datasources/gwas_catalog/study_splitter.md diff --git a/docs/python_api/datasource/gwas_catalog/summary_statistics.md b/docs/python_api/datasources/gwas_catalog/summary_statistics.md similarity index 100% rename from 
docs/python_api/datasource/gwas_catalog/summary_statistics.md rename to docs/python_api/datasources/gwas_catalog/summary_statistics.md diff --git a/docs/python_api/datasource/intervals/_intervals.md b/docs/python_api/datasources/intervals/_intervals.md similarity index 100% rename from docs/python_api/datasource/intervals/_intervals.md rename to docs/python_api/datasources/intervals/_intervals.md diff --git a/docs/python_api/datasource/intervals/andersson.md b/docs/python_api/datasources/intervals/andersson.md similarity index 100% rename from docs/python_api/datasource/intervals/andersson.md rename to docs/python_api/datasources/intervals/andersson.md diff --git a/docs/python_api/datasource/intervals/javierre.md b/docs/python_api/datasources/intervals/javierre.md similarity index 100% rename from docs/python_api/datasource/intervals/javierre.md rename to docs/python_api/datasources/intervals/javierre.md diff --git a/docs/python_api/datasource/intervals/jung.md b/docs/python_api/datasources/intervals/jung.md similarity index 100% rename from docs/python_api/datasource/intervals/jung.md rename to docs/python_api/datasources/intervals/jung.md diff --git a/docs/python_api/datasource/intervals/thurman.md b/docs/python_api/datasources/intervals/thurman.md similarity index 100% rename from docs/python_api/datasource/intervals/thurman.md rename to docs/python_api/datasources/intervals/thurman.md diff --git a/docs/python_api/datasource/open_targets/_open_targets.md b/docs/python_api/datasources/open_targets/_open_targets.md similarity index 100% rename from docs/python_api/datasource/open_targets/_open_targets.md rename to docs/python_api/datasources/open_targets/_open_targets.md diff --git a/docs/python_api/datasource/open_targets/l2g_gold_standard.md b/docs/python_api/datasources/open_targets/l2g_gold_standard.md similarity index 100% rename from docs/python_api/datasource/open_targets/l2g_gold_standard.md rename to 
docs/python_api/datasources/open_targets/l2g_gold_standard.md diff --git a/docs/python_api/datasource/open_targets/target.md b/docs/python_api/datasources/open_targets/target.md similarity index 100% rename from docs/python_api/datasource/open_targets/target.md rename to docs/python_api/datasources/open_targets/target.md diff --git a/docs/python_api/method/_method.md b/docs/python_api/method/_method.md deleted file mode 100644 index 94a1008c9..000000000 --- a/docs/python_api/method/_method.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Method ---- - -# Method - -TBC diff --git a/docs/python_api/method/clumping.md b/docs/python_api/method/clumping.md deleted file mode 100644 index ff996672f..000000000 --- a/docs/python_api/method/clumping.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Clumping ---- - -# Clumping - -Clumping is a commonly used post-processing method that allows for identification of independent association signals from GWAS summary statistics and curated associations. This process is critical because of the complex linkage disequilibrium (LD) structure in human populations, which can result in multiple statistically significant associations within the same genomic region. Clumping methods help reduce redundancy in GWAS results and ensure that each reported association represents an independent signal. 
- -We have implemented 2 clumping methods: - -::: gentropy.method.clump.LDclumping diff --git a/docs/python_api/method/l2g/_l2g.md b/docs/python_api/method/l2g/_l2g.md deleted file mode 100644 index d62ab9588..000000000 --- a/docs/python_api/method/l2g/_l2g.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Locus to Gene (L2G) classifier ---- - -TBC diff --git a/docs/python_api/method/pics.md b/docs/python_api/method/pics.md deleted file mode 100644 index f049ef91d..000000000 --- a/docs/python_api/method/pics.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: PICS ---- - -:::gentropy.method.pics.PICS diff --git a/docs/python_api/method/window_based_clumping.md b/docs/python_api/method/window_based_clumping.md deleted file mode 100644 index ba8252774..000000000 --- a/docs/python_api/method/window_based_clumping.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Window-based clumping ---- - -:::gentropy.method.window_based_clumping.WindowBasedClumping diff --git a/docs/python_api/methods/_methods.md b/docs/python_api/methods/_methods.md new file mode 100644 index 000000000..d6dc55ab4 --- /dev/null +++ b/docs/python_api/methods/_methods.md @@ -0,0 +1,5 @@ +--- +title: Methods +--- + +This section consists of all the methods available in the package. It provides detailed explanations and usage examples for each method. Developers can refer to this section to understand how to use the methods effectively in their code. The list of methods is constantly updated. 
diff --git a/docs/python_api/method/carma.md b/docs/python_api/methods/carma.md similarity index 100% rename from docs/python_api/method/carma.md rename to docs/python_api/methods/carma.md diff --git a/docs/python_api/methods/clumping.md b/docs/python_api/methods/clumping.md new file mode 100644 index 000000000..6cc368013 --- /dev/null +++ b/docs/python_api/methods/clumping.md @@ -0,0 +1,22 @@ +--- +title: Clumping +--- + +# Clumping + +Clumping is a commonly used post-processing method that allows for the identification of independent association signals from GWAS summary statistics and curated associations. This process is critical because of the complex linkage disequilibrium (LD) structure in human populations, which can result in multiple statistically significant associations within the same genomic region. Clumping methods help reduce redundancy in GWAS results and ensure that each reported association represents an independent signal. + +We have implemented two clumping methods: + +1. **Distance-based clumping:** Uses genomic window to clump the significant SNPs into one hit. +2. **LD-based clumping:** Uses genomic window and LD to clump the significant SNPs into one hit. + +The algorithmic logic is similar to classic clumping approaches from PLINK (Reference: [PLINK Clump Documentation](https://zzz.bwh.harvard.edu/plink/clump.shtml)). 
See details below: + +# Distance-based clumping + +::: gentropy.method.window_based_clumping.WindowBasedClumping + +# LD-based clumping: + +::: gentropy.method.clump.LDclumping diff --git a/docs/python_api/method/coloc.md b/docs/python_api/methods/coloc.md similarity index 100% rename from docs/python_api/method/coloc.md rename to docs/python_api/methods/coloc.md diff --git a/docs/python_api/method/ecaviar.md b/docs/python_api/methods/ecaviar.md similarity index 100% rename from docs/python_api/method/ecaviar.md rename to docs/python_api/methods/ecaviar.md diff --git a/docs/python_api/methods/l2g/_l2g.md b/docs/python_api/methods/l2g/_l2g.md new file mode 100644 index 000000000..fca3ba79d --- /dev/null +++ b/docs/python_api/methods/l2g/_l2g.md @@ -0,0 +1,24 @@ +--- +title: Locus to Gene (L2G) model +--- + +The **“locus-to-gene” (L2G)** model derives features to prioritize likely causal genes at each GWAS locus based on genetic and functional genomics features. The main categories of predictive features are: + +- **Distance:** (from credible set variants to gene) +- **Molecular QTL Colocalization** +- **Chromatin Interaction:** (e.g., promoter-capture Hi-C) +- **Variant Pathogenicity:** (from VEP) + +The L2G model is distinct from the variant-to-gene (V2G) pipeline in that it: + +- Uses a machine-learning model to learn the weights of each evidence source based on a gold standard of previously identified causal genes. +- Relies upon fine-mapping and colocalization data. + +Some of the predictive features weight variant-to-gene (or genomic region-to-gene) evidence based on the posterior probability that the variant is causal, determined through fine-mapping of the GWAS association. 
+ +Details of the L2G model are provided in our Nature Genetics publication (ref - [Nature Genetics Publication](https://www.nature.com/articles/s41588-021-00945-5)): + +- **Title:** An open approach to systematically prioritize causal variants and genes at all published human GWAS trait-associated loci. +- **Authors:** Mountjoy, E., Schmidt, E.M., Carmona, M. et al. +- **Journal:** Nat Genet 53, 1527–1533 (2021). +- **DOI:** [10.1038/s41588-021-00945-5](https://doi.org/10.1038/s41588-021-00945-5) diff --git a/docs/python_api/method/l2g/evaluator.md b/docs/python_api/methods/l2g/evaluator.md similarity index 100% rename from docs/python_api/method/l2g/evaluator.md rename to docs/python_api/methods/l2g/evaluator.md diff --git a/docs/python_api/method/l2g/feature_factory.md b/docs/python_api/methods/l2g/feature_factory.md similarity index 100% rename from docs/python_api/method/l2g/feature_factory.md rename to docs/python_api/methods/l2g/feature_factory.md diff --git a/docs/python_api/method/l2g/model.md b/docs/python_api/methods/l2g/model.md similarity index 100% rename from docs/python_api/method/l2g/model.md rename to docs/python_api/methods/l2g/model.md diff --git a/docs/python_api/method/l2g/trainer.md b/docs/python_api/methods/l2g/trainer.md similarity index 100% rename from docs/python_api/method/l2g/trainer.md rename to docs/python_api/methods/l2g/trainer.md diff --git a/docs/python_api/method/ld_annotator.md b/docs/python_api/methods/ld_annotator.md similarity index 100% rename from docs/python_api/method/ld_annotator.md rename to docs/python_api/methods/ld_annotator.md diff --git a/docs/python_api/methods/pics.md b/docs/python_api/methods/pics.md new file mode 100644 index 000000000..41de539a0 --- /dev/null +++ b/docs/python_api/methods/pics.md @@ -0,0 +1,13 @@ +--- +title: PICS +--- + +**PICS Overview:** + +PICS is a fine-mapping method designed to identify the most likely causal SNPs associated with a trait or disease within a genomic region. 
It leverages both haplotype information and the observed association patterns from genome-wide association studies (GWAS). + +Please refer to the original publication for in-depth details: [PICS Publication](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4336207/). + +We use PICS for both GWAS clumping results and GWAS curated studies. + +:::gentropy.method.pics.PICS diff --git a/docs/python_api/step/_step.md b/docs/python_api/step/_step.md deleted file mode 100644 index 987f31b91..000000000 --- a/docs/python_api/step/_step.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Step ---- - -# Step - -TBC diff --git a/docs/python_api/step/ld_clump.md b/docs/python_api/step/ld_clump.md deleted file mode 100644 index 75097bd8c..000000000 --- a/docs/python_api/step/ld_clump.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: LD-based clumping ---- - -::: gentropy.ld_based_clumping.LdBasedClumpingStep diff --git a/docs/python_api/steps/_steps.md b/docs/python_api/steps/_steps.md new file mode 100644 index 000000000..9c6d7679c --- /dev/null +++ b/docs/python_api/steps/_steps.md @@ -0,0 +1,7 @@ +--- +title: Step +--- + +# Step + +This section provides description for the `Step` class. Each `Step` uses its own set of Methods and Datasets and implements the logic necessary to read a set of inputs, perform the transformation and write the outputs. All steps are available through the command line interface when running the `gentropy` command. 
diff --git a/docs/python_api/step/colocalisation.md b/docs/python_api/steps/colocalisation.md similarity index 71% rename from docs/python_api/step/colocalisation.md rename to docs/python_api/steps/colocalisation.md index 17a0dfd9a..76ecf7150 100644 --- a/docs/python_api/step/colocalisation.md +++ b/docs/python_api/steps/colocalisation.md @@ -1,5 +1,5 @@ --- -title: Colocalisation +title: colocalisation --- ::: gentropy.colocalisation.ColocalisationStep diff --git a/docs/python_api/step/eqtl_catalogue.md b/docs/python_api/steps/eqtl_catalogue.md similarity index 71% rename from docs/python_api/step/eqtl_catalogue.md rename to docs/python_api/steps/eqtl_catalogue.md index 32f2aa257..17ea6a6f6 100644 --- a/docs/python_api/step/eqtl_catalogue.md +++ b/docs/python_api/steps/eqtl_catalogue.md @@ -1,5 +1,5 @@ --- -title: eQTL Catalogue +title: eqtl_catalogue --- ::: gentropy.eqtl_catalogue.EqtlCatalogueStep diff --git a/docs/python_api/step/finngen_studies.md b/docs/python_api/steps/finngen_studies.md similarity index 71% rename from docs/python_api/step/finngen_studies.md rename to docs/python_api/steps/finngen_studies.md index 09da6f7c8..1ec4394f4 100644 --- a/docs/python_api/step/finngen_studies.md +++ b/docs/python_api/steps/finngen_studies.md @@ -1,5 +1,5 @@ --- -title: FinnGen Studies +title: finngen_studies --- ::: gentropy.finngen_studies.FinnGenStudiesStep diff --git a/docs/python_api/step/finngen_sumstat_preprocess.md b/docs/python_api/steps/finngen_sumstat_preprocess.md similarity index 66% rename from docs/python_api/step/finngen_sumstat_preprocess.md rename to docs/python_api/steps/finngen_sumstat_preprocess.md index 17b44b95e..57f27658e 100644 --- a/docs/python_api/step/finngen_sumstat_preprocess.md +++ b/docs/python_api/steps/finngen_sumstat_preprocess.md @@ -1,5 +1,5 @@ --- -title: FinnGen Preprocess Summary Stats +title: finngen_sumstat_preprocess --- ::: gentropy.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep diff --git 
a/docs/python_api/step/gene_index.md b/docs/python_api/steps/gene_index.md similarity index 72% rename from docs/python_api/step/gene_index.md rename to docs/python_api/steps/gene_index.md index bae77b5aa..a0808dcad 100644 --- a/docs/python_api/step/gene_index.md +++ b/docs/python_api/steps/gene_index.md @@ -1,5 +1,5 @@ --- -title: Gene Index +title: gene_index --- ::: gentropy.gene_index.GeneIndexStep diff --git a/docs/python_api/step/gwas_catalog_curation.md b/docs/python_api/steps/gwas_catalog_curation.md similarity index 58% rename from docs/python_api/step/gwas_catalog_curation.md rename to docs/python_api/steps/gwas_catalog_curation.md index 51aa72970..8512e39fc 100644 --- a/docs/python_api/step/gwas_catalog_curation.md +++ b/docs/python_api/steps/gwas_catalog_curation.md @@ -1,5 +1,5 @@ --- -title: Apply in-house curation on GWAS Catalog studies +title: gwas_catalog_study_curation --- ::: gentropy.gwas_catalog_study_curation.GWASCatalogStudyCurationStep diff --git a/docs/python_api/step/gwas_catalog_inclusion.md b/docs/python_api/steps/gwas_catalog_inclusion.md similarity index 51% rename from docs/python_api/step/gwas_catalog_inclusion.md rename to docs/python_api/steps/gwas_catalog_inclusion.md index 43abbb171..e9ede6dd6 100644 --- a/docs/python_api/step/gwas_catalog_inclusion.md +++ b/docs/python_api/steps/gwas_catalog_inclusion.md @@ -1,5 +1,5 @@ --- -title: Generate inclusion and exclusions lists for GWAS Catalog study ingestion. 
+title: gwas_catalog_study_inclusion --- ::: gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator diff --git a/docs/python_api/step/gwas_catalog_ingestion.md b/docs/python_api/steps/gwas_catalog_ingestion.md similarity index 70% rename from docs/python_api/step/gwas_catalog_ingestion.md rename to docs/python_api/steps/gwas_catalog_ingestion.md index 17df73fec..69ea92479 100644 --- a/docs/python_api/step/gwas_catalog_ingestion.md +++ b/docs/python_api/steps/gwas_catalog_ingestion.md @@ -1,5 +1,5 @@ --- -title: GWAS Catalog +title: gwas_catalog_ingestion --- ::: gentropy.gwas_catalog_ingestion.GWASCatalogIngestionStep diff --git a/docs/python_api/step/gwas_catalog_sumstat_preprocess.md b/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md similarity index 69% rename from docs/python_api/step/gwas_catalog_sumstat_preprocess.md rename to docs/python_api/steps/gwas_catalog_sumstat_preprocess.md index 39c64a882..3b0422050 100644 --- a/docs/python_api/step/gwas_catalog_sumstat_preprocess.md +++ b/docs/python_api/steps/gwas_catalog_sumstat_preprocess.md @@ -1,5 +1,5 @@ --- -title: GWAS Catalog sumstat preprocess +title: gwas_catalog_sumstat_preprocess --- ::: gentropy.gwas_catalog_sumstat_preprocess.GWASCatalogSumstatsPreprocessStep diff --git a/docs/python_api/step/l2g.md b/docs/python_api/steps/l2g.md similarity index 60% rename from docs/python_api/step/l2g.md rename to docs/python_api/steps/l2g.md index d2b9e290e..847569e36 100644 --- a/docs/python_api/step/l2g.md +++ b/docs/python_api/steps/l2g.md @@ -1,5 +1,5 @@ --- -title: Locus-to-gene (L2G) +title: locus_to_gene --- ::: gentropy.l2g.LocusToGeneStep diff --git a/docs/python_api/steps/ld_clump.md b/docs/python_api/steps/ld_clump.md new file mode 100644 index 000000000..fea44f807 --- /dev/null +++ b/docs/python_api/steps/ld_clump.md @@ -0,0 +1,5 @@ +--- +title: ld_based_clumping +--- + +::: gentropy.ld_based_clumping.LDBasedClumpingStep diff --git a/docs/python_api/step/ld_index.md 
b/docs/python_api/steps/ld_index.md similarity index 72% rename from docs/python_api/step/ld_index.md rename to docs/python_api/steps/ld_index.md index eba826266..bf8b9b58e 100644 --- a/docs/python_api/step/ld_index.md +++ b/docs/python_api/steps/ld_index.md @@ -1,5 +1,5 @@ --- -title: LD Index +title: ld_index --- ::: gentropy.ld_index.LDIndexStep diff --git a/docs/python_api/step/pics.md b/docs/python_api/steps/pics.md similarity index 75% rename from docs/python_api/step/pics.md rename to docs/python_api/steps/pics.md index 5654489f6..aacc6fbaf 100644 --- a/docs/python_api/step/pics.md +++ b/docs/python_api/steps/pics.md @@ -1,5 +1,5 @@ --- -title: PICS +title: pics --- ::: gentropy.pics.PICSStep diff --git a/docs/python_api/step/variant_annotation_step.md b/docs/python_api/steps/variant_annotation_step.md similarity index 70% rename from docs/python_api/step/variant_annotation_step.md rename to docs/python_api/steps/variant_annotation_step.md index e8d7c2c3f..e65a071b2 100644 --- a/docs/python_api/step/variant_annotation_step.md +++ b/docs/python_api/steps/variant_annotation_step.md @@ -1,5 +1,5 @@ --- -title: Variant Annotation +title: variant_annotation --- ::: gentropy.variant_annotation.VariantAnnotationStep diff --git a/docs/python_api/step/variant_index_step.md b/docs/python_api/steps/variant_index_step.md similarity index 71% rename from docs/python_api/step/variant_index_step.md rename to docs/python_api/steps/variant_index_step.md index e38fd8206..8a36f097f 100644 --- a/docs/python_api/step/variant_index_step.md +++ b/docs/python_api/steps/variant_index_step.md @@ -1,5 +1,5 @@ --- -title: Variant Index +title: variant_index --- ::: gentropy.variant_index.VariantIndexStep diff --git a/docs/python_api/step/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md similarity index 59% rename from docs/python_api/step/variant_to_gene_step.md rename to docs/python_api/steps/variant_to_gene_step.md index 16db10e8e..1a3e56af8 100644 --- 
a/docs/python_api/step/variant_to_gene_step.md +++ b/docs/python_api/steps/variant_to_gene_step.md @@ -1,5 +1,5 @@ --- -title: Variant-to-gene +title: variant_to_gene --- ::: gentropy.v2g.V2GStep diff --git a/docs/python_api/step/window_based_clumping.md b/docs/python_api/steps/window_based_clumping.md similarity index 70% rename from docs/python_api/step/window_based_clumping.md rename to docs/python_api/steps/window_based_clumping.md index bbcd2c0d8..f33057c71 100644 --- a/docs/python_api/step/window_based_clumping.md +++ b/docs/python_api/steps/window_based_clumping.md @@ -1,5 +1,5 @@ --- -title: Window-based clumping +title: window_based_clumping --- ::: gentropy.window_based_clumping.WindowBasedClumpingStep diff --git a/mkdocs.yml b/mkdocs.yml index ad928ec7b..d25076a73 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Open Targets Genetics +site_name: Open Targets Gentropy nav: - Home: index.md diff --git a/pyproject.toml b/pyproject.toml index 537021fcb..b44832f52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "gentropy" # !! 
version is managed by semantic_release version = "0.1.0-rc.2" -description = "Open targets Genetics Portal Python ETL" +description = "Open Targets python framework for post-GWAS analysis" authors = ["Open Targets core team"] license = "Apache-2.0" readme = "README.md" @@ -13,6 +13,7 @@ packages = [{ include = "gentropy", from = "src" }] [tool.poetry.urls] "Bug Tracker" = "http://github.com/opentargets/issues" "Funding" = "https://www.opentargets.org" +"Documentation" = "https://opentargets.github.io/gentropy/" [tool.poetry.scripts] gentropy = "gentropy.cli:main" diff --git a/src/airflow/dags/common_airflow.py b/src/airflow/dags/common_airflow.py index 9ed7a81b1..e3dc56ccb 100644 --- a/src/airflow/dags/common_airflow.py +++ b/src/airflow/dags/common_airflow.py @@ -61,7 +61,7 @@ def create_cluster( cluster_name: str, - master_machine_type: str = "n1-highmem-8", + master_machine_type: str = "n1-highmem-16", worker_machine_type: str = "n1-standard-16", num_workers: int = 2, num_preemptible_workers: int = 0, diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml index e1c9ea627..0b634caaa 100644 --- a/src/airflow/dags/configs/dag.yaml +++ b/src/airflow/dags/configs/dag.yaml @@ -1,18 +1,18 @@ - id: "ot_gene_index" -- id: "ot_gwas_catalog" - id: "ot_variant_index" - prerequisites: - - "ot_gwas_catalog" - id: "ot_variant_to_gene" prerequisites: - "ot_variant_index" - "ot_gene_index" -- id: "ot_study_locus_overlap" +- id: "ot_colocalisation" +- id: "ot_locus_to_gene_train" prerequisites: - - "ot_gwas_catalog" -- id: "ot_locus_to_gene" + - "ot_variant_index" + - "ot_variant_to_gene" + - "ot_colocalisation" +- id: "ot_locus_to_gene_predict" prerequisites: - - "ot_gwas_catalog" + - "ot_locus_to_gene_train" - "ot_variant_index" - "ot_variant_to_gene" - - "ot_study_locus_overlap" + - "ot_colocalisation" diff --git a/src/airflow/dags/dag_preprocess.py b/src/airflow/dags/dag_preprocess.py index 728d5932f..4439914c5 100644 --- 
a/src/airflow/dags/dag_preprocess.py +++ b/src/airflow/dags/dag_preprocess.py @@ -9,10 +9,8 @@ CLUSTER_NAME = "otg-preprocess" ALL_STEPS = [ - "ot_eqtl_catalogue", "ot_ld_index", "ot_variant_annotation", - "ot_ukbiobank", ] diff --git a/src/airflow/dags/gwas_catalog_harmonisation.py b/src/airflow/dags/gwas_catalog_harmonisation.py index 7e7790224..5713e223d 100644 --- a/src/airflow/dags/gwas_catalog_harmonisation.py +++ b/src/airflow/dags/gwas_catalog_harmonisation.py @@ -14,7 +14,9 @@ CLUSTER_NAME = "otg-gwascatalog-harmonisation" AUTOSCALING = "gwascatalog-harmonisation" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" +SUMMARY_STATS_BUCKET_NAME = "gwas_catalog_data" +RAW_SUMMARY_STATISTICS_PREFIX = "raw_summary_statistics" +HARMONISED_SUMMARY_STATISTICS_PREFIX = "harmonised_summary_statistics" with DAG( dag_id=Path(__file__).stem, @@ -26,14 +28,14 @@ list_inputs = GCSListObjectsOperator( task_id="list_raw_harmonised", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="raw-harmonised", + prefix=RAW_SUMMARY_STATISTICS_PREFIX, match_glob="**/*.h.tsv.gz", ) # List parquet files that have been previously processed list_outputs = GCSListObjectsOperator( task_id="list_harmonised_parquet", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + prefix=HARMONISED_SUMMARY_STATISTICS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,11 +61,15 @@ def create_to_do_list(**kwargs: Any) -> Any: print("Number of parquet files: ", len(parquets)) # noqa: T201 for path in raw_harmonised: match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + path, ) if match_result: study_id = match_result.group(2) - if f"harmonised/{study_id}.parquet/_SUCCESS" not in parquets: + if ( + f"{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet/_SUCCESS" + not in parquets + ): to_do_list.append(path) print("Number of jobs to submit: ", len(to_do_list)) # noqa: T201 
ti.xcom_push(key="to_do_list", value=to_do_list) @@ -85,7 +91,8 @@ def submit_jobs(**kwargs: Any) -> None: time.sleep(60) input_path = todo[i] match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + input_path, ) if match_result: study_id = match_result.group(2) @@ -95,7 +102,7 @@ def submit_jobs(**kwargs: Any) -> None: step_id="ot_gwas_catalog_sumstat_preprocess", other_args=[ f"step.raw_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{input_path}", - f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/harmonised/{study_id}.parquet", + f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet", ], ) diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py index 33d4062a9..9df22586a 100644 --- a/src/airflow/dags/gwas_catalog_preprocess.py +++ b/src/airflow/dags/gwas_catalog_preprocess.py @@ -13,11 +13,44 @@ CLUSTER_NAME = "otg-preprocess-gwascatalog" AUTOSCALING = "otg-preprocess-gwascatalog" -RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" -RELEASEBUCKET_NAME = "genetics_etl_python_playground" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" -SUMSTATS = "gs://open-targets-gwas-summary-stats/harmonised" -MANIFESTS_PATH = f"{RELEASEBUCKET}/manifests/" +# Setting up bucket name and output object names: +GWAS_CATALOG_BUCKET_NAME = "gwas_catalog_data" +HARMONISED_SUMSTATS_PREFIX = "harmonised_summary_statistics" + +# Manifest paths: +MANIFESTS_PATH = f"gs://{GWAS_CATALOG_BUCKET_NAME}/manifests/" + +# The name of the manifest files have to be consistent with the config file: +HARMONISED_SUMSTATS_LIST_OBJECT_NAME = ( + "manifests/gwas_catalog_harmonised_sumstats_list.txt" +) +HARMONISED_SUMSTATS_LIST_FULL_NAME = ( + 
f"gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_LIST_OBJECT_NAME}" +) +CURATION_INCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curated_included_studies" +CURATION_EXCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curation_excluded_studies" +SUMMARY_STATISTICS_INCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_included_studies" +) +SUMMARY_STATISTICS_EXCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_excluded_studies" +) + +# Study index: +STUDY_INDEX = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_index" + +# Study loci: +CURATED_STUDY_LOCI = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations" +CURATED_LD_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations_ld_clumped" +WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped" +LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped" +# Credible sets: +CURATED_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated" +) +SUMMARY_STATISTICS_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats" +) def upload_harmonized_study_list( @@ -48,8 +81,8 @@ def upload_harmonized_study_list( # Getting list of folders (each a gwas study with summary statistics) list_harmonised_sumstats = GCSListObjectsOperator( task_id="list_harmonised_parquet", - bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + bucket=GWAS_CATALOG_BUCKET_NAME, + prefix=HARMONISED_SUMSTATS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,8 +92,8 @@ def upload_harmonized_study_list( python_callable=upload_harmonized_study_list, op_kwargs={ "concatenated_studies": '{{ "\n".join(ti.xcom_pull( key="return_value", task_ids="list_harmonised_parquet")) }}', - "bucket_name": RELEASEBUCKET_NAME, - "object_name": 
"output/python_etl/parquet/XX.XX/manifests/harmonised_sumstats.txt", + "bucket_name": GWAS_CATALOG_BUCKET_NAME, + "object_name": HARMONISED_SUMSTATS_LIST_OBJECT_NAME, }, ) @@ -73,9 +106,9 @@ def upload_harmonized_study_list( task_id="catalog_curation_inclusion_list", other_args=[ "step.criteria=curation", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_curation", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={CURATION_INCLUSION_NAME}", + f"step.exclusion_list_path={CURATION_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -84,7 +117,7 @@ def upload_harmonized_study_list( cluster_name=CLUSTER_NAME, step_id="ot_gwas_catalog_ingestion", task_id="ingest_curated_gwas_catalog_data", - other_args=[f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation"], + other_args=[f"step.inclusion_list_path={CURATION_INCLUSION_NAME}"], ) # Run LD-annotation and clumping on curated data: @@ -93,10 +126,9 @@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_curation_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/catalog_curated", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", + f"step.study_locus_input_path={CURATED_STUDY_LOCI}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={CURATED_LD_CLUMPED}", ], ) @@ -106,8 +138,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_curation_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/catalog_curated", + 
f"step.study_locus_ld_annotated_in={CURATED_LD_CLUMPED}", + f"step.picsed_study_locus_out={CURATED_CREDIBLE_SETS}", ], ) @@ -130,9 +162,9 @@ def upload_harmonized_study_list( task_id="catalog_sumstats_inclusion_list", other_args=[ "step.criteria=summary_stats", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_sumstats", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.exclusion_list_path={SUMMARY_STATISTICS_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -142,9 +174,9 @@ def upload_harmonized_study_list( step_id="ot_window_based_clumping", task_id="catalog_sumstats_window_clumping", other_args=[ - f"step.summary_statistics_input_path={SUMSTATS}", - f"step.study_locus_output_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", + f"step.summary_statistics_input_path=gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_PREFIX}", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}", ], ) @@ -154,10 +186,9 @@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_sumstats_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", + f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={LD_BASED_CLUMPED}", ], ) @@ -167,8 +198,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_sumstats_pics", 
other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/from_sumstats/catalog", + f"step.study_locus_ld_annotated_in={LD_BASED_CLUMPED}", + f"step.picsed_study_locus_out={SUMMARY_STATISTICS_CREDIBLE_SETS}", ], ) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index ba213a349..af14a54bc 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -34,12 +34,9 @@ class StepConfig: class ColocalisationConfig(StepConfig): """Colocalisation step configuration.""" - study_locus_path: str = MISSING + credible_set_path: str = MISSING study_index_path: str = MISSING coloc_path: str = MISSING - priorc1: float = 1e-4 - priorc2: float = 1e-4 - priorc12: float = 1e-5 _target_: str = "gentropy.colocalisation.ColocalisationStep" @@ -78,7 +75,7 @@ class GWASCatalogStudyInclusionConfig(StepConfig): inclusion_list_path: str = MISSING exclusion_list_path: str = MISSING _target_: str = ( - "gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionStep" + "gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator" ) @@ -167,7 +164,11 @@ class LocusToGeneConfig(StepConfig): session: Any = field( default_factory=lambda: { - "extended_spark_conf": {"spark.dynamicAllocation.enabled": "false"} + "extended_spark_conf": { + "spark.dynamicAllocation.enabled": "false", + "spark.driver.memory": "48g", + "spark.executor.memory": "48g", + } } ) run_mode: str = MISSING @@ -177,19 +178,22 @@ class LocusToGeneConfig(StepConfig): variant_gene_path: str = MISSING colocalisation_path: str = MISSING study_index_path: str = MISSING - study_locus_overlap_path: str = MISSING - gold_standard_curation_path: str = MISSING - gene_interactions_path: str = MISSING + gold_standard_curation_path: str | None = None + gene_interactions_path: str | None = None features_list: list[str] = field( default_factory=lambda: [ # average distance of all tagging variants 
to gene TSS "distanceTssMean", - # # minimum distance of all tagging variants to gene TSS + # minimum distance of all tagging variants to gene TSS "distanceTssMinimum", - # # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity + # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity "vepMaximumNeighborhood", - # # maximum vep consequence score of the locus 95% credible set split by gene + # maximum vep consequence score of the locus 95% credible set split by gene "vepMaximum", + # mean vep consequence score of the locus 95% credible set among all genes in the vicinity + "vepMeanNeighborhood", + # mean vep consequence score of the locus 95% credible set split by gene + "vepMean", # max clpp for each (study, locus, gene) aggregating over all eQTLs "eqtlColocClppMaximum", # max clpp for each (study, locus) aggregating over all eQTLs @@ -260,7 +264,7 @@ class VariantIndexConfig(StepConfig): @dataclass -class V2GConfig(StepConfig): +class VariantToGeneConfig(StepConfig): """V2G step configuration.""" variant_index_path: str = MISSING @@ -301,7 +305,7 @@ class WindowBasedClumpingStep(StepConfig): inclusion_list_path: str = MISSING locus_collect_distance: str | None = None - _target_: str = "gentropy.clump.WindowBasedClumpingStep" + _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" @dataclass @@ -352,5 +356,5 @@ def register_config() -> None: cs.store(group="step", name="pics", node=PICSConfig) cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig) cs.store(group="step", name="variant_index", node=VariantIndexConfig) - cs.store(group="step", name="variant_to_gene", node=V2GConfig) + cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig) cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep) diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py index 5a282ce35..401c5d6c6 100644 
--- a/src/gentropy/dataset/dataset.py +++ b/src/gentropy/dataset/dataset.py @@ -18,7 +18,7 @@ @dataclass class Dataset(ABC): - """Open Targets Genetics Dataset. + """Open Targets Gentropy Dataset. `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `schemas` module. """ diff --git a/src/gentropy/dataset/l2g_feature_matrix.py b/src/gentropy/dataset/l2g_feature_matrix.py index c1a02b3b7..fa84499dc 100644 --- a/src/gentropy/dataset/l2g_feature_matrix.py +++ b/src/gentropy/dataset/l2g_feature_matrix.py @@ -76,15 +76,13 @@ def generate_features( raise ValueError("No features found") # raise error if the feature matrix is empty - if fm.limit(1).count() != 0: - return cls( - _df=convert_from_long_to_wide( - fm, ["studyLocusId", "geneId"], "featureName", "featureValue" - ), - _schema=cls.get_schema(), - features_list=features_list, - ) - raise ValueError("L2G Feature matrix is empty") + return cls( + _df=convert_from_long_to_wide( + fm, ["studyLocusId", "geneId"], "featureName", "featureValue" + ), + _schema=cls.get_schema(), + features_list=features_list, + ) @classmethod def get_schema(cls: type[L2GFeatureMatrix]) -> StructType: diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py index ddaa9d741..e24688da3 100644 --- a/src/gentropy/dataset/l2g_prediction.py +++ b/src/gentropy/dataset/l2g_prediction.py @@ -62,21 +62,23 @@ def from_credible_set( Returns: L2GPrediction: L2G dataset """ - gwas_study_locus = StudyLocus( - _df=study_locus.df.join( - study_index.study_type_lut().filter(f.col("studyType") == "gwas"), - on="studyId", - how="inner", - ), - _schema=StudyLocus.get_schema(), - ) fm = L2GFeatureMatrix.generate_features( features_list=features_list, - study_locus=gwas_study_locus, + study_locus=study_locus, study_index=study_index, variant_gene=v2g, colocalisation=coloc, ).fill_na() + + gwas_fm = L2GFeatureMatrix( + _df=( + fm.df.join( + 
study_locus.filter_by_study_type("gwas", study_index).df, + on="studyLocusId", + ) + ), + _schema=cls.get_schema(), + ) return L2GPrediction( # Load and apply fitted model _df=( @@ -84,7 +86,7 @@ def from_credible_set( model_path, features_list=features_list, ) - .predict(fm) + .predict(gwas_fm) # the probability of the positive class is the second element inside the probability array # - this is selected as the L2G probability .select( diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 992ef304d..41c099959 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -228,6 +228,35 @@ def get_schema(cls: type[StudyLocus]) -> StructType: """ return parse_spark_schema("study_locus.json") + def filter_by_study_type( + self: StudyLocus, study_type: str, study_index: StudyIndex + ) -> StudyLocus: + """Creates a new StudyLocus dataset filtered by study type. + + Args: + study_type (str): Study type to filter for. Can be one of `gwas`, `eqtl`, `pqtl`, `sqtl`. + study_index (StudyIndex): Study index to resolve study types. + + Returns: + StudyLocus: Filtered study-locus dataset. + + Raises: + ValueError: If study type is not supported. + """ + if study_type not in ["gwas", "eqtl", "pqtl", "sqtl"]: + raise ValueError( + f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl." 
+ ) + new_df = ( + self.df.join(study_index.study_type_lut(), on="studyId", how="inner") + .filter(f.col("studyType") == study_type) + .drop("studyType") + ) + return StudyLocus( + _df=new_df, + _schema=self._schema, + ) + def filter_credible_set( self: StudyLocus, credible_interval: CredibleInterval, diff --git a/src/gentropy/dataset/study_locus_overlap.py b/src/gentropy/dataset/study_locus_overlap.py index ee1e81b32..5f839bd9c 100644 --- a/src/gentropy/dataset/study_locus_overlap.py +++ b/src/gentropy/dataset/study_locus_overlap.py @@ -21,6 +21,7 @@ class StudyLocusOverlap(Dataset): This dataset captures pairs of overlapping `StudyLocus`: that is associations whose credible sets share at least one tagging variant. !!! note + This is a helpful dataset for other downstream analyses, such as colocalisation. This dataset will contain the overlapping signals between studyLocus associations once they have been clumped and fine-mapped. """ diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index 4bbdaaea0..d00a91596 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -81,13 +81,17 @@ def __init__( "model_path and predictions_path must be set for predict mode." ) predictions = L2GPrediction.from_credible_set( - model_path, features_list, credible_set, studies, v2g, coloc + model_path, list(features_list), credible_set, studies, v2g, coloc ) predictions.df.write.mode(session.write_mode).parquet(predictions_path) session.logger.info(predictions_path) - elif run_mode == "train": + elif ( + run_mode == "train" + and gold_standard_curation_path + and gene_interactions_path + ): # Process gold standard and L2G features - gs_curation = session.spark.read.json(gold_standard_curation_path) + gs_curation = session.spark.read.json(gold_standard_curation_path).persist() interactions = session.spark.read.parquet(gene_interactions_path) study_locus_overlap = StudyLocus( # We just extract overlaps of associations in the gold standard. 
This parsing is a duplication of the one in the gold standard curation, diff --git a/src/gentropy/ld_based_clumping.py b/src/gentropy/ld_based_clumping.py index ea9646806..e6a477a89 100644 --- a/src/gentropy/ld_based_clumping.py +++ b/src/gentropy/ld_based_clumping.py @@ -7,7 +7,7 @@ from gentropy.dataset.study_locus import StudyLocus -class LdBasedClumpingStep: +class LDBasedClumpingStep: """Step to perform LD-based clumping on study locus dataset. As a first step, study locus is enriched with population specific linked-variants. diff --git a/src/gentropy/ld_index.py b/src/gentropy/ld_index.py index dfdd90306..cb260977d 100644 --- a/src/gentropy/ld_index.py +++ b/src/gentropy/ld_index.py @@ -11,6 +11,7 @@ class LDIndexStep: """LD index step. !!! warning "This step is resource intensive" + Suggested params: high memory machine, 5TB of boot disk, no SSDs. """ diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py index b0699b88f..fd56398f0 100644 --- a/src/gentropy/method/colocalisation.py +++ b/src/gentropy/method/colocalisation.py @@ -100,6 +100,7 @@ class Coloc: | H4 | both traits are associated and share the same single causal variant | !!! warning "Bayes factors required" + Coloc requires the availability of Bayes factors (BF) for each variant in the credible set (`logBF` column). """ diff --git a/src/gentropy/method/pics.py b/src/gentropy/method/pics.py index bcf17280a..e5ed5f2c6 100644 --- a/src/gentropy/method/pics.py +++ b/src/gentropy/method/pics.py @@ -185,6 +185,7 @@ def finemap( """Run PICS on a study locus. !!! info "Study locus needs to be LD annotated" + The study locus needs to be LD annotated before PICS can be calculated. 
Args: diff --git a/tests/dataset/test_study_locus.py b/tests/dataset/test_study_locus.py index b01b4f63d..037ede068 100644 --- a/tests/dataset/test_study_locus.py +++ b/tests/dataset/test_study_locus.py @@ -161,6 +161,58 @@ def test_find_overlaps( ) +@pytest.mark.parametrize( + "study_type, expected_sl_count", [("gwas", 1), ("eqtl", 1), ("pqtl", 0)] +) +def test_filter_by_study_type( + spark: SparkSession, study_type: str, expected_sl_count: int +) -> None: + """Test filter by study type.""" + # Input data + sl = StudyLocus( + _df=spark.createDataFrame( + [ + { + # from gwas + "studyLocusId": 1, + "variantId": "lead1", + "studyId": "study1", + }, + { + # from eqtl + "studyLocusId": 2, + "variantId": "lead2", + "studyId": "study2", + }, + ], + StudyLocus.get_schema(), + ), + _schema=StudyLocus.get_schema(), + ) + studies = StudyIndex( + _df=spark.createDataFrame( + [ + { + "studyId": "study1", + "studyType": "gwas", + "traitFromSource": "trait1", + "projectId": "project1", + }, + { + "studyId": "study2", + "studyType": "eqtl", + "traitFromSource": "trait2", + "projectId": "project2", + }, + ] + ), + _schema=StudyIndex.get_schema(), + ) + + observed = sl.filter_by_study_type(study_type, studies) + assert observed.df.count() == expected_sl_count + + def test_filter_credible_set(mock_study_locus: StudyLocus) -> None: """Test credible interval filter.""" assert isinstance( diff --git a/utils/update_GWAS_Catalog_data.sh b/utils/update_GWAS_Catalog_data.sh index 98ac4cc36..1e380d30c 100755 --- a/utils/update_GWAS_Catalog_data.sh +++ b/utils/update_GWAS_Catalog_data.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash - # Function to get the most recent date: get_most_recent(){ cat $1 | perl -lane 'push @a, $_ if $_ =~ /^\d+$/; END {@a = sort { $a <=> $b} @a; print pop @a }' @@ -21,13 +20,47 @@ get_release_info(){ logging(){ log_prompt="[$(date "+%Y.%m.%d %H:%M")]" - echo "${log_prompt} $@" + echo "${log_prompt} $@" >> ${LOG_FILE} +} + +upload_file_to_gcp(){ + FILENAME=${1} + 
TARGET=${2} + # Test if file exists: + if [ ! -f ${FILENAME} ]; then + logging "File ${FILENAME} does not exist." + return + fi + + logging "Copying ${FILENAME} to GCP..." + gsutil -mq cp file://$(pwd)/${FILENAME} ${TARGET} + + # Test if file was successfully uploaded: + if [ $? -ne 0 ]; then + logging "File ${FILENAME} failed to upload." + fi } # Resources: export BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas export RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats -export GCP_TARGET=gs://genetics_etl_python_playground/input/v2d/ +export GCP_TARGET=gs://gwas_catalog_data +export LOG_FILE=gwas_catalog_data_update.log + +export GWAS_CATALOG_STUDY_CURATION_URL=https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv + +ASSOCIATION_FILE=gwas_catalog_associations_ontology_annotated.tsv +PUBLISHED_STUDIES_FILE=gwas_catalog_download_studies.tsv +PUBLISHED_ANCESTRIES_FILE=gwas_catalog_download_ancestries.tsv +UNPUBLISHED_STUDIES_FILE=gwas_catalog_unpublished_studies.tsv +UNPUBLISHED_ANCESTRIES_FILE=gwas_catalog_unpublished_ancestries.tsv +HARMONISED_LIST_FILE=harmonised_list.txt +GWAS_CATALOG_STUDY_CURATION_FILE=gwas_catalog_study_curation.tsv + +# Remove log file if exists: +if [ -f ${LOG_FILE} ]; then + rm -rf ${LOG_FILE} +fi logging "Extracing data from: ${BASE_URL}" logging "Release info fetched fom: ${RELEASE_INFO_URL}" @@ -47,36 +80,49 @@ RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY} logging "Datafiles are fetching from ${RELEASE_URL}" # Fetching files while assigning properly dated and annotated names: -wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv \ - -O gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv -O ${ASSOCIATION_FILE} +logging "File ${ASSOCIATION_FILE} saved." 
-wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt -O ${PUBLISHED_STUDIES_FILE} +logging "File ${PUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv -O ${UNPUBLISHED_STUDIES_FILE} +logging "File ${UNPUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt -O ${PUBLISHED_ANCESTRIES_FILE} +logging "File ${PUBLISHED_ANCESTRIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv -O ${UNPUBLISHED_ANCESTRIES_FILE} +logging "File ${UNPUBLISHED_ANCESTRIES_FILE} saved." +wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O ${HARMONISED_LIST_FILE} +logging "File ${HARMONISED_LIST_FILE} saved." -wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt -logging "File harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt saved." +wget -q ${GWAS_CATALOG_STUDY_CURATION_URL} -O ${GWAS_CATALOG_STUDY_CURATION_FILE} +logging "In-house GWAS Catalog study curation file fetched from GitHub." 
logging "Copying files to GCP..." -gsutil -mq cp file://$(pwd)/gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ - -logging "Done." + +upload_file_to_gcp ${ASSOCIATION_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${HARMONISED_LIST_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${GWAS_CATALOG_STUDY_CURATION_FILE} ${GCP_TARGET}/manifests/ + + +logging "Files successfully uploaded." +logging "Removing local files..." +rm ${ASSOCIATION_FILE} \ + ${PUBLISHED_STUDIES_FILE} \ + ${PUBLISHED_ANCESTRIES_FILE} \ + ${HARMONISED_LIST_FILE} \ + ${UNPUBLISHED_STUDIES_FILE} \ + ${UNPUBLISHED_ANCESTRIES_FILE} \ + ${GWAS_CATALOG_STUDY_CURATION_FILE} + +# Uploading log file to GCP manifest folder: +logging "Uploading log file to GCP manifest folder..." +upload_file_to_gcp ${LOG_FILE} ${GCP_TARGET}/manifests/ +cat $LOG_FILE