diff --git a/.gitignore b/.gitignore index 07c8bef82..9aef6d065 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ src/airflow/logs/* !src/airflow/logs/.gitkeep site/ .env +.coverage* diff --git a/Makefile b/Makefile index 3057f95ce..72107b5e4 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ setup-dev: ## Setup development environment check: ## Lint and format code @echo "Linting API..." - @poetry run ruff src/gentropy . + @poetry run ruff check src/gentropy . @echo "Linting docstrings..." @poetry run pydoclint --config=pyproject.toml src @poetry run pydoclint --config=pyproject.toml --skip-checking-short-docstrings=true tests diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 566e2ec50..d13ad5f5c 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -8,6 +8,7 @@ static_assets: gs://genetics_etl_python_playground/static_assets outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version} ## Datasets: +# GWAS gwas_catalog_dataset: gs://gwas_catalog_data # Ingestion input files: gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv @@ -29,7 +30,18 @@ gwas_catalog_study_index: ${datasets.gwas_catalog_dataset}/study_index gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets -# Input datasets +# GnomAD +gnomad_public_bucket: gs://gcp-public-data--gnomad/release +# LD generation +# Templates require placeholders {POP} to expand template to match multiple populationwise paths +ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm +ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht +liftover_ht_path: 
${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht +# variant_annotation +gnomad_genomes_path: ${datasets.gnomad_public_bucket}/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/ + +# Others +chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain vep_consequences: ${datasets.static_assets}/vep_consequences.tsv anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed diff --git a/config/step/ot_ld_index.yaml b/config/step/ot_ld_index.yaml index 70dc6b5ee..e72711618 100644 --- a/config/step/ot_ld_index.yaml +++ b/config/step/ot_ld_index.yaml @@ -2,3 +2,19 @@ defaults: - ld_index ld_index_out: ${datasets.ld_index} +ld_matrix_template: ${datasets.ld_matrix_template} +ld_index_raw_template: ${datasets.ld_index_raw_template} +grch37_to_grch38_chain_path: ${datasets.chain_37_38} +liftover_ht_path: ${datasets.liftover_ht_path} +ld_populations: + - afr # African-American + - amr # American Admixed/Latino + - asj # Ashkenazi Jewish + - eas # East Asian + - est # Estonian + - fin # Finnish + - nfe # Non-Finnish European + - nwe # Northwestern European + - seu # Southeastern European +# The gnomAD version will be inferred from ld_matrix_template and appended to ld_index_out. 
+use_version_from_input: true diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml index 5da76b398..55a9503ce 100644 --- a/config/step/ot_variant_annotation.yaml +++ b/config/step/ot_variant_annotation.yaml @@ -2,3 +2,18 @@ defaults: - variant_annotation variant_annotation_path: ${datasets.variant_annotation} +gnomad_genomes_path: ${datasets.gnomad_genomes_path} +chain_38_37: ${datasets.chain_38_37} +gnomad_variant_populations: + - afr # African-American + - amr # American Admixed/Latino + - ami # Amish ancestry + - asj # Ashkenazi Jewish + - eas # East Asian + - fin # Finnish + - nfe # Non-Finnish European + - mid # Middle Eastern + - sas # South Asian + - remaining # Other +# The gnomAD version will be inferred from gnomad_genomes_path and appended to variant_annotation_path. +use_version_from_input: true diff --git a/docs/python_api/_python_api.md b/docs/python_api/_python_api.md index 9952aa56f..0e1b51480 100644 --- a/docs/python_api/_python_api.md +++ b/docs/python_api/_python_api.md @@ -10,3 +10,4 @@ The overall architecture of the package distinguishes between: - [**Datasets**](datasets/_datasets.md): data model - [**Methods**](methods/_methods.md): statistical analysis tools - [**Steps**](steps/_steps.md): pipeline steps +- [**Common**](common/_common.md): Common classes diff --git a/docs/python_api/common/_common.md b/docs/python_api/common/_common.md new file mode 100644 index 000000000..a8abe0f84 --- /dev/null +++ b/docs/python_api/common/_common.md @@ -0,0 +1,8 @@ +--- +title: Common +--- + +Common utilities used in the gentropy package. 
+ +- [**Version Engine**](version_engine.md): class to extract version from datasource input paths +- [**Types**](types.md): Literal types used in the gentropy diff --git a/docs/python_api/common/types.md b/docs/python_api/common/types.md new file mode 100644 index 000000000..ffb460a3a --- /dev/null +++ b/docs/python_api/common/types.md @@ -0,0 +1,8 @@ +--- +title: Literal Types +--- + +:::gentropy.common.types +:::gentropy.common.types.LD_Population +:::gentropy.common.types.VariantPopulation +:::gentropy.common.types.DataSourceType diff --git a/docs/python_api/common/version_engine.md b/docs/python_api/common/version_engine.md new file mode 100644 index 000000000..28d9b4b2e --- /dev/null +++ b/docs/python_api/common/version_engine.md @@ -0,0 +1,12 @@ +--- +title: VersionEngine +--- + +**VersionEngine**: + +Version engine allows for registering datasource specific version seeker class to retrieve datasource version used as input to gentropy steps. Currently implemented only for GnomAD datasource. + +This class can be then used to produce automation over output directory versioning. 
class VersionEngine:
    """Infer the version of a data source from one of its input paths.

    The engine looks up the version seeker registered for the data source given
    at construction time and delegates the actual parsing to it. Only the
    "gnomad" data source is registered in `version_seekers`.
    """

    def __init__(self, datasource: DataSourceType) -> None:
        """Initialize VersionEngine.

        Args:
            datasource (DataSourceType): datasource to seek the version from
        """
        self.datasource = datasource

    @staticmethod
    def version_seekers() -> dict[DataSourceType, DatasourceVersionSeeker]:
        """List version seekers.

        Returns:
            dict[DataSourceType, DatasourceVersionSeeker]: mapping from data source name to its version seeker.
        """
        return {
            "gnomad": GnomADVersionSeeker(),
        }

    def seek(self, text: str | Path) -> str:
        """Infer the version from text by using the version seeker registered for the data source.

        A ValueError raised by the lookup of the seeker (unregistered data source)
        or by the seeker itself (no version in the text) is propagated unchanged.

        Args:
            text (str | Path): text to seek version from

        Returns:
            str: inferred version

        Raises:
            TypeError: if text is neither a string nor a Path

        Examples:
            >>> VersionEngine("gnomad").seek("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz")
            '2.1.1'
        """
        # The input type is validated before the data source, so a bad input
        # reports TypeError even when the data source is also unknown.
        if not isinstance(text, (str, Path)):
            msg = f"Can not find version in {text}"
            raise TypeError(msg)
        infer_method = self._get_version_seek_method()
        return infer_method(str(text))

    def _get_version_seek_method(self) -> Callable[[str], str]:
        """Get the version seeking method registered for the datasource.

        Returns:
            Callable[[str], str]: Method to seek version based on the initialized datasource

        Raises:
            ValueError: if datasource is not registered in the list of version seekers
        """
        # Build the registry once instead of once for the check and once for the lookup.
        seekers = self.version_seekers()
        if self.datasource not in seekers:
            raise ValueError(f"Invalid datasource {self.datasource}")
        return seekers[self.datasource].seek_version

    def amend_version(
        self, analysis_input_path: str | Path, analysis_output_path: str | Path
    ) -> str:
        """Amend version to the analysis output path if it is not already present.

        Paths can be strings (e.g. gs:// URIs) or Path objects, absolute or relative.
        The analysis_input_path has to contain the version number.
        If the version inferred from the input already occurs anywhere in
        analysis_output_path (plain substring check), the output path is returned as is.

        Args:
            analysis_input_path (str | Path): step input path
            analysis_output_path (str | Path): step output path

        Returns:
            str: Path with the amended version, does not return Path object!

        Examples:
            >>> VersionEngine("gnomad").amend_version("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz", "/some/path/without/version")
            '/some/path/without/version/2.1.1'
        """
        version = self.seek(analysis_input_path)
        output_path = str(analysis_output_path)
        if version in output_path:
            return output_path
        # Avoid a doubled separator when the output path already ends with "/".
        separator = "" if output_path.endswith("/") else "/"
        return f"{output_path}{separator}{version}"


class DatasourceVersionSeeker(ABC):
    """Interface for datasource version seeker."""

    @staticmethod
    @abstractmethod
    def seek_version(text: str) -> str:
        """Seek version from text. Implement this method in the subclass.

        Args:
            text (str): text to seek version from

        Returns:
            str: seeked version

        Raises:
            ValueError: if version can not be seeked

        """
        raise NotImplementedError


class GnomADVersionSeeker(DatasourceVersionSeeker):
    """Seek version from GnomAD datasource."""

    # Two or three dot-separated numeric components with an optional "v" prefix.
    # The third component is captured only together with its leading dot, so a
    # version followed by ".<non-digit>" (e.g. "v4.0.sites") yields "4.0", not "4.0.".
    _VERSION_PATTERN = re.compile(r"v?(\d+\.\d+(?:\.\d+)?)")

    @staticmethod
    def seek_version(text: str) -> str:
        """Seek GnomAD version from provided text by using regex.

        The first occurrence of two or three dot-separated numbers is returned.
        Historically gnomAD version numbers have been in the format
        2.1.1, 3.1, etc. as of 2024-05. GnomAD versions can be found under
        `"gs://gcp-public-data--gnomad/release/*/*/*"`

        Args:
            text (str): text to seek version from

        Raises:
            ValueError: if version can not be seeked

        Returns:
            str: seeked version

        Examples:
            >>> GnomADVersionSeeker.seek_version("gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz")
            '2.1.1'
            >>> GnomADVersionSeeker.seek_version("gnomad.genomes.v4.0.sites.ht")
            '4.0'
        """
        result = GnomADVersionSeeker._VERSION_PATTERN.search(text)
        if result is None:
            raise ValueError(f"No GnomAD version found in provided text: {text}")
        return result.group(1)
VariantAnnotationConfig(StepConfig): } ) variant_annotation_path: str = MISSING + gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/" + chain_38_37: str = "gs://hail-common/references/grch38_to_grch37.over.chain.gz" + gnomad_variant_populations: list[str] = field( + default_factory=lambda: [ + "afr", # African-American + "amr", # American Admixed/Latino + "ami", # Amish ancestry + "asj", # Ashkenazi Jewish + "eas", # East Asian + "fin", # Finnish + "nfe", # Non-Finnish European + "mid", # Middle Eastern + "sas", # South Asian + "remaining", # Other + ] + ) + use_version_from_input: bool = False _target_: str = "gentropy.variant_annotation.VariantAnnotationStep" @@ -358,7 +396,6 @@ class FinemapperConfig(StepConfig): imputed_r2_threshold: float = MISSING ld_score_threshold: float = MISSING output_path_log: str = MISSING - _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep" @dataclass diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 471b87cae..a1d211f11 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -3,7 +3,6 @@ from __future__ import annotations import sys -from dataclasses import dataclass, field from functools import reduce from typing import TYPE_CHECKING @@ -14,44 +13,44 @@ from pyspark.sql import Window from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row +from gentropy.common.types import LD_Population from gentropy.common.utils import _liftover_loci +from gentropy.config import LDIndexConfig from gentropy.dataset.ld_index import LDIndex if TYPE_CHECKING: from pyspark.sql import DataFrame, Row -@dataclass class GnomADLDMatrix: - """Toolset ot interact with GnomAD LD dataset (version: r2.1.1). - - Datasets are accessed in Hail's native format, as provided by the [GnomAD consortium](https://gnomad.broadinstitute.org/downloads/#v2-linkage-disequilibrium). 
def __init__(
    self,
    ld_matrix_template: str = LDIndexConfig().ld_matrix_template,
    ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template,
    grch37_to_grch38_chain_path: str = LDIndexConfig().grch37_to_grch38_chain_path,
    ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations,
    liftover_ht_path: str = LDIndexConfig().liftover_ht_path,
) -> None:
    """Initialize.

    Datasets are accessed in Hail's native format, as provided by the [GnomAD consortium](https://gnomad.broadinstitute.org/downloads/#v2-linkage-disequilibrium).

    Args:
        ld_matrix_template (str): Template for the LD matrix path.
        ld_index_raw_template (str): Template for the LD index path.
        grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates.
        ld_populations (list[LD_Population | str]): List of populations to use to build the LDIndex.
        liftover_ht_path (str): Path to the liftover ht file.

    Default values are set in LDIndexConfig.
    """
    self.ld_matrix_template = ld_matrix_template
    self.ld_index_raw_template = ld_index_raw_template
    self.grch37_to_grch38_chain_path = grch37_to_grch38_chain_path
    # The default list is built once, when this function is defined. Store a
    # copy so instances never share (and cannot mutate) the same list object.
    self.ld_populations = list(ld_populations)
    self.liftover_ht_path = liftover_ht_path
def __init__(
    self,
    gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path,
    chain_38_37: str = VariantAnnotationConfig().chain_38_37,
    gnomad_variant_populations: list[
        VariantPopulation | str
    ] = VariantAnnotationConfig().gnomad_variant_populations,
) -> None:
    """Initialize.

    Args:
        gnomad_genomes_path (str): Path to gnomAD genomes hail table.
        chain_38_37 (str): Path to GRCh38 to GRCh37 chain file.
        gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.

    All defaults are stored in VariantAnnotationConfig.
    """
    self.gnomad_genomes_path = gnomad_genomes_path
    self.chain_38_37 = chain_38_37
    # The default list is built once, when this function is defined. Store a
    # copy so instances never share (and cannot mutate) the same list object.
    self.gnomad_variant_populations = list(gnomad_variant_populations)
@@ -54,14 +52,14 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: """ # Load variants dataset ht = hl.read_table( - self.gnomad_genomes, + self.gnomad_genomes_path, _load_refs=False, ) # Liftover grch37 = hl.get_reference("GRCh37") grch38 = hl.get_reference("GRCh38") - grch38.add_liftover(self.chain_hail_38_37, grch37) + grch38.add_liftover(self.chain_38_37, grch37) # Drop non biallelic variants ht = ht.filter(ht.alleles.length() == 2) @@ -88,7 +86,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: rsIds=ht.rsid, alleleType=ht.allele_info.allele_type, alleleFrequencies=hl.set( - [f"{pop}_adj" for pop in self.populations] + [f"{pop}_adj" for pop in self.gnomad_variant_populations] ).map( lambda p: hl.struct( populationName=p, diff --git a/src/gentropy/ld_index.py b/src/gentropy/ld_index.py index cb260977d..3941d5c02 100644 --- a/src/gentropy/ld_index.py +++ b/src/gentropy/ld_index.py @@ -1,9 +1,13 @@ """Step to dump a filtered version of a LD matrix (block matrix) as Parquet files.""" + from __future__ import annotations import hail as hl from gentropy.common.session import Session +from gentropy.common.types import LD_Population +from gentropy.common.version_engine import VersionEngine +from gentropy.config import LDIndexConfig from gentropy.datasource.gnomad.ld import GnomADLDMatrix @@ -13,19 +17,49 @@ class LDIndexStep: !!! warning "This step is resource intensive" Suggested params: high memory machine, 5TB of boot disk, no SSDs. 
+ """ - def __init__(self, session: Session, min_r2: float, ld_index_out: str) -> None: + def __init__( + self, + session: Session, + ld_index_out: str, + min_r2: float = LDIndexConfig().min_r2, + ld_matrix_template: str = LDIndexConfig().ld_matrix_template, + ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template, + ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations, + liftover_ht_path: str = LDIndexConfig().liftover_ht_path, + use_version_from_input: bool = LDIndexConfig().use_version_from_input, + ) -> None: """Run step. Args: session (Session): Session object. + ld_index_out (str): Output LD index path. (required) min_r2 (float): Minimum r2 to consider when considering variants within a window. - ld_index_out (str): Output LD index path. + ld_matrix_template (str): Input path to the gnomAD ld file with placeholder for population + ld_index_raw_template (str): Input path to the raw gnomAD LD indices file with placeholder for population string + ld_populations (list[LD_Population | str]): Population names derived from the ld file paths + liftover_ht_path (str): Path to the liftover ht file + use_version_from_input (bool): Append version derived from input ld_matrix_template to the output ld_index_out. Defaults to False. + + In case use_version_from_input is set to True, + data source version inferred from ld_matrix_temolate is appended as the last path segment to the output path. + Default values are provided in LDIndexConfig. 
""" + if use_version_from_input: + # amend data source version to output path + ld_index_out = VersionEngine("gnomad").amend_version( + ld_matrix_template, ld_index_out + ) hl.init(sc=session.spark.sparkContext, log="/dev/null") ( - GnomADLDMatrix() + GnomADLDMatrix( + ld_matrix_template=ld_matrix_template, + ld_index_raw_template=ld_index_raw_template, + ld_populations=ld_populations, + liftover_ht_path=liftover_ht_path, + ) .as_ld_index(min_r2) .df.write.partitionBy("chromosome") .mode(session.write_mode) diff --git a/src/gentropy/variant_annotation.py b/src/gentropy/variant_annotation.py index ff35c1915..355a4dfea 100644 --- a/src/gentropy/variant_annotation.py +++ b/src/gentropy/variant_annotation.py @@ -1,29 +1,63 @@ """Step to generate variant annotation dataset.""" + from __future__ import annotations import hail as hl from gentropy.common.session import Session +from gentropy.common.types import VariantPopulation +from gentropy.common.version_engine import VersionEngine +from gentropy.config import VariantAnnotationConfig from gentropy.datasource.gnomad.variants import GnomADVariants class VariantAnnotationStep: """Variant annotation step. - Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table. This dataset is used to validate variants and as a source of annotation. + Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomADs `gnomad.genomes.vX.X.X.sites.ht` Hail's table. + This dataset is used to validate variants and as a source of annotation. 
""" - def __init__(self, session: Session, variant_annotation_path: str) -> None: + def __init__( + self, + session: Session, + variant_annotation_path: str, + gnomad_genomes_path: str = VariantAnnotationConfig().gnomad_genomes_path, + gnomad_variant_populations: list[ + VariantPopulation | str + ] = VariantAnnotationConfig().gnomad_variant_populations, + chain_38_37: str = VariantAnnotationConfig().chain_38_37, + use_version_from_input: bool = VariantAnnotationConfig().use_version_from_input, + ) -> None: """Run Variant Annotation step. Args: session (Session): Session object. variant_annotation_path (str): Variant annotation dataset path. + gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`. + gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include. + chain_38_37 (str): Path to GRCh38 to GRCh37 chain file for lifover. + use_version_from_input (bool): Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False. + + In case use_version_from_input is set to True, + data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path. + All defaults are stored in the VariantAnnotationConfig. """ + # amend data source version to output path + if use_version_from_input: + variant_annotation_path = VersionEngine("gnomad").amend_version( + gnomad_genomes_path, variant_annotation_path + ) + # Initialise hail session. hl.init(sc=session.spark.sparkContext, log="/dev/null") # Run variant annotation. - variant_annotation = GnomADVariants().as_variant_annotation() + variant_annotation = GnomADVariants( + gnomad_genomes_path=gnomad_genomes_path, + gnomad_variant_populations=gnomad_variant_populations, + chain_38_37=chain_38_37, + ).as_variant_annotation() + # Write data partitioned by chromosome and position. 
@pytest.mark.parametrize(
    ["text", "version"],
    [
        pytest.param(
            "gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.7.vcf",
            "2.1.1",
            id="GnomAD v2.1.1",
        ),
        pytest.param(
            "/gcp-public-data--gnomad/release/3.0/vcf/genomes/gnomad.genomes.r3.0.sites.chr6.vcf",
            "3.0",
            id="GnomAD v3.0",
        ),
        pytest.param(
            "gs://gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr1.vcf",
            "3.1.1",
            id="GnomAD v3.1.1",
        ),
        pytest.param(
            "gs://gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chrY.vcf",
            "3.1.2",
            id="GnomAD v3.1.2",
        ),
        pytest.param(
            "gsa://gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chrY.vcf",
            "4.0",
            id="GnomAD v4.0",
        ),
        pytest.param(
            "gs://gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf",
            "4.1",
            id="GnomAD v4.1",
        ),
        pytest.param(
            "/some/path/to/the/version/r20.111.44",
            "20.111.44",
            id="Extreme version number",
        ),
    ],
)
def test_extracting_version_with_gnomad_seeker(text: str, version: str) -> None:
    """Test gnomad version extraction with GnomADVersionSeeker."""
    version_seeker = GnomADVersionSeeker().seek_version
    assert version_seeker(text) == version


def test_not_registered_datasource_raises_error() -> None:
    """Test that unknown datasource raises error."""
    with pytest.raises(ValueError) as e:
        VersionEngine("ClinVar").seek("some/path/to/the/version/v20.111.44")  # type: ignore
    # Checked after the context manager: statements following the raising call
    # inside the `with` block would never run.
    assert e.value.args[0].startswith("Invalid datasource ClinVar")


def test_extracting_version_when_no_version_is_found() -> None:
    """Test that a path without a version raises error for a registered datasource."""
    with pytest.raises(ValueError) as e:
        VersionEngine("gnomad").seek("some/path/without/version")
    assert e.value.args[0].startswith(
        "No GnomAD version found in provided text: some/path/without/version"
    )


def test_non_string_path_raises_error() -> None:
    """Test that non-string path raises error."""
    with pytest.raises(TypeError) as e:
        VersionEngine("gnomad").seek(123)  # type: ignore
    assert e.value.args[0].startswith("Can not find version in 123")


@pytest.mark.parametrize(
    ["text", "version"],
    [
        pytest.param(Path("some/file/path/v3.1.1"), "3.1.1", id="Path object"),
        pytest.param("s3://some/file/path/v3.1.1", "3.1.1", id="S3 protocol"),
        pytest.param("gs://some/file/path/v3.1.1", "3.1.1", id="GS protocol"),
    ],
)
def test_extracting_version_with_version_engine(text: str | Path, version: str) -> None:
    """Check that supported input types and file protocols do not raise an error when passed to VersionEngine."""
    assert VersionEngine("gnomad").seek(text) == version


@pytest.mark.parametrize(
    ["input_path", "output_path", "expected_output"],
    [
        pytest.param(
            "input/v20.111.44", "output", "output/20.111.44", id="Append version"
        ),
        pytest.param(
            "input/1.0.0",
            "output/1.0.0",
            "output/1.0.0",
            id="Do not append version, already present",
        ),
        pytest.param(
            Path("input/1.0.0"), Path("output/"), "output/1.0.0", id="Path objects"
        ),
    ],
)
def test_appending_version_to_path(
    input_path: Path | str, output_path: Path | str, expected_output: str
) -> None:
    """Test that the version is amended at the end of the output path."""
    # The comparison must be asserted; a bare `==` expression can never fail the test.
    assert (
        VersionEngine("gnomad").amend_version(input_path, output_path)
        == expected_output
    )