Skip to content

Commit

Permalink
feat: adding foldx ingestion step
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Dec 16, 2024
1 parent bef7252 commit 19fed4b
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,17 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig):
)


@dataclass
class FoldXVariantAnnotationConfig(StepConfig):
    """Configuration of the step ingesting FoldX amino acid variation data.

    Both paths are declared as ``MISSING`` and therefore carry no usable
    default; only the pLDDT threshold has a concrete default value.
    """

    # Location of the input FoldX dataset.
    foldx_dataset_path: str = MISSING
    # pLDDT cut-off handed over to the ingestion step.
    plddt_threshold: float = 0.7
    # Location where the resulting annotation dataset is written.
    annotation_path: str = MISSING

    # Dotted path of the step class this configuration is bound to.
    _target_: str = "gentropy.foldx_ingestion.FoldXIngestionStep"


@dataclass
class EqtlCatalogueConfig(StepConfig):
"""eQTL Catalogue step configuration."""
Expand Down Expand Up @@ -767,3 +778,4 @@ def register_config() -> None:
)
cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
cs.store(group="step", name="foldx_integration", node=FoldXVariantAnnotationConfig)
40 changes: 40 additions & 0 deletions src/gentropy/foldx_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Ingest FoldX data from OTAR2081 project."""

from __future__ import annotations

from typing import TYPE_CHECKING

from gentropy.common.session import Session
from gentropy.datasource.open_targets.foldex_integration import OpenTargetsFoldX

if TYPE_CHECKING:
from gentropy.dataset.amino_acid_variants import AminoAcidVariants


class FoldXIngestionStep:
    """Ingestion step for the proteome-wide FoldX dataset of the OTAR2081 project.

    The whole extract / transform / load sequence runs when the step is
    instantiated; the instance itself keeps no state afterwards.
    """

    def __init__(
        self,
        session: Session,
        foldx_dataset_path: str,
        plddt_threshold: float,
        annotation_path: str,
    ) -> None:
        """Read the FoldX dataset, convert it and write the result as parquet.

        Args:
            session (Session): Session object.
            foldx_dataset_path (str): Path to the FoldX dataset (read as parquet).
            plddt_threshold (float): pLDDT threshold to filter amino acids in the structural model.
            annotation_path (str): Path of the output dataset (written as parquet).
        """
        # Extract
        raw_foldx = session.spark.read.parquet(foldx_dataset_path)

        # Transform
        amino_acid_variants: AminoAcidVariants = OpenTargetsFoldX.ingest_foldx_data(
            raw_foldx, plddt_threshold
        )

        # Load: partition count and write mode both come from the session.
        writer = amino_acid_variants.df.coalesce(session.output_partitions).write
        writer.mode(session.write_mode).parquet(annotation_path)

0 comments on commit 19fed4b

Please sign in to comment.