-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
244 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
{ | ||
"fields": [ | ||
{ | ||
"metadata": {}, | ||
"name": "uniprotAccession", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "aminoAcidChange", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "inSilicoPredictors", | ||
"nullable": true, | ||
"type": { | ||
"containsNull": true, | ||
"elementType": { | ||
"fields": [ | ||
{ | ||
"metadata": {}, | ||
"name": "method", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "assessment", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "score", | ||
"nullable": true, | ||
"type": "float" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "assessmentFlag", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "targetId", | ||
"nullable": true, | ||
"type": "string" | ||
}, | ||
{ | ||
"metadata": {}, | ||
"name": "normalisedScore", | ||
"nullable": true, | ||
"type": "double" | ||
} | ||
], | ||
"type": "struct" | ||
}, | ||
"type": "array" | ||
} | ||
} | ||
], | ||
"type": "struct" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
"""Dataset representing consequence of amino-acid changes in protein.""" | ||
|
||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
|
||
from gentropy.common.schemas import parse_spark_schema | ||
from gentropy.dataset.dataset import Dataset | ||
|
||
if TYPE_CHECKING: | ||
from pyspark.sql.types import StructType | ||
|
||
|
||
@dataclass
class AminoAcidVariants(Dataset):
    """Dataset describing the consequences of amino-acid changes in proteins."""

    @classmethod
    def get_schema(cls: type[AminoAcidVariants]) -> StructType:
        """Return the Spark schema of the AminoAcidVariants dataset.

        The schema is obtained by handing the `amino_acid_variants.json`
        file name to `parse_spark_schema`.

        Returns:
            StructType: Schema for the AminoAcidVariants dataset
        """
        schema_file = "amino_acid_variants.json"
        return parse_spark_schema(schema_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
src/gentropy/datasource/open_targets/foldex_integration.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
"""Parser integrating FoldX data from OpenTargets project OTAR2081.""" | ||
|
||
from __future__ import annotations | ||
|
||
import pyspark.sql.functions as f | ||
from pyspark.sql import Column, DataFrame | ||
from pyspark.sql import types as t | ||
|
||
from gentropy.common.spark_helpers import enforce_schema | ||
from gentropy.dataset.amino_acid_variants import AminoAcidVariants | ||
|
||
|
||
class OpenTargetsFoldX:
    """Parser for the FoldX dataset generated by the OTAR2081 project."""

    # Element type of the `inSilicoPredictors` array field, looked up in the
    # AminoAcidVariants schema; passed to `enforce_schema` below.
    INSILICO_SCHEMA = AminoAcidVariants.get_schema()[
        "inSilicoPredictors"
    ].dataType.elementType

    @staticmethod
    @enforce_schema(INSILICO_SCHEMA)
    def get_foldx_prediction(score_column: Column) -> Column:
        """Build an inSilicoPredictor struct from a ddG score column.

        The struct built here carries only two fields: a constant `method`
        ("foldX") and the `score` cast to float. The function is wrapped by
        `enforce_schema(INSILICO_SCHEMA)`.

        Args:
            score_column (Column): ddG column from the FoldX dataset.

        Returns:
            Column: struct with the shape of the in silico predictors.
        """
        method = f.lit("foldX").alias("method")
        score = score_column.cast(t.FloatType()).alias("score")
        return f.struct(method, score)

    @classmethod
    def ingest_foldx_data(
        cls: type[OpenTargetsFoldX], foldx_input: DataFrame, plddt_threshold: float
    ) -> AminoAcidVariants:
        """Ingest the FoldX dataset and convert it into an AminoAcidVariants object.

        Rows with `plddt` at or below the threshold are dropped, as are all
        rows of the proteins returned by `_uniprot_ids_to_exclude`. The
        remaining rows are reshaped and grouped so that each
        (uniprotAccession, aminoAcidChange) pair appears once.

        Args:
            foldx_input (DataFrame): Input dataframe provided by the FoldX project.
            plddt_threshold (float): lower threshold for filtering confident residues from structural models.

        Returns:
            AminoAcidVariants: one row per amino-acid change, with the FoldX predictions collected into `inSilicoPredictors`.
        """
        # The exclusion list is derived from the raw input, before any filtering.
        ambiguous_proteins = cls._uniprot_ids_to_exclude(foldx_input)

        confident_rows = foldx_input.filter(f.col("plddt") > plddt_threshold).join(
            ambiguous_proteins, on="protein_acc", how="left_anti"
        )

        # Amino-acid change label: wild type + position + mutated type.
        amino_acid_change = f.concat(
            f.col("wild_type"), f.col("position"), f.col("mutated_type")
        )
        per_prediction = confident_rows.select(
            f.col("protein_acc").alias("uniprotAccession"),
            amino_acid_change.alias("aminoAcidChange"),
            cls.get_foldx_prediction(f.col("foldx_ddg")).alias("inSilicoPredictor"),
        )

        # Collapse all predictors for a single array object to avoid variant explosions:
        per_variant = per_prediction.groupBy(
            "uniprotAccession", "aminoAcidChange"
        ).agg(f.collect_set(f.col("inSilicoPredictor")).alias("inSilicoPredictors"))

        return AminoAcidVariants(
            _df=per_variant,
            _schema=AminoAcidVariants.get_schema(),
        )

    @staticmethod
    def _uniprot_ids_to_exclude(foldx_input: DataFrame) -> DataFrame:
        """Compute the distinct set of UniProt ids to drop from the input dataset.

        Exclude UniProt ids where one position in the structure corresponds to
        multiple positions in the original sequence; such cases are impossible
        to disambiguate. They are detected as (protein_acc, position,
        wild_type) groups that carry more than one distinct `plddt` value.

        Args:
            foldx_input (DataFrame): raw dataset.

        Returns:
            DataFrame: one column (`protein_acc`) with the distinct uniprot ids.
        """
        plddts_per_residue = foldx_input.groupby(
            "protein_acc", "position", "wild_type"
        ).agg(f.collect_set("plddt").alias("plddts"))

        ambiguous_residues = plddts_per_residue.filter(f.size("plddts") > 1)

        return ambiguous_residues.select("protein_acc").distinct()