Skip to content

Commit

Permalink
feat: ingest OTAR2082 FoldX data
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Dec 13, 2024
1 parent 2fdf343 commit e262aac
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 0 deletions.
67 changes: 67 additions & 0 deletions src/gentropy/assets/schemas/amino_acid_variants.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{
"fields": [
{
"metadata": {},
"name": "uniprotAccession",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "aminoAcidChange",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "inSilicoPredictors",
"nullable": true,
"type": {
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "method",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "assessment",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "score",
"nullable": true,
"type": "float"
},
{
"metadata": {},
"name": "assessmentFlag",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "targetId",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "normalisedScore",
"nullable": true,
"type": "double"
}
],
"type": "struct"
},
"type": "array"
}
}
],
"type": "struct"
}
26 changes: 26 additions & 0 deletions src/gentropy/dataset/amino_acid_variants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Dataset representing consequence of amino-acid changes in protein."""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from gentropy.common.schemas import parse_spark_schema
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
from pyspark.sql.types import StructType


@dataclass
class AminoAcidVariants(Dataset):
"""Dataset representing consequence of amino-acid changes in protein."""

@classmethod
def get_schema(cls: type[AminoAcidVariants]) -> StructType:
"""Provides the schema for the AminoAcidVariants dataset.
Returns:
StructType: Schema for the AminoAcidVariants dataset
"""
return parse_spark_schema("amino_acid_variants.json")
56 changes: 56 additions & 0 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@

import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import (
get_nested_struct_schema,
rename_all_columns,
safe_array_union,
)
from gentropy.dataset.amino_acid_variants import AminoAcidVariants
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand Down Expand Up @@ -274,6 +276,60 @@ def get_distance_to_gene(
f"max_distance must be less than 500_000. Got {max_distance}."
)

def annotate_with_amino_acid_consequences(
self: VariantIndex, annotation: AminoAcidVariants
) -> VariantIndex:
"""Enriching in silico predictors with amino-acid derived predicted consequences.
Args:
annotation (AminoAcidVariants): amio-acid level variant consequences.
Returns:
VariantIndex: where amino-acid causing variants are enriched with extra annotation
"""
w = Window.partitionBy("variantId").orderBy(f.size("inSilicoPredictors").desc())

return VariantIndex(
_df=self.df
# Extracting variant consequence on Uniprot and amino-acid changes from the transcripts:
.withColumns(
{
"aminoAcidChange": f.filter(
"transcriptConsequences",
lambda vep: vep.aminoAcidChange.isNotNull(),
)[0].aminoAcidChange,
"uniprotAccession": f.explode(
f.filter(
"transcriptConsequences",
lambda vep: vep.aminoAcidChange.isNotNull(),
)[0].uniprotAccessions
),
}
)
# Joining with amino-acid predictions:
.join(
annotation.df.withColumnRenamed("inSilicoPredictors", "annotations"),
on=["uniprotAccession", "aminoAcidChange"],
how="left",
)
# Merge predictors:
.withColumn(
"inSilicoPredictors",
f.when(
f.col("annotations").isNotNull(),
f.array_union("inSilicoPredictors", "annotations"),
).otherwise(f.col("inSilicoPredictors")),
)
# Dropping unused columns:
.drop("uniprotAccession", "aminoAcidChange", "annotations")
# Dropping potentially exploded variant rows:
.distinct()
.withColumn("rank", f.rank().over(w))
.filter(f.col("rank") == 1)
.drop("rank"),
_schema=self.get_schema(),
)

def get_loftee(self: VariantIndex) -> DataFrame:
"""Returns a dataframe with a flag indicating whether a variant is predicted to cause loss of function in a gene. The source of this information is the LOFTEE algorithm (https://github.com/konradjk/loftee).
Expand Down
95 changes: 95 additions & 0 deletions src/gentropy/datasource/open_targets/foldex_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Parser integrating FoldX data from OpenTargets project OTAR2081."""

from __future__ import annotations

import pyspark.sql.functions as f
from pyspark.sql import Column, DataFrame
from pyspark.sql import types as t

from gentropy.common.spark_helpers import enforce_schema
from gentropy.dataset.amino_acid_variants import AminoAcidVariants


class OpenTargetsFoldX:
"""Class to parser FoldX dataset generated by the OTAR2081 project."""

INSILICO_SCHEMA = AminoAcidVariants.get_schema()[
"inSilicoPredictors"
].dataType.elementType

@staticmethod
@enforce_schema(INSILICO_SCHEMA)
def get_foldx_prediction(score_column: Column) -> Column:
"""Generate inSilicoPredictor object from ddG column.
Args:
score_column (Column): ddG column from the FoldX dataset.
Returns:
Column: struct with the right shape of the in silico predictors.
"""
return f.struct(
f.lit("foldX").alias("method"),
score_column.cast(t.FloatType()).alias("score"),
)

@classmethod
def ingest_foldx_data(
cls: type[OpenTargetsFoldX], foldx_input: DataFrame, plddt_threshold: float
) -> AminoAcidVariants:
"""Ingest FoldX dataset and convert into a AminoAcidVariants object.
Args:
foldx_input (DataFrame): Input dataframe provided by the FoldX project.
plddt_threshold (float): lower threshold for filtering confident residues from structural models.
Returns:
AminoAcidVariants: _description_
"""
excluded_identifiers = cls._uniprot_ids_to_exclude(foldx_input)
return AminoAcidVariants(
_df=(
foldx_input.filter(f.col("plddt") > plddt_threshold)
.join(excluded_identifiers, on="protein_acc", how="left_anti")
.select(
f.col("protein_acc").alias("uniprotAccession"),
f.concat(
f.col("wild_type"), f.col("position"), f.col("mutated_type")
).alias("aminoAcidChange"),
cls.get_foldx_prediction(f.col("foldx_ddg")).alias(
"inSilicoPredictor"
),
)
# Collapse all predictors for a single array object to avoid variant explosions:
.groupBy("uniprotAccession", "aminoAcidChange")
.agg(
f.collect_set(f.col("inSilicoPredictor")).alias(
"inSilicoPredictors"
)
)
),
_schema=AminoAcidVariants.get_schema(),
)

@staticmethod
def _uniprot_ids_to_exclude(foldx_input: DataFrame) -> DataFrame:
"""Compute distinct set of UniprotIDs to drop from the input dataset.
Exclude UniprotIds, where one position in the structure corresponds to multiple positions in the original sequence.
Such cases are impossible to disambiguate.
Args:
foldx_input (DataFrame): raw dataset.
Returns:
DataFrame: one column with uniprot ids.
"""
return (
foldx_input.groupby("protein_acc", "position", "wild_type")
.agg(f.collect_set("plddt").alias("plddts"))
.filter(f.size("plddts") > 1)
.select(
"protein_acc",
)
.distinct()
)

0 comments on commit e262aac

Please sign in to comment.