Skip to content

Commit

Permalink
feat: adding GERP conservation score to variant annotation (#933)
Browse files Browse the repository at this point in the history
* feat: adding GERP conservation score to variant annotation

* fix: typo

---------

Co-authored-by: Szymon Szyszkowski <[email protected]>
  • Loading branch information
DSuveges and project-defiant authored Nov 27, 2024
1 parent 8595e5b commit 19219dd
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/gentropy/assets/schemas/vep_json_output.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "conservation",
"nullable": true,
"type": "double"
},
{
"metadata": {},
"name": "hgvsg",
Expand Down Expand Up @@ -294,6 +300,12 @@
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "conservation",
"nullable": true,
"type": "double"
},
{
"metadata": {},
"name": "alphamissense",
Expand Down
33 changes: 33 additions & 0 deletions src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ def resolve_predictor_methods(
# SpliceAI and VEP scores are used as-is; GERP is rescaled by its own normaliser:
.when(method == "SpliceAI", score)
.when(method == "VEP", score)
.when(method == "GERP", cls._normalise_gerp(score))
)

@staticmethod
Expand Down Expand Up @@ -420,6 +421,38 @@ def _normalise_cadd(
.when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1))
)

@classmethod
def _normalise_gerp(
    cls: type[InSilicoPredictorNormaliser],
    score: Column,
) -> Column:
    """Normalise GERP conservation scores onto the -1 to 1 scale.

    Score interpretation is based on:
        - https://pmc.ncbi.nlm.nih.gov/articles/PMC7286533/
        - https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg19&g=allHg19RS_BW

    GERP scores are not bounded, so the raw value is split into five bands:
        - score > 6: fixed at 1.0
        - 2 <= score <= 6: highly conserved, rescaled from 2..6 to 0.5..1
        - 0 <= score < 2: moderately conserved, rescaled from 0..2 to 0..0.5
        - -3 <= score < 0: negative conservation indicates a benign sequence
          alteration, rescaled from -3..0 to -1..0
        - score < -3: fixed at -1.0

    The bands are tested in the order listed and no fallback branch is
    defined, so a score matching none of the conditions (e.g. a null score)
    is left as null.

    Args:
        score (Column): GERP score.

    Returns:
        Column: Normalised GERP score.
    """
    # Outer thresholds: anything beyond them is clamped to the extremes.
    exceeds_upper_limit = score > 6
    under_lower_limit = score < -3

    # Piecewise rescaling for the three bounded bands.
    highly_conserved = cls._rescaleColumnValue(score, 2, 6, 0.5, 1)
    moderately_conserved = cls._rescaleColumnValue(score, 0, 2, 0, 0.5)
    negatively_conserved = cls._rescaleColumnValue(score, -3, 0, -1, 0)

    # Branch order matters: each lower bound is only reached when every
    # higher band has already been ruled out.
    normalised = f.when(exceeds_upper_limit, f.lit(1.0))
    normalised = normalised.when(score >= 2, highly_conserved)
    normalised = normalised.when(score >= 0, moderately_conserved)
    normalised = normalised.when(score >= -3, negatively_conserved)
    normalised = normalised.when(under_lower_limit, f.lit(-1.0))
    return normalised

@classmethod
def _normalise_loftee(
cls: type[InSilicoPredictorNormaliser],
Expand Down
12 changes: 12 additions & 0 deletions src/gentropy/datasource/ensembl/vep_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,12 @@ def process_vep_output(
assessment_column_name="lof",
assessment_flag_column_name="lof_filter",
),
# Extract GERP conservation score:
cls._vep_in_silico_prediction_extractor(
method_name="GERP",
transcript_column_name="transcript_consequences",
score_column_name="conservation",
),
# Extract max alpha missense:
cls._get_max_alpha_missense(
f.col("transcript_consequences")
Expand All @@ -686,6 +692,12 @@ def process_vep_output(
method_name="CADD",
score_column_name="cadd_phred",
),
# Extract GERP conservation score:
cls._vep_in_silico_prediction_extractor(
method_name="GERP",
transcript_column_name="intergenic_consequences",
score_column_name="conservation",
),
# Extract VEP prediction:
cls._get_vep_prediction(f.col("most_severe_consequence")),
)
Expand Down

0 comments on commit 19219dd

Please sign in to comment.