feat: adding GERP conservation score to variant annotation (#933)

* feat: adding GERP conservation score to variant annotation * fix: typo --------- Co-authored-by: Szymon Szyszkowski <[email protected]>
opentargets · Nov 27, 2024 · 19219dd · 19219dd
1 parent 8595e5b
commit 19219dd
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 0 deletions.
diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json
@@ -20,6 +20,12 @@
         "containsNull": true,
         "elementType": {
           "fields": [
+            {
+              "metadata": {},
+              "name": "conservation",
+              "nullable": true,
+              "type": "double"
+            },
             {
               "metadata": {},
               "name": "hgvsg",
@@ -294,6 +300,12 @@
         "containsNull": true,
         "elementType": {
           "fields": [
+            {
+              "metadata": {},
+              "name": "conservation",
+              "nullable": true,
+              "type": "double"
+            },
             {
               "metadata": {},
               "name": "alphamissense",

diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py
@@ -368,6 +368,7 @@ def resolve_predictor_methods(
             # The following predictors are not normalised:
             .when(method == "SpliceAI", score)
             .when(method == "VEP", score)
+            .when(method == "GERP", cls._normalise_gerp(score))
         )
 
     @staticmethod
@@ -420,6 +421,38 @@ def _normalise_cadd(
             .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1))
         )
 
+    @classmethod
+    def _normalise_gerp(
+        cls: type[InSilicoPredictorNormaliser],
+        score: Column,
+    ) -> Column:
+        """Normalise GERP scores.
+
+        # Score interpretation from here:
+        # https://pmc.ncbi.nlm.nih.gov/articles/PMC7286533/
+        # https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg19&g=allHg19RS_BW
+
+        Logic: GERP scores are divided into three categories:
+         - >6 : 1.0 - GERP scores are not bounded, so any value above 6 is considered as 1.0
+         - 2-6: 0.5-1 - Highly conserved regions are scaled between 0.5 and 1
+         - 0-2: 0-0.5 - Moderately conserved regions are scaled between 0 and 0.5
+         - -3-0: -1-0.0 - Negative conservation indicates benign sequence alteration, so scaled between -1 and 0
+         - < -3: -1.0 - As the score goes below -3, it is considered as -1.0
+
+        Args:
+            score (Column): GERP score.
+
+        Returns:
+            Column: Normalised GERP score.
+        """
+        return (
+            f.when(score > 6, f.lit(1.0))
+            .when(score >= 2, cls._rescaleColumnValue(score, 2, 6, 0.5, 1))
+            .when(score >= 0, cls._rescaleColumnValue(score, 0, 2, 0, 0.5))
+            .when(score >= -3, cls._rescaleColumnValue(score, -3, 0, -1, 0))
+            .when(score < -3, f.lit(-1.0))
+        )
+
     @classmethod
     def _normalise_loftee(
         cls: type[InSilicoPredictorNormaliser],

diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py
@@ -668,6 +668,12 @@ def process_vep_output(
                                 assessment_column_name="lof",
                                 assessment_flag_column_name="lof_filter",
                             ),
+                            # Extract GERP conservation score:
+                            cls._vep_in_silico_prediction_extractor(
+                                method_name="GERP",
+                                transcript_column_name="transcript_consequences",
+                                score_column_name="conservation",
+                            ),
                             # Extract max alpha missense:
                             cls._get_max_alpha_missense(
                                 f.col("transcript_consequences")
@@ -686,6 +692,12 @@ def process_vep_output(
                             method_name="CADD",
                             score_column_name="cadd_phred",
                         ),
+                        # Extract GERP conservation score:
+                        cls._vep_in_silico_prediction_extractor(
+                            method_name="GERP",
+                            transcript_column_name="intergenic_consequences",
+                            score_column_name="conservation",
+                        ),
                         # Extract VEP prediction:
                         cls._get_vep_prediction(f.col("most_severe_consequence")),
                     )