diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json index 674788407..14aae6b84 100644 --- a/src/gentropy/assets/schemas/vep_json_output.json +++ b/src/gentropy/assets/schemas/vep_json_output.json @@ -20,6 +20,12 @@ "containsNull": true, "elementType": { "fields": [ + { + "metadata": {}, + "name": "conservation", + "nullable": true, + "type": "double" + }, { "metadata": {}, "name": "hgvsg", @@ -294,6 +300,12 @@ "containsNull": true, "elementType": { "fields": [ + { + "metadata": {}, + "name": "conservation", + "nullable": true, + "type": "double" + }, { "metadata": {}, "name": "alphamissense", diff --git a/src/gentropy/dataset/variant_index.py b/src/gentropy/dataset/variant_index.py index 4092ca961..649b9f3ff 100644 --- a/src/gentropy/dataset/variant_index.py +++ b/src/gentropy/dataset/variant_index.py @@ -368,6 +368,7 @@ def resolve_predictor_methods( # The following predictors are not normalised: .when(method == "SpliceAI", score) .when(method == "VEP", score) + .when(method == "GERP", cls._normalise_gerp(score)) ) @staticmethod @@ -420,6 +421,38 @@ def _normalise_cadd( .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1)) ) + @classmethod + def _normalise_gerp( + cls: type[InSilicoPredictorNormaliser], + score: Column, + ) -> Column: + """Normalise GERP scores. + + # Score interpretation from here: + # https://pmc.ncbi.nlm.nih.gov/articles/PMC7286533/ + # https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg19&g=allHg19RS_BW + + Logic: GERP scores are divided into three categories: + - >6 : 1.0 - GERP scores are not bounded, so any value above 6 is considered as 1.0 + - 2-6: 0.5-1 - Highly conserved regions are scaled between 0.5 and 1 + - 0-2: 0-0.5 - Moderately conserved regions are scaled between 0 and 0.5 + - -3-0: -1-0.0 - Negative conservation indicates benign sequence alteration, so scaled between -1 and 0 + - < -3: -1.0 - As the score goes below -3, it is considered as -1.0 + + Args: + score (Column): GERP score. + + Returns: + Column: Normalised GERP score. + """ + return ( + f.when(score > 6, f.lit(1.0)) + .when(score >= 2, cls._rescaleColumnValue(score, 2, 6, 0.5, 1)) + .when(score >= 0, cls._rescaleColumnValue(score, 0, 2, 0, 0.5)) + .when(score >= -3, cls._rescaleColumnValue(score, -3, 0, -1, 0)) + .when(score < -3, f.lit(-1.0)) + ) + @classmethod def _normalise_loftee( cls: type[InSilicoPredictorNormaliser], diff --git a/src/gentropy/datasource/ensembl/vep_parser.py b/src/gentropy/datasource/ensembl/vep_parser.py index 9494bf6f3..b0a2e50a2 100644 --- a/src/gentropy/datasource/ensembl/vep_parser.py +++ b/src/gentropy/datasource/ensembl/vep_parser.py @@ -668,6 +668,12 @@ def process_vep_output( assessment_column_name="lof", assessment_flag_column_name="lof_filter", ), + # Extract GERP conservation score: + cls._vep_in_silico_prediction_extractor( + method_name="GERP", + transcript_column_name="transcript_consequences", + score_column_name="conservation", + ), # Extract max alpha missense: cls._get_max_alpha_missense( f.col("transcript_consequences") @@ -686,6 +692,12 @@ def process_vep_output( method_name="CADD", score_column_name="cadd_phred", ), + # Extract GERP conservation score: + cls._vep_in_silico_prediction_extractor( + method_name="GERP", + transcript_column_name="intergenic_consequences", + score_column_name="conservation", + ), # Extract VEP prediction: cls._get_vep_prediction(f.col("most_severe_consequence")), )