@staticmethod
def credible_set_qc(
    cred_sets: StudyLocus,
    study_index: StudyIndex,
    ld_index: LDIndex,
    p_value_threshold: float = 1e-5,
    purity_min_r2: float = 0.01,
) -> StudyLocus:
    """Filter credible sets by lead P-value and min-R2 purity, then perform LD clumping.

    A credible set is kept when its p-value (rebuilt from `pValueMantissa` and
    `pValueExponent`) is at most `p_value_threshold` and its `purityMinR2` is at
    least `purity_min_r2`. The remaining sets are LD-annotated and clumped, and
    every set whose `qualityControls` array contains the clumping flag is removed.

    Args:
        cred_sets (StudyLocus): StudyLocus object with credible sets to filter/clump
        study_index (StudyIndex): StudyIndex object, passed to `annotate_ld`
        ld_index (LDIndex): LDIndex object, passed to `annotate_ld`
        p_value_threshold (float): maximum p-value for a credible set to be kept, default is 1e-5
        purity_min_r2 (float): minimum min-R2 purity for a credible set to be kept, default is 0.01

    Returns:
        StudyLocus: Credible sets which pass filters and LD clumping.
    """
    # Express the p-value as a column expression so that no temporary column
    # has to be added to (and later dropped from) the dataframe.
    p_value = f.col("pValueMantissa") * f.pow(10, f.col("pValueExponent"))
    passes_filters = (p_value <= p_value_threshold) & (
        f.col("purityMinR2") >= purity_min_r2
    )
    clumped_flag = "Explained by a more significant variant in high LD (clumped)"

    # Filter through the dataset API instead of assigning to `cred_sets.df`, so
    # the caller's object is left as it was passed in.
    # NOTE(review): relies on `StudyLocus.filter` returning a new dataset rather
    # than mutating in place - confirm against the Dataset implementation.
    return (
        cred_sets.filter(passes_filters)
        .annotate_ld(study_index, ld_index)
        .clump()
        .filter(~f.array_contains(f.col("qualityControls"), clumped_flag))
    )