Skip to content

Commit

Permalink
Merge pull request #48 from msk-access/feature/remove_variant_by_annot
Browse files Browse the repository at this point in the history
Feature/remove variant by annot
  • Loading branch information
buehlere authored Sep 27, 2024
2 parents 4d5f9f1 + 8cd69ee commit 5e7f718
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 52 deletions.
23 changes: 18 additions & 5 deletions postprocessing_variant_calls/maf/filter/filter_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def _create_fillout_summary(df_fillout, alt_thres, mutation_key):
for col in df_fillout.columns
if not col.endswith(("_simplex_duplex", "_duplex", "_simplex"))
]

duplex_columns = [
col
for col in df_fillout.columns
Expand All @@ -237,6 +238,7 @@ def _create_fillout_summary(df_fillout, alt_thres, mutation_key):
simplex_duplex_columns = base_columns + [
col for col in df_fillout.columns if col.endswith(("_simplex_duplex"))
]

simplex_duplex_df = df_fillout[simplex_duplex_columns]
simplex_duplex_df["fillout_type"] = f"{fillout_type}SIMPLEX_DUPLEX_"
new_fillout_type_SD = f"{fillout_type}SIMPLEX_DUPLEX_"
Expand Down Expand Up @@ -336,6 +338,10 @@ def _extract_tn_genotypes(
"t_alt_count_fragment_simplex",
"t_ref_count_fragment_simplex",
"t_vaf_fragment_simplex",
"is_exonic_variant",
"is_MET_variant",
"is_TERT_variant",
"hotspot_whitelist",
]
]

Expand Down Expand Up @@ -367,6 +373,10 @@ def _extract_tn_genotypes(
"t_alt_count_fragment_standard",
"t_ref_count_fragment_standard",
"t_vaf_fragment_standard",
"is_exonic_variant",
"is_MET_variant",
"is_TERT_variant",
"hotspot_whitelist",
]
]
df_n_genotype.insert(0, "Matched_Norm_Sample_Barcode", str(normal_samplename))
Expand Down Expand Up @@ -493,6 +503,10 @@ def make_pre_filtered_maf(
axis=1,
)

# Remove duplicate columns
df_anno_with_genotypes = df_anno_with_genotypes.loc[
:, ~df_anno_with_genotypes.columns.duplicated()
]
df_anno_with_genotypes.index = df_annotation.index

df_pre_filter = (
Expand Down Expand Up @@ -746,7 +760,6 @@ def cleanup_post_filter(df_post_filter):
return df_post_filter

df_post_filter = pre_filter_maf.copy()

df_post_filter["Status"] = ""

for i, mut in df_post_filter.iterrows():
Expand Down Expand Up @@ -937,10 +950,10 @@ def format_maf_for_mpath(df_post_filter):
# sys.exit(f"The following required columns are missing: {missing_columns_str}")

# compute exon and intron columns
renamed_maf_w_dummy_cols["EXON"] = np.vectorize(get_exon, otypes=[str])(
renamed_maf_w_dummy_cols["EXON"], renamed_maf_w_dummy_cols["INTRON"]
)
renamed_maf_w_dummy_cols = renamed_maf_w_dummy_cols.drop(["INTRON"], axis=1)
# renamed_maf_w_dummy_cols["EXON"] = np.vectorize(get_exon, otypes=[str])(
# renamed_maf_w_dummy_cols["EXON"], renamed_maf_w_dummy_cols["INTRON"]
# )
# renamed_maf_w_dummy_cols = renamed_maf_w_dummy_cols.drop(["INTRON"], axis=1)

# computing all the gnomad associated cols
# "TypeError: '>=' not supported between instances of 'float' and 'str'"
Expand Down
1 change: 1 addition & 0 deletions postprocessing_variant_calls/maf/filter/filter_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ def access_filters(
tumor_samplename,
normal_samplename,
)

# calls to apply_filter_maf (tagging functions)
df_post_filter = apply_filter_maf(pre_filter_maf, **args)
# calls to condensed post filter maf
Expand Down
76 changes: 36 additions & 40 deletions postprocessing_variant_calls/maf/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def make_condensed_post_filter(df_post_filter):
def _find_VAFandsummary(df, sample_group): # add category as third argumnet
# add a line of code here to rename the simplex, duplex and simplex_duplex columns with a prefix of the category they belong to.
df = df.copy()

# find the VAF from the fillout (the comma separated string values that the summary will later be calculated from)
# NOTE: col [t_vaf_fragment] already calculated by traceback, no need to create column again

Expand Down Expand Up @@ -305,6 +306,38 @@ def read_tsv(tsv, separator, canonical_tx_ref_flag=False):
return pd.read_csv(tsv, sep=separator, skiprows=skip, low_memory=False)


def tag_by_hotspots(input_maf, hotspots_maf):
    """Tag variants in a MAF data frame that appear in a hotspots MAF file.

    A variant is matched on the composite of Chromosome, Start_Position,
    Reference_Allele and Tumor_Seq_Allele2.

    Args:
        input_maf (pd.DataFrame): MAF/tsv-like data frame to tag. A
            ``hotspot_whitelist`` column is added to it in place.
        hotspots_maf (str): Path to a MAF/tsv-like file listing hotspots,
            read as tab-delimited with a header row.

    Returns:
        pd.DataFrame: The tagged data frame, with ``hotspot_whitelist`` set
        to "Yes" for hotspot variants and "No" otherwise.
    """
    cols = ["Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2"]

    # Collect every hotspot key once. Plain strings are sufficient for the
    # membership test below; the previous string -> char-tuple round trip
    # on both sides was redundant.
    hotspots = set()
    with open(hotspots_maf, "r") as infile:
        reader = csv.DictReader(infile, delimiter="\t")
        for row in reader:
            hotspots.add(":".join(row[k] for k in cols))

    # Build the same composite key per variant as a standalone Series so no
    # temporary "key" column is left behind on the caller's data frame.
    keys = input_maf[cols].astype(str).agg(":".join, axis=1)

    # Default every variant to "No", then flip the hotspot matches to "Yes".
    input_maf["hotspot_whitelist"] = "No"
    input_maf.loc[keys.isin(hotspots), "hotspot_whitelist"] = "Yes"

    return input_maf


def get_row(tsv_file):
"""Function to skip rows
Expand Down Expand Up @@ -699,7 +732,7 @@ def tag_by_variant_classification(self, output_dir, ref_lst):

def is_exonic_or_silent(row, ref_lst):
exonic_result = IS_EXONIC_CLASS(
row.Hugo_Symbol, row.Variant_Classification, row.vcf_pos
row.Hugo_Symbol, row.Variant_Classification, row.Start_Position
)
if exonic_result:
if row.Transcript_ID in ref_lst:
Expand Down Expand Up @@ -737,43 +770,6 @@ def tag_row(row, canonical):

return maf

# file_names = [
# EXONIC_FILTERED,
# SILENT_FILTERED,
# NONPANEL_EXONIC_FILTERED,
# NONPANEL_SILENT_FILTERED,
# DROPPED
# ]

# Create exonic, silent, and nonpanel files.
# file_paths = [f"{output_dir}/{name}" for name in file_names]

# headers = "\t".join(MAF_TSV_COL_MAP.values()) + "\n"

# with ExitStack() as stack:
# files = [stack.enter_context(open(path, "w")) for path in file_paths]

# for file in files:
# file.write(headers)

# with open(output_dir + "/" + EXONIC_FILTERED, "w") as exonic, open(
# output_dir + "/" + SILENT_FILTERED, "w") as silent, open(
# output_dir + "/" + NONPANEL_EXONIC_FILTERED, "w") as nonpanel_exonic, open(
# output_dir + "/" + NONPANEL_SILENT_FILTERED, "w") as nonpanel_silent, open(
# output_dir + "/" + DROPPED, "w") as dropped:

# headers = "\t".join(MAF_TSV_COL_MAP.values()) + "\n"
# for f in [exonic, silent, nonpanel_exonic, nonpanel_silent, dropped]:
# f.write(headers)

return headers

# typer.secho(
# f"missing columns expected for {tagging} tagging",
# fg=typer.colors.RED,
# )
# raise typer.Abort()

def tag_by_variant_annotations(self, rules_df):
if rules_df is not None:
for index, row in rules_df.iterrows():
Expand Down Expand Up @@ -809,13 +805,13 @@ def tag_by_variant_annotations(self, rules_df):
if End_Position != "none":
condition &= self.data_frame["End_Position"] <= float(End_Position)

colname = " ".join(Tag_Column_Name)
colname = "".join(Tag_Column_Name)
tag_column_name = f"is_{colname}_variant"
self.data_frame[tag_column_name] = "No"
self.data_frame.loc[condition, tag_column_name] = "Yes"
else:
typer.secho(
f"MAF File is empty. Please check your inputs again.",
f"MAF Rules JSON File is empty. Please check your inputs again.",
fg=typer.colors.RED,
)
raise typer.Abort()
Expand Down
8 changes: 4 additions & 4 deletions postprocessing_variant_calls/maf/tag/tag_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,13 +525,13 @@
# ("Start_Position", "Start"),
# ("Reference_Allele", "Ref"),
# ("Tumor_Seq_Allele2", "Alt"),
("vcf_pos", "Start"),
("VCF_REF", "Ref"),
("VCF_ALT", "Alt"),
("Start_Position", "Start_Position"),
("Reference_Allele", "Reference_Allele"),
("Tumor_Seq_Allele2", "Tumor_Seq_Allele2"),
("Variant_Classification", "VariantClass"),
("Hugo_Symbol", "Gene"),
("Call_Confidence", "Call_Confidence"),
("EXON", "Exon"),
("Exon_Number", "Exon_Number"),
("Transcript_ID", "TranscriptID"),
("Comments", "Comments"),
("HGVSc", "cDNAchange"),
Expand Down
Loading

0 comments on commit 5e7f718

Please sign in to comment.