Skip to content

Commit

Permalink
Merge pull request #48 from msk-access/feature/remove_variant_by_annot
Browse files Browse the repository at this point in the history
Feature/remove variant by annot
  • Loading branch information
buehlere authored Sep 27, 2024
2 parents 4d5f9f1 + 8cd69ee commit 5e7f718
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 52 deletions.
23 changes: 18 additions & 5 deletions postprocessing_variant_calls/maf/filter/filter_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def _create_fillout_summary(df_fillout, alt_thres, mutation_key):
for col in df_fillout.columns
if not col.endswith(("_simplex_duplex", "_duplex", "_simplex"))
]

duplex_columns = [
col
for col in df_fillout.columns
Expand All @@ -237,6 +238,7 @@ def _create_fillout_summary(df_fillout, alt_thres, mutation_key):
simplex_duplex_columns = base_columns + [
col for col in df_fillout.columns if col.endswith(("_simplex_duplex"))
]

simplex_duplex_df = df_fillout[simplex_duplex_columns]
simplex_duplex_df["fillout_type"] = f"{fillout_type}SIMPLEX_DUPLEX_"
new_fillout_type_SD = f"{fillout_type}SIMPLEX_DUPLEX_"
Expand Down Expand Up @@ -336,6 +338,10 @@ def _extract_tn_genotypes(
"t_alt_count_fragment_simplex",
"t_ref_count_fragment_simplex",
"t_vaf_fragment_simplex",
"is_exonic_variant",
"is_MET_variant",
"is_TERT_variant",
"hotspot_whitelist",
]
]

Expand Down Expand Up @@ -367,6 +373,10 @@ def _extract_tn_genotypes(
"t_alt_count_fragment_standard",
"t_ref_count_fragment_standard",
"t_vaf_fragment_standard",
"is_exonic_variant",
"is_MET_variant",
"is_TERT_variant",
"hotspot_whitelist",
]
]
df_n_genotype.insert(0, "Matched_Norm_Sample_Barcode", str(normal_samplename))
Expand Down Expand Up @@ -493,6 +503,10 @@ def make_pre_filtered_maf(
axis=1,
)

# Remove duplicate columns
df_anno_with_genotypes = df_anno_with_genotypes.loc[
:, ~df_anno_with_genotypes.columns.duplicated()
]
df_anno_with_genotypes.index = df_annotation.index

df_pre_filter = (
Expand Down Expand Up @@ -746,7 +760,6 @@ def cleanup_post_filter(df_post_filter):
return df_post_filter

df_post_filter = pre_filter_maf.copy()

df_post_filter["Status"] = ""

for i, mut in df_post_filter.iterrows():
Expand Down Expand Up @@ -937,10 +950,10 @@ def format_maf_for_mpath(df_post_filter):
# sys.exit(f"The following required columns are missing: {missing_columns_str}")

# compute exon and intron columns
renamed_maf_w_dummy_cols["EXON"] = np.vectorize(get_exon, otypes=[str])(
renamed_maf_w_dummy_cols["EXON"], renamed_maf_w_dummy_cols["INTRON"]
)
renamed_maf_w_dummy_cols = renamed_maf_w_dummy_cols.drop(["INTRON"], axis=1)
# renamed_maf_w_dummy_cols["EXON"] = np.vectorize(get_exon, otypes=[str])(
# renamed_maf_w_dummy_cols["EXON"], renamed_maf_w_dummy_cols["INTRON"]
# )
# renamed_maf_w_dummy_cols = renamed_maf_w_dummy_cols.drop(["INTRON"], axis=1)

# computing all the gnomad associated cols
# "TypeError: '>=' not supported between instances of 'float' and 'str'"
Expand Down
1 change: 1 addition & 0 deletions postprocessing_variant_calls/maf/filter/filter_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ def access_filters(
tumor_samplename,
normal_samplename,
)

# calls to apply_filter_maf (tagging functions)
df_post_filter = apply_filter_maf(pre_filter_maf, **args)
# calls to condensed post filter maf
Expand Down
76 changes: 36 additions & 40 deletions postprocessing_variant_calls/maf/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def make_condensed_post_filter(df_post_filter):
def _find_VAFandsummary(df, sample_group): # add category as third argumnet
# add a line of code here to rename the simplex, duplex and simplex_duplex columns with a prefix of the category they belong to.
df = df.copy()

# find the VAF from the fillout (the comma separated string values that the summary will later be calculated from)
# NOTE: col [t_vaf_fragment] already calculated by traceback, no need to create column again

Expand Down Expand Up @@ -305,6 +306,38 @@ def read_tsv(tsv, separator, canonical_tx_ref_flag=False):
return pd.read_csv(tsv, sep=separator, skiprows=skip, low_memory=False)


def tag_by_hotspots(input_maf, hotspots_maf):
    """Tag variants in a MAF data frame that appear in a hotspots MAF file.

    A variant is matched on the composite of Chromosome, Start_Position,
    Reference_Allele and Tumor_Seq_Allele2.

    Args:
        input_maf (pd.DataFrame): MAF/tsv-like data frame to tag. A
            ``hotspot_whitelist`` column is added to it in place.
        hotspots_maf (str): Path to a MAF/tsv-like file listing hotspots,
            read as tab-delimited with a header row.

    Returns:
        pd.DataFrame: The tagged data frame, with ``hotspot_whitelist`` set
        to "Yes" for hotspot variants and "No" otherwise.
    """
    cols = ["Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2"]

    # Collect every hotspot key once. Plain strings are sufficient for the
    # membership test below; the previous string -> char-tuple round trip
    # on both sides was redundant.
    hotspots = set()
    with open(hotspots_maf, "r") as infile:
        reader = csv.DictReader(infile, delimiter="\t")
        for row in reader:
            hotspots.add(":".join(row[k] for k in cols))

    # Build the same composite key per variant as a standalone Series so no
    # temporary "key" column is left behind on the caller's data frame.
    keys = input_maf[cols].astype(str).agg(":".join, axis=1)

    # Default every variant to "No", then flip the hotspot matches to "Yes".
    input_maf["hotspot_whitelist"] = "No"
    input_maf.loc[keys.isin(hotspots), "hotspot_whitelist"] = "Yes"

    return input_maf


def get_row(tsv_file):
"""Function to skip rows
Expand Down Expand Up @@ -699,7 +732,7 @@ def tag_by_variant_classification(self, output_dir, ref_lst):

def is_exonic_or_silent(row, ref_lst):
exonic_result = IS_EXONIC_CLASS(
row.Hugo_Symbol, row.Variant_Classification, row.vcf_pos
row.Hugo_Symbol, row.Variant_Classification, row.Start_Position
)
if exonic_result:
if row.Transcript_ID in ref_lst:
Expand Down Expand Up @@ -737,43 +770,6 @@ def tag_row(row, canonical):

return maf

# file_names = [
# EXONIC_FILTERED,
# SILENT_FILTERED,
# NONPANEL_EXONIC_FILTERED,
# NONPANEL_SILENT_FILTERED,
# DROPPED
# ]

# Create exonic, silent, and nonpanel files.
# file_paths = [f"{output_dir}/{name}" for name in file_names]

# headers = "\t".join(MAF_TSV_COL_MAP.values()) + "\n"

# with ExitStack() as stack:
# files = [stack.enter_context(open(path, "w")) for path in file_paths]

# for file in files:
# file.write(headers)

# with open(output_dir + "/" + EXONIC_FILTERED, "w") as exonic, open(
# output_dir + "/" + SILENT_FILTERED, "w") as silent, open(
# output_dir + "/" + NONPANEL_EXONIC_FILTERED, "w") as nonpanel_exonic, open(
# output_dir + "/" + NONPANEL_SILENT_FILTERED, "w") as nonpanel_silent, open(
# output_dir + "/" + DROPPED, "w") as dropped:

# headers = "\t".join(MAF_TSV_COL_MAP.values()) + "\n"
# for f in [exonic, silent, nonpanel_exonic, nonpanel_silent, dropped]:
# f.write(headers)

return headers

# typer.secho(
# f"missing columns expected for {tagging} tagging",
# fg=typer.colors.RED,
# )
# raise typer.Abort()

def tag_by_variant_annotations(self, rules_df):
if rules_df is not None:
for index, row in rules_df.iterrows():
Expand Down Expand Up @@ -809,13 +805,13 @@ def tag_by_variant_annotations(self, rules_df):
if End_Position != "none":
condition &= self.data_frame["End_Position"] <= float(End_Position)

colname = " ".join(Tag_Column_Name)
colname = "".join(Tag_Column_Name)
tag_column_name = f"is_{colname}_variant"
self.data_frame[tag_column_name] = "No"
self.data_frame.loc[condition, tag_column_name] = "Yes"
else:
typer.secho(
f"MAF File is empty. Please check your inputs again.",
f"MAF Rules JSON File is empty. Please check your inputs again.",
fg=typer.colors.RED,
)
raise typer.Abort()
Expand Down
8 changes: 4 additions & 4 deletions postprocessing_variant_calls/maf/tag/tag_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,13 +525,13 @@
# ("Start_Position", "Start"),
# ("Reference_Allele", "Ref"),
# ("Tumor_Seq_Allele2", "Alt"),
("vcf_pos", "Start"),
("VCF_REF", "Ref"),
("VCF_ALT", "Alt"),
("Start_Position", "Start_Position"),
("Reference_Allele", "Reference_Allele"),
("Tumor_Seq_Allele2", "Tumor_Seq_Allele2"),
("Variant_Classification", "VariantClass"),
("Hugo_Symbol", "Gene"),
("Call_Confidence", "Call_Confidence"),
("EXON", "Exon"),
("Exon_Number", "Exon_Number"),
("Transcript_ID", "TranscriptID"),
("Comments", "Comments"),
("HGVSc", "cDNAchange"),
Expand Down
Loading

0 comments on commit 5e7f718

Please sign in to comment.