feat: Change folder structure for fp-fn results, add collection of fp-fn results for all covs of a callset and all callsets of a benchmark.
BiancaStoecker committed Nov 21, 2024
1 parent db68793 commit 4b1a64b
Showing 4 changed files with 153 additions and 3 deletions.
15 changes: 14 additions & 1 deletion workflow/rules/common.smk
@@ -467,6 +467,12 @@ def get_collect_stratifications_input(wildcards):
         cov=get_nonempty_coverages(wildcards),
     )
 
+
+def get_collect_stratifications_fp_fn_input(wildcards):
+    return expand(
+        "results/fp-fn/callsets/{{callset}}/{cov}.{{classification}}.tsv",
+        cov=get_nonempty_coverages(wildcards),
+    )
 
 
 def get_fp_fn_reports(wildcards):
     for genome in used_genomes:
@@ -496,6 +502,12 @@ def get_collect_precision_recall_input(wildcards):
         "results/precision-recall/callsets/{callset}.{{vartype}}.tsv", callset=callsets
     )
 
+
+def get_collect_fp_fn_benchmark_input(wildcards):
+    callsets = get_benchmark_callsets(wildcards.benchmark)
+    return expand(
+        "results/fp-fn/callsets/{callset}.{{classification}}.tsv", callset=callsets
+    )
 
 
 def get_genome_name(wildcards):
     if hasattr(wildcards, "benchmark"):
@@ -552,7 +564,8 @@ def get_collect_fp_fn_callsets(wildcards):
 def get_collect_fp_fn_input(wildcards):
     callsets = get_collect_fp_fn_callsets(wildcards)
     return expand(
-        "results/fp-fn/callsets/{{cov}}/{callset}/{{classification}}.tsv",
+        "results/fp-fn/callsets/{callset}/{{cov}}.{{classification}}.tsv",
+        # "results/fp-fn/callsets/{{cov}}/{callset}/{{classification}}.tsv",
         callset=callsets,
     )
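For reference, Snakemake's expand keeps doubly-braced wildcards (here {{callset}} and {{classification}}) as literal wildcards in the resulting pattern and only fills in the singly-braced ones. A minimal sketch of how the new per-callset pattern resolves, with made-up coverage bin names:

from snakemake.io import expand

# Hypothetical coverage bins; in the workflow they come from get_nonempty_coverages().
print(expand(
    "results/fp-fn/callsets/{{callset}}/{cov}.{{classification}}.tsv",
    cov=["low", "high"],
))
# ['results/fp-fn/callsets/{callset}/low.{classification}.tsv',
#  'results/fp-fn/callsets/{callset}/high.{classification}.tsv']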
40 changes: 38 additions & 2 deletions workflow/rules/eval.smk
@@ -338,9 +338,9 @@ rule extract_fp_fn:
         calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
         common_src=common_src,
     output:
-        "results/fp-fn/callsets/{cov}/{callset}/{classification}.tsv",
+        "results/fp-fn/callsets/{callset}/{cov}.{classification}.tsv",
     log:
-        "logs/extract-fp-fn/{cov}/{callset}/{classification}.log",
+        "logs/extract-fp-fn/{callset}/{cov}.{classification}.log",
     conda:
         "../envs/vembrane.yaml"
     script:
@@ -374,6 +374,42 @@ rule collect_fp_fn:
         "../scripts/collect-fp-fn.py"
+
+
+rule collect_stratifications_fp_fn:
+    input:
+        get_collect_stratifications_fp_fn_input,
+    output:
+        "results/fp-fn/callsets/{callset}.{classification}.tsv",
+    params:
+        coverages=get_nonempty_coverages,
+        coverage_lower_bounds=get_coverages,
+    log:
+        "logs/fp-fn/callsets/{callset}.{classification}.log",
+    conda:
+        "../envs/stats.yaml"
+    # This has to happen after precision/recall has been computed; otherwise we risk
+    # extremely high memory usage if a callset does not match the truth at all.
+    priority: 1
+    script:
+        "../scripts/collect-stratifications-fp-fn.py"
+
+
+rule collect_fp_fn_benchmark:
+    input:
+        tables=get_collect_fp_fn_benchmark_input,
+    output:
+        "results/fp-fn/benchmarks/{benchmark}.{classification}.tsv",
+    params:
+        callsets=lambda w: get_benchmark_callsets(w.benchmark),
+        # labels=get_collect_precision_recall_labels,
+        # vaf=get_vaf_status,
+    log:
+        "logs/fp-fn/benchmarks/{benchmark}.{classification}.log",
+    conda:
+        "../envs/stats.yaml"
+    script:
+        "../scripts/collect-fp-fn-benchmarks.py"
 
 
 rule report_fp_fn:
     input:
         main_dataset="results/fp-fn/genomes/{genome}/{cov}/{classification}/main.tsv",
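Taken together, these rules form a three-level hierarchy under results/fp-fn/: per-coverage tables, per-callset collections, and per-benchmark collections. A sketch of the files produced for a single callset, using made-up names for the callset, benchmark, coverage bins, and classification:

# Level 1: one table per coverage bin (rule extract_fp_fn).
per_cov = [
    "results/fp-fn/callsets/sample1/low.fp.tsv",
    "results/fp-fn/callsets/sample1/high.fp.tsv",
]
# Level 2: all coverage bins of one callset (rule collect_stratifications_fp_fn).
per_callset = "results/fp-fn/callsets/sample1.fp.tsv"
# Level 3: all callsets of one benchmark (rule collect_fp_fn_benchmark).
per_benchmark = "results/fp-fn/benchmarks/bench1.fp.tsv"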
45 changes: 45 additions & 0 deletions workflow/scripts/collect-fp-fn-benchmarks.py
@@ -0,0 +1,45 @@
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def load_data(path, callset):
    d = pd.read_csv(path, sep="\t")
    d.insert(0, "callset", callset)
    return d


results = pd.concat(
    [
        load_data(f, callset)
        for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
    ],
    axis="rows",
)


def cov_key(cov_label):
    # Return the lower bound of the coverage label ("10..30" or "≥60") as an
    # integer for sorting.
    if ".." in cov_label:
        return int(cov_label.split("..")[0])
    else:
        return int(cov_label[1:])


def sort_key(col):
    if col.name == "coverage":
        return col.apply(cov_key)
    else:
        return col


# if snakemake.params.vaf:
#     results.sort_values(["callset", "vaf", "coverage"], inplace=True, key=sort_key)
# else:
results.sort_values(["callset", "coverage"], inplace=True, key=sort_key)
results["sort_index"] = results["coverage"].apply(cov_key)

results.to_csv(snakemake.output[0], sep="\t", index=False)
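A quick illustration of the ordering cov_key provides (labels are made up; plain lexicographic sorting would put "10..30" before "5..10"):

labels = ["≥30", "10..30", "5..10"]
print(sorted(labels, key=cov_key))  # ['5..10', '10..30', '≥30']
print(sorted(labels))               # ['10..30', '5..10', '≥30']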
56 changes: 56 additions & 0 deletions workflow/scripts/collect-stratifications-fp-fn.py
@@ -0,0 +1,56 @@
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_cov_label(coverage):
    lower = snakemake.params.coverage_lower_bounds[coverage]
    bounds = [
        bound
        for bound in snakemake.params.coverage_lower_bounds.values()
        if bound > lower
    ]
    if bounds:
        upper = min(bounds)
        return f"{lower}..{upper}"
    else:
        return f"≥{lower}"


def load_data(f, coverage):
    d = pd.read_csv(f, sep="\t")
    d.insert(0, "coverage", get_cov_label(coverage))
    return d


if snakemake.input:
    report = pd.concat(
        load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
    )

    # TODO: With separate files for SNVs and indels (e.g. with STRELKA), no predicted
    # variants are expected for the respective other type.
    # If relevant later, add an annotation to the report.
    # if (report["tp_truth"] == 0).all():
    #     raise ValueError(
    #         f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
    #         "This is likely a technical issue in the callset and should be checked before further evaluation."
    #     )

    report.to_csv(snakemake.output[0], sep="\t", index=False)
else:
    # No input files: write an empty table with the expected columns.
    pd.DataFrame(
        {
            col: []
            for col in [
                "coverage",
                "class",
                "chromosome",
                "position",
                "ref_allele",
                "alt_allele",
                "true_genotype",
                "predicted_genotype",
            ]
        }
    ).to_csv(snakemake.output[0], sep="\t", index=False)
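As a worked example of the coverage labels this script produces, here is the get_cov_label logic applied to a plain dict of hypothetical lower bounds instead of snakemake.params:

coverage_lower_bounds = {"low": 0, "medium": 10, "high": 30}  # hypothetical bins

def get_cov_label(coverage):
    # Same logic as above: half-open bin up to the next-larger lower bound,
    # or an open-ended "≥" bin for the largest one.
    lower = coverage_lower_bounds[coverage]
    bounds = [b for b in coverage_lower_bounds.values() if b > lower]
    return f"{lower}..{min(bounds)}" if bounds else f"≥{lower}"

print(get_cov_label("low"))     # 0..10
print(get_cov_label("medium"))  # 10..30
print(get_cov_label("high"))    # ≥30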
