feat: Change folder structure for fp-fn results, add collection of fp-fn results for all covs of a callset and all callsets of a benchmark.
BiancaStoecker committed Nov 21, 2024
1 parent db68793 commit 4b1a64b
Showing 4 changed files with 153 additions and 3 deletions.
15 changes: 14 additions & 1 deletion workflow/rules/common.smk
@@ -467,6 +467,12 @@ def get_collect_stratifications_input(wildcards):
         cov=get_nonempty_coverages(wildcards),
     )
 
+
+def get_collect_stratifications_fp_fn_input(wildcards):
+    return expand(
+        "results/fp-fn/callsets/{{callset}}/{cov}.{{classification}}.tsv",
+        cov=get_nonempty_coverages(wildcards),
+    )
 
 
 def get_fp_fn_reports(wildcards):
     for genome in used_genomes:
@@ -496,6 +502,12 @@ def get_collect_precision_recall_input(wildcards):
         "results/precision-recall/callsets/{callset}.{{vartype}}.tsv", callset=callsets
     )
 
+
+def get_collect_fp_fn_benchmark_input(wildcards):
+    callsets = get_benchmark_callsets(wildcards.benchmark)
+    return expand(
+        "results/fp-fn/callsets/{callset}.{{classification}}.tsv", callset=callsets
+    )
 
 
 def get_genome_name(wildcards):
     if hasattr(wildcards, "benchmark"):
@@ -552,7 +564,8 @@ def get_collect_fp_fn_callsets(wildcards):
 def get_collect_fp_fn_input(wildcards):
     callsets = get_collect_fp_fn_callsets(wildcards)
     return expand(
-        "results/fp-fn/callsets/{{cov}}/{callset}/{{classification}}.tsv",
+        "results/fp-fn/callsets/{callset}/{{cov}}.{{classification}}.tsv",
+        # "results/fp-fn/callsets/{{cov}}/{callset}/{{classification}}.tsv",
         callset=callsets,
     )
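For reference, Snakemake's expand keeps doubly-braced wildcards (here {{callset}} and {{classification}}) as literal wildcards in the resulting pattern and only fills in the singly-braced ones. A minimal sketch of how the new per-callset pattern resolves, with made-up coverage bin names:

from snakemake.io import expand

# Hypothetical coverage bins; in the workflow they come from get_nonempty_coverages().
print(expand(
    "results/fp-fn/callsets/{{callset}}/{cov}.{{classification}}.tsv",
    cov=["low", "high"],
))
# ['results/fp-fn/callsets/{callset}/low.{classification}.tsv',
#  'results/fp-fn/callsets/{callset}/high.{classification}.tsv']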
40 changes: 38 additions & 2 deletions workflow/rules/eval.smk
@@ -338,9 +338,9 @@ rule extract_fp_fn:
         calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
         common_src=common_src,
     output:
-        "results/fp-fn/callsets/{cov}/{callset}/{classification}.tsv",
+        "results/fp-fn/callsets/{callset}/{cov}.{classification}.tsv",
     log:
-        "logs/extract-fp-fn/{cov}/{callset}/{classification}.log",
+        "logs/extract-fp-fn/{callset}/{cov}.{classification}.log",
     conda:
         "../envs/vembrane.yaml"
     script:
@@ -374,6 +374,42 @@ rule collect_fp_fn:
         "../scripts/collect-fp-fn.py"
+
+
+rule collect_stratifications_fp_fn:
+    input:
+        get_collect_stratifications_fp_fn_input,
+    output:
+        "results/fp-fn/callsets/{callset}.{classification}.tsv",
+    params:
+        coverages=get_nonempty_coverages,
+        coverage_lower_bounds=get_coverages,
+    log:
+        "logs/fp-fn/callsets/{callset}.{classification}.log",
+    conda:
+        "../envs/stats.yaml"
+    # This has to happen after precision/recall has been computed; otherwise we risk
+    # extremely high memory usage if a callset does not match the truth at all.
+    priority: 1
+    script:
+        "../scripts/collect-stratifications-fp-fn.py"
+
+
+rule collect_fp_fn_benchmark:
+    input:
+        tables=get_collect_fp_fn_benchmark_input,
+    output:
+        "results/fp-fn/benchmarks/{benchmark}.{classification}.tsv",
+    params:
+        callsets=lambda w: get_benchmark_callsets(w.benchmark),
+        # labels=get_collect_precision_recall_labels,
+        # vaf=get_vaf_status,
+    log:
+        "logs/fp-fn/benchmarks/{benchmark}.{classification}.log",
+    conda:
+        "../envs/stats.yaml"
+    script:
+        "../scripts/collect-fp-fn-benchmarks.py"
 
 
 rule report_fp_fn:
     input:
         main_dataset="results/fp-fn/genomes/{genome}/{cov}/{classification}/main.tsv",
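Taken together, these rules form a three-level hierarchy under results/fp-fn/: per-coverage tables, per-callset collections, and per-benchmark collections. A sketch of the files produced for a single callset, using made-up names for the callset, benchmark, coverage bins, and classification:

# Level 1: one table per coverage bin (rule extract_fp_fn).
per_cov = [
    "results/fp-fn/callsets/sample1/low.fp.tsv",
    "results/fp-fn/callsets/sample1/high.fp.tsv",
]
# Level 2: all coverage bins of one callset (rule collect_stratifications_fp_fn).
per_callset = "results/fp-fn/callsets/sample1.fp.tsv"
# Level 3: all callsets of one benchmark (rule collect_fp_fn_benchmark).
per_benchmark = "results/fp-fn/benchmarks/bench1.fp.tsv"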
45 changes: 45 additions & 0 deletions workflow/scripts/collect-fp-fn-benchmarks.py
@@ -0,0 +1,45 @@
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def load_data(path, callset):
    d = pd.read_csv(path, sep="\t")
    d.insert(0, "callset", callset)
    return d


results = pd.concat(
    [
        load_data(f, callset)
        for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
    ],
    axis="rows",
)


def cov_key(cov_label):
    # Return the lower bound of the coverage label ("10..30" or "≥60") as an
    # integer for sorting.
    if ".." in cov_label:
        return int(cov_label.split("..")[0])
    else:
        return int(cov_label[1:])


def sort_key(col):
    if col.name == "coverage":
        return col.apply(cov_key)
    else:
        return col


# if snakemake.params.vaf:
#     results.sort_values(["callset", "vaf", "coverage"], inplace=True, key=sort_key)
# else:
results.sort_values(["callset", "coverage"], inplace=True, key=sort_key)
results["sort_index"] = results["coverage"].apply(cov_key)

results.to_csv(snakemake.output[0], sep="\t", index=False)
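A quick illustration of the ordering cov_key provides (labels are made up; plain lexicographic sorting would put "10..30" before "5..10"):

labels = ["≥30", "10..30", "5..10"]
print(sorted(labels, key=cov_key))  # ['5..10', '10..30', '≥30']
print(sorted(labels))               # ['10..30', '5..10', '≥30']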
56 changes: 56 additions & 0 deletions workflow/scripts/collect-stratifications-fp-fn.py
@@ -0,0 +1,56 @@
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_cov_label(coverage):
    lower = snakemake.params.coverage_lower_bounds[coverage]
    bounds = [
        bound
        for bound in snakemake.params.coverage_lower_bounds.values()
        if bound > lower
    ]
    if bounds:
        upper = min(bounds)
        return f"{lower}..{upper}"
    else:
        return f"≥{lower}"


def load_data(f, coverage):
    d = pd.read_csv(f, sep="\t")
    d.insert(0, "coverage", get_cov_label(coverage))
    return d


if snakemake.input:
    report = pd.concat(
        load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
    )

    # TODO: With separate files for SNVs and indels (e.g. with STRELKA), no predicted
    # variants are expected for the respective other type.
    # If relevant later, add an annotation to the report.
    # if (report["tp_truth"] == 0).all():
    #     raise ValueError(
    #         f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
    #         "This is likely a technical issue in the callset and should be checked before further evaluation."
    #     )

    report.to_csv(snakemake.output[0], sep="\t", index=False)
else:
    # No input files: write an empty table with the expected columns.
    pd.DataFrame(
        {
            col: []
            for col in [
                "coverage",
                "class",
                "chromosome",
                "position",
                "ref_allele",
                "alt_allele",
                "true_genotype",
                "predicted_genotype",
            ]
        }
    ).to_csv(snakemake.output[0], sep="\t", index=False)
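As a worked example of the coverage labels this script produces, here is the get_cov_label logic applied to a plain dict of hypothetical lower bounds instead of snakemake.params:

coverage_lower_bounds = {"low": 0, "medium": 10, "high": 30}  # hypothetical bins

def get_cov_label(coverage):
    # Same logic as above: half-open bin up to the next-larger lower bound,
    # or an open-ended "≥" bin for the largest one.
    lower = coverage_lower_bounds[coverage]
    bounds = [b for b in coverage_lower_bounds.values() if b > lower]
    return f"{lower}..{min(bounds)}" if bounds else f"≥{lower}"

print(get_cov_label("low"))     # 0..10
print(get_cov_label("medium"))  # 10..30
print(get_cov_label("high"))    # ≥30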
