Skip to content

Commit

Permalink
Fixing a critical bug in the assignment of samples to reference genom…
Browse files Browse the repository at this point in the history
…es and replacing the sourmash dbs with the appropriate versions
  • Loading branch information
marchoeppner committed Jan 17, 2025
1 parent 9648d84 commit fab2d72
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 63 deletions.
9 changes: 9 additions & 0 deletions assets/gabi_template.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ <h2>Summary</h2>
<th colspan=2 scope="col"><div class="tooltip">Reference genome<span class="tooltiptext">The highest matching hit in RefSeq to this assembly</span></div></th>
<th colspan=5 scope="col"><div class="tooltip">Assembly<span class="tooltiptext">Information about this assembly</span></div></th>
<th colspan=4 scope="col"><div class="tooltip">Mean coverage<span class="tooltiptext">Mean coverage of reads mapped back to the assembly - bigger is better</span></div></th>
<th colspan=4 scope="col"><div class="tooltip">Coverage 40X (%)<span class="tooltiptext">Percentage of assembly covered at 40X or more</span></div></th>
<th colspan=3 scope="col"><div class="tooltip">Read quality<span class="tooltiptext">Quality metrics of reads after trimming</span></div></th>
<th colspan=2 scope="col"><div class="tooltip">Contamination<span class="tooltiptext">Indicators of contamination</span></div></th>
</tr>
Expand All @@ -123,6 +124,10 @@ <h2>Summary</h2>
<th scope="subcol">ILM</th>
<th scope="subcol">ONT</th>
<th scope="subcol">HiFi</th>
<th scope="subcol"><div class="tooltip">Total<span class="tooltiptext">Across all sequencing technologies</span></div></th>
<th scope="subcol"><div class="tooltip">ILM<span class="tooltiptext">Illumina reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">ONT<span class="tooltiptext">ONT reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">HiFi<span class="tooltiptext">Pacbio reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">ILM Q30 (%)<span class="tooltiptext">Fraction of Illumina reads above Q30.</span></div></th>
<th scope="subcol"><div class="tooltip">ONT Q15 (#)<span class="tooltiptext">Number of ONT reads above Q15.</span></div></th>
<th scope="subcol"><div class="tooltip">ONT N50 (bp)<span class="tooltiptext">N50 of ONT reads</span></div></th>
Expand All @@ -149,6 +154,10 @@ <h2>Summary</h2>
<td scope={{row.coverage_illumina_status}}>{{row.coverage_illumina}}</td>
<td scope={{row.coverage_nanopore_status}}>{{row.coverage_nanopore}}</td>
<td scope={{row.coverage_pacbio_status}}>{{row.coverage_pacbio}}</td>
<td scope={{row.coverage_40_status}}>{{row.coverage_40}}</td>
<td scope={{row.coverage_40_illumina_status}}>{{row.coverage_40_illumina}}</td>
<td scope={{row.coverage_40_nanopore_status}}>{{row.coverage_40_nanopore}}</td>
<td scope={{row.coverage_40_pacbio_status}}>{{row.coverage_40_pacbio}}</td>
<td scope={{row.quality_illumina_status}}>{{row.quality_illumina}}</td>
<td scope="missing">{{row.quality_nanopore}}</td>
<td scope="missing">{{row.nanopore_n50}}</td>
Expand Down
140 changes: 94 additions & 46 deletions bin/gabi.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ def main(yaml, template, output, reference, version, call, wd):
if "fastp" in jdata:
fastp_q30_status = status["pass"]
fastp_summary = jdata["fastp"]["summary"]
fastp_q30 = round(fastp_summary["after_filtering"]["q30_rate"], 2)
if fastp_q30 < 0.85:
fastp_q30 = (round(fastp_summary["after_filtering"]["q30_rate"], 2) * 100)
if fastp_q30 < 85:
fastp_q30_status = status["warn"]

##########################
Expand Down Expand Up @@ -208,7 +208,7 @@ def main(yaml, template, output, reference, version, call, wd):
# Get assembly stats
####################

assembly = round((int(jdata["quast"]["Total length"])/1000000), 2)
assembly = round((int(jdata["quast"]["Total length"]) / 1000000), 2)
assembly_status = check_assembly(this_refs, int(jdata["quast"]["Total length"]))

genome_fraction = "-"
Expand All @@ -222,7 +222,6 @@ def main(yaml, template, output, reference, version, call, wd):

if "Genome fraction (%)" in jdata["quast"]:
genome_fraction = round(float(jdata["quast"]["Genome fraction (%)"]), 2)
genome_fraction_status = status["pass"]
quast["duplication"] = jdata["quast"]["Duplication ratio"]
quast["misassembled"] = jdata["quast"]["# misassembled contigs"]
quast["mismatches"] = jdata["quast"]["# mismatches per 100 kbp"]
Expand All @@ -233,16 +232,16 @@ def main(yaml, template, output, reference, version, call, wd):
contigs = int(jdata["quast"]["# contigs"])
contigs_status = check_contigs(this_refs, int(jdata["quast"]["# contigs"]))

n50 = round((int(jdata["quast"]["N50"])/1000), 2)
n50 = round((int(jdata["quast"]["N50"]) / 1000), 2)
n50_status = check_n50(this_refs, int(jdata["quast"]["N50"]))

quast["size"] = jdata["quast"]["Total length (>= 0 bp)"]
quast["N"] = jdata["quast"]["# N's per 100 kbp"]
quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"])/1000), 2)
quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"]) / 1000), 2)
quast["contigs_1k"] = jdata["quast"]["# contigs (>= 1000 bp)"]
quast["contigs_5k"] = jdata["quast"]["# contigs (>= 5000 bp)"]
quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"])/1000000), 2)
quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"])/1000000), 2)
quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"]) / 1000000), 2)
quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"]) / 1000000), 2)
quast["gc"] = float(jdata["quast"]["GC (%)"])
quast["gc_status"] = check_gc(this_refs, float(jdata["quast"]["GC (%)"]))

Expand Down Expand Up @@ -281,10 +280,10 @@ def main(yaml, template, output, reference, version, call, wd):
busco = jdata["busco"]
busco_status = status["missing"]
busco_total = int(busco["dataset_total_buscos"])
busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100
busco_fragmented = round((int(busco["F"])/busco_total), 2)*100
busco_missing = round((int(busco["M"])/busco_total), 2)*100
busco_duplicated = round((int(busco["D"])/busco_total), 2)*100
busco_completeness = round(((int(busco["C"])) / int(busco_total)), 2) * 100
busco_fragmented = round((int(busco["F"]) / busco_total), 2) * 100
busco_missing = round((int(busco["M"]) / busco_total), 2) * 100
busco_duplicated = round((int(busco["D"]) / busco_total), 2) * 100
busco["completeness"] = busco_completeness
busco_data_all.append({"Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated})

Expand Down Expand Up @@ -331,6 +330,8 @@ def main(yaml, template, output, reference, version, call, wd):
coverage_pacbio = "-"
coverage_pacbio_status = status["missing"]

# mean coverages

if "total" in jdata["mosdepth"]:
coverage = float(jdata["mosdepth"]["total"]["mean"])
if coverage >= 40:
Expand All @@ -340,40 +341,78 @@ def main(yaml, template, output, reference, version, call, wd):
else:
coverage_status = status["pass"]

if ("illumina" in jdata["mosdepth"]):

coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
if coverage_illumina >= 40:
coverage_illumina_status = status["pass"]
elif coverage_illumina >= 20:
coverage_illumina_status = status["warn"]
if "illumina" in jdata["mosdepth"]:
coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
if coverage_illumina >= 40:
coverage_illumina_status = status["pass"]
elif coverage_illumina >= 20:
coverage_illumina_status = status["warn"]
else:
coverage_illumina_status = status["fail"]

if "nanopore" in jdata["mosdepth"]:
coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
if coverage_nanopore >= 40:
coverage_nanopore_status = status["pass"]
elif coverage_nanopore >= 20:
coverage_nanopore_status = status["warn"]
else:
coverage_nanopore_status = status["fail"]

if "pacbio" in jdata["mosdepth"]:
coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
if coverage_pacbio >= 40:
coverage_pacbio_status = status["pass"]
elif coverage_pacbio >= 20:
coverage_pacbio_status = status["warn"]
else:
coverage_pacbio_status = status["fail"]

# fraction covered at 40X

coverage_40 = "-"
coverage_40_status = status["missing"]
coverage_40_illumina = "-"
coverage_40_illumina_status = status["missing"]
coverage_40_nanopore = "-"
coverage_40_nanopore_status = status["missing"]
coverage_40_pacbio = "-"
coverage_40_pacbio_status = status["missing"]

if "mosdepth_global" in jdata:
if "illumina" in jdata["mosdepth_global"]:
coverage_40_illumina = jdata["mosdepth_global"]["illumina"]["40"]
if coverage_40_illumina < 90:
coverage_40_illumina_status = status["warn"]
else:
coverage_40_illumina_status = status["pass"]
if "nanopore" in jdata["mosdepth_global"]:
coverage_40_nanopore = jdata["mosdepth_global"]["nanopore"]["40"]
if coverage_40_nanopore < 90:
coverage_40_nanopore_status = status["warn"]
else:
coverage_illumina_status = status["fail"]

if ("nanopore" in jdata["mosdepth"]):
coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
if coverage_nanopore >= 40:
coverage_nanopore_status = status["pass"]
elif coverage_nanopore >= 20:
coverage_nanopore_status = status["warn"]
coverage_40_nanopore_status = status["pass"]
if "pacbio" in jdata["mosdepth_global"]:
coverage_40_pacbio = jdata["mosdepth_global"]["pacbio"]["40"]
if coverage_40_pacbio < 90:
coverage_40_pacbio_status = status["warn"]
else:
coverage_nanopore_status = status["fail"]

if ("pacbio" in jdata["mosdepth"]):
coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
if coverage_pacbio >= 40:
coverage_pacbio_status = status["pass"]
elif coverage_pacbio >= 20:
coverage_pacbio_status = status["warn"]
coverage_40_pacbio_status = status["pass"]
if "total" in jdata["mosdepth_global"]:
coverage_40 = jdata["mosdepth_global"]["total"]["40"]
if coverage_40 < 90:
coverage_40_status = status["warn"]
elif coverage_40 < 75:
coverage_40_status = status["fail"]
else:
coverage_pacbio_status = status["fail"]
coverage_40_status = status["pass"]

######################################
# Set the overall status of the sample
######################################

# The metrics that by themselves determine overall status:
for estatus in [confindr_status, taxon_count_status, assembly_status]:
for estatus in [confindr_status, taxon_count_status, assembly_status]:
# if any one metric failed, the whole sample failed
if estatus == status["fail"]:
this_status = estatus
Expand Down Expand Up @@ -413,6 +452,14 @@ def main(yaml, template, output, reference, version, call, wd):
"coverage_nanopore_status": coverage_nanopore_status,
"coverage_pacbio": coverage_pacbio,
"coverage_pacbio_status": coverage_pacbio_status,
"coverage_40": coverage_40,
"coverage_40_status": coverage_40_status,
"coverage_40_illumina": coverage_40_illumina,
"coverage_40_illumina_status": coverage_40_illumina_status,
"coverage_40_nanopore": coverage_40_nanopore,
"coverage_40_nanopore_status": coverage_40_nanopore_status,
"coverage_40_pacbio": coverage_40_pacbio,
"coverage_40_pacbio_status": coverage_40_pacbio_status,
"n50": n50,
"n50_status": n50_status,
"fraction": genome_fraction,
Expand All @@ -432,32 +479,33 @@ def main(yaml, template, output, reference, version, call, wd):
# Plots
#############

# Kraken abundances
if "kraken" in jdata:
# Draw the Kraken abundance table
kdata = pd.DataFrame(data=kraken_data_all, index=samples)
plot_labels = {"index": "Samples", "value": "Percentage"}
h = len(samples)*25 if len(samples) > 10 else 400
h = len(samples) * 25 if len(samples) > 10 else 400
fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h)

data["Kraken"] = fig.to_html(full_html=False)

# Insert size distribution
if "samtools" in jdata:
# Crop all insert size histograms to the shortest common length
insert_sizes_all_cropped = {}
for s, ins in insert_sizes_all.items():
list_end = min_insert_size_length-1
list_end = min_insert_size_length - 1
insert_sizes_all_cropped[s] = ins[:list_end]

plot_labels = {"index": "Basepairs", "value": "Count"}
hdata = pd.DataFrame(insert_sizes_all_cropped)
hfig = px.line(hdata, labels=plot_labels)
data["Insertsizes"] = hfig.to_html(full_html=False)

# busco score
if busco_data_all:
# Draw the busco stats graph
bdata = pd.DataFrame(data=busco_data_all, index=samples)
plot_labels = {"index": "Samples", "value": "Percentage"}
h = len(samples)*25 if len(samples) > 10 else 400
h = len(samples) * 25 if len(samples) > 10 else 400
fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h)
data["Busco"] = fig.to_html(full_html=False)

Expand Down Expand Up @@ -505,7 +553,7 @@ def check_assembly(refs, query):
# assembly falls between the allowed sizes
if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any(x >= (query*0.8) for x in ref_intervals) and any(x <= (query*1.2) for x in ref_intervals)):
elif (any(x >= (query * 0.8) for x in ref_intervals) and any(x <= (query * 1.2) for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -523,7 +571,7 @@ def check_contigs(refs, query):

if (any(x >= query for x in ref_intervals)):
return status["pass"]
elif (any((x*1.2) >= query for x in ref_intervals)):
elif (any((x * 1.2) >= query for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -541,7 +589,7 @@ def check_n50(refs, query):

if (any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any((x*0.8) <= query for x in ref_intervals)):
elif (any((x * 0.8) <= query for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -558,7 +606,7 @@ def check_gc(refs, query):
# check if gc falls within expected range, or range +/- 5% - else fail
if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any(x >= (query*0.95) for x in ref_intervals) and any(x <= (query*1.05) for x in ref_intervals)):
elif (any(x >= (query * 0.95) for x in ref_intervals) and any(x <= (query * 1.05) for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand Down
2 changes: 1 addition & 1 deletion bin/gabi_summary.pl
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
} elsif ( $filename =~ /PACBIO.mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'pacbio'} = \%data;
} elsif ( $filename =~ /mosdepth.global.dist.txt/ ) {
} elsif ( $filename =~ /.*mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'total'} = \%data;
} elsif ( $filename =~ /.sistr.tab/) {
Expand Down
2 changes: 1 addition & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ process {
]
}
withName: 'SOURMASH_SKETCH' {
ext.args = "dna"
ext.args = "dna -p scaled=1000,k=31,k=21"
publishDir = [
path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" },
mode: params.publish_dir_mode,
Expand Down
13 changes: 5 additions & 8 deletions conf/resources.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,13 @@ params {

references {

'mashdb' {
url = 'https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz'
db = "${params.reference_base}/gabi/${params.reference_version}/mashdb/RefSeq88n.msh"
}
'sourmashdb' {
url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-k31.zip'
db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-k31.zip"
url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-k31.sbt.zip'
db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-k31.sbt.zip"
}
'sourmashdb_nr' {
url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip'
db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-reps.k31.zip"
url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.sbt.zip'
db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-reps.k31.sbt.zip"
}
'amrfinderdb' {
db = "${params.reference_base}/gabi/${params.reference_version}/amrfinder/latest"
Expand Down Expand Up @@ -185,6 +181,7 @@ params {
arcobacter_butzleri = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_3/Arcobacter_butzleri_wgMLST"
campylobacter_jejuni = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_4/Campylobacter_jejuni_INNUENDO_wgMLST"
campylobacter_coli = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_4/Campylobacter_jejuni_INNUENDO_wgMLST"
campylobacter_lari = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_4/Campylobacter_jejuni_INNUENDO_wgMLST"
escherichia_coli = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_5/Escherichia_coli_INNUENDO_wgMLST"
listeria_monocytogenes = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_6/Listeria_monocytogenes_Pasteur_cgMLST"
yersinia_enterocolitica = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_7/Yersinia_enterocolitica_INNUENDO_wgMLST"
Expand Down
Loading

0 comments on commit fab2d72

Please sign in to comment.