Skip to content

Commit

Permalink
Merge pull request #16 from bio-raum/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
marchoeppner authored Jan 17, 2025
2 parents 2975101 + 7f3b05f commit 63319fe
Show file tree
Hide file tree
Showing 11 changed files with 169 additions and 80 deletions.
9 changes: 9 additions & 0 deletions assets/gabi_template.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ <h2>Summary</h2>
<th colspan=2 scope="col"><div class="tooltip">Reference genome<span class="tooltiptext">The highest matching hit in RefSeq to this assembly</span></div></th>
<th colspan=5 scope="col"><div class="tooltip">Assembly<span class="tooltiptext">Information about this assembly</span></div></th>
<th colspan=4 scope="col"><div class="tooltip">Mean coverage<span class="tooltiptext">Mean coverage of reads mapped back to the assembly - bigger is better</span></div></th>
<th colspan=4 scope="col"><div class="tooltip">Coverage 40X (%)<span class="tooltiptext">Percentage of assembly covered at 40X or more</span></div></th>
<th colspan=3 scope="col"><div class="tooltip">Read quality<span class="tooltiptext">Quality metrics of reads after trimming</span></div></th>
<th colspan=2 scope="col"><div class="tooltip">Contamination<span class="tooltiptext">Indicators of contamination</span></div></th>
</tr>
Expand All @@ -123,6 +124,10 @@ <h2>Summary</h2>
<th scope="subcol">ILM</th>
<th scope="subcol">ONT</th>
<th scope="subcol">HiFi</th>
<th scope="subcol"><div class="tooltip">Total<span class="tooltiptext">Across all sequencing technologies</span></div></th>
<th scope="subcol"><div class="tooltip">ILM<span class="tooltiptext">Illumina reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">ONT<span class="tooltiptext">ONT reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">HiFi<span class="tooltiptext">Pacbio reads 40X</span></div></th>
<th scope="subcol"><div class="tooltip">ILM Q30 (%)<span class="tooltiptext">Fraction of Illumina reads above Q30.</span></div></th>
<th scope="subcol"><div class="tooltip">ONT Q15 (#)<span class="tooltiptext">Number of ONT reads above Q15.</span></div></th>
<th scope="subcol"><div class="tooltip">ONT N50 (bp)<span class="tooltiptext">N50 of ONT reads</span></div></th>
Expand All @@ -149,6 +154,10 @@ <h2>Summary</h2>
<td scope={{row.coverage_illumina_status}}>{{row.coverage_illumina}}</td>
<td scope={{row.coverage_nanopore_status}}>{{row.coverage_nanopore}}</td>
<td scope={{row.coverage_pacbio_status}}>{{row.coverage_pacbio}}</td>
<td scope={{row.coverage_40_status}}>{{row.coverage_40}}</td>
<td scope={{row.coverage_40_illumina_status}}>{{row.coverage_40_illumina}}</td>
<td scope={{row.coverage_40_nanopore_status}}>{{row.coverage_40_nanopore}}</td>
<td scope={{row.coverage_40_pacbio_status}}>{{row.coverage_40_pacbio}}</td>
<td scope={{row.quality_illumina_status}}>{{row.quality_illumina}}</td>
<td scope="missing">{{row.quality_nanopore}}</td>
<td scope="missing">{{row.nanopore_n50}}</td>
Expand Down
158 changes: 107 additions & 51 deletions bin/gabi.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ def main(yaml, template, output, reference, version, call, wd):
if "fastp" in jdata:
fastp_q30_status = status["pass"]
fastp_summary = jdata["fastp"]["summary"]
fastp_q30 = round(fastp_summary["after_filtering"]["q30_rate"], 2)
if fastp_q30 < 0.85:
fastp_q30 = (round(fastp_summary["after_filtering"]["q30_rate"], 2) * 100)
if fastp_q30 < 85:
fastp_q30_status = status["warn"]

##########################
Expand Down Expand Up @@ -208,33 +208,40 @@ def main(yaml, template, output, reference, version, call, wd):
# Get assembly stats
####################

assembly = round((int(jdata["quast"]["Total length"])/1000000), 2)
assembly = round((int(jdata["quast"]["Total length"]) / 1000000), 2)
assembly_status = check_assembly(this_refs, int(jdata["quast"]["Total length"]))

genome_fraction = round(float(jdata["quast"]["Genome fraction (%)"]), 2)
genome_fraction = "-"
genome_fraction_status = status["missing"]

genome_fraction_status = status["pass"]
quast = {}

quast["duplication"] = "-"
quast["misassembled"] = "-"
quast["mismatches"] = "-"

if "Genome fraction (%)" in jdata["quast"]:
genome_fraction = round(float(jdata["quast"]["Genome fraction (%)"]), 2)
quast["duplication"] = jdata["quast"]["Duplication ratio"]
quast["misassembled"] = jdata["quast"]["# misassembled contigs"]
quast["mismatches"] = jdata["quast"]["# mismatches per 100 kbp"]

# Highlight if a reference coverage is less than 90%
# This might indicate a problem with our assembly (or the mash database...)

contigs = int(jdata["quast"]["# contigs"])
contigs_status = check_contigs(this_refs, int(jdata["quast"]["# contigs"]))

n50 = round((int(jdata["quast"]["N50"])/1000), 2)
n50 = round((int(jdata["quast"]["N50"]) / 1000), 2)
n50_status = check_n50(this_refs, int(jdata["quast"]["N50"]))

quast = {}
quast["size"] = jdata["quast"]["Total length (>= 0 bp)"]
quast["duplication"] = jdata["quast"]["Duplication ratio"]
quast["N"] = jdata["quast"]["# N's per 100 kbp"]
quast["mismatches"] = jdata["quast"]["# mismatches per 100 kbp"]
quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"])/1000), 2)
quast["misassembled"] = jdata["quast"]["# misassembled contigs"]
quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"]) / 1000), 2)
quast["contigs_1k"] = jdata["quast"]["# contigs (>= 1000 bp)"]
quast["contigs_5k"] = jdata["quast"]["# contigs (>= 5000 bp)"]
quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"])/1000000), 2)
quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"])/1000000), 2)
quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"]) / 1000000), 2)
quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"]) / 1000000), 2)
quast["gc"] = float(jdata["quast"]["GC (%)"])
quast["gc_status"] = check_gc(this_refs, float(jdata["quast"]["GC (%)"]))

Expand Down Expand Up @@ -273,10 +280,10 @@ def main(yaml, template, output, reference, version, call, wd):
busco = jdata["busco"]
busco_status = status["missing"]
busco_total = int(busco["dataset_total_buscos"])
busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100
busco_fragmented = round((int(busco["F"])/busco_total), 2)*100
busco_missing = round((int(busco["M"])/busco_total), 2)*100
busco_duplicated = round((int(busco["D"])/busco_total), 2)*100
busco_completeness = round(((int(busco["C"])) / int(busco_total)), 2) * 100
busco_fragmented = round((int(busco["F"]) / busco_total), 2) * 100
busco_missing = round((int(busco["M"]) / busco_total), 2) * 100
busco_duplicated = round((int(busco["D"]) / busco_total), 2) * 100
busco["completeness"] = busco_completeness
busco_data_all.append({"Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated})

Expand Down Expand Up @@ -323,6 +330,8 @@ def main(yaml, template, output, reference, version, call, wd):
coverage_pacbio = "-"
coverage_pacbio_status = status["missing"]

# mean coverages

if "total" in jdata["mosdepth"]:
coverage = float(jdata["mosdepth"]["total"]["mean"])
if coverage >= 40:
Expand All @@ -332,40 +341,78 @@ def main(yaml, template, output, reference, version, call, wd):
else:
coverage_status = status["pass"]

if ("illumina" in jdata["mosdepth"]):

coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
if coverage_illumina >= 40:
coverage_illumina_status = status["pass"]
elif coverage_illumina >= 20:
coverage_illumina_status = status["warn"]
if "illumina" in jdata["mosdepth"]:
coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
if coverage_illumina >= 40:
coverage_illumina_status = status["pass"]
elif coverage_illumina >= 20:
coverage_illumina_status = status["warn"]
else:
coverage_illumina_status = status["fail"]

if "nanopore" in jdata["mosdepth"]:
coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
if coverage_nanopore >= 40:
coverage_nanopore_status = status["pass"]
elif coverage_nanopore >= 20:
coverage_nanopore_status = status["warn"]
else:
coverage_nanopore_status = status["fail"]

if "pacbio" in jdata["mosdepth"]:
coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
if coverage_pacbio >= 40:
coverage_pacbio_status = status["pass"]
elif coverage_pacbio >= 20:
coverage_pacbio_status = status["warn"]
else:
coverage_pacbio_status = status["fail"]

# fraction covered at 40X

coverage_40 = "-"
coverage_40_status = status["missing"]
coverage_40_illumina = "-"
coverage_40_illumina_status = status["missing"]
coverage_40_nanopore = "-"
coverage_40_nanopore_status = status["missing"]
coverage_40_pacbio = "-"
coverage_40_pacbio_status = status["missing"]

if "mosdepth_global" in jdata:
if "illumina" in jdata["mosdepth_global"]:
coverage_40_illumina = jdata["mosdepth_global"]["illumina"]["40"]
if coverage_40_illumina < 90:
coverage_40_illumina_status = status["warn"]
else:
coverage_illumina_status = status["fail"]

if ("nanopore" in jdata["mosdepth"]):
coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
if coverage_nanopore >= 40:
coverage_nanopore_status = status["pass"]
elif coverage_nanopore >= 20:
coverage_nanopore_status = status["warn"]
coverage_40_illumina_status = status["pass"]
if "nanopore" in jdata["mosdepth_global"]:
coverage_40_nanopore = jdata["mosdepth_global"]["nanopore"]["40"]
if coverage_40_nanopore < 90:
coverage_40_nanopore_status = status["warn"]
else:
coverage_nanopore_status = status["fail"]

if ("pacbio" in jdata["mosdepth"]):
coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
if coverage_pacbio >= 40:
coverage_pacbio_status = status["pass"]
elif coverage_pacbio >= 20:
coverage_pacbio_status = status["warn"]
coverage_40_nanopore_status = status["pass"]
if "pacbio" in jdata["mosdepth_global"]:
coverage_40_pacbio = jdata["mosdepth_global"]["pacbio"]["40"]
if coverage_40_pacbio < 90:
coverage_40_pacbio_status = status["warn"]
else:
coverage_pacbio_status = status["fail"]
coverage_40_pacbio_status = status["pass"]
if "total" in jdata["mosdepth_global"]:
coverage_40 = jdata["mosdepth_global"]["total"]["40"]
if coverage_40 < 90:
coverage_40_status = status["warn"]
elif coverage_40 < 75:
coverage_40_status = status["fail"]
else:
coverage_40_status = status["pass"]

######################################
# Set the overall status of the sample
######################################

# The metrics that by themselves determine overall status:
for estatus in [confindr_status, taxon_count_status, assembly_status]:
for estatus in [confindr_status, taxon_count_status, assembly_status]:
# if any one metric failed, the whole sample failed
if estatus == status["fail"]:
this_status = estatus
Expand Down Expand Up @@ -405,6 +452,14 @@ def main(yaml, template, output, reference, version, call, wd):
"coverage_nanopore_status": coverage_nanopore_status,
"coverage_pacbio": coverage_pacbio,
"coverage_pacbio_status": coverage_pacbio_status,
"coverage_40": coverage_40,
"coverage_40_status": coverage_40_status,
"coverage_40_illumina": coverage_40_illumina,
"coverage_40_illumina_status": coverage_40_illumina_status,
"coverage_40_nanopore": coverage_40_nanopore,
"coverage_40_nanopore_status": coverage_40_nanopore_status,
"coverage_40_pacbio": coverage_40_pacbio,
"coverage_40_pacbio_status": coverage_40_pacbio_status,
"n50": n50,
"n50_status": n50_status,
"fraction": genome_fraction,
Expand All @@ -424,32 +479,33 @@ def main(yaml, template, output, reference, version, call, wd):
# Plots
#############

# Kraken abundances
if "kraken" in jdata:
# Draw the Kraken abundance table
kdata = pd.DataFrame(data=kraken_data_all, index=samples)
plot_labels = {"index": "Samples", "value": "Percentage"}
h = len(samples)*25 if len(samples) > 10 else 300
h = len(samples) * 25 if len(samples) > 10 else 400
fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h)

data["Kraken"] = fig.to_html(full_html=False)

# Insert size distribution
if "samtools" in jdata:
# Crop all insert size histograms to the shortest common length
insert_sizes_all_cropped = {}
for s, ins in insert_sizes_all.items():
list_end = min_insert_size_length-1
list_end = min_insert_size_length - 1
insert_sizes_all_cropped[s] = ins[:list_end]

plot_labels = {"index": "Basepairs", "value": "Count"}
hdata = pd.DataFrame(insert_sizes_all_cropped)
hfig = px.line(hdata, labels=plot_labels)
data["Insertsizes"] = hfig.to_html(full_html=False)

# busco score
if busco_data_all:
# Draw the busco stats graph
bdata = pd.DataFrame(data=busco_data_all, index=samples)
plot_labels = {"index": "Samples", "value": "Percentage"}
h = len(samples)*25 if len(samples) > 10 else 300
h = len(samples) * 25 if len(samples) > 10 else 400
fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h)
data["Busco"] = fig.to_html(full_html=False)

Expand Down Expand Up @@ -497,7 +553,7 @@ def check_assembly(refs, query):
# assembly falls between the allowed sizes
if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any(x >= (query*0.8) for x in ref_intervals) and any(x <= (query*1.2) for x in ref_intervals)):
elif (any(x >= (query * 0.8) for x in ref_intervals) and any(x <= (query * 1.2) for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -515,7 +571,7 @@ def check_contigs(refs, query):

if (any(x >= query for x in ref_intervals)):
return status["pass"]
elif (any((x*1.2) >= query for x in ref_intervals)):
elif (any((x * 1.2) >= query for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -533,7 +589,7 @@ def check_n50(refs, query):

if (any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any((x*0.8) <= query for x in ref_intervals)):
elif (any((x * 0.8) <= query for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand All @@ -550,7 +606,7 @@ def check_gc(refs, query):
# check if gc falls within expected range, or range +/- 5% - else fail
if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
return status["pass"]
elif (any(x >= (query*0.95) for x in ref_intervals) and any(x <= (query*1.05) for x in ref_intervals)):
elif (any(x >= (query * 0.95) for x in ref_intervals) and any(x <= (query * 1.05) for x in ref_intervals)):
return status["warn"]
else:
return status["fail"]
Expand Down
37 changes: 36 additions & 1 deletion bin/gabi_summary.pl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@
"confindr" => [],
"serotype" => [],
"mosdepth" => {},
"reference" => {}
"reference" => {},
"mosdepth_global" => {}
);

my @files = glob '*/*' ;
Expand Down Expand Up @@ -118,6 +119,18 @@
} elsif ( $filename =~ /.*mosdepth.summary.txt/) {
my %data = parse_mosdepth(\@lines);
$matrix{'mosdepth'}{'total'} = \%data;
} elsif ( $filename =~ /ILLUMINA.mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'illumina'} = \%data;
} elsif ( $filename =~ /NANOPORE.mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'nanopore'} = \%data;
} elsif ( $filename =~ /PACBIO.mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'pacbio'} = \%data;
} elsif ( $filename =~ /.*mosdepth.global.dist.txt/ ) {
my %data = parse_mosdepth_global(\@lines);
$matrix{'mosdepth_global'}{'total'} = \%data;
} elsif ( $filename =~ /.sistr.tab/) {
my %data ;
$data{'Sistr'}= parse_sistr(\@lines);
Expand Down Expand Up @@ -281,6 +294,28 @@ sub parse_sistr {
return \%data ;
}

sub parse_mosdepth_global {

my @lines = @{$_[0] };

my $h = shift @lines;
my @header = split "\t", $h ;

my %data;

# Limit to 100X so the json doesnt get too large for no reason.
for my $line (@lines) {
my ($chr,$cov,$perc) = split "\t", $line ;
if ($chr eq "total") {
if ($cov <= 100) {
$data{$cov} = ($perc*100);
}
}
}

return %data;
}

sub parse_mosdepth {

my @lines = @{$_[0] };
Expand Down
2 changes: 1 addition & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ process {
]
}
withName: 'SOURMASH_SKETCH' {
ext.args = "dna"
ext.args = "dna -p scaled=1000,k=31,k=21"
publishDir = [
path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" },
mode: params.publish_dir_mode,
Expand Down
Loading

0 comments on commit 63319fe

Please sign in to comment.