Merge pull request #16 from bio-raum/dev

Dev
bio-raum · Jan 17, 2025 · 63319fe · 63319fe
2 parents 2975101 + 7f3b05f
commit 63319fe
Show file tree

Hide file tree

Showing 11 changed files with 169 additions and 80 deletions.
diff --git a/assets/gabi_template.html b/assets/gabi_template.html
@@ -105,6 +105,7 @@ <h2>Summary</h2>
         <th colspan=2 scope="col"><div class="tooltip">Reference genome<span class="tooltiptext">The highest matching hit in RefSeq to this assembly</span></div></th>
         <th colspan=5 scope="col"><div class="tooltip">Assembly<span class="tooltiptext">Information about this assembly</span></div></th>
         <th colspan=4 scope="col"><div class="tooltip">Mean coverage<span class="tooltiptext">Mean coverage of reads mapped back to the assembly - bigger is better</span></div></th>
+        <th colspan=4 scope="col"><div class="tooltip">Coverage 40X (%)<span class="tooltiptext">Percentage of assembly covered at 40X or more</span></div></th>
         <th colspan=3 scope="col"><div class="tooltip">Read quality<span class="tooltiptext">Quality metrics of reads after trimming</span></div></th>
         <th colspan=2 scope="col"><div class="tooltip">Contamination<span class="tooltiptext">Indicators of contamination</span></div></th>
     </tr>
@@ -123,6 +124,10 @@ <h2>Summary</h2>
         <th scope="subcol">ILM</th>
         <th scope="subcol">ONT</th>
         <th scope="subcol">HiFi</th>
+        <th scope="subcol"><div class="tooltip">Total<span class="tooltiptext">Across all sequencing technologies</span></div></th>
+        <th scope="subcol"><div class="tooltip">ILM<span class="tooltiptext">Illumina reads 40X</span></div></th>
+        <th scope="subcol"><div class="tooltip">ONT<span class="tooltiptext">ONT reads 40X</span></div></th>
+        <th scope="subcol"><div class="tooltip">HiFi<span class="tooltiptext">Pacbio reads 40X</span></div></th>
         <th scope="subcol"><div class="tooltip">ILM Q30 (%)<span class="tooltiptext">Fraction of Illumina reads above Q30.</span></div></th>
         <th scope="subcol"><div class="tooltip">ONT Q15 (#)<span class="tooltiptext">Number of ONT reads above Q15.</span></div></th>
         <th scope="subcol"><div class="tooltip">ONT N50 (bp)<span class="tooltiptext">N50 of ONT reads</span></div></th>
@@ -149,6 +154,10 @@ <h2>Summary</h2>
         <td scope={{row.coverage_illumina_status}}>{{row.coverage_illumina}}</td>
         <td scope={{row.coverage_nanopore_status}}>{{row.coverage_nanopore}}</td>
         <td scope={{row.coverage_pacbio_status}}>{{row.coverage_pacbio}}</td>
+        <td scope={{row.coverage_40_status}}>{{row.coverage_40}}</td>
+        <td scope={{row.coverage_40_illumina_status}}>{{row.coverage_40_illumina}}</td>
+        <td scope={{row.coverage_40_nanopore_status}}>{{row.coverage_40_nanopore}}</td>
+        <td scope={{row.coverage_40_pacbio_status}}>{{row.coverage_40_pacbio}}</td>
         <td scope={{row.quality_illumina_status}}>{{row.quality_illumina}}</td>
         <td scope="missing">{{row.quality_nanopore}}</td>
         <td scope="missing">{{row.nanopore_n50}}</td>

diff --git a/bin/gabi.py b/bin/gabi.py
@@ -135,8 +135,8 @@ def main(yaml, template, output, reference, version, call, wd):
             if "fastp" in jdata:
                 fastp_q30_status = status["pass"]
                 fastp_summary = jdata["fastp"]["summary"]
-                fastp_q30 = round(fastp_summary["after_filtering"]["q30_rate"], 2)
-                if fastp_q30 < 0.85:
+                fastp_q30 = (round(fastp_summary["after_filtering"]["q30_rate"], 2) * 100)
+                if fastp_q30 < 85:
                     fastp_q30_status = status["warn"]
 
             ##########################
@@ -208,33 +208,40 @@ def main(yaml, template, output, reference, version, call, wd):
             # Get assembly stats
             ####################
 
-            assembly = round((int(jdata["quast"]["Total length"])/1000000), 2)
+            assembly = round((int(jdata["quast"]["Total length"]) / 1000000), 2)
             assembly_status = check_assembly(this_refs, int(jdata["quast"]["Total length"]))
 
-            genome_fraction = round(float(jdata["quast"]["Genome fraction (%)"]), 2)
+            genome_fraction = "-"
+            genome_fraction_status = status["missing"]
 
-            genome_fraction_status = status["pass"]
+            quast = {}
+
+            quast["duplication"] = "-"
+            quast["misassembled"] = "-"
+            quast["mismatches"] = "-"
+
+            if "Genome fraction (%)" in jdata["quast"]:
+                genome_fraction = round(float(jdata["quast"]["Genome fraction (%)"]), 2)
+                quast["duplication"] = jdata["quast"]["Duplication ratio"]
+                quast["misassembled"] = jdata["quast"]["# misassembled contigs"]
+                quast["mismatches"] = jdata["quast"]["# mismatches per 100 kbp"]
 
             # Highlight if a reference coverage is less than 90%
             # This might indicate a problem with our assembly (or the mash database...)
 
             contigs = int(jdata["quast"]["# contigs"])
             contigs_status = check_contigs(this_refs, int(jdata["quast"]["# contigs"]))
 
-            n50 = round((int(jdata["quast"]["N50"])/1000), 2)
+            n50 = round((int(jdata["quast"]["N50"]) / 1000), 2)
             n50_status = check_n50(this_refs, int(jdata["quast"]["N50"]))
 
-            quast = {}
             quast["size"] = jdata["quast"]["Total length (>= 0 bp)"]
-            quast["duplication"] = jdata["quast"]["Duplication ratio"]
             quast["N"] = jdata["quast"]["# N's per 100 kbp"]
-            quast["mismatches"] = jdata["quast"]["# mismatches per 100 kbp"]
-            quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"])/1000), 2)
-            quast["misassembled"] = jdata["quast"]["# misassembled contigs"]
+            quast["largest_contig"] = round((int(jdata["quast"]["Largest contig"]) / 1000), 2)
             quast["contigs_1k"] = jdata["quast"]["# contigs (>= 1000 bp)"]
             quast["contigs_5k"] = jdata["quast"]["# contigs (>= 5000 bp)"]
-            quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"])/1000000), 2)
-            quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"])/1000000), 2)
+            quast["size_1k"] = round(float(int(jdata["quast"]["Total length (>= 1000 bp)"]) / 1000000), 2)
+            quast["size_5k"] = round(float(int(jdata["quast"]["Total length (>= 5000 bp)"]) / 1000000), 2)
             quast["gc"] = float(jdata["quast"]["GC (%)"])
             quast["gc_status"] = check_gc(this_refs, float(jdata["quast"]["GC (%)"]))
 
@@ -273,10 +280,10 @@ def main(yaml, template, output, reference, version, call, wd):
             busco = jdata["busco"]
             busco_status = status["missing"]
             busco_total = int(busco["dataset_total_buscos"])
-            busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100
-            busco_fragmented = round((int(busco["F"])/busco_total), 2)*100
-            busco_missing = round((int(busco["M"])/busco_total), 2)*100
-            busco_duplicated = round((int(busco["D"])/busco_total), 2)*100
+            busco_completeness = round(((int(busco["C"])) / int(busco_total)), 2) * 100
+            busco_fragmented = round((int(busco["F"]) / busco_total), 2) * 100
+            busco_missing = round((int(busco["M"]) / busco_total), 2) * 100
+            busco_duplicated = round((int(busco["D"]) / busco_total), 2) * 100
             busco["completeness"] = busco_completeness
             busco_data_all.append({"Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated})
 
@@ -323,6 +330,8 @@ def main(yaml, template, output, reference, version, call, wd):
             coverage_pacbio = "-"
             coverage_pacbio_status = status["missing"]
 
+            # mean coverages
+
             if "total" in jdata["mosdepth"]:
                 coverage = float(jdata["mosdepth"]["total"]["mean"])
                 if coverage >= 40:
@@ -332,40 +341,78 @@ def main(yaml, template, output, reference, version, call, wd):
                 else:
                     coverage_status = status["pass"]
 
-                if ("illumina" in jdata["mosdepth"]):
-
-                    coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
-                    if coverage_illumina >= 40:
-                        coverage_illumina_status = status["pass"]
-                    elif coverage_illumina >= 20:
-                        coverage_illumina_status = status["warn"]
+            if "illumina" in jdata["mosdepth"]:
+                coverage_illumina = float(jdata["mosdepth"]["illumina"]["mean"])
+                if coverage_illumina >= 40:
+                    coverage_illumina_status = status["pass"]
+                elif coverage_illumina >= 20:
+                    coverage_illumina_status = status["warn"]
+                else:
+                    coverage_illumina_status = status["fail"]
+
+            if "nanopore" in jdata["mosdepth"]:
+                coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
+                if coverage_nanopore >= 40:
+                    coverage_nanopore_status = status["pass"]
+                elif coverage_nanopore >= 20:
+                    coverage_nanopore_status = status["warn"]
+                else:
+                    coverage_nanopore_status = status["fail"]
+
+            if "pacbio" in jdata["mosdepth"]:
+                coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
+                if coverage_pacbio >= 40:
+                    coverage_pacbio_status = status["pass"]
+                elif coverage_pacbio >= 20:
+                    coverage_pacbio_status = status["warn"]
+                else:
+                    coverage_pacbio_status = status["fail"]
+
+            # fraction covered at 40X
+
+            coverage_40 = "-"
+            coverage_40_status = status["missing"]
+            coverage_40_illumina = "-"
+            coverage_40_illumina_status = status["missing"]
+            coverage_40_nanopore = "-"
+            coverage_40_nanopore_status = status["missing"]
+            coverage_40_pacbio = "-"
+            coverage_40_pacbio_status = status["missing"]
+
+            if "mosdepth_global" in jdata:
+                if "illumina" in jdata["mosdepth_global"]:
+                    coverage_40_illumina = jdata["mosdepth_global"]["illumina"]["40"]
+                    if coverage_40_illumina < 90:
+                        coverage_40_illumina_status = status["warn"]
                     else:
-                        coverage_illumina_status = status["fail"]
-
-                if ("nanopore" in jdata["mosdepth"]):
-                    coverage_nanopore = float(jdata["mosdepth"]["nanopore"]["mean"])
-                    if coverage_nanopore >= 40:
-                        coverage_nanopore_status = status["pass"]
-                    elif coverage_nanopore >= 20:
-                        coverage_nanopore_status = status["warn"]
+                        coverage_40_illumina_status = status["pass"]
+                if "nanopore" in jdata["mosdepth_global"]:
+                    coverage_40_nanopore = jdata["mosdepth_global"]["nanopore"]["40"]
+                    if coverage_40_nanopore < 90:
+                        coverage_40_nanopore_status = status["warn"]
                     else:
-                        coverage_nanopore_status = status["fail"]
-
-                if ("pacbio" in jdata["mosdepth"]):
-                    coverage_pacbio = float(jdata["mosdepth"]["pacbio"]["mean"])
-                    if coverage_pacbio >= 40:
-                        coverage_pacbio_status = status["pass"]
-                    elif coverage_pacbio >= 20:
-                        coverage_pacbio_status = status["warn"]
+                        coverage_40_nanopore_status = status["pass"]
+                if "pacbio" in jdata["mosdepth_global"]:
+                    coverage_40_pacbio = jdata["mosdepth_global"]["pacbio"]["40"]
+                    if coverage_40_pacbio < 90:
+                        coverage_40_pacbio_status = status["warn"]
                     else:
-                        coverage_pacbio_status = status["fail"]
+                        coverage_40_pacbio_status = status["pass"]
+                if "total" in jdata["mosdepth_global"]:
+                    coverage_40 = jdata["mosdepth_global"]["total"]["40"]
+                    if coverage_40 < 90:
+                        coverage_40_status = status["warn"]
+                    elif coverage_40 < 75:
+                        coverage_40_status = status["fail"]
+                    else:
+                        coverage_40_status = status["pass"]
 
             ######################################
             # Set the overall status of the sample
             ######################################
 
             # The metrics that by themselves determine overall status:
-            for estatus in [confindr_status,  taxon_count_status, assembly_status]:
+            for estatus in [confindr_status, taxon_count_status, assembly_status]:
                 # if any one metric failed, the whole sample failed
                 if estatus == status["fail"]:
                     this_status = estatus
@@ -405,6 +452,14 @@ def main(yaml, template, output, reference, version, call, wd):
                 "coverage_nanopore_status": coverage_nanopore_status,
                 "coverage_pacbio": coverage_pacbio,
                 "coverage_pacbio_status": coverage_pacbio_status,
+                "coverage_40": coverage_40,
+                "coverage_40_status": coverage_40_status,
+                "coverage_40_illumina": coverage_40_illumina,
+                "coverage_40_illumina_status": coverage_40_illumina_status,
+                "coverage_40_nanopore": coverage_40_nanopore,
+                "coverage_40_nanopore_status": coverage_40_nanopore_status,
+                "coverage_40_pacbio": coverage_40_pacbio,
+                "coverage_40_pacbio_status": coverage_40_pacbio_status,
                 "n50": n50,
                 "n50_status": n50_status,
                 "fraction": genome_fraction,
@@ -424,32 +479,33 @@ def main(yaml, template, output, reference, version, call, wd):
     # Plots
     #############
 
+    # Kraken abundances
     if "kraken" in jdata:
-        # Draw the Kraken abundance table
         kdata = pd.DataFrame(data=kraken_data_all, index=samples)
         plot_labels = {"index": "Samples", "value": "Percentage"}
-        h = len(samples)*25 if len(samples) > 10 else 300
+        h = len(samples) * 25 if len(samples) > 10 else 400
         fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h)
-
         data["Kraken"] = fig.to_html(full_html=False)
 
+    # Insert size distribution
     if "samtools" in jdata:
         # Crop all insert size histograms to the shortest common length
         insert_sizes_all_cropped = {}
         for s, ins in insert_sizes_all.items():
-            list_end = min_insert_size_length-1
+            list_end = min_insert_size_length - 1
             insert_sizes_all_cropped[s] = ins[:list_end]
 
         plot_labels = {"index": "Basepairs", "value": "Count"}
         hdata = pd.DataFrame(insert_sizes_all_cropped)
         hfig = px.line(hdata, labels=plot_labels)
         data["Insertsizes"] = hfig.to_html(full_html=False)
 
+    # busco score
     if busco_data_all:
         # Draw the busco stats graph
         bdata = pd.DataFrame(data=busco_data_all, index=samples)
         plot_labels = {"index": "Samples", "value": "Percentage"}
-        h = len(samples)*25 if len(samples) > 10 else 300
+        h = len(samples) * 25 if len(samples) > 10 else 400
         fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h)
         data["Busco"] = fig.to_html(full_html=False)
 
@@ -497,7 +553,7 @@ def check_assembly(refs, query):
             # assembly falls between the allowed sizes
             if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
                 return status["pass"]
-            elif (any(x >= (query*0.8) for x in ref_intervals) and any(x <= (query*1.2) for x in ref_intervals)):
+            elif (any(x >= (query * 0.8) for x in ref_intervals) and any(x <= (query * 1.2) for x in ref_intervals)):
                 return status["warn"]
             else:
                 return status["fail"]
@@ -515,7 +571,7 @@ def check_contigs(refs, query):
 
             if (any(x >= query for x in ref_intervals)):
                 return status["pass"]
-            elif (any((x*1.2) >= query for x in ref_intervals)):
+            elif (any((x * 1.2) >= query for x in ref_intervals)):
                 return status["warn"]
             else:
                 return status["fail"]
@@ -533,7 +589,7 @@ def check_n50(refs, query):
 
             if (any(x <= query for x in ref_intervals)):
                 return status["pass"]
-            elif (any((x*0.8) <= query for x in ref_intervals)):
+            elif (any((x * 0.8) <= query for x in ref_intervals)):
                 return status["warn"]
             else:
                 return status["fail"]
@@ -550,7 +606,7 @@ def check_gc(refs, query):
         # check if gc falls within expected range, or range +/- 5% - else fail
         if (any(x >= query for x in ref_intervals) and any(x <= query for x in ref_intervals)):
             return status["pass"]
-        elif (any(x >= (query*0.95) for x in ref_intervals) and any(x <= (query*1.05) for x in ref_intervals)):
+        elif (any(x >= (query * 0.95) for x in ref_intervals) and any(x <= (query * 1.05) for x in ref_intervals)):
             return status["warn"]
         else:
             return status["fail"]

diff --git a/bin/gabi_summary.pl b/bin/gabi_summary.pl
@@ -53,7 +53,8 @@
     "confindr" => [],
     "serotype" => [],
     "mosdepth" => {},
-    "reference" => {}
+    "reference" => {},
+    "mosdepth_global" => {}
 );
 
 my @files = glob '*/*' ;
@@ -118,6 +119,18 @@
     } elsif ( $filename =~ /.*mosdepth.summary.txt/) {
         my %data = parse_mosdepth(\@lines);
         $matrix{'mosdepth'}{'total'} = \%data;
+    } elsif ( $filename =~ /ILLUMINA.mosdepth.global.dist.txt/ ) {
+        my %data = parse_mosdepth_global(\@lines);
+        $matrix{'mosdepth_global'}{'illumina'} = \%data;
+    } elsif ( $filename =~ /NANOPORE.mosdepth.global.dist.txt/ ) {
+        my %data = parse_mosdepth_global(\@lines);
+        $matrix{'mosdepth_global'}{'nanopore'} = \%data;
+    } elsif ( $filename =~ /PACBIO.mosdepth.global.dist.txt/ ) {
+        my %data = parse_mosdepth_global(\@lines);
+        $matrix{'mosdepth_global'}{'pacbio'} = \%data;
+    } elsif ( $filename =~ /.*mosdepth.global.dist.txt/ ) {
+        my %data = parse_mosdepth_global(\@lines);
+        $matrix{'mosdepth_global'}{'total'} = \%data;
     } elsif ( $filename =~ /.sistr.tab/) {
         my %data ; 
         $data{'Sistr'}= parse_sistr(\@lines);
@@ -281,6 +294,28 @@ sub parse_sistr {
    return \%data ;
 }
 
+sub parse_mosdepth_global {
+
+    my @lines = @{$_[0] };
+
+    my $h = shift @lines;
+    my @header = split "\t", $h ; 
+
+    my %data;
+
+    # Limit to 100X so the json doesnt get too large for no reason. 
+    for my $line (@lines) {
+        my ($chr,$cov,$perc) = split "\t", $line ;
+        if ($chr eq "total") {
+            if ($cov <= 100) {
+                $data{$cov} = ($perc*100);
+            }
+        }
+    }
+
+    return %data;
+}
+
 sub parse_mosdepth {
 
     my @lines = @{$_[0] };

diff --git a/conf/modules.config b/conf/modules.config
@@ -222,7 +222,7 @@ process {
         ]
     }
     withName: 'SOURMASH_SKETCH' {
-        ext.args = "dna"
+        ext.args = "dna -p scaled=1000,k=31,k=21"
         publishDir = [
             path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" },
             mode: params.publish_dir_mode,