From 7715d9058ce66fbe93bbaad3b4c8297ff92f980b Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Fri, 6 Dec 2024 07:50:58 +0100 Subject: [PATCH 01/14] Fixing merge error in build process --- modules/biobloom/maker/main.nf | 5 ++--- workflows/build_references.nf | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/biobloom/maker/main.nf b/modules/biobloom/maker/main.nf index 3d55ebb..47c8ad9 100644 --- a/modules/biobloom/maker/main.nf +++ b/modules/biobloom/maker/main.nf @@ -1,5 +1,4 @@ process BIOBLOOM_MAKER { - tag "$meta.sample_id" label 'medium_parallel' @@ -23,8 +22,8 @@ process BIOBLOOM_MAKER { def prefix = task.ext.prefix ?: "host_genomes" """ biobloommaker -p $prefix \\ - -f $fasta \\ - -t $task.cpus + -t $task.cpus \\ + $fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/build_references.nf b/workflows/build_references.nf index b42cd61..7578e57 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -6,6 +6,8 @@ include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from './../modules/a include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' +include { GUNZIP as GUNZIP_GENOME } from './../modules/gunzip' +include { BIOBLOOM_MAKER } from './../modules/biobloom/maker' kraken_db_url = Channel.fromPath(params.references['kraken2'].url) confindr_db_url = Channel.fromPath(params.references['confindr'].url) @@ -27,7 +29,7 @@ workflow BUILD_REFERENCES { ) BIOBLOOM_MAKER( - GUNZIP.out.gunzip.map { m,f -> f } + GUNZIP_GENOME.out.gunzip.map { m,f -> f } ) /* From 77936a542809a5f02c34cfdeeb476f64785cbc55 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Tue, 10 Dec 2024 14:20:20 +0100 Subject: [PATCH 02/14] Fixing generic file names being forwarded and causing name collisions --- conf/modules.config 
| 2 +- conf/resources.config | 4 ++++ modules/quast/main.nf | 2 +- nextflow.config | 4 +++- subworkflows/qc_illumina/main.nf | 8 +++++++- subworkflows/qc_nanopore/main.nf | 9 +++++++-- workflows/build_references.nf | 5 +++++ workflows/gabi.nf | 23 +++++++++++++++++++---- 8 files changed, 47 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 642c148..f614e10 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -286,7 +286,7 @@ process { } withName: NANOPLOT { publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } diff --git a/conf/resources.config b/conf/resources.config index 477b800..b3a160a 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -10,6 +10,10 @@ params { url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-k31.zip' db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-k31.zip" } + 'sourmashdb_nr' { + url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip' + db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-reps.k31.zip" + } 'amrfinderdb' { db = "${params.reference_base}/gabi/${params.reference_version}/amrfinder/latest" } diff --git a/modules/quast/main.nf b/modules/quast/main.nf index 2340a97..9187a11 100644 --- a/modules/quast/main.nf +++ b/modules/quast/main.nf @@ -27,7 +27,7 @@ process QUAST { def features = gff ? "--features $gff" : '' def reference = fasta ? 
"-r $fasta" : '' """ - ln -s $assembly ${prefix}.fasta + quast.py \\ --output-dir $prefix \\ $reference \\ diff --git a/nextflow.config b/nextflow.config index 35aea9a..8210b9a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,12 +34,14 @@ params { remove_host = false + fast_ref = false + max_coverage = "100x" genome_size = null skip_porechop = true - onthq = true + onthq = false ont_min_length = 5000 ont_min_q = 10 ont_min_reads = 1000 diff --git a/subworkflows/qc_illumina/main.nf b/subworkflows/qc_illumina/main.nf index c9df73e..4606694 100644 --- a/subworkflows/qc_illumina/main.nf +++ b/subworkflows/qc_illumina/main.nf @@ -15,7 +15,13 @@ workflow QC_ILLUMINA { main: // Split trimmed reads by sample to find multi-lane data set - reads.groupTuple().branch { meta, reads -> + reads.map {m,r -> + def newMeta = [:] + newMeta.sample_id = m.sample_id + newMeta.platform = m.platform + newMeta.single_end = m.single_end + tuple(newMeta,r) + }.groupTuple().branch { meta, reads -> single: reads.size() == 1 return [ meta, reads.flatten()] multi: reads.size() > 1 diff --git a/subworkflows/qc_nanopore/main.nf b/subworkflows/qc_nanopore/main.nf index 055aa69..81da756 100644 --- a/subworkflows/qc_nanopore/main.nf +++ b/subworkflows/qc_nanopore/main.nf @@ -29,10 +29,15 @@ workflow QC_NANOPORE { } else { ch_porechop_reads = reads } - // Merge Nanopore reads per sample - ch_porechop_reads.groupTuple().branch { meta, reads -> + ch_porechop_reads.map { m,r -> + def newMeta = [:] + newMeta.sample_id = m.sample_id + newMeta.platform = m.platform + newMeta.single_end = m.single_end + tuple(newMeta,r) + }.groupTuple().branch { meta, reads -> single: reads.size() == 1 return [ meta, reads.flatten()] multi: reads.size() > 1 diff --git a/workflows/build_references.nf b/workflows/build_references.nf index 7578e57..a5dd5c7 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -6,12 +6,14 @@ include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from 
'./../modules/a include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' +include { STAGE_FILE as DOWNLOAD_SOURMASH_NR_DB } from './../modules/helper/stage_file' include { GUNZIP as GUNZIP_GENOME } from './../modules/gunzip' include { BIOBLOOM_MAKER } from './../modules/biobloom/maker' kraken_db_url = Channel.fromPath(params.references['kraken2'].url) confindr_db_url = Channel.fromPath(params.references['confindr'].url) sourmash_db_url = params.references['sourmashdb'].url +sourmash_nr_db_url = params.references['sourmashdb_nr'].url ch_busco_lineage = Channel.from(['bacteria_odb10']) host_genome = Channel.fromPath(file(params.references['host_genome'].url)).map { f -> [ [target: 'Host'], f] } @@ -39,6 +41,9 @@ workflow BUILD_REFERENCES { sourmash_db_url ) + DOWNLOAD_SOURMASH_NR_DB( + sourmash_nr_db_url + ) /* Download the latest version of the AMRfinderplus DB This is not ideal since we cannot select specific versions - but it works diff --git a/workflows/gabi.nf b/workflows/gabi.nf index 31fba6d..b31c389 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -11,6 +11,7 @@ include { MULTIQC as MULTIQC_PACBIO } from './../modules/multiqc' include { SHOVILL } from './../modules/shovill' include { RENAME_CTG as RENAME_SHOVILL_CTG } from './../modules/rename_ctg' include { RENAME_CTG as RENAME_DRAGONFLYE_CTG } from './../modules/rename_ctg' +include { RENAME_CTG as RENAME_PLASMID_CTG } from './../modules/rename_ctg' include { DRAGONFLYE } from './../modules/dragonflye' include { FLYE } from './../modules/flye' include { BIOBLOOM_CATEGORIZER } from './../modules/biobloom/categorizer' @@ -66,7 +67,11 @@ if (params.input) { amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? 
file(params.references['kraken2'].db, checkIfExists:true) : [] - sourmashdb = params.reference_base ? file(params.references['sourmashdb'].db, checkIfExists:true) : [] + if (params.fast_ref) { + sourmashdb = params.reference_base ? file(params.references['sourmashdb_nr'].db, checkIfExists:true) : [] + } else { + sourmashdb = params.reference_base ? file(params.references['sourmashdb'].db, checkIfExists:true) : [] + } busco_db_path = params.reference_base ? file(params.references['busco'].db, checkIfExists:true) : [] busco_lineage = params.busco_lineage @@ -197,7 +202,12 @@ workflow GABI { ch_dragonflye ) ch_versions = ch_versions.mix(DRAGONFLYE.out.versions) - ch_assemblies = ch_assemblies.mix(DRAGONFLYE.out.contigs) + + RENAME_DRAGONFLYE_CTG( + DRAGONFLYE.out.contigs, + 'fasta' + ) + ch_assemblies = ch_assemblies.mix(RENAME_DRAGONFLYE_CTG.out) /* Option: Pacbio HiFi reads @@ -290,7 +300,12 @@ workflow GABI { ch_assemblies_clean ) ch_versions = ch_versions.mix(PLASMIDS.out.versions) - ch_assembly_without_plasmids = PLASMIDS.out.chromosome + + RENAME_PLASMID_CTG( + PLASMIDS.out.chromosome, + 'fasta' + ) + ch_assembly_without_plasmids = RENAME_PLASMID_CTG.out /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -363,7 +378,7 @@ workflow GABI { SUB: Perform MLST typing of assemblies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - + if (!params.skip_mlst) { MLST_TYPING( ch_assemblies_without_plasmids_with_taxa From 88536938ed4607ffdd0257b76c65120ce1a0f395 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 08:19:38 +0100 Subject: [PATCH 03/14] Updating documentation --- conf/modules/assembly.config | 37 +++++++++++++ conf/modules/installation.config | 61 +++++++++++++++++++++ conf/modules/mlst.config | 61 +++++++++++++++++++++ conf/modules/qc.config | 92 ++++++++++++++++++++++++++++++++ docs/installation.md | 10 ++-- docs/quickstart.md | 4 +- docs/troubleshooting.md | 4 ++ docs/usage.md | 67 ++++++++++++----------- 8 files changed, 299 insertions(+), 37 
deletions(-) create mode 100644 conf/modules/assembly.config create mode 100644 conf/modules/installation.config create mode 100644 conf/modules/mlst.config create mode 100644 conf/modules/qc.config diff --git a/conf/modules/assembly.config b/conf/modules/assembly.config new file mode 100644 index 0000000..30da9e7 --- /dev/null +++ b/conf/modules/assembly.config @@ -0,0 +1,37 @@ +process { + + withName: SHOVILL { + ext.args = "--assembler ${params.shovill_assembler} --minlen ${params.shovill_contig_minlen}" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/shovill" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: DRAGONFLYE { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/dragonflye" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + withName: RENAME_SHOVILL_CTG { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: FLYE { + ext.args = "--plasmids --pacbio-hifi" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/flye/" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +} \ No newline at end of file diff --git a/conf/modules/installation.config b/conf/modules/installation.config new file mode 100644 index 0000000..4eb2092 --- /dev/null +++ b/conf/modules/installation.config @@ -0,0 +1,61 @@ +process { + + withName: CONFINDR_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: KRAKEN2_DOWNLOAD { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/kraken2" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: DOWNLOAD_SOURMASH_DB { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/sourmashdb" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHEWBBACA_DOWNLOADSCHEMA { + ext.args = "--latest" + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/chewbbaca" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: AMRFINDERPLUS_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: BUSCO_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/busco" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + +} \ No newline at end of file diff --git a/conf/modules/mlst.config b/conf/modules/mlst.config new file mode 100644 index 0000000..8102fb6 --- /dev/null +++ b/conf/modules/mlst.config @@ -0,0 +1,61 @@ +process { + withName: 'CHEWBBACA_ALLELECALL' { + ext.args = "--no-inferred" + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_ALLELECALLEVALUATOR' { + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_JOINPROFILES' { + ext.args = "--common" + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/samples/${meta.sample_id}/joinprofiles" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_ALLELECALL_SINGLE' { + ext.args = "--no-inferred" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/chewbbaca" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PYMLST_WGMLST_DISTANCE { + publishDir = [ + path: { "${params.outdir}/cgMLST/pymlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PYMLST_WGMLST_ADD { + publishDir = [ + path: { "${params.outdir}/cgMLST/pymlst" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: MLST { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} \ No newline at end of file diff --git a/conf/modules/qc.config b/conf/modules/qc.config new file mode 100644 index 0000000..b0bce94 --- /dev/null +++ b/conf/modules/qc.config @@ -0,0 +1,92 @@ +process { + withName: MULTIQC_ILLUMINA { + ext.prefix = "multiqc_illumina" + publishDir = [ + path: { "${params.outdir}/reports/Illumina" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MULTIQC_NANOPORE { + ext.prefix = "multiqc_nanopore" + publishDir = [ + path: { "${params.outdir}/reports/Nanopore" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MULTIQC_PACBIO { + ext.prefix = "multiqc_pacbio" + publishDir = [ + path: { "${params.outdir}/reports/Pacbio" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR' { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR2MQC|CONFINDR2MQC_SUMMARY' { + publishDir = [ + path: { "${params.outdir}/qc" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR2JSON' { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/confindr" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: FASTQC { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastqc" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PORECHOP_ABI { + ext.args = "--abi" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/porechop" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHOPPER { + ext.args2 = [ + "-l ${params.ont_min_length}", + params.ont_min_q ? "-q ${params.ont_min_q}" : "" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/chopper" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + } + withName: NANOPLOT { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +} \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index c203a4a..e8b452c 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -6,7 +6,7 @@ If you are new to our pipeline ecosystem, we recommend you first check out our g Nextflow is a highly portable pipeline engine. Please see the official [installation guide](https://www.nextflow.io/docs/latest/getstarted.html#installation) to learn how to set it up. -This pipeline expects Nextflow version 23.10.1, available [here](https://github.com/nextflow-io/nextflow/releases/tag/v23.10.1). +This pipeline expects Nextflow version 24.04.4, available [here](https://github.com/nextflow-io/nextflow/releases/tag/v24.04.4). 
## Software provisioning @@ -39,13 +39,13 @@ nextflow run bio-raum/gabi -profile singularity \\ --reference_base /path/to/references ``` -where `/path/to/references` could be something like `/data/pipelines/references` or whatever is most appropriate on your system. On a distributed compute environment, this directory needs to live on a shared file system. If you already use a site-specific [config](https://github.com/marchoeppner/nf-configs) file, the `--reference_base` option does not need to be set. +where `/path/to/references` could be something like `/data/pipelines/references` or whatever is most appropriate on your system. In a distributed compute environment, this directory needs to live on a shared file system. If you already use a site-specific [config](https://github.com/marchoeppner/nf-configs) file, the `--reference_base` option does not need to be set. If you do not have singularity on your system, you can also specify docker, podman or conda for software provisioning - see the [usage information](usage.md). Please note that the build process will create a pipeline-specific subfolder (`gabi`) that must not be given as part of the `--reference_base` argument. GABI is part of a collection of pipelines that use a shared reference directory and it will choose/create the appropriate subfolder automatically. -Finally, depending on your internet connection, the installation process can take a little while - primarily because of the Kraken2 database (8GB). However, once installed you are all set and ready to go. +Finally, depending on your internet connection, the installation process can take a little while - some of the reference databases are "fairly" large (8-10GB). However, once installed you are all set and ready to go. ## Site-specific config file @@ -77,4 +77,6 @@ conda { useMamba = true cacheDir = "/path/to/conda_cache" } -``` \ No newline at end of file +``` + +This would be for a single computer, with 16 cores and 64GB Ram, using Conda/Mamba. 
Conda environments are cached to the specified location and can be re-used for subsequent pipeline runs. \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md index 1b5466c..7c18b50 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -12,7 +12,7 @@ GABI provides software on-the-fly. Use whatever profile (`-profile`) is appropri - docker - podman -We will use `-profile apptainer` for the examples below. Use a container framework over conda, if at all possible. +We will use `-profile apptainer` for the examples below. Use a container framework over conda, if at all possible. Contribute a site-specific profile to our [central repository](https://github.com/bio-raum/nf-configs) if you would like to take advantage of container/environment caching. ## Three steps @@ -28,7 +28,7 @@ nextflow run bio-raum/gabi -profile apptainer \ -r main ``` -This will download and install the pipeline references to `/path/to/references` (choose an appropriate path here). +This will download and install the pipeline references to `/path/to/references` (choose an appropriate path here; must be on a shared mount when running in a cluster setting). ### Run test diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 7e46f43..a14fb91 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -42,6 +42,10 @@ We assume you mean the overall start-up time - the performance of the individual Otherwise, if you run this pipeline without a site-specific config file, the pipeline will not know where to cache the various containers or conda environments. In such cases, it will install/download these dependencies into the respective work directory of your pipeline run, every time you run the pipeline. And yes, that is a little slow. Consider adding your own config file to make use of the caching functionality. +## Sourmash `search` is very slow + +We use sourmash to identify the best matching reference genome for each assembly. 
This database is currently over 10GB in size and highly contigious assemblies can produce very long run times (30mins+). If you do not care about the best reference genome, but are happy to just find a closely related one so GABI knows which species this is, use the `--fast_ref` option. + ### My ONT assembly crashes with an obscure error Please check if the option `--onthq` is set to `true` (this is the default!). It's possible that this setting is not appropriate for your data, which can lead Dragonflye to exit on an empty Fasta file halfway through the assembly process; you can disable this option by setting `--onthq false` and resume the pipeline (`-resume`). diff --git a/docs/usage.md b/docs/usage.md index 7953b69..279e8ea 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -63,7 +63,7 @@ The `-r` option specifies a github [release tag](https://github.com/bio-raum/gab ## Choosing an assembly method -How do you choose the assembly method for your data? Well, you don't - the pipeline will take care of that automatically. GABI currently supports three kinds of scenarios: +GABI automatically chooses the appropriate assembly chain based on your data, supporting three scenarios: - Samples with only short reads (Assembler: Shovill) - Samples with Nanopore reads and **optional** short reads (Assembler: Dragonflye) @@ -71,11 +71,13 @@ How do you choose the assembly method for your data? Well, you don't - the pipel This is why it is important to make sure that all reads coming from the same sample are linked by a common sample ID. +Note: HiFi data cannot be combined with any of the other technologies! (mostly because it is not necessary) + ## Options ### `--input samples.csv` [default = null] -This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. 
The required format looks as follows, depending on your input data +This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. The required format looks as follows, depending on your input data: #### Raw reads If you want to assemble genomes "from scratch", you can pass raw reads: @@ -94,9 +96,8 @@ Allowed platforms and data types are: * ILLUMINA (expecting PE Illumina reads in fastq format, fastq.gz) * NANOPORE (expecting ONT reads in fastq format, fastq.gz) * PACBIO (expecting Pacbio CCS/HiFi reads in fastq format, fastq.gz) -* TORRENT (expecting single-end IonTorrent reads in fastq format, fastq.gz) (tbd!) -Read data in formats other than FastQ are not currently supported and would have to be converted into the appropriate FastQ format prior to launching the pipeline. If you have a recurring use case where the input must be something other than FastQ, please let us know and we will consider it. +Read data in formats other than FastQ are not currently supported and would have to be converted into FastQ format prior to launching the pipeline. If you have a recurring use case where the input must be something other than FastQ, please let us know and we will consider it. #### Pre-assembled genomes @@ -109,6 +110,14 @@ sample_id,assembly S100,/path/to/S100.fasta ``` +### `--build_references` [ default = null ] + +This option is only used when installing the pipelines references as described [here](installation.md). + +### `--fast_ref` [ default = false ] + +By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. 
This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. Note that some of the Quast metrics will notably deteriorate. + ### `--run_name` [ default = null] A name to use for various output files. This tend to be useful to relate analyses back to individual pipeline runs or projects later on. @@ -117,52 +126,37 @@ A name to use for various output files. This tend to be useful to relate analyse This option should point to the base directory in which you have installed the pipeline references. See our [installation](installation.md) instructions for details. For users who have contributed a site-specific config file, this option does not need to be set. -### `--onthq` [ default = true ] +### `--onthq` [ default = false ] -Set this option to true if you believe your ONT data to be of "high quality". This is typically the case for data generated with chemistry version 10.4.1 or later. This option is set to true by default because chemistry version 10.4.1 is the standard kit distributed by ONT at the time of writing. You can disable this option by setting it to `false`. +Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default.. ### `--ont_min_q` [ default = 10 ] -Discard nanopore reads below this mean quality. +Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data. ### `--ont_min_length` [ default = 5000 ] -Discard nanopore reads below this length. 
- -### `--build_references` [ default = null ] - -This option is only used when installing the pipelines references as described [here](installation.md). +Discard nanopore reads below this length. Depending on your DNA extraction and/or library preparation, you will see a range of sequence lengths. If you have sequenced at sufficient depths, you may decide to discard shorter reads to improve your assembly contiguity. However, please note that discarding shorter reads may essentially throw away very short plasmids (which can be as short as ~1kb). ## Expert options These options are only meant for users who have a specific reason to touch them. For most use cases, the defaults should be fine. -### `--skip_failed` [ default = false ] - -By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. The following criteria will be applied: - -- Remove highly fragmented assemblies (see [--max_contigs](#--max_contigs)) -- Remove reads that fail the ConfindR QC for intra-/inter species contamination (Illumina and Pacbio only) - -### `--max_contigs` [ default = 150 ] - -If `--skip_failed` is enabled, this parameter controls the maximum number of contigs an assembly is allowed to have before it is stopped. High contig numbers are typically a sign of insufficient coverage and/or read length (in some cases it can also be a sign of excessive contamination). - -### `--skip_circos` [ default = false ] +### `--confindr_db` [ default = null ] -Skip generation of circos plots. +A local version of the ConfindR rMLST database, available [here](https://olc-bioinformatics.github.io/ConFindr/install/#downloading-confindr-databases). Unfortunately, this database requires a personalized registration so we cannot bundle it with GABI. If no database is provided, CondindR will run without one and can consquently only use the built-in references for Escherichia, Listeria and Salmonella. 
-### `--shovill_assembler` [ default = spades ] +### `--genome_size` [ default = null ] -Choose which assembly tool to use with Shovill. Valid options are skesa, velvet, megahit or spades. Default is: spades. +If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. ### `--max_coverage` [ default = '100x'] If a genome size is specified (`--genome_size`), this is the target coverage for downsampling the read data. -### `--genome_size` [ default = null ] +### `--max_contigs` [ default = 150 ] -If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. +If `--skip_failed` is enabled, this parameter controls the maximum number of contigs an assembly is allowed to have before it is stopped. High contig numbers are typically a sign of insufficient coverage and/or read length (in some cases it can also be a sign of excessive contamination). ### `--prokka_proteins` [ default = null ] @@ -172,9 +166,20 @@ If you analyse a single species and wish to optimize the quality of the genome a If you analyse a single species and wish to optimize the quality of the genome annotation, you can pass a custom prodigal training file using this option, as described [here](https://github.com/tseemann/prokka?tab=readme-ov-file#option---prodigaltf). 
-### `--confindr_db` [ default = null ] +### `--skip_failed` [ default = false ] -A local version of the ConfindR rMLST database, available [here](https://olc-bioinformatics.github.io/ConFindr/install/#downloading-confindr-databases). Unfortunately, this database requires a personalized registration so we cannot bundle it with GABI. If no database is provided, CondindR will run without one and can consquently only use the built-in references for Escherichia, Listeria and Salmonella. +By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. The following criteria will be applied: + +- Remove highly fragmented assemblies (see [--max_contigs](#--max_contigs)) +- Remove reads that fail the ConfindR QC for intra-/inter species contamination (Illumina and Pacbio only) + +### `--skip_circos` [ default = false ] + +Skip generation of circos plots. + +### `--shovill_assembler` [ default = spades ] + +Choose which assembly tool to use with Shovill. Valid options are skesa, velvet, megahit or spades. Default is: spades. 
### `--skip_mlst` [ default = false ] Do not run MLST typing tools (chewbbaca, MLST) From 015a1b70093bd9cedf0e0e3ad68f006e342076cf Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 08:36:18 +0100 Subject: [PATCH 04/14] Removing documentation deployment for now --- .github/workflows/documentation_dev.yml | 41 ------------------------- 1 file changed, 41 deletions(-) delete mode 100644 .github/workflows/documentation_dev.yml diff --git a/.github/workflows/documentation_dev.yml b/.github/workflows/documentation_dev.yml deleted file mode 100644 index 8650687..0000000 --- a/.github/workflows/documentation_dev.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Build/Publish Develop Docs -on: - push: - branches: - - dev - workflow_dispatch: -permissions: - contents: write -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Configure Git Credentials - run: | - git config user.name github-actions[bot] - git config user.email 41898282+github-actions[bot]@users.noreply.github.com - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v4 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - name: Update files - run: | - cat CONTRIBUTING.md > docs/about/contributing.md - cat CHANGELOG.md > docs/about/changelog.md - cat LICENSE > docs/about/license.md - - - name: Install Dependencies - run: | - pip install mkdocs-material - pip install mike - - name: Build Docs Website - run: | - git fetch origin gh-pages --depth=1 - mike deploy --push dev From b4699b77a503519e7f5408ed5f1f8b1dfa8a2eae Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 11:53:13 +0100 Subject: [PATCH 05/14] Removing PyMLST and fixing some minor issues --- bin/download_mlst.rb | 117 ------------------ bin/download_pymlst_wgmlst.sh | 10 -- conf/modules.config | 32 ----- conf/modules/assembly.config | 3 +- 
conf/modules/installation.config | 8 -- conf/modules/mlst.config | 16 --- conf/modules/qc.config | 2 +- docs/software.md | 3 - docs/usage.md | 4 + modules/pymlst/wgmlst/add/environment.yml | 7 -- modules/pymlst/wgmlst/add/main.nf | 52 -------- .../pymlst/wgmlst/distance/environment.yml | 7 -- modules/pymlst/wgmlst/distance/main.nf | 37 ------ modules/pymlst/wgmlst_install/environment.yml | 7 -- modules/pymlst/wgmlst_install/main.nf | 18 --- modules/quast/main.nf | 2 +- modules/sourmash/compare/main.nf | 1 - nextflow.config | 4 + subworkflows/mlst/main.nf | 63 +--------- workflows/build_references.nf | 6 - workflows/gabi.nf | 5 +- 21 files changed, 16 insertions(+), 388 deletions(-) delete mode 100644 bin/download_mlst.rb delete mode 100755 bin/download_pymlst_wgmlst.sh delete mode 100644 modules/pymlst/wgmlst/add/environment.yml delete mode 100644 modules/pymlst/wgmlst/add/main.nf delete mode 100644 modules/pymlst/wgmlst/distance/environment.yml delete mode 100644 modules/pymlst/wgmlst/distance/main.nf delete mode 100644 modules/pymlst/wgmlst_install/environment.yml delete mode 100644 modules/pymlst/wgmlst_install/main.nf diff --git a/bin/download_mlst.rb b/bin/download_mlst.rb deleted file mode 100644 index b9b3062..0000000 --- a/bin/download_mlst.rb +++ /dev/null @@ -1,117 +0,0 @@ -require 'optparse' -require 'ostruct' -require 'rest_client' -require 'json' - -def rest_get(url) - $request_counter ||= 0 # Initialise if unset - $last_request_time ||= 0 # Initialise if unset - - # Rate limiting: Sleep for the remainder of a second since the last request on every third request - $request_counter += 1 - if $request_counter == 15 - diff = Time.now - $last_request_time - sleep(1-diff) if diff < 1 - $request_counter = 0 - end - - begin - response = RestClient.get "#{$server}/#{url}", {:accept => :json} - - $last_request_time = Time.now - JSON.parse(response) - rescue RestClient::Exception => e - puts "Failed for #{url}! #{response ? "Status code: #{response}. 
" : ''}Reason: #{e.message}" - - # Sleep for specified number of seconds if there is a Retry-After header - if e.response.headers[:retry_after] - sleep(e.response.headers[:retry_after].to_f) - retry # This retries from the start of the begin block - else - abort("Quitting... #{e.inspect}") - end - end -end - -def clean_url(url) - return url.gsub("https://rest.pubmlst.org/","") -end -### Get the script arguments and open relevant files -options = OpenStruct.new() -opts = OptionParser.new() -opts.on("-s","--set_id", "=SETID","Get info for this set") {|argument| options.set_id = argument } -opts.on("-o","--outfile", "=OUTFILE","Output file") {|argument| options.outfile = argument } -opts.on("-h","--help","Display the usage information") { - puts opts - exit -} - -opts.parse! - -$server = 'https://rest.pubmlst.org/' - -info = rest_get("db") - -banned = [ "rMLST", "test"] - -info.each do |i| - - full_name = i["description"] - name = i["name"] - - warn "#{name} | #{full_name}" - - next if banned.include?(name) - - databases = i["databases"].select{|d| d["href"].include?("seqdef") } - - databases.each do |database| - - entry = rest_get(clean_url(database["href"])) - - schemas = rest_get(clean_url(entry["schemes"])) - - mlsts = schemas["schemes"].select{|s| s["description"].include?("MLST") } - - mlsts.each do |this_mlst| - - schema = rest_get(clean_url(this_mlst["scheme"])) - - desc = schema["description"] - - mlst = desc.gsub(" ", "_").gsub(/[(,)]/, "").gsub("/", "").downcase - - profile_name = "#{name}_#{mlst}" - - next if mlst.include?("gmlst") - - command = "wget -O #{name}_#{mlst}_profiles_csv #{schema['profiles_csv']}" - - puts command - - loci = schema["loci"] - - list = [] - - loci.each do |locus| - - l = rest_get(clean_url(locus)) - - locus_name = l["id"] - - fasta = locus_name + ".fasta" - - list << fasta - - command = "wget -O #{fasta} #{l["alleles_fasta"]}" - puts command - - end - - command = "claMLST create #{profile_name} #{profile_name}_profiles_csv 
#{list.join(' ')}" - puts command - - end - end - -end diff --git a/bin/download_pymlst_wgmlst.sh b/bin/download_pymlst_wgmlst.sh deleted file mode 100755 index 276bbf3..0000000 --- a/bin/download_pymlst_wgmlst.sh +++ /dev/null @@ -1,10 +0,0 @@ -wgMLST import cgmlst_db/escherichia 'Escherichia coli' -wgMLST import cgmlst_db/listeria_monocytogenes 'Listeria monocytogenes' -wgMLST import cgmlst_db/klebsiella_pneumoniae 'Klebsiella pneumoniae' -wgMLST import cgmlst_db/staphylococcus_aureus 'Staphylococcus aureus' -wgMLST import cgmlst_db/acinetobacter_baumannii 'Acinetobacter baumannii' -wgMLST import cgmlst_db/salmonella_enterica 'Salmonella enterica' -wgMLST import cgmlst_db/campylobacter 'Campylobacter' -wgMLST import cgmlst_db/clostridium_perfringens 'Clostridium perfringens' -wgMLST import cgmlst_db/streptococcus_pyogenes 'Streptococcus pyogenes' -wgMLST import cgmlst_db/klebsiella_oxytoca 'Klebsiella oxytoca' \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index f614e10..d5c8036 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -340,14 +340,6 @@ process { enabled: true ] } - withName: RENAME_SHOVILL_CTG { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MOBSUITE_RECON { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/plasmids" }, @@ -414,30 +406,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: PYMLST_WGMLST_DISTANCE { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: PYMLST_WGMLST_ADD { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MLST { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, diff --git a/conf/modules/assembly.config b/conf/modules/assembly.config index 30da9e7..c606584 100644 --- a/conf/modules/assembly.config +++ b/conf/modules/assembly.config @@ -16,7 +16,7 @@ process { enabled: true ] } - withName: RENAME_SHOVILL_CTG { + withName: 'RENAME_*_CTG' { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, mode: params.publish_dir_mode, @@ -24,6 +24,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: FLYE { ext.args = "--plasmids --pacbio-hifi" publishDir = [ diff --git a/conf/modules/installation.config b/conf/modules/installation.config index 4eb2092..64dacb4 100644 --- a/conf/modules/installation.config +++ b/conf/modules/installation.config @@ -33,14 +33,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } withName: AMRFINDERPLUS_INSTALL { publishDir = [ path: { "${params.reference_base}/gabi/${params.reference_version}" }, diff --git a/conf/modules/mlst.config b/conf/modules/mlst.config index 8102fb6..d385231 100644 --- a/conf/modules/mlst.config +++ b/conf/modules/mlst.config @@ -34,22 +34,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: PYMLST_WGMLST_DISTANCE { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: PYMLST_WGMLST_ADD { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MLST { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, diff --git a/conf/modules/qc.config b/conf/modules/qc.config index b0bce94..983467d 100644 --- a/conf/modules/qc.config +++ b/conf/modules/qc.config @@ -82,7 +82,7 @@ process { } withName: NANOPLOT { publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } diff --git a/docs/software.md b/docs/software.md index c92b909..1bce26e 100644 --- a/docs/software.md +++ b/docs/software.md @@ -9,9 +9,6 @@ Version 1.19, [doi](https://doi.org/10.1093/bioinformatics/btw354) | [PubMed](ht **Samtools** Version 1.19, [doi](https://doi.org/10.1093/bioinformatics/btp352) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/19505943/) | [github](https://github.com/samtools/samtools) -**pyMLST** -Version 2.1.6, [doi](https://doi.org/10.1099/mgen.0.001126) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/37966168/) | [github](https://github.com/bvalot/pyMLST) - **Flye** Version 2.9, [doi](https://doi.org/10.1038/s41587-019-0072-8) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/30936562/) | [github](https://github.com/fenderglass/Flye/tree/flye) diff --git a/docs/usage.md b/docs/usage.md index 819d610..cd931c8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -166,6 +166,10 @@ If you analyse a single species and wish to optimize the quality of the genome a If you analyse a single species and wish to optimize the quality of the genome annotation, you can pass a custom prodigal training file using this option, as described [here](https://github.com/tseemann/prokka?tab=readme-ov-file#option---prodigaltf). +### `--remove_host` [ default = false ] + +This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). If you use another kind of medium and require decontamination, please open an issue and we will consider adding it. + ### `--skip_failed` [ default = false ] By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. 
The following criteria will be applied: diff --git a/modules/pymlst/wgmlst/add/environment.yml b/modules/pymlst/wgmlst/add/environment.yml deleted file mode 100644 index 22e887f..0000000 --- a/modules/pymlst/wgmlst/add/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_add -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst/add/main.nf b/modules/pymlst/wgmlst/add/main.nf deleted file mode 100644 index 3f1b8e2..0000000 --- a/modules/pymlst/wgmlst/add/main.nf +++ /dev/null @@ -1,52 +0,0 @@ -process PYMLST_WGMLST_ADD { - maxForks 1 - - tag "${meta.sample_id}" - - label 'short_parallel' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - input: - tuple val(meta), path(assembly), val(db) - - output: - tuple val(meta), path('*mlst.txt') , emit: report - path('versions.yml') , emit: versions - - script: - - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample_id - - // Work-around - we remove these strains first, if they already exist in the database, to avoid a downstream error - // For example, if we resume the workflow with different settings. - // We also wait a little bit to avoid file lock issues. 
- """ - echo ${meta.sample_id} >> sample.txt - - wgMLST \\ - remove --strains \\ - -f sample.txt \\ - $db - - wgMLST \\ - add \\ - $args \\ - -s ${meta.sample_id} \\ - $db \\ - $assembly - touch ${prefix}.mlst.txt \\ - - sleep 2 - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pyMLST: \$(claMLST --version 2>&1 | head -n1 | sed -e "s/Version: //g") - END_VERSIONS - - """ -} diff --git a/modules/pymlst/wgmlst/distance/environment.yml b/modules/pymlst/wgmlst/distance/environment.yml deleted file mode 100644 index a8bc9fe..0000000 --- a/modules/pymlst/wgmlst/distance/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_distance -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst/distance/main.nf b/modules/pymlst/wgmlst/distance/main.nf deleted file mode 100644 index d62b09a..0000000 --- a/modules/pymlst/wgmlst/distance/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process PYMLST_WGMLST_DISTANCE { - maxForks 1 - - tag "${meta.sample_id}" - - label 'short_parallel' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - input: - tuple val(meta), val(db) - - output: - tuple val(meta), path('*cgmlst.txt') , emit: report - path('versions.yml') , emit: versions - - script: - - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample_id - - """ - wgMLST \\ - distance \\ - --output ${prefix}.cgmlst.txt \\ - $db $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pyMLST: \$(claMLST --version 2>&1 | head -n1 | sed -e "s/Version: //g") - END_VERSIONS - - """ -} diff --git a/modules/pymlst/wgmlst_install/environment.yml b/modules/pymlst/wgmlst_install/environment.yml deleted file mode 100644 index 15fa0a1..0000000 --- a/modules/pymlst/wgmlst_install/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_install -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst_install/main.nf b/modules/pymlst/wgmlst_install/main.nf deleted file mode 100644 index ec1a4c4..0000000 --- a/modules/pymlst/wgmlst_install/main.nf +++ /dev/null @@ -1,18 +0,0 @@ -process PYMLST_WGMLST_INSTALL { - label 'short_serial' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - output: - path("cgmlst_db"), emit: db - - script: - - ''' - mkdir -p cgmlst_db - download_pymlst_wgmlst.sh - ''' -} diff --git a/modules/quast/main.nf b/modules/quast/main.nf index 9187a11..8c9437b 100644 --- a/modules/quast/main.nf +++ b/modules/quast/main.nf @@ -34,7 +34,7 @@ process QUAST { $features \\ --threads $task.cpus \\ $args \\ - ${prefix}.fasta + $assembly ln -s ${prefix}/report.tsv ${prefix}.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv diff --git a/modules/sourmash/compare/main.nf b/modules/sourmash/compare/main.nf index 9bc63f7..fc0f244 100644 --- a/modules/sourmash/compare/main.nf +++ b/modules/sourmash/compare/main.nf @@ -24,7 +24,6 @@ process SOURMASH_COMPARE { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" def comp = save_numpy_matrix ? "--output comp.npy" : '' def csv = save_csv ? 
"--csv comp.csv" : '' if ( !save_numpy_matrix && !save_csv ) error "Supply either save_numpy_matrix, save_csv, or both or no output will be created" diff --git a/nextflow.config b/nextflow.config index 8210b9a..35fadc9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -115,6 +115,10 @@ dag { // Module-specific configuration options includeConfig 'conf/modules.config' +includeConfig 'conf/modules/assembly.config' +includeConfig 'conf/modules/mlst.config' +includeConfig 'conf/modules/qc.config' +includeConfig 'conf/modules/installation.config' // Load centrally stored profiles try { diff --git a/subworkflows/mlst/main.nf b/subworkflows/mlst/main.nf index f508ef7..c5f88f3 100644 --- a/subworkflows/mlst/main.nf +++ b/subworkflows/mlst/main.nf @@ -1,5 +1,3 @@ -include { PYMLST_WGMLST_ADD } from './../../modules/pymlst/wgmlst/add' -include { PYMLST_WGMLST_DISTANCE } from './../../modules/pymlst/wgmlst/distance' include { CHEWBBACA_ALLELECALL } from './../../modules/chewbbaca/allelecall' include { CHEWBBACA_ALLELECALL as CHEWBBACA_ALLELECALL_SINGLE } from './../../modules/chewbbaca/allelecall' include { CHEWBBACA_JOINPROFILES } from './../../modules/chewbbaca/joinprofiles' @@ -41,28 +39,6 @@ workflow MLST_TYPING { pass: db }.set { assembly_with_mlst_db } - /* - We use the previously attempted taxonomic classification - to choose the appropriate cgMLST schema, if any - */ - ch_assembly_filtered.annotated.map { m, a -> - def (genus,species) = m.taxon.toLowerCase().split(' ') - def cg_db = null - if (params.cgmlst[genus]) { - cg_db = params.cgmlst[genus] - m.db_name = genus - } else if (params.cgmlst["${genus}_${species}"]) { - cg_db = params.cgmlst["${genus}_${species}"] - m.db_name = "${genus}_${species}" - } else { - cg_db = null - } - tuple(m, a, cg_db) - }.branch { m, a, db -> - fail: db == null - pass: db - }.set { assembly_with_cg_db } - /* We use the previously attempted taxonomic classification to choose the appropriate Chewbbaca cgMLST schema, if any @@ -105,45 
+81,11 @@ workflow MLST_TYPING { /* Inform users about to-be-skipped samples due to a lack of a matching cgMLST database */ - assembly_with_cg_db.fail.subscribe { m, s, d -> - log.warn "${m.sample_id} - could not match a pyMLST cgMLST database to ${m.taxon}." - } + assembly_with_chewie_db.fail.subscribe { m, s, d -> log.warn "${m.sample_id} - could not match a Chewbbaca cgMLST database to ${m.taxon}." } - /* - Run wgMLST on assemblies for which we have taxonomic information - and a matching cgMLST schema configured, i.e. the last element must - not be null - */ - PYMLST_WGMLST_ADD( - assembly_with_cg_db.pass - ) - ch_versions = ch_versions.mix(PYMLST_WGMLST_ADD.out.versions) - - /* - Get the databases for which we have assemblies to - perform cgMLST clustering - */ - assembly_with_cg_db.pass.map { m, a, d -> - tuple(m, d) - } - .groupTuple(by: 1) - .map { metas, db -> - def meta = [:] - meta.db_name = file(db).getSimpleName() - meta.sample_id = file(db).getSimpleName() - tuple(meta, db) - }.set { ch_cgmlst_database } - /* - Perform clustering on the given database - */ - PYMLST_WGMLST_DISTANCE( - ch_cgmlst_database - ) - ch_versions = ch_versions.mix(PYMLST_WGMLST_DISTANCE.out.versions) - /* Perform cgMLST calling with Chewbbaca Part one consists of a joint allele calling approach in which all samples belonging to the same species are jointly call @@ -153,7 +95,6 @@ workflow MLST_TYPING { assembly_with_chewie_db.pass ) ch_versions = ch_versions.mix(CHEWBBACA_ALLELECALL_SINGLE.out.versions) - /* Join assemblies and databases to generate [ meta, [ assemblies ], db ] and filter out all @@ -187,4 +128,4 @@ workflow MLST_TYPING { emit: versions = ch_versions report = MLST.out.json - } +} diff --git a/workflows/build_references.nf b/workflows/build_references.nf index a5dd5c7..a217490 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -3,7 +3,6 @@ include { KRAKEN2_DOWNLOAD } from './../modules/k include { CONFINDR_INSTALL } from 
'./../modules/helper/confindr_install' include { BUSCO_DOWNLOAD as BUSCO_INSTALL } from './../modules/busco/download' include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from './../modules/amrfinderplus/update' -include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' include { STAGE_FILE as DOWNLOAD_SOURMASH_NR_DB } from './../modules/helper/stage_file' @@ -73,11 +72,6 @@ workflow BUILD_REFERENCES { confindr_db_url ) - /* - Install cgMLST schemas - */ - PYMLST_WGMLST_INSTALL() - /* Install Chewbbaca schemas based on schema ID */ diff --git a/workflows/gabi.nf b/workflows/gabi.nf index b31c389..7c00b0c 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -67,6 +67,7 @@ if (params.input) { amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? file(params.references['kraken2'].db, checkIfExists:true) : [] + // Sourmash DB choice - either the full thing or a smaller "nr" one to speed up searches at the cost of some precision if (params.fast_ref) { sourmashdb = params.reference_base ? 
file(params.references['sourmashdb_nr'].db, checkIfExists:true) : [] } else { @@ -303,7 +304,7 @@ workflow GABI { RENAME_PLASMID_CTG( PLASMIDS.out.chromosome, - 'fasta' + 'chromosomes.fasta' ) ch_assembly_without_plasmids = RENAME_PLASMID_CTG.out @@ -314,7 +315,6 @@ workflow GABI { errors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - VARIANTS( ch_illumina_trimmed.map { m,r -> tuple(m.sample_id,m,r) @@ -366,7 +366,6 @@ workflow GABI { SUB: Perform serotyping of assemblies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - SEROTYPING( ch_assemblies_without_plasmids_with_taxa ) From 6fa60bd36c9e2f15233efe9356f7f9b972277853 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 13:54:06 +0100 Subject: [PATCH 06/14] Adding ecoli specific abricate step and redirecting sourmash sketch output --- bin/gabi.py | 2 +- conf/modules.config | 24 ++++++++++++++---------- subworkflows/amr_profiling/main.nf | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 7b08f3d..50afcac 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -326,7 +326,7 @@ def main(yaml, template, output, reference): # Draw the Kraken abundance table kdata = pd.DataFrame(data=kraken_data_all, index=samples) plot_labels = {"index": "Samples", "value": "Percentage"} - h = len(samples)*20 if len(samples) > 10 else 400 + h = len(samples)*25 if len(samples) > 10 else 450 fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h) data["Kraken"] = fig.to_html(full_html=False) diff --git a/conf/modules.config b/conf/modules.config index d5c8036..77b7039 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -65,15 +65,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: SHOVILL { - ext.args = "--assembler ${params.shovill_assembler} --minlen ${params.shovill_contig_minlen}" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/shovill" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: PROKKA { ext.args = "--force" publishDir = [ @@ -237,6 +228,19 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: ABRICATE_RUN_ECOLI_VIRULENCE { + ext.args = [ + "--db ecoli_vf", + "--minid ${params.arg_abricate_minid}", + "--mincov ${params.arg_abricate_mincov}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/ecoli_vf" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: HAMRONIZATION_ABRICATE { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate" }, @@ -414,7 +418,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: SOURMASH_SEARCH { + withName: 'SOURMASH_SEARCH|SOURMASH_SKETCH' { ext.args = "--best-only" publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index d9c2ee4..25c8f0a 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -7,6 +7,8 @@ include { HAMRONIZATION_AMRFINDERPLUS } from './../../modules/hamronization/ include { HAMRONIZATION_ABRICATE } from './../../modules/hamronization/abricate' include { HAMRONIZATION_SUMMARIZE } from './../../modules/hamronization/summarize' include { ABRICATE_RUN } from './../../modules/abricate/run' +include { ABRICATE_RUN as ABRICATE_RUN_ECOLI_VIRULENCE } from './../../modules/abricate/run' + ch_versions = Channel.from([]) multiqc_files = Channel.from([]) @@ -19,6 +21,12 @@ workflow AMR_PROFILING { main: + assembly.branch { m, a -> + ecoli: m.taxon ==~ /^Escherichia.*/ + salmonella: m.taxon ==~ /^Salmonella.*/ + listeria: m.taxon ==~ /^Listeria.*/ + }.set { assembly_by_taxon } + /* Run AMRFinderPlus and make JSON report */ @@ -55,8 +63,18 @@ workflow AMR_PROFILING { ) ch_versions = ch_versions.mix(ABRICATE_RUN.out.versions) + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Taxon-specific abricate analyses + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + // E. 
coli + ABRICATE_RUN_ECOLI_VIRULENCE( + assembly_by_taxon.ecoli + ) + ch_versions = ch_versions.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.versions) + + // Join basic Abricate results HAMRONIZATION_ABRICATE( - ABRICATE_RUN.out.report, + ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report), 'json', '1.0.1', '2021-Mar-27' From c528eb25b6389443b969a0aa8048cfc168266880 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 14:19:26 +0100 Subject: [PATCH 07/14] Refactoring module configs --- conf/modules.config | 261 +-------------------- conf/modules/assembly_qc.config | 26 ++ conf/modules/installation.config | 8 + conf/modules/{qc.config => read_qc.config} | 27 +++ nextflow.config | 3 +- 5 files changed, 71 insertions(+), 254 deletions(-) create mode 100644 conf/modules/assembly_qc.config rename conf/modules/{qc.config => read_qc.config} (78%) diff --git a/conf/modules.config b/conf/modules.config index 77b7039..6367440 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -6,41 +6,7 @@ process { enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: 'MULTIQC|GABI_REPORT' { - publishDir = [ - path: { "${params.outdir}/reports" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MULTIQC_ILLUMINA { - ext.prefix = "multiqc_illumina" - publishDir = [ - path: { "${params.outdir}/reports/Illumina" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MULTIQC_NANOPORE { - ext.prefix = "multiqc_nanopore" - publishDir = [ - path: { "${params.outdir}/reports/Nanopore" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: MULTIQC_PACBIO { - ext.prefix = "multiqc_pacbio" - publishDir = [ - path: { "${params.outdir}/reports/Pacbio" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: GABI_SUMMARY { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/" }, @@ -49,22 +15,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: QUAST { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/quast" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MUMMER2CIRCOS { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/plots" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: PROKKA { ext.args = "--force" publishDir = [ @@ -82,14 +32,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: BUSCO_BUSCO { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/busco" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: KRAKEN2_KRAKEN2 { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/taxonomy/kraken2" }, @@ -130,31 +72,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'CONFINDR' { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: 'CONFINDR2MQC|CONFINDR2MQC_SUMMARY' { - publishDir = [ - path: { "${params.outdir}/qc" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CONFINDR2JSON' { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/confindr" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: 'ECTYPER|SEQSERO2|LISSERO|SISTR|STECFINDER' { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/serotype/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -183,14 +100,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: FASTQC { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastqc" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: AMRFINDERPLUS_RUN { ext.args = [ "--ident_min ${params.arg_amrfinderplus_identmin}", @@ -266,36 +175,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: PORECHOP_ABI { - ext.args = "--abi" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/porechop" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CHOPPER { - ext.args2 = [ - "-l ${params.ont_min_length}", - params.ont_min_q ? "-q ${params.ont_min_q}" : "" - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/chopper" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - - } - withName: NANOPLOT { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: CAT_FASTQ { publishDir = [ path: { "${params.outdir}/cat" }, @@ -303,33 +182,6 @@ process { enabled: false ] } - withName: BIOBLOOM_CATEGORIZER { - ext.args = "-g -n --fq" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/biobloom" }, - mode: params.publish_dir_mode, - enabled: false - ] - } - withName: RASUSA { - ext.args = [ - "--genome-size ${params.genome_size}", - "--coverage ${params.max_coverage}" - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/rasusa" }, - mode: params.publish_dir_mode, - enabled: false - ] - } - withName: FASTP { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastp" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.contains('.json') ? filename : null } - ] - } withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/custom" }, @@ -337,13 +189,6 @@ process { enabled: false ] } - withName: DRAGONFLYE { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/dragonflye" }, - mode: params.publish_dir_mode, - enabled: true - ] - } withName: MOBSUITE_RECON { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/plasmids" }, @@ -352,15 +197,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: FLYE { - ext.args = "--plasmids --pacbio-hifi" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/flye/" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } + withName: AMRFINDERPLUS_INSTALL { publishDir = [ path: { "${params.reference_base}/gabi/${params.reference_version}" }, @@ -369,56 +206,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: BUSCO_INSTALL { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/busco" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: KRAKEN2_DOWNLOAD { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/kraken2" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: DOWNLOAD_SOURMASH_DB { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/sourmashdb" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CHEWBBACA_DOWNLOADSCHEMA { - ext.args = "--latest" - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/chewbbaca" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CONFINDR_INSTALL { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MLST { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: 'SOURMASH_SEARCH|SOURMASH_SKETCH' { + withName: 'SOURMASH_SEARCH' { ext.args = "--best-only" publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, @@ -427,48 +215,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'CHEWBBACA_ALLELECALLEVALUATOR' { - - publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_JOINPROFILES' { - ext.args = "--common" - publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/samples/${meta.sample_id}/joinprofiles" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_ALLELECALL_SINGLE' { - ext.args = "--no-inferred" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/chewbbaca" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: BIOBLOOM_MAKER { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/biobloom" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_ALLELECALL' { - ext.args = "--no-inferred" + withName: 'SOURMASH_SKETCH' { + ext.args = "dna" publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + + } diff --git a/conf/modules/assembly_qc.config b/conf/modules/assembly_qc.config new file mode 100644 index 0000000..32ea5dc --- /dev/null +++ b/conf/modules/assembly_qc.config @@ -0,0 +1,26 @@ +process { + withName: BUSCO_BUSCO { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/busco" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: QUAST { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/quast" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MUMMER2CIRCOS { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/plots" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} \ No newline at end of file diff --git a/conf/modules/installation.config b/conf/modules/installation.config index 64dacb4..864ee89 100644 --- a/conf/modules/installation.config +++ b/conf/modules/installation.config @@ -49,5 +49,13 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: BIOBLOOM_MAKER { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/biobloom" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } \ No newline at end of file diff --git a/conf/modules/qc.config b/conf/modules/read_qc.config similarity index 78% rename from conf/modules/qc.config rename to conf/modules/read_qc.config index 983467d..bd49a2e 100644 --- a/conf/modules/qc.config +++ b/conf/modules/read_qc.config @@ -88,5 +88,32 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + withName: FASTP { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastp" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.contains('.json') ? filename : null } + ] + } + withName: RASUSA { + ext.args = [ + "--genome-size ${params.genome_size}", + "--coverage ${params.max_coverage}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/rasusa" }, + mode: params.publish_dir_mode, + enabled: false + ] + } + withName: BIOBLOOM_CATEGORIZER { + ext.args = "-g -n --fq" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/biobloom" }, + mode: params.publish_dir_mode, + enabled: false + ] + } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 35fadc9..54704bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -117,7 +117,8 @@ dag { includeConfig 'conf/modules.config' includeConfig 'conf/modules/assembly.config' includeConfig 'conf/modules/mlst.config' -includeConfig 'conf/modules/qc.config' +includeConfig 'conf/modules/read_qc.config' +includeConfig 'conf/modules/assembly_qc.config' includeConfig 'conf/modules/installation.config' // Load centrally stored profiles From 44c731c906886192455c70562b4def65b53316b0 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Fri, 3 Jan 2025 09:45:52 +0100 Subject: [PATCH 08/14] Fixing mishandled confindr results for certain edge cases --- bin/gabi.py | 43 ++++++++++++++++++++++++++----------------- conf/resources.config | 13 ------------- docs/usage.md | 12 ++++++------ 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 50afcac..a29719c 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -83,22 +83,31 @@ def main(yaml, template, output, reference): for read in set: if read["ContamStatus"] == "True": contaminated = True - if (read["PercentContam"] == "ND"): - perc = "ND" - this_status = status["missing"] - else: - perc = float(read["PercentContam"]) - - 
if (perc > contaminated): - contaminated = perc - if (perc >= 10.0): - confindr_status = status["fail"] + if "PercentContam" in read: + if (read["PercentContam"] == "ND"): + perc = "ND" + if ":" in read["Genus"]: + perc = read["Genus"] this_status = status["fail"] - elif (perc > 0.0 and confindr_status == status["pass"]): - confindr_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + confindr_status = status["fail"] + contaminated = perc + else: + perc = float(read["PercentContam"]) + + if (perc > contaminated): + contaminated = perc + + if (perc >= 10.0): + confindr_status = status["fail"] + this_status = status["fail"] + elif (perc > 0.0 and confindr_status == status["pass"]): + confindr_status = status["warn"] + if (this_status == status["pass"]): + this_status = status["warn"] + else: + contaminated = "ND" + confindr_status = status["warn"] # All the relevant values and optional status classes sample = jdata["sample"] @@ -113,9 +122,9 @@ def main(yaml, template, output, reference): if "kraken" in jdata: taxon_perc = float(jdata["kraken"][0]["percentage"]) - if taxon_perc >= 80.0: + if taxon_perc >= 90.0: taxon_status = status["pass"] - elif taxon_perc >= 60.0: + elif taxon_perc >= 70.0: taxon_status = status["warn"] else: taxon_status = status["fail"] @@ -130,7 +139,7 @@ def main(yaml, template, output, reference): kraken_results[this_taxon] = tperc - if (tperc > 10.0): + if (tperc > 5.0): taxon_count += 1 kraken_data_all.append(kraken_results) diff --git a/conf/resources.config b/conf/resources.config index b3a160a..b81b19a 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -179,19 +179,6 @@ params { } - cgmlst { - escherichia = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/escherichia" - listeria_monocytogenes = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/listeria_monocytogenes" - klebsiella_pneumoniae = 
"${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/klebsiella_pneumoniae" - staphylococcus_aureus = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/staphylococcus_aureus" - acinetobacter_baumannii = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/acinetobacter_baumannii" - salmonella_enterica = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/salmonella_enterica" - campylobacter = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/campylobacter" - clostridium_perfringens = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/clostridium_perfringens" - streptococcus_pyogenes = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/streptococcus_pyogenes" - klebsiella_oxytoca = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/klebsiella_oxytoca" - } - chewbbaca { streptococcus_pyogenes = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_1/Streptococcus_pyogenes_wgMLST" acinetobacter_baumannii = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_2/Acinetobacter_baumannii_cgMLSTRidom" diff --git a/docs/usage.md b/docs/usage.md index cd931c8..0a98b77 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -116,7 +116,7 @@ This option is only used when installing the pipelines references as described [ ### `--fast_ref` [ default = false ] -By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. 
Note that some of the Quast metrics will notably deteriorate. +By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. Note that some of the Quast metrics may notably deteriorate as you are no longer guaranteed to get the closest possible match. ### `--run_name` [ default = null] @@ -128,13 +128,13 @@ This option should point to the base directory in which you have installed the p ### `--onthq` [ default = false ] -Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default.. +Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default. ### `--ont_min_q` [ default = 10 ] -Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data. +Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). 
This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data in favor of higher quality reads. -### `--ont_min_length` [ default = 5000 ] +### `--ont_min_length` [ default = 1000 ] Discard nanopore reads below this length. Depending on your DNA extraction and/or library preparation, you will see a range of sequence lengths. If you have sequenced at sufficient depths, you may decide to discard shorter reads to improve your assembly contiguity. However, please note that discarding shorter reads may essentially throw away very short plasmids (which can be as short as ~1kb). @@ -148,7 +148,7 @@ A local version of the ConfindR rMLST database, available [here](https://olc-bio ### `--genome_size` [ default = null ] -If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. +If enabled, this is the assumed genome size against which the coverage is measured for downsampling the reads (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. ### `--max_coverage` [ default = '100x'] @@ -168,7 +168,7 @@ If you analyse a single species and wish to optimize the quality of the genome a ### `--remove_host` [ default = false ] -This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). 
If you use another kind of medium and require decontamination, please open an isse and we will consider adding it. +This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). If you use another kind of medium and require decontamination, please open an issue and we will consider adding it. ### `--skip_failed` [ default = false ] From e8de5142e6ebb84177b26b7dc9c614df1ede68af Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Tue, 7 Jan 2025 13:21:03 +0100 Subject: [PATCH 09/14] Adding run infos to html report as well as ONT metrics --- assets/gabi_template.html | 85 ++++++++++++++++++++---------- bin/gabi.py | 84 +++++++++++++++++++++-------- bin/gabi_summary.pl | 44 ++++++++++++++++ conf/modules.config | 8 +++ docs/usage.md | 19 +++++-- modules/fastp/main.nf | 2 +- modules/helper/gabi_report/main.nf | 7 +++ modules/input_check.nf | 5 ++ nextflow.config | 1 + subworkflows/qc/main.nf | 2 + subworkflows/qc_illumina/main.nf | 11 ++-- subworkflows/qc_nanopore/main.nf | 7 +-- workflows/gabi.nf | 14 ++--- 13 files changed, 222 insertions(+), 67 deletions(-) diff --git a/assets/gabi_template.html b/assets/gabi_template.html index dfa17e0..7888318 100644 --- a/assets/gabi_template.html +++ b/assets/gabi_template.html @@ -16,7 +16,7 @@ .general { border-style: none; border-spacing: 0px ;} .table-caption { font-weight: bold; display: block;} table { border-collapse: collapse;} - tr.row { border-bottom: 1px solid grey;} + tr.row td { border-bottom: 1px solid grey; padding-top: 5px; padding-bottom: 5px;} td { padding-left: 5px; padding-right: 5px;} th { border-left: 1px solid white; padding-right: 5px;} tr td[scope="sample-id"] {background-color: rgb(233, 233, 233); font-weight: bold;} @@ -61,10 +61,35 @@

{% if Kraken %} Kraken2 {% endif %} - Serotypes + {% if serotypes %} + Serotypes + {% endif %} Software + + +

Run Infos

+ + + + + + + + + + + + + + + + +
User{{user}}
Date{{date}}
Pipeline version{{version}}
Command line call{{call}}
Work directory{{wd}}
+ @@ -73,11 +98,11 @@

Summary

Sample
StatusThe overall analysis status: pass: ok to use, warn: potential issues found, fail: most probably not usable
-
Best-guess taxonThe highest scoring taxon in the Kraken2 analysis - green: robust call, orange: weak call, red: very weak call
+
Best-guess taxonThe highest scoring taxon using kmer matching (S/MASH)
Reference genomeThe highest matching hit in RefSeq to this assembly
AssemblyInformation about this assembly
Mean coverageMean coverage of reads mapped back to the assembly - bigger is better
-
Mean insert sizeThe mean insert size as determined from mapped reads
+
Read qualityQuality metrics of reads after trimming
ContaminationIndicators of contamination
@@ -90,12 +115,14 @@

Summary

#ContigsThe number of chromosomal contigs, i.e. without plasmids.
N50 (Kb)The size of contigs (>=)in which 50% of the assembly are represented.
Gene space (%)The fraction of broadly conserved genes fully covered in this assembly (BUSCO).
- GC (%) +
GC (%)GC content of the assembly. Deviations from the species default are highlighted in orange (mild) and red (strong, something likely wrong)
Total - Illumina + ILM ONT HiFi - Illumina +
ILM Q30 (%)Fraction of Illumina reads above Q30.
+
ONT Q15 (#)Number of ONT reads above Q15.
+
ONT N50 (bp)N50 of ONT reads
Confindr (%) Taxa >10% @@ -108,7 +135,7 @@

Summary

{{row.sample}} {{row.status}} {{row.taxon}} - {{row.reference.assembly}}
{{row.reference.definition}}
+ {{row.reference.assembly}} {{row.fraction}} {{row.assembly}} {{row.contigs}} @@ -119,7 +146,9 @@

Summary

{{row.coverage_illumina}} {{row.coverage_nanopore}} {{row.coverage_pacbio}} - {{row.samtools.mean_insert_size}} + {{row.quality_illumina}} + {{row.quality_nanopore}} + {{row.nanopore_n50}} {{row.contamination}} {{row.taxon_count}} @@ -216,28 +245,30 @@

Kraken2 - taxonomic composition

-
-

Serotyping

+{% if serotypes %} +
+

Serotyping

-{% for stool,stypes in serotypes.items() %} -
{{stool}}
- - - - - - {% for stype in stypes %} + {% for stool,stypes in serotypes.items() %} +
{{stool}}
+
SampleSerotype
- - + + - {% endfor %} -
{{stype.sample}}{{stype.serotype}}SampleSerotype
-

-{% endfor %} + {% for stype in stypes %} + + {{stype.sample}} + {{stype.serotype}} + + {% endfor %} + +

+ {% endfor %} -

-top +

+ top +{% endif %}

Assembly metrics

+
+ Descriptive metrics of individual assemblies determined by Quast. +

+
@@ -196,10 +203,19 @@

Assembly metrics

top + {% if Insertsizes %}

Insert size distribution (Illumina)

+
+ Insert size refers to the size of the sequenced DNA fragment. Depending on the exact library protocol, this size will fall fairly uniformly around a mean value (~300-500bp). + For Illumina data, that value should typically be (slightly) larger than the combined length of forward and reverse read for optimal data yield. Very flat curves may (depending on the protocol!) + indicate a failure during fragment size selection/enrichment. Neither small insert sizes nor flat curves are a clear predictor for subsequent assembly issues, but can inform any potential debugging efforts. +
+

{{Insertsizes}} @@ -207,12 +223,55 @@

Insert size distribution (Illumina)

top {% endif %} + +
+

BUSCO scores

+ +
+ BUSCO scores describe the coverage of the assemblied gene space against a set of broadly conserved singleton genes (here: bacteria_odb10). A perfect assembly should + have a complete coverage of the gene space (complete: 100%), without any fragmentation or, worse, duplication. A high value of duplication may indicate assembly errors or contamination. Some taxa with very + streamlined gene content, such as Campylobacter, will typically have a completeness score of less than 100%. The Completeness estimates may include duplicated genes, so values greater than 100% are + possible (i.e. all genes present, of which x % are duplicated). +
+ +{{Busco}} + +

+top + + +{% if Kraken %} +
+ +

Kraken2 - taxonomic composition

+ +
+ Kraken2 matches kmers from raw sequencing reads against a reference database to determine the taxonomic composition of a read set. For DNA from + pure cultures (which is the focus of GABI), only one species should be identified at dominant proportions. For some taxa, like Campylobacter, several species from the same genus may be found at comparative + abundances due to a lack of sufficient DNA differences. Otherwise, identification of multiple taxa at higher proportions may indicate a contamination issue. +
+ + {{Kraken}} + + top +{% endif %} +

MLST

+
+ Taxa-specific MLST schemas classify assemblies into pre-defined types or groups. Results are divided by typing schema (and consequently taxa). +
+ +

+ {% for scheme,mtypes in mlst.items() %}
Scheme: {{scheme}}
@@ -233,15 +292,6 @@

MLST

top -{% if Kraken %} -
- -

Kraken2 - taxonomic composition

- - {{Kraken}} - - top -{% endif %} @@ -249,6 +299,12 @@

Kraken2 - taxonomic composition

Serotyping

+
+ Serotyes, similar to MLST types, classify assemblies based on a set of predefined gene profiles. +
+ +

+ {% for stool,stypes in serotypes.items() %}
{{stool}}
diff --git a/bin/gabi.py b/bin/gabi.py index 05cb784..2f2e2d7 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -51,6 +51,7 @@ def main(yaml, template, output, reference, version, call, wd): mlst_all = {} insert_sizes_all = {} min_insert_size_length = 1000 + busco_data_all = [] with open(reference) as r: ref_data = json.load(r)["thresholds"] @@ -112,11 +113,9 @@ def main(yaml, template, output, reference, version, call, wd): if (perc >= 10.0): confindr_status = status["fail"] - this_status = status["fail"] elif (perc > 0.0 and confindr_status == status["pass"]): confindr_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + else: contaminated = "ND" confindr_status = status["warn"] @@ -128,7 +127,10 @@ def main(yaml, template, output, reference, version, call, wd): fastp_q30 = "-" fastp_q30_status = status["missing"] + ######################## # Read quality via FastP + ######################## + if "fastp" in jdata: fastp_q30_status = status["pass"] fastp_summary = jdata["fastp"]["summary"] @@ -136,19 +138,22 @@ def main(yaml, template, output, reference, version, call, wd): if fastp_q30 < 0.85: fastp_q30_status = status["warn"] + ########################## # Read stats from NanoStat + ########################## + nanostat_q15 = "-" - #nanostat_q15_status = status["missing"] - #nanostat_mean_read_length = "-" nanostat_read_n50 = "-" if "nanostat" in jdata: nanostat_data = jdata["nanostat"] nanostat_q15 = int(nanostat_data["Q15"]) - nanostat_mean_read_length = nanostat_data["mean_read_length"] + # nanostat_mean_read_length = nanostat_data["mean_read_length"] nanostat_read_n50 = nanostat_data["read_length_n50"] + #################### # Get Kraken results + #################### taxon_status = status["missing"] taxon_count = "-" @@ -181,13 +186,13 @@ def main(yaml, template, output, reference, version, call, wd): if (taxon_count > 3): taxon_count_status = status["fail"] - this_status = status["fail"] elif (taxon_count > 1): 
taxon_count_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + #################### # Get samtools stats + #################### + samtools = {"mean_insert_size": "-", } if ("samtools" in jdata): insert_size = float(jdata["samtools"]["insert size average"]) @@ -198,7 +203,10 @@ def main(yaml, template, output, reference, version, call, wd): if (len(inserts) < min_insert_size_length): min_insert_size_length = len(inserts) + #################### # Get assembly stats + #################### + assembly = round((int(jdata["quast"]["Total length"])/1000000), 2) assembly_status = check_assembly(this_refs, int(jdata["quast"]["Total length"])) @@ -229,7 +237,10 @@ def main(yaml, template, output, reference, version, call, wd): quast["gc"] = float(jdata["quast"]["GC (%)"]) quast["gc_status"] = check_gc(this_refs, float(jdata["quast"]["GC (%)"])) + ################# # Get serotype(s) + ################# + if "serotype" in jdata: serotypes = jdata["serotype"] for sentry in serotypes: @@ -254,22 +265,35 @@ def main(yaml, template, output, reference, version, call, wd): # Reference genome reference = jdata["reference"] + ############## # Busco scores + ############## + busco = jdata["busco"] busco_status = status["missing"] - busco_completeness = round(((int(busco["C"]))/int(busco["dataset_total_buscos"])), 2)*100 + busco_total= int(busco["dataset_total_buscos"]) + busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100 + busco_fragmented = round((int(busco["F"])/busco_total), 2)*100 + busco_missing = round((int(busco["M"])/busco_total), 2)*100 + busco_duplicated = round((int(busco["D"])/busco_total), 2)*100 busco["completeness"] = busco_completeness + busco_data_all.append({ "Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated }) + if (busco_completeness > 90.0): busco_status = status["pass"] elif (busco_completeness > 80.0): busco_status = 
status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] else: busco_status = status["fail"] - this_status = status["fail"] + # Warn if there are duplications in the gene set and busco wasnt already failed + if (busco_duplicated > 5.0) & (busco_status != status["fail"]): + busco_status = status["warn"] + + ############## # MLST types + ############## + mlst = jdata["mlst"] for mentry in mlst: @@ -282,7 +306,10 @@ def main(yaml, template, output, reference, version, call, wd): else: mlst_all[scheme_name] = [{"sample": sample, "sequence_type": sequence_type}] + ############## # Get coverage(s) + ############## + coverage = "-" coverage_status = status["missing"] @@ -332,7 +359,28 @@ def main(yaml, template, output, reference, version, call, wd): else: coverage_pacbio_status = status["fail"] + ###################################### + # Set the overall status of the sample + ###################################### + + # The metrics that by themselves determine overall status: + for estatus in [ confindr_status, taxon_count_status, assembly_status ]: + # if any one metric failed, the whole sample failed + if estatus == status["fail"]: + this_status = estatus + # if a metric is dubious, the entire sample is dubious, unless it already failed or warned + elif (estatus == status["warn"]) & (this_status == status["pass"]): + this_status = estatus + + # The other metrics should at most warn, but never fail the sample + for estatus in [ busco_status, contigs_status ]: + if (estatus != status["missing"]) & (this_status != status["fail"]) & (estatus != status["pass"]): + this_status = status["warn"] + + ######################### # sample-level dictionary + ######################### + rtable = { "sample": sample, "reference": reference, @@ -371,11 +419,15 @@ def main(yaml, template, output, reference, version, call, wd): data["summary"].append(rtable) + ############# + # Plots + ############# + if "kraken" in jdata: # Draw the Kraken abundance table 
kdata = pd.DataFrame(data=kraken_data_all, index=samples) plot_labels = {"index": "Samples", "value": "Percentage"} - h = len(samples)*25 if len(samples) > 10 else 550 + h = len(samples)*25 if len(samples) > 10 else 300 fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h) data["Kraken"] = fig.to_html(full_html=False) @@ -392,11 +444,23 @@ def main(yaml, template, output, reference, version, call, wd): hfig = px.line(hdata, labels=plot_labels) data["Insertsizes"] = hfig.to_html(full_html=False) + + if busco_data_all: + # Draw the busco stats graph + bdata = pd.DataFrame(data=busco_data_all, index=samples) + plot_labels = { "index": "Samples", "value": "Percentage"} + h = len(samples)*25 if len(samples) > 10 else 300 + fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h) + data["Busco"] = fig.to_html(full_html=False) + data["serotypes"] = serotypes_all data["mlst"] = mlst_all + ############################## # Parse the versions YAML file + ############################## + software = {} current_module = "" rmod = re.compile('^[A-Za-z0.*/]') @@ -412,6 +476,10 @@ def main(yaml, template, output, reference, version, call, wd): data["packages"] = software + ######################## + # Render Jinja2 template + ######################## + with open(output, "w", encoding="utf-8") as output_file: with open(template) as template_file: j2_template = Template(template_file.read()) diff --git a/subworkflows/coverage/main.nf b/subworkflows/coverage/main.nf index 86167f7..f23b22d 100644 --- a/subworkflows/coverage/main.nf +++ b/subworkflows/coverage/main.nf @@ -112,6 +112,7 @@ workflow COVERAGE { emit: versions = ch_versions report = MOSDEPTH.out.global_txt + bam = SAMTOOLS_INDEX.out.bam bam_stats = SAMTOOLS_STATS.out.stats summary = MOSDEPTH.out.summary_txt summary_by_platform = ch_summary_by_platform From d47c6daacd8e98c19f8f00aa46236f600ee48f1b Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 9 Jan 2025 13:54:41 +0100 Subject: [PATCH 12/14] 
Abricate now runs on several databases in parallel --- bin/gabi.py | 16 ++++++++-------- conf/modules.config | 6 ++---- modules/abricate/run/main.nf | 7 ++++--- nextflow.config | 2 +- subworkflows/amr_profiling/main.nf | 11 +++++++---- workflows/gabi.nf | 4 +++- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 2f2e2d7..18ef7fc 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -5,6 +5,7 @@ import pandas as pd import os import json +import getpass import re import argparse @@ -36,7 +37,7 @@ def main(yaml, template, output, reference, version, call, wd): data = {} - data["user"] = os.getlogin() + data["user"] = getpass.getuser() data["date"] = datetime.datetime.now() data["version"] = version data["call"] = call @@ -271,13 +272,13 @@ def main(yaml, template, output, reference, version, call, wd): busco = jdata["busco"] busco_status = status["missing"] - busco_total= int(busco["dataset_total_buscos"]) + busco_total = int(busco["dataset_total_buscos"]) busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100 busco_fragmented = round((int(busco["F"])/busco_total), 2)*100 busco_missing = round((int(busco["M"])/busco_total), 2)*100 busco_duplicated = round((int(busco["D"])/busco_total), 2)*100 busco["completeness"] = busco_completeness - busco_data_all.append({ "Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated }) + busco_data_all.append({"Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated}) if (busco_completeness > 90.0): busco_status = status["pass"] @@ -364,16 +365,16 @@ def main(yaml, template, output, reference, version, call, wd): ###################################### # The metrics that by themselves determine overall status: - for estatus in [ confindr_status, taxon_count_status, assembly_status ]: + for estatus in [confindr_status, taxon_count_status, assembly_status]: # 
if any one metric failed, the whole sample failed if estatus == status["fail"]: - this_status = estatus + this_status = estatus # if a metric is dubious, the entire sample is dubious, unless it already failed or warned elif (estatus == status["warn"]) & (this_status == status["pass"]): this_status = estatus # The other metrics should at most warn, but never fail the sample - for estatus in [ busco_status, contigs_status ]: + for estatus in [busco_status, contigs_status]: if (estatus != status["missing"]) & (this_status != status["fail"]) & (estatus != status["pass"]): this_status = status["warn"] @@ -444,7 +445,6 @@ def main(yaml, template, output, reference, version, call, wd): hfig = px.line(hdata, labels=plot_labels) data["Insertsizes"] = hfig.to_html(full_html=False) - if busco_data_all: # Draw the busco stats graph bdata = pd.DataFrame(data=busco_data_all, index=samples) @@ -559,4 +559,4 @@ def check_gc(refs, query): if __name__ == '__main__': - main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) + main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 80a6411..670f876 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -134,12 +134,11 @@ process { } withName: ABRICATE_RUN { ext.args = [ - "--db ${params.arg_abricate_db}", "--minid ${params.arg_abricate_minid}", "--mincov ${params.arg_abricate_mincov}" ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate" }, + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/${db}" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } @@ -147,12 +146,11 @@ process { } withName: ABRICATE_RUN_ECOLI_VIRULENCE { ext.args = [ - "--db ecoli_vf", "--minid ${params.arg_abricate_minid}", "--mincov ${params.arg_abricate_mincov}" ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/ecoli_vf" }, + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/${db}" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } diff --git a/modules/abricate/run/main.nf b/modules/abricate/run/main.nf index b2b9e2e..54ec7cf 100644 --- a/modules/abricate/run/main.nf +++ b/modules/abricate/run/main.nf @@ -1,5 +1,5 @@ process ABRICATE_RUN { - tag "$meta.sample_id" + tag "${meta.sample_id}|$db" label 'short_serial' conda "${moduleDir}/environment.yml" @@ -8,7 +8,7 @@ process ABRICATE_RUN { 'quay.io/biocontainers/abricate:1.0.1--ha8f3691_1' }" input: - tuple val(meta), path(assembly) + tuple val(meta), path(assembly), val(db) output: tuple val(meta), path('*.txt'), emit: report @@ -19,12 +19,13 @@ process ABRICATE_RUN { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" + def prefix = task.ext.prefix ?: "${meta.sample_id}.${db}" """ abricate \\ $args \\ --threads $task.cpus \\ $assembly \\ + --db $db \\ > ${prefix}.txt cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index e684aaa..7003f97 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,9 +23,9 @@ params { arg_hamronization_summarizeformat = 'tsv' - arg_abricate_db = 'vfdb' arg_abricate_minid = 80 arg_abricate_mincov = 80 + abricate_dbs = ['vfdb', 'resfinder', 'argannot', 'card', 'megares'] busco_lineage = "bacteria" busco_db_path = null diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index 25c8f0a..aa56b1d 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -17,7 +17,8 @@ 
ch_hamronization_input = Channel.from([]) workflow AMR_PROFILING { take: assembly - db + db // The AMRfinder database to run + abricate_dbs // A list of abricate databases to run (should be generic!) main: @@ -58,17 +59,19 @@ workflow AMR_PROFILING { /* Run Abricate and make JSON report */ + + assembly_with_db = assembly.combine(abricate_dbs) ABRICATE_RUN( - assembly + assembly_with_db ) ch_versions = ch_versions.mix(ABRICATE_RUN.out.versions) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Taxon-specific abricate analyses ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - // E. coli + // E. coli - here we use a specific database! ABRICATE_RUN_ECOLI_VIRULENCE( - assembly_by_taxon.ecoli + assembly_by_taxon.ecoli.map { m,a -> [ m, a, 'ecoli_vf']} ) ch_versions = ch_versions.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.versions) diff --git a/workflows/gabi.nf b/workflows/gabi.nf index f21a418..b5ea684 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -64,6 +64,7 @@ if (params.input) { ch_prokka_proteins = params.prokka_proteins ? Channel.fromPath(params.prokka_proteins, checkIfExists: true).collect() : [] ch_prokka_prodigal = params.prokka_prodigal ? Channel.fromPath(params.prokka_prodigal, checkIfExists:true).collect() : [] + abricate_dbs = Channel.from(params.abricate_dbs) amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? 
file(params.references['kraken2'].db, checkIfExists:true) : [] @@ -416,7 +417,8 @@ workflow GABI { if (!params.skip_amr) { AMR_PROFILING( ch_assemblies_clean, - amrfinder_db + amrfinder_db, + abricate_dbs ) ch_versions = ch_versions.mix(AMR_PROFILING.out.versions) ch_report = ch_report.mix(AMR_PROFILING.out.amrfinder_report) From 04ea662a00ecc8ea6cd65b3f5bf5cb6b77694c04 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 9 Jan 2025 14:03:43 +0100 Subject: [PATCH 13/14] Adding missing taxonomic information to AMR input --- subworkflows/amr_profiling/main.nf | 4 ++-- workflows/gabi.nf | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index aa56b1d..8130322 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -21,8 +21,8 @@ workflow AMR_PROFILING { abricate_dbs // A list of abricate databases to run (should be generic!) main: - - assembly.branch { m, a -> + + assembly.branch { m, a -> ecoli: m.taxon ==~ /^Escherichia.*/ salmonella: m.taxon ==~ /^Salmonella.*/ listeria: m.taxon ==~ /^Listeria.*/ diff --git a/workflows/gabi.nf b/workflows/gabi.nf index b5ea684..f726d82 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -362,6 +362,15 @@ workflow GABI { tuple(m,s) }.set { ch_assemblies_without_plasmids_with_taxa } + // as well as a channel with the clean assembly and taxon information + ch_assemblies_clean.map {m,s -> + tuple(m.sample_id, s) + }.join( + FIND_REFERENCES.out.reference.map { m, r, g, k -> + tuple(m.sample_id, m) + } + ).map { m,s,n -> tuple(n,s) } + .set { ch_assemblies_clean_with_taxa } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUB: Perform serotyping of assemblies @@ -416,7 +425,7 @@ workflow GABI { if (!params.skip_amr) { AMR_PROFILING( - ch_assemblies_clean, + ch_assemblies_clean_with_taxa, amrfinder_db, abricate_dbs ) From daff863c8cdcbb0028725ec1bd97eb48bb3e0edc Mon Sep 17 00:00:00 2001 From: Marc 
Hoeppner Date: Fri, 10 Jan 2025 07:26:18 +0100 Subject: [PATCH 14/14] Merging abricate databases per sample into one json --- assets/test/samples.csv | 2 +- bin/gabi.py | 4 ++-- modules/hamronization/abricate/main.nf | 8 ++++---- nextflow.config | 2 +- subworkflows/amr_profiling/main.nf | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/assets/test/samples.csv b/assets/test/samples.csv index 691872d..2b22e3a 100644 --- a/assets/test/samples.csv +++ b/assets/test/samples.csv @@ -1,2 +1,2 @@ sample_id,platform,single_end,R1,R2 -ERR1008684,ILLUMINA,false,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR100/004/ERR1008684/ERR1008684_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR100/004/ERR1008684/ERR1008684_2.fastq.gz +SAMEA2707761,ILLUMINA,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR580/ERR580964/ERR580964_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR580/ERR580964/ERR580964_2.fastq.gz diff --git a/bin/gabi.py b/bin/gabi.py index 18ef7fc..669caa5 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -448,7 +448,7 @@ def main(yaml, template, output, reference, version, call, wd): if busco_data_all: # Draw the busco stats graph bdata = pd.DataFrame(data=busco_data_all, index=samples) - plot_labels = { "index": "Samples", "value": "Percentage"} + plot_labels = {"index": "Samples", "value": "Percentage"} h = len(samples)*25 if len(samples) > 10 else 300 fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h) data["Busco"] = fig.to_html(full_html=False) @@ -559,4 +559,4 @@ def check_gc(refs, query): if __name__ == '__main__': - main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) \ No newline at end of file + main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) diff --git a/modules/hamronization/abricate/main.nf b/modules/hamronization/abricate/main.nf index 186853d..09ede0d 100644 --- a/modules/hamronization/abricate/main.nf +++ b/modules/hamronization/abricate/main.nf @@ -8,7 
+8,7 @@ process HAMRONIZATION_ABRICATE { 'quay.io/biocontainers/hamronization:1.1.4--pyhdfd78af_0' }" input: - tuple val(meta), path(report) + tuple val(meta), path(reports) val(format) val(software_version) val(reference_db_version) @@ -23,16 +23,16 @@ process HAMRONIZATION_ABRICATE { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" + def prefix = task.ext.prefix ?: "${report.getBaseName()}" """ hamronize \\ abricate \\ - ${report} \\ + ${reports} \\ $args \\ --format ${format} \\ --analysis_software_version ${software_version} \\ --reference_database_version ${reference_db_version} \\ - > ${prefix}.${format} + --output ${prefix}.${format} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 7003f97..0a9f63f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,7 +84,7 @@ params { manifest { name = "bio-raum/gabi" - version = "0.9.1" + version = "0.9.2" description = "GABI Pipeline for assembly and profiling of bacterial isolates" author = "Marc Hoeppner" homePage = "https://github.com/bio-raum/gabi" diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index 8130322..ee857b5 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -77,7 +77,7 @@ workflow AMR_PROFILING { // Join basic Abricate results HAMRONIZATION_ABRICATE( - ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report), + ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report).groupTuple(), 'json', '1.0.1', '2021-Mar-27'