From 7715d9058ce66fbe93bbaad3b4c8297ff92f980b Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Fri, 6 Dec 2024 07:50:58 +0100 Subject: [PATCH 01/14] Fixing merge error in build process --- modules/biobloom/maker/main.nf | 5 ++--- workflows/build_references.nf | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/biobloom/maker/main.nf b/modules/biobloom/maker/main.nf index 3d55ebb..47c8ad9 100644 --- a/modules/biobloom/maker/main.nf +++ b/modules/biobloom/maker/main.nf @@ -1,5 +1,4 @@ process BIOBLOOM_MAKER { - tag "$meta.sample_id" label 'medium_parallel' @@ -23,8 +22,8 @@ process BIOBLOOM_MAKER { def prefix = task.ext.prefix ?: "host_genomes" """ biobloommaker -p $prefix \\ - -f $fasta \\ - -t $task.cpus + -t $task.cpus \\ + $fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/build_references.nf b/workflows/build_references.nf index b42cd61..7578e57 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -6,6 +6,8 @@ include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from './../modules/a include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' +include { GUNZIP as GUNZIP_GENOME } from './../modules/gunzip' +include { BIOBLOOM_MAKER } from './../modules/biobloom/maker' kraken_db_url = Channel.fromPath(params.references['kraken2'].url) confindr_db_url = Channel.fromPath(params.references['confindr'].url) @@ -27,7 +29,7 @@ workflow BUILD_REFERENCES { ) BIOBLOOM_MAKER( - GUNZIP.out.gunzip.map { m,f -> f } + GUNZIP_GENOME.out.gunzip.map { m,f -> f } ) /* From 77936a542809a5f02c34cfdeeb476f64785cbc55 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Tue, 10 Dec 2024 14:20:20 +0100 Subject: [PATCH 02/14] Fixing generic file names being forwarded and causing name collisions --- conf/modules.config 
| 2 +- conf/resources.config | 4 ++++ modules/quast/main.nf | 2 +- nextflow.config | 4 +++- subworkflows/qc_illumina/main.nf | 8 +++++++- subworkflows/qc_nanopore/main.nf | 9 +++++++-- workflows/build_references.nf | 5 +++++ workflows/gabi.nf | 23 +++++++++++++++++++---- 8 files changed, 47 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 642c148..f614e10 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -286,7 +286,7 @@ process { } withName: NANOPLOT { publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } diff --git a/conf/resources.config b/conf/resources.config index 477b800..b3a160a 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -10,6 +10,10 @@ params { url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-k31.zip' db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-k31.zip" } + 'sourmashdb_nr' { + url = 'https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/gtdb-rs214/gtdb-rs214-reps.k31.zip' + db = "${params.reference_base}/gabi/${params.reference_version}/sourmashdb/gtdb-rs214-reps.k31.zip" + } 'amrfinderdb' { db = "${params.reference_base}/gabi/${params.reference_version}/amrfinder/latest" } diff --git a/modules/quast/main.nf b/modules/quast/main.nf index 2340a97..9187a11 100644 --- a/modules/quast/main.nf +++ b/modules/quast/main.nf @@ -27,7 +27,7 @@ process QUAST { def features = gff ? "--features $gff" : '' def reference = fasta ? 
"-r $fasta" : '' """ - ln -s $assembly ${prefix}.fasta + quast.py \\ --output-dir $prefix \\ $reference \\ diff --git a/nextflow.config b/nextflow.config index 35aea9a..8210b9a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,12 +34,14 @@ params { remove_host = false + fast_ref = false + max_coverage = "100x" genome_size = null skip_porechop = true - onthq = true + onthq = false ont_min_length = 5000 ont_min_q = 10 ont_min_reads = 1000 diff --git a/subworkflows/qc_illumina/main.nf b/subworkflows/qc_illumina/main.nf index c9df73e..4606694 100644 --- a/subworkflows/qc_illumina/main.nf +++ b/subworkflows/qc_illumina/main.nf @@ -15,7 +15,13 @@ workflow QC_ILLUMINA { main: // Split trimmed reads by sample to find multi-lane data set - reads.groupTuple().branch { meta, reads -> + reads.map {m,r -> + def newMeta = [:] + newMeta.sample_id = m.sample_id + newMeta.platform = m.platform + newMeta.single_end = m.single_end + tuple(newMeta,r) + }.groupTuple().branch { meta, reads -> single: reads.size() == 1 return [ meta, reads.flatten()] multi: reads.size() > 1 diff --git a/subworkflows/qc_nanopore/main.nf b/subworkflows/qc_nanopore/main.nf index 055aa69..81da756 100644 --- a/subworkflows/qc_nanopore/main.nf +++ b/subworkflows/qc_nanopore/main.nf @@ -29,10 +29,15 @@ workflow QC_NANOPORE { } else { ch_porechop_reads = reads } - // Merge Nanopore reads per sample - ch_porechop_reads.groupTuple().branch { meta, reads -> + ch_porechop_reads.map { m,r -> + def newMeta = [:] + newMeta.sample_id = m.sample_id + newMeta.platform = m.platform + newMeta.single_end = m.single_end + tuple(newMeta,r) + }.groupTuple().branch { meta, reads -> single: reads.size() == 1 return [ meta, reads.flatten()] multi: reads.size() > 1 diff --git a/workflows/build_references.nf b/workflows/build_references.nf index 7578e57..a5dd5c7 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -6,12 +6,14 @@ include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from 
'./../modules/a include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' +include { STAGE_FILE as DOWNLOAD_SOURMASH_NR_DB } from './../modules/helper/stage_file' include { GUNZIP as GUNZIP_GENOME } from './../modules/gunzip' include { BIOBLOOM_MAKER } from './../modules/biobloom/maker' kraken_db_url = Channel.fromPath(params.references['kraken2'].url) confindr_db_url = Channel.fromPath(params.references['confindr'].url) sourmash_db_url = params.references['sourmashdb'].url +sourmash_nr_db_url = params.references['sourmashdb_nr'].url ch_busco_lineage = Channel.from(['bacteria_odb10']) host_genome = Channel.fromPath(file(params.references['host_genome'].url)).map { f -> [ [target: 'Host'], f] } @@ -39,6 +41,9 @@ workflow BUILD_REFERENCES { sourmash_db_url ) + DOWNLOAD_SOURMASH_NR_DB( + sourmash_nr_db_url + ) /* Download the latest version of the AMRfinderplus DB This is not ideal since we cannot select specific versions - but it works diff --git a/workflows/gabi.nf b/workflows/gabi.nf index 31fba6d..b31c389 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -11,6 +11,7 @@ include { MULTIQC as MULTIQC_PACBIO } from './../modules/multiqc' include { SHOVILL } from './../modules/shovill' include { RENAME_CTG as RENAME_SHOVILL_CTG } from './../modules/rename_ctg' include { RENAME_CTG as RENAME_DRAGONFLYE_CTG } from './../modules/rename_ctg' +include { RENAME_CTG as RENAME_PLASMID_CTG } from './../modules/rename_ctg' include { DRAGONFLYE } from './../modules/dragonflye' include { FLYE } from './../modules/flye' include { BIOBLOOM_CATEGORIZER } from './../modules/biobloom/categorizer' @@ -66,7 +67,11 @@ if (params.input) { amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? 
file(params.references['kraken2'].db, checkIfExists:true) : [] - sourmashdb = params.reference_base ? file(params.references['sourmashdb'].db, checkIfExists:true) : [] + if (params.fast_ref) { + sourmashdb = params.reference_base ? file(params.references['sourmashdb_nr'].db, checkIfExists:true) : [] + } else { + sourmashdb = params.reference_base ? file(params.references['sourmashdb'].db, checkIfExists:true) : [] + } busco_db_path = params.reference_base ? file(params.references['busco'].db, checkIfExists:true) : [] busco_lineage = params.busco_lineage @@ -197,7 +202,12 @@ workflow GABI { ch_dragonflye ) ch_versions = ch_versions.mix(DRAGONFLYE.out.versions) - ch_assemblies = ch_assemblies.mix(DRAGONFLYE.out.contigs) + + RENAME_DRAGONFLYE_CTG( + DRAGONFLYE.out.contigs, + 'fasta' + ) + ch_assemblies = ch_assemblies.mix(RENAME_DRAGONFLYE_CTG.out) /* Option: Pacbio HiFi reads @@ -290,7 +300,12 @@ workflow GABI { ch_assemblies_clean ) ch_versions = ch_versions.mix(PLASMIDS.out.versions) - ch_assembly_without_plasmids = PLASMIDS.out.chromosome + + RENAME_PLASMID_CTG( + PLASMIDS.out.chromosome, + 'fasta' + ) + ch_assembly_without_plasmids = RENAME_PLASMID_CTG.out /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -363,7 +378,7 @@ workflow GABI { SUB: Perform MLST typing of assemblies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - + if (!params.skip_mlst) { MLST_TYPING( ch_assemblies_without_plasmids_with_taxa From 88536938ed4607ffdd0257b76c65120ce1a0f395 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 08:19:38 +0100 Subject: [PATCH 03/14] Updating documentation --- conf/modules/assembly.config | 37 +++++++++++++ conf/modules/installation.config | 61 +++++++++++++++++++++ conf/modules/mlst.config | 61 +++++++++++++++++++++ conf/modules/qc.config | 92 ++++++++++++++++++++++++++++++++ docs/installation.md | 10 ++-- docs/quickstart.md | 4 +- docs/troubleshooting.md | 4 ++ docs/usage.md | 67 ++++++++++++----------- 8 files changed, 299 insertions(+), 37 
deletions(-) create mode 100644 conf/modules/assembly.config create mode 100644 conf/modules/installation.config create mode 100644 conf/modules/mlst.config create mode 100644 conf/modules/qc.config diff --git a/conf/modules/assembly.config b/conf/modules/assembly.config new file mode 100644 index 0000000..30da9e7 --- /dev/null +++ b/conf/modules/assembly.config @@ -0,0 +1,37 @@ +process { + + withName: SHOVILL { + ext.args = "--assembler ${params.shovill_assembler} --minlen ${params.shovill_contig_minlen}" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/shovill" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: DRAGONFLYE { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/dragonflye" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + withName: RENAME_SHOVILL_CTG { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: FLYE { + ext.args = "--plasmids --pacbio-hifi" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/flye/" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +} \ No newline at end of file diff --git a/conf/modules/installation.config b/conf/modules/installation.config new file mode 100644 index 0000000..4eb2092 --- /dev/null +++ b/conf/modules/installation.config @@ -0,0 +1,61 @@ +process { + + withName: CONFINDR_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: KRAKEN2_DOWNLOAD { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/kraken2" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: DOWNLOAD_SOURMASH_DB { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/sourmashdb" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHEWBBACA_DOWNLOADSCHEMA { + ext.args = "--latest" + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/chewbbaca" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: AMRFINDERPLUS_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: BUSCO_INSTALL { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/busco" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + +} \ No newline at end of file diff --git a/conf/modules/mlst.config b/conf/modules/mlst.config new file mode 100644 index 0000000..8102fb6 --- /dev/null +++ b/conf/modules/mlst.config @@ -0,0 +1,61 @@ +process { + withName: 'CHEWBBACA_ALLELECALL' { + ext.args = "--no-inferred" + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_ALLELECALLEVALUATOR' { + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_JOINPROFILES' { + ext.args = "--common" + publishDir = [ + path: { "${params.outdir}/cgMLST/chewbbaca/samples/${meta.sample_id}/joinprofiles" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CHEWBBACA_ALLELECALL_SINGLE' { + ext.args = "--no-inferred" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/chewbbaca" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PYMLST_WGMLST_DISTANCE { + publishDir = [ + path: { "${params.outdir}/cgMLST/pymlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PYMLST_WGMLST_ADD { + publishDir = [ + path: { "${params.outdir}/cgMLST/pymlst" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: MLST { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} \ No newline at end of file diff --git a/conf/modules/qc.config b/conf/modules/qc.config new file mode 100644 index 0000000..b0bce94 --- /dev/null +++ b/conf/modules/qc.config @@ -0,0 +1,92 @@ +process { + withName: MULTIQC_ILLUMINA { + ext.prefix = "multiqc_illumina" + publishDir = [ + path: { "${params.outdir}/reports/Illumina" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MULTIQC_NANOPORE { + ext.prefix = "multiqc_nanopore" + publishDir = [ + path: { "${params.outdir}/reports/Nanopore" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MULTIQC_PACBIO { + ext.prefix = "multiqc_pacbio" + publishDir = [ + path: { "${params.outdir}/reports/Pacbio" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR' { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR2MQC|CONFINDR2MQC_SUMMARY' { + publishDir = [ + path: { "${params.outdir}/qc" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'CONFINDR2JSON' { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/confindr" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: FASTQC { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastqc" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: PORECHOP_ABI { + ext.args = "--abi" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/porechop" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHOPPER { + ext.args2 = [ + "-l ${params.ont_min_length}", + params.ont_min_q ? "-q ${params.ont_min_q}" : "" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/chopper" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + } + withName: NANOPLOT { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +} \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index c203a4a..e8b452c 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -6,7 +6,7 @@ If you are new to our pipeline ecosystem, we recommend you first check out our g Nextflow is a highly portable pipeline engine. Please see the official [installation guide](https://www.nextflow.io/docs/latest/getstarted.html#installation) to learn how to set it up. -This pipeline expects Nextflow version 23.10.1, available [here](https://github.com/nextflow-io/nextflow/releases/tag/v23.10.1). +This pipeline expects Nextflow version 24.04.4, available [here](https://github.com/nextflow-io/nextflow/releases/tag/v24.04.4). 
## Software provisioning @@ -39,13 +39,13 @@ nextflow run bio-raum/gabi -profile singularity \\ --reference_base /path/to/references ``` -where `/path/to/references` could be something like `/data/pipelines/references` or whatever is most appropriate on your system. On a distributed compute environment, this directory needs to live on a shared file system. If you already use a site-specific [config](https://github.com/marchoeppner/nf-configs) file, the `--reference_base` option does not need to be set. +where `/path/to/references` could be something like `/data/pipelines/references` or whatever is most appropriate on your system. In a distributed compute environment, this directory needs to live on a shared file system. If you already use a site-specific [config](https://github.com/marchoeppner/nf-configs) file, the `--reference_base` option does not need to be set. If you do not have singularity on your system, you can also specify docker, podman or conda for software provisioning - see the [usage information](usage.md). Please note that the build process will create a pipeline-specific subfolder (`gabi`) that must not be given as part of the `--reference_base` argument. GABI is part of a collection of pipelines that use a shared reference directory and it will choose/create the appropriate subfolder automatically. -Finally, depending on your internet connection, the installation process can take a little while - primarily because of the Kraken2 database (8GB). However, once installed you are all set and ready to go. +Finally, depending on your internet connection, the installation process can take a little while - some of the reference databases are "fairly" large (8-10GB). However, once installed you are all set and ready to go. ## Site-specific config file @@ -77,4 +77,6 @@ conda { useMamba = true cacheDir = "/path/to/conda_cache" } -``` \ No newline at end of file +``` + +This would be for a single computer, with 16 cores and 64GB Ram, using Conda/Mamba. 
Conda environments are cached to the specified location and can be re-used for subsequent pipeline runs. \ No newline at end of file diff --git a/docs/quickstart.md b/docs/quickstart.md index 1b5466c..7c18b50 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -12,7 +12,7 @@ GABI provides software on-the-fly. Use whatever profile (`-profile`) is appropri - docker - podman -We will use `-profile apptainer` for the examples below. Use a container framework over conda, if at all possible. +We will use `-profile apptainer` for the examples below. Use a container framework over conda, if at all possible. Contribute a site-specific profile to our [central repository](https://github.com/bio-raum/nf-configs) if you would like to take advantage of container/environment caching. ## Three steps @@ -28,7 +28,7 @@ nextflow run bio-raum/gabi -profile apptainer \ -r main ``` -This will download and install the pipeline references to `/path/to/references` (choose an appropriate path here). +This will download and install the pipeline references to `/path/to/references` (choose an appropriate path here; must be on a shared mount when running in a cluster setting). ### Run test diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 7e46f43..a14fb91 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -42,6 +42,10 @@ We assume you mean the overall start-up time - the performance of the individual Otherwise, if you run this pipeline without a site-specific config file, the pipeline will not know where to cache the various containers or conda environments. In such cases, it will install/download these dependencies into the respective work directory of your pipeline run, every time you run the pipeline. And yes, that is a little slow. Consider adding your own config file to make use of the caching functionality. +## Sourmash `search` is very slow + +We use sourmash to identify the best matching reference genome for each assembly. 
This database is currently over 10GB in size and highly contigious assemblies can produce very long run times (30mins+). If you do not care about the best reference genome, but are happy to just find a closely related one so GABI knows which species this is, use the `--fast_ref` option. + ### My ONT assembly crashes with an obscure error Please check if the option `--onthq` is set to `true` (this is the default!). It's possible that this setting is not appropriate for your data, which can lead Dragonflye to exit on an empty Fasta file halfway through the assembly process; you can disable this option by setting `--onthq false` and resume the pipeline (`-resume`). diff --git a/docs/usage.md b/docs/usage.md index 7953b69..279e8ea 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -63,7 +63,7 @@ The `-r` option specifies a github [release tag](https://github.com/bio-raum/gab ## Choosing an assembly method -How do you choose the assembly method for your data? Well, you don't - the pipeline will take care of that automatically. GABI currently supports three kinds of scenarios: +GABI automatically chooses the appropriate assembly chain based on your data, supporting three scenarios: - Samples with only short reads (Assembler: Shovill) - Samples with Nanopore reads and **optional** short reads (Assembler: Dragonflye) @@ -71,11 +71,13 @@ How do you choose the assembly method for your data? Well, you don't - the pipel This is why it is important to make sure that all reads coming from the same sample are linked by a common sample ID. +Note: HiFi data cannot be combined with any of the other technologies! (mostly because it is not necessary) + ## Options ### `--input samples.csv` [default = null] -This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. 
The required format looks as follows, depending on your input data +This pipeline expects a CSV-formatted sample sheet to properly pull various meta data through the processes. The required format looks as follows, depending on your input data: #### Raw reads If you want to assemble genomes "from scratch", you can pass raw reads: @@ -94,9 +96,8 @@ Allowed platforms and data types are: * ILLUMINA (expecting PE Illumina reads in fastq format, fastq.gz) * NANOPORE (expecting ONT reads in fastq format, fastq.gz) * PACBIO (expecting Pacbio CCS/HiFi reads in fastq format, fastq.gz) -* TORRENT (expecting single-end IonTorrent reads in fastq format, fastq.gz) (tbd!) -Read data in formats other than FastQ are not currently supported and would have to be converted into the appropriate FastQ format prior to launching the pipeline. If you have a recurring use case where the input must be something other than FastQ, please let us know and we will consider it. +Read data in formats other than FastQ are not currently supported and would have to be converted into FastQ format prior to launching the pipeline. If you have a recurring use case where the input must be something other than FastQ, please let us know and we will consider it. #### Pre-assembled genomes @@ -109,6 +110,14 @@ sample_id,assembly S100,/path/to/S100.fasta ``` +### `--build_references` [ default = null ] + +This option is only used when installing the pipelines references as described [here](installation.md). + +### `--fast_ref` [ default = false ] + +By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. 
This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. Note that some of the Quast metrics will notably deteriorate. + ### `--run_name` [ default = null] A name to use for various output files. This tend to be useful to relate analyses back to individual pipeline runs or projects later on. @@ -117,52 +126,37 @@ A name to use for various output files. This tend to be useful to relate analyse This option should point to the base directory in which you have installed the pipeline references. See our [installation](installation.md) instructions for details. For users who have contributed a site-specific config file, this option does not need to be set. -### `--onthq` [ default = true ] +### `--onthq` [ default = false ] -Set this option to true if you believe your ONT data to be of "high quality". This is typically the case for data generated with chemistry version 10.4.1 or later. This option is set to true by default because chemistry version 10.4.1 is the standard kit distributed by ONT at the time of writing. You can disable this option by setting it to `false`. +Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default.. ### `--ont_min_q` [ default = 10 ] -Discard nanopore reads below this mean quality. +Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data. ### `--ont_min_length` [ default = 5000 ] -Discard nanopore reads below this length. 
- -### `--build_references` [ default = null ] - -This option is only used when installing the pipelines references as described [here](installation.md). +Discard nanopore reads below this length. Depending on your DNA extraction and/or library preparation, you will see a range of sequence lengths. If you have sequenced at sufficient depths, you may decide to discard shorter reads to improve your assembly contiguity. However, please note that discarding shorter reads may essentially throw away very short plasmids (which can be as short as ~1kb). ## Expert options These options are only meant for users who have a specific reason to touch them. For most use cases, the defaults should be fine. -### `--skip_failed` [ default = false ] - -By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. The following criteria will be applied: - -- Remove highly fragmented assemblies (see [--max_contigs](#--max_contigs)) -- Remove reads that fail the ConfindR QC for intra-/inter species contamination (Illumina and Pacbio only) - -### `--max_contigs` [ default = 150 ] - -If `--skip_failed` is enabled, this parameter controls the maximum number of contigs an assembly is allowed to have before it is stopped. High contig numbers are typically a sign of insufficient coverage and/or read length (in some cases it can also be a sign of excessive contamination). - -### `--skip_circos` [ default = false ] +### `--confindr_db` [ default = null ] -Skip generation of circos plots. +A local version of the ConfindR rMLST database, available [here](https://olc-bioinformatics.github.io/ConFindr/install/#downloading-confindr-databases). Unfortunately, this database requires a personalized registration so we cannot bundle it with GABI. If no database is provided, CondindR will run without one and can consquently only use the built-in references for Escherichia, Listeria and Salmonella. 
-### `--shovill_assembler` [ default = spades ] +### `--genome_size` [ default = null ] -Choose which assembly tool to use with Shovill. Valid options are skesa, velvet, megahit or spades. Default is: spades. +If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. ### `--max_coverage` [ default = '100x'] If a genome size is specified (`--genome_size`), this is the target coverage for downsampling the read data. -### `--genome_size` [ default = null ] +### `--max_contigs` [ default = 150 ] -If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. +If `--skip_failed` is enabled, this parameter controls the maximum number of contigs an assembly is allowed to have before it is stopped. High contig numbers are typically a sign of insufficient coverage and/or read length (in some cases it can also be a sign of excessive contamination). ### `--prokka_proteins` [ default = null ] @@ -172,9 +166,20 @@ If you analyse a single species and wish to optimize the quality of the genome a If you analyse a single species and wish to optimize the quality of the genome annotation, you can pass a custom prodigal training file using this option, as described [here](https://github.com/tseemann/prokka?tab=readme-ov-file#option---prodigaltf). 
-### `--confindr_db` [ default = null ] +### `--skip_failed` [ default = false ] -A local version of the ConfindR rMLST database, available [here](https://olc-bioinformatics.github.io/ConFindr/install/#downloading-confindr-databases). Unfortunately, this database requires a personalized registration so we cannot bundle it with GABI. If no database is provided, CondindR will run without one and can consquently only use the built-in references for Escherichia, Listeria and Salmonella. +By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. The following criteria will be applied: + +- Remove highly fragmented assemblies (see [--max_contigs](#--max_contigs)) +- Remove reads that fail the ConfindR QC for intra-/inter species contamination (Illumina and Pacbio only) + +### `--skip_circos` [ default = false ] + +Skip generation of circos plots. + +### `--shovill_assembler` [ default = spades ] + +Choose which assembly tool to use with Shovill. Valid options are skesa, velvet, megahit or spades. Default is: spades. 
### `--skip_mlst` [ default = false ] Do not run MLST typing tools (chewbbaca, MLST) From 015a1b70093bd9cedf0e0e3ad68f006e342076cf Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 08:36:18 +0100 Subject: [PATCH 04/14] Removing documentation deployment for now --- .github/workflows/documentation_dev.yml | 41 ------------------------- 1 file changed, 41 deletions(-) delete mode 100644 .github/workflows/documentation_dev.yml diff --git a/.github/workflows/documentation_dev.yml b/.github/workflows/documentation_dev.yml deleted file mode 100644 index 8650687..0000000 --- a/.github/workflows/documentation_dev.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Build/Publish Develop Docs -on: - push: - branches: - - dev - workflow_dispatch: -permissions: - contents: write -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Configure Git Credentials - run: | - git config user.name github-actions[bot] - git config user.email 41898282+github-actions[bot]@users.noreply.github.com - - uses: actions/setup-python@v5 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v4 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - name: Update files - run: | - cat CONTRIBUTING.md > docs/about/contributing.md - cat CHANGELOG.md > docs/about/changelog.md - cat LICENSE > docs/about/license.md - - - name: Install Dependencies - run: | - pip install mkdocs-material - pip install mike - - name: Build Docs Website - run: | - git fetch origin gh-pages --depth=1 - mike deploy --push dev From b4699b77a503519e7f5408ed5f1f8b1dfa8a2eae Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 11:53:13 +0100 Subject: [PATCH 05/14] Removing PyMLST and fixing some minor issues --- bin/download_mlst.rb | 117 ------------------ bin/download_pymlst_wgmlst.sh | 10 -- conf/modules.config | 32 ----- conf/modules/assembly.config | 3 +- 
conf/modules/installation.config | 8 -- conf/modules/mlst.config | 16 --- conf/modules/qc.config | 2 +- docs/software.md | 3 - docs/usage.md | 4 + modules/pymlst/wgmlst/add/environment.yml | 7 -- modules/pymlst/wgmlst/add/main.nf | 52 -------- .../pymlst/wgmlst/distance/environment.yml | 7 -- modules/pymlst/wgmlst/distance/main.nf | 37 ------ modules/pymlst/wgmlst_install/environment.yml | 7 -- modules/pymlst/wgmlst_install/main.nf | 18 --- modules/quast/main.nf | 2 +- modules/sourmash/compare/main.nf | 1 - nextflow.config | 4 + subworkflows/mlst/main.nf | 63 +--------- workflows/build_references.nf | 6 - workflows/gabi.nf | 5 +- 21 files changed, 16 insertions(+), 388 deletions(-) delete mode 100644 bin/download_mlst.rb delete mode 100755 bin/download_pymlst_wgmlst.sh delete mode 100644 modules/pymlst/wgmlst/add/environment.yml delete mode 100644 modules/pymlst/wgmlst/add/main.nf delete mode 100644 modules/pymlst/wgmlst/distance/environment.yml delete mode 100644 modules/pymlst/wgmlst/distance/main.nf delete mode 100644 modules/pymlst/wgmlst_install/environment.yml delete mode 100644 modules/pymlst/wgmlst_install/main.nf diff --git a/bin/download_mlst.rb b/bin/download_mlst.rb deleted file mode 100644 index b9b3062..0000000 --- a/bin/download_mlst.rb +++ /dev/null @@ -1,117 +0,0 @@ -require 'optparse' -require 'ostruct' -require 'rest_client' -require 'json' - -def rest_get(url) - $request_counter ||= 0 # Initialise if unset - $last_request_time ||= 0 # Initialise if unset - - # Rate limiting: Sleep for the remainder of a second since the last request on every third request - $request_counter += 1 - if $request_counter == 15 - diff = Time.now - $last_request_time - sleep(1-diff) if diff < 1 - $request_counter = 0 - end - - begin - response = RestClient.get "#{$server}/#{url}", {:accept => :json} - - $last_request_time = Time.now - JSON.parse(response) - rescue RestClient::Exception => e - puts "Failed for #{url}! #{response ? "Status code: #{response}. 
" : ''}Reason: #{e.message}" - - # Sleep for specified number of seconds if there is a Retry-After header - if e.response.headers[:retry_after] - sleep(e.response.headers[:retry_after].to_f) - retry # This retries from the start of the begin block - else - abort("Quitting... #{e.inspect}") - end - end -end - -def clean_url(url) - return url.gsub("https://rest.pubmlst.org/","") -end -### Get the script arguments and open relevant files -options = OpenStruct.new() -opts = OptionParser.new() -opts.on("-s","--set_id", "=SETID","Get info for this set") {|argument| options.set_id = argument } -opts.on("-o","--outfile", "=OUTFILE","Output file") {|argument| options.outfile = argument } -opts.on("-h","--help","Display the usage information") { - puts opts - exit -} - -opts.parse! - -$server = 'https://rest.pubmlst.org/' - -info = rest_get("db") - -banned = [ "rMLST", "test"] - -info.each do |i| - - full_name = i["description"] - name = i["name"] - - warn "#{name} | #{full_name}" - - next if banned.include?(name) - - databases = i["databases"].select{|d| d["href"].include?("seqdef") } - - databases.each do |database| - - entry = rest_get(clean_url(database["href"])) - - schemas = rest_get(clean_url(entry["schemes"])) - - mlsts = schemas["schemes"].select{|s| s["description"].include?("MLST") } - - mlsts.each do |this_mlst| - - schema = rest_get(clean_url(this_mlst["scheme"])) - - desc = schema["description"] - - mlst = desc.gsub(" ", "_").gsub(/[(,)]/, "").gsub("/", "").downcase - - profile_name = "#{name}_#{mlst}" - - next if mlst.include?("gmlst") - - command = "wget -O #{name}_#{mlst}_profiles_csv #{schema['profiles_csv']}" - - puts command - - loci = schema["loci"] - - list = [] - - loci.each do |locus| - - l = rest_get(clean_url(locus)) - - locus_name = l["id"] - - fasta = locus_name + ".fasta" - - list << fasta - - command = "wget -O #{fasta} #{l["alleles_fasta"]}" - puts command - - end - - command = "claMLST create #{profile_name} #{profile_name}_profiles_csv 
#{list.join(' ')}" - puts command - - end - end - -end diff --git a/bin/download_pymlst_wgmlst.sh b/bin/download_pymlst_wgmlst.sh deleted file mode 100755 index 276bbf3..0000000 --- a/bin/download_pymlst_wgmlst.sh +++ /dev/null @@ -1,10 +0,0 @@ -wgMLST import cgmlst_db/escherichia 'Escherichia coli' -wgMLST import cgmlst_db/listeria_monocytogenes 'Listeria monocytogenes' -wgMLST import cgmlst_db/klebsiella_pneumoniae 'Klebsiella pneumoniae' -wgMLST import cgmlst_db/staphylococcus_aureus 'Staphylococcus aureus' -wgMLST import cgmlst_db/acinetobacter_baumannii 'Acinetobacter baumannii' -wgMLST import cgmlst_db/salmonella_enterica 'Salmonella enterica' -wgMLST import cgmlst_db/campylobacter 'Campylobacter' -wgMLST import cgmlst_db/clostridium_perfringens 'Clostridium perfringens' -wgMLST import cgmlst_db/streptococcus_pyogenes 'Streptococcus pyogenes' -wgMLST import cgmlst_db/klebsiella_oxytoca 'Klebsiella oxytoca' \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index f614e10..d5c8036 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -340,14 +340,6 @@ process { enabled: true ] } - withName: RENAME_SHOVILL_CTG { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MOBSUITE_RECON { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/plasmids" }, @@ -414,30 +406,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: PYMLST_WGMLST_DISTANCE { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: PYMLST_WGMLST_ADD { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MLST { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, diff --git a/conf/modules/assembly.config b/conf/modules/assembly.config index 30da9e7..c606584 100644 --- a/conf/modules/assembly.config +++ b/conf/modules/assembly.config @@ -16,7 +16,7 @@ process { enabled: true ] } - withName: RENAME_SHOVILL_CTG { + withName: 'RENAME_*_CTG' { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/assembly" }, mode: params.publish_dir_mode, @@ -24,6 +24,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: FLYE { ext.args = "--plasmids --pacbio-hifi" publishDir = [ diff --git a/conf/modules/installation.config b/conf/modules/installation.config index 4eb2092..64dacb4 100644 --- a/conf/modules/installation.config +++ b/conf/modules/installation.config @@ -33,14 +33,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'PYMLST_CLAMLST_INSTALL|PYMLST_WGMLST_INSTALL' { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } withName: AMRFINDERPLUS_INSTALL { publishDir = [ path: { "${params.reference_base}/gabi/${params.reference_version}" }, diff --git a/conf/modules/mlst.config b/conf/modules/mlst.config index 8102fb6..d385231 100644 --- a/conf/modules/mlst.config +++ b/conf/modules/mlst.config @@ -34,22 +34,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: PYMLST_WGMLST_DISTANCE { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: PYMLST_WGMLST_ADD { - publishDir = [ - path: { "${params.outdir}/cgMLST/pymlst" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: MLST { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, diff --git a/conf/modules/qc.config b/conf/modules/qc.config index b0bce94..983467d 100644 --- a/conf/modules/qc.config +++ b/conf/modules/qc.config @@ -82,7 +82,7 @@ process { } withName: NANOPLOT { publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot/${meta.library_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } diff --git a/docs/software.md b/docs/software.md index c92b909..1bce26e 100644 --- a/docs/software.md +++ b/docs/software.md @@ -9,9 +9,6 @@ Version 1.19, [doi](https://doi.org/10.1093/bioinformatics/btw354) | [PubMed](ht **Samtools** Version 1.19, [doi](https://doi.org/10.1093/bioinformatics/btp352) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/19505943/) | [github](https://github.com/samtools/samtools) -**pyMLST** -Version 2.1.6, [doi](https://doi.org/10.1099/mgen.0.001126) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/37966168/) | [github](https://github.com/bvalot/pyMLST) - **Flye** Version 2.9, [doi](https://doi.org/10.1038/s41587-019-0072-8) | [PubMed](https://pubmed.ncbi.nlm.nih.gov/30936562/) | [github](https://github.com/fenderglass/Flye/tree/flye) diff --git a/docs/usage.md b/docs/usage.md index 819d610..cd931c8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -166,6 +166,10 @@ If you analyse a single species and wish to optimize the quality of the genome a If you analyse a single species and wish to optimize the quality of the genome annotation, you can pass a custom prodigal training file using this option, as described [here](https://github.com/tseemann/prokka?tab=readme-ov-file#option---prodigaltf). +### `--remove_host` [ default = false ] + +This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). If you use another kind of medium and require decontamination, please open an issue and we will consider adding it. + ### `--skip_failed` [ default = false ] By default, all samples are processed all the way to the end of the pipeline. This flag allows you to apply criteria to stop samples along the processing graph. 
The following criteria will be applied: diff --git a/modules/pymlst/wgmlst/add/environment.yml b/modules/pymlst/wgmlst/add/environment.yml deleted file mode 100644 index 22e887f..0000000 --- a/modules/pymlst/wgmlst/add/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_add -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst/add/main.nf b/modules/pymlst/wgmlst/add/main.nf deleted file mode 100644 index 3f1b8e2..0000000 --- a/modules/pymlst/wgmlst/add/main.nf +++ /dev/null @@ -1,52 +0,0 @@ -process PYMLST_WGMLST_ADD { - maxForks 1 - - tag "${meta.sample_id}" - - label 'short_parallel' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - input: - tuple val(meta), path(assembly), val(db) - - output: - tuple val(meta), path('*mlst.txt') , emit: report - path('versions.yml') , emit: versions - - script: - - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample_id - - // Work-around - we remove these strains first, if they already exist in the database, to avoid a downstream error - // For example, if we resume the workflow with different settings. - // We also wait a little bit to avoid file lock issues. 
- """ - echo ${meta.sample_id} >> sample.txt - - wgMLST \\ - remove --strains \\ - -f sample.txt \\ - $db - - wgMLST \\ - add \\ - $args \\ - -s ${meta.sample_id} \\ - $db \\ - $assembly - touch ${prefix}.mlst.txt \\ - - sleep 2 - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pyMLST: \$(claMLST --version 2>&1 | head -n1 | sed -e "s/Version: //g") - END_VERSIONS - - """ -} diff --git a/modules/pymlst/wgmlst/distance/environment.yml b/modules/pymlst/wgmlst/distance/environment.yml deleted file mode 100644 index a8bc9fe..0000000 --- a/modules/pymlst/wgmlst/distance/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_distance -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst/distance/main.nf b/modules/pymlst/wgmlst/distance/main.nf deleted file mode 100644 index d62b09a..0000000 --- a/modules/pymlst/wgmlst/distance/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process PYMLST_WGMLST_DISTANCE { - maxForks 1 - - tag "${meta.sample_id}" - - label 'short_parallel' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - input: - tuple val(meta), val(db) - - output: - tuple val(meta), path('*cgmlst.txt') , emit: report - path('versions.yml') , emit: versions - - script: - - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample_id - - """ - wgMLST \\ - distance \\ - --output ${prefix}.cgmlst.txt \\ - $db $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pyMLST: \$(claMLST --version 2>&1 | head -n1 | sed -e "s/Version: //g") - END_VERSIONS - - """ -} diff --git a/modules/pymlst/wgmlst_install/environment.yml b/modules/pymlst/wgmlst_install/environment.yml deleted file mode 100644 index 15fa0a1..0000000 --- a/modules/pymlst/wgmlst_install/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: pymlst_wgmlst_install -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::pymlst=2.1.6 diff --git a/modules/pymlst/wgmlst_install/main.nf b/modules/pymlst/wgmlst_install/main.nf deleted file mode 100644 index ec1a4c4..0000000 --- a/modules/pymlst/wgmlst_install/main.nf +++ /dev/null @@ -1,18 +0,0 @@ -process PYMLST_WGMLST_INSTALL { - label 'short_serial' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pymlst:2.1.6--pyhdfd78af_0' : - 'quay.io/biocontainers/pymlst:2.1.6--pyhdfd78af_0' }" - - output: - path("cgmlst_db"), emit: db - - script: - - ''' - mkdir -p cgmlst_db - download_pymlst_wgmlst.sh - ''' -} diff --git a/modules/quast/main.nf b/modules/quast/main.nf index 9187a11..8c9437b 100644 --- a/modules/quast/main.nf +++ b/modules/quast/main.nf @@ -34,7 +34,7 @@ process QUAST { $features \\ --threads $task.cpus \\ $args \\ - ${prefix}.fasta + $assembly ln -s ${prefix}/report.tsv ${prefix}.tsv [ -f ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ] && ln -s ${prefix}/contigs_reports/all_alignments_transcriptome.tsv ${prefix}_transcriptome.tsv diff --git a/modules/sourmash/compare/main.nf b/modules/sourmash/compare/main.nf index 9bc63f7..fc0f244 100644 --- a/modules/sourmash/compare/main.nf +++ b/modules/sourmash/compare/main.nf @@ -24,7 +24,6 @@ process SOURMASH_COMPARE { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" def comp = save_numpy_matrix ? "--output comp.npy" : '' def csv = save_csv ? 
"--csv comp.csv" : '' if ( !save_numpy_matrix && !save_csv ) error "Supply either save_numpy_matrix, save_csv, or both or no output will be created" diff --git a/nextflow.config b/nextflow.config index 8210b9a..35fadc9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -115,6 +115,10 @@ dag { // Module-specific configuration options includeConfig 'conf/modules.config' +includeConfig 'conf/modules/assembly.config' +includeConfig 'conf/modules/mlst.config' +includeConfig 'conf/modules/qc.config' +includeConfig 'conf/modules/installation.config' // Load centrally stored profiles try { diff --git a/subworkflows/mlst/main.nf b/subworkflows/mlst/main.nf index f508ef7..c5f88f3 100644 --- a/subworkflows/mlst/main.nf +++ b/subworkflows/mlst/main.nf @@ -1,5 +1,3 @@ -include { PYMLST_WGMLST_ADD } from './../../modules/pymlst/wgmlst/add' -include { PYMLST_WGMLST_DISTANCE } from './../../modules/pymlst/wgmlst/distance' include { CHEWBBACA_ALLELECALL } from './../../modules/chewbbaca/allelecall' include { CHEWBBACA_ALLELECALL as CHEWBBACA_ALLELECALL_SINGLE } from './../../modules/chewbbaca/allelecall' include { CHEWBBACA_JOINPROFILES } from './../../modules/chewbbaca/joinprofiles' @@ -41,28 +39,6 @@ workflow MLST_TYPING { pass: db }.set { assembly_with_mlst_db } - /* - We use the previously attempted taxonomic classification - to choose the appropriate cgMLST schema, if any - */ - ch_assembly_filtered.annotated.map { m, a -> - def (genus,species) = m.taxon.toLowerCase().split(' ') - def cg_db = null - if (params.cgmlst[genus]) { - cg_db = params.cgmlst[genus] - m.db_name = genus - } else if (params.cgmlst["${genus}_${species}"]) { - cg_db = params.cgmlst["${genus}_${species}"] - m.db_name = "${genus}_${species}" - } else { - cg_db = null - } - tuple(m, a, cg_db) - }.branch { m, a, db -> - fail: db == null - pass: db - }.set { assembly_with_cg_db } - /* We use the previously attempted taxonomic classification to choose the appropriate Chewbbaca cgMLST schema, if any @@ -105,45 
+81,11 @@ workflow MLST_TYPING { /* Inform users about to-be-skipped samples due to a lack of a matching cgMLST database */ - assembly_with_cg_db.fail.subscribe { m, s, d -> - log.warn "${m.sample_id} - could not match a pyMLST cgMLST database to ${m.taxon}." - } + assembly_with_chewie_db.fail.subscribe { m, s, d -> log.warn "${m.sample_id} - could not match a Chewbbaca cgMLST database to ${m.taxon}." } - /* - Run wgMLST on assemblies for which we have taxonomic information - and a matching cgMLST schema configured, i.e. the last element must - not be null - */ - PYMLST_WGMLST_ADD( - assembly_with_cg_db.pass - ) - ch_versions = ch_versions.mix(PYMLST_WGMLST_ADD.out.versions) - - /* - Get the databases for which we have assemblies to - perform cgMLST clustering - */ - assembly_with_cg_db.pass.map { m, a, d -> - tuple(m, d) - } - .groupTuple(by: 1) - .map { metas, db -> - def meta = [:] - meta.db_name = file(db).getSimpleName() - meta.sample_id = file(db).getSimpleName() - tuple(meta, db) - }.set { ch_cgmlst_database } - /* - Perform clustering on the given database - */ - PYMLST_WGMLST_DISTANCE( - ch_cgmlst_database - ) - ch_versions = ch_versions.mix(PYMLST_WGMLST_DISTANCE.out.versions) - /* Perform cgMLST calling with Chewbbaca Part one consists of a joint allele calling approach in which all samples belonging to the same species are jointly call @@ -153,7 +95,6 @@ workflow MLST_TYPING { assembly_with_chewie_db.pass ) ch_versions = ch_versions.mix(CHEWBBACA_ALLELECALL_SINGLE.out.versions) - /* Join assemblies and databases to generate [ meta, [ assemblies ], db ] and filter out all @@ -187,4 +128,4 @@ workflow MLST_TYPING { emit: versions = ch_versions report = MLST.out.json - } +} diff --git a/workflows/build_references.nf b/workflows/build_references.nf index a5dd5c7..a217490 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -3,7 +3,6 @@ include { KRAKEN2_DOWNLOAD } from './../modules/k include { CONFINDR_INSTALL } from 
'./../modules/helper/confindr_install' include { BUSCO_DOWNLOAD as BUSCO_INSTALL } from './../modules/busco/download' include { AMRFINDERPLUS_UPDATE as AMRFINDERPLUS_INSTALL } from './../modules/amrfinderplus/update' -include { PYMLST_WGMLST_INSTALL } from './../modules/pymlst/wgmlst_install' include { CHEWBBACA_DOWNLOADSCHEMA } from './../modules/chewbbaca/downloadschema' include { STAGE_FILE as DOWNLOAD_SOURMASH_DB } from './../modules/helper/stage_file' include { STAGE_FILE as DOWNLOAD_SOURMASH_NR_DB } from './../modules/helper/stage_file' @@ -73,11 +72,6 @@ workflow BUILD_REFERENCES { confindr_db_url ) - /* - Install cgMLST schemas - */ - PYMLST_WGMLST_INSTALL() - /* Install Chewbbaca schemas based on schema ID */ diff --git a/workflows/gabi.nf b/workflows/gabi.nf index b31c389..7c00b0c 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -67,6 +67,7 @@ if (params.input) { amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? file(params.references['kraken2'].db, checkIfExists:true) : [] + // Sourmash DB choice - either the full thing or a smaller "nr" one to speed up searches at the cost of some precision if (params.fast_ref) { sourmashdb = params.reference_base ? 
file(params.references['sourmashdb_nr'].db, checkIfExists:true) : [] } else { @@ -303,7 +304,7 @@ workflow GABI { RENAME_PLASMID_CTG( PLASMIDS.out.chromosome, - 'fasta' + 'chromosomes.fasta' ) ch_assembly_without_plasmids = RENAME_PLASMID_CTG.out @@ -314,7 +315,6 @@ workflow GABI { errors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - VARIANTS( ch_illumina_trimmed.map { m,r -> tuple(m.sample_id,m,r) @@ -366,7 +366,6 @@ workflow GABI { SUB: Perform serotyping of assemblies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - SEROTYPING( ch_assemblies_without_plasmids_with_taxa ) From 6fa60bd36c9e2f15233efe9356f7f9b972277853 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 13:54:06 +0100 Subject: [PATCH 06/14] Adding ecoli specific abricate step and redirecting sourmash sketch output --- bin/gabi.py | 2 +- conf/modules.config | 24 ++++++++++++++---------- subworkflows/amr_profiling/main.nf | 20 +++++++++++++++++++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 7b08f3d..50afcac 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -326,7 +326,7 @@ def main(yaml, template, output, reference): # Draw the Kraken abundance table kdata = pd.DataFrame(data=kraken_data_all, index=samples) plot_labels = {"index": "Samples", "value": "Percentage"} - h = len(samples)*20 if len(samples) > 10 else 400 + h = len(samples)*25 if len(samples) > 10 else 450 fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h) data["Kraken"] = fig.to_html(full_html=False) diff --git a/conf/modules.config b/conf/modules.config index d5c8036..77b7039 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -65,15 +65,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: SHOVILL { - ext.args = "--assembler ${params.shovill_assembler} --minlen ${params.shovill_contig_minlen}" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/shovill" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: PROKKA { ext.args = "--force" publishDir = [ @@ -237,6 +228,19 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: ABRICATE_RUN_ECOLI_VIRULENCE { + ext.args = [ + "--db ecoli_vf", + "--minid ${params.arg_abricate_minid}", + "--mincov ${params.arg_abricate_mincov}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/ecoli_vf" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: HAMRONIZATION_ABRICATE { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate" }, @@ -414,7 +418,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: SOURMASH_SEARCH { + withName: 'SOURMASH_SEARCH|SOURMASH_SKETCH' { ext.args = "--best-only" publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index d9c2ee4..25c8f0a 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -7,6 +7,8 @@ include { HAMRONIZATION_AMRFINDERPLUS } from './../../modules/hamronization/ include { HAMRONIZATION_ABRICATE } from './../../modules/hamronization/abricate' include { HAMRONIZATION_SUMMARIZE } from './../../modules/hamronization/summarize' include { ABRICATE_RUN } from './../../modules/abricate/run' +include { ABRICATE_RUN as ABRICATE_RUN_ECOLI_VIRULENCE } from './../../modules/abricate/run' + ch_versions = Channel.from([]) multiqc_files = Channel.from([]) @@ -19,6 +21,12 @@ workflow AMR_PROFILING { main: + assembly.branch { m, a -> + ecoli: m.taxon ==~ /^Escherichia.*/ + salmonella: m.taxon ==~ /^Salmonella.*/ + listeria: m.taxon ==~ /^Listeria.*/ + }.set { assembly_by_taxon } + /* Run AMRFinderPlus and make JSON report */ @@ -55,8 +63,18 @@ workflow AMR_PROFILING { ) ch_versions = ch_versions.mix(ABRICATE_RUN.out.versions) + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Taxon-specific abricate analyses + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + // E. 
coli + ABRICATE_RUN_ECOLI_VIRULENCE( + assembly_by_taxon.ecoli + ) + ch_versions = ch_versions.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.versions) + + // Join basic Abricate results HAMRONIZATION_ABRICATE( - ABRICATE_RUN.out.report, + ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report), 'json', '1.0.1', '2021-Mar-27' From c528eb25b6389443b969a0aa8048cfc168266880 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 12 Dec 2024 14:19:26 +0100 Subject: [PATCH 07/14] Refactoring module configs --- conf/modules.config | 261 +-------------------- conf/modules/assembly_qc.config | 26 ++ conf/modules/installation.config | 8 + conf/modules/{qc.config => read_qc.config} | 27 +++ nextflow.config | 3 +- 5 files changed, 71 insertions(+), 254 deletions(-) create mode 100644 conf/modules/assembly_qc.config rename conf/modules/{qc.config => read_qc.config} (78%) diff --git a/conf/modules.config b/conf/modules.config index 77b7039..6367440 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -6,41 +6,7 @@ process { enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: 'MULTIQC|GABI_REPORT' { - publishDir = [ - path: { "${params.outdir}/reports" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MULTIQC_ILLUMINA { - ext.prefix = "multiqc_illumina" - publishDir = [ - path: { "${params.outdir}/reports/Illumina" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MULTIQC_NANOPORE { - ext.prefix = "multiqc_nanopore" - publishDir = [ - path: { "${params.outdir}/reports/Nanopore" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: MULTIQC_PACBIO { - ext.prefix = "multiqc_pacbio" - publishDir = [ - path: { "${params.outdir}/reports/Pacbio" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: GABI_SUMMARY { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/" }, @@ -49,22 +15,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: QUAST { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/quast" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MUMMER2CIRCOS { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/plots" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: PROKKA { ext.args = "--force" publishDir = [ @@ -82,14 +32,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: BUSCO_BUSCO { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/busco" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: KRAKEN2_KRAKEN2 { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/taxonomy/kraken2" }, @@ -130,31 +72,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'CONFINDR' { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: 'CONFINDR2MQC|CONFINDR2MQC_SUMMARY' { - publishDir = [ - path: { "${params.outdir}/qc" }, - mode: params.publish_dir_mode, - enabled: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CONFINDR2JSON' { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/confindr" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: 'ECTYPER|SEQSERO2|LISSERO|SISTR|STECFINDER' { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/serotype/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, @@ -183,14 +100,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: FASTQC { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastqc" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: AMRFINDERPLUS_RUN { ext.args = [ "--ident_min ${params.arg_amrfinderplus_identmin}", @@ -266,36 +175,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: PORECHOP_ABI { - ext.args = "--abi" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/porechop" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CHOPPER { - ext.args2 = [ - "-l ${params.ont_min_length}", - params.ont_min_q ? "-q ${params.ont_min_q}" : "" - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/chopper" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - - } - withName: NANOPLOT { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/nanoplot" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } withName: CAT_FASTQ { publishDir = [ path: { "${params.outdir}/cat" }, @@ -303,33 +182,6 @@ process { enabled: false ] } - withName: BIOBLOOM_CATEGORIZER { - ext.args = "-g -n --fq" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/biobloom" }, - mode: params.publish_dir_mode, - enabled: false - ] - } - withName: RASUSA { - ext.args = [ - "--genome-size ${params.genome_size}", - "--coverage ${params.max_coverage}" - ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/rasusa" }, - mode: params.publish_dir_mode, - enabled: false - ] - } - withName: FASTP { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastp" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.contains('.json') ? filename : null } - ] - } withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/custom" }, @@ -337,13 +189,6 @@ process { enabled: false ] } - withName: DRAGONFLYE { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/dragonflye" }, - mode: params.publish_dir_mode, - enabled: true - ] - } withName: MOBSUITE_RECON { publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/plasmids" }, @@ -352,15 +197,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: FLYE { - ext.args = "--plasmids --pacbio-hifi" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/assembly/flye/" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } + withName: AMRFINDERPLUS_INSTALL { publishDir = [ path: { "${params.reference_base}/gabi/${params.reference_version}" }, @@ -369,56 +206,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: BUSCO_INSTALL { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/busco" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: KRAKEN2_DOWNLOAD { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/kraken2" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: DOWNLOAD_SOURMASH_DB { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/sourmashdb" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CHEWBBACA_DOWNLOADSCHEMA { - ext.args = "--latest" - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/chewbbaca" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CONFINDR_INSTALL { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: MLST { - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/mlst" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - withName: 'SOURMASH_SEARCH|SOURMASH_SKETCH' { + withName: 'SOURMASH_SEARCH' { ext.args = "--best-only" publishDir = [ path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, @@ -427,48 +215,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'CHEWBBACA_ALLELECALLEVALUATOR' { - - publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_JOINPROFILES' { - ext.args = "--common" - publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/samples/${meta.sample_id}/joinprofiles" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_ALLELECALL_SINGLE' { - ext.args = "--no-inferred" - publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/chewbbaca" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: BIOBLOOM_MAKER { - publishDir = [ - path: { "${params.reference_base}/gabi/${params.reference_version}/biobloom" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CHEWBBACA_ALLELECALL' { - ext.args = "--no-inferred" + withName: 'SOURMASH_SKETCH' { + ext.args = "dna" publishDir = [ - path: { "${params.outdir}/cgMLST/chewbbaca/${meta.sample_id}" }, + path: { "${params.outdir}/samples/${meta.sample_id}/sourmash" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + + } diff --git a/conf/modules/assembly_qc.config b/conf/modules/assembly_qc.config new file mode 100644 index 0000000..32ea5dc --- /dev/null +++ b/conf/modules/assembly_qc.config @@ -0,0 +1,26 @@ +process { + withName: BUSCO_BUSCO { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/busco" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: QUAST { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/assembly/quast" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: MUMMER2CIRCOS { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/plots" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } +} \ No newline at end of file diff --git a/conf/modules/installation.config b/conf/modules/installation.config index 64dacb4..864ee89 100644 --- a/conf/modules/installation.config +++ b/conf/modules/installation.config @@ -49,5 +49,13 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: BIOBLOOM_MAKER { + publishDir = [ + path: { "${params.reference_base}/gabi/${params.reference_version}/biobloom" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } \ No newline at end of file diff --git a/conf/modules/qc.config b/conf/modules/read_qc.config similarity index 78% rename from conf/modules/qc.config rename to conf/modules/read_qc.config index 983467d..bd49a2e 100644 --- a/conf/modules/qc.config +++ b/conf/modules/read_qc.config @@ -88,5 +88,32 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + withName: FASTP { + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/qc/fastp" }, + mode: params.publish_dir_mode, + enabled: true, + saveAs: { filename -> filename.contains('.json') ? filename : null } + ] + } + withName: RASUSA { + ext.args = [ + "--genome-size ${params.genome_size}", + "--coverage ${params.max_coverage}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/rasusa" }, + mode: params.publish_dir_mode, + enabled: false + ] + } + withName: BIOBLOOM_CATEGORIZER { + ext.args = "-g -n --fq" + publishDir = [ + path: { "${params.outdir}/samples/${meta.sample_id}/biobloom" }, + mode: params.publish_dir_mode, + enabled: false + ] + } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 35fadc9..54704bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -117,7 +117,8 @@ dag { includeConfig 'conf/modules.config' includeConfig 'conf/modules/assembly.config' includeConfig 'conf/modules/mlst.config' -includeConfig 'conf/modules/qc.config' +includeConfig 'conf/modules/read_qc.config' +includeConfig 'conf/modules/assembly_qc.config' includeConfig 'conf/modules/installation.config' // Load centrally stored profiles From 44c731c906886192455c70562b4def65b53316b0 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Fri, 3 Jan 2025 09:45:52 +0100 Subject: [PATCH 08/14] Fixing mishandled confindr results for certain edge cases --- bin/gabi.py | 43 ++++++++++++++++++++++++++----------------- conf/resources.config | 13 ------------- docs/usage.md | 12 ++++++------ 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 50afcac..a29719c 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -83,22 +83,31 @@ def main(yaml, template, output, reference): for read in set: if read["ContamStatus"] == "True": contaminated = True - if (read["PercentContam"] == "ND"): - perc = "ND" - this_status = status["missing"] - else: - perc = float(read["PercentContam"]) - - 
if (perc > contaminated): - contaminated = perc - if (perc >= 10.0): - confindr_status = status["fail"] + if "PercentContam" in read: + if (read["PercentContam"] == "ND"): + perc = "ND" + if ":" in read["Genus"]: + perc = read["Genus"] this_status = status["fail"] - elif (perc > 0.0 and confindr_status == status["pass"]): - confindr_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + confindr_status = status["fail"] + contaminated = perc + else: + perc = float(read["PercentContam"]) + + if (perc > contaminated): + contaminated = perc + + if (perc >= 10.0): + confindr_status = status["fail"] + this_status = status["fail"] + elif (perc > 0.0 and confindr_status == status["pass"]): + confindr_status = status["warn"] + if (this_status == status["pass"]): + this_status = status["warn"] + else: + contaminated = "ND" + confindr_status = status["warn"] # All the relevant values and optional status classes sample = jdata["sample"] @@ -113,9 +122,9 @@ def main(yaml, template, output, reference): if "kraken" in jdata: taxon_perc = float(jdata["kraken"][0]["percentage"]) - if taxon_perc >= 80.0: + if taxon_perc >= 90.0: taxon_status = status["pass"] - elif taxon_perc >= 60.0: + elif taxon_perc >= 70.0: taxon_status = status["warn"] else: taxon_status = status["fail"] @@ -130,7 +139,7 @@ def main(yaml, template, output, reference): kraken_results[this_taxon] = tperc - if (tperc > 10.0): + if (tperc > 5.0): taxon_count += 1 kraken_data_all.append(kraken_results) diff --git a/conf/resources.config b/conf/resources.config index b3a160a..b81b19a 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -179,19 +179,6 @@ params { } - cgmlst { - escherichia = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/escherichia" - listeria_monocytogenes = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/listeria_monocytogenes" - klebsiella_pneumoniae = 
"${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/klebsiella_pneumoniae" - staphylococcus_aureus = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/staphylococcus_aureus" - acinetobacter_baumannii = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/acinetobacter_baumannii" - salmonella_enterica = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/salmonella_enterica" - campylobacter = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/campylobacter" - clostridium_perfringens = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/clostridium_perfringens" - streptococcus_pyogenes = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/streptococcus_pyogenes" - klebsiella_oxytoca = "${params.reference_base}/gabi/${params.reference_version}/mlst/cgmlst_db/klebsiella_oxytoca" - } - chewbbaca { streptococcus_pyogenes = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_1/Streptococcus_pyogenes_wgMLST" acinetobacter_baumannii = "${params.reference_base}/gabi/${params.reference_version}/chewbbaca/schema_2/Acinetobacter_baumannii_cgMLSTRidom" diff --git a/docs/usage.md b/docs/usage.md index cd931c8..0a98b77 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -116,7 +116,7 @@ This option is only used when installing the pipelines references as described [ ### `--fast_ref` [ default = false ] -By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. 
Note that some of the Quast metrics will notably deteriorate. +By default, Gabi uses a comprehensive reference database to identify the best reference match per assembly. This can take a substantial amount of time, depending on completeness of the assembly and hardware. If you do not care about the best reference, but are happy with a "close enough" inference to get the correct species only, you can set this option to true. This will then run a reduced version of the database with a focus on covering relevant taxonomic groups at a much less dense sampling. Note that some of the Quast metrics may notably deteriorate as you are no longer guaranteed to get the closest possible match. ### `--run_name` [ default = null] @@ -128,13 +128,13 @@ This option should point to the base directory in which you have installed the p ### `--onthq` [ default = false ] -Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default.. +Set this option to true if you believe your ONT data to be of "high quality" (much of the reads >= Q20). This is typically the case for data generated with chemistry version 10.4.1 or later, preferably using a ligation protocol. This option is set to false by default. ### `--ont_min_q` [ default = 10 ] -Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data. +Discard nanopore reads below this mean quality. ONT sequencing will produce a spread of qualities, typically ranging from Q10 to Q30 (the higher, the better). 
This option is mostly useful if you have sequenced at sufficient depth to be able to tolerate removable of some of the data in favor of higher quality reads. -### `--ont_min_length` [ default = 5000 ] +### `--ont_min_length` [ default = 1000 ] Discard nanopore reads below this length. Depending on your DNA extraction and/or library preparation, you will see a range of sequence lengths. If you have sequenced at sufficient depths, you may decide to discard shorter reads to improve your assembly contiguity. However, please note that discarding shorter reads may essentially throw away very short plasmids (which can be as short as ~1kb). @@ -148,7 +148,7 @@ A local version of the ConfindR rMLST database, available [here](https://olc-bio ### `--genome_size` [ default = null ] -If enabled, this is the assumed genome size against which the coverage is measured for downsampling the raeds (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. +If enabled, this is the assumed genome size against which the coverage is measured for downsampling the reads (e.g. '5Mb'). Since this pipeline supports processing of diverse species in parallel, you may wish to set this to a size that works across all expected taxa, like '6Mb'. The reads will then be downsampled to the desired max coverage, given the genome size. ### `--max_coverage` [ default = '100x'] @@ -168,7 +168,7 @@ If you analyse a single species and wish to optimize the quality of the genome a ### `--remove_host` [ default = false ] -This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). 
If you use another kind of medium and require decontamination, please open an isse and we will consider adding it. +This option will perform filtering of short reads against a built-in reference (currently: horse) to remove any host contamination from the data. This option was found to be useful for Campylobacter, which is often grown in blood medium (in our case: horse). If you use another kind of medium and require decontamination, please open an issue and we will consider adding it. ### `--skip_failed` [ default = false ] From e8de5142e6ebb84177b26b7dc9c614df1ede68af Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Tue, 7 Jan 2025 13:21:03 +0100 Subject: [PATCH 09/14] Adding run infos to html report as well as ONT metrics --- assets/gabi_template.html | 85 ++++++++++++++++++++---------- bin/gabi.py | 84 +++++++++++++++++++++-------- bin/gabi_summary.pl | 44 ++++++++++++++++ conf/modules.config | 8 +++ docs/usage.md | 19 +++++-- modules/fastp/main.nf | 2 +- modules/helper/gabi_report/main.nf | 7 +++ modules/input_check.nf | 5 ++ nextflow.config | 1 + subworkflows/qc/main.nf | 2 + subworkflows/qc_illumina/main.nf | 11 ++-- subworkflows/qc_nanopore/main.nf | 7 +-- workflows/gabi.nf | 14 ++--- 13 files changed, 222 insertions(+), 67 deletions(-) diff --git a/assets/gabi_template.html b/assets/gabi_template.html index dfa17e0..7888318 100644 --- a/assets/gabi_template.html +++ b/assets/gabi_template.html @@ -16,7 +16,7 @@ .general { border-style: none; border-spacing: 0px ;} .table-caption { font-weight: bold; display: block;} table { border-collapse: collapse;} - tr.row { border-bottom: 1px solid grey;} + tr.row td { border-bottom: 1px solid grey; padding-top: 5px; padding-bottom: 5px;} td { padding-left: 5px; padding-right: 5px;} th { border-left: 1px solid white; padding-right: 5px;} tr td[scope="sample-id"] {background-color: rgb(233, 233, 233); font-weight: bold;} @@ -61,10 +61,35 @@

{% if Kraken %} Kraken2 {% endif %} - Serotypes + {% if serotypes %} + Serotypes + {% endif %} Software + + +

Run Infos

+ + + + + + + + + + + + + + + + +
User{{user}}
Date{{date}}
Pipeline version{{version}}
Command line call{{call}}
Work directory{{wd}}
+ @@ -73,11 +98,11 @@

Summary

Sample
StatusThe overall analysis status: pass: ok to use, warn: potential issues found, fail: most probably not usable
-
Best-guess taxonThe highest scoring taxon in the Kraken2 analysis - green: robust call, orange: weak call, red: very weak call
+
Best-guess taxonThe highest scoring taxon using kmer matching (S/MASH)
Reference genomeThe highest matching hit in RefSeq to this assembly
AssemblyInformation about this assembly
Mean coverageMean coverage of reads mapped back to the assembly - bigger is better
-
Mean insert sizeThe mean insert size as determined from mapped reads
+
Read qualityQuality metrics of reads after trimming
ContaminationIndicators of contamination
@@ -90,12 +115,14 @@

Summary

#ContigsThe number of chromosomal contigs, i.e. without plasmids.
N50 (Kb)The size of contigs (>=)in which 50% of the assembly are represented.
Gene space (%)The fraction of broadly conserved genes fully covered in this assembly (BUSCO).
- GC (%) +
GC (%)GC content of the assembly. Deviations from the species default are highlighted in orange (mild) and red (strong, something likely wrong)
Total - Illumina + ILM ONT HiFi - Illumina +
ILM Q30 (%)Fraction of Illumina reads above Q30.
+
ONT Q15 (#)Number of ONT reads above Q15.
+
ONT N50 (bp)N50 of ONT reads
Confindr (%) Taxa >10% @@ -108,7 +135,7 @@

Summary

{{row.sample}} {{row.status}} {{row.taxon}} - {{row.reference.assembly}}
{{row.reference.definition}}
+ {{row.reference.assembly}} {{row.fraction}} {{row.assembly}} {{row.contigs}} @@ -119,7 +146,9 @@

Summary

{{row.coverage_illumina}} {{row.coverage_nanopore}} {{row.coverage_pacbio}} - {{row.samtools.mean_insert_size}} + {{row.quality_illumina}} + {{row.quality_nanopore}} + {{row.nanopore_n50}} {{row.contamination}} {{row.taxon_count}} @@ -216,28 +245,30 @@

Kraken2 - taxonomic composition

-
-

Serotyping

+{% if serotypes %} +
+

Serotyping

-{% for stool,stypes in serotypes.items() %} -
{{stool}}
- - - - - - {% for stype in stypes %} + {% for stool,stypes in serotypes.items() %} +
{{stool}}
+
SampleSerotype
- - + + - {% endfor %} -
{{stype.sample}}{{stype.serotype}}SampleSerotype
-

-{% endfor %} + {% for stype in stypes %} + + {{stype.sample}} + {{stype.serotype}} + + {% endfor %} + +

+ {% endfor %} -

-top +

+ top +{% endif %}

Assembly metrics

+
+ Descriptive metrics of individual assemblies determined by Quast. +

+
@@ -196,10 +203,19 @@

Assembly metrics

top + {% if Insertsizes %}

Insert size distribution (Illumina)

+
+ Insert size refers to the size of the sequenced DNA fragment. Depending on the exact library protocol, this size will fall fairly uniformly around a mean value (~300-500bp). + For Illumina data, that value should typically be (slightly) larger than the combined length of forward and reverse read for optimal data yield. Very flat curves may (depending on the protocol!) + indicate a failure during fragment size selection/enrichment. Neither small insert sizes nor flat curves are a clear predictor for subsequent assembly issues, but can inform any potential debugging efforts. +
+

{{Insertsizes}} @@ -207,12 +223,55 @@

Insert size distribution (Illumina)

top {% endif %} + +
+

BUSCO scores

+ +
+ BUSCO scores describe the coverage of the assemblied gene space against a set of broadly conserved singleton genes (here: bacteria_odb10). A perfect assembly should + have a complete coverage of the gene space (complete: 100%), without any fragmentation or, worse, duplication. A high value of duplication may indicate assembly errors or contamination. Some taxa with very + streamlined gene content, such as Campylobacter, will typically have a completeness score of less than 100%. The Completeness estimates may include duplicated genes, so values greater than 100% are + possible (i.e. all genes present, of which x % are duplicated). +
+ +{{Busco}} + +

+top + + +{% if Kraken %} +
+ +

Kraken2 - taxonomic composition

+ +
+ Kraken2 matches kmers from raw sequencing reads against a reference database to determine the taxonomic composition of a read set. For DNA from + pure cultures (which is the focus of GABI), only one species should be identified at dominant proportions. For some taxa, like Campylobacter, several species from the same genus may be found at comparative + abundances due to a lack of sufficient DNA differences. Otherwise, identification of multiple taxa at higher proportions may indicate a contamination issue. +
+ + {{Kraken}} + + top +{% endif %} +

MLST

+
+ Taxa-specific MLST schemas classify assemblies into pre-defined types or groups. Results are divided by typing schema (and consequently taxa). +
+ +

+ {% for scheme,mtypes in mlst.items() %}
Scheme: {{scheme}}
@@ -233,15 +292,6 @@

MLST

top -{% if Kraken %} -
- -

Kraken2 - taxonomic composition

- - {{Kraken}} - - top -{% endif %} @@ -249,6 +299,12 @@

Kraken2 - taxonomic composition

Serotyping

+
+ Serotyes, similar to MLST types, classify assemblies based on a set of predefined gene profiles. +
+ +

+ {% for stool,stypes in serotypes.items() %}
{{stool}}
diff --git a/bin/gabi.py b/bin/gabi.py index 05cb784..2f2e2d7 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -51,6 +51,7 @@ def main(yaml, template, output, reference, version, call, wd): mlst_all = {} insert_sizes_all = {} min_insert_size_length = 1000 + busco_data_all = [] with open(reference) as r: ref_data = json.load(r)["thresholds"] @@ -112,11 +113,9 @@ def main(yaml, template, output, reference, version, call, wd): if (perc >= 10.0): confindr_status = status["fail"] - this_status = status["fail"] elif (perc > 0.0 and confindr_status == status["pass"]): confindr_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + else: contaminated = "ND" confindr_status = status["warn"] @@ -128,7 +127,10 @@ def main(yaml, template, output, reference, version, call, wd): fastp_q30 = "-" fastp_q30_status = status["missing"] + ######################## # Read quality via FastP + ######################## + if "fastp" in jdata: fastp_q30_status = status["pass"] fastp_summary = jdata["fastp"]["summary"] @@ -136,19 +138,22 @@ def main(yaml, template, output, reference, version, call, wd): if fastp_q30 < 0.85: fastp_q30_status = status["warn"] + ########################## # Read stats from NanoStat + ########################## + nanostat_q15 = "-" - #nanostat_q15_status = status["missing"] - #nanostat_mean_read_length = "-" nanostat_read_n50 = "-" if "nanostat" in jdata: nanostat_data = jdata["nanostat"] nanostat_q15 = int(nanostat_data["Q15"]) - nanostat_mean_read_length = nanostat_data["mean_read_length"] + # nanostat_mean_read_length = nanostat_data["mean_read_length"] nanostat_read_n50 = nanostat_data["read_length_n50"] + #################### # Get Kraken results + #################### taxon_status = status["missing"] taxon_count = "-" @@ -181,13 +186,13 @@ def main(yaml, template, output, reference, version, call, wd): if (taxon_count > 3): taxon_count_status = status["fail"] - this_status = status["fail"] elif (taxon_count > 1): 
taxon_count_status = status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] + #################### # Get samtools stats + #################### + samtools = {"mean_insert_size": "-", } if ("samtools" in jdata): insert_size = float(jdata["samtools"]["insert size average"]) @@ -198,7 +203,10 @@ def main(yaml, template, output, reference, version, call, wd): if (len(inserts) < min_insert_size_length): min_insert_size_length = len(inserts) + #################### # Get assembly stats + #################### + assembly = round((int(jdata["quast"]["Total length"])/1000000), 2) assembly_status = check_assembly(this_refs, int(jdata["quast"]["Total length"])) @@ -229,7 +237,10 @@ def main(yaml, template, output, reference, version, call, wd): quast["gc"] = float(jdata["quast"]["GC (%)"]) quast["gc_status"] = check_gc(this_refs, float(jdata["quast"]["GC (%)"])) + ################# # Get serotype(s) + ################# + if "serotype" in jdata: serotypes = jdata["serotype"] for sentry in serotypes: @@ -254,22 +265,35 @@ def main(yaml, template, output, reference, version, call, wd): # Reference genome reference = jdata["reference"] + ############## # Busco scores + ############## + busco = jdata["busco"] busco_status = status["missing"] - busco_completeness = round(((int(busco["C"]))/int(busco["dataset_total_buscos"])), 2)*100 + busco_total= int(busco["dataset_total_buscos"]) + busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100 + busco_fragmented = round((int(busco["F"])/busco_total), 2)*100 + busco_missing = round((int(busco["M"])/busco_total), 2)*100 + busco_duplicated = round((int(busco["D"])/busco_total), 2)*100 busco["completeness"] = busco_completeness + busco_data_all.append({ "Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated }) + if (busco_completeness > 90.0): busco_status = status["pass"] elif (busco_completeness > 80.0): busco_status = 
status["warn"] - if (this_status == status["pass"]): - this_status = status["warn"] else: busco_status = status["fail"] - this_status = status["fail"] + # Warn if there are duplications in the gene set and busco wasnt already failed + if (busco_duplicated > 5.0) & (busco_status != status["fail"]): + busco_status = status["warn"] + + ############## # MLST types + ############## + mlst = jdata["mlst"] for mentry in mlst: @@ -282,7 +306,10 @@ def main(yaml, template, output, reference, version, call, wd): else: mlst_all[scheme_name] = [{"sample": sample, "sequence_type": sequence_type}] + ############## # Get coverage(s) + ############## + coverage = "-" coverage_status = status["missing"] @@ -332,7 +359,28 @@ def main(yaml, template, output, reference, version, call, wd): else: coverage_pacbio_status = status["fail"] + ###################################### + # Set the overall status of the sample + ###################################### + + # The metrics that by themselves determine overall status: + for estatus in [ confindr_status, taxon_count_status, assembly_status ]: + # if any one metric failed, the whole sample failed + if estatus == status["fail"]: + this_status = estatus + # if a metric is dubious, the entire sample is dubious, unless it already failed or warned + elif (estatus == status["warn"]) & (this_status == status["pass"]): + this_status = estatus + + # The other metrics should at most warn, but never fail the sample + for estatus in [ busco_status, contigs_status ]: + if (estatus != status["missing"]) & (this_status != status["fail"]) & (estatus != status["pass"]): + this_status = status["warn"] + + ######################### # sample-level dictionary + ######################### + rtable = { "sample": sample, "reference": reference, @@ -371,11 +419,15 @@ def main(yaml, template, output, reference, version, call, wd): data["summary"].append(rtable) + ############# + # Plots + ############# + if "kraken" in jdata: # Draw the Kraken abundance table 
kdata = pd.DataFrame(data=kraken_data_all, index=samples) plot_labels = {"index": "Samples", "value": "Percentage"} - h = len(samples)*25 if len(samples) > 10 else 550 + h = len(samples)*25 if len(samples) > 10 else 300 fig = px.bar(kdata, orientation='h', labels=plot_labels, height=h) data["Kraken"] = fig.to_html(full_html=False) @@ -392,11 +444,23 @@ def main(yaml, template, output, reference, version, call, wd): hfig = px.line(hdata, labels=plot_labels) data["Insertsizes"] = hfig.to_html(full_html=False) + + if busco_data_all: + # Draw the busco stats graph + bdata = pd.DataFrame(data=busco_data_all, index=samples) + plot_labels = { "index": "Samples", "value": "Percentage"} + h = len(samples)*25 if len(samples) > 10 else 300 + fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h) + data["Busco"] = fig.to_html(full_html=False) + data["serotypes"] = serotypes_all data["mlst"] = mlst_all + ############################## # Parse the versions YAML file + ############################## + software = {} current_module = "" rmod = re.compile('^[A-Za-z0.*/]') @@ -412,6 +476,10 @@ def main(yaml, template, output, reference, version, call, wd): data["packages"] = software + ######################## + # Render Jinja2 template + ######################## + with open(output, "w", encoding="utf-8") as output_file: with open(template) as template_file: j2_template = Template(template_file.read()) diff --git a/subworkflows/coverage/main.nf b/subworkflows/coverage/main.nf index 86167f7..f23b22d 100644 --- a/subworkflows/coverage/main.nf +++ b/subworkflows/coverage/main.nf @@ -112,6 +112,7 @@ workflow COVERAGE { emit: versions = ch_versions report = MOSDEPTH.out.global_txt + bam = SAMTOOLS_INDEX.out.bam bam_stats = SAMTOOLS_STATS.out.stats summary = MOSDEPTH.out.summary_txt summary_by_platform = ch_summary_by_platform From d47c6daacd8e98c19f8f00aa46236f600ee48f1b Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 9 Jan 2025 13:54:41 +0100 Subject: [PATCH 12/14] 
Abricate now runs on several databases in parallel --- bin/gabi.py | 16 ++++++++-------- conf/modules.config | 6 ++---- modules/abricate/run/main.nf | 7 ++++--- nextflow.config | 2 +- subworkflows/amr_profiling/main.nf | 11 +++++++---- workflows/gabi.nf | 4 +++- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/bin/gabi.py b/bin/gabi.py index 2f2e2d7..18ef7fc 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -5,6 +5,7 @@ import pandas as pd import os import json +import getpass import re import argparse @@ -36,7 +37,7 @@ def main(yaml, template, output, reference, version, call, wd): data = {} - data["user"] = os.getlogin() + data["user"] = getpass.getuser() data["date"] = datetime.datetime.now() data["version"] = version data["call"] = call @@ -271,13 +272,13 @@ def main(yaml, template, output, reference, version, call, wd): busco = jdata["busco"] busco_status = status["missing"] - busco_total= int(busco["dataset_total_buscos"]) + busco_total = int(busco["dataset_total_buscos"]) busco_completeness = round(((int(busco["C"]))/int(busco_total)), 2)*100 busco_fragmented = round((int(busco["F"])/busco_total), 2)*100 busco_missing = round((int(busco["M"])/busco_total), 2)*100 busco_duplicated = round((int(busco["D"])/busco_total), 2)*100 busco["completeness"] = busco_completeness - busco_data_all.append({ "Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated }) + busco_data_all.append({"Complete": busco_completeness, "Missing": busco_missing, "Fragmented": busco_fragmented, "Duplicated": busco_duplicated}) if (busco_completeness > 90.0): busco_status = status["pass"] @@ -364,16 +365,16 @@ def main(yaml, template, output, reference, version, call, wd): ###################################### # The metrics that by themselves determine overall status: - for estatus in [ confindr_status, taxon_count_status, assembly_status ]: + for estatus in [confindr_status, taxon_count_status, assembly_status]: # 
if any one metric failed, the whole sample failed if estatus == status["fail"]: - this_status = estatus + this_status = estatus # if a metric is dubious, the entire sample is dubious, unless it already failed or warned elif (estatus == status["warn"]) & (this_status == status["pass"]): this_status = estatus # The other metrics should at most warn, but never fail the sample - for estatus in [ busco_status, contigs_status ]: + for estatus in [busco_status, contigs_status]: if (estatus != status["missing"]) & (this_status != status["fail"]) & (estatus != status["pass"]): this_status = status["warn"] @@ -444,7 +445,6 @@ def main(yaml, template, output, reference, version, call, wd): hfig = px.line(hdata, labels=plot_labels) data["Insertsizes"] = hfig.to_html(full_html=False) - if busco_data_all: # Draw the busco stats graph bdata = pd.DataFrame(data=busco_data_all, index=samples) @@ -559,4 +559,4 @@ def check_gc(refs, query): if __name__ == '__main__': - main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) + main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 80a6411..670f876 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -134,12 +134,11 @@ process { } withName: ABRICATE_RUN { ext.args = [ - "--db ${params.arg_abricate_db}", "--minid ${params.arg_abricate_minid}", "--mincov ${params.arg_abricate_mincov}" ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate" }, + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/${db}" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } @@ -147,12 +146,11 @@ process { } withName: ABRICATE_RUN_ECOLI_VIRULENCE { ext.args = [ - "--db ecoli_vf", "--minid ${params.arg_abricate_minid}", "--mincov ${params.arg_abricate_mincov}" ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/ecoli_vf" }, + path: { "${params.outdir}/samples/${meta.sample_id}/amr/abricate/${db}" }, mode: params.publish_dir_mode, enabled: true, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } diff --git a/modules/abricate/run/main.nf b/modules/abricate/run/main.nf index b2b9e2e..54ec7cf 100644 --- a/modules/abricate/run/main.nf +++ b/modules/abricate/run/main.nf @@ -1,5 +1,5 @@ process ABRICATE_RUN { - tag "$meta.sample_id" + tag "${meta.sample_id}|$db" label 'short_serial' conda "${moduleDir}/environment.yml" @@ -8,7 +8,7 @@ process ABRICATE_RUN { 'quay.io/biocontainers/abricate:1.0.1--ha8f3691_1' }" input: - tuple val(meta), path(assembly) + tuple val(meta), path(assembly), val(db) output: tuple val(meta), path('*.txt'), emit: report @@ -19,12 +19,13 @@ process ABRICATE_RUN { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" + def prefix = task.ext.prefix ?: "${meta.sample_id}.${db}" """ abricate \\ $args \\ --threads $task.cpus \\ $assembly \\ + --db $db \\ > ${prefix}.txt cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index e684aaa..7003f97 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,9 +23,9 @@ params { arg_hamronization_summarizeformat = 'tsv' - arg_abricate_db = 'vfdb' arg_abricate_minid = 80 arg_abricate_mincov = 80 + abricate_dbs = ['vfdb', 'resfinder', 'argannot', 'card', 'megares'] busco_lineage = "bacteria" busco_db_path = null diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index 25c8f0a..aa56b1d 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -17,7 +17,8 @@ 
ch_hamronization_input = Channel.from([]) workflow AMR_PROFILING { take: assembly - db + db // The AMRfinder database to run + abricate_dbs // A list of abricate databases to run (should be generic!) main: @@ -58,17 +59,19 @@ workflow AMR_PROFILING { /* Run Abricate and make JSON report */ + + assembly_with_db = assembly.combine(abricate_dbs) ABRICATE_RUN( - assembly + assembly_with_db ) ch_versions = ch_versions.mix(ABRICATE_RUN.out.versions) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Taxon-specific abricate analyses ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - // E. coli + // E. coli - here we use a specific database! ABRICATE_RUN_ECOLI_VIRULENCE( - assembly_by_taxon.ecoli + assembly_by_taxon.ecoli.map { m,a -> [ m, a, 'ecoli_vf']} ) ch_versions = ch_versions.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.versions) diff --git a/workflows/gabi.nf b/workflows/gabi.nf index f21a418..b5ea684 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -64,6 +64,7 @@ if (params.input) { ch_prokka_proteins = params.prokka_proteins ? Channel.fromPath(params.prokka_proteins, checkIfExists: true).collect() : [] ch_prokka_prodigal = params.prokka_prodigal ? Channel.fromPath(params.prokka_prodigal, checkIfExists:true).collect() : [] + abricate_dbs = Channel.from(params.abricate_dbs) amrfinder_db = params.reference_base ? file(params.references['amrfinderdb'].db, checkIfExists:true) : [] kraken2_db = params.reference_base ? 
file(params.references['kraken2'].db, checkIfExists:true) : [] @@ -416,7 +417,8 @@ workflow GABI { if (!params.skip_amr) { AMR_PROFILING( ch_assemblies_clean, - amrfinder_db + amrfinder_db, + abricate_dbs ) ch_versions = ch_versions.mix(AMR_PROFILING.out.versions) ch_report = ch_report.mix(AMR_PROFILING.out.amrfinder_report) From 04ea662a00ecc8ea6cd65b3f5bf5cb6b77694c04 Mon Sep 17 00:00:00 2001 From: Marc Hoeppner Date: Thu, 9 Jan 2025 14:03:43 +0100 Subject: [PATCH 13/14] Adding missing taxonomic information to AMR input --- subworkflows/amr_profiling/main.nf | 4 ++-- workflows/gabi.nf | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index aa56b1d..8130322 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -21,8 +21,8 @@ workflow AMR_PROFILING { abricate_dbs // A list of abricate databases to run (should be generic!) main: - - assembly.branch { m, a -> + + assembly.branch { m, a -> ecoli: m.taxon ==~ /^Escherichia.*/ salmonella: m.taxon ==~ /^Salmonella.*/ listeria: m.taxon ==~ /^Listeria.*/ diff --git a/workflows/gabi.nf b/workflows/gabi.nf index b5ea684..f726d82 100644 --- a/workflows/gabi.nf +++ b/workflows/gabi.nf @@ -362,6 +362,15 @@ workflow GABI { tuple(m,s) }.set { ch_assemblies_without_plasmids_with_taxa } + // as well as a channel with the clean assembly and taxon information + ch_assemblies_clean.map {m,s -> + tuple(m.sample_id, s) + }.join( + FIND_REFERENCES.out.reference.map { m, r, g, k -> + tuple(m.sample_id, m) + } + ).map { m,s,n -> tuple(n,s) } + .set { ch_assemblies_clean_with_taxa } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUB: Perform serotyping of assemblies @@ -416,7 +425,7 @@ workflow GABI { if (!params.skip_amr) { AMR_PROFILING( - ch_assemblies_clean, + ch_assemblies_clean_with_taxa, amrfinder_db, abricate_dbs ) From daff863c8cdcbb0028725ec1bd97eb48bb3e0edc Mon Sep 17 00:00:00 2001 From: Marc 
Hoeppner Date: Fri, 10 Jan 2025 07:26:18 +0100 Subject: [PATCH 14/14] Merging abricate databases per sample into one json --- assets/test/samples.csv | 2 +- bin/gabi.py | 4 ++-- modules/hamronization/abricate/main.nf | 8 ++++---- nextflow.config | 2 +- subworkflows/amr_profiling/main.nf | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/assets/test/samples.csv b/assets/test/samples.csv index 691872d..2b22e3a 100644 --- a/assets/test/samples.csv +++ b/assets/test/samples.csv @@ -1,2 +1,2 @@ sample_id,platform,single_end,R1,R2 -ERR1008684,ILLUMINA,false,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR100/004/ERR1008684/ERR1008684_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR100/004/ERR1008684/ERR1008684_2.fastq.gz +SAMEA2707761,ILLUMINA,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR580/ERR580964/ERR580964_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR580/ERR580964/ERR580964_2.fastq.gz diff --git a/bin/gabi.py b/bin/gabi.py index 18ef7fc..669caa5 100755 --- a/bin/gabi.py +++ b/bin/gabi.py @@ -448,7 +448,7 @@ def main(yaml, template, output, reference, version, call, wd): if busco_data_all: # Draw the busco stats graph bdata = pd.DataFrame(data=busco_data_all, index=samples) - plot_labels = { "index": "Samples", "value": "Percentage"} + plot_labels = {"index": "Samples", "value": "Percentage"} h = len(samples)*25 if len(samples) > 10 else 300 fig = px.bar(bdata, orientation='h', labels=plot_labels, height=h) data["Busco"] = fig.to_html(full_html=False) @@ -559,4 +559,4 @@ def check_gc(refs, query): if __name__ == '__main__': - main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) \ No newline at end of file + main(args.input, args.template, args.output, args.references, args.version, args.call, args.wd) diff --git a/modules/hamronization/abricate/main.nf b/modules/hamronization/abricate/main.nf index 186853d..09ede0d 100644 --- a/modules/hamronization/abricate/main.nf +++ b/modules/hamronization/abricate/main.nf @@ -8,7 
+8,7 @@ process HAMRONIZATION_ABRICATE { 'quay.io/biocontainers/hamronization:1.1.4--pyhdfd78af_0' }" input: - tuple val(meta), path(report) + tuple val(meta), path(reports) val(format) val(software_version) val(reference_db_version) @@ -23,16 +23,16 @@ process HAMRONIZATION_ABRICATE { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample_id}" + def prefix = task.ext.prefix ?: "${report.getBaseName()}" """ hamronize \\ abricate \\ - ${report} \\ + ${reports} \\ $args \\ --format ${format} \\ --analysis_software_version ${software_version} \\ --reference_database_version ${reference_db_version} \\ - > ${prefix}.${format} + --output ${prefix}.${format} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 7003f97..0a9f63f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,7 +84,7 @@ params { manifest { name = "bio-raum/gabi" - version = "0.9.1" + version = "0.9.2" description = "GABI Pipeline for assembly and profiling of bacterial isolates" author = "Marc Hoeppner" homePage = "https://github.com/bio-raum/gabi" diff --git a/subworkflows/amr_profiling/main.nf b/subworkflows/amr_profiling/main.nf index 8130322..ee857b5 100644 --- a/subworkflows/amr_profiling/main.nf +++ b/subworkflows/amr_profiling/main.nf @@ -77,7 +77,7 @@ workflow AMR_PROFILING { // Join basic Abricate results HAMRONIZATION_ABRICATE( - ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report), + ABRICATE_RUN.out.report.mix(ABRICATE_RUN_ECOLI_VIRULENCE.out.report).groupTuple(), 'json', '1.0.1', '2021-Mar-27'