Refactored the flags, allowing users to override metadata from ENA

- Make MEGAHIT fail if the contigs are empty
- MultiQC ENA metadata report (remove the hardcoded name from the process)
- Added some more unit tests
- Fix the assembler override (only use the single_end auto-selection if the assembler is null)
EBI-Metagenomics · Jun 4, 2024 · eec8ab9 · eec8ab9
1 parent 3efde10
commit eec8ab9
Show file tree

Hide file tree

Showing 13 changed files with 152 additions and 93 deletions.
diff --git a/README.md b/README.md
@@ -21,33 +21,42 @@ This pipeline is still in early development. It's mostly a direct port of the mi
 Pipeline help:
 
 ```bash
-nextflow run ebi-metagenomics/miassembler --help
+Typical pipeline command:
+
+  nextflow run ebi-metagenomics/miassembler --help
 
 Input/output options
-  --study_accession                  [string]  The ENA Study secondary accession
-  --reads_accession                  [string]  The ENA Run primary accession
-  --private_study                    [boolean] To use if the ENA study is private [default: false]
-  --assembler                        [string]  The short reads assembler (accepted: spades, metaspades, megahit) [default: metaspades]
-  --reference_genome                 [string]  The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal
-                                               directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna, honeybee.fna,
-                                               rainbow_trout.fna, rat.fna, ...)
-  --blast_reference_genomes_folder   [string]  The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal
-                                               directory.
-  --bwamem2_reference_genomes_folder [string]  The folder with the reference genome bwa-mem2 indexes, defaults to the Microbiome Informatics internal
-                                               directory.
-  --remove_human_phix                [boolean] Remove human and phiX reads pre assembly, and contigs matching those genomes. [default: true]
-  --human_phix_blast_index_name      [string]  Combined Human and phiX BLAST db. [default: human_phix]
-  --human_phix_bwamem2_index_name    [string]  Combined Human and phiX bwa-mem2 index. [default: human_phix]
-  --min_contig_length                [integer] Minimum contig length filter. [default: 500]
-  --assembly_memory                  [integer] Default memory allocated for the assembly process. [default: 100]
-  --spades_only_assembler            [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true]
-  --outdir                           [string]  The output directory where the results will be saved. You have to use absolute paths to storage on Cloud
-                                               infrastructure.
-  --email                            [string]  Email address for completion summary.
-  --multiqc_title                    [string]  MultiQC report title. Printed as page header, used for filename if not otherwise specified.
+  --study_accession                       [string]  The ENA Study secondary accession
+  --reads_accession                       [string]  The ENA Run primary accession
+  --private_study                         [boolean] To use if the ENA study is private
+  --assembler                             [string]  The short reads assembler (accepted: spades, metaspades, megahit)
+  --single_end                            [boolean] Force the single_end value for the study / reads
+  --library_strategy                      [string]  Force the library_strategy value for the study / reads (accepted: metagenomic, metatranscriptomic,
+                                                    genomic, transcriptomic, other)
+  --library_layout                        [string]  Force the library_layout value for the study / reads (accepted: single, paired)
+  --spades_version                        [string]  SPAdes version. [default: 3.15.5]
+  --megahit_version                       [string]  MEGAHIT version. [default: 1.2.9]
+  --reference_genome                      [string]  The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics
+                                                    internal directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna,
+                                                    honeybee.fna, rainbow_trout.fna, ...)
+  --blast_reference_genomes_folder        [string]  The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal
+                                                    directory.
+  --bwamem2_reference_genomes_folder      [string]  The folder with the reference genome bwa-mem2 indexes, defaults to the Microbiome Informatics internal
+                                                    directory.
+  --remove_human_phix                     [boolean] Remove human and phiX reads pre assembly, and contigs matching those genomes. [default: true]
+  --human_phix_blast_index_name           [string]  Combined Human and phiX BLAST db. [default: human_phix]
+  --human_phix_bwamem2_index_name         [string]  Combined Human and phiX bwa-mem2 index. [default: human_phix]
+  --min_contig_length                     [integer] Minimum contig length filter. [default: 500]
+  --min_contig_length_metatranscriptomics [integer] Minimum contig length filter for metaT. [default: 200]
+  --assembly_memory                       [integer] Default memory allocated for the assembly process. [default: 100]
+  --spades_only_assembler                 [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true]
+  --outdir                                [string]  The output directory where the results will be saved. You have to use absolute paths to storage on Cloud
+                                                    infrastructure. [default: results]
+  --email                                 [string]  Email address for completion summary.
+  --multiqc_title                         [string]  MultiQC report title. Printed as page header, used for filename if not otherwise specified.
 
 Generic options
-  --multiqc_methods_description      [string]  Custom MultiQC yaml file containing HTML including a methods description.
+  --multiqc_methods_description           [string]  Custom MultiQC yaml file containing HTML including a methods description.
 ```
 
 Example:

diff --git a/modules/local/fetchtool_reads.nf b/modules/local/fetchtool_reads.nf
@@ -12,8 +12,8 @@ process FETCHTOOL_READS {
     output:
     tuple val(meta), path("download_folder/${study_accession}/raw/${reads_accession}*.fastq.gz"), env(library_strategy), env(library_layout), emit: reads
     // ENA metadata table; the workflow collects it into 'fetch_tool_mqc.tsv' (the '_mqc' suffix is for MultiQC)
-    tuple val(meta), path("download_folder/${study_accession}/fetch_tool_mqc.tsv")                                     , emit: metadata_tsv
-    path "versions.yml"                                                                                                , emit: versions
+    tuple val(meta), path("download_folder/${study_accession}/${study_accession}.txt")                                     , emit: metadata_tsv
+    path "versions.yml"                                                                                                    , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -29,10 +29,8 @@ process FETCHTOOL_READS {
     -c ${fetchtool_config} \\
     -v ${private_study} ${args}
 
-    library_strategy=\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 7)
-    library_layout=\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 5)
-
-    cp download_folder/${study_accession}/${study_accession}.txt download_folder/${study_accession}/fetch_tool_mqc.tsv
+    library_strategy=\$(echo "\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 7)" | tr '[:upper:]' '[:lower:]')
+    library_layout=\$(echo "\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 5)" | tr '[:upper:]' '[:lower:]')
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf
diff --git a/modules/nf-core/megahit/main.nf b/modules/nf-core/megahit/main.nf
diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf
diff --git a/modules/nf-core/seqkit/seq/main.nf b/modules/nf-core/seqkit/seq/main.nf
diff --git a/modules/nf-core/seqkit/seq/seqkit-seq.diff b/modules/nf-core/seqkit/seq/seqkit-seq.diff
diff --git a/modules/nf-core/spades/main.nf b/modules/nf-core/spades/main.nf
diff --git a/modules/nf-core/spades/spades.diff b/modules/nf-core/spades/spades.diff
diff --git a/nextflow.config b/nextflow.config
@@ -33,10 +33,12 @@ params {
     */
     assembler                        = null
 
-    // The pipeline will use the metadata from the fetch_tool to
-    // library_layout to figure out if paired or not
-    // use this option to force it
+    // The pipeline uses the metadata from ENA (obtained by the fetch_tool).
+    // As that metadata can be incorrect, the following parameters can be set
+    // to override ("force") the corresponding values.
     single_end                       = null
+    library_layout                   = null
+    library_strategy                 = null
 
     // Reference genome
     reference_genome                 = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -43,6 +43,16 @@
                     "type": "boolean",
                     "description": "Force the single_end value for the study / reads"
                 },
+                "library_strategy": {
+                    "type": "string",
+                    "description": "Force the library_strategy value for the study / reads",
+                    "enum": ["metagenomic", "metatranscriptomic", "genomic", "transcriptomic", "other"]
+                },
+                "library_layout": {
+                    "type": "string",
+                    "description": "Force the library_layout value for the study / reads",
+                    "enum": ["single", "paired"]
+                },
                 "spades_version": {
                     "type": "string",
                     "default": "3.15.5"

diff --git a/tests/main.nf.test b/tests/main.nf.test
@@ -51,54 +51,50 @@ nextflow_pipeline {
 
     }
 
-    // test("metaSPAdes - single end") {
-
-    //     when {
-    //         params {
-    //             outdir = "tests/results"
-    //             assembler = "metaspades"
-    //             bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem"
-    //             blast_reference_genomes_folder   = "${projectDir}/tests/human_phix/blast"
-    //             study_accession                  = "ERP012810"
-    //             reads_accession                  = "ERR1076564"
-    //         }
-    //     }
-
-    //     then {
-    //         with(workflow) {
-    //             assert success
-    //             assert trace.succeeded().contains("SPADES")
-    //             assert !trace.succeeded().contains("MEGAHIT")
-    //             assert trace.tasks().size() == 17
-    //         }
-    //     }
-
-    // }
-
-    // test("MEGAHIT - single end") {
-
-    //     when {
-    //         params {
-    //             outdir = "tests/results"
-    //             bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem"
-    //             blast_reference_genomes_folder   = "${projectDir}/tests/human_phix/blast"
-    //             study_accession                  = "ERP012810"
-    //             reads_accession                  = "ERR1076564"
-    //         }
-    //     }
-
-    //     then {
-    //         with(workflow) {
-    //             assert success
-    //             assert trace.succeeded().contains("MEGAHIT")
-    //             assert !trace.succeeded().contains("SPADES")
-    //             assert trace.tasks().size() == 17
-    //         }
-    //     }
-
-    // }
-
-    test("MEGAHIT - metaT - single") {
+    test("metaSPAdes - single end - should fail") {
+
+        when {
+            params {
+                outdir                           = "tests/results"
+                assembler                        = "metaspades"
+                bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem"
+                blast_reference_genomes_folder   = "${projectDir}/tests/human_phix/blast"
+                study_accession                  = "ERP012810"
+                reads_accession                  = "ERR1076564"
+            }
+        }
+
+        then {
+            with(workflow) {
+                // No contigs
+                assert !success
+                assert trace.failed().count{ task -> task.name.contains("SPADES") } == 1
+            }
+        }
+
+    }
+
+    test("MEGAHIT - single end - should fail") {
+
+        when {
+            params {
+                outdir = "tests/results"
+                bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem"
+                blast_reference_genomes_folder   = "${projectDir}/tests/human_phix/blast"
+                study_accession                  = "ERP012810"
+                reads_accession                  = "ERR1076564"
+            }
+        }
+
+        then {
+            with(workflow) {
+                assert !success
+                assert trace.failed().count{ task -> task.name.contains("MEGAHIT") } == 1
+            }
+        }
+    }
+
+    test("MEGAHIT - metaT - single end") {
 
         when {
             params {
@@ -121,4 +117,26 @@ nextflow_pipeline {
 
     }
 
+    test("Spades - metaT - single end should fail") {
+
+        when {
+            params {
+                outdir = "tests/results"
+                assembler = "spades"
+                bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem"
+                blast_reference_genomes_folder   = "${projectDir}/tests/human_phix/blast"
+                study_accession                  = "DRP007622"
+                reads_accession                  = "DRR280712"
+            }
+        }
+
+        then {
+            with(workflow) {
+                assert !success
+                assert trace.failed().count{ task -> task.name.contains("SPADES") } == 1
+            }
+        }
+
+    }
+
 }
diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf
@@ -84,10 +84,10 @@ workflow MIASSEMBLER {
     // Push the library strategy into the meta of the reads, this is to make it easier to handle downstream
     fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout -> {
             [ meta + [
-                "library_strategy": library_strategy,
-                "library_layout": library_layout,
-                //  The user defined single_end is preferred over the metadata
-                "single_end": params.single_end ?: library_layout == "SINGLE"
+                //  -- The metadata will be overridden by the parameters -- //
+                "library_strategy": params.library_strategy ?: library_strategy,
+                "library_layout": params.library_layout ?: library_layout,
+                // An explicit null check is required here: with `?:` a user-supplied `false`
+                // would be treated as "not set" and the ENA metadata would win. When single_end
+                // is not set, derive it from the effective layout (parameter first, then ENA),
+                // so that meta.library_layout and meta.single_end can never disagree.
+                "single_end": params.single_end != null ? params.single_end : ( params.library_layout ?: library_layout ) == "single"
             ], reads ]
         }
     }
@@ -118,9 +118,9 @@ workflow MIASSEMBLER {
         - An error is raised if the assembler and read layout are incompatible (shouldn't happen...)
     */
     qc_reads_extended = READS_QC.out.qc_reads.map { meta, reads ->
-        if ( params.assembler == "megahit" || meta.single_end ) {
+        if ( params.assembler == "megahit" || ( meta.single_end && params.assembler == null ) ) {
             return [ meta + [assembler: "megahit", assembler_version: params.megahit_version], reads]
-        } else if ( ["metaspades", "spades"].contains(params.assembler) || !meta.single_end ) {
+        } else if ( ["metaspades", "spades"].contains(params.assembler) || ( !meta.single_end && params.assembler == null ) ) {
             def xspades_assembler = params.assembler ?: "metaspades" // Default to "metaspades" if the user didn't select one
             return [ meta + [assembler: xspades_assembler, assembler_version: params.spades_version], reads]
         } else {
@@ -190,6 +190,14 @@ workflow MIASSEMBLER {
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
 
+    // Metadata for MultiQC
+    fetch_tool_metadata = FETCHTOOL_READS.out.metadata_tsv.map { it[1] }.collectFile(
+        name: 'fetch_tool_mqc.tsv',
+        newLine: true,
+        keepHeader: true,
+        skip: 1
+    )
+
     //
     // MODULE: MultiQC
     //
@@ -203,7 +211,7 @@ workflow MIASSEMBLER {
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
-    ch_multiqc_files = ch_multiqc_files.mix(FETCHTOOL_READS.out.metadata_tsv.collect{it[1]}.ifEmpty([]))
+    ch_multiqc_files = ch_multiqc_files.mix(fetch_tool_metadata)
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC_BEFORE.out.zip.collect{it[1]}.ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC_AFTER.out.zip.collect{it[1]}.ifEmpty([]))
     ch_multiqc_files = ch_multiqc_files.mix(ASSEMBLY_COVERAGE.out.samtools_idxstats.collect{ it[1] }.ifEmpty([]))