From 2a6c0623c1c3e41a345bd1f0e995190b3c2705ac Mon Sep 17 00:00:00 2001 From: Ge94 Date: Mon, 2 Sep 2024 15:44:30 +0100 Subject: [PATCH 01/33] Addition of long-reads pre-assembly qcs --- bin/check_raw_quality.py | 22 + conf/modules.config | 55 +- conf/test.config | 2 + modules.json | 32 +- modules/local/fetchtool_reads.nf | 19 +- modules/local/raw_read_quality_check.nf | 24 + modules/nf-core/canu/environment.yml | 6 + modules/nf-core/canu/main.nf | 50 ++ modules/nf-core/canu/meta.yml | 79 +++ modules/nf-core/fastp/main.nf | 13 +- modules/nf-core/fastp/meta.yml | 6 +- modules/nf-core/flye/environment.yml | 6 + modules/nf-core/flye/main.nf | 68 +++ modules/nf-core/flye/meta.yml | 68 +++ modules/nf-core/flye/tests/main.nf.test | 258 ++++++++++ modules/nf-core/flye/tests/main.nf.test.snap | 80 +++ modules/nf-core/flye/tests/nextflow.config | 4 + modules/nf-core/flye/tests/tags.yml | 2 + modules/nf-core/medaka/environment.yml | 6 + modules/nf-core/medaka/main.nf | 40 ++ modules/nf-core/medaka/meta.yml | 45 ++ modules/nf-core/medaka/tests/main.nf.test | 33 ++ .../nf-core/medaka/tests/main.nf.test.snap | 33 ++ modules/nf-core/medaka/tests/tags.yml | 2 + .../nf-core/minimap2/align/environment.yml | 11 + modules/nf-core/minimap2/align/main.nf | 81 +++ modules/nf-core/minimap2/align/meta.yml | 84 ++++ .../nf-core/minimap2/align/tests/main.nf.test | 441 ++++++++++++++++ .../minimap2/align/tests/main.nf.test.snap | 476 ++++++++++++++++++ modules/nf-core/minimap2/align/tests/tags.yml | 2 + modules/nf-core/porechop/abi/environment.yml | 9 + modules/nf-core/porechop/abi/main.nf | 50 ++ modules/nf-core/porechop/abi/meta.yml | 48 ++ .../nf-core/porechop/abi/tests/main.nf.test | 59 +++ .../porechop/abi/tests/main.nf.test.snap | 94 ++++ modules/nf-core/porechop/abi/tests/tags.yml | 2 + modules/nf-core/racon/environment.yml | 6 + modules/nf-core/racon/main.nf | 38 ++ modules/nf-core/racon/meta.yml | 51 ++ nextflow.config | 19 + nextflow_schema.json | 38 +- subworkflows/local/long_reads_qc.nf | 91 ++++ subworkflows/local/ont_hq.nf | 16 + subworkflows/local/ont_lq.nf | 18 + subworkflows/local/pacbio_hifi.nf | 3 + subworkflows/local/pacbio_lq.nf | 14 + subworkflows/local/reads_qc.nf | 1 + tests/samplesheet/test_minION_SRR10303629.csv | 2 + workflows/longreadassembler.nf | 244 +++++++++ 49 files changed, 2836 insertions(+), 15 deletions(-) create mode 100755 bin/check_raw_quality.py create mode 100644 modules/local/raw_read_quality_check.nf create mode 100644 modules/nf-core/canu/environment.yml create mode 100644 modules/nf-core/canu/main.nf create mode 100644 modules/nf-core/canu/meta.yml create mode 100644 modules/nf-core/flye/environment.yml create mode 100644 modules/nf-core/flye/main.nf create mode 100644 modules/nf-core/flye/meta.yml create mode 100644 modules/nf-core/flye/tests/main.nf.test create mode 100644 modules/nf-core/flye/tests/main.nf.test.snap create mode 100644 modules/nf-core/flye/tests/nextflow.config create mode 100644 modules/nf-core/flye/tests/tags.yml create mode 100644 modules/nf-core/medaka/environment.yml create mode 100644 modules/nf-core/medaka/main.nf create mode 100644 modules/nf-core/medaka/meta.yml create mode 100644 modules/nf-core/medaka/tests/main.nf.test create mode 100644 modules/nf-core/medaka/tests/main.nf.test.snap create mode 100644 modules/nf-core/medaka/tests/tags.yml create mode 100644 modules/nf-core/minimap2/align/environment.yml create mode 100644 modules/nf-core/minimap2/align/main.nf create mode 100644 modules/nf-core/minimap2/align/meta.yml create mode 100644 
modules/nf-core/minimap2/align/tests/main.nf.test create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test.snap create mode 100644 modules/nf-core/minimap2/align/tests/tags.yml create mode 100644 modules/nf-core/porechop/abi/environment.yml create mode 100644 modules/nf-core/porechop/abi/main.nf create mode 100644 modules/nf-core/porechop/abi/meta.yml create mode 100644 modules/nf-core/porechop/abi/tests/main.nf.test create mode 100644 modules/nf-core/porechop/abi/tests/main.nf.test.snap create mode 100644 modules/nf-core/porechop/abi/tests/tags.yml create mode 100644 modules/nf-core/racon/environment.yml create mode 100644 modules/nf-core/racon/main.nf create mode 100644 modules/nf-core/racon/meta.yml create mode 100644 subworkflows/local/long_reads_qc.nf create mode 100644 subworkflows/local/ont_hq.nf create mode 100644 subworkflows/local/ont_lq.nf create mode 100644 subworkflows/local/pacbio_hifi.nf create mode 100644 subworkflows/local/pacbio_lq.nf create mode 100644 tests/samplesheet/test_minION_SRR10303629.csv create mode 100644 workflows/longreadassembler.nf diff --git a/bin/check_raw_quality.py b/bin/check_raw_quality.py new file mode 100755 index 0000000..9a9dc5b --- /dev/null +++ b/bin/check_raw_quality.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +import json +import argparse + +parser = argparse.ArgumentParser(description="Evaluate run quality from fastp output") +parser.add_argument('--json','-j',help='Fastp json output',required=True) + +argv = parser.parse_args() + +fastp_out = argv.json +data = json.load(open(fastp_out)) + +q20_bases = float(data['read1_before_filtering']['q20_bases']) +total_bases = float(data['read1_before_filtering']['total_bases']) +q20_percentage = q20_bases/total_bases*100 + +quality = "low" +if q20_percentage >= 80: + quality = "high" + +print(quality) \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 367222c..8cf286e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,7 +20,7 @@ process { ext.args = params.private_study ? 
"--private" : "" } - withName: 'FASTP' { + withName: 'FASTP*' { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 36.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } @@ -50,6 +50,16 @@ process { ] } + withName: 'FASTP_LR' { + ext.args = [ + '--average_qual', + '10', + '--length_required', + "${params.min_read_length}", + '--disable_adapter_trimming' + ].join(' ').trim() + } + withName: 'FASTQC' { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 36.GB * task.attempt, 'memory' ) } @@ -89,13 +99,54 @@ process { ext.prefix = "decontaminated" } - withName: 'HUMAN_PHIX_DECONTAMINATION' { + withName: 'HUMAN*_DECONTAMINATION' { memory = { check_max( 64.GB * task.attempt, 'memory' ) } } withName: 'HOST_DECONTAMINATION' { memory = { check_max( 24.GB * task.attempt, 'memory' ) } } + + withName: 'CANU*' { + cpus = { check_max( 4 , 'cpus' ) } + memory = { check_max( 3.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + + ext.args = [ + '-trim', + '-corrected', + 'corMinCoverage=0', + 'stopOnLowCoverage=0', + 'minInputCoverage=0', + 'maxInputCoverage=10000', + 'corOutCoverage=all', + 'corMhapSensitivity=high', + 'corMaxEvidenceCoverageLocal=10', + 'corMaxEvidenceCoverageGlobal=10', + 'oeaMemory=10', + 'redMemory=10', + 'batMemory=10', + ].join(' ').trim() + } + + withName: 'CANU_ONT' { + ext.args2 = [ + 'correctedErrorRate=0.16', + ].join(' ').trim() + } + + withName: 'CANU_PACBIO' { + ext.args2 = [ + 'correctedErrorRate=0.105', + ].join(' ').trim() + } + + withName: 'PORECHOP_ONT' { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + /* --------- */ /* Assembly */ diff --git a/conf/test.config b/conf/test.config index 9e95f65..421e7f7 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,6 +22,8 @@ profiles { blast_reference_genomes_folder = "tests/human_phix/blast" human_phix_blast_index_name = "human_phix" human_phix_bwamem2_index_name = "human_phix" + human_blast_index_name = "human" + human_bwamem2_index_name = "human" } } } diff --git a/modules.json b/modules.json index 54f81f3..f510e07 100644 --- a/modules.json +++ b/modules.json @@ -32,6 +32,11 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, + "canu": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "82024cf6325d2ee194e7f056d841ecad2f6856e9", @@ -39,7 +44,7 @@ }, "fastp": { "branch": "master", - "git_sha": "95cf5fe0194c7bf5cb0e3027a2eb7e7c89385080", + "git_sha": "1ceaa8ba4d0fd886dbca0e545815d905b7407de7", "installed_by": ["modules"], "patch": "modules/nf-core/fastp/fastp.diff" }, @@ -49,6 +54,16 @@ "installed_by": ["modules"], "patch": "modules/nf-core/fastqc/fastqc.diff" }, + "flye": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, + "medaka": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, "megahit": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", @@ -60,17 +75,32 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, + "minimap2/align": { + "branch": "master", + "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", + "installed_by": ["modules"] + }, "multiqc": { 
"branch": "master", "git_sha": "314d742bdb357a1df5f9b88427b3b6ac78aa33f7", "installed_by": ["modules"] }, + "porechop/abi": { + "branch": "master", + "git_sha": "870f9af2eaf0000c94d74910d762cf153752af98", + "installed_by": ["modules"] + }, "quast": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"], "patch": "modules/nf-core/quast/quast.diff" }, + "racon": { + "branch": "master", + "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", + "installed_by": ["modules"] + }, "samtools/idxstats": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", diff --git a/modules/local/fetchtool_reads.nf b/modules/local/fetchtool_reads.nf index 129e452..e62484a 100644 --- a/modules/local/fetchtool_reads.nf +++ b/modules/local/fetchtool_reads.nf @@ -3,17 +3,17 @@ process FETCHTOOL_READS { label 'process_single' - container "quay.io/microbiome-informatics/fetch-tool:v1.0.0rc" + container "quay.io/microbiome-informatics/fetch-tool:v1.0.2" input: tuple val(meta), val(study_accession), val(reads_accession) path fetchtool_config output: - tuple val(meta), path("download_folder/${study_accession}/raw/${reads_accession}*.fastq.gz"), env(library_strategy), env(library_layout), emit: reads + tuple val(meta), path("download_folder/${study_accession}/raw/${reads_accession}*.fastq.gz"), env(library_strategy), env(library_layout), env(platform), emit: reads // The '_mqc.' is for multiQC - tuple val(meta), path("download_folder/${study_accession}/${study_accession}.txt") , emit: metadata_tsv - path "versions.yml" , emit: versions + tuple val(meta), path("download_folder/${study_accession}/${study_accession}.txt") , emit: metadata_tsv + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -32,6 +32,15 @@ process FETCHTOOL_READS { library_strategy=\$(echo "\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 7)" | tr '[:upper:]' '[:lower:]') library_layout=\$(echo "\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 5)" | tr '[:upper:]' '[:lower:]') + export metadata_platform=\$(echo "\$(grep ${reads_accession} download_folder/${study_accession}/${study_accession}.txt | cut -f 8)" | tr '[:upper:]' '[:lower:]') + if [[ \$metadata_platform == "minion" || \$metadata_platform == "promethion" || \$metadata_platform == "gridion" ]]; then + platform="ont" + elif [[ \$metadata_platform == "pacbio rs" || \$metadata_platform == "pacbio rs ii" ]]; then + platform="pacbio" + else + platform="short" + fi + cat <<-END_VERSIONS > versions.yml "${task.process}": fetch-tool: \$(fetch-read-tool --version) @@ -53,4 +62,4 @@ process FETCHTOOL_READS { fetch-tool: \$(fetch-read-tool --version) END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/local/raw_read_quality_check.nf b/modules/local/raw_read_quality_check.nf new file mode 100644 index 0000000..01ea6f2 --- /dev/null +++ b/modules/local/raw_read_quality_check.nf @@ -0,0 +1,24 @@ +process RAW_READ_QUALITY_CHECK { + tag "$reads_accession" + label 'process_single' + + input: + tuple val(meta), path(fastp_json) + + output: + env(quality) , emit: quality + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + quality=\$(check_raw_quality.py -j ${fastp_json}) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + END_VERSIONS + """ +} diff --git 
a/modules/nf-core/canu/environment.yml b/modules/nf-core/canu/environment.yml new file mode 100644 index 0000000..7b601cb --- /dev/null +++ b/modules/nf-core/canu/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::canu=2.2 diff --git a/modules/nf-core/canu/main.nf b/modules/nf-core/canu/main.nf new file mode 100644 index 0000000..7c5deab --- /dev/null +++ b/modules/nf-core/canu/main.nf @@ -0,0 +1,50 @@ +process CANU { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/canu:2.2--ha47f30e_0': + 'biocontainers/canu:2.2--ha47f30e_0' }" + + input: + tuple val(meta), path(reads) + val mode + val genomesize + + output: + tuple val(meta), path("*.report") , emit: report + tuple val(meta), path("*.contigs.fasta.gz") , emit: assembly , optional: true + tuple val(meta), path("*.unassembled.fasta.gz") , emit: contigs , optional: true + tuple val(meta), path("*.correctedReads.fasta.gz") , emit: corrected_reads , optional: true + tuple val(meta), path("*.trimmedReads.fasta.gz") , emit: corrected_trimmed_reads , optional: true + tuple val(meta), path("*.contigs.layout") , emit: metadata , optional: true + tuple val(meta), path("*.contigs.layout.readToTig") , emit: contig_position , optional: true + tuple val(meta), path("*.contigs.layout.tigInfo") , emit: contig_info , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def valid_mode = ["-pacbio", "-nanopore", "-pacbio-hifi"] + if ( !valid_mode.contains(mode) ) { error "Unrecognised mode to run Canu. Options: ${valid_mode.join(', ')}" } + """ + canu \\ + -p ${prefix} \\ + $mode \\ + genomeSize=${genomesize} \\ + $args \\ + $args2 \\ + maxThreads=$task.cpus \\ + $reads + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + canu: \$(echo \$(canu --version 2>&1) | sed 's/^.*canu //; s/Using.*\$//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/canu/meta.yml b/modules/nf-core/canu/meta.yml new file mode 100644 index 0000000..2feed43 --- /dev/null +++ b/modules/nf-core/canu/meta.yml @@ -0,0 +1,79 @@ +name: "canu" +description: Accurate assembly of segmental duplications, satellites, and allelic variants from high-fidelity long reads. +keywords: + - Assembly + - pacbio + - hifi + - nanopore +tools: + - "canu": + description: "Canu is a fork of the Celera Assembler designed for high-noise single-molecule sequencing." + homepage: "https://canu.readthedocs.io/en/latest/index.html#" + documentation: "https://canu.readthedocs.io/en/latest/tutorial.html" + tool_dev_url: "https://github.com/marbl/canu" + doi: "10.1101/gr.215087.116" + licence: "['GPL v2 and others']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:true ] + - reads: + type: file + description: fasta/fastq file + pattern: "*.{fasta,fastq}" + - mode: + type: value + description: Canu mode depending on the input data (source and error rate) + pattern: "-pacbio|-nanopore|-pacbio-hifi" + - genomesize: + type: value + description: An estimate of the size of the genome. 
Common suffices are allowed, for example, 3.7m or 2.8g + pattern: "[g|m|k]" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - report: + type: file + description: Most of the analysis reported during assembly + pattern: "*.report" + - assembly: + type: file + description: Everything which could be assembled and is the full assembly, including both unique, repetitive, and bubble elements. + pattern: "*.contigs.fasta" + - contigs: + type: file + description: Reads and low-coverage contigs which could not be incorporated into the primary assembly. + pattern: "*.unassembled.fasta" + - corrected_reads: + type: file + description: The reads after correction. + pattern: "*.correctedReads.fasta.gz" + - corrected_trimmed_reads: + type: file + description: The corrected reads after overlap based trimming + pattern: "*.trimmedReads.fasta.gz" + - metadata: + type: file + description: (undocumented) + pattern: "*.contigs.layout" + - contig_position: + type: file + description: The position of each read in a contig + pattern: "*.contigs.layout.readToTig" + - contig_info: + type: file + description: A list of the contigs, lengths, coverage, number of reads and other metadata. Essentially the same information provided in the FASTA header line. + pattern: "*.contigs.layout.tigInfo" +authors: + - "@scorreard" +maintainers: + - "@scorreard" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 3d97ca9..7c51260 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -10,6 +10,7 @@ process FASTP { input: tuple val(meta), path(reads) path adapter_fasta + val discard_trimmed_pass val save_trimmed_fail val save_merged val trim_polyA @@ -32,8 +33,11 @@ process FASTP { def polyA = ( trim_polyA || meta.library_strategy == "metatranscriptomic" ) ? "--trim_poly_x" : '' def prefix = task.ext.prefix ?: "${meta.id}" def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" - def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--failed_out ${prefix}.paired.fail.fastq.gz --unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def out_fq1 = discard_trimmed_pass ?: ( meta.single_end ? "--out1 ${prefix}.fastp.fastq.gz" : "--out1 ${prefix}_1.fastp.fastq.gz" ) + def out_fq2 = discard_trimmed_pass ?: "--out2 ${prefix}_2.fastp.fastq.gz" // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. if ( task.ext.args?.contains('--interleaved_in') ) { """ [ ! 
-f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz @@ -62,6 +66,7 @@ process FASTP { fastp \\ --in1 ${prefix}.fastq.gz \\ + $out_fq1 \\ --out1 ${prefix}.fastp.fastq.gz \\ --thread $task.cpus \\ --json ${prefix}.fastp.json \\ @@ -85,8 +90,8 @@ process FASTP { fastp \\ --in1 ${prefix}_1.fastq.gz \\ --in2 ${prefix}_2.fastq.gz \\ - --out1 ${prefix}_1.fastp.fastq.gz \\ - --out2 ${prefix}_2.fastp.fastq.gz \\ + $out_fq1 \\ + $out_fq2 \\ --json ${prefix}.fastp.json \\ --html ${prefix}.fastp.html \\ $adapter_list \\ @@ -96,7 +101,7 @@ process FASTP { --thread $task.cpus \\ --detect_adapter_for_pe \\ $args \\ - 2> ${prefix}.fastp.log + 2> >(tee ${prefix}.fastp.log >&2) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index c22a16a..8dfecc1 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -27,12 +27,16 @@ input: type: file description: File in FASTA format containing possible adapters to remove. pattern: "*.{fasta,fna,fas,fa}" + - discard_trimmed_pass: + type: boolean + description: Specify true to not write any reads that pass trimming thresholds. | + This can be used to use fastp for the output report only. - save_trimmed_fail: type: boolean description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` - save_merged: type: boolean - description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz` output: - meta: type: map diff --git a/modules/nf-core/flye/environment.yml b/modules/nf-core/flye/environment.yml new file mode 100644 index 0000000..f5364d5 --- /dev/null +++ b/modules/nf-core/flye/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::flye=2.9 diff --git a/modules/nf-core/flye/main.nf b/modules/nf-core/flye/main.nf new file mode 100644 index 0000000..3d89218 --- /dev/null +++ b/modules/nf-core/flye/main.nf @@ -0,0 +1,68 @@ +process FLYE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/flye:2.9--py39h6935b12_1' : + 'biocontainers/flye:2.9--py39h6935b12_1' }" + + input: + tuple val(meta), path(reads) + val mode + + output: + tuple val(meta), path("*.fasta.gz"), emit: fasta + tuple val(meta), path("*.gfa.gz") , emit: gfa + tuple val(meta), path("*.gv.gz") , emit: gv + tuple val(meta), path("*.txt") , emit: txt + tuple val(meta), path("*.log") , emit: log + tuple val(meta), path("*.json") , emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def valid_mode = ["--pacbio-raw", "--pacbio-corr", "--pacbio-hifi", "--nano-raw", "--nano-corr", "--nano-hq"] + if ( !valid_mode.contains(mode) ) { error "Unrecognised mode to run Flye. Options: ${valid_mode.join(', ')}" } + """ + flye \\ + $mode \\ + $reads \\ + --out-dir . 
\\ + --threads \\ + $task.cpus \\ + $args + + gzip -c assembly.fasta > ${prefix}.assembly.fasta.gz + gzip -c assembly_graph.gfa > ${prefix}.assembly_graph.gfa.gz + gzip -c assembly_graph.gv > ${prefix}.assembly_graph.gv.gz + mv assembly_info.txt ${prefix}.assembly_info.txt + mv flye.log ${prefix}.flye.log + mv params.json ${prefix}.params.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flye: \$( flye --version ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo stub | gzip -c > ${prefix}.assembly.fasta.gz + echo stub | gzip -c > ${prefix}.assembly_graph.gfa.gz + echo stub | gzip -c > ${prefix}.assembly_graph.gv.gz + echo contig_1 > ${prefix}.assembly_info.txt + echo stub > ${prefix}.flye.log + echo stub > ${prefix}.params.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + flye: \$( flye --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/flye/meta.yml b/modules/nf-core/flye/meta.yml new file mode 100644 index 0000000..5c3c816 --- /dev/null +++ b/modules/nf-core/flye/meta.yml @@ -0,0 +1,68 @@ +name: "flye" +description: De novo assembler for single molecule sequencing reads +keywords: + - assembly + - genome + - de novo + - genome assembler + - single molecule +tools: + - "flye": + description: "Fast and accurate de novo assembler for single molecule sequencing reads" + homepage: "https://github.com/fenderglass/Flye" + documentation: "https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md" + tool_dev_url: "https://github.com/fenderglass/Flye" + doi: "10.1038/s41592-020-00971-x" + licence: "['BSD-3-clause']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - reads: + type: file + description: Input reads from Oxford Nanopore or PacBio data in FASTA/FASTQ format. + pattern: "*.{fasta,fastq,fasta.gz,fastq.gz,fa,fq,fa.gz,fq.gz}" + - mode: + type: string + description: Flye mode depending on the input data (source and error rate) + pattern: "--pacbio-raw|--pacbio-corr|--pacbio-hifi|--nano-raw|--nano-corr|--nano-hq" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Assembled FASTA file + pattern: "*.fasta.gz" + - gfa: + type: file + description: Repeat graph in gfa format + pattern: "*.gfa.gz" + - gv: + type: file + description: Repeat graph in gv format + pattern: "*.gv.gz" + - txt: + type: file + description: Extra information and statistics about resulting contigs + pattern: "*.txt" + - log: + type: file + description: Flye log file + pattern: "*.log" + - json: + type: file + description: Flye parameters + pattern: "*.json" +authors: + - "@mirpedrol" +maintainers: + - "@mirpedrol" diff --git a/modules/nf-core/flye/tests/main.nf.test b/modules/nf-core/flye/tests/main.nf.test new file mode 100644 index 0000000..f06aa1b --- /dev/null +++ b/modules/nf-core/flye/tests/main.nf.test @@ -0,0 +1,258 @@ +// According to the issue https://github.com/fenderglass/Flye/issues/164 +// Some fluctuations are expected because of the heuristics +// Here we check the that test.assembly_info.txt contains at least one contig + +nextflow_process { + + name "Test Process FLYE" + script "../main.nf" + process "FLYE" + config "./nextflow.config" + tag "flye" + tag "modules" + tag "modules_nfcore" + + + test("flye_pacbio_raw") { + tag "flye_pacbio_raw" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--pacbio-raw" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + { assert process.out.json.get(0).get(1) ==~ '.*/test.params.json' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text =~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + + ) + } + + } + + + test("flye_pacbio_corr") { + tag "flye_pacbio_corr" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--pacbio-corr" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text =~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + + + ) + } + + } + + test("flye_pacbio_hifi") { + tag "flye_pacbio_hifi" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--pacbio-hifi" + """ + } + } + + 
then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text =~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + + + ) + } + + } + + test("flye_nano_raw") { + tag "flye_nano_raw" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--nano-raw" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + { assert process.out.json.get(0).get(1) ==~ '.*/test.params.json' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text =~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + + ) + } + + } + + test("flye_nano_corr") { + tag "flye_nano_corr" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--nano-corr" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + { assert process.out.json.get(0).get(1) ==~ '.*/test.params.json' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text =~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + ) + } + + } + + + test("flye_nano_hq") { + tag "flye_nano_hq" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true) + ] + input[1] = "--nano-hq" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.fasta.get(0).get(1) ==~ '.*/test.assembly.fasta.gz' }, + { assert process.out.gfa.get(0).get(1) ==~ '.*/test.assembly_graph.gfa.gz' }, + { assert process.out.gv.get(0).get(1) ==~ '.*/test.assembly_graph.gv.gz' }, + { assert process.out.log.get(0).get(1) ==~ '.*/test.flye.log' }, + { assert process.out.txt.get(0).get(1) ==~ '.*/test.assembly_info.txt' }, + { assert process.out.json.get(0).get(1) ==~ '.*/test.params.json' }, + + // check for contig_1 in assembly_info + { assert path(process.out.txt.get(0).get(1)).text 
=~ /contig_1/ }, + // Check if test.params.json matches + { assert snapshot(process.out.json).match() } + + + ) + } + + } + + + +} diff --git a/modules/nf-core/flye/tests/main.nf.test.snap b/modules/nf-core/flye/tests/main.nf.test.snap new file mode 100644 index 0000000..a4aef73 --- /dev/null +++ b/modules/nf-core/flye/tests/main.nf.test.snap @@ -0,0 +1,80 @@ +{ + "flye_pacbio_raw": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T09:38:04.835173617" + }, + "flye_pacbio_hifi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T08:38:39.624137639" + }, + "flye_nano_raw": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T09:51:24.546896915" + }, + "flye_pacbio_corr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T08:34:15.751344742" + }, + "flye_nano_corr": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T09:17:49.861781685" + }, + "flye_nano_hq": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.params.json:md5,54b576cb6d4d27656878a7fd3657bde9" + ] + ] + ], + "timestamp": "2023-10-18T09:26:29.081427909" + } +} \ No newline at end of file diff --git a/modules/nf-core/flye/tests/nextflow.config b/modules/nf-core/flye/tests/nextflow.config new file mode 100644 index 0000000..40cf878 --- /dev/null +++ b/modules/nf-core/flye/tests/nextflow.config @@ -0,0 +1,4 @@ +// profile=docker with tests flye_pacbio_raw and flye_nano_raw need more memory that the default of 3.GB +process { + memory = 6.GB +} diff --git a/modules/nf-core/flye/tests/tags.yml b/modules/nf-core/flye/tests/tags.yml new file mode 100644 index 0000000..31103d1 --- /dev/null +++ b/modules/nf-core/flye/tests/tags.yml @@ -0,0 +1,2 @@ +flye: + - modules/nf-core/flye/** diff --git a/modules/nf-core/medaka/environment.yml b/modules/nf-core/medaka/environment.yml new file mode 100644 index 0000000..fea1532 --- /dev/null +++ b/modules/nf-core/medaka/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::medaka=1.4.4 diff --git a/modules/nf-core/medaka/main.nf b/modules/nf-core/medaka/main.nf new file mode 100644 index 0000000..e87c910 --- /dev/null +++ b/modules/nf-core/medaka/main.nf @@ -0,0 +1,40 @@ +process MEDAKA { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/medaka:1.4.4--py38h130def0_0' : + 'biocontainers/medaka:1.4.4--py38h130def0_0' }" + + input: + tuple val(meta), path(reads), path(assembly) + + output: + tuple val(meta), path("*.fa.gz"), emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + medaka_consensus \\ + -t $task.cpus \\ + $args \\ + -i $reads \\ + -d $assembly \\ + -o ./ + + mv consensus.fasta ${prefix}.fa + + gzip -n ${prefix}.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/medaka/meta.yml b/modules/nf-core/medaka/meta.yml new file mode 100644 index 0000000..9ed3589 --- /dev/null +++ b/modules/nf-core/medaka/meta.yml @@ -0,0 +1,45 @@ +name: medaka +description: A tool to create consensus sequences and variant calls from nanopore sequencing data +keywords: + - assembly + - polishing + - nanopore +tools: + - medaka: + description: Neural network sequence error correction. + homepage: https://nanoporetech.github.io/medaka/index.html + documentation: https://nanoporetech.github.io/medaka/index.html + tool_dev_url: https://github.com/nanoporetech/medaka + licence: ["Mozilla Public License 2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input nanopore fasta/FastQ files + pattern: "*.{fasta,fa,fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - assembly: + type: file + description: Polished genome assembly + pattern: "*.fa.gz" +authors: + - "@avantonder" +maintainers: + - "@avantonder" diff --git a/modules/nf-core/medaka/tests/main.nf.test b/modules/nf-core/medaka/tests/main.nf.test new file mode 100644 index 0000000..1c5c55f --- /dev/null +++ b/modules/nf-core/medaka/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process MEDAKA" + tag "modules_nfcore" + tag "modules" + tag "medaka" + script "../main.nf" + process "MEDAKA" + + test("Medaka") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/medaka/tests/main.nf.test.snap b/modules/nf-core/medaka/tests/main.nf.test.snap new file mode 100644 index 0000000..d3fcba2 --- /dev/null +++ b/modules/nf-core/medaka/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Medaka": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fa.gz:md5,f42303f1d6c2c79175faeb00e10b9a6e" + ] + ], + "1": [ + "versions.yml:md5,739bb00a08faba4029f9f5ab9c15275a" + ], + "assembly": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fa.gz:md5,f42303f1d6c2c79175faeb00e10b9a6e" + ] + ], + "versions": [ + "versions.yml:md5,739bb00a08faba4029f9f5ab9c15275a" + ] + } + ], + "timestamp": "2023-10-18T12:38:17.806031909" + } +} \ No newline at end of file diff --git a/modules/nf-core/medaka/tests/tags.yml b/modules/nf-core/medaka/tests/tags.yml new file mode 100644 index 0000000..dd9fb10 --- /dev/null +++ b/modules/nf-core/medaka/tests/tags.yml @@ -0,0 +1,2 @@ +medaka: + - modules/nf-core/medaka/** diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 0000000..41e8fe9 --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,11 @@ +name: minimap2_align + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - bioconda::htslib=1.20 + - bioconda::minimap2=2.28 + - bioconda::samtools=1.20 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 0000000..cbfc5bf --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,81 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_high' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:3161f532a5ea6f1dec9be5667c9efc2afdac6104-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(reference) + val prefix2 + val bam_format + val bam_index_extension + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.minimap*") , optional: true, emit: filtered_fastq + tuple val(meta), path("*.paf") , optional: true, emit: paf + tuple val(meta), path("*.bam") , optional: true, emit: bam + tuple val(meta), path("*.bam.${bam_index_extension}"), optional: true, emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_index = bam_index_extension ? "${prefix}.bam##idx##${prefix}.bam.${bam_index_extension} --write-index" : "${prefix}.bam" + def map_mode = "${meta.platform}" ? "-x map-${meta.platform}" : '' + def bam_output = bam_format ? "-a | samtools fastq -f 4 | gzip > ${prefix}.${prefix2}.minimap.fastq.gz" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + def bam_input = "${reads.extension}".matches('sam|bam|cram') + def samtools_reset_fastq = bam_input ? "samtools reset --threads ${task.cpus-1} $args3 $reads | samtools fastq --threads ${task.cpus-1} $args4 |" : '' + def query = bam_input ? "-" : reads + def target = reference ?: (bam_input ? error("BAM input requires reference") : reads) + + """ + $samtools_reset_fastq \\ + minimap2 \\ + $args \\ + -t $task.cpus \\ + $map_mode \\ + $target \\ + $query \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: c + def output_file = bam_format ? "${prefix}.bam" : "${prefix}.paf" + def bam_index = bam_index_extension ? "touch ${prefix}.bam.${bam_index_extension}" : "" + def bam_input = "${reads.extension}".matches('sam|bam|cram') + def target = reference ?: (bam_input ? error("BAM input requires reference") : reads) + + """ + touch $output_file + ${bam_index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 0000000..8996f88 --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,84 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test_ref'] + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - bam_index_extension: + type: string + description: BAM alignment index extension (e.g. "bai") + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - index: + type: file + description: BAM alignment index + pattern: "*.bam.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" + - "@fellen31" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" + - "@fellen31" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 0000000..4072c17 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,441 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, [], false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, 'bai', false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.index[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + 
input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, [], false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, 'bai', false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + bam(process.out.bam[0][1]).getHeader(), + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.index[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - bam, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.failed } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + 
input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, 'bai', false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, false, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = false + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, [], false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, fasta, true, 'bai', false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = true + input[3] = 'bai' + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam, [], true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam', checkIfExists: true) + ] + input[1] = [ + [ 
id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = [] + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.failed } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 0000000..12264a8 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,476 @@ +{ + "sarscov2 - bam, fasta, true, 'bai', false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta -", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam##idx##test.bam.bai --write-index" + ], + "5d426b9a5f5b2c54f1d7f1e4c238ae94", + "test.bam.bai", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-25T09:03:00.827260362" + }, + "sarscov2 - bam, fasta, true, 'bai', false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:21:37.92353539" + }, + "sarscov2 - fastq, fasta, true, 'bai', false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:29:44.669021368" + }, + "sarscov2 - fastq, fasta, false, [], false, false - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.paf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + + ], + "index": [ + + ], + "paf": [ + [ + { + "id": "test", + "single_end": true + }, + "test.paf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:15:52.738781039" + }, + "sarscov2 - fastq, fasta, true, [], false, false - stub": { + "content": [ + 
{ + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:15:23.033808223" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz test_2.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "1bc392244f228bf52cf0b5a8f6a654c9", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:18.964586894" + }, + "sarscov2 - fastq, fasta, true, [], false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "f194745c0ccfcb2a9c0aee094a08750", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:17:48.667488325" + }, + "sarscov2 - fastq, fasta, true, 'bai', false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta test_1.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam##idx##test.bam.bai --write-index" + ], + "f194745c0ccfcb2a9c0aee094a08750", + "test.bam.bai", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:02.517416733" + }, + "sarscov2 - bam, fasta, true, [], false, false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:MT192765.1\tLN:29829", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a genome.fasta -", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "5d426b9a5f5b2c54f1d7f1e4c238ae94", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-25T09:02:49.64829488" + }, + "sarscov2 - bam, fasta, true, [], false, false - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "index": [ + + ], + "paf": [ + + ], + "versions": [ + "versions.yml:md5,98b8f5f36aa54b82210094f0b0d11938" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:21:22.162291795" + }, + "sarscov2 - fastq, [], true, false, 
false": { + "content": [ + [ + "@HD\tVN:1.6\tSO:coordinate", + "@SQ\tSN:ERR5069949.2151832\tLN:150", + "@SQ\tSN:ERR5069949.576388\tLN:77", + "@SQ\tSN:ERR5069949.501486\tLN:146", + "@SQ\tSN:ERR5069949.1331889\tLN:132", + "@SQ\tSN:ERR5069949.2161340\tLN:80", + "@SQ\tSN:ERR5069949.973930\tLN:79", + "@SQ\tSN:ERR5069949.2417063\tLN:150", + "@SQ\tSN:ERR5069949.376959\tLN:151", + "@SQ\tSN:ERR5069949.1088785\tLN:149", + "@SQ\tSN:ERR5069949.1066259\tLN:147", + "@SQ\tSN:ERR5069949.2832676\tLN:139", + "@SQ\tSN:ERR5069949.2953930\tLN:151", + "@SQ\tSN:ERR5069949.324865\tLN:151", + "@SQ\tSN:ERR5069949.2185111\tLN:150", + "@SQ\tSN:ERR5069949.937422\tLN:151", + "@SQ\tSN:ERR5069949.2431709\tLN:150", + "@SQ\tSN:ERR5069949.1246538\tLN:148", + "@SQ\tSN:ERR5069949.1189252\tLN:98", + "@SQ\tSN:ERR5069949.2216307\tLN:147", + "@SQ\tSN:ERR5069949.3273002\tLN:148", + "@SQ\tSN:ERR5069949.3277445\tLN:151", + "@SQ\tSN:ERR5069949.3022231\tLN:147", + "@SQ\tSN:ERR5069949.184542\tLN:151", + "@SQ\tSN:ERR5069949.540529\tLN:149", + "@SQ\tSN:ERR5069949.686090\tLN:150", + "@SQ\tSN:ERR5069949.2787556\tLN:106", + "@SQ\tSN:ERR5069949.2650879\tLN:150", + "@SQ\tSN:ERR5069949.2064910\tLN:149", + "@SQ\tSN:ERR5069949.2328704\tLN:150", + "@SQ\tSN:ERR5069949.1067032\tLN:150", + "@SQ\tSN:ERR5069949.3338256\tLN:151", + "@SQ\tSN:ERR5069949.1412839\tLN:147", + "@SQ\tSN:ERR5069949.1538968\tLN:150", + "@SQ\tSN:ERR5069949.147998\tLN:94", + "@SQ\tSN:ERR5069949.366975\tLN:106", + "@SQ\tSN:ERR5069949.1372331\tLN:151", + "@SQ\tSN:ERR5069949.1709367\tLN:129", + "@SQ\tSN:ERR5069949.2388984\tLN:150", + "@SQ\tSN:ERR5069949.1132353\tLN:150", + "@SQ\tSN:ERR5069949.1151736\tLN:151", + "@SQ\tSN:ERR5069949.479807\tLN:150", + "@SQ\tSN:ERR5069949.2176303\tLN:151", + "@SQ\tSN:ERR5069949.2772897\tLN:151", + "@SQ\tSN:ERR5069949.1020777\tLN:122", + "@SQ\tSN:ERR5069949.465452\tLN:151", + "@SQ\tSN:ERR5069949.1704586\tLN:149", + "@SQ\tSN:ERR5069949.1258508\tLN:151", + "@SQ\tSN:ERR5069949.986441\tLN:119", + "@SQ\tSN:ERR5069949.2674295\tLN:148", + "@SQ\tSN:ERR5069949.885966\tLN:79", + "@SQ\tSN:ERR5069949.2342766\tLN:151", + "@SQ\tSN:ERR5069949.3122970\tLN:127", + "@SQ\tSN:ERR5069949.3279513\tLN:72", + "@SQ\tSN:ERR5069949.309410\tLN:151", + "@SQ\tSN:ERR5069949.532979\tLN:149", + "@SQ\tSN:ERR5069949.2888794\tLN:151", + "@SQ\tSN:ERR5069949.2205229\tLN:150", + "@SQ\tSN:ERR5069949.786562\tLN:151", + "@SQ\tSN:ERR5069949.919671\tLN:151", + "@SQ\tSN:ERR5069949.1328186\tLN:151", + "@SQ\tSN:ERR5069949.870926\tLN:149", + "@SQ\tSN:ERR5069949.2257580\tLN:151", + "@SQ\tSN:ERR5069949.3249622\tLN:77", + "@SQ\tSN:ERR5069949.611123\tLN:125", + "@SQ\tSN:ERR5069949.651338\tLN:142", + "@SQ\tSN:ERR5069949.169513\tLN:92", + "@SQ\tSN:ERR5069949.155944\tLN:150", + "@SQ\tSN:ERR5069949.2033605\tLN:150", + "@SQ\tSN:ERR5069949.2730382\tLN:142", + "@SQ\tSN:ERR5069949.2125592\tLN:150", + "@SQ\tSN:ERR5069949.1062611\tLN:151", + "@SQ\tSN:ERR5069949.1778133\tLN:151", + "@SQ\tSN:ERR5069949.3057020\tLN:95", + "@SQ\tSN:ERR5069949.2972968\tLN:141", + "@SQ\tSN:ERR5069949.2734474\tLN:149", + "@SQ\tSN:ERR5069949.856527\tLN:151", + "@SQ\tSN:ERR5069949.2098070\tLN:151", + "@SQ\tSN:ERR5069949.1552198\tLN:150", + "@SQ\tSN:ERR5069949.2385514\tLN:150", + "@SQ\tSN:ERR5069949.2270078\tLN:151", + "@SQ\tSN:ERR5069949.114870\tLN:150", + "@SQ\tSN:ERR5069949.2668880\tLN:147", + "@SQ\tSN:ERR5069949.257821\tLN:139", + "@SQ\tSN:ERR5069949.2243023\tLN:150", + "@SQ\tSN:ERR5069949.2605155\tLN:146", + "@SQ\tSN:ERR5069949.1340552\tLN:151", + "@SQ\tSN:ERR5069949.1561137\tLN:150", + "@SQ\tSN:ERR5069949.2361683\tLN:149", + 
"@SQ\tSN:ERR5069949.2521353\tLN:150", + "@SQ\tSN:ERR5069949.1261808\tLN:149", + "@SQ\tSN:ERR5069949.2734873\tLN:98", + "@SQ\tSN:ERR5069949.3017828\tLN:107", + "@SQ\tSN:ERR5069949.573706\tLN:150", + "@SQ\tSN:ERR5069949.1980512\tLN:151", + "@SQ\tSN:ERR5069949.1014693\tLN:150", + "@SQ\tSN:ERR5069949.3184655\tLN:150", + "@SQ\tSN:ERR5069949.29668\tLN:89", + "@SQ\tSN:ERR5069949.3258358\tLN:151", + "@SQ\tSN:ERR5069949.1476386\tLN:151", + "@SQ\tSN:ERR5069949.2415814\tLN:150", + "@PG\tID:minimap2\tPN:minimap2\tVN:2.28-r1209\tCL:minimap2 -t 2 -a test_1.fastq.gz test_1.fastq.gz", + "@PG\tID:samtools\tPN:samtools\tPP:minimap2\tVN:1.20\tCL:samtools sort -@ 1 -o test.bam" + ], + "16c1c651f8ec67383bcdee3c55aed94f", + [ + "versions.yml:md5,3548eeba9066efbf8d78ea99f8d813fd" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-23T11:18:34.246998277" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 0000000..39dba37 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/modules/nf-core/porechop/abi/environment.yml b/modules/nf-core/porechop/abi/environment.yml new file mode 100644 index 0000000..4dd2eab --- /dev/null +++ b/modules/nf-core/porechop/abi/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: porechop_abi +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::porechop_abi=0.5.0 diff --git a/modules/nf-core/porechop/abi/main.nf b/modules/nf-core/porechop/abi/main.nf new file mode 100644 index 0000000..88ec5bd --- /dev/null +++ b/modules/nf-core/porechop/abi/main.nf @@ -0,0 +1,50 @@ +process PORECHOP_ABI { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/porechop_abi:0.5.0--py310h590eda1_0': + 'biocontainers/porechop_abi:0.5.0--py310h590eda1_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz") , emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.porechop_abi" + if ("$reads" == "${prefix}.fastq.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + porechop_abi \\ + --input $reads \\ + --threads $task.cpus \\ + $args \\ + --output ${prefix}.fastq.gz \\ + | tee ${prefix}.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop_abi: \$( porechop_abi --version ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.porechop_abi" + """ + echo "" | gzip > ${prefix}.fastq.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop_abi: \$( porechop_abi --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/porechop/abi/meta.yml b/modules/nf-core/porechop/abi/meta.yml new file mode 100644 index 0000000..a856ffb --- /dev/null +++ b/modules/nf-core/porechop/abi/meta.yml @@ -0,0 +1,48 @@ +name: "porechop_abi" +description: Extension of Porechop whose purpose is to process adapter sequences in ONT reads. +keywords: + - porechop_abi + - adapter + - nanopore +tools: + - "porechop_abi": + description: Extension of Porechop whose purpose is to process adapter sequences in ONT reads. + homepage: "https://github.com/bonsai-team/Porechop_ABI" + documentation: "https://github.com/bonsai-team/Porechop_ABI" + tool_dev_url: "https://github.com/bonsai-team/Porechop_ABI" + doi: "10.1101/2022.07.07.499093" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Adapter-trimmed fastq.gz file + pattern: "*.fastq.gz" + - log: + type: file + description: Log file containing stdout information + pattern: "*.log" +authors: + - "@sofstam" + - "LilyAnderssonLee" +maintainers: + - "@sofstam" + - "LilyAnderssonLee" diff --git a/modules/nf-core/porechop/abi/tests/main.nf.test b/modules/nf-core/porechop/abi/tests/main.nf.test new file mode 100644 index 0000000..b5a29f9 --- /dev/null +++ b/modules/nf-core/porechop/abi/tests/main.nf.test @@ -0,0 +1,59 @@ +nextflow_process { + + name "Test Process PORECHOP_ABI" + script "../main.nf" + process "PORECHOP_ABI" + tag "modules" + tag "modules_nfcore" + tag "porechop" + tag "porechop/abi" + + test("sarscov2-nanopore") { + + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.reads, + file(process.out.log.get(0).get(1)).readLines()[20..40], + process.out.versions).match() + } + ) + } + } + + test("sarscov2-nanopore - stub") { + + options "-stub" + + when { + + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/porechop/abi/tests/main.nf.test.snap b/modules/nf-core/porechop/abi/tests/main.nf.test.snap new file mode 100644 index 0000000..ad63f4e --- /dev/null +++ b/modules/nf-core/porechop/abi/tests/main.nf.test.snap @@ -0,0 +1,94 @@ +{ + 
"sarscov2-nanopore": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,886fdb859fb50e0dddd35007bcff043e" + ] + ], + [ + " Best \u001b[0m", + " read Best \u001b[0m", + " start read end\u001b[0m", + " \u001b[4mSet %ID %ID \u001b[0m", + " \u001b[32mSQK-NSK007 100.0 73.1\u001b[0m", + " Rapid 40.4 0.0", + " RBK004_upstream 77.5 0.0", + " SQK-MAP006 75.8 72.7", + " SQK-MAP006 short 65.5 66.7", + " PCR adapters 1 73.9 69.6", + " PCR adapters 2 80.0 72.7", + " PCR adapters 3 70.8 69.6", + " 1D^2 part 1 71.4 70.0", + " 1D^2 part 2 84.8 75.8", + " cDNA SSP 63.0 61.7", + " \u001b[32mBarcode 1 (reverse) 100.0 100.0\u001b[0m", + " Barcode 2 (reverse) 70.8 69.2", + " Barcode 3 (reverse) 76.0 70.4", + " Barcode 4 (reverse) 74.1 71.4", + " Barcode 5 (reverse) 77.8 80.8", + " Barcode 6 (reverse) 73.1 70.8" + ], + [ + "versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-29T13:50:49.318599" + }, + "sarscov2-nanopore - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.porechop_abi.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ], + "log": [ + [ + { + "id": "test" + }, + "test.porechop_abi.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "reads": [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-29T13:50:54.425389" + } +} \ No newline at end of file diff --git a/modules/nf-core/porechop/abi/tests/tags.yml b/modules/nf-core/porechop/abi/tests/tags.yml new file mode 100644 index 0000000..e19350c --- /dev/null +++ b/modules/nf-core/porechop/abi/tests/tags.yml @@ -0,0 +1,2 @@ +porechop/abi: + - "modules/nf-core/porechop/abi/**" diff --git a/modules/nf-core/racon/environment.yml b/modules/nf-core/racon/environment.yml new file mode 100644 index 0000000..e5cd0b8 --- /dev/null +++ b/modules/nf-core/racon/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::racon=1.4.20 diff --git a/modules/nf-core/racon/main.nf b/modules/nf-core/racon/main.nf new file mode 100644 index 0000000..de29e35 --- /dev/null +++ b/modules/nf-core/racon/main.nf @@ -0,0 +1,38 @@ +process RACON { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/racon:1.4.20--h9a82719_1' : + 'biocontainers/racon:1.4.20--h9a82719_1' }" + + input: + tuple val(meta), path(reads), path(assembly), path(paf) + + output: + tuple val(meta), path('*_assembly_consensus.fasta.gz') , emit: improved_assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + racon -t "$task.cpus" \\ + "${reads}" \\ + "${paf}" \\ + $args \\ + "${assembly}" > \\ + ${prefix}_assembly_consensus.fasta + + gzip -n ${prefix}_assembly_consensus.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + racon: \$( racon --version 2>&1 | sed 's/^.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/racon/meta.yml b/modules/nf-core/racon/meta.yml new file mode 100644 index 0000000..9698c0a --- /dev/null +++ b/modules/nf-core/racon/meta.yml @@ -0,0 +1,51 @@ +name: racon +description: Consensus module for raw de novo DNA assembly of long uncorrected reads +keywords: + - assembly + - pacbio + - nanopore + - polish +tools: + - racon: + description: Ultrafast consensus module for raw de novo genome assembly of long uncorrected reads. + homepage: https://github.com/lbcb-sci/racon + documentation: https://github.com/lbcb-sci/racon + tool_dev_url: https://github.com/lbcb-sci/racon + doi: 10.1101/gr.214270.116 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files. Racon expects single end reads + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + - assembly: + type: file + description: Genome assembly to be improved + pattern: "*.{fasta,fa}" + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - improved_assembly: + type: file + description: Improved genome assembly + pattern: "*_assembly_consensus.fasta.gz" +authors: + - "@avantonder" +maintainers: + - "@avantonder" diff --git a/nextflow.config b/nextflow.config index 168873d..8791b13 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { study_accession = null reads_accession = null private_study = false + min_read_length = 200 // For already fetched data samplesheet = null @@ -36,8 +37,13 @@ params { * for metaspades are prohibitively high, such as: * - Memory >1TB * - Runtime >3-4 days + * + * - flye: Use for any long-read assembly. 
assembler_config + * should be selected depending on input data (if ONT or + * pacbio, and if data quality is high or low) */ assembler = null + assembler_config = null // The pipeline will use the metadata from ENA (obtained by the fetch_tool) // As the metadata can be incorrect, we provide the following parameters to @@ -45,6 +51,7 @@ params { single_end = null library_layout = null library_strategy = null + platform = null // Reads QC filtering options filter_ratio_threshold = 0.9 @@ -53,6 +60,14 @@ params { // Reference genome reference_genome = null + /* + * Long-read assemblies won't require phiX, + * parameters should be defined as follows: + * remove_human = true + * human_blast_index_name = "human" + * human_bwamem2_index_name = "human" + * Need to integrate them + */ remove_human_phix = true human_phix_blast_index_name = "human_phix" human_phix_bwamem2_index_name = "human_phix" @@ -93,6 +108,7 @@ params { // Assembler versions spades_version = "3.15.5" megahit_version = "1.2.9" + flye_version = "2.9" } @@ -200,6 +216,9 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } + test { + includeConfig 'conf/test.config' + } codon_slurm { includeConfig 'conf/codon_slurm.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index ebfb512..541ee4d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -40,14 +40,24 @@ "fa_icon": "far fa-address-card", "minLength": 3 }, + "min_read_length": { + "type": "integer", + "description": "Minimum read length for pre-assembly quality filtering", + "default": 200 + }, "private_study": { "type": "boolean", "description": "To use if the ENA study is private" }, "assembler": { "type": "string", - "enum": ["spades", "metaspades", "megahit"], - "description": "The short reads assembler" + "enum": ["spades", "metaspades", "megahit", "flye"], + "description": "The short or long reads assembler" + }, + "assembler_config": { + "type": "string", + "description": "Configuration to use flye with. 
Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", + "default": "" }, "single_end": { "type": "boolean", @@ -63,6 +73,15 @@ "description": "Force the library_layout value for the study / reads", "enum": ["single", "paired"] }, + "platform": { + "type": "string", + "description": "Force the instrument_platform value for the study / reads", + "default": "ont" + }, + "flye_version": { + "type": "string", + "default": "2.9" + }, "spades_version": { "type": "string", "default": "3.15.5" @@ -104,16 +123,31 @@ "description": "Remove human and phiX reads pre assembly, and contigs matching those genomes.", "default": true }, + "remove_human": { + "type": "boolean", + "description": "Remove human reads pre assembly, and contigs matching those genomes.", + "default": true + }, "human_phix_blast_index_name": { "type": "string", "description": "Combined Human and phiX BLAST db.", "default": "human_phix" }, + "human_blast_index_name": { + "type": "string", + "description": "Human BLAST db.", + "default": "human" + }, "human_phix_bwamem2_index_name": { "type": "string", "description": "Combined Human and phiX bwa-mem2 index.", "default": "human_phix" }, + "human_bwamem2_index_name": { + "type": "string", + "description": "Human bwa-mem2 index.", + "default": "human" + }, "min_contig_length": { "type": "integer", "default": 500, diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf new file mode 100644 index 0000000..13635ba --- /dev/null +++ b/subworkflows/local/long_reads_qc.nf @@ -0,0 +1,91 @@ +include { FASTP_LR } from '../../modules/nf-core/fastp/main' +include { RAW_READ_QUALITY_CHECK } from '../../modules/local/raw_read_quality_check/' +include { MINIMAP2_ALIGN as HUMAN_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as HOST_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' + +workflow LONG_READS_QC { + take: + reads // [ val(meta), path(reads) ] + host_reference_genome // [ val(meta2), path(reference_genome) ] + + main: + ch_versions = Channel.empty() + + FASTP_LR( + reads, + [], + false, + false, + false, + false + ) + + ch_versions = ch_versions.mix(FASTP.out.versions) + + RAW_READ_QUALITY_CHECK( + FASTP.out.json + ) + + decontaminated_reads = channel.empty() + + if ( params.remove_human ) { + + ch_bwamem2_human_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${params.human_blast_index_name}.fna", checkIfExists: true) + .collect().map { + files -> [ ["id": params.human_blast_index_name], files ] + } + + // TODO: can we change the way human/host are given via prefixes? 
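+        // The human reference channel above is collect()-ed into a value channel so that the
+        // same [ meta, fasta ] tuple can be reused for every incoming read set instead of being
+        // consumed after the first sample.
+        // The alignment below is used purely for decontamination: only reads that do NOT map to
+        // the reference are kept, and they are picked up further down via the filtered_fastq output.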
+ + HUMAN_DECONTAMINATION( + FASTP.out.reads, + ch_bwamem2_human_refs, + "human", + true, + "bai", + false, + true + ) + + ch_versions = ch_versions.mix(HUMAN_DECONTAMINATION.out.versions) + + decontaminated_reads = HUMAN_DECONTAMINATION.out.filtered_fastq + + } else { + decontaminated_reads = FASTP.out.reads + } + + if ( host_reference_genome != null ) { + + ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + .collect().map { + files -> [ ["id": host_reference_genome], files ] + } + + HOST_DECONTAMINATION( + decontaminated_reads, + ch_bwamem2_host_refs, + "host", + true, + "bai", + false, + true + ) + + ch_versions = ch_versions.mix(HOST_DECONTAMINATION.out.versions) + + decontaminated_reads = HOST_DECONTAMINATION.out.filtered_fastq + } + + final_reads = decontaminated_reads + .map{ meta, reads -> { + [ meta + [ + "quality": RAW_READ_QUALITY_CHECK.out.quality.val + ], reads ] + } + } + + emit: + qc_reads = final_reads + versions = ch_versions +} diff --git a/subworkflows/local/ont_hq.nf b/subworkflows/local/ont_hq.nf new file mode 100644 index 0000000..7255d24 --- /dev/null +++ b/subworkflows/local/ont_hq.nf @@ -0,0 +1,16 @@ +include { PORECHOP_ABI as PORECHOP_ONT } from '../../modules/nf-core/porechop/abi/main' + +workflow ONT_HQ { + take: + reads // [ val(meta), path(reads) ] + + main: + PORECHOP_ONT( + reads + ) + PORECHOP_ONT.out.reads.view() + + // temporary just to test the module + emit: + contigs = PORECHOP_ONT.out.reads +} diff --git a/subworkflows/local/ont_lq.nf b/subworkflows/local/ont_lq.nf new file mode 100644 index 0000000..6538c14 --- /dev/null +++ b/subworkflows/local/ont_lq.nf @@ -0,0 +1,18 @@ +include { CANU as CANU_ONT } from '../../modules/nf-core/canu/main' + +workflow ONT_LQ { + take: + reads // [ val(meta), path(reads) ] + + main: + CANU_ONT( + reads, + "-nanopore", + "5m" + ) + CANU_ONT.out.corrected_trimmed_reads.view() + + // temporary just to test the module + emit: + contigs = CANU_ONT.out.corrected_trimmed_reads +} diff --git a/subworkflows/local/pacbio_hifi.nf b/subworkflows/local/pacbio_hifi.nf new file mode 100644 index 0000000..491bf28 --- /dev/null +++ b/subworkflows/local/pacbio_hifi.nf @@ -0,0 +1,3 @@ +workflow PACBIO_HIFI { + +} \ No newline at end of file diff --git a/subworkflows/local/pacbio_lq.nf b/subworkflows/local/pacbio_lq.nf new file mode 100644 index 0000000..df49b01 --- /dev/null +++ b/subworkflows/local/pacbio_lq.nf @@ -0,0 +1,14 @@ +include { CANU as CANU_PACBIO } from '../../modules/nf-core/canu/main' + +workflow PACBIO_LQ { + take: + reads // [ val(meta), path(reads) ] + + main: + CANU_PACBIO( + reads, + "-pacbio", + "5m" + ) + CANU_PACBIO.out.corrected_reads.view() +} diff --git a/subworkflows/local/reads_qc.nf b/subworkflows/local/reads_qc.nf index a3e99af..4cbbbe6 100644 --- a/subworkflows/local/reads_qc.nf +++ b/subworkflows/local/reads_qc.nf @@ -16,6 +16,7 @@ workflow READS_QC { [], false, false, + false, false ) diff --git a/tests/samplesheet/test_minION_SRR10303629.csv b/tests/samplesheet/test_minION_SRR10303629.csv new file mode 100644 index 0000000..c6ac9e8 --- /dev/null +++ b/tests/samplesheet/test_minION_SRR10303629.csv @@ -0,0 +1,2 @@ +study_accession,reads_accession,fastq_1,library_layout,library_strategy,assembler,assembly_memory +SRP226117,SRR10303629,/home/germana/Desktop/EBI_root/Git/long-read-assembly/tests/test_reads/SRR10303629_1.fastq.gz,single,metagenomic,, \ No newline at end of file diff --git a/workflows/longreadassembler.nf 
b/workflows/longreadassembler.nf new file mode 100644 index 0000000..227e62d --- /dev/null +++ b/workflows/longreadassembler.nf @@ -0,0 +1,244 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + PRINT PARAMS SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { validateParameters; paramsSummaryLog; paramsSummaryMap; samplesheetToList } from 'plugin/nf-schema' + +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) + +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation + +validateParameters() + +if (params.help) { + log.info paramsHelp("nextflow run ebi-metagenomics/longreadsassembly --help") + exit 0 +} + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// + +include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' +include { LONG_READS_QC } from '../subworkflows/local/long_reads_qc' +include { ONT_LQ } from '../subworkflows/local/ont_lq' +include { ONT_HQ } from '../subworkflows/local/ont_hq' +// include { PACBIO_LQ } from '../subworkflows/local/pacbio_lq' +// include { PACBIO_HIFI } from '../subworkflows/local/pacbio_hifi' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Info required for completion email and summary +def multiqc_report = [] + +workflow LONGREADSASSEMBLY { + + ch_versions = Channel.empty() + longReads = Channel.empty() + fetch_tool_metadata = Channel.empty() + + if ( params.samplesheet ) { + + longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, assembler_config, assembly_memory -> + return tuple( + [ + "id": reads_accession, + "study_accession": study_accession, + "library_strategy": library_strategy, + "library_layout": library_layout, + "single_end": true, + "assembler": assembler ?: params.assembler, + "assembler_config": assembler_config ?: params.assembler_config, + 
"assembly_memory": assembly_memory ?: params.assembly_memory + ], + [fq1] + ) + } + + samplesheet = Channel.fromList(samplesheetToList(params.samplesheet, "./assets/schema_input.json")) + + fetch_reads_transformed = samplesheet.map(longReads) + + } else { + // TODO: remove when the fetch tools gets published on bioconda + fetch_tool_config = file("${projectDir}/assets/fetch_tool_anonymous.json", checkIfExists: true) + + if ( params.private_study ) { + fetch_tool_config = file("${projectDir}/assets/fetch_tool_credentials.json", checkIfExists: true) + } + + FETCHTOOL_READS( + [ [id: params.reads_accession], params.study_accession, params.reads_accession ], + fetch_tool_config + ) + + ch_versions = ch_versions.mix(FETCHTOOL_READS.out.versions) + + // Push the library strategy into the meta of the reads, this is to make it easier to handle downstream + fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout, platform -> { + [ meta + [ + // -- The metadata will be overriden by the parameters -- // + "assembler": params.assembler, + "assembly_memory": params.assembly_memory, + "assembler_config": params.assembler_config, + "library_strategy": params.library_strategy ?: library_strategy, + "library_layout": params.library_layout ?: library_layout, + "single_end": params.single_end ?: library_layout == "single", + "platform": params.platform ?: platform + ], reads ] + } + } + + // Metadata for MultiQC + fetch_tool_metadata = FETCHTOOL_READS.out.metadata_tsv.map { it[1] }.collectFile( + name: 'fetch_tool_mqc.tsv', + newLine: true, + keepHeader: true, + skip: 1 + ) + } + + LONG_READS_QC ( + fetch_reads_transformed, + params.reference_genome + ) + ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) + + /*********************************************************************************/ + /* Selecting the combination of adapter trimming, assembler, and post-processing */ + /*********************************************************************************/ + /* + The selection process ensures that: + - The user selected assembler configuration is always used (either from the samplesheet assembler column (with precedence) or the params.assembler) + - Low-quality ONT reads are trimmed with canu and assembled with flye --nano-corr/raw), unless specified otherwise. + - High-quality ONT reads are trimmed with porechob_abi and assembled with flye --nano-hq), unless specified otherwise. + - Low-quality pacbio reads are trimmed with canu and assembled with flye --pacbio-corr/raw), unless specified otherwise. + - High-quality pacbio reads are trimmed with HiFiAdapterFilt and assembled with flye --pacbio-hifi), unless specified otherwise. + Extra polishing steps are applied to low-quality reads. All subworkflows also apply post-assembly host decontamination. 
+ */ + + reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> + if (meta.platform == "ont") { + if (params.assembler_config == "nano-raw" || meta.quality == "low") { + return [meta + ["assembler_config": "nano-raw"], reads] + } else if (params.assembler_config == "nano-hq" || meta.quality == "high") { + return [meta + ["assembler_config": "nano-hq"], reads] + } + } else if (meta.platform == "pacbio") { + if (params.assembler_config == "pacbio-raw" || meta.quality == "low") { + return [meta + ["assembler_config": "pacbio-raw"], reads] + } else if (params.assembler_config == "pacbio-hifi" || meta.quality == "high") { + return [meta + ["assembler_config": "pacbio-hifi"], reads] + } + } else { + error "Incompatible configuration" + } + } + + reads_assembler_config.branch { meta, reads -> + lq_ont: meta.assembler_config == "nano-raw" + hq_ont: meta.assembler_config == "pacbio-raw" + lq_pacbio: meta.assembler_config == "nano-hq" + hq_pacbio: meta.assembler_config == "pacbio-hifi" + }.set {subworkflow_platform_reads} + + ONT_LQ( + subworkflow_platform_reads.lq_ont + ) + + ONT_HQ( + subworkflow_platform_reads.hq_ont + ) + + // PACBIO_LQ( + // subworkflow_platform_reads.lq_pacbio.map { meta, reads -> [meta, reads] } + // ) + + // PACBIO_HIFI( + // subworkflow_platform_reads.hq_pacbio.map { meta, reads -> [meta, reads] } + // ) + + assembly = ONT_LQ.out.contigs.mix( ONT_HQ.out.contigs )//, PACBIO_LQ.out.contigs, PACBIO_HIFI.out.contigs ) + + /*************************************/ + /* Post-assembly: coverage and stats */ + /*************************************/ + + // + // MODULE: Run FastQC + // + // FASTQC ( + // INPUT_CHECK.out.reads + // ) + // ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + // CUSTOM_DUMPSOFTWAREVERSIONS ( + // ch_versions.unique().collectFile(name: 'collated_versions.yml') + // ) + + // + // MODULE: MultiQC + // + // workflow_summary = WorkflowLongreadsassembly.paramsSummaryMultiqc(workflow, summary_params) + // ch_workflow_summary = Channel.value(workflow_summary) + + // methods_description = WorkflowLongreadsassembly.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + // ch_methods_description = Channel.value(methods_description) + + // ch_multiqc_files = Channel.empty() + // ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + // ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + // ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + // ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + // MULTIQC ( + // ch_multiqc_files.collect(), + // ch_multiqc_config.toList(), + // ch_multiqc_custom_config.toList(), + // ch_multiqc_logo.toList() + // ) + // multiqc_report = MULTIQC.out.report.toList() +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ From 76e8d011d9caef7acdcef4b8b0a23df24102cc42 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 3 Sep 2024 14:01:55 +0100 Subject: [PATCH 02/33] Adapt params to short/long reads workflows --- README.md | 17 +++-- conf/codon_slurm.config | 1 + conf/test.config | 4 +- modules/nf-core/quast/main.nf | 4 +- modules/nf-core/quast/quast.diff | 4 +- modules/nf-core/seqkit/seq/main.nf | 4 +- modules/nf-core/seqkit/seq/seqkit-seq.diff | 4 +- 
nextflow.config | 89 ++++++++++++---------- nextflow_schema.json | 24 +++--- subworkflows/local/assembly_qc.nf | 4 +- subworkflows/local/long_reads_qc.nf | 8 +- tests/main.nf.test | 7 ++ workflows/longreadassembler.nf | 32 ++++---- workflows/miassembler.nf | 14 ++-- 14 files changed, 115 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index f0809fd..64ad609 100644 --- a/README.md +++ b/README.md @@ -37,18 +37,21 @@ Input/output options --library_layout [string] Force the library_layout value for the study / reads (accepted: single, paired) --spades_version [string] null [default: 3.15.5] --megahit_version [string] null [default: 1.2.9] - --reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics + --flye_version [string] null [default: 2.9] + --host_reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna, honeybee.fna, rainbow_trout.fna, ...) --blast_reference_genomes_folder [string] The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal directory. --bwamem2_reference_genomes_folder [string] The folder with the reference genome bwa-mem2 indexes, defaults to the Microbiome Informatics internal + + --reference_genomes_folder [string] The folder with reference genomes, defaults to the Microbiome Informatics internal directory. --remove_human_phix [boolean] Remove human and phiX reads pre assembly, and contigs matching those genomes. [default: true] --human_phix_blast_index_name [string] Combined Human and phiX BLAST db. [default: human_phix] --human_phix_bwamem2_index_name [string] Combined Human and phiX bwa-mem2 index. [default: human_phix] - --min_contig_length [integer] Minimum contig length filter. [default: 500] - --min_contig_length_metatranscriptomics [integer] Minimum contig length filter for metaT. [default: 200] + --short_reads_min_contig_length [integer] Minimum contig length filter. [default: 500] + --short_reads_min_contig_length_metat [integer] Minimum contig length filter for metaT. [default: 200] --assembly_memory [integer] Default memory allocated for the assembly process. [default: 100] --spades_only_assembler [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true] --outdir [string] The output directory where the results will be saved. You have to use absolute paths to storage on Cloud @@ -66,7 +69,7 @@ Example: nextflow run ebi-metagenomics/miassembler \ -profile codon_slurm \ --assembler metaspades \ - --reference_genome human \ + --host_reference_genome human \ --outdir testing_results \ --study_accession SRP002480 \ --reads_accession SRR1631361 @@ -182,15 +185,15 @@ Runs that fail QC checks are excluded from the assembly process. These runs are Example: ```csv -SRR6180434,filter_ratio_threshold_exceeded +SRR6180434,short_reads_filter_ratio_threshold_exceeded ``` ##### Runs exclusion messages | Exclusion Message | Description | | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. 
If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. | -| `low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | +| `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. | +| `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | #### Assembled Runs diff --git a/conf/codon_slurm.config b/conf/codon_slurm.config index 541a69d..7fb4789 100644 --- a/conf/codon_slurm.config +++ b/conf/codon_slurm.config @@ -1,4 +1,5 @@ params { + reference_genomes_folder = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bwa-mem2/" bwamem2_reference_genomes_folder = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bwa-mem2/" blast_reference_genomes_folder = "/nfs/production/rdf/metagenomics/pipelines/prod/assembly-pipeline/blast_dbs/" human_phix_blast_index_name = "human_phix" diff --git a/conf/test.config b/conf/test.config index 421e7f7..60db88e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,12 +18,12 @@ profiles { max_memory = '6.GB' max_time = '6.h' + reference_genomes_folder = "tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "tests/human_phix/bwa2mem" blast_reference_genomes_folder = "tests/human_phix/blast" human_phix_blast_index_name = "human_phix" human_phix_bwamem2_index_name = "human_phix" - human_blast_index_name = "human" - human_bwamem2_index_name = "human" + human_fasta_prefix = "human" } } } diff --git a/modules/nf-core/quast/main.nf b/modules/nf-core/quast/main.nf index ce9befd..da16b9f 100644 --- a/modules/nf-core/quast/main.nf +++ b/modules/nf-core/quast/main.nf @@ -26,9 +26,9 @@ process QUAST { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def min_contig_len = "--min-contig ${params.min_contig_length}" + def min_contig_len = "--min-contig ${params.short_reads_min_contig_length}" if ( meta.library_strategy == "metatranscriptomics" ) { - min_contig_len = "--min-contig ${params.min_contig_length_metatranscriptomics}" + min_contig_len = "--min-contig ${params.short_reads_min_contig_length_metat}" } def features = gff ? "--features $gff" : '' def reference = fasta ? "-r $fasta" : '' diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff index 7d48832..bfaf013 100644 --- a/modules/nf-core/quast/quast.diff +++ b/modules/nf-core/quast/quast.diff @@ -14,9 +14,9 @@ Changes in module 'nf-core/quast' script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" -+ def min_contig_len = "--min-contig ${params.min_contig_length}" ++ def min_contig_len = "--min-contig ${params.short_reads_min_contig_length}" + if ( meta.library_strategy == "metatranscriptomics" ) { -+ min_contig_len = "--min-contig ${params.min_contig_length_metatranscriptomics}" ++ min_contig_len = "--min-contig ${params.short_reads_min_contig_length_metat}" + } def features = gff ? "--features $gff" : '' def reference = fasta ? 
"-r $fasta" : '' diff --git a/modules/nf-core/seqkit/seq/main.nf b/modules/nf-core/seqkit/seq/main.nf index a6a05b7..4a1d0f3 100644 --- a/modules/nf-core/seqkit/seq/main.nf +++ b/modules/nf-core/seqkit/seq/main.nf @@ -19,9 +19,9 @@ process SEQKIT_SEQ { task.ext.when == null || task.ext.when script: - def min_len = params.min_contig_length + def min_len = params.short_reads_min_contig_length if ( meta.library_strategy == "metatranscriptomic" ) { - min_len = params.min_contig_length_metatranscriptomics + min_len = params.short_reads_min_contig_length_metat } def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' diff --git a/modules/nf-core/seqkit/seq/seqkit-seq.diff b/modules/nf-core/seqkit/seq/seqkit-seq.diff index 168ac0b..af070e2 100644 --- a/modules/nf-core/seqkit/seq/seqkit-seq.diff +++ b/modules/nf-core/seqkit/seq/seqkit-seq.diff @@ -5,9 +5,9 @@ Changes in module 'nf-core/seqkit/seq' task.ext.when == null || task.ext.when script: -+ def min_len = params.min_contig_length ++ def min_len = params.short_reads_min_contig_length + if ( meta.library_strategy == "metatranscriptomic" ) { -+ min_len = params.min_contig_length_metatranscriptomics ++ min_len = params.short_reads_min_contig_length_metat + } def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' diff --git a/nextflow.config b/nextflow.config index 8791b13..ad7e5d6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,14 +17,51 @@ params { study_accession = null reads_accession = null private_study = false - min_read_length = 200 // For already fetched data samplesheet = null - /* - * Assembler options, by default the pipeline will pick - * - metaspades for pair-end - * - megahit for single-end + + // The pipeline will use the metadata from ENA (obtained by the fetch_tool) + // As the metadata can be incorrect, we provide the following parameters to + // "force" them + single_end = null + library_layout = null + library_strategy = null + platform = null + + // QC FILTERING + + // Short reads options + short_reads_filter_ratio_threshold = 0.9 + short_reads_low_reads_count_threshold = 1000 + + // Long reads options + long_read_min_read_length = 200 + + // Reference genome name (to select from list) + bwamem2_reference_genomes_folder = "" + blast_reference_genomes_folder = "" + host_reference_genome = null + + // Short-read sequences and assemblies are + // automatically polished from human and phix seqs + // Both blast and bwa indices are needed + remove_human_phix = true + human_phix_blast_index_name = "human_phix" + human_phix_bwamem2_index_name = "human_phix" + + // Long-read assemblies don't require phiX + // nor indices, just a fasta file + reference_genomes_folder = null + remove_human = true + human_fasta_prefix = "human" + + // ASSEMBLY + + /* By default the pipeline will pick + * - metaspades for paired-end short reads + * - megahit for single-end short reads + * - flye for long reads * * Setting --assembler will force the assembler * @@ -43,43 +80,13 @@ params { * pacbio, and if data quality is high or low) */ assembler = null - assembler_config = null - - // The pipeline will use the metadata from ENA (obtained by the fetch_tool) - // As the metadata can be incorrect, we provide the following parameters to - // "force" them - single_end = null - library_layout = null - library_strategy = null - platform = null - - // Reads QC filtering options - filter_ratio_threshold = 0.9 - low_reads_count_threshold = 1000 - - // Reference genome - reference_genome = null - - /* - * Long-read assemblies won't 
require phiX, - * parameters should be defined as follows: - * remove_human = true - * human_blast_index_name = "human" - * human_bwamem2_index_name = "human" - * Need to integrate them - */ - remove_human_phix = true - human_phix_blast_index_name = "human_phix" - human_phix_bwamem2_index_name = "human_phix" - - bwamem2_reference_genomes_folder = "" - blast_reference_genomes_folder = "" // Assembly options - spades_only_assembler = true - min_contig_length = 500 - min_contig_length_metatranscriptomics = 200 - assembly_memory = 100 + spades_only_assembler = true + short_reads_min_contig_length = 500 + short_reads_min_contig_length_metat = 200 + long_read_assembler_config = null + assembly_memory = 100 // MultiQC options multiqc_config = null @@ -108,7 +115,7 @@ params { // Assembler versions spades_version = "3.15.5" megahit_version = "1.2.9" - flye_version = "2.9" + flye_version = "2.9" } diff --git a/nextflow_schema.json b/nextflow_schema.json index 541ee4d..d592847 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,10 +11,6 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "blast_reference_genomes_folder", - "bwamem2_reference_genomes_folder", - "human_phix_blast_index_name", - "human_phix_bwamem2_index_name", "outdir" ], "properties": { @@ -54,7 +50,7 @@ "enum": ["spades", "metaspades", "megahit", "flye"], "description": "The short or long reads assembler" }, - "assembler_config": { + "long_read_assembler_config": { "type": "string", "description": "Configuration to use flye with. Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", "default": "" @@ -90,7 +86,7 @@ "type": "string", "default": "1.2.9" }, - "reference_genome": { + "host_reference_genome": { "type": "string", "description": "The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory", "enum": [ @@ -143,20 +139,20 @@ "description": "Combined Human and phiX bwa-mem2 index.", "default": "human_phix" }, - "human_bwamem2_index_name": { + "human_fasta_prefix": { "type": "string", - "description": "Human bwa-mem2 index.", + "description": "Human prefix name.", "default": "human" }, - "min_contig_length": { + "short_reads_min_contig_length": { "type": "integer", "default": 500, - "description": "Minimum contig length filter." + "description": "Minimum contig length filter for short reads." }, - "min_contig_length_metatranscriptomics": { + "short_reads_min_contig_length_metat": { "type": "integer", "default": 200, - "description": "Minimum contig length filter for metaT." + "description": "Minimum contig length filter for short reads metaT." }, "assembly_memory": { "type": "integer", @@ -196,14 +192,14 @@ "description": "Set the thresholds for the reads QC/filtering steps. Reads that fail QC won't be assembled.", "help_text": "Use these options to define the quality control thresholds for your reads. You can specify the maximum allowed filtering ratio and the minimum acceptable read count. If the filtering ratio exceeds the set limit or the read count falls below the threshold, the reads will be flagged and excluded from further assembly. The information about those runs that failed are aggregated in the qc_failed_runs.csv file.", "properties": { - "filter_ratio_threshold": { + "short_reads_filter_ratio_threshold": { "type": "number", "description": "The maximum fraction of reads that are allowed to be filtered out. 
If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled.", "default": 0.9, "minimum": 0.0, "maximum": 1.0 }, - "low_reads_count_threshold": { + "short_reads_low_reads_count_threshold": { "type": "number", "description": "The minimum number of reads required after filtering. If below, it flags a low read count and the run is not assembled.", "default": 1000 diff --git a/subworkflows/local/assembly_qc.nf b/subworkflows/local/assembly_qc.nf index f5bfa7d..e96a475 100644 --- a/subworkflows/local/assembly_qc.nf +++ b/subworkflows/local/assembly_qc.nf @@ -22,13 +22,13 @@ workflow ASSEMBLY_QC { take: assembly // [ val(meta), path(assembly_fasta) ] - host_reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome + host_reference_genome // [ val(meta2), path(host_reference_genome) ] | meta2 contains the name of the reference genome main: ch_versions = Channel.empty() - /* Len filter using the parameter "min_contig_length" */ + /* Len filter using the parameter "short_reads_min_contig_length" */ SEQKIT_SEQ( assembly ) diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index 13635ba..5ef54dd 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -30,7 +30,7 @@ workflow LONG_READS_QC { if ( params.remove_human ) { - ch_bwamem2_human_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${params.human_blast_index_name}.fna", checkIfExists: true) + human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.fna", checkIfExists: true) .collect().map { files -> [ ["id": params.human_blast_index_name], files ] } @@ -39,7 +39,7 @@ workflow LONG_READS_QC { HUMAN_DECONTAMINATION( FASTP.out.reads, - ch_bwamem2_human_refs, + human_reference, "human", true, "bai", @@ -57,14 +57,14 @@ workflow LONG_READS_QC { if ( host_reference_genome != null ) { - ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + host_reference = Channel.fromPath( "${params.reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) .collect().map { files -> [ ["id": host_reference_genome], files ] } HOST_DECONTAMINATION( decontaminated_reads, - ch_bwamem2_host_refs, + host_reference, "host", true, "bai", diff --git a/tests/main.nf.test b/tests/main.nf.test index 06e3213..ed59d5b 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -11,6 +11,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" samplesheet = "${projectDir}/tests/samplesheet/test.csv" @@ -35,6 +36,7 @@ nextflow_pipeline { params { outdir = "tests/results" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "SRP115494" @@ -63,6 +65,7 @@ nextflow_pipeline { assembler = "megahit" study_accession = "SRP115494" reads_accession = "SRR6180434" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" 
bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" } @@ -87,6 +90,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "metaspades" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "ERP012810" @@ -111,6 +115,7 @@ nextflow_pipeline { when { params { outdir = "tests/results" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "ERP012810" @@ -133,6 +138,7 @@ nextflow_pipeline { when { params { outdir = "tests/results" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "DRP007622" @@ -159,6 +165,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" + reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "DRP007622" diff --git a/workflows/longreadassembler.nf b/workflows/longreadassembler.nf index 227e62d..7b39717 100644 --- a/workflows/longreadassembler.nf +++ b/workflows/longreadassembler.nf @@ -76,7 +76,7 @@ workflow LONGREADSASSEMBLY { if ( params.samplesheet ) { - longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, assembler_config, assembly_memory -> + longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, long_read_assembler_config, assembly_memory -> return tuple( [ "id": reads_accession, @@ -85,7 +85,7 @@ workflow LONGREADSASSEMBLY { "library_layout": library_layout, "single_end": true, "assembler": assembler ?: params.assembler, - "assembler_config": assembler_config ?: params.assembler_config, + "long_read_assembler_config": long_read_assembler_config ?: params.long_read_assembler_config, "assembly_memory": assembly_memory ?: params.assembly_memory ], [fq1] @@ -117,7 +117,7 @@ workflow LONGREADSASSEMBLY { // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, "assembly_memory": params.assembly_memory, - "assembler_config": params.assembler_config, + "long_read_assembler_config": params.long_read_assembler_config, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, "single_end": params.single_end ?: library_layout == "single", @@ -137,7 +137,7 @@ workflow LONGREADSASSEMBLY { LONG_READS_QC ( fetch_reads_transformed, - params.reference_genome + params.host_reference_genome ) ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) @@ -156,16 +156,16 @@ workflow LONGREADSASSEMBLY { reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> if (meta.platform == "ont") { - if (params.assembler_config == "nano-raw" || meta.quality == "low") { - return [meta + ["assembler_config": "nano-raw"], reads] - } else if (params.assembler_config == "nano-hq" || meta.quality == "high") { - return [meta + ["assembler_config": 
"nano-hq"], reads] + if (params.long_read_assembler_config == "nano-raw" || meta.quality == "low") { + return [meta + ["long_read_assembler_config": "nano-raw"], reads] + } else if (params.long_read_assembler_config == "nano-hq" || meta.quality == "high") { + return [meta + ["long_read_assembler_config": "nano-hq"], reads] } } else if (meta.platform == "pacbio") { - if (params.assembler_config == "pacbio-raw" || meta.quality == "low") { - return [meta + ["assembler_config": "pacbio-raw"], reads] - } else if (params.assembler_config == "pacbio-hifi" || meta.quality == "high") { - return [meta + ["assembler_config": "pacbio-hifi"], reads] + if (params.long_read_assembler_config == "pacbio-raw" || meta.quality == "low") { + return [meta + ["long_read_assembler_config": "pacbio-raw"], reads] + } else if (params.long_read_assembler_config == "pacbio-hifi" || meta.quality == "high") { + return [meta + ["long_read_assembler_config": "pacbio-hifi"], reads] } } else { error "Incompatible configuration" @@ -173,10 +173,10 @@ workflow LONGREADSASSEMBLY { } reads_assembler_config.branch { meta, reads -> - lq_ont: meta.assembler_config == "nano-raw" - hq_ont: meta.assembler_config == "pacbio-raw" - lq_pacbio: meta.assembler_config == "nano-hq" - hq_pacbio: meta.assembler_config == "pacbio-hifi" + lq_ont: meta.long_read_assembler_config == "nano-raw" + hq_ont: meta.long_read_assembler_config == "pacbio-raw" + lq_pacbio: meta.long_read_assembler_config == "nano-hq" + hq_pacbio: meta.long_read_assembler_config == "pacbio-hifi" }.set {subworkflow_platform_reads} ONT_LQ( diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 69a80d6..f4e62ec 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -182,7 +182,7 @@ workflow MIASSEMBLER { READS_QC( fetch_reads_transformed, - params.reference_genome + params.host_reference_genome ) FASTQC_AFTER ( @@ -199,8 +199,8 @@ workflow MIASSEMBLER { bf_total_reads = json_txt?.summary?.before_filtering?.total_reads ?: 0; af_total_reads = json_txt?.summary?.after_filtering?.total_reads ?: 0; reads_qc_meta = [ - "low_reads_count": af_total_reads <= params.low_reads_count_threshold, - "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.filter_ratio_threshold ) + "short_reads_low_reads_count": af_total_reads <= params.short_reads_low_reads_count_threshold, + "short_reads_filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.short_reads_filter_ratio_threshold ) ] return [meta, reads_qc_meta] } @@ -210,7 +210,7 @@ workflow MIASSEMBLER { extended_reads_qc.branch { meta, reads, reads_qc_meta -> // Filter out failed reads // - qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.filter_ratio_threshold_exceeded + qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.short_reads_filter_ratio_threshold_exceeded megahit: meta.assembler == "megahit" xspades: ["metaspades", "spades"].contains(meta.assembler) }.set { qc_filtered_reads } @@ -239,7 +239,7 @@ workflow MIASSEMBLER { // Clean the assembly contigs // ASSEMBLY_QC( assembly, - params.reference_genome + params.host_reference_genome ) ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions) @@ -370,8 +370,8 @@ workflow MIASSEMBLER { if ( extended_meta.low_reads_count ) { return "${meta.id},low_reads_count" } - if ( extended_meta.filter_ratio_threshold_exceeded ) { - return "${meta.id},filter_ratio_threshold_exceeded" + if ( extended_meta.short_reads_filter_ratio_threshold_exceeded ) { + return 
"${meta.id},short_reads_filter_ratio_threshold_exceeded" } error "Unexpected. meta: ${meta}, extended_meta: ${extended_meta}" } From bf547ffcc4b97e56a6fc9d34aef4f8f4121e387e Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 3 Sep 2024 14:17:53 +0100 Subject: [PATCH 03/33] Debug missed parameters --- conf/modules.config | 2 +- nextflow.config | 10 +++++----- nextflow_schema.json | 4 ++-- workflows/longreadassembler.nf | 30 +++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8cf286e..55eec1d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { '--average_qual', '10', '--length_required', - "${params.min_read_length}", + "${params.long_reads_min_read_length}", '--disable_adapter_trimming' ].join(' ').trim() } diff --git a/nextflow.config b/nextflow.config index ad7e5d6..9940d9a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -36,7 +36,7 @@ params { short_reads_low_reads_count_threshold = 1000 // Long reads options - long_read_min_read_length = 200 + long_reads_min_read_length = 200 // Reference genome name (to select from list) bwamem2_reference_genomes_folder = "" @@ -54,10 +54,10 @@ params { // nor indices, just a fasta file reference_genomes_folder = null remove_human = true - human_fasta_prefix = "human" + human_fasta_prefix = "human" // ASSEMBLY - + /* By default the pipeline will pick * - metaspades for paired-end short reads * - megahit for single-end short reads @@ -75,7 +75,7 @@ params { * - Memory >1TB * - Runtime >3-4 days * - * - flye: Use for any long-read assembly. assembler_config + * - flye: Use for any long-read assembly. long_reads_assembler_config * should be selected depending on input data (if ONT or * pacbio, and if data quality is high or low) */ @@ -85,7 +85,7 @@ params { spades_only_assembler = true short_reads_min_contig_length = 500 short_reads_min_contig_length_metat = 200 - long_read_assembler_config = null + long_reads_assembler_config = null assembly_memory = 100 // MultiQC options diff --git a/nextflow_schema.json b/nextflow_schema.json index d592847..2fbd699 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -36,7 +36,7 @@ "fa_icon": "far fa-address-card", "minLength": 3 }, - "min_read_length": { + "long_reads_min_read_length": { "type": "integer", "description": "Minimum read length for pre-assembly quality filtering", "default": 200 @@ -50,7 +50,7 @@ "enum": ["spades", "metaspades", "megahit", "flye"], "description": "The short or long reads assembler" }, - "long_read_assembler_config": { + "long_reads_assembler_config": { "type": "string", "description": "Configuration to use flye with. 
Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", "default": "" diff --git a/workflows/longreadassembler.nf b/workflows/longreadassembler.nf index 7b39717..dfd7bec 100644 --- a/workflows/longreadassembler.nf +++ b/workflows/longreadassembler.nf @@ -76,7 +76,7 @@ workflow LONGREADSASSEMBLY { if ( params.samplesheet ) { - longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, long_read_assembler_config, assembly_memory -> + longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, long_reads_assembler_config, assembly_memory -> return tuple( [ "id": reads_accession, @@ -85,7 +85,7 @@ workflow LONGREADSASSEMBLY { "library_layout": library_layout, "single_end": true, "assembler": assembler ?: params.assembler, - "long_read_assembler_config": long_read_assembler_config ?: params.long_read_assembler_config, + "long_reads_assembler_config": long_reads_assembler_config ?: params.long_reads_assembler_config, "assembly_memory": assembly_memory ?: params.assembly_memory ], [fq1] @@ -117,7 +117,7 @@ workflow LONGREADSASSEMBLY { // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, "assembly_memory": params.assembly_memory, - "long_read_assembler_config": params.long_read_assembler_config, + "long_reads_assembler_config": params.long_reads_assembler_config, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, "single_end": params.single_end ?: library_layout == "single", @@ -156,16 +156,16 @@ workflow LONGREADSASSEMBLY { reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> if (meta.platform == "ont") { - if (params.long_read_assembler_config == "nano-raw" || meta.quality == "low") { - return [meta + ["long_read_assembler_config": "nano-raw"], reads] - } else if (params.long_read_assembler_config == "nano-hq" || meta.quality == "high") { - return [meta + ["long_read_assembler_config": "nano-hq"], reads] + if (params.long_reads_assembler_config == "nano-raw" || meta.quality == "low") { + return [meta + ["long_reads_assembler_config": "nano-raw"], reads] + } else if (params.long_reads_assembler_config == "nano-hq" || meta.quality == "high") { + return [meta + ["long_reads_assembler_config": "nano-hq"], reads] } } else if (meta.platform == "pacbio") { - if (params.long_read_assembler_config == "pacbio-raw" || meta.quality == "low") { - return [meta + ["long_read_assembler_config": "pacbio-raw"], reads] - } else if (params.long_read_assembler_config == "pacbio-hifi" || meta.quality == "high") { - return [meta + ["long_read_assembler_config": "pacbio-hifi"], reads] + if (params.long_reads_assembler_config == "pacbio-raw" || meta.quality == "low") { + return [meta + ["long_reads_assembler_config": "pacbio-raw"], reads] + } else if (params.long_reads_assembler_config == "pacbio-hifi" || meta.quality == "high") { + return [meta + ["long_reads_assembler_config": "pacbio-hifi"], reads] } } else { error "Incompatible configuration" @@ -173,10 +173,10 @@ workflow LONGREADSASSEMBLY { } reads_assembler_config.branch { meta, reads -> - lq_ont: meta.long_read_assembler_config == "nano-raw" - hq_ont: meta.long_read_assembler_config == "pacbio-raw" - lq_pacbio: meta.long_read_assembler_config == "nano-hq" - hq_pacbio: meta.long_read_assembler_config == "pacbio-hifi" + lq_ont: meta.long_reads_assembler_config == "nano-raw" + hq_ont: meta.long_reads_assembler_config == "pacbio-raw" + 
lq_pacbio: meta.long_reads_assembler_config == "nano-hq" + hq_pacbio: meta.long_reads_assembler_config == "pacbio-hifi" }.set {subworkflow_platform_reads} ONT_LQ( From 7982339921c872ecee30354c3a0a0503c5049408 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 3 Sep 2024 14:42:27 +0100 Subject: [PATCH 04/33] Remove extra parameters from tests --- tests/main.nf.test | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/main.nf.test b/tests/main.nf.test index ed59d5b..06e3213 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -11,7 +11,6 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" samplesheet = "${projectDir}/tests/samplesheet/test.csv" @@ -36,7 +35,6 @@ nextflow_pipeline { params { outdir = "tests/results" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "SRP115494" @@ -65,7 +63,6 @@ nextflow_pipeline { assembler = "megahit" study_accession = "SRP115494" reads_accession = "SRR6180434" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" } @@ -90,7 +87,6 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "metaspades" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "ERP012810" @@ -115,7 +111,6 @@ nextflow_pipeline { when { params { outdir = "tests/results" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "ERP012810" @@ -138,7 +133,6 @@ nextflow_pipeline { when { params { outdir = "tests/results" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "DRP007622" @@ -165,7 +159,6 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" study_accession = "DRP007622" From 70bef0228729f6533e6e0899626359930ef5e70a Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 3 Sep 2024 14:46:40 +0100 Subject: [PATCH 05/33] Remove extra parameter from test --- conf/test.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 60db88e..223443f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,7 +18,6 @@ profiles { max_memory = '6.GB' max_time = '6.h' - reference_genomes_folder = "tests/human_phix/bwa2mem" bwamem2_reference_genomes_folder = "tests/human_phix/bwa2mem" blast_reference_genomes_folder = "tests/human_phix/blast" human_phix_blast_index_name = 
"human_phix" From 120d5fe8467d724d5b8bfe4a92db9b090ffa19b3 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 3 Sep 2024 14:49:16 +0100 Subject: [PATCH 06/33] Add ref genome folder to schema --- nextflow_schema.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index 2fbd699..d585895 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -104,6 +104,11 @@ "zebrafish.fna" ] }, + "reference_genomes_folder": { + "type": "string", + "description": "The folder with the reference genomes, defaults to the Microbiome Informatics internal directory.", + "format": "directory-path" + }, "blast_reference_genomes_folder": { "type": "string", "description": "The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal directory.", From 2c2faa9c14e7b949d8708217f96aaaf48c1decc9 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Tue, 3 Sep 2024 16:29:54 +0100 Subject: [PATCH 07/33] WIP - Restructure the pipeline to support Long Reads and Short Reads --- assets/schema_input.json | 6 + conf/modules.config | 4 +- conf/puthi.config | 30 +++ nextflow.config | 8 +- subworkflows/local/long_reads_qc.nf | 1 + ...ge.nf => short_reads_assembly_coverage.nf} | 2 +- ...embly_qc.nf => short_reads_assembly_qc.nf} | 2 +- .../local/{reads_qc.nf => short_reads_qc.nf} | 2 +- tests/samplesheet/test.csv | 8 +- ...adassembler.nf => long_reads_assembler.nf} | 142 ++++-------- workflows/miassembler.nf | 212 ++++++------------ workflows/short_reads_assembler.nf | 180 +++++++++++++++ 12 files changed, 347 insertions(+), 250 deletions(-) create mode 100644 conf/puthi.config rename subworkflows/local/{assembly_coverage.nf => short_reads_assembly_coverage.nf} (97%) rename subworkflows/local/{assembly_qc.nf => short_reads_assembly_qc.nf} (98%) rename subworkflows/local/{reads_qc.nf => short_reads_qc.nf} (98%) rename workflows/{longreadassembler.nf => long_reads_assembler.nf} (57%) create mode 100644 workflows/short_reads_assembler.nf diff --git a/assets/schema_input.json b/assets/schema_input.json index 84444d1..5da904b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -47,6 +47,9 @@ "enum": ["metagenomic", "metatranscriptomic", "genomic", "transcriptomic", "other"], "errorMessage": "library strategy should be only value from list: 'metagenomic', 'metatranscriptomic', 'genomic', 'transcriptomic', 'other'" }, + "platform": { + "type": "string" + }, "assembler": { "type": "string", "enum": ["spades", "metaspades", "megahit"], @@ -57,6 +60,9 @@ "type": "integer", "default": null, "description": "Default memory (in GB) allocated for the assembly process for the run." 
+ }, + "assembler_config": { + "type": "string" } }, "required": ["study_accession", "reads_accession", "fastq_1", "library_layout", "library_strategy"] diff --git a/conf/modules.config b/conf/modules.config index 8cf286e..2668475 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -146,7 +146,7 @@ process { memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } - + /* --------- */ /* Assembly */ @@ -383,6 +383,8 @@ process { return null; } def output_file = new File(filename); + println ("COSO") + println (meta) return "${study_reads_folder( meta )}/assembly/${meta.assembler}/${meta.assembler_version}/qc/multiqc/${output_file.name}"; } } diff --git a/conf/puthi.config b/conf/puthi.config new file mode 100644 index 0000000..a5c1e69 --- /dev/null +++ b/conf/puthi.config @@ -0,0 +1,30 @@ +params { + bwamem2_reference_genomes_folder = "/projappl/project_2010686/ebi/reference_dbs/bwamem2" + blast_reference_genomes_folder = "/projappl/project_2010686/ebi/reference_dbs/blast" + human_phix_blast_index_name = "human_phix" + human_phix_bwamem2_index_name = "human_phix" +} + +executor { + name = "slurm" + queueSize = 200 + queueGlobalStatus = true + submitRateLimit = "10 sec" + pollInterval = "10 sec" +} + +conda.enabled = false + +// If true, on a successful completion of a run all files in work directory are automatically deleted. +cleanup = true + +singularity { + enabled = true + autoMounts = true + cacheDir = "/projappl/project_2010686/ebi/singularity_cache" +} + +conda.enabled = false + +// If true, on a successful completion of a run all files in work directory are automatically deleted. +cleanup = true diff --git a/nextflow.config b/nextflow.config index 8791b13..41c103f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ params { * - Runtime >3-4 days * * - flye: Use for any long-read assembly. 
assembler_config - * should be selected depending on input data (if ONT or + * should be selected depending on input data (if ONT or * pacbio, and if data quality is high or low) */ assembler = null @@ -61,7 +61,7 @@ params { reference_genome = null /* - * Long-read assemblies won't require phiX, + * Long-read assemblies won't require phiX, * parameters should be defined as follows: * remove_human = true * human_blast_index_name = "human" @@ -216,8 +216,8 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { - includeConfig 'conf/test.config' + test { + includeConfig 'conf/test.config' } codon_slurm { includeConfig 'conf/codon_slurm.config' } } diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index 13635ba..9d2f491 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -4,6 +4,7 @@ include { MINIMAP2_ALIGN as HUMAN_DECONTAMINATION } from '../../modules/nf-core/ include { MINIMAP2_ALIGN as HOST_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' workflow LONG_READS_QC { + take: reads // [ val(meta), path(reads) ] host_reference_genome // [ val(meta2), path(reference_genome) ] diff --git a/subworkflows/local/assembly_coverage.nf b/subworkflows/local/short_reads_assembly_coverage.nf similarity index 97% rename from subworkflows/local/assembly_coverage.nf rename to subworkflows/local/short_reads_assembly_coverage.nf index cfd6698..194c8fc 100644 --- a/subworkflows/local/assembly_coverage.nf +++ b/subworkflows/local/short_reads_assembly_coverage.nf @@ -3,7 +3,7 @@ include { BWAMEM2_MEM as BWAMEM2_MEM_COVERAGE } from '../../modules/ebi-metagen include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/samtools/idxstats/main' include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../modules/nf-core/metabat2/jgisummarizebamcontigdepths/main' -workflow ASSEMBLY_COVERAGE { +workflow SHORT_READS_ASSEMBLY_COVERAGE { take: assembly_reads // [ val(meta), path(assembly_fasta), path(reads) ] diff --git a/subworkflows/local/assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf similarity index 98% rename from subworkflows/local/assembly_qc.nf rename to subworkflows/local/short_reads_assembly_qc.nf index f5bfa7d..fc54f89 100644 --- a/subworkflows/local/assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -18,7 +18,7 @@ process PUBLISH_CLEANED_CONTIGS { """ } -workflow ASSEMBLY_QC { +workflow SHORT_READS_ASSEMBLY_QC { take: assembly // [ val(meta), path(assembly_fasta) ] diff --git a/subworkflows/local/reads_qc.nf b/subworkflows/local/short_reads_qc.nf similarity index 98% rename from subworkflows/local/reads_qc.nf rename to subworkflows/local/short_reads_qc.nf index 4cbbbe6..7dbd198 100644 --- a/subworkflows/local/reads_qc.nf +++ b/subworkflows/local/short_reads_qc.nf @@ -2,7 +2,7 @@ include { FASTP } from '../../module include { BWAMEM2DECONTNOBAMS as HUMAN_PHIX_DECONTAMINATION } from '../../modules/ebi-metagenomics/bwamem2decontnobams/main' include { BWAMEM2DECONTNOBAMS as HOST_DECONTAMINATION } from '../../modules/ebi-metagenomics/bwamem2decontnobams/main' -workflow READS_QC { +workflow SHORT_READS_QC { take: reads // [ val(meta), path(reads) ] diff --git a/tests/samplesheet/test.csv b/tests/samplesheet/test.csv index b2c4b99..fab7d69 100644 --- a/tests/samplesheet/test.csv +++ b/tests/samplesheet/test.csv @@ -1,4 +1,4 @@ -study_accession,reads_accession,fastq_1,fastq_2,library_layout,library_strategy,assembler,assembly_memory 
-SRP115494,SRR6180434,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_2.fastq.gz,paired,metagenomic,, -SRP115494,SRR5949318,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_2.fastq.gz,paired,metagenomic,, -DRP007622,DRR280712,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/DRR280712.fastq.gz,,single,metatranscriptomic,megahit, +study_accession,reads_accession,fastq_1,fastq_2,library_layout,library_strategy,platform,assembler,assembly_memory,assembler_config +SRP115494,SRR6180434,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_2.fastq.gz,paired,metagenomic,,,,, +SRP115494,SRR5949318,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_2.fastq.gz,paired,metagenomic,,,,, +DRP007622,DRR280712,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/DRR280712.fastq.gz,,single,metatranscriptomic,megahit,,,, diff --git a/workflows/longreadassembler.nf b/workflows/long_reads_assembler.nf similarity index 57% rename from workflows/longreadassembler.nf rename to workflows/long_reads_assembler.nf index 227e62d..0c269ab 100644 --- a/workflows/longreadassembler.nf +++ b/workflows/long_reads_assembler.nf @@ -1,37 +1,3 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PRINT PARAMS SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { validateParameters; paramsSummaryLog; paramsSummaryMap; samplesheetToList } from 'plugin/nf-schema' - -def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) -def citation = '\n' + WorkflowMain.citation(workflow) + '\n' -def summary_params = paramsSummaryMap(workflow) - -// Print parameter summary log to screen -log.info logo + paramsSummaryLog(workflow) + citation - -validateParameters() - -if (params.help) { - log.info paramsHelp("nextflow run ebi-metagenomics/longreadsassembly --help") - exit 0 -} - - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CONFIG FILES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -44,6 +10,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' include { LONG_READS_QC } from '../subworkflows/local/long_reads_qc' + include { ONT_LQ } from '../subworkflows/local/ont_lq' include { ONT_HQ } from '../subworkflows/local/ont_hq' // include { PACBIO_LQ } from '../subworkflows/local/pacbio_lq' @@ -66,79 +33,54 @@ include { ONT_HQ } from '../subworkflows/local/ont_hq' */ // Info required for completion email and summary -def multiqc_report = [] - -workflow LONGREADSASSEMBLY { - ch_versions = Channel.empty() - longReads = Channel.empty() - fetch_tool_metadata = Channel.empty() - - if ( params.samplesheet ) { - - longReads = { study_accession, reads_accession, fq1, library_layout, library_strategy, assembler, assembler_config, assembly_memory -> - return tuple( - [ - "id": reads_accession, - "study_accession": study_accession, - "library_strategy": library_strategy, - "library_layout": library_layout, - "single_end": true, - "assembler": assembler ?: params.assembler, - "assembler_config": assembler_config ?: params.assembler_config, - "assembly_memory": assembly_memory ?: params.assembly_memory - ], - [fq1] - ) - } +workflow LONGREADSASSEMBLER { - samplesheet = Channel.fromList(samplesheetToList(params.samplesheet, "./assets/schema_input.json")) + take: + reads // TODO - fetch_reads_transformed = samplesheet.map(longReads) + main: - } else { - // TODO: remove when the fetch tools gets published on bioconda - fetch_tool_config = file("${projectDir}/assets/fetch_tool_anonymous.json", checkIfExists: true) + LONG_READS_QC ( + reads, + params.reference_genome + ) - if ( params.private_study ) { - fetch_tool_config = file("${projectDir}/assets/fetch_tool_credentials.json", checkIfExists: true) - } + /*********************************************************************************/ + /* Selecting the combination of adapter trimming, assembler, and post-processing */ + /*********************************************************************************/ + /* + The selection process ensures that: + - The user selected assembler configuration is always used (either from the samplesheet assembler column (with precedence) or the params.assembler) + - Low-quality ONT reads are trimmed with canu and assembled with flye --nano-corr/raw), unless specified otherwise. + - High-quality ONT reads are trimmed with porechob_abi and assembled with flye --nano-hq), unless specified otherwise. + - Low-quality pacbio reads are trimmed with canu and assembled with flye --pacbio-corr/raw), unless specified otherwise. + - High-quality pacbio reads are trimmed with HiFiAdapterFilt and assembled with flye --pacbio-hifi), unless specified otherwise. + Extra polishing steps are applied to low-quality reads. All subworkflows also apply post-assembly host decontamination. 
+ */ - FETCHTOOL_READS( - [ [id: params.reads_accession], params.study_accession, params.reads_accession ], - fetch_tool_config - ) - - ch_versions = ch_versions.mix(FETCHTOOL_READS.out.versions) - - // Push the library strategy into the meta of the reads, this is to make it easier to handle downstream - fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout, platform -> { - [ meta + [ - // -- The metadata will be overriden by the parameters -- // - "assembler": params.assembler, - "assembly_memory": params.assembly_memory, - "assembler_config": params.assembler_config, - "library_strategy": params.library_strategy ?: library_strategy, - "library_layout": params.library_layout ?: library_layout, - "single_end": params.single_end ?: library_layout == "single", - "platform": params.platform ?: platform - ], reads ] + reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> + if (meta.platform == "ont") { + if (params.assembler_config == "nano-raw" || meta.quality == "low") { + return [meta + ["assembler_config": "nano-raw"], reads] + } else if (params.assembler_config == "nano-hq" || meta.quality == "high") { + return [meta + ["assembler_config": "nano-hq"], reads] + } + } else if (meta.platform == "pacbio") { + if (params.assembler_config == "pacbio-raw" || meta.quality == "low") { + return [meta + ["assembler_config": "pacbio-raw"], reads] + } else if (params.assembler_config == "pacbio-hifi" || meta.quality == "high") { + return [meta + ["assembler_config": "pacbio-hifi"], reads] } + } else { + error "Incompatible configuration" } - - // Metadata for MultiQC - fetch_tool_metadata = FETCHTOOL_READS.out.metadata_tsv.map { it[1] }.collectFile( - name: 'fetch_tool_mqc.tsv', - newLine: true, - keepHeader: true, - skip: 1 - ) } - LONG_READS_QC ( - fetch_reads_transformed, - params.reference_genome - ) + + ch_versions = Channel.empty() + + ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) /*********************************************************************************/ @@ -151,7 +93,7 @@ workflow LONGREADSASSEMBLY { - High-quality ONT reads are trimmed with porechob_abi and assembled with flye --nano-hq), unless specified otherwise. - Low-quality pacbio reads are trimmed with canu and assembled with flye --pacbio-corr/raw), unless specified otherwise. - High-quality pacbio reads are trimmed with HiFiAdapterFilt and assembled with flye --pacbio-hifi), unless specified otherwise. - Extra polishing steps are applied to low-quality reads. All subworkflows also apply post-assembly host decontamination. + Extra polishing steps are applied to low-quality reads. All subworkflows also apply post-assembly host decontamination. 
*/ reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> @@ -171,7 +113,7 @@ workflow LONGREADSASSEMBLY { error "Incompatible configuration" } } - + reads_assembler_config.branch { meta, reads -> lq_ont: meta.assembler_config == "nano-raw" hq_ont: meta.assembler_config == "pacbio-raw" @@ -200,7 +142,7 @@ workflow LONGREADSASSEMBLY { /*************************************/ /* Post-assembly: coverage and stats */ /*************************************/ - + // // MODULE: Run FastQC // diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 69a80d6..5ef05de 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -23,7 +23,6 @@ if (params.help) { exit 0 } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -35,37 +34,40 @@ ch_multiqc_custom_config = params.multiqc_config ? file( params.multiqc_config ch_multiqc_logo = params.multiqc_logo ? file( params.multiqc_logo, checkIfExists: true ) : file("$projectDir/assets/mgnify_logo.png", checkIfExists: true) ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT LOCAL MODULES/SUBWORKFLOWS + IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// MODULE: Installed directly from nf-core/modules // -include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' -include { READS_QC } from '../subworkflows/local/reads_qc' -include { ASSEMBLY_QC } from '../subworkflows/local/assembly_qc' -include { ASSEMBLY_COVERAGE } from '../subworkflows/local/assembly_coverage' + +include { MULTIQC as MULTIQC_STUDY } from '../modules/nf-core/multiqc/main' +include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS + IMPORT THE MAIN ENTRY POINT WORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // -// MODULE: Installed directly from nf-core/modules +// WORKFLOWS // -include { FASTQC as FASTQC_BEFORE } from '../modules/nf-core/fastqc/main' -include { FASTQC as FASTQC_AFTER } from '../modules/nf-core/fastqc/main' -include { MULTIQC as MULTIQC_STUDY } from '../modules/nf-core/multiqc/main' -include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { SPADES } from '../modules/nf-core/spades/main' -include { MEGAHIT } from '../modules/nf-core/megahit/main' -include { QUAST } from '../modules/nf-core/quast/main' +include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler' +// include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { FETCHTOOL_READS } from 
'../modules/local/fetchtool_reads' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -80,20 +82,21 @@ def multiqc_report = [] workflow MIASSEMBLER { ch_versions = Channel.empty() - fetch_tool_metadata = Channel.empty() if ( params.samplesheet ) { - groupReads = { study_accession, reads_accession, fq1, fq2, library_layout, library_strategy, assembler, assembly_memory -> + groupReads = { study_accession, reads_accession, fq1, fq2, library_layout, library_strategy, platform, assembler, assembly_memory, assembler_config -> if (fq2 == []) { return tuple(["id": reads_accession, "study_accession": study_accession, - "library_strategy": library_strategy, "library_layout": library_layout, + "library_strategy": library_strategy, + "platform": params.platform ?: platform, "single_end": true, "assembler": assembler ?: params.assembler, - "assembly_memory": assembly_memory ?: params.assembly_memory + "assembly_memory": assembly_memory ?: params.assembly_memory, + "assembler_config": params.assembler_config ], [fq1] ) @@ -104,7 +107,9 @@ workflow MIASSEMBLER { "library_layout": library_layout, "single_end": false, "assembler": assembler ?: params.assembler, - "assembly_memory": assembly_memory ?: params.assembly_memory + "assembly_memory": assembly_memory ?: params.assembly_memory, + "assembler_config": params.assembler_config, + "platform": params.platform ?: platform ], [fq1, fq2]) } @@ -112,7 +117,7 @@ workflow MIASSEMBLER { samplesheet = Channel.fromList(samplesheetToList(params.samplesheet, "./assets/schema_input.json")) - // [ study, sample, read1, [read2], library_layout, library_strategy, assembly_memory ] + // [ study, sample, read1, [read2], library_layout, library_strategy, platform, assembly_memory] fetch_reads_transformed = samplesheet.map(groupReads) } else { @@ -131,14 +136,16 @@ workflow MIASSEMBLER { ch_versions = ch_versions.mix(FETCHTOOL_READS.out.versions) // Push the library strategy into the meta of the reads, this is to make it easier to handle downstream - fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout -> { + fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout, platform -> { [ meta + [ // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, + "assembler_config": params.assembler_config, "assembly_memory": params.assembler_memory, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, - "single_end": params.single_end ?: library_layout == "single" + "single_end": params.single_end ?: library_layout == "single", + "platform": params.platform ?: platform ], reads ] } } @@ -152,112 +159,41 @@ workflow MIASSEMBLER { ) } - /***************************/ - /* Selecting the assembler */ - /***************************/ - /* - The selection process ensures that: - - The user selected assembler is always used (either from the samplesheet assembler column (with precedesnse) or the params.assembler) - - Single-end reads are assembled with MEGAHIT, unless specified otherwise. - - Paired-end reads are assembled with MetaSPAdes, unless specified otherwise - - An error is raised if the assembler and read layout are incompatible (shouldn't happen...) 
- */ - fetch_reads_transformed = fetch_reads_transformed.map { meta, reads -> - def selected_assembler = meta.assembler; - if ( selected_assembler == "megahit" || ( meta.single_end && selected_assembler == null ) ) { - return [ meta + [assembler: "megahit", assembler_version: params.megahit_version], reads] - } else if ( ["metaspades", "spades"].contains(selected_assembler) || ( !meta.single_end && selected_assembler == null ) ) { - def xspades_assembler = selected_assembler ?: "metaspades" // Default to "metaspades" if the user didn't select one - return [ meta + [assembler: xspades_assembler, assembler_version: params.spades_version], reads] - } else { - error "Incompatible assembler and/or reads layout. We can't assembly data that is. Reads - single end value: ${meta.single_end}." - } - } - - FASTQC_BEFORE ( - fetch_reads_transformed - ) - - ch_versions = ch_versions.mix(FASTQC_BEFORE.out.versions) - - READS_QC( - fetch_reads_transformed, - params.reference_genome - ) - - FASTQC_AFTER ( - READS_QC.out.qc_reads - ) + /********************************************/ + /* Selecting the assembly pipeline flavour */ + /*******************************************/ - /******************************************/ - /* Reads that fail the following rules: */ - /* - Reads discarded by fastp > 90% (default value) */ - /* - Less than 1k reads */ - /******************************************/ - extended_qc = READS_QC.out.fastp_json.map { meta, json -> { - json_txt = new JsonSlurper().parseText(json.text) - bf_total_reads = json_txt?.summary?.before_filtering?.total_reads ?: 0; - af_total_reads = json_txt?.summary?.after_filtering?.total_reads ?: 0; - reads_qc_meta = [ - "low_reads_count": af_total_reads <= params.low_reads_count_threshold, - "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.filter_ratio_threshold ) - ] - return [meta, reads_qc_meta] + classified_reads = fetch_reads_transformed.map { meta, reads -> + // Long reads // + if ( ["ont", "pacbio"].contains( meta.platform ) ) { + return [ meta + [long_reads: true], reads] + // Short reads // + } else { + return [ meta + [short_reads: true], reads] } } - extended_reads_qc = READS_QC.out.qc_reads.join( extended_qc ) - - extended_reads_qc.branch { meta, reads, reads_qc_meta -> - // Filter out failed reads // - qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.filter_ratio_threshold_exceeded - megahit: meta.assembler == "megahit" - xspades: ["metaspades", "spades"].contains(meta.assembler) - }.set { qc_filtered_reads } + classified_reads.branch { meta, reads -> + short_reads: meta.short_reads + long_reads: meta.long_reads + }.set { reads_to_assemble } - ch_versions = ch_versions.mix(READS_QC.out.versions) + /***************************************/ + /* Assemble short reads and long reads */ + /***************************************/ - /*********************/ - /* Assembly */ - /********************/ - SPADES( - qc_filtered_reads.xspades.map { meta, reads, _ -> [meta, reads, [], []] }, - [], // yml input parameters, which we don't use - [] // hmm, not used + SHORT_READS_ASSEMBLER( + reads_to_assemble.short_reads ) - ch_versions = ch_versions.mix(SPADES.out.versions) + ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) - MEGAHIT( - qc_filtered_reads.megahit.map { meta, reads, _ -> [meta, reads] } - ) + // TODO: enable once this is ready + // LONG_READS_ASSEMBLER( + // reads_to_assemble.out.long_reads + // ) - assembly = SPADES.out.contigs.mix( MEGAHIT.out.contigs ) - - ch_versions = 
ch_versions.mix(MEGAHIT.out.versions) - - // Clean the assembly contigs // - ASSEMBLY_QC( - assembly, - params.reference_genome - ) - - ch_versions = ch_versions.mix(ASSEMBLY_QC.out.versions) - - // Coverage // - ASSEMBLY_COVERAGE( - ASSEMBLY_QC.out.filtered_contigs.join( READS_QC.out.qc_reads, remainder: false ) - ) - - ch_versions = ch_versions.mix(ASSEMBLY_COVERAGE.out.versions) - - // Stats // - /* The QUAST module was modified to run metaQUAST instead */ - QUAST( - ASSEMBLY_QC.out.filtered_contigs, - [ [], [] ], // reference - [ [], [] ] // gff - ) + // ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -307,12 +243,12 @@ workflow MIASSEMBLER { ch_multiqc_study_tools_files = Channel.empty() - ch_multiqc_study_tools_files = FASTQC_BEFORE.out.zip.map(meta_by_study) - .join( FASTQC_AFTER.out.zip.map(meta_by_study) ) - .join( ASSEMBLY_COVERAGE.out.samtools_idxstats.map(meta_by_study), remainder: true ) // the assembly step could fail - .join( QUAST.out.results.map(meta_by_study), remainder: true ) // the assembly step could fail + study_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_study) + .join( SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_study) ) + .join( SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_study), remainder: true ) // the assembly step could fail + .join( SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_study), remainder: true ) // the assembly step could fail - ch_multiqc_study_tools_files = ch_multiqc_study_tools_files.flatMap( combineFiles ).groupTuple() + ch_multiqc_study_tools_files = study_multiqc_files.flatMap( combineFiles ).groupTuple() // TODO: add the fetch tool log file MULTIQC_STUDY ( @@ -331,15 +267,13 @@ workflow MIASSEMBLER { [ meta.subMap("study_accession", "id", "assembler", "assembler_version"), result_artifact ] } - ch_multiqc_run_tools_files = Channel.empty() - - ch_multiqc_run_tools_files = FASTQC_BEFORE.out.zip.map(meta_by_run) - .join( FASTQC_AFTER.out.zip.map(meta_by_run) ) - .join( ASSEMBLY_COVERAGE.out.samtools_idxstats.map(meta_by_run), remainder: true ) // the assembly step could fail - .join( QUAST.out.results.map(meta_by_run), remainder: true ) // the assembly step could fail + run_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_run) + .join( SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_run) ) + .join( SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_run), remainder: true ) // the assembly step could fail + .join( SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_run), remainder: true ) // the assembly step could fail // Filter out the non-assembled runs // - ch_multiqc_run_tools_files = ch_multiqc_run_tools_files.filter { meta, fastqc_before, fastqc_after, assembly_coverage, quast -> { + ch_multiqc_run_tools_files = run_multiqc_files.filter { meta, fastqc_before, fastqc_after, assembly_coverage, quast -> { return assembly_coverage != null && quast != null } } .flatMap( combineFiles ).groupTuple() @@ -357,15 +291,17 @@ workflow MIASSEMBLER { /* End of execution reports */ /****************************/ - // Asssembled runs // - ASSEMBLY_COVERAGE.out.samtools_idxstats.map { + // TODO: we need to add LR end-of-run reports + + // Short reads asssembled runs // + SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map { meta, _ -> { return "${meta.id},${meta.assembler},${meta.assembler_version}" } 
}.collectFile(name: "assembled_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) - // Reads QC failed // - qc_failed_entries = qc_filtered_reads.qc_failed.map { + // Short reads QC failed // + short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed.map { meta, _, extended_meta -> { if ( extended_meta.low_reads_count ) { return "${meta.id},low_reads_count" @@ -377,7 +313,7 @@ workflow MIASSEMBLER { } } - qc_failed_entries.collectFile(name: "qc_failed_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) + short_reads_qc_failed_entries.collectFile(name: "qc_failed_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) } /* diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf new file mode 100644 index 0000000..862652a --- /dev/null +++ b/workflows/short_reads_assembler.nf @@ -0,0 +1,180 @@ +import groovy.json.JsonSlurper + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? file( params.multiqc_config, checkIfExists: true ) : [] +ch_multiqc_logo = params.multiqc_logo ? file( params.multiqc_logo, checkIfExists: true ) : file("$projectDir/assets/mgnify_logo.png", checkIfExists: true) +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// + +include { SHORT_READS_QC } from '../subworkflows/local/short_reads_qc' +include { SHORT_READS_ASSEMBLY_QC } from '../subworkflows/local/short_reads_assembly_qc' +include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_reads_assembly_coverage' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// +include { FASTQC as FASTQC_BEFORE } from '../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_AFTER } from '../modules/nf-core/fastqc/main' +include { SPADES } from '../modules/nf-core/spades/main' +include { MEGAHIT } from '../modules/nf-core/megahit/main' +include { QUAST } from '../modules/nf-core/quast/main' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow SHORT_READS_ASSEMBLER { + + take: + reads // tuple(meta), path(reads) + + main: + + ch_versions = Channel.empty() + + /***************************/ + /* Selecting the assembler */ + /***************************/ + /* + The selection process ensures that: + - The user selected assembler is always used (either from the samplesheet assembler column (with precedesnse) or the params.assembler) + - Single-end reads are assembled with MEGAHIT, 
unless specified otherwise. + - Paired-end reads are assembled with MetaSPAdes, unless specified otherwise + - An error is raised if the assembler and read layout are incompatible (shouldn't happen...) + */ + reads_by_assembler = reads.map { meta, reads -> + def selected_assembler = meta.assembler; + if ( selected_assembler == "megahit" || ( meta.single_end && selected_assembler == null ) ) { + return [ meta + [assembler: "megahit", assembler_version: params.megahit_version], reads] + } else if ( ["metaspades", "spades"].contains(selected_assembler) || ( !meta.single_end && selected_assembler == null ) ) { + def xspades_assembler = selected_assembler ?: "metaspades" // Default to "metaspades" if the user didn't select one + return [ meta + [assembler: xspades_assembler, assembler_version: params.spades_version], reads] + } else { + error "Incompatible assembler and/or reads layout. We can't assembly data that is. Reads - single end value: ${meta.single_end}." + } + } + + FASTQC_BEFORE ( + reads_by_assembler + ) + + ch_versions = ch_versions.mix(FASTQC_BEFORE.out.versions) + + SHORT_READS_QC( + reads_by_assembler, + params.reference_genome + ) + + FASTQC_AFTER ( + SHORT_READS_QC.out.qc_reads + ) + + /******************************************/ + /* Reads that fail the following rules: */ + /* - Reads discarded by fastp > 90% (default value) */ + /* - Less than 1k reads */ + /******************************************/ + extended_qc = SHORT_READS_QC.out.fastp_json.map { meta, json -> { + json_txt = new JsonSlurper().parseText(json.text) + bf_total_reads = json_txt?.summary?.before_filtering?.total_reads ?: 0; + af_total_reads = json_txt?.summary?.after_filtering?.total_reads ?: 0; + reads_qc_meta = [ + "low_reads_count": af_total_reads <= params.low_reads_count_threshold, + "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.filter_ratio_threshold ) + ] + return [meta, reads_qc_meta] + } + } + + extended_reads_qc = SHORT_READS_QC.out.qc_reads.join( extended_qc ) + + extended_reads_qc.branch { meta, reads, reads_qc_meta -> + // Filter out failed reads // + qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.filter_ratio_threshold_exceeded + megahit: meta.assembler == "megahit" + xspades: ["metaspades", "spades"].contains(meta.assembler) + }.set { qc_filtered_reads } + + ch_versions = ch_versions.mix(SHORT_READS_QC.out.versions) + + /*********************/ + /* Assembly */ + /********************/ + SPADES( + qc_filtered_reads.xspades.map { meta, reads, _ -> [meta, reads, [], []] }, + [], // yml input parameters, which we don't use + [] // hmm, not used + ) + + ch_versions = ch_versions.mix(SPADES.out.versions) + + MEGAHIT( + qc_filtered_reads.megahit.map { meta, reads, _ -> [meta, reads] } + ) + + assembly = SPADES.out.contigs.mix( MEGAHIT.out.contigs ) + + ch_versions = ch_versions.mix(MEGAHIT.out.versions) + + // Clean the assembly contigs // + SHORT_READS_ASSEMBLY_QC( + assembly, + params.reference_genome + ) + + ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_QC.out.versions) + + // Coverage // + SHORT_READS_ASSEMBLY_COVERAGE( + SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join( SHORT_READS_QC.out.qc_reads, remainder: false ) + ) + + ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_COVERAGE.out.versions) + + // Stats // + /* The QUAST module was modified to run metaQUAST instead */ + QUAST( + SHORT_READS_ASSEMBLY_QC.out.filtered_contigs, + [ [], [] ], // reference + [ [], [] ] // gff + ) + + ch_versions = 
ch_versions.mix(QUAST.out.versions) + + emit: + fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) + qc_failed = qc_filtered_reads.qc_failed // tuple(meta) + fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) + assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) + quast_results = QUAST.out.results // tuple(meta) + versions = ch_versions +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ From 0ef2e63c97cef217b46d9b3a372e4fcb2901e6d3 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Tue, 3 Sep 2024 16:32:13 +0100 Subject: [PATCH 08/33] Remove debug println statements --- conf/modules.config | 2 -- 1 file changed, 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2668475..c7a3174 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -383,8 +383,6 @@ process { return null; } def output_file = new File(filename); - println ("COSO") - println (meta) return "${study_reads_folder( meta )}/assembly/${meta.assembler}/${meta.assembler_version}/qc/multiqc/${output_file.name}"; } } From 48aa4ee594865e2fbc3aa7d89185f16aa07d8958 Mon Sep 17 00:00:00 2001 From: Ekaterina Sakharova Date: Fri, 6 Sep 2024 13:44:39 +0100 Subject: [PATCH 09/33] add dbs links --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f0809fd..10020a7 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ This pipeline is still in early development. It's mostly a direct port of the mi ## Usage > [!WARNING] -> It only runs in Codon using Slurm ATM. +> It only runs in EBI Codon cluster using Slurm ATM. Pipeline help: @@ -60,6 +60,12 @@ Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. ``` +### Required DBs: +- `--reference_genome`: reference genome in FASTA format +- `--blast_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) +- `--bwamem2_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) + + Example: ```bash From ea24e1f902a24bea33eac951a7d4c5b3cb56263a Mon Sep 17 00:00:00 2001 From: Germana Baldi Date: Sun, 8 Sep 2024 11:46:42 +0100 Subject: [PATCH 10/33] Add db generation instructions to README --- README.md | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 10020a7..997b801 100644 --- a/README.md +++ b/README.md @@ -60,11 +60,6 @@ Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. 
``` -### Required DBs: -- `--reference_genome`: reference genome in FASTA format -- `--blast_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) -- `--bwamem2_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) - Example: ```bash @@ -78,6 +73,33 @@ nextflow run ebi-metagenomics/miassembler \ --reads_accession SRR1631361 ``` +### Required DBs: +- `--reference_genome`: reference genome in FASTA format +- `--blast_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) +- `--bwamem2_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) + +Blast and bwa-mem2 reference databases can be generated for any reference genome to clean input sequences against. + +#### BWA-MEM2 +As explained in [bwa-mem2's README](https://github.com/bwa-mem2/bwa-mem2?tab=readme-ov-file#getting-started): +``` +# Use precompiled binaries (recommended) +curl -L https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 \ + | tar jxf - + +# Index your reference genome with +bwa-mem2-2.2.1_x64-linux/bwa-mem2 index ref.fa +``` + +This will generate multiple index files in a folder. The folder containing them is the one to use as `bwamem2_reference_genomes_folder`. + +#### BLAST +``` +makeblastdb -in ref.fa -dbtype nucl -out ref +``` + +As with bwa-mem2, numerous files will be generated in the same folder, which should be used for `blast_reference_genomes_folder`. + ### Samplesheet The samplesheet is a comma-separated file (.csv) with the following columns: From d63ce268bc2a0698e79de9947ba5f5858ef7d810 Mon Sep 17 00:00:00 2001 From: Sandy Rogers Date: Tue, 17 Sep 2024 11:10:57 +0100 Subject: [PATCH 11/33] Bugfix/assembler memory (#19) --- conf/modules.config | 4 ++++ nextflow.config | 2 ++ nextflow_schema.json | 18 +++++++++++++++++- tests/main.nf.test | 33 ++++++++++++++++++++++++++++++++- tests/samplesheet/test_mem.csv | 2 ++ workflows/miassembler.nf | 2 +- 6 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 tests/samplesheet/test_mem.csv diff --git a/conf/modules.config b/conf/modules.config index 367222c..1e7016d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -109,6 +109,8 @@ process { // TODO: tweak this based on input ( using the biome maybe? ) time = { check_max( 168.h * task.attempt, 'time') } ext.args = params.spades_only_assembler ? "--only-assembler" : "" + errorStrategy = 'retry' + maxRetries = params.max_spades_retries publishDir = [ [ @@ -145,6 +147,8 @@ process { } cpus = { check_max( 12 * task.attempt, 'cpus' ) } time = { check_max( 16.h * task.attempt, 'time' ) } + errorStrategy = 'retry' + maxRetries = params.max_megahit_retries publishDir = [ [ diff --git a/nextflow.config b/nextflow.config index 168873d..47baab4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -89,6 +89,8 @@ params { max_memory = '1.TB' max_cpus = 32 max_time = '168.h' // 7 days + max_spades_retries = 3 + max_megahit_retries = 3 // Assembler versions spades_version = "3.15.5" diff --git a/nextflow_schema.json b/nextflow_schema.json index ebfb512..3619bc5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -125,7 +125,7 @@ "description": "Minimum contig length filter for metaT."
}, "assembly_memory": { - "type": "integer", + "type": "number", "default": 100, "description": "Default memory allocated for the assembly process." }, @@ -208,6 +208,22 @@ "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + }, + "max_spades_retries": { + "type": "integer", + "description": "Maximum number of task attempt retries for (meta)spades assembly steps only.", + "default": 3, + "fa_icon": "fas fa-repeat", + "hidden": true, + "help_text": "Each retry will increase the memory by 50%. Use to limit how many times this increase-and-retry happens." + }, + "max_megahit_retries": { + "type": "integer", + "description": "Maximum number of task attempt retries for megahit assembly steps only.", + "default": 3, + "fa_icon": "fas fa-repeat", + "hidden": true, + "help_text": "Each retry will increase the memory by 50%. Use to limit how many times this increase-and-retry happens." } } }, diff --git a/tests/main.nf.test b/tests/main.nf.test index 06e3213..872a323 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -174,4 +174,35 @@ nextflow_pipeline { } } -} + + test("Samplesheet spades - retries") { + + tag "samplesheet" + tag "retries" + + when { + params { + outdir = "tests/results" + assembler = "spades" + bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" + blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + samplesheet = "${projectDir}/tests/samplesheet/test_mem.csv" + assembly_memory = 0.5 + // will will be [0.5GB, 0.75GB, 1.13GB, ...] which rounds down to [0, 0, 1, ...] so should definitely fail twice before succeeding. after a few trys. 
+ max_spades_retries = 5 + } + } + + then { + with(workflow) { + // eventual success: + assert success + assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 1 + + // but failed and therefore retried multiple times first: + assert trace.failed().count{ task -> task.name.contains("SPADES") } >= 2 + } + } + + } +} \ No newline at end of file diff --git a/tests/samplesheet/test_mem.csv b/tests/samplesheet/test_mem.csv new file mode 100644 index 0000000..bad87ae --- /dev/null +++ b/tests/samplesheet/test_mem.csv @@ -0,0 +1,2 @@ +study_accession,reads_accession,fastq_1,fastq_2,library_layout,library_strategy,assembler,assembly_memory +SRP115494,SRR5949318,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_2.fastq.gz,paired,metagenomic,, diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 69a80d6..508470e 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -135,7 +135,7 @@ workflow MIASSEMBLER { [ meta + [ // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, - "assembly_memory": params.assembler_memory, + "assembly_memory": params.assembly_memory, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, "single_end": params.single_end ?: library_layout == "single" From 1fc455fb370cc9df87a767aa85ec9b026cb1106f Mon Sep 17 00:00:00 2001 From: Ge94 Date: Fri, 25 Oct 2024 17:32:26 +0100 Subject: [PATCH 12/33] LR-SR miassembler merge working --- README.md | 4 ++-- nextflow.config | 2 +- nextflow_schema.json | 2 +- subworkflows/local/long_reads_qc.nf | 15 +++++++++------ subworkflows/local/short_reads_assembly_qc.nf | 2 +- subworkflows/local/short_reads_qc.nf | 2 +- workflows/long_reads_assembler.nf | 18 +++++++++--------- workflows/miassembler.nf | 13 ++++++------- workflows/short_reads_assembler.nf | 4 ++-- 9 files changed, 32 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index f0809fd..4e64e3f 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Input/output options --library_layout [string] Force the library_layout value for the study / reads (accepted: single, paired) --spades_version [string] null [default: 3.15.5] --megahit_version [string] null [default: 1.2.9] - --reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics + --host_reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna, honeybee.fna, rainbow_trout.fna, ...) 
--blast_reference_genomes_folder [string] The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal @@ -66,7 +66,7 @@ Example: nextflow run ebi-metagenomics/miassembler \ -profile codon_slurm \ --assembler metaspades \ - --reference_genome human \ + --host_reference_genome human \ --outdir testing_results \ --study_accession SRP002480 \ --reads_accession SRR1631361 diff --git a/nextflow.config b/nextflow.config index 41c103f..6bab930 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { low_reads_count_threshold = 1000 // Reference genome - reference_genome = null + host_reference_genome = null /* * Long-read assemblies won't require phiX, diff --git a/nextflow_schema.json b/nextflow_schema.json index 541ee4d..5f24c7e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -90,7 +90,7 @@ "type": "string", "default": "1.2.9" }, - "reference_genome": { + "host_reference_genome": { "type": "string", "description": "The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory", "enum": [ diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index 9d2f491..7ff199b 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -1,4 +1,4 @@ -include { FASTP_LR } from '../../modules/nf-core/fastp/main' +include { FASTP as FASTP_LR } from '../../modules/nf-core/fastp/main' include { RAW_READ_QUALITY_CHECK } from '../../modules/local/raw_read_quality_check/' include { MINIMAP2_ALIGN as HUMAN_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as HOST_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' @@ -21,15 +21,18 @@ workflow LONG_READS_QC { false ) - ch_versions = ch_versions.mix(FASTP.out.versions) + ch_versions = ch_versions.mix(FASTP_LR.out.versions) RAW_READ_QUALITY_CHECK( - FASTP.out.json + FASTP_LR.out.json ) decontaminated_reads = channel.empty() if ( params.remove_human ) { + // TODO: make this consistent with short_reads + // can we use the same flag, even if one has phix but not the other? + // Check file extensions too ch_bwamem2_human_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${params.human_blast_index_name}.fna", checkIfExists: true) .collect().map { @@ -39,7 +42,7 @@ workflow LONG_READS_QC { // TODO: can we change the way human/host are given via prefixes? 
HUMAN_DECONTAMINATION( - FASTP.out.reads, + FASTP_LR.out.reads, ch_bwamem2_human_refs, "human", true, @@ -53,12 +56,12 @@ workflow LONG_READS_QC { decontaminated_reads = HUMAN_DECONTAMINATION.out.filtered_fastq } else { - decontaminated_reads = FASTP.out.reads + decontaminated_reads = FASTP_LR.out.reads } if ( host_reference_genome != null ) { - ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${host_reference_genome}", checkIfExists: true) .collect().map { files -> [ ["id": host_reference_genome], files ] } diff --git a/subworkflows/local/short_reads_assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf index fc54f89..563a215 100644 --- a/subworkflows/local/short_reads_assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -22,7 +22,7 @@ workflow SHORT_READS_ASSEMBLY_QC { take: assembly // [ val(meta), path(assembly_fasta) ] - host_reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome + host_reference_genome // [ val(meta2), path(host_reference_genome) ] | meta2 contains the name of the reference genome main: diff --git a/subworkflows/local/short_reads_qc.nf b/subworkflows/local/short_reads_qc.nf index 7dbd198..1be9cfd 100644 --- a/subworkflows/local/short_reads_qc.nf +++ b/subworkflows/local/short_reads_qc.nf @@ -6,7 +6,7 @@ workflow SHORT_READS_QC { take: reads // [ val(meta), path(reads) ] - host_reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome + host_reference_genome // [ val(meta2), path(host_reference_genome) ] | meta2 contains the name of the reference genome main: ch_versions = Channel.empty() diff --git a/workflows/long_reads_assembler.nf b/workflows/long_reads_assembler.nf index 0c269ab..7cb9f13 100644 --- a/workflows/long_reads_assembler.nf +++ b/workflows/long_reads_assembler.nf @@ -34,17 +34,20 @@ include { ONT_HQ } from '../subworkflows/local/ont_hq' // Info required for completion email and summary -workflow LONGREADSASSEMBLER { +workflow LONG_READS_ASSEMBLER { take: - reads // TODO + reads // tuple(meta), path(reads) main: + ch_versions = Channel.empty() + LONG_READS_QC ( reads, - params.reference_genome + params.host_reference_genome ) + ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) /*********************************************************************************/ /* Selecting the combination of adapter trimming, assembler, and post-processing */ @@ -77,12 +80,6 @@ workflow LONGREADSASSEMBLER { } } - - ch_versions = Channel.empty() - - - ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) - /*********************************************************************************/ /* Selecting the combination of adapter trimming, assembler, and post-processing */ /*********************************************************************************/ @@ -177,6 +174,9 @@ workflow LONGREADSASSEMBLER { // ch_multiqc_logo.toList() // ) // multiqc_report = MULTIQC.out.report.toList() + + emit: + versions = ch_versions } /* diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 5ef05de..d357681 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -59,7 +59,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsof // WORKFLOWS // include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler' 
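Both QC subworkflows above locate their reference indexes with `Channel.fromPath("${params.bwamem2_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true)`. A rough standalone equivalent of that lookup, handy for sanity-checking a reference folder before launching a run (the folder path used in `__main__` is a placeholder, not a pipeline default):

```python
#!/usr/bin/env python3
"""Sketch of the reference lookup done by the QC subworkflows above.

Assumption: the reference folder contains files whose names start with the
chosen genome name (e.g. "chicken.fna" plus its index files); the path used
in __main__ is a placeholder only.
"""
from pathlib import Path


def resolve_reference(reference_folder, genome_name):
    """Return every file matching '<genome_name>*', like fromPath(..., checkIfExists: true)."""
    hits = sorted(Path(reference_folder).glob(f"{genome_name}*"))
    if not hits:
        # checkIfExists: true aborts the run in the same situation
        raise FileNotFoundError(f"no files matching '{genome_name}*' in {reference_folder}")
    return hits


if __name__ == "__main__":
    print(resolve_reference("/path/to/reference_genomes", "chicken.fna"))
```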
-// include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' +include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -141,7 +141,7 @@ workflow MIASSEMBLER { // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, "assembler_config": params.assembler_config, - "assembly_memory": params.assembler_memory, + "assembly_memory": params.assembly_memory, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, "single_end": params.single_end ?: library_layout == "single", @@ -188,12 +188,11 @@ workflow MIASSEMBLER { ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) - // TODO: enable once this is ready - // LONG_READS_ASSEMBLER( - // reads_to_assemble.out.long_reads - // ) + LONG_READS_ASSEMBLER( + reads_to_assemble.long_reads + ) - // ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) + ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 862652a..744c4e9 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -85,7 +85,7 @@ workflow SHORT_READS_ASSEMBLER { SHORT_READS_QC( reads_by_assembler, - params.reference_genome + params.host_reference_genome ) FASTQC_AFTER ( @@ -142,7 +142,7 @@ workflow SHORT_READS_ASSEMBLER { // Clean the assembly contigs // SHORT_READS_ASSEMBLY_QC( assembly, - params.reference_genome + params.host_reference_genome ) ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_QC.out.versions) From 8fd885c0734b0f8ab63b44f670ced3bd8aa04645 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Mon, 28 Oct 2024 13:16:34 +0000 Subject: [PATCH 13/33] Updated conflict params and flags after merging --- nextflow.config | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index 12b1476..da8586b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -31,7 +31,7 @@ params { // QC FILTERING - // Short reads options + // Short reads QC filtering options short_reads_filter_ratio_threshold = 0.9 short_reads_low_reads_count_threshold = 1000 @@ -80,7 +80,6 @@ params { * pacbio, and if data quality is high or low) */ assembler = null - assembler_config = null // The pipeline will use the metadata from ENA (obtained by the fetch_tool) // As the metadata can be incorrect, we provide the following parameters to @@ -90,10 +89,6 @@ params { library_strategy = null platform = null - // Reads QC filtering options - filter_ratio_threshold = 0.9 - low_reads_count_threshold = 1000 - // Reference genome host_reference_genome = null From 123ffbc830d4533c994c19753247c13fd5a12a6f Mon Sep 17 00:00:00 2001 From: Ge94 Date: Fri, 1 Nov 2024 16:44:31 +0000 Subject: [PATCH 14/33] Fixed tests, refined variables, harmonised SR+LR --- conf/test.config | 8 ++- nextflow.config | 34 ++----------- nextflow_schema.json | 5 -- subworkflows/local/long_reads_qc.nf | 2 +- tests/human/human.fna | 79 +++++++++++++++++++++++++++++ tests/main.nf.test | 58 ++++++++++++++++----- tests/samplesheet/test.csv | 2 +- workflows/long_reads_assembler.nf | 12 ++--- workflows/miassembler.nf | 14 ++--- workflows/short_reads_assembler.nf | 4 +- 10 files changed, 146 insertions(+), 72 deletions(-) create mode 100644 tests/human/human.fna diff --git 
a/conf/test.config b/conf/test.config index 223443f..cde44ce 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,11 +18,9 @@ profiles { max_memory = '6.GB' max_time = '6.h' - bwamem2_reference_genomes_folder = "tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "tests/human_phix/blast" - human_phix_blast_index_name = "human_phix" - human_phix_bwamem2_index_name = "human_phix" - human_fasta_prefix = "human" + bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" + blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + reference_genomes_folder = "${projectDir}/tests/human/" } } } diff --git a/nextflow.config b/nextflow.config index da8586b..0846ccc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,9 +38,11 @@ params { // Long reads options long_reads_min_read_length = 200 - // Reference genome name (to select from list) + // Short reads reference databases (name to be selected from list) bwamem2_reference_genomes_folder = "" blast_reference_genomes_folder = "" + + // Long reads reference genome host_reference_genome = null // Short-read sequences and assemblies are @@ -52,7 +54,7 @@ params { // Long-read assemblies don't require phiX // nor indices, just a fasta file - reference_genomes_folder = null + reference_genomes_folder = "" remove_human = true human_fasta_prefix = "human" @@ -75,38 +77,12 @@ params { * - Memory >1TB * - Runtime >3-4 days * - * - flye: Use for any long-read assembly. assembler_config + * - flye: Use for any long-read assembly. long_reads_assembler_config * should be selected depending on input data (if ONT or * pacbio, and if data quality is high or low) */ assembler = null - // The pipeline will use the metadata from ENA (obtained by the fetch_tool) - // As the metadata can be incorrect, we provide the following parameters to - // "force" them - single_end = null - library_layout = null - library_strategy = null - platform = null - - // Reference genome - host_reference_genome = null - - /* - * Long-read assemblies won't require phiX, - * parameters should be defined as follows: - * remove_human = true - * human_blast_index_name = "human" - * human_bwamem2_index_name = "human" - * Need to integrate them - */ - remove_human_phix = true - human_phix_blast_index_name = "human_phix" - human_phix_bwamem2_index_name = "human_phix" - - bwamem2_reference_genomes_folder = "" - blast_reference_genomes_folder = "" - // Assembly options spades_only_assembler = true short_reads_min_contig_length = 500 diff --git a/nextflow_schema.json b/nextflow_schema.json index d585895..502976c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -134,11 +134,6 @@ "description": "Combined Human and phiX BLAST db.", "default": "human_phix" }, - "human_blast_index_name": { - "type": "string", - "description": "Human BLAST db.", - "default": "human" - }, "human_phix_bwamem2_index_name": { "type": "string", "description": "Combined Human and phiX bwa-mem2 index.", diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index af7a5c3..cbc4d58 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -36,7 +36,7 @@ workflow LONG_READS_QC { human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.fna", checkIfExists: true) .collect().map { - files -> [ ["id": params.human_blast_index_name], files ] + files -> [ ["id": params.human_fasta_prefix], files ] } // TODO: can we change the way human/host are given via 
prefixes? diff --git a/tests/human/human.fna b/tests/human/human.fna new file mode 100644 index 0000000..a4ebdb7 --- /dev/null +++ b/tests/human/human.fna @@ -0,0 +1,79 @@ +>NC_001422.1 Escherichia phage phiX174, complete genome +GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT +GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA +ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG +TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA +GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC +TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT +TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT +CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT +TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG +TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC +GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA +CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG +TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT +AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC +CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA +TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC +TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA +CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA +GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT +GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA +ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC +TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT +TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC +ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC +CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT +GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC +CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC +TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG +TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT +TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA +AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT +TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT +ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC +GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC +TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT +TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA +TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG +TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC +CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG +AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC +CGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT +TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG +CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA +AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT +GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG +GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA 
+TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT +CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG +TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA +GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC +CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA +TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA +AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC +TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT +CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA +TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG +TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT +CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT +TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC +ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG +TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA +ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG +GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC +CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT +GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG +GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT +ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG +CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC +CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC +GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT +CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG +CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA +TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT +TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG +TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC +AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC +TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA + diff --git a/tests/main.nf.test b/tests/main.nf.test index 06e3213..3f36f99 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -11,8 +11,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + samplesheet = "${projectDir}/tests/samplesheet/test.csv" } } @@ -27,6 +26,37 @@ nextflow_pipeline { } + test("Samplesheet - no assembled - reads filtered ") { + + tag "samplesheet" + + when { + params { + outdir = "tests/results" + assembler = "spades" + + short_reads_low_reads_count_threshold = 1000000 + + samplesheet = "${projectDir}/tests/samplesheet/test.csv" + } + } + + then { + with(workflow) { + assert success + assert trace.succeeded().count{ task -> task.name.contains("FASTQC_BEFORE") } == 3 + assert trace.succeeded().count{ task -> task.name.contains("FASTP") } == 3 + assert trace.succeeded().count{ task -> task.name.contains("HUMAN_PHIX_DECONTAMINATION") } == 3 + assert trace.succeeded().count{ task -> task.name.contains("FASTQC_AFTER") } == 3 + assert trace.succeeded().count{ task -> task.name.contains("CUSTOM_DUMPSOFTWAREVERSIONS") } == 1 + assert trace.succeeded().count{ task -> task.name.contains("MULTIQC_STUDY") } 
== 2 + assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 0 + assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 0 + } + } + + } + test("metaSPAdes - paired end") { tag "ena-portal-api" @@ -35,8 +65,10 @@ nextflow_pipeline { params { outdir = "tests/results" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + + // Force the assembly + short_reads_filter_ratio_threshold = 0.1 + study_accession = "SRP115494" reads_accession = "SRR6180434" } @@ -61,10 +93,12 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "megahit" + + // Force the assembly + short_reads_filter_ratio_threshold = 0.1 + study_accession = "SRP115494" reads_accession = "SRR6180434" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" } } @@ -87,8 +121,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "metaspades" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + study_accession = "ERP012810" reads_accession = "ERR1076564" } @@ -111,8 +144,7 @@ nextflow_pipeline { when { params { outdir = "tests/results" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + study_accession = "ERP012810" reads_accession = "ERR1076564" } @@ -133,8 +165,7 @@ nextflow_pipeline { when { params { outdir = "tests/results" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + study_accession = "DRP007622" reads_accession = "DRR280712" } @@ -159,8 +190,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + study_accession = "DRP007622" reads_accession = "DRR280712" } diff --git a/tests/samplesheet/test.csv b/tests/samplesheet/test.csv index fab7d69..8137806 100644 --- a/tests/samplesheet/test.csv +++ b/tests/samplesheet/test.csv @@ -1,4 +1,4 @@ study_accession,reads_accession,fastq_1,fastq_2,library_layout,library_strategy,platform,assembler,assembly_memory,assembler_config SRP115494,SRR6180434,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR6180434_2.fastq.gz,paired,metagenomic,,,,, SRP115494,SRR5949318,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_1.fastq.gz,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/SRR5949318_2.fastq.gz,paired,metagenomic,,,,, -DRP007622,DRR280712,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/DRR280712.fastq.gz,,single,metatranscriptomic,megahit,,,, +DRP007622,DRR280712,https://github.com/EBI-Metagenomics/miassembler/raw/main/tests/test_reads/DRR280712.fastq.gz,,single,metatranscriptomic,,megahit,,, diff --git a/workflows/long_reads_assembler.nf b/workflows/long_reads_assembler.nf index de4b2d2..0148c01 100644 --- a/workflows/long_reads_assembler.nf +++ b/workflows/long_reads_assembler.nf @@ -64,15 +64,15 @@ workflow LONG_READS_ASSEMBLER { 
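The hunk body that follows maps sequencing platform and detected read quality onto a Flye preset via `reads_assembler_config`. The same decision table written out as a small standalone function, for clarity; the preset names are the four `assembler_config` values used in that hunk, and the "explicit config wins" precedence is a simplified reading of the Groovy conditionals, so treat this as a sketch rather than the canonical behaviour:

```python
#!/usr/bin/env python3
"""Sketch of the platform/quality -> Flye preset selection used below."""


def select_flye_preset(platform, quality, forced_config=None):
    """Mirror reads_assembler_config: an explicit config wins, otherwise quality decides."""
    presets = {
        ("ont", "low"): "nano-raw",
        ("ont", "high"): "nano-hq",
        ("pacbio", "low"): "pacbio-raw",
        ("pacbio", "high"): "pacbio-hifi",
    }
    if forced_config is not None:
        return forced_config
    if (platform, quality) not in presets:
        raise ValueError(f"unexpected platform/quality combination: {platform}/{quality}")
    return presets[(platform, quality)]


if __name__ == "__main__":
    print(select_flye_preset("ont", "high"))    # nano-hq
    print(select_flye_preset("pacbio", "low"))  # pacbio-raw
```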
reads_assembler_config = LONG_READS_QC.out.qc_reads.map { meta, reads -> if (meta.platform == "ont") { - if (params.assembler_config == "nano-raw" || meta.quality == "low") { + if (params.long_reads_assembler_config == "nano-raw" || meta.quality == "low") { return [meta + ["assembler_config": "nano-raw"], reads] - } else if (params.assembler_config == "nano-hq" || meta.quality == "high") { + } else if (params.long_reads_assembler_config == "nano-hq" || meta.quality == "high") { return [meta + ["assembler_config": "nano-hq"], reads] } } else if (meta.platform == "pacbio") { - if (params.assembler_config == "pacbio-raw" || meta.quality == "low") { + if (params.long_reads_assembler_config == "pacbio-raw" || meta.quality == "low") { return [meta + ["assembler_config": "pacbio-raw"], reads] - } else if (params.assembler_config == "pacbio-hifi" || meta.quality == "high") { + } else if (params.long_reads_assembler_config == "pacbio-hifi" || meta.quality == "high") { return [meta + ["assembler_config": "pacbio-hifi"], reads] } } else { @@ -148,10 +148,6 @@ workflow LONG_READS_ASSEMBLER { // ) // ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - // CUSTOM_DUMPSOFTWAREVERSIONS ( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - // ) - // // MODULE: MultiQC // diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index d17ea7c..7f029bf 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -96,7 +96,7 @@ workflow MIASSEMBLER { "single_end": true, "assembler": assembler ?: params.assembler, "assembly_memory": assembly_memory ?: params.assembly_memory, - "assembler_config": params.assembler_config + "assembler_config": params.long_reads_assembler_config ], [fq1] ) @@ -108,7 +108,7 @@ workflow MIASSEMBLER { "single_end": false, "assembler": assembler ?: params.assembler, "assembly_memory": assembly_memory ?: params.assembly_memory, - "assembler_config": params.assembler_config, + "assembler_config": params.long_reads_assembler_config, "platform": params.platform ?: platform ], [fq1, fq2]) @@ -140,7 +140,7 @@ workflow MIASSEMBLER { [ meta + [ // -- The metadata will be overriden by the parameters -- // "assembler": params.assembler, - "assembler_config": params.assembler_config, + "assembler_config": params.long_reads_assembler_config, "assembly_memory": params.assembly_memory, "library_strategy": params.library_strategy ?: library_strategy, "library_layout": params.library_layout ?: library_layout, @@ -186,13 +186,13 @@ workflow MIASSEMBLER { reads_to_assemble.short_reads ) - ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) + ch_versions = ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) LONG_READS_ASSEMBLER( reads_to_assemble.long_reads ) - ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) + ch_versions = ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -305,8 +305,8 @@ workflow MIASSEMBLER { if ( extended_meta.low_reads_count ) { return "${meta.id},low_reads_count" } - if ( extended_meta.short_reads_filter_ratio_threshold_exceeded ) { - return "${meta.id},short_reads_filter_ratio_threshold_exceeded" + if ( extended_meta.filter_ratio_threshold_exceeded ) { + return "${meta.id},filter_ratio_threshold_exceeded" } error "Unexpected. 
meta: ${meta}, extended_meta: ${extended_meta}" } diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 744c4e9..8ab6696 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -102,8 +102,8 @@ workflow SHORT_READS_ASSEMBLER { bf_total_reads = json_txt?.summary?.before_filtering?.total_reads ?: 0; af_total_reads = json_txt?.summary?.after_filtering?.total_reads ?: 0; reads_qc_meta = [ - "low_reads_count": af_total_reads <= params.low_reads_count_threshold, - "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.filter_ratio_threshold ) + "low_reads_count": af_total_reads <= params.short_reads_low_reads_count_threshold, + "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.short_reads_filter_ratio_threshold ) ] return [meta, reads_qc_meta] } From d95f0e5897c880e22ef3d80f36d8edd850d42cf3 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Wed, 6 Nov 2024 12:14:26 +0000 Subject: [PATCH 15/33] Updated reference_genome param --- README.md | 4 ++-- nextflow.config | 2 +- nextflow_schema.json | 2 +- subworkflows/local/long_reads_qc.nf | 8 ++++---- subworkflows/local/short_reads_assembly_qc.nf | 8 ++++---- subworkflows/local/short_reads_qc.nf | 8 ++++---- workflows/long_reads_assembler.nf | 2 +- workflows/short_reads_assembler.nf | 4 ++-- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 64ad609..8414420 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Input/output options --spades_version [string] null [default: 3.15.5] --megahit_version [string] null [default: 1.2.9] --flye_version [string] null [default: 2.9] - --host_reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics + --reference_genome [string] The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna, honeybee.fna, rainbow_trout.fna, ...) 
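The `reads_qc_meta` map above is what later excludes a run from assembly (`low_reads_count`, `filter_ratio_threshold_exceeded`). The same check applied directly to a fastp JSON report as a throwaway script; the 1000-read and 0.1-ratio defaults are the values set elsewhere in this patch series, and the field names follow fastp's `summary.before_filtering` / `summary.after_filtering` sections:

```python
#!/usr/bin/env python3
"""Sketch of the pre-assembly exclusion rules used above."""
import json
import sys


def exclusion_reason(fastp_json_path, low_reads_count_threshold=1000, filter_ratio_threshold=0.1):
    """Return an exclusion message, or None if the run can be assembled."""
    with open(fastp_json_path) as handle:
        summary = json.load(handle).get("summary", {})

    before = summary.get("before_filtering", {}).get("total_reads", 0)
    after = summary.get("after_filtering", {}).get("total_reads", 0)

    if after <= low_reads_count_threshold:
        return "low_reads_count"
    if after == 0 or before == 0 or (after / before) <= filter_ratio_threshold:
        return "filter_ratio_threshold_exceeded"
    return None


if __name__ == "__main__":
    print(exclusion_reason(sys.argv[1]) or "ok")
```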
--blast_reference_genomes_folder [string] The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal @@ -69,7 +69,7 @@ Example: nextflow run ebi-metagenomics/miassembler \ -profile codon_slurm \ --assembler metaspades \ - --host_reference_genome human \ + --reference_genome human \ --outdir testing_results \ --study_accession SRP002480 \ --reads_accession SRR1631361 diff --git a/nextflow.config b/nextflow.config index 0846ccc..5af2985 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,7 +43,7 @@ params { blast_reference_genomes_folder = "" // Long reads reference genome - host_reference_genome = null + reference_genome = null // Short-read sequences and assemblies are // automatically polished from human and phix seqs diff --git a/nextflow_schema.json b/nextflow_schema.json index 502976c..33e7cca 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -86,7 +86,7 @@ "type": "string", "default": "1.2.9" }, - "host_reference_genome": { + "reference_genome": { "type": "string", "description": "The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics internal directory", "enum": [ diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index cbc4d58..45a3ab3 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -7,7 +7,7 @@ workflow LONG_READS_QC { take: reads // [ val(meta), path(reads) ] - host_reference_genome // [ val(meta2), path(reference_genome) ] + reference_genome // [ val(meta2), path(reference_genome) ] main: ch_versions = Channel.empty() @@ -59,11 +59,11 @@ workflow LONG_READS_QC { decontaminated_reads = FASTP_LR.out.reads } - if ( host_reference_genome != null ) { + if ( reference_genome != null ) { - host_reference = Channel.fromPath( "${params.reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + host_reference = Channel.fromPath( "${params.reference_genomes_folder}/${reference_genome}*", checkIfExists: true) .collect().map { - files -> [ ["id": host_reference_genome], files ] + files -> [ ["id": reference_genome], files ] } HOST_DECONTAMINATION( diff --git a/subworkflows/local/short_reads_assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf index d085a04..5e273af 100644 --- a/subworkflows/local/short_reads_assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -22,7 +22,7 @@ workflow SHORT_READS_ASSEMBLY_QC { take: assembly // [ val(meta), path(assembly_fasta) ] - host_reference_genome // [ val(meta2), path(host_reference_genome) ] | meta2 contains the name of the reference genome + reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome main: @@ -60,11 +60,11 @@ workflow SHORT_READS_ASSEMBLY_QC { ch_versions = ch_versions.mix(SEQKIT_GREP_HUMAN_PHIX.out.versions) } - if ( host_reference_genome != null ) { + if ( reference_genome != null ) { - ch_blast_host_refs = Channel.fromPath( "${params.blast_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + ch_blast_host_refs = Channel.fromPath( "${params.blast_reference_genomes_folder}/${reference_genome}*", checkIfExists: true) .collect().map { - files -> [ ["id": host_reference_genome], files ] + files -> [ ["id": reference_genome], files ] } BLAST_BLASTN_HOST( diff --git a/subworkflows/local/short_reads_qc.nf b/subworkflows/local/short_reads_qc.nf index 1be9cfd..5cbe55b 100644 --- a/subworkflows/local/short_reads_qc.nf +++ 
b/subworkflows/local/short_reads_qc.nf @@ -6,7 +6,7 @@ workflow SHORT_READS_QC { take: reads // [ val(meta), path(reads) ] - host_reference_genome // [ val(meta2), path(host_reference_genome) ] | meta2 contains the name of the reference genome + reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome main: ch_versions = Channel.empty() @@ -44,11 +44,11 @@ workflow SHORT_READS_QC { decontaminated_reads = FASTP.out.reads } - if ( host_reference_genome != null ) { + if ( reference_genome != null ) { - ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${host_reference_genome}*", checkIfExists: true) + ch_bwamem2_host_refs = Channel.fromPath( "${params.bwamem2_reference_genomes_folder}/${reference_genome}*", checkIfExists: true) .collect().map { - files -> [ ["id": host_reference_genome], files ] + files -> [ ["id": reference_genome], files ] } HOST_DECONTAMINATION( diff --git a/workflows/long_reads_assembler.nf b/workflows/long_reads_assembler.nf index 0148c01..092c850 100644 --- a/workflows/long_reads_assembler.nf +++ b/workflows/long_reads_assembler.nf @@ -45,7 +45,7 @@ workflow LONG_READS_ASSEMBLER { LONG_READS_QC ( reads, - params.host_reference_genome + params.reference_genome ) ch_versions = ch_versions.mix(LONG_READS_QC.out.versions) diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 8ab6696..5b30d1c 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -85,7 +85,7 @@ workflow SHORT_READS_ASSEMBLER { SHORT_READS_QC( reads_by_assembler, - params.host_reference_genome + params.reference_genome ) FASTQC_AFTER ( @@ -142,7 +142,7 @@ workflow SHORT_READS_ASSEMBLER { // Clean the assembly contigs // SHORT_READS_ASSEMBLY_QC( assembly, - params.host_reference_genome + params.reference_genome ) ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_QC.out.versions) From 3cc1d79e13d9e61d9eeb5fd6c9034e7839bdae84 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 6 Nov 2024 15:24:45 +0000 Subject: [PATCH 16/33] Remove puthi.conf file that was pushed by accident. --- conf/puthi.config | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 conf/puthi.config diff --git a/conf/puthi.config b/conf/puthi.config deleted file mode 100644 index a5c1e69..0000000 --- a/conf/puthi.config +++ /dev/null @@ -1,30 +0,0 @@ -params { - bwamem2_reference_genomes_folder = "/projappl/project_2010686/ebi/reference_dbs/bwamem2" - blast_reference_genomes_folder = "/projappl/project_2010686/ebi/reference_dbs/blast" - human_phix_blast_index_name = "human_phix" - human_phix_bwamem2_index_name = "human_phix" -} - -executor { - name = "slurm" - queueSize = 200 - queueGlobalStatus = true - submitRateLimit = "10 sec" - pollInterval = "10 sec" -} - -conda.enabled = false - -// If true, on a successful completion of a run all files in work directory are automatically deleted. -cleanup = true - -singularity { - enabled = true - autoMounts = true - cacheDir = "/projappl/project_2010686/ebi/singularity_cache" -} - -conda.enabled = false - -// If true, on a successful completion of a run all files in work directory are automatically deleted. 
-cleanup = true From 2379cb758202c36d72e2f5a70c6521b0ac357860 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 7 Nov 2024 12:50:30 +0000 Subject: [PATCH 17/33] Update raw_reads qc script with json parser --- modules/local/raw_read_quality_check.nf | 24 ------------------------ subworkflows/local/long_reads_qc.nf | 16 +++++++++++++++- 2 files changed, 15 insertions(+), 25 deletions(-) delete mode 100644 modules/local/raw_read_quality_check.nf diff --git a/modules/local/raw_read_quality_check.nf b/modules/local/raw_read_quality_check.nf deleted file mode 100644 index 01ea6f2..0000000 --- a/modules/local/raw_read_quality_check.nf +++ /dev/null @@ -1,24 +0,0 @@ -process RAW_READ_QUALITY_CHECK { - tag "$reads_accession" - label 'process_single' - - input: - tuple val(meta), path(fastp_json) - - output: - env(quality) , emit: quality - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - quality=\$(check_raw_quality.py -j ${fastp_json}) - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index 45a3ab3..431b340 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -1,5 +1,4 @@ include { FASTP as FASTP_LR } from '../../modules/nf-core/fastp/main' -include { RAW_READ_QUALITY_CHECK } from '../../modules/local/raw_read_quality_check/' include { MINIMAP2_ALIGN as HUMAN_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as HOST_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' @@ -23,6 +22,21 @@ workflow LONG_READS_QC { ch_versions = ch_versions.mix(FASTP_LR.out.versions) + quality_levels_ch = FASTP_LR.out.json.map { meta, json -> { + json_txt = new JsonSlurper().parseText(json.text) + q20bases = json_txt?.summary?.before_filtering?.q20_bases ?: 0; + total_bases = json_txt?.summary?.before_filtering?.total_bases ?: 0; + + q20_percentage = q20_bases / total_bases * 100 + + quality = [ + "high_quality": q20_percentage >= 80, + "low_quality": q20_percentage < 80, + ] + return [meta, quality] + } + } + RAW_READ_QUALITY_CHECK( FASTP_LR.out.json ) From d43a86203a0b3460a98d2c5e539b436081df05c5 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 7 Nov 2024 14:22:09 +0000 Subject: [PATCH 18/33] Comment and update --platform --- README.md | 1 + modules/local/fetchtool_reads.nf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8414420..1135e75 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Input/output options --library_strategy [string] Force the library_strategy value for the study / reads (accepted: metagenomic, metatranscriptomic, genomic, transcriptomic, other) --library_layout [string] Force the library_layout value for the study / reads (accepted: single, paired) + --platform [string] Force the sequencing_platform value for the study / reads --spades_version [string] null [default: 3.15.5] --megahit_version [string] null [default: 1.2.9] --flye_version [string] null [default: 2.9] diff --git a/modules/local/fetchtool_reads.nf b/modules/local/fetchtool_reads.nf index e62484a..05baecb 100644 --- a/modules/local/fetchtool_reads.nf +++ b/modules/local/fetchtool_reads.nf @@ -38,7 +38,7 @@ process FETCHTOOL_READS { elif [[ \$metadata_platform == "pacbio rs" || \$metadata_platform == "pacbio rs ii" ]]; then platform="pacbio" else - 
platform="short" + platform=\$metadata_platform fi cat <<-END_VERSIONS > versions.yml From c2a2bb614a359a10ada67b210843bac7579a6e67 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Thu, 7 Nov 2024 15:22:27 +0000 Subject: [PATCH 19/33] Minor refinements and documentation --- subworkflows/local/long_reads_qc.nf | 56 ++++++++++++----------------- subworkflows/local/ont_hq.nf | 2 +- subworkflows/local/ont_lq.nf | 2 +- workflows/long_reads_assembler.nf | 4 +-- 4 files changed, 27 insertions(+), 37 deletions(-) diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index 431b340..f3dc9ac 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -1,11 +1,11 @@ -include { FASTP as FASTP_LR } from '../../modules/nf-core/fastp/main' -include { MINIMAP2_ALIGN as HUMAN_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' -include { MINIMAP2_ALIGN as HOST_DECONTAMINATION } from '../../modules/nf-core/minimap2/align/main' +include { FASTP as FASTP_LR } from '../../modules/nf-core/fastp/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_HUMAN } from '../../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_HOST } from '../../modules/nf-core/minimap2/align/main' workflow LONG_READS_QC { take: - reads // [ val(meta), path(reads) ] + reads // [ val(meta), path(reads) ] reference_genome // [ val(meta2), path(reference_genome) ] main: @@ -13,11 +13,11 @@ workflow LONG_READS_QC { FASTP_LR( reads, - [], - false, - false, - false, - false + [], // no input adapters + false, // keep passing reads in the output + false, // omit trimmed reads in the output + false, // don't merge all reads in the output + false // don't trim for polyA ) ch_versions = ch_versions.mix(FASTP_LR.out.versions) @@ -37,9 +37,7 @@ workflow LONG_READS_QC { } } - RAW_READ_QUALITY_CHECK( - FASTP_LR.out.json - ) + // TODO: add filter if too many reads are removed decontaminated_reads = channel.empty() @@ -55,14 +53,14 @@ workflow LONG_READS_QC { // TODO: can we change the way human/host are given via prefixes? 
- HUMAN_DECONTAMINATION( + MINIMAP2_ALIGN_HUMAN( FASTP_LR.out.reads, human_reference, "human", - true, - "bai", - false, - true + true, // output bam format + "bai", // bam index extension + false, // no CIGAR in paf format + true // allow for long CIGAR ) ch_versions = ch_versions.mix(HUMAN_DECONTAMINATION.out.versions) @@ -80,30 +78,22 @@ workflow LONG_READS_QC { files -> [ ["id": reference_genome], files ] } - HOST_DECONTAMINATION( + MINIMAP2_ALIGN_HOST( decontaminated_reads, host_reference, "host", - true, - "bai", - false, - true + true, // output bam format + "bai", // bam index extension + false, // no CIGAR in paf format + true // allow for long CIGAR ) - ch_versions = ch_versions.mix(HOST_DECONTAMINATION.out.versions) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_HOST.out.versions) - decontaminated_reads = HOST_DECONTAMINATION.out.filtered_fastq + decontaminated_reads = MINIMAP2_ALIGN_HOST.out.filtered_fastq } - final_reads = decontaminated_reads - .map{ meta, reads -> { - [ meta + [ - "quality": RAW_READ_QUALITY_CHECK.out.quality.val - ], reads ] - } - } - emit: - qc_reads = final_reads + qc_reads = decontaminated_reads versions = ch_versions } diff --git a/subworkflows/local/ont_hq.nf b/subworkflows/local/ont_hq.nf index 7255d24..aa21574 100644 --- a/subworkflows/local/ont_hq.nf +++ b/subworkflows/local/ont_hq.nf @@ -1,4 +1,4 @@ -include { PORECHOP_ABI as PORECHOP_ONT } from '../../modules/nf-core/porechop/abi/main' +include { PORECHOP_ABI } from '../../modules/nf-core/porechop/abi/main' workflow ONT_HQ { take: diff --git a/subworkflows/local/ont_lq.nf b/subworkflows/local/ont_lq.nf index 6538c14..b37b063 100644 --- a/subworkflows/local/ont_lq.nf +++ b/subworkflows/local/ont_lq.nf @@ -1,4 +1,4 @@ -include { CANU as CANU_ONT } from '../../modules/nf-core/canu/main' +include { CANU } from '../../modules/nf-core/canu/main' workflow ONT_LQ { take: diff --git a/workflows/long_reads_assembler.nf b/workflows/long_reads_assembler.nf index 092c850..f796f54 100644 --- a/workflows/long_reads_assembler.nf +++ b/workflows/long_reads_assembler.nf @@ -13,8 +13,8 @@ include { LONG_READS_QC } from '../subworkflows/local/long_reads_qc' include { ONT_LQ } from '../subworkflows/local/ont_lq' include { ONT_HQ } from '../subworkflows/local/ont_hq' -// include { PACBIO_LQ } from '../subworkflows/local/pacbio_lq' -// include { PACBIO_HIFI } from '../subworkflows/local/pacbio_hifi' +include { PACBIO_LQ } from '../subworkflows/local/pacbio_lq' +include { PACBIO_HIFI } from '../subworkflows/local/pacbio_hifi' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From aa18aa0409c468f292f77f85e4ead20a0e8182d5 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Wed, 13 Nov 2024 16:31:28 +0000 Subject: [PATCH 20/33] Fixed tests and configs --- bin/check_raw_quality.py | 22 ---------------------- conf/test.config | 8 ++++++++ modules/local/fetchtool_reads.nf | 2 +- subworkflows/local/long_reads_qc.nf | 6 ++++-- subworkflows/local/ont_hq.nf | 6 +++--- subworkflows/local/ont_lq.nf | 2 +- subworkflows/local/pacbio_lq.nf | 2 +- workflows/short_reads_assembler.nf | 11 +++-------- 8 files changed, 21 insertions(+), 38 deletions(-) delete mode 100755 bin/check_raw_quality.py diff --git a/bin/check_raw_quality.py b/bin/check_raw_quality.py deleted file mode 100755 index 9a9dc5b..0000000 --- a/bin/check_raw_quality.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 - -import json -import argparse - -parser = argparse.ArgumentParser(description="Evaluate run quality from fastp 
output") -parser.add_argument('--json','-j',help='Fastp json output',required=True) - -argv = parser.parse_args() - -fastp_out = argv.json -data = json.load(open(fastp_out)) - -q20_bases = float(data['read1_before_filtering']['q20_bases']) -total_bases = float(data['read1_before_filtering']['total_bases']) -q20_percentage = q20_bases/total_bases*100 - -quality = "low" -if q20_percentage >= 80: - quality = "high" - -print(quality) \ No newline at end of file diff --git a/conf/test.config b/conf/test.config index cde44ce..2e734c2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,6 +21,14 @@ profiles { bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" reference_genomes_folder = "${projectDir}/tests/human/" + + max_spades_retries = -1 + max_megahit_retries = -1 + } + + process { + errorStrategy = 'ignore' + maxRetries = 0 } } } diff --git a/modules/local/fetchtool_reads.nf b/modules/local/fetchtool_reads.nf index 05baecb..3dbeae1 100644 --- a/modules/local/fetchtool_reads.nf +++ b/modules/local/fetchtool_reads.nf @@ -38,7 +38,7 @@ process FETCHTOOL_READS { elif [[ \$metadata_platform == "pacbio rs" || \$metadata_platform == "pacbio rs ii" ]]; then platform="pacbio" else - platform=\$metadata_platform + platform="\$metadata_platform" fi cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index f3dc9ac..da0b059 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -1,3 +1,5 @@ +import groovy.json.JsonSlurper + include { FASTP as FASTP_LR } from '../../modules/nf-core/fastp/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_HUMAN } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_HOST } from '../../modules/nf-core/minimap2/align/main' @@ -63,9 +65,9 @@ workflow LONG_READS_QC { true // allow for long CIGAR ) - ch_versions = ch_versions.mix(HUMAN_DECONTAMINATION.out.versions) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_HUMAN.out.versions) - decontaminated_reads = HUMAN_DECONTAMINATION.out.filtered_fastq + decontaminated_reads = MINIMAP2_ALIGN_HUMAN.out.filtered_fastq } else { decontaminated_reads = FASTP_LR.out.reads diff --git a/subworkflows/local/ont_hq.nf b/subworkflows/local/ont_hq.nf index aa21574..4537d46 100644 --- a/subworkflows/local/ont_hq.nf +++ b/subworkflows/local/ont_hq.nf @@ -5,12 +5,12 @@ workflow ONT_HQ { reads // [ val(meta), path(reads) ] main: - PORECHOP_ONT( + PORECHOP_ABI( reads ) - PORECHOP_ONT.out.reads.view() + PORECHOP_ABI.out.reads.view() // temporary just to test the module emit: - contigs = PORECHOP_ONT.out.reads + contigs = PORECHOP_ABI.out.reads } diff --git a/subworkflows/local/ont_lq.nf b/subworkflows/local/ont_lq.nf index b37b063..d53db8c 100644 --- a/subworkflows/local/ont_lq.nf +++ b/subworkflows/local/ont_lq.nf @@ -1,4 +1,4 @@ -include { CANU } from '../../modules/nf-core/canu/main' +include { CANU as CANU_ONT } from '../../modules/nf-core/canu/main' workflow ONT_LQ { take: diff --git a/subworkflows/local/pacbio_lq.nf b/subworkflows/local/pacbio_lq.nf index df49b01..0db719d 100644 --- a/subworkflows/local/pacbio_lq.nf +++ b/subworkflows/local/pacbio_lq.nf @@ -1,4 +1,4 @@ -include { CANU as CANU_PACBIO } from '../../modules/nf-core/canu/main' +include { CANU as CANU_PACBIO } from '../../modules/nf-core/canu/main' workflow PACBIO_LQ { take: diff --git a/workflows/short_reads_assembler.nf 
b/workflows/short_reads_assembler.nf index 5b30d1c..f159e84 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -80,13 +80,13 @@ workflow SHORT_READS_ASSEMBLER { FASTQC_BEFORE ( reads_by_assembler ) - ch_versions = ch_versions.mix(FASTQC_BEFORE.out.versions) SHORT_READS_QC( reads_by_assembler, params.reference_genome ) + ch_versions = ch_versions.mix(SHORT_READS_QC.out.versions) FASTQC_AFTER ( SHORT_READS_QC.out.qc_reads @@ -118,8 +118,6 @@ workflow SHORT_READS_ASSEMBLER { xspades: ["metaspades", "spades"].contains(meta.assembler) }.set { qc_filtered_reads } - ch_versions = ch_versions.mix(SHORT_READS_QC.out.versions) - /*********************/ /* Assembly */ /********************/ @@ -128,23 +126,20 @@ workflow SHORT_READS_ASSEMBLER { [], // yml input parameters, which we don't use [] // hmm, not used ) - ch_versions = ch_versions.mix(SPADES.out.versions) MEGAHIT( qc_filtered_reads.megahit.map { meta, reads, _ -> [meta, reads] } ) - - assembly = SPADES.out.contigs.mix( MEGAHIT.out.contigs ) - ch_versions = ch_versions.mix(MEGAHIT.out.versions) + + assembly = SPADES.out.contigs.mix( MEGAHIT.out.contigs ) // Clean the assembly contigs // SHORT_READS_ASSEMBLY_QC( assembly, params.reference_genome ) - ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_QC.out.versions) // Coverage // From 9a3cf9f6230db9a453f06d72c8cb9fba9fe8bbc4 Mon Sep 17 00:00:00 2001 From: Jennifer Mattock Date: Wed, 20 Nov 2024 15:05:08 +0000 Subject: [PATCH 21/33] amended filter ratio threshold to 10% --- README.md | 2 +- nextflow.config | 2 +- workflows/short_reads_assembler.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2d15f73..037c9eb 100644 --- a/README.md +++ b/README.md @@ -221,7 +221,7 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded | Exclusion Message | Description | | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. | +| `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. | | `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. 
| #### Assembled Runs diff --git a/nextflow.config b/nextflow.config index e89c6d5..52f1cd3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,7 +32,7 @@ params { // QC FILTERING // Short reads QC filtering options - short_reads_filter_ratio_threshold = 0.9 + short_reads_filter_ratio_threshold = 0.1 short_reads_low_reads_count_threshold = 1000 // Long reads options diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index f159e84..e38a676 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -94,7 +94,7 @@ workflow SHORT_READS_ASSEMBLER { /******************************************/ /* Reads that fail the following rules: */ - /* - Reads discarded by fastp > 90% (default value) */ + /* - Reads kept by fastp < 10% (default value) */ /* - Less than 1k reads */ /******************************************/ extended_qc = SHORT_READS_QC.out.fastp_json.map { meta, json -> { From 8aa06e74108adaad241fffb9dfe1e5dadba12ef8 Mon Sep 17 00:00:00 2001 From: jmattock5 <80533767+jmattock5@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:19:11 +0000 Subject: [PATCH 22/33] Update nextflow_schema.json Changed ratio threshold default to 0.1 and amended description --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index da9c14b..f3e69d7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -194,8 +194,8 @@ "properties": { "short_reads_filter_ratio_threshold": { "type": "number", - "description": "The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled.", - "default": 0.9, + "description": "The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled.", + "default": 0.1, "minimum": 0.0, "maximum": 1.0 }, From 703c0c720115192664dcfd4286bae907b10e1e5b Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 20 Nov 2024 17:29:03 +0000 Subject: [PATCH 23/33] WIP - EBI FIRE S3 module for embargoed data --- bin/s3fire_downloader.py | 141 ++++++++++++++++++++++++++++ modules/local/download_from_fire.nf | 49 ++++++++++ workflows/miassembler.nf | 24 ++++- 3 files changed, 212 insertions(+), 2 deletions(-) create mode 100755 bin/s3fire_downloader.py create mode 100644 modules/local/download_from_fire.nf diff --git a/bin/s3fire_downloader.py b/bin/s3fire_downloader.py new file mode 100755 index 0000000..09a7282 --- /dev/null +++ b/bin/s3fire_downloader.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +import argparse +import logging +from typing import Optional, Tuple, List +import os + +import boto3 +from botocore import UNSIGNED +from botocore.config import Config + + +FIRE_ENDPOINT: str = "https://hl.fire.sdo.ebi.ac.uk" +PUBLIC_BUCKET: str = "era-public" +PRIVATE_BUCKET: str = "era-private" + + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def transform_ftp_to_s3(ftp_path: str) -> Tuple[str, str]: + """ + Transforms an FTP path to a FIRE S3 object key, it also returns if it's public or private. 
+ + :param ftp_path: The FTP path of the file to be transformed. + :type ftp_path: str + :return: A tuple containing the S3 object key and the corresponding bucket name. + :rtype: Tuple[str, str] + :raises ValueError: If the FTP path does not match the expected format. + """ + if ftp_path.startswith("ftp.sra.ebi.ac.uk/vol1/"): + s3_key = ftp_path.replace("ftp.sra.ebi.ac.uk/vol1/", "") + logger.info(f"Detected a public file for FTP path: {ftp_path}") + return s3_key, PUBLIC_BUCKET + elif ftp_path.startswith("ftp.dcc-private.ebi.ac.uk/vol1/"): + s3_key = ftp_path.replace("ftp.dcc-private.ebi.ac.uk/vol1/", "") + logger.info(f"Detected a private file for FTP path: {ftp_path}") + return s3_key, PRIVATE_BUCKET + else: + raise ValueError( + f"Invalid FTP path: {ftp_path}. Must start with 'ftp.sra.ebi.ac.uk/vol1/' or 'ftp.dcc-private.ebi.ac.uk/vol1/'." + ) + + +def download_file_from_fire( + s3_key: str, bucket: str, outdir: str, access_key: Optional[str] = None, secret_key: Optional[str] = None +) -> None: + """ + Downloads an individual file from FIRE S3 using its object key. + + :param s3_key: The S3 object key of the file to download. + :type s3_key: str + :param bucket: The name of the S3 bucket. + :type bucket: str + :param outdir: The local directory to save the downloaded file. + :type outdir: str + :param access_key: The access key for private S3 buckets (optional for public files). + :type access_key: Optional[str] + :param secret_key: The secret key for private S3 buckets (optional for public files). + :type secret_key: Optional[str] + :return: None + :rtype: None + :raises ValueError: If credentials are missing for private files. + :raises Exception: For other download errors. + """ + s3_args = {"endpoint_url": FIRE_ENDPOINT} + if bucket == PRIVATE_BUCKET: + if not access_key or not secret_key: + logger.error("Missing credentials for private files.") + raise ValueError("Access key and secret key are required for private files.") + s3_args.update( + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + ) + else: + # Public bucket configuration with unsigned requests + s3_args.update({"config": Config(signature_version=UNSIGNED)}) + + s3 = boto3.client("s3", **s3_args) + + os.makedirs(outdir, exist_ok=True) + local_file_path = os.path.join(outdir, os.path.basename(s3_key)) + + try: + logger.info(f"Downloading {s3_key} from S3 bucket {bucket} to {local_file_path}...") + s3.download_file(bucket, s3_key, local_file_path) + logger.info(f"File successfully downloaded to: {local_file_path}") + except Exception as e: + logger.error(f"Error downloading file from S3: {e}") + raise + + +def download_files(ftp_paths: List[str], outdir: str, access_key: Optional[str], secret_key: Optional[str]) -> None: + """ + Downloads multiple files from their FTP paths. + + :param ftp_paths: List of FTP paths to download. + :type ftp_paths: List[str] + :param outdir: Directory to save the downloaded files. + :type outdir: str + :param access_key: Access key for private files. + :type access_key: Optional[str] + :param secret_key: Secret key for private files. 
+ :type secret_key: Optional[str] + """ + for ftp_path in ftp_paths: + try: + s3_key, bucket = transform_ftp_to_s3(ftp_path) + download_file_from_fire(s3_key, bucket, outdir, access_key, secret_key) + except ValueError as ve: + logger.error(f"Skipping download due to error: {ve}") + except Exception as e: + logger.error(f"Unexpected error while downloading {ftp_path}: {e}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Download multiple files from FTP paths via FIRE S3 (supports public and private files)." + ) + parser.add_argument( + "--ftp_paths", + nargs="+", + required=True, + help="Space-separated list of FTP paths to download (e.g., ftp.sra.ebi.ac.uk/vol1/.../file1 ftp.sra.ebi.ac.uk/vol1/.../file2).", + ) + parser.add_argument("--outdir", required=True, help="Local destination directory for the downloaded files.") + parser.add_argument("--access-key", required=False, help="S3 access key (required for private files).") + parser.add_argument("--secret-key", required=False, help="S3 secret key (required for private files).") + args = parser.parse_args() + + try: + logger.info("Starting the file download process...") + download_files(args.ftp_paths, args.outdir, args.access_key, args.secret_key) + logger.info("All files have been processed.") + except Exception as e: + logger.error(f"Unexpected error: {e}") + + +if __name__ == "__main__": + main() diff --git a/modules/local/download_from_fire.nf b/modules/local/download_from_fire.nf new file mode 100644 index 0000000..a6b81e9 --- /dev/null +++ b/modules/local/download_from_fire.nf @@ -0,0 +1,49 @@ +process DOWNLOAD_FROM_FIRE { + + secret 'FIRE_ACCESS_KEY' + secret 'FIRE_SECRET_KEY' + + tag "${meta.id}" + + label 'process_single' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'oras://community.wave.seqera.io/library/boto3:1.35.37--a82b4d378d332259' : + 'community.wave.seqera.io/library/pip_boto3:501beb4bd409b3e1' }" + + input: + tuple val(meta), val(input_reads) + + output: + tuple val(meta), path("fastq_files/*fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + script: + """ + s3fire_downloader.py \\ + --access-key \${FIRE_ACCESS_KEY} \\ + --secret-key \${FIRE_SECRET_KEY} \\ + --ftp_paths ${input_reads.join(" ")} \\ + --outdir fastq_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + boto: \$(python -c "import boto3; print(boto3.__version__)") + END_VERSIONS + """ + + stub: + """ + mkdir -p fastq_files + touch fastq_files/${meta.id}_1.fastq + touch fastq_files/${meta.id}_2.fastq + gzip fastq_files/* + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + boto: \$(python -c "import boto3; print(boto3.__version__)") + END_VERSIONS + """ +} diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 7f029bf..eb8a7dd 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -68,6 +68,7 @@ include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' */ include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' +include { DOWNLOAD_FROM_FIRE } from '../modules/local/download_from_fire.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -173,7 +174,7 @@ workflow MIASSEMBLER { } } - classified_reads.branch { meta, reads -> + classified_reads.branch { meta, _reads -> short_reads: meta.short_reads long_reads: meta.long_reads }.set { reads_to_assemble } @@ -182,8 +183,27 @@ workflow MIASSEMBLER { /* Assemble short reads and long reads */ /***************************************/ + def short_reads_to_assemble = channel.empty() + + // If running for a private study on EBI infrastructure // + if ( params.private_study ) { + /* + * For private studies we need to bypass Nextflow S3 integration until https://github.com/nextflow-io/nextflow/issues/4873 is fixed + * The EBI parameter is needed as this only works on EBI network, FIRE is not accessible otherwise + */ + DOWNLOAD_FROM_FIRE( + reads_to_assemble.short_reads + ) + + short_reads_to_assemble = DOWNLOAD_FROM_FIRE.out.reads + + } else { + // Carry on + short_reads_to_assemble = reads_to_assemble.short_reads + } + SHORT_READS_ASSEMBLER( - reads_to_assemble.short_reads + short_reads_to_assemble ) ch_versions = ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) From 2e51c9202131678bf37a0f51d44e633aea2b9f32 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 20 Nov 2024 23:20:17 +0000 Subject: [PATCH 24/33] Fire - private study - test case works on my laptop :) --- .nf-core.yml | 2 +- README.md | 17 +- modules.json | 7 +- modules/local/download_from_fire.nf | 4 +- modules/nf-core/fastqc/environment.yml | 2 - modules/nf-core/fastqc/fastqc.diff | 27 -- modules/nf-core/fastqc/main.nf | 18 +- modules/nf-core/fastqc/meta.yml | 58 +-- modules/nf-core/fastqc/tests/main.nf.test | 368 +++++++++++++---- .../nf-core/fastqc/tests/main.nf.test.snap | 386 +++++++++++++++++- modules/nf-core/multiqc/environment.yml | 4 +- modules/nf-core/multiqc/main.nf | 20 +- modules/nf-core/multiqc/meta.yml | 78 ++-- modules/nf-core/multiqc/tests/main.nf.test | 8 + .../nf-core/multiqc/tests/main.nf.test.snap | 24 +- modules/nf-core/multiqc/tests/nextflow.config | 5 + nextflow_schema.json | 8 +- 
workflows/miassembler.nf | 331 ++++++++------- workflows/short_reads_assembler.nf | 141 +++---- 19 files changed, 1063 insertions(+), 445 deletions(-) delete mode 100644 modules/nf-core/fastqc/fastqc.diff create mode 100644 modules/nf-core/multiqc/tests/nextflow.config diff --git a/.nf-core.yml b/.nf-core.yml index b48640b..4db2825 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -38,7 +38,7 @@ lint: - .gitignore multiqc_config: - report_comment - nextflow_config: False + nextflow_config: - params.input - params.validationSchemaIgnoreParams - params.custom_config_version diff --git a/README.md b/README.md index 2d15f73..812293b 100644 --- a/README.md +++ b/README.md @@ -28,14 +28,14 @@ Typical pipeline command: Input/output options --study_accession [string] The ENA Study secondary accession --reads_accession [string] The ENA Run primary accession - --private_study [boolean] To use if the ENA study is private + --private_study [boolean] To use if the ENA study is private, *this feature only works on EBI infrastructure at the moment* --samplesheet [string] Path to comma-separated file containing information about the raw reads with the prefix to be used. --assembler [string] The short reads assembler (accepted: spades, metaspades, megahit) --single_end [boolean] Force the single_end value for the study / reads --library_strategy [string] Force the library_strategy value for the study / reads (accepted: metagenomic, metatranscriptomic, genomic, transcriptomic, other) --library_layout [string] Force the library_layout value for the study / reads (accepted: single, paired) - --platform [string] Force the sequencing_platform value for the study / reads + --platform [string] Force the sequencing_platform value for the study / reads --spades_version [string] null [default: 3.15.5] --megahit_version [string] null [default: 1.2.9] --flye_version [string] null [default: 2.9] @@ -45,7 +45,7 @@ Input/output options --blast_reference_genomes_folder [string] The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal directory. --bwamem2_reference_genomes_folder [string] The folder with the reference genome bwa-mem2 indexes, defaults to the Microbiome Informatics internal - + --reference_genomes_folder [string] The folder with reference genomes, defaults to the Microbiome Informatics internal directory. --remove_human_phix [boolean] Remove human and phiX reads pre assembly, and contigs matching those genomes. [default: true] @@ -64,7 +64,6 @@ Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. ``` - Example: ```bash @@ -78,6 +77,7 @@ nextflow run ebi-metagenomics/miassembler \ ``` ### Required DBs: + - `--reference_genome`: reference genome in FASTA format - `--blast_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) - `--bwamem2_reference_genomes_folder`: mandatory **human_phiX** is provided on [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/) @@ -85,7 +85,9 @@ nextflow run ebi-metagenomics/miassembler \ Blast and bwa-mem2 reference databases can be generated for any reference genome to polish input sequences with. 
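For example, for a custom reference genome (hypothetical file name `my_genome.fa`; all folder and index names below are illustrative), the two index folders could be prepared roughly as follows and then passed to the pipeline via `--bwamem2_reference_genomes_folder` and `--blast_reference_genomes_folder`. The individual commands are detailed in the next two subsections:

```
# Hypothetical layout: one folder per index type
mkdir -p my_bwamem2_indexes my_blast_indexes

# bwa-mem2 index files, written with a chosen prefix inside the folder
bwa-mem2 index -p my_bwamem2_indexes/my_genome my_genome.fa

# BLAST nucleotide database inside its own folder
makeblastdb -in my_genome.fa -dbtype nucl -out my_blast_indexes/my_genome
```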
#### BWA-MEM2 + As explained in [bwa-mem2's README](https://github.com/bwa-mem2/bwa-mem2?tab=readme-ov-file#getting-started): + ``` # Use precompiled binaries (recommended) curl -L https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 \ @@ -98,6 +100,7 @@ bwa-mem2-2.2.1_x64-linux/bwa-mem2 index ref.fa This will generate multiple index files in a folder. The folder containing them is the one to use as `bwamem2_reference_genomes_folder`. #### BLAST + ``` makeblastdb -in -dbtype nucl -out ``` @@ -219,10 +222,10 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded ##### Runs exclusion messages -| Exclusion Message | Description | -| --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Exclusion Message | Description | +| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. | -| `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | +| `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. 
| #### Assembled Runs diff --git a/modules.json b/modules.json index f510e07..c34d7cc 100644 --- a/modules.json +++ b/modules.json @@ -50,9 +50,8 @@ }, "fastqc": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["modules"], - "patch": "modules/nf-core/fastqc/fastqc.diff" + "git_sha": "21f230b8cca43755bf73470e6fd0290832a98aef", + "installed_by": ["modules"] }, "flye": { "branch": "master", @@ -82,7 +81,7 @@ }, "multiqc": { "branch": "master", - "git_sha": "314d742bdb357a1df5f9b88427b3b6ac78aa33f7", + "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] }, "porechop/abi": { diff --git a/modules/local/download_from_fire.nf b/modules/local/download_from_fire.nf index a6b81e9..7226f72 100644 --- a/modules/local/download_from_fire.nf +++ b/modules/local/download_from_fire.nf @@ -29,7 +29,7 @@ process DOWNLOAD_FROM_FIRE { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version 2>&1 | sed 's/Python //g') - boto: \$(python -c "import boto3; print(boto3.__version__)") + boto3: \$(python -c "import boto3; print(boto3.__version__)") END_VERSIONS """ @@ -43,7 +43,7 @@ process DOWNLOAD_FROM_FIRE { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version 2>&1 | sed 's/Python //g') - boto: \$(python -c "import boto3; print(boto3.__version__)") + boto3: \$(python -c "import boto3; print(boto3.__version__)") END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml index 1787b38..691d4c7 100644 --- a/modules/nf-core/fastqc/environment.yml +++ b/modules/nf-core/fastqc/environment.yml @@ -1,7 +1,5 @@ -name: fastqc channels: - conda-forge - bioconda - - defaults dependencies: - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/fastqc.diff b/modules/nf-core/fastqc/fastqc.diff deleted file mode 100644 index 0dd7d4d..0000000 --- a/modules/nf-core/fastqc/fastqc.diff +++ /dev/null @@ -1,27 +0,0 @@ -Changes in module 'nf-core/fastqc' ---- modules/nf-core/fastqc/main.nf -+++ modules/nf-core/fastqc/main.nf -@@ -21,19 +21,12 @@ - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" -- // Make list of old name and new name pairs to use for renaming in the bash while loop -- def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } -- def rename_to = old_new_pairs*.join(' ').join(' ') -- def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') -+ - """ -- printf "%s %s\\n" $rename_to | while read old_name new_name; do -- [ -f "\${new_name}" ] || ln -s \$old_name \$new_name -- done -- - fastqc \\ - $args \\ - --threads $task.cpus \\ -- $renamed_files -+ $reads - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - -************************************************************ diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 0a11817..d8989f4 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -21,12 +21,28 @@ process FASTQC { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + // Make list of old name and new name pairs to use for renaming in the bash while loop + def old_new_pairs = reads instanceof Path || reads.size() == 1 ? 
[[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } + def rename_to = old_new_pairs*.join(' ').join(' ') + def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + + // The total amount of allocated RAM by FastQC is equal to the number of threads defined (--threads) time the amount of RAM defined (--memory) + // https://github.com/s-andrews/FastQC/blob/1faeea0412093224d7f6a07f777fad60a5650795/fastqc#L211-L222 + // Dividing the task.memory by task.cpu allows to stick to requested amount of RAM in the label + def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') / task.cpus + // FastQC memory value allowed range (100 - 10000) + def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 100 : memory_in_mb) """ + printf "%s %s\\n" $rename_to | while read old_name new_name; do + [ -f "\${new_name}" ] || ln -s \$old_name \$new_name + done + fastqc \\ $args \\ --threads $task.cpus \\ - $reads + --memory $fastqc_memory \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index ee5507e..2b2e62b 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -11,40 +11,50 @@ tools: FastQC gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%A/C/G/T). + You get information about adapter contamination and other overrepresented sequences. homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ licence: ["GPL-2.0-only"] + identifier: biotools:fastqc input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.html": + type: file + description: FastQC report + pattern: "*_{fastqc.html}" - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.zip": + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@drpatelh" - "@grst" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index b9e8f92..e9d79a0 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -3,107 +3,307 @@ nextflow_process { name "Test Process FASTQC" script "../main.nf" process "FASTQC" + tag "modules" tag "modules_nfcore" tag "fastqc" - test("Single-Read") { + test("sarscov2 single-end [fastq]") { when { - params { - outdir = "$outputDir" + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ } + } + + then { + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
+ // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { process { """ - input[0] = [ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, + { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, 
+ { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, + { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, + { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, + { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 paired-end [fastq] - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 interleaved [fastq] - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 paired-end [bam] - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 multiple [fastq] - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 custom_prefix - stub") { + + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) """ } } then { assertAll ( - { assert process.success }, - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, - { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + { assert process.success }, + { assert snapshot(process.out).match() } ) } } -// TODO -// // -// // Test with paired-end data -// // -// workflow test_fastqc_paired_end { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with interleaved data -// // -// workflow test_fastqc_interleaved { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with bam data -// // -// workflow test_fastqc_bam { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with multiple samples -// // -// workflow test_fastqc_multiple { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with custom prefix -// // -// workflow test_fastqc_custom_prefix { -// input = [ -// [ id:'mysample', single_end:true ], // meta map -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 636a32c..d5db309 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -1,10 +1,392 @@ { - "versions": { + "sarscov2 custom_prefix": { "content": [ [ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-10-09T23:40:54+0000" + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:02:16.374038" + }, + "sarscov2 single-end [fastq] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "test", + "single_end": true + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "test", + "single_end": true + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + 
"timestamp": "2024-07-22T11:02:24.993809" + }, + "sarscov2 custom_prefix - stub": { + "content": [ + { + "0": [ + [ + { + "id": "mysample", + "single_end": true + }, + "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "mysample", + "single_end": true + }, + "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "mysample", + "single_end": true + }, + "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "mysample", + "single_end": true + }, + "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:03:10.93942" + }, + "sarscov2 interleaved [fastq]": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:01:42.355718" + }, + "sarscov2 paired-end [bam]": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:01:53.276274" + }, + "sarscov2 multiple [fastq]": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:02:05.527626" + }, + "sarscov2 paired-end [fastq]": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:01:31.188871" + }, + "sarscov2 paired-end [fastq] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:02:34.273566" + }, + "sarscov2 multiple [fastq] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:03:02.304411" + }, + "sarscov2 single-end [fastq]": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.9.0", + 
"nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:01:19.095607" + }, + "sarscov2 interleaved [fastq] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:02:44.640184" + }, + "sarscov2 paired-end [bam] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "html": [ + [ + { + "id": "test", + "single_end": false + }, + "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ], + "zip": [ + [ + { + "id": "test", + "single_end": false + }, + "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:02:53.550742" } } \ No newline at end of file diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index ecb7dd7..6f5b867 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,7 +1,5 @@ -name: multiqc channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::multiqc=1.22.3 + - bioconda::multiqc=1.25.1 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 0c81a3b..8a816ac 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,15 +3,17 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.22.3--pyhdfd78af_0' : - 'biocontainers/multiqc:1.22.3--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.25.1--pyhdfd78af_0' : + 'biocontainers/multiqc:1.25.1--pyhdfd78af_0' }" input: - path(multiqc_base_files, stageAs: "?/*") - tuple val(meta), path(files, stageAs: "?/*") + path(multiqc_files, stageAs: "?/*") + tuple val(meta), path(pipeline_files, stageAs: "?/*") path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) + path(replace_names) + path(sample_names) output: path "*multiqc_report.html", emit: report @@ -24,16 +26,22 @@ process MULTIQC { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : '' def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' + def replace = replace_names ? 
"--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ $args \\ $config \\ + $prefix \\ $extra_config \\ $logo \\ + $replace \\ + $samples \\ . cat <<-END_VERSIONS > versions.yml @@ -45,7 +53,7 @@ process MULTIQC { stub: """ mkdir multiqc_data - touch multiqc_plots + mkdir multiqc_plots touch multiqc_report.html cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc3..b16c187 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,6 @@ name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report +description: Aggregate results from bioinformatics analyses across many samples into + a single report keywords: - QC - bioinformatics tools @@ -12,40 +13,59 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] + identifier: biotools:multiqc input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. - pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" + - - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections + in multiqc_config. + pattern: "*.{yml,yaml}" + - - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + - - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + - - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
+ pattern: "*.{tsv}" output: - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" + - "*multiqc_report.html": + type: file + description: MultiQC report file + pattern: "multiqc_report.html" - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" + - "*_plots": + type: file + description: Plots created by MultiQC + pattern: "*_data" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@abhi18av" - "@bunop" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index f1c4242..33316a7 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -8,6 +8,8 @@ nextflow_process { tag "modules_nfcore" tag "multiqc" + config "./nextflow.config" + test("sarscov2 single-end [fastqc]") { when { @@ -17,6 +19,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -41,6 +45,8 @@ nextflow_process { input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -66,6 +72,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index 0a4760e..2fcbb5f 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" + "nf-test": "0.9.0", + "nextflow": "24.04.4" }, - "timestamp": "2024-06-25T12:31:21.878452033" + "timestamp": "2024-10-02T17:51:46.317523" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" + "nf-test": "0.9.0", + "nextflow": "24.04.4" }, - "timestamp": "2024-06-25T12:32:02.322196503" + "timestamp": "2024-10-02T17:52:20.680978" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,bf3b209659477254bb8fa5a9405f9984" + "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" + "nf-test": "0.9.0", + "nextflow": "24.04.4" }, - "timestamp": "2024-06-25T12:31:50.064227638" + "timestamp": "2024-10-02T17:52:09.185842" } } \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/nextflow.config b/modules/nf-core/multiqc/tests/nextflow.config new file mode 100644 index 0000000..c537a6a --- /dev/null +++ b/modules/nf-core/multiqc/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'MULTIQC' { + ext.prefix = null + } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index da9c14b..590f635 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,9 +10,7 
@@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "outdir" - ], + "required": ["outdir"], "properties": { "samplesheet": { "type": "string", @@ -43,7 +41,7 @@ }, "private_study": { "type": "boolean", - "description": "To use if the ENA study is private" + "description": "To use if the ENA study is private, *this feature ony works on EBI infrastructure at the moment*" }, "assembler": { "type": "string", @@ -52,7 +50,7 @@ }, "long_reads_assembler_config": { "type": "string", - "description": "Configuration to use flye with. Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", + "description": "Configuration to use flye with. Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", "default": "" }, "single_end": { diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index eb8a7dd..2c61bac 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -1,39 +1,16 @@ -// Groovy // -import groovy.json.JsonSlurper - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { validateParameters; paramsSummaryLog; paramsSummaryMap; samplesheetToList } from 'plugin/nf-schema' - -def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) -def citation = '\n' + WorkflowMain.citation(workflow) + '\n' -def summary_params = paramsSummaryMap(workflow) - -// Print parameter summary log to screen -log.info logo + paramsSummaryLog(workflow) + citation - -validateParameters() - -if (params.help) { - log.info paramsHelp("nextflow run ebi-metagenomics/miassembler --help") - exit 0 -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CONFIG FILES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? file( params.multiqc_config, checkIfExists: true ) : [] -ch_multiqc_logo = params.multiqc_logo ? file( params.multiqc_logo, checkIfExists: true ) : file("$projectDir/assets/mgnify_logo.png", checkIfExists: true) -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - +include { + validateParameters ; + paramsSummaryLog ; + paramsSummaryMap ; + samplesheetToList ; + paramsHelp +} from 'plugin/nf-schema' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -45,9 +22,9 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil // MODULE: Installed directly from nf-core/modules // -include { MULTIQC as MULTIQC_STUDY } from '../modules/nf-core/multiqc/main' -include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { MULTIQC as MULTIQC_STUDY } from '../modules/nf-core/multiqc/main' +include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -58,8 +35,8 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsof // // WORKFLOWS // -include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler' -include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' +include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler' +include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -67,8 +44,7 @@ include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' -include { DOWNLOAD_FROM_FIRE } from '../modules/local/download_from_fire.nf' +include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -76,168 +52,189 @@ include { DOWNLOAD_FROM_FIRE } from '../modules/local/download_from_fire.nf' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Info required for completion email and summary -def multiqc_report = [] - workflow MIASSEMBLER { - ch_versions = Channel.empty() - fetch_tool_metadata = Channel.empty() + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + INIT + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def summary_params = paramsSummaryMap(workflow) + + // Print parameter summary log to screen + log.info(logo + paramsSummaryLog(workflow) + citation) + + validateParameters() + + if (params.help) { + log.info(paramsHelp("nextflow run ebi-metagenomics/miassembler --help")) + exit(0) + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + def ch_multiqc_config = file("${projectDir}/assets/multiqc_config.yml", checkIfExists: true) + def ch_multiqc_custom_config = params.multiqc_config ? file(params.multiqc_config, checkIfExists: true) : [] + def ch_multiqc_logo = params.multiqc_logo ? file(params.multiqc_logo, checkIfExists: true) : file("${projectDir}/assets/mgnify_logo.png", checkIfExists: true) + def ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("${projectDir}/assets/methods_description_template.yml", checkIfExists: true) + + + def ch_versions = Channel.empty() + def fetch_tool_metadata = Channel.empty() + def fetch_reads_transformed = Channel.empty() - if ( params.samplesheet ) { + if (params.samplesheet) { - groupReads = { study_accession, reads_accession, fq1, fq2, library_layout, library_strategy, platform, assembler, assembly_memory, assembler_config -> + def groupReads = { study_accession, reads_accession, fq1, fq2, library_layout, library_strategy, platform, assembler, assembly_memory, assembler_config -> if (fq2 == []) { - return tuple(["id": reads_accession, - "study_accession": study_accession, - "library_layout": library_layout, - "library_strategy": library_strategy, - "platform": params.platform ?: platform, - "single_end": true, - "assembler": assembler ?: params.assembler, - "assembly_memory": assembly_memory ?: params.assembly_memory, - "assembler_config": params.long_reads_assembler_config - ], - [fq1] - ) - } else { - return tuple(["id": reads_accession, - "study_accession": study_accession, - "library_strategy": library_strategy, - "library_layout": library_layout, - "single_end": false, - "assembler": assembler ?: params.assembler, - "assembly_memory": assembly_memory ?: params.assembly_memory, - "assembler_config": params.long_reads_assembler_config, - "platform": params.platform ?: platform - ], - [fq1, fq2]) + return tuple( + [ + "id": reads_accession, + "study_accession": study_accession, + "library_layout": library_layout, + "library_strategy": library_strategy, + "platform": params.platform ?: platform, + "single_end": true, + "assembler": assembler ?: params.assembler, + "assembly_memory": assembly_memory ?: params.assembly_memory, + "assembler_config": assembler_config ?: params.long_reads_assembler_config + ], + [fq1] + ) + } + else { + return tuple( + [ + "id": reads_accession, + "study_accession": study_accession, + "library_strategy": library_strategy, + "library_layout": library_layout, + "single_end": false, + "assembler": assembler ?: params.assembler, + "assembly_memory": assembly_memory ?: params.assembly_memory, + "assembler_config": assembler_config ?: params.long_reads_assembler_config, + "platform": params.platform ?: platform + ], + [fq1, fq2] + ) } } - samplesheet = Channel.fromList(samplesheetToList(params.samplesheet, "./assets/schema_input.json")) + def samplesheet = Channel.fromList(samplesheetToList(params.samplesheet, "./assets/schema_input.json")) // [ study, sample, read1, [read2], library_layout, library_strategy, platform, assembly_memory] fetch_reads_transformed = samplesheet.map(groupReads) - - } else { + } + else { // TODO: remove when the fetch tools get's published on bioconda - fetch_tool_config = file("${projectDir}/assets/fetch_tool_anonymous.json", checkIfExists: true) + def fetch_tool_config = file("${projectDir}/assets/fetch_tool_anonymous.json", checkIfExists: true) - if ( params.private_study ) { + if (params.private_study) { fetch_tool_config = file("${projectDir}/assets/fetch_tool_credentials.json", checkIfExists: true) } FETCHTOOL_READS( - [ [id: params.reads_accession], params.study_accession, params.reads_accession ], + [[id: params.reads_accession], params.study_accession, params.reads_accession], fetch_tool_config ) ch_versions = ch_versions.mix(FETCHTOOL_READS.out.versions) // Push the library strategy into the meta of the reads, this is to make it easier to handle downstream - 
fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout, platform -> { - [ meta + [ - // -- The metadata will be overriden by the parameters -- // - "assembler": params.assembler, - "assembler_config": params.long_reads_assembler_config, - "assembly_memory": params.assembly_memory, - "library_strategy": params.library_strategy ?: library_strategy, - "library_layout": params.library_layout ?: library_layout, - "single_end": params.single_end ?: library_layout == "single", - "platform": params.platform ?: platform - ], reads ] + fetch_reads_transformed = FETCHTOOL_READS.out.reads.map { meta, reads, library_strategy, library_layout, platform -> + { + [ + meta + [ + "assembler": params.assembler, + "assembler_config": params.long_reads_assembler_config, + "assembly_memory": params.assembly_memory, + "library_strategy": params.library_strategy ?: library_strategy, + "library_layout": params.library_layout ?: library_layout, + "single_end": params.single_end ?: library_layout == "single", + "platform": params.platform ?: platform + ], + reads + ] } } // Metadata for MultiQC - fetch_tool_metadata = FETCHTOOL_READS.out.metadata_tsv.map { it[1] }.collectFile( - name: 'fetch_tool_mqc.tsv', - newLine: true, - keepHeader: true, - skip: 1 - ) + fetch_tool_metadata = FETCHTOOL_READS.out.metadata_tsv + .map { it[1] } + .collectFile( + name: 'fetch_tool_mqc.tsv', + newLine: true, + keepHeader: true, + skip: 1 + ) } /********************************************/ /* Selecting the assembly pipeline flavour */ /*******************************************/ - - classified_reads = fetch_reads_transformed.map { meta, reads -> + def classified_reads = fetch_reads_transformed.map { meta, reads -> // Long reads // - if ( ["ont", "pacbio"].contains( meta.platform ) ) { - return [ meta + [long_reads: true], reads] - // Short reads // - } else { - return [ meta + [short_reads: true], reads] + if (["ont", "pacbio"].contains(meta.platform)) { + return [meta + [long_reads: true], reads] + } + else { + return [meta + [short_reads: true], reads] } } - classified_reads.branch { meta, _reads -> - short_reads: meta.short_reads - long_reads: meta.long_reads - }.set { reads_to_assemble } + classified_reads + .branch { meta, _reads -> + short_reads: meta.short_reads + long_reads: meta.long_reads + } + .set { reads_to_assemble } /***************************************/ /* Assemble short reads and long reads */ /***************************************/ - - def short_reads_to_assemble = channel.empty() - - // If running for a private study on EBI infrastructure // - if ( params.private_study ) { - /* - * For private studies we need to bypass Nextflow S3 integration until https://github.com/nextflow-io/nextflow/issues/4873 is fixed - * The EBI parameter is needed as this only works on EBI network, FIRE is not accessible otherwise - */ - DOWNLOAD_FROM_FIRE( - reads_to_assemble.short_reads - ) - - short_reads_to_assemble = DOWNLOAD_FROM_FIRE.out.reads - - } else { - // Carry on - short_reads_to_assemble = reads_to_assemble.short_reads - } - SHORT_READS_ASSEMBLER( - short_reads_to_assemble + reads_to_assemble.short_reads ) - ch_versions = ch_versions.mix( SHORT_READS_ASSEMBLER.out.versions ) + ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLER.out.versions) LONG_READS_ASSEMBLER( reads_to_assemble.long_reads ) - ch_versions = ch_versions.mix( LONG_READS_ASSEMBLER.out.versions ) + ch_versions = ch_versions.mix(LONG_READS_ASSEMBLER.out.versions) - CUSTOM_DUMPSOFTWAREVERSIONS ( + 
CUSTOM_DUMPSOFTWAREVERSIONS( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) // // MODULE: MultiQC // - workflow_summary = WorkflowMiassembler.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) + def workflow_summary = WorkflowMiassembler.paramsSummaryMultiqc(workflow, summary_params) + def ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowMiassembler.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) - ch_methods_description = Channel.value(methods_description) + def methods_description = WorkflowMiassembler.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + def ch_methods_description = Channel.value(methods_description) - ch_multiqc_base_files = Channel.empty() - ch_multiqc_base_files = ch_multiqc_base_files.mix( CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect() ) - ch_multiqc_base_files = ch_multiqc_base_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) - ch_multiqc_base_files = ch_multiqc_base_files.mix( ch_methods_description.collectFile(name: 'methods_description_mqc.yaml') ) + def ch_multiqc_base_files = Channel.empty() + ch_multiqc_base_files = ch_multiqc_base_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + ch_multiqc_base_files = ch_multiqc_base_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_base_files = ch_multiqc_base_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) /**************************************/ /* MultiQC report for the whole study */ /**************************************/ def meta_by_study = { meta, result_artifact -> - [ meta.subMap("study_accession"), result_artifact ] + [meta.subMap("study_accession"), result_artifact] } // Helper method for the MultiQC aggregation by study and runs // @@ -260,22 +257,25 @@ workflow MIASSEMBLER { } } - ch_multiqc_study_tools_files = Channel.empty() + def ch_multiqc_study_tools_files = Channel.empty() - study_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_study) - .join( SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_study) ) - .join( SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_study), remainder: true ) // the assembly step could fail - .join( SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_study), remainder: true ) // the assembly step could fail + def study_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_study) \ + .join(SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_study)) \ + .join(SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_study), remainder: true) \ + .join(SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_study), remainder: true) - ch_multiqc_study_tools_files = study_multiqc_files.flatMap( combineFiles ).groupTuple() + ch_multiqc_study_tools_files = study_multiqc_files.flatMap(combineFiles).groupTuple() // TODO: add the fetch tool log file - MULTIQC_STUDY ( + + MULTIQC_STUDY( ch_multiqc_base_files.collect(), ch_multiqc_study_tools_files, ch_multiqc_config, ch_multiqc_custom_config, - ch_multiqc_logo + ch_multiqc_logo, + [], + [] ) /**************************/ @@ -283,27 +283,28 @@ workflow MIASSEMBLER { /*************************/ def meta_by_run = { meta, result_artifact -> - [ meta.subMap("study_accession", "id", "assembler", "assembler_version"), result_artifact ] + [meta.subMap("study_accession", "id", 
"assembler", "assembler_version"), result_artifact] } - run_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_run) - .join( SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_run) ) - .join( SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_run), remainder: true ) // the assembly step could fail - .join( SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_run), remainder: true ) // the assembly step could fail + def run_multiqc_files = SHORT_READS_ASSEMBLER.out.fastqc_before_zip.map(meta_by_run).join(SHORT_READS_ASSEMBLER.out.fastqc_after_zip.map(meta_by_run)).join(SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map(meta_by_run), remainder: true).join(SHORT_READS_ASSEMBLER.out.quast_results.map(meta_by_run), remainder: true) + // the assembly step could fail // Filter out the non-assembled runs // - ch_multiqc_run_tools_files = run_multiqc_files.filter { meta, fastqc_before, fastqc_after, assembly_coverage, quast -> { + def ch_multiqc_run_tools_files = run_multiqc_files.filter { _meta, _fastqc_before, _fastqc_after, assembly_coverage, quast -> + { return assembly_coverage != null && quast != null } - } .flatMap( combineFiles ).groupTuple() + }.flatMap(combineFiles).groupTuple() // TODO: add the fetch tool log file - MULTIQC_RUN ( + MULTIQC_RUN( ch_multiqc_base_files.collect(), ch_multiqc_run_tools_files, ch_multiqc_config, ch_multiqc_custom_config, - ch_multiqc_logo + ch_multiqc_logo, + [], + [] ) /*****************************/ @@ -313,30 +314,26 @@ workflow MIASSEMBLER { // TODO: we need to add LR end-of-run reports // Short reads asssembled runs // - SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats.map { - meta, _ -> { - return "${meta.id},${meta.assembler},${meta.assembler_version}" + SHORT_READS_ASSEMBLER.out.assembly_coverage_samtools_idxstats + .map { meta, __ -> + { + return "${meta.id},${meta.assembler},${meta.assembler_version}" + } } - }.collectFile(name: "assembled_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) + .collectFile(name: "assembled_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) // Short reads QC failed // - short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed.map { - meta, _, extended_meta -> { - if ( extended_meta.low_reads_count ) { + def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed.map { meta, __, extended_meta -> + { + if (extended_meta.low_reads_count) { return "${meta.id},low_reads_count" } - if ( extended_meta.filter_ratio_threshold_exceeded ) { + if (extended_meta.filter_ratio_threshold_exceeded) { return "${meta.id},filter_ratio_threshold_exceeded" } - error "Unexpected. meta: ${meta}, extended_meta: ${extended_meta}" + error("Unexpected. 
meta: ${meta}, extended_meta: ${extended_meta}") } } short_reads_qc_failed_entries.collectFile(name: "qc_failed_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index f159e84..1aca4a5 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -1,16 +1,3 @@ -import groovy.json.JsonSlurper - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CONFIG FILES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? file( params.multiqc_config, checkIfExists: true ) : [] -ch_multiqc_logo = params.multiqc_logo ? file( params.multiqc_logo, checkIfExists: true ) : file("$projectDir/assets/mgnify_logo.png", checkIfExists: true) -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -21,9 +8,11 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { SHORT_READS_QC } from '../subworkflows/local/short_reads_qc' -include { SHORT_READS_ASSEMBLY_QC } from '../subworkflows/local/short_reads_assembly_qc' -include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_reads_assembly_coverage' +include { DOWNLOAD_FROM_FIRE } from '../modules/local/download_from_fire.nf' + +include { SHORT_READS_QC } from '../subworkflows/local/short_reads_qc' +include { SHORT_READS_ASSEMBLY_QC } from '../subworkflows/local/short_reads_assembly_qc' +include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_reads_assembly_coverage' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -34,11 +23,11 @@ include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_rea // // MODULE: Installed directly from nf-core/modules // -include { FASTQC as FASTQC_BEFORE } from '../modules/nf-core/fastqc/main' -include { FASTQC as FASTQC_AFTER } from '../modules/nf-core/fastqc/main' -include { SPADES } from '../modules/nf-core/spades/main' -include { MEGAHIT } from '../modules/nf-core/megahit/main' -include { QUAST } from '../modules/nf-core/quast/main' +include { FASTQC as FASTQC_BEFORE } from '../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_AFTER } from '../modules/nf-core/fastqc/main' +include { SPADES } from '../modules/nf-core/spades/main' +include { MEGAHIT } from '../modules/nf-core/megahit/main' +include { QUAST } from '../modules/nf-core/quast/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -47,13 +36,28 @@ include { QUAST } from '../modules/nf-core/quast/main' */ workflow SHORT_READS_ASSEMBLER { - take: - reads // tuple(meta), path(reads) + input_reads // tuple(meta), path(reads) main: - ch_versions = Channel.empty() + def ch_versions 
= Channel.empty() + def reads_to_assemble = input_reads + + // If running for a private study on EBI infrastructure // + if (params.private_study) { + /* + * For private studies we need to bypass Nextflow S3 integration until https://github.com/nextflow-io/nextflow/issues/4873 is fixed + * The EBI parameter is needed as this only works on EBI network, FIRE is not accessible otherwise + */ + DOWNLOAD_FROM_FIRE( + input_reads + ) + + ch_versions = ch_versions.mix(DOWNLOAD_FROM_FIRE.out.versions.first()) + + reads_to_assemble = DOWNLOAD_FROM_FIRE.out.reads + } /***************************/ /* Selecting the assembler */ @@ -65,19 +69,22 @@ workflow SHORT_READS_ASSEMBLER { - Paired-end reads are assembled with MetaSPAdes, unless specified otherwise - An error is raised if the assembler and read layout are incompatible (shouldn't happen...) */ - reads_by_assembler = reads.map { meta, reads -> - def selected_assembler = meta.assembler; - if ( selected_assembler == "megahit" || ( meta.single_end && selected_assembler == null ) ) { - return [ meta + [assembler: "megahit", assembler_version: params.megahit_version], reads] - } else if ( ["metaspades", "spades"].contains(selected_assembler) || ( !meta.single_end && selected_assembler == null ) ) { - def xspades_assembler = selected_assembler ?: "metaspades" // Default to "metaspades" if the user didn't select one - return [ meta + [assembler: xspades_assembler, assembler_version: params.spades_version], reads] - } else { - error "Incompatible assembler and/or reads layout. We can't assembly data that is. Reads - single end value: ${meta.single_end}." + def reads_by_assembler = reads_to_assemble.map { meta, reads -> + def selected_assembler = meta.assembler + if (selected_assembler == "megahit" || (meta.single_end && selected_assembler == null)) { + return [meta + [assembler: "megahit", assembler_version: params.megahit_version], reads] + } + else if (["metaspades", "spades"].contains(selected_assembler) || (!meta.single_end && selected_assembler == null)) { + def xspades_assembler = selected_assembler ?: "metaspades" + // Default to "metaspades" if the user didn't select one + return [meta + [assembler: xspades_assembler, assembler_version: params.spades_version], reads] + } + else { + error("Incompatible assembler and/or reads layout. We can't assembly data that is. 
Reads - single end value: ${meta.single_end}.") } } - FASTQC_BEFORE ( + FASTQC_BEFORE( reads_by_assembler ) ch_versions = ch_versions.mix(FASTQC_BEFORE.out.versions) @@ -88,7 +95,7 @@ workflow SHORT_READS_ASSEMBLER { ) ch_versions = ch_versions.mix(SHORT_READS_QC.out.versions) - FASTQC_AFTER ( + FASTQC_AFTER( SHORT_READS_QC.out.qc_reads ) @@ -97,43 +104,45 @@ workflow SHORT_READS_ASSEMBLER { /* - Reads discarded by fastp > 90% (default value) */ /* - Less than 1k reads */ /******************************************/ - extended_qc = SHORT_READS_QC.out.fastp_json.map { meta, json -> { - json_txt = new JsonSlurper().parseText(json.text) - bf_total_reads = json_txt?.summary?.before_filtering?.total_reads ?: 0; - af_total_reads = json_txt?.summary?.after_filtering?.total_reads ?: 0; - reads_qc_meta = [ + def extended_qc = SHORT_READS_QC.out.fastp_json.map { meta, json -> + { + def json_txt = new groovy.json.JsonSlurper().parseText(json.text) + def bf_total_reads = json_txt.summary.before_filtering.total_reads ?: 0 + def af_total_reads = json_txt.summary.after_filtering.total_reads ?: 0 + def reads_qc_meta = [ "low_reads_count": af_total_reads <= params.short_reads_low_reads_count_threshold, - "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.short_reads_filter_ratio_threshold ) + "filter_ratio_threshold_exceeded": af_total_reads == 0 || ((af_total_reads / bf_total_reads) <= params.short_reads_filter_ratio_threshold) ] return [meta, reads_qc_meta] } } - extended_reads_qc = SHORT_READS_QC.out.qc_reads.join( extended_qc ) + def extended_reads_qc = SHORT_READS_QC.out.qc_reads.join(extended_qc) - extended_reads_qc.branch { meta, reads, reads_qc_meta -> - // Filter out failed reads // - qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.filter_ratio_threshold_exceeded - megahit: meta.assembler == "megahit" - xspades: ["metaspades", "spades"].contains(meta.assembler) - }.set { qc_filtered_reads } + extended_reads_qc + .branch { meta, _reads, reads_qc_meta -> + qc_failed: reads_qc_meta.low_reads_count || reads_qc_meta.filter_ratio_threshold_exceeded + megahit: meta.assembler == "megahit" + xspades: ["metaspades", "spades"].contains(meta.assembler) + } + .set { qc_filtered_reads } /*********************/ /* Assembly */ /********************/ SPADES( - qc_filtered_reads.xspades.map { meta, reads, _ -> [meta, reads, [], []] }, - [], // yml input parameters, which we don't use - [] // hmm, not used + qc_filtered_reads.xspades.map { meta, reads, __ -> [meta, reads, [], []] }, + [], + [] ) ch_versions = ch_versions.mix(SPADES.out.versions) MEGAHIT( - qc_filtered_reads.megahit.map { meta, reads, _ -> [meta, reads] } + qc_filtered_reads.megahit.map { meta, reads, __ -> [meta, reads] } ) ch_versions = ch_versions.mix(MEGAHIT.out.versions) - - assembly = SPADES.out.contigs.mix( MEGAHIT.out.contigs ) + + assembly = SPADES.out.contigs.mix(MEGAHIT.out.contigs) // Clean the assembly contigs // SHORT_READS_ASSEMBLY_QC( @@ -144,7 +153,7 @@ workflow SHORT_READS_ASSEMBLER { // Coverage // SHORT_READS_ASSEMBLY_COVERAGE( - SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join( SHORT_READS_QC.out.qc_reads, remainder: false ) + SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false) ) ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_COVERAGE.out.versions) @@ -153,23 +162,17 @@ workflow SHORT_READS_ASSEMBLER { /* The QUAST module was modified to run metaQUAST instead */ QUAST( SHORT_READS_ASSEMBLY_QC.out.filtered_contigs, - [ 
[], [] ], // reference - [ [], [] ] // gff + [[], []], + [[], []] ) ch_versions = ch_versions.mix(QUAST.out.versions) emit: - fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) - qc_failed = qc_filtered_reads.qc_failed // tuple(meta) - fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) - assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) - quast_results = QUAST.out.results // tuple(meta) - versions = ch_versions + fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) + qc_failed = qc_filtered_reads.qc_failed // tuple(meta) + fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) + assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) + quast_results = QUAST.out.results // tuple(meta) + versions = ch_versions } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ From f97e774a97967e442f3710887a0144aeebc8404d Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 12:45:13 +0000 Subject: [PATCH 25/33] Add docs around private studies and one shallow test --- README.md | 15 ++++++++++--- tests/main.nf.test | 55 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 812293b..2c4a906 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,6 @@ This pipeline is still in early development. It's mostly a direct port of the mi ## Usage -> [!WARNING] -> It only runs in EBI Codon cluster using Slurm ATM. - Pipeline help: ```bash @@ -150,6 +147,18 @@ PRJ1,ERR1,/path/to/reads/ERR1_1.fq.gz,/path/to/reads/ERR1_2.fq.gz,paired,metagen PRJ2,ERR2,/path/to/reads/ERR2.fq.gz,,single,genomic,megahit,32 ``` +### ENA Private Data + +The pipeline includes a module to download private data from ENA using the EMBL-EBI FIRE (File Replication) system. This system is restricted for use within the EMBL-EBI network and will not work unless connected to that network. + +If you have private data to assemble, you must provide the full path to the files on a system that Nextflow can access. + +#### Microbiome Informatics Team + +To process private data, the pipeline should be launched with the `--private_study` flag, and the samplesheet must include the private FTP (transfer services) paths. The `download_from_fire` module will be utilized to download the files. + +This module uses [Nextflow secrets](https://www.nextflow.io/docs/latest/secrets.html#how-it-works). Specifically, it requires the `FIRE_ACCESS_KEY` and `FIRE_SECRET_KEY` secrets to authenticate and download the files. 
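+
+For example, the secrets can be registered once with the Nextflow CLI and the pipeline then launched with the `--private_study` flag. The key values and samplesheet path below are placeholders; substitute your own credentials and files:
+
+```bash
+# Register the FIRE credentials as Nextflow secrets (kept outside the pipeline code and logs)
+nextflow secrets set FIRE_ACCESS_KEY "<your-fire-access-key>"
+nextflow secrets set FIRE_SECRET_KEY "<your-fire-secret-key>"
+
+# Launch the pipeline; the samplesheet must list the private FTP paths of the runs to assemble
+nextflow run ebi-metagenomics/miassembler \
+    --private_study \
+    --samplesheet /path/to/private_samplesheet.csv \
+    --outdir <OUTDIR>
+```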
+ ## Outputs The outputs of the pipeline are organized as follows: diff --git a/tests/main.nf.test b/tests/main.nf.test index 3dcfd64..fb6ab8f 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -37,7 +37,7 @@ nextflow_pipeline { short_reads_low_reads_count_threshold = 1000000 - samplesheet = "${projectDir}/tests/samplesheet/test.csv" + samplesheet = "${projectDir}/tests/samplesheet/test.csv" } } @@ -52,6 +52,8 @@ nextflow_pipeline { assert trace.succeeded().count{ task -> task.name.contains("MULTIQC_STUDY") } == 2 assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 0 assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 0 + // This process should not have been called + assert trace.succeeded().count{ task -> task.name.contains("DOWNLOAD_FROM_FIRE") == 0} } } @@ -67,10 +69,10 @@ nextflow_pipeline { outdir = "tests/results" // Force the assembly - short_reads_filter_ratio_threshold = 0.1 + short_reads_filter_ratio_threshold = 0.1 - study_accession = "SRP115494" - reads_accession = "SRR6180434" + study_accession = "SRP115494" + reads_accession = "SRR6180434" } } @@ -91,14 +93,14 @@ nextflow_pipeline { when { params { - outdir = "tests/results" - assembler = "megahit" + outdir = "tests/results" + assembler = "megahit" // Force the assembly - short_reads_filter_ratio_threshold = 0.1 + short_reads_filter_ratio_threshold = 0.1 - study_accession = "SRP115494" - reads_accession = "SRR6180434" + study_accession = "SRP115494" + reads_accession = "SRR6180434" } } @@ -165,7 +167,7 @@ nextflow_pipeline { when { params { outdir = "tests/results" - + study_accession = "DRP007622" reads_accession = "DRR280712" } @@ -190,7 +192,7 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - + study_accession = "DRP007622" reads_accession = "DRR280712" } @@ -218,7 +220,7 @@ nextflow_pipeline { blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" samplesheet = "${projectDir}/tests/samplesheet/test_mem.csv" assembly_memory = 0.5 - // will will be [0.5GB, 0.75GB, 1.13GB, ...] which rounds down to [0, 0, 1, ...] so should definitely fail twice before succeeding. after a few trys. + // will will be [0.5GB, 0.75GB, 1.13GB, ...] which rounds down to [0, 0, 1, ...] so should definitely fail twice before succeeding. after a few trys. 
max_spades_retries = 5 } } @@ -235,4 +237,31 @@ nextflow_pipeline { } } -} \ No newline at end of file + + test("Private study reads - this one should fail") { + + tag "samplesheet" + tag "private" + + when { + params { + outdir = "tests/results" + assembler = "spades" + bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" + blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" + samplesheet = "${projectDir}/tests/samplesheet/test.csv" + private_study = true + } + } + + // Complete this test when secrets are implemented in nf-test https://github.com/askimed/nf-test/issues/145 + then { + with(workflow) { + assert !success + assert stdout.count{ line -> line.contains("Required secrets are missing") } == 1 + } + } + + } + +} From dfaffbadf881c6700421c822647fb5c559255806 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 13:43:16 +0000 Subject: [PATCH 26/33] Adjust the version on the wf metadata --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 52f1cd3..3080801 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,7 +20,7 @@ params { // For already fetched data samplesheet = null - + // The pipeline will use the metadata from ENA (obtained by the fetch_tool) // As the metadata can be incorrect, we provide the following parameters to // "force" them @@ -288,7 +288,7 @@ manifest { description = """Microbiome Informatics metagenomes assembly pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0dev' + version = 'v1.0.0' doi = '' } From eb26726659a604145e6a72bdc691d90ca4ff850d Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 15:56:13 +0000 Subject: [PATCH 27/33] Massive commit, sorry about that. nf-core linting + fixes all over the place to follow sensible nf-core linting rules Updates multiqc and blastn Enabled a nf-core linting github action Tweaked the tests slightly (I think they are still failling .. 
testing and fixing ATM) Upgraded to nf-schema 2.0.2 - also pinned this dependency Remove check_max and moved to resources limits (require nextflow 24.0.0 as min - which I've set on the config) Created some missing .diff for some modules --- .github/workflows/linting.yml | 80 ++++++++ .github/workflows/{ci.yml => nf_tests.yml} | 20 +- .nf-core.yml | 4 + assets/multiqc_config.yml | 6 +- conf/base.config | 12 +- conf/codon_slurm.config | 1 - conf/modules.config | 124 ++++++------- conf/test.config | 11 +- main.nf | 30 ++- modules.json | 16 +- .../bwamem2/mem/bwamem2-mem.diff | 29 +++ .../nf-core/blast/blastn/blast-blastn.diff | 7 + modules/nf-core/blast/blastn/environment.yml | 4 +- modules/nf-core/blast/blastn/main.nf | 4 +- modules/nf-core/blast/blastn/meta.yml | 61 ++++--- .../nf-core/blast/blastn/tests/main.nf.test | 6 +- .../blast/blastn/tests/main.nf.test.snap | 4 +- modules/nf-core/canu/canu.diff | 37 ++++ modules/nf-core/megahit/environment.yml | 4 +- modules/nf-core/megahit/main.nf | 104 +++++------ modules/nf-core/megahit/megahit.diff | 53 +++--- modules/nf-core/megahit/meta.yml | 123 +++++++++---- modules/nf-core/megahit/tests/main.nf.test | 126 +++++++++++++ .../nf-core/megahit/tests/main.nf.test.snap | 172 ++++++++++++++++++ modules/nf-core/megahit/tests/tags.yml | 2 + .../minimap2/align/minimap2-align.diff | 59 ++++++ modules/nf-core/multiqc/multiqc.diff | 22 +++ modules/nf-core/seqkit/grep/seqkit-grep.diff | 18 ++ nextflow.config | 52 +----- nextflow_schema.json | 40 +--- nf-test.config | 1 - tests/main.nf.test | 14 +- workflows/miassembler.nf | 15 +- 33 files changed, 897 insertions(+), 364 deletions(-) create mode 100644 .github/workflows/linting.yml rename .github/workflows/{ci.yml => nf_tests.yml} (65%) create mode 100644 modules/ebi-metagenomics/bwamem2/mem/bwamem2-mem.diff create mode 100644 modules/nf-core/canu/canu.diff create mode 100644 modules/nf-core/megahit/tests/main.nf.test create mode 100644 modules/nf-core/megahit/tests/main.nf.test.snap create mode 100644 modules/nf-core/megahit/tests/tags.yml create mode 100644 modules/nf-core/minimap2/align/minimap2-align.diff create mode 100644 modules/nf-core/multiqc/multiqc.diff diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000..62a2d28 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,80 @@ +name: nf-core linting +on: + push: + branches: + - dev + pull_request: + release: + types: [published] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + with: + python-version: "3.12" + + - name: Install pre-commit + run: pip install pre-commit + + - name: Run pre-commit + run: pre-commit run --all-files + + nf-core: + runs-on: ubuntu-latest + steps: + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v2 + + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + with: + python-version: "3.12" + architecture: "x64" + + - name: read .nf-core.yml + uses: pietrobolcato/action-read-yaml@1.1.0 + id: read_yml + with: + config: ${{ github.workspace }}/.nf-core.yml + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }} + + - name: Run nf-core pipelines lint + 
if: ${{ github.base_ref != 'main' }} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + + - name: Run nf-core pipelines lint --release + if: ${{ github.base_ref == 'main' }} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt pipelines lint --release --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + + - name: Save PR number + if: ${{ always() }} + run: echo ${{ github.event.pull_request.number }} > PR_number.txt + + - name: Upload linting log file artifact + if: ${{ always() }} + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 + with: + name: linting-logs + path: | + lint_log.txt + lint_results.md + PR_number.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/nf_tests.yml similarity index 65% rename from .github/workflows/ci.yml rename to .github/workflows/nf_tests.yml index b55e0f6..f29e7d8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/nf_tests.yml @@ -1,11 +1,9 @@ name: nf-test CI on: - push: - branches: - - dev pull_request: release: types: [published] + workflow_dispatch: env: NXF_ANSI_LOG: false @@ -15,22 +13,24 @@ jobs: name: Run pipeline with test data runs-on: ubuntu-latest + strategy: + matrix: + # Nextflow versions: check pipeline minimum and current latest + NXF_VER: ["24.04.0"] + steps: - name: Check out pipeline code uses: actions/checkout@v4 - - uses: actions/setup-java@99b8673ff64fbf99d8d325f52d9a5bdedb8483e9 # v4 - with: - distribution: "temurin" - java-version: "17" - - name: Setup Nextflow - uses: nf-core/setup-nextflow@v2 + uses: nf-core/setup-nextflow@v2.0.0 + with: + version: "${{ matrix.NXF_VER }}" - name: Install nf-test uses: nf-core/setup-nf-test@v1 with: - version: 0.9.0 + install-pdiff: true - name: Run pipeline with test data run: | diff --git a/.nf-core.yml b/.nf-core.yml index 4db2825..6074a56 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -20,6 +20,7 @@ lint: - .github/workflows/ci.yml - .github/workflows/linting_comment.yml - .github/workflows/linting.yml + - .github/workflows/ci.yml - conf/test_full.config - lib/Utils.groovy - lib/WorkflowMain.groovy @@ -32,7 +33,9 @@ lint: - docs/images/nf-core-miassembler_logo_light.png - docs/images/nf-core-miassembler_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/PULL_REQUEST_TEMPLATE.md - .github/CONTRIBUTING.md + - .github/workflows/linting.yml - LICENSE - docs/README.md - .gitignore @@ -45,5 +48,6 @@ lint: - params.custom_config_base - manifest.name - manifest.homePage + - custom_config readme: - nextflow_badge diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 2986e13..b9feb24 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -3,12 +3,12 @@ report_comment: > analysis pipeline. 
report_section_order: - "software_versions": - order: -1000 "ebi-metagenomics-miassembler-methods-description": order: -1001 - "ebi-metagenomics-miassembler-summary": + "software_versions": order: -1002 + "ebi-metagenomics-miassembler-summary": + order: -1003 export_plots: true diff --git a/conf/base.config b/conf/base.config index aff79f6..7170d21 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,9 +10,15 @@ process { - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + resourceLimits = [ + cpus: 32, + memory: '1.TB', + time: '168.h' + ] + + cpus = { 1 * task.attempt } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } errorStrategy = { task.exitStatus in ((130..155) + 104) ? 'retry' : 'finish' } maxRetries = 1 diff --git a/conf/codon_slurm.config b/conf/codon_slurm.config index 7fb4789..c658798 100644 --- a/conf/codon_slurm.config +++ b/conf/codon_slurm.config @@ -12,7 +12,6 @@ executor { queueGlobalStatus = true submitRateLimit = "10 sec" pollInterval = "10 sec" - } conda.enabled = false diff --git a/conf/modules.config b/conf/modules.config index e6814bc..5e58f41 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,17 +13,17 @@ process { withName: 'FETCHTOOL*' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } ext.args = params.private_study ? "--private" : "" } withName: 'FASTP*' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 8.h * task.attempt } publishDir = [ [ path: "${params.outdir}", @@ -61,9 +61,9 @@ process { } withName: 'FASTQC' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 8.h * task.attempt } publishDir = [ [ path: "${params.outdir}", @@ -84,9 +84,9 @@ process { // This BWAMEM2_MEM belongs to the coverage module withName: 'BWAMEM2_MEM_COVERAGE' { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 20.GB * task.attempt, 'memory' ) } - time = { check_max( 1.h * task.attempt, 'time' ) } + cpus = { 12 * task.attempt } + memory = { 20.GB * task.attempt } + time = { 1.h * task.attempt } ext.args = "-M" ext.args2 = "-F 268 -uS" @@ -94,23 +94,23 @@ process { /* Decontamination */ withName: 'BWAMEM2DECONTNOBAMS' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 2 * task.attempt } + time = { 8.h * task.attempt } ext.prefix = "decontaminated" } withName: 'HUMAN*_DECONTAMINATION' { - memory = { check_max( 64.GB * task.attempt, 'memory' ) } + memory = { 64.GB * task.attempt } } withName: 'HOST_DECONTAMINATION' { - memory = { check_max( 24.GB * task.attempt, 'memory' ) } + memory = { 24.GB * task.attempt } } withName: 'CANU*' { - cpus = { check_max( 4 , 'cpus' ) } - memory = { check_max( 3.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 4 } + memory = { 3.GB * task.attempt } + time = { 4.h * task.attempt } ext.args = [ '-trim', @@ -141,10 
+141,10 @@ process { ].join(' ').trim() } - withName: 'PORECHOP_ONT' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + withName: 'PORECHOP_ABI' { + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } } /* --------- */ @@ -154,11 +154,11 @@ process { // We increase the memory 50% with each try memory = { def assembly_memory = meta.assembly_memory ?: params.assembly_memory; - check_max( assembly_memory.GB + ( assembly_memory.GB * 0.5 * ( task.attempt - 1 ) ), 'memory') + assembly_memory.GB + ( assembly_memory.GB * 0.5 * ( task.attempt - 1 ) ) } - cpus = { check_max( 32 * task.attempt, 'cpus') } + cpus = { 32 * task.attempt } // TODO: tweak this based on input ( using the biome maybe? ) - time = { check_max( 168.h * task.attempt, 'time') } + time = { 168.h * task.attempt } ext.args = params.spades_only_assembler ? "--only-assembler" : "" errorStrategy = 'retry' maxRetries = params.max_spades_retries @@ -194,10 +194,10 @@ process { withName: 'MEGAHIT' { memory = { def assembly_memory = meta.assembly_memory ?: params.assembly_memory; - check_max( assembly_memory.GB + ( assembly_memory.GB * 0.5 * ( task.attempt - 1 ) ), 'memory') + assembly_memory.GB + ( assembly_memory.GB * 0.5 * ( task.attempt - 1 ) ) } - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + cpus = { 12 * task.attempt } + time = { 16.h * task.attempt } errorStrategy = 'retry' maxRetries = params.max_megahit_retries @@ -218,15 +218,15 @@ process { } withName: 'SEQKIT_SEQ' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 2 * task.attempt } + memory = { 12.GB * task.attempt } + time = { 4.h * task.attempt } } withName: 'BLAST_BLASTN*' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 8.h * task.attempt } ext.args = [ '-task', @@ -274,17 +274,17 @@ process { } withName: 'SEQKIT_GREP' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 2 * task.attempt } + memory = { 12.GB * task.attempt } + time = { 4.h * task.attempt } ext.args = "--invert-match" } // Dummy process to published the filtered and decontaminated contigs withName: 'PUBLISH_CLEANED_CONTIGS' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 250.MB * task.attempt , 'memory' ) } - time = { check_max( 30.m * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 250.MB * task.attempt } + time = { 30.m * task.attempt } publishDir = [ [ path: "${params.outdir}", @@ -300,15 +300,15 @@ process { } withName: 'BWAMEM2_INDEX' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 16.GB * task.attempt } + time = { 6.h * task.attempt } } withName: 'METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + cpus = { 6 * task.attempt } + memory = { 36.GB * task.attempt } + time = { 8.h * 
task.attempt } publishDir = [ [ path: "${params.outdir}", @@ -325,15 +325,15 @@ process { } withName: 'SAMTOOLS_IDXSTATS' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } } withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } publishDir = [ [ @@ -345,9 +345,9 @@ process { } withName: 'MULTIQC_STUDY' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } @@ -370,9 +370,9 @@ process { } withName: 'MULTIQC_RUN' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } @@ -395,9 +395,9 @@ process { } withName: 'QUAST' { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + cpus = { 1 } + memory = { 6.GB * task.attempt } + time = { 4.h * task.attempt } publishDir = [ [ diff --git a/conf/test.config b/conf/test.config index 2e734c2..b3b6265 100644 --- a/conf/test.config +++ b/conf/test.config @@ -13,11 +13,14 @@ profiles { // Limit resources so that this can run on GitHub Actions test { + process { + resourceLimits = [ + cpus: 2, + memory: 6.GB, + time: 1.h + ] + } params { - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" reference_genomes_folder = "${projectDir}/tests/human/" diff --git a/main.nf b/main.nf index f1d5494..b2321bd 100644 --- a/main.nf +++ b/main.nf @@ -15,25 +15,7 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { validateParameters; paramsHelp; paramsSummaryLog; paramsSummaryMap; } from 'plugin/nf-schema' - -def summary_params = paramsSummaryMap(workflow) - -if (params.help) { - log.info paramsHelp("nextflow run ebi-metagenomics/miassembler --help") - exit 0 -} - -validateParameters() - -// Custom validation // -// The conditional validation doesn't work yet -> https://github.com/nf-core/tools/issues/2619 -if ( !params.samplesheet && ( !params.study_accession || !params.reads_accession ) ) { - error "Either --samplesheet or both --study_accession and --reads_accession are required." 
- exit 1 -} - -log.info paramsSummaryLog(workflow) +include { validateParameters } from 'plugin/nf-schema' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,6 +43,16 @@ workflow EBIMETAGENOMICS_MIASSEMBLER { // See: https://github.com/nf-core/rnaseq/issues/619 // workflow { + + validateParameters() + + // Custom validation // + // The conditional validation doesn't work yet -> https://github.com/nf-core/tools/issues/2619 + if ( !params.samplesheet && ( !params.study_accession || !params.reads_accession ) ) { + error "Either --samplesheet or both --study_accession and --reads_accession are required." + exit 1 + } + EBIMETAGENOMICS_MIASSEMBLER () } diff --git a/modules.json b/modules.json index c34d7cc..2b2ccea 100644 --- a/modules.json +++ b/modules.json @@ -8,7 +8,8 @@ "bwamem2/mem": { "branch": "main", "git_sha": "75707538d91ddd27fb6007b4ac3710cb05154780", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/ebi-metagenomics/bwamem2/mem/bwamem2-mem.diff" }, "bwamem2decontnobams": { "branch": "main", @@ -23,7 +24,7 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "209e5a3e2753c5e628736a662c877c20f341ee15", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, @@ -35,7 +36,8 @@ "canu": { "branch": "master", "git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/canu/canu.diff" }, "custom/dumpsoftwareversions": { "branch": "master", @@ -65,7 +67,7 @@ }, "megahit": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "7755db15e36b30da564cd67fffdfe18a255092aa", "installed_by": ["modules"], "patch": "modules/nf-core/megahit/megahit.diff" }, @@ -77,12 +79,14 @@ "minimap2/align": { "branch": "master", "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/multiqc/multiqc.diff" }, "porechop/abi": { "branch": "master", diff --git a/modules/ebi-metagenomics/bwamem2/mem/bwamem2-mem.diff b/modules/ebi-metagenomics/bwamem2/mem/bwamem2-mem.diff new file mode 100644 index 0000000..759865c --- /dev/null +++ b/modules/ebi-metagenomics/bwamem2/mem/bwamem2-mem.diff @@ -0,0 +1,29 @@ +Changes in module 'ebi-metagenomics/bwamem2/mem' +'modules/ebi-metagenomics/bwamem2/mem/environment.yml' is unchanged +Changes in 'bwamem2/mem/main.nf': +--- modules/ebi-metagenomics/bwamem2/mem/main.nf ++++ modules/ebi-metagenomics/bwamem2/mem/main.nf +@@ -7,8 +7,7 @@ + 'biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2d15960ccea84e249a150b7f5d4db3a42fc2d6c3-0' }" + + input: +- tuple val(meta), path(reads) +- tuple val(meta2), path(index) ++ tuple val(meta), path(reads), path(index) + + output: + tuple val(meta), path("*_sorted.bam"), path("*_sorted.bam.bai"), emit: bam +@@ -21,7 +20,6 @@ + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: meta.id +- def database = task.ext.database ?: meta2.id + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + +'modules/ebi-metagenomics/bwamem2/mem/meta.yml' is unchanged 
+'modules/ebi-metagenomics/bwamem2/mem/tests/tags.yml' is unchanged +'modules/ebi-metagenomics/bwamem2/mem/tests/main.nf.test.snap' is unchanged +'modules/ebi-metagenomics/bwamem2/mem/tests/main.nf.test' is unchanged +************************************************************ diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff index 888e64e..e596c33 100644 --- a/modules/nf-core/blast/blastn/blast-blastn.diff +++ b/modules/nf-core/blast/blastn/blast-blastn.diff @@ -1,4 +1,6 @@ Changes in module 'nf-core/blast/blastn' +'modules/nf-core/blast/blastn/environment.yml' is unchanged +Changes in 'blast/blastn/main.nf': --- modules/nf-core/blast/blastn/main.nf +++ modules/nf-core/blast/blastn/main.nf @@ -20,7 +20,7 @@ @@ -11,4 +13,9 @@ Changes in module 'nf-core/blast/blastn' def fasta_name = is_compressed ? fasta.getBaseName() : fasta +'modules/nf-core/blast/blastn/meta.yml' is unchanged +'modules/nf-core/blast/blastn/tests/tags.yml' is unchanged +'modules/nf-core/blast/blastn/tests/nextflow.config' is unchanged +'modules/nf-core/blast/blastn/tests/main.nf.test.snap' is unchanged +'modules/nf-core/blast/blastn/tests/main.nf.test' is unchanged ************************************************************ diff --git a/modules/nf-core/blast/blastn/environment.yml b/modules/nf-core/blast/blastn/environment.yml index cb9b15d..777e097 100644 --- a/modules/nf-core/blast/blastn/environment.yml +++ b/modules/nf-core/blast/blastn/environment.yml @@ -1,7 +1,5 @@ -name: blast_blastn channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::blast=2.14.1 + - bioconda::blast=2.15.0 diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index 9b44592..587e799 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -4,8 +4,8 @@ process BLAST_BLASTN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/blast:2.14.1--pl5321h6f7f691_0': - 'biocontainers/blast:2.14.1--pl5321h6f7f691_0' }" + 'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1': + 'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }" input: tuple val(meta) , path(fasta) diff --git a/modules/nf-core/blast/blastn/meta.yml b/modules/nf-core/blast/blastn/meta.yml index a0d64dd..0f5e41b 100644 --- a/modules/nf-core/blast/blastn/meta.yml +++ b/modules/nf-core/blast/blastn/meta.yml @@ -13,39 +13,42 @@ tools: documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs doi: 10.1016/S0022-2836(05)80360-2 licence: ["US-Government-Work"] + identifier: "" input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: Input fasta file containing queries sequences - pattern: "*.{fa,fasta,fa.gz,fasta.gz}" - - meta2: - type: map - description: | - Groovy Map containing db information - e.g. [ id:'test2', single_end:false ] - - db: - type: directory - description: Directory containing the blast database - pattern: "*" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing queries sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - - meta2: + type: map + description: | + Groovy Map containing db information + e.g. [ id:'test2', single_end:false ] + - db: + type: directory + description: Directory containing the blast database + pattern: "*" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - txt: - type: file - description: File containing blastn hits - pattern: "*.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.txt": + type: file + description: File containing blastn hits + pattern: "*.txt" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test b/modules/nf-core/blast/blastn/tests/main.nf.test index 02ecfab..aacc93c 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test +++ b/modules/nf-core/blast/blastn/tests/main.nf.test @@ -15,7 +15,7 @@ nextflow_process { script "../../makeblastdb/main.nf" process { """ - input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [id:'test2'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] """ } } @@ -29,7 +29,7 @@ nextflow_process { } process { """ - input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] input[1] = BLAST_MAKEBLASTDB.out.db """ } @@ -53,7 +53,7 @@ nextflow_process { } process { """ - input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta_gz'], checkIfExists: true) ] + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ] input[1] = BLAST_MAKEBLASTDB.out.db """ } diff --git a/modules/nf-core/blast/blastn/tests/main.nf.test.snap b/modules/nf-core/blast/blastn/tests/main.nf.test.snap index d1b5f3f..dd8b775 100644 --- a/modules/nf-core/blast/blastn/tests/main.nf.test.snap +++ b/modules/nf-core/blast/blastn/tests/main.nf.test.snap @@ -2,7 +2,7 @@ "versions": { "content": [ [ - "versions.yml:md5,2d5ffadc7035672f6a9e00b01d1751ea" + "versions.yml:md5,faf2471d836ebbf24d96d3e1f8720b17" ] ], "timestamp": "2023-12-11T07:20:03.54997013" @@ -10,7 +10,7 @@ "versions_zipped": { "content": [ [ - "versions.yml:md5,2d5ffadc7035672f6a9e00b01d1751ea" + "versions.yml:md5,faf2471d836ebbf24d96d3e1f8720b17" ] ], "timestamp": "2023-12-11T07:20:12.925782708" diff --git a/modules/nf-core/canu/canu.diff b/modules/nf-core/canu/canu.diff new file mode 100644 index 0000000..1e6aba1 --- /dev/null +++ b/modules/nf-core/canu/canu.diff @@ -0,0 +1,37 @@ +Changes in module 'nf-core/canu' +'modules/nf-core/canu/environment.yml' is unchanged +Changes in 'canu/main.nf': +--- modules/nf-core/canu/main.nf ++++ modules/nf-core/canu/main.nf +@@ -15,7 +15,7 @@ + output: + tuple val(meta), path("*.report") , emit: report + tuple val(meta), path("*.contigs.fasta.gz") , emit: assembly , optional: true +- tuple val(meta), 
path("*.unassembled.fasta.gz") , emit: contigs ++ tuple val(meta), path("*.unassembled.fasta.gz") , emit: contigs , optional: true + tuple val(meta), path("*.correctedReads.fasta.gz") , emit: corrected_reads , optional: true + tuple val(meta), path("*.trimmedReads.fasta.gz") , emit: corrected_trimmed_reads , optional: true + tuple val(meta), path("*.contigs.layout") , emit: metadata , optional: true +@@ -28,6 +28,7 @@ + + script: + def args = task.ext.args ?: '' ++ def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def valid_mode = ["-pacbio", "-nanopore", "-pacbio-hifi"] + if ( !valid_mode.contains(mode) ) { error "Unrecognised mode to run Canu. Options: ${valid_mode.join(', ')}" } +@@ -37,10 +38,9 @@ + $mode \\ + genomeSize=${genomesize} \\ + $args \\ ++ $args2 \\ + maxThreads=$task.cpus \\ + $reads +- +- gzip *.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +'modules/nf-core/canu/meta.yml' is unchanged +************************************************************ diff --git a/modules/nf-core/megahit/environment.yml b/modules/nf-core/megahit/environment.yml index aac2f99..eed8b72 100644 --- a/modules/nf-core/megahit/environment.yml +++ b/modules/nf-core/megahit/environment.yml @@ -1,8 +1,6 @@ -name: megahit channels: - conda-forge - bioconda - - defaults dependencies: - bioconda::megahit=1.2.9 - - conda-forge::pigz=2.6 + - conda-forge::pigz=2.8 diff --git a/modules/nf-core/megahit/main.nf b/modules/nf-core/megahit/main.nf index 750e3ec..dc9bc4b 100644 --- a/modules/nf-core/megahit/main.nf +++ b/modules/nf-core/megahit/main.nf @@ -1,22 +1,22 @@ process MEGAHIT { - tag "$meta.id" + tag "${meta.id}" label 'process_high' - conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' : - 'biocontainers/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f2/f2cb827988dca7067ff8096c37cb20bc841c878013da52ad47a50865d54efe83/data' : + 'community.wave.seqera.io/library/megahit_pigz:87a590163e594224' }" input: tuple val(meta), path(reads) output: - tuple val(meta), path("megahit_out/*.contigs.fa.gz") , emit: contigs - tuple val(meta), path("megahit_out/intermediate_contigs/k*.contigs.fa.gz") , emit: k_contigs - tuple val(meta), path("megahit_out/intermediate_contigs/k*.addi.fa.gz") , emit: addi_contigs - tuple val(meta), path("megahit_out/intermediate_contigs/k*.local.fa.gz") , emit: local_contigs - tuple val(meta), path("megahit_out/intermediate_contigs/k*.final.contigs.fa.gz"), emit: kfinal_contigs - path "versions.yml" , emit: versions + tuple val(meta), path("*.contigs.fa.gz") , emit: contigs + tuple val(meta), path("intermediate_contigs/k*.contigs.fa.gz") , emit: k_contigs + tuple val(meta), path("intermediate_contigs/k*.addi.fa.gz") , emit: addi_contigs + tuple val(meta), path("intermediate_contigs/k*.local.fa.gz") , emit: local_contigs + tuple val(meta), path("intermediate_contigs/k*.final.contigs.fa.gz"), emit: kfinal_contigs + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -25,56 +25,46 @@ process MEGAHIT { def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - megahit \\ - -r ${reads} \\ - -t $task.cpus \\ - $args \\ - --out-prefix $prefix - - if [ ! -s megahit_out/*.fa ]; then - echo "No contigs assembled" | tee /dev/stderr - exit 1 - fi + def reads_command = meta.single_end ? "-r ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + megahit \\ + ${reads_command} \\ + ${args} \\ + -t ${task.cpus} \\ + --out-prefix ${prefix} - pigz \\ - --no-name \\ - -p $task.cpus \\ - $args2 \\ - megahit_out/*.fa \\ - megahit_out/intermediate_contigs/*.fa + pigz \\ + --no-name \\ + -p ${task.cpus} \\ + ${args2} \\ + megahit_out/*.fa \\ + megahit_out/intermediate_contigs/*.fa - cat <<-END_VERSIONS > versions.yml - "${task.process}": - megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') - END_VERSIONS - """ - } else { - """ - megahit \\ - -1 ${reads[0]} \\ - -2 ${reads[1]} \\ - -t $task.cpus \\ - $args \\ - --out-prefix $prefix + mv megahit_out/* . - if [ ! -s megahit_out/*.fa ]; then - echo "No contigs assembled" | tee /dev/stderr - exit 1 - fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ - pigz \\ - --no-name \\ - -p $task.cpus \\ - $args2 \\ - megahit_out/*.fa \\ - megahit_out/intermediate_contigs/*.fa + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end ? 
"-r ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + mkdir -p intermediate_contigs + echo "" | gzip > ${prefix}.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.addi.fa.gz + echo "" | gzip > intermediate_contigs/k21.local.fa.gz + echo "" | gzip > intermediate_contigs/k21.final.contigs.fa.gz + touch ${prefix}.log - cat <<-END_VERSIONS > versions.yml - "${task.process}": - megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') - END_VERSIONS - """ - } + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ } diff --git a/modules/nf-core/megahit/megahit.diff b/modules/nf-core/megahit/megahit.diff index 9f4fe6f..0f2f60b 100644 --- a/modules/nf-core/megahit/megahit.diff +++ b/modules/nf-core/megahit/megahit.diff @@ -1,29 +1,38 @@ Changes in module 'nf-core/megahit' +'modules/nf-core/megahit/environment.yml' is unchanged +Changes in 'megahit/main.nf': --- modules/nf-core/megahit/main.nf +++ modules/nf-core/megahit/main.nf -@@ -33,6 +33,11 @@ - $args \\ - --out-prefix $prefix +@@ -7,7 +7,7 @@ + 'community.wave.seqera.io/library/megahit_pigz:87a590163e594224' }" -+ if [ ! -s megahit_out/*.fa ]; then -+ echo "No contigs assembled" | tee /dev/stderr -+ exit 1 -+ fi -+ - pigz \\ - --no-name \\ - -p $task.cpus \\ -@@ -54,6 +59,11 @@ - $args \\ - --out-prefix $prefix + input: +- tuple val(meta), path(reads1), path(reads2) ++ tuple val(meta), path(reads) -+ if [ ! -s megahit_out/*.fa ]; then -+ echo "No contigs assembled" | tee /dev/stderr -+ exit 1 -+ fi -+ - pigz \\ - --no-name \\ - -p $task.cpus \\ + output: + tuple val(meta), path("*.contigs.fa.gz") , emit: contigs +@@ -25,7 +25,7 @@ + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" +- def reads_command = meta.single_end || !reads2 ? "-r ${reads1}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" ++ def reads_command = meta.single_end ? "-r ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + megahit \\ + ${reads_command} \\ +@@ -52,7 +52,7 @@ + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" +- def reads_command = meta.single_end || !reads2 ? "-r ${reads1}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" ++ def reads_command = meta.single_end ? 
"-r ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + mkdir -p intermediate_contigs + echo "" | gzip > ${prefix}.contigs.fa.gz +'modules/nf-core/megahit/meta.yml' is unchanged +'modules/nf-core/megahit/tests/tags.yml' is unchanged +'modules/nf-core/megahit/tests/main.nf.test.snap' is unchanged +'modules/nf-core/megahit/tests/main.nf.test' is unchanged ************************************************************ diff --git a/modules/nf-core/megahit/meta.yml b/modules/nf-core/megahit/meta.yml index 83b718f..04dab4c 100644 --- a/modules/nf-core/megahit/meta.yml +++ b/modules/nf-core/megahit/meta.yml @@ -8,53 +8,106 @@ keywords: - metagenomics tools: - megahit: - description: "An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph" + description: "An ultra-fast single-node solution for large and complex metagenomics + assembly via succinct de Bruijn graph" homepage: https://github.com/voutcn/megahit documentation: https://github.com/voutcn/megahit tool_dev_url: https://github.com/voutcn/megahit doi: "10.1093/bioinformatics/btv033" licence: ["GPL v3"] + args_id: "$args" + identifier: biotools:megahit + - pigz: + description: "Parallel implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + args_id: "$args2" + + identifier: biotools:megahit input: - - meta: - type: map - description: | - Groovy Map containing sample information and input single, or paired-end FASTA/FASTQ files (optionally decompressed) - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively in gzipped or uncompressed FASTQ or FASTA format. + - - meta: + type: map + description: | + Groovy Map containing sample information and input single, or paired-end FASTA/FASTQ files (optionally decompressed) + e.g. [ id:'test', single_end:false ] + - reads1: + type: file + description: | + A single or list of input FastQ files for single-end or R1 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. + - reads2: + type: file + description: | + A single or list of input FastQ files for R2 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - contigs: - type: file - description: Final final contigs result of the assembly in FASTA format. - pattern: "*.contigs.fa.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.contigs.fa.gz": + type: file + description: Final final contigs result of the assembly in FASTA format. + pattern: "*.contigs.fa.gz" - k_contigs: - type: file - description: Contigs assembled from the de Bruijn graph of order-K - pattern: "k*.contigs.fa.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - intermediate_contigs/k*.contigs.fa.gz: + type: file + description: Contigs assembled from the de Bruijn graph of order-K + pattern: "k*.contigs.fa.gz" - addi_contigs: - type: file - description: Contigs assembled after iteratively removing local low coverage unitigs in the de Bruijn graph of order-K - pattern: "k*.addi.fa.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.addi.fa.gz: + type: file + description: Contigs assembled after iteratively removing local low coverage + unitigs in the de Bruijn graph of order-K + pattern: "k*.addi.fa.gz" - local_contigs: - type: file - description: Contigs of the locally assembled contigs for k=K - pattern: "k*.local.fa.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.local.fa.gz: + type: file + description: Contigs of the locally assembled contigs for k=K + pattern: "k*.local.fa.gz" - kfinal_contigs: - type: file - description: Stand-alone contigs for k=K; if local assembly is turned on, the file will be empty - pattern: "k*.final.contigs.fa.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.final.contigs.fa.gz: + type: file + description: Stand-alone contigs for k=K; if local assembly is turned on, the + file will be empty + pattern: "k*.final.contigs.fa.gz" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log": + type: file + description: Log file containing statistics of the assembly output + pattern: "*.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" maintainers: diff --git a/modules/nf-core/megahit/tests/main.nf.test b/modules/nf-core/megahit/tests/main.nf.test new file mode 100644 index 0000000..b52765d --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test @@ -0,0 +1,126 @@ +nextflow_process { + + name "Test Process MEGAHIT" + script "../main.nf" + process "MEGAHIT" + + tag "modules" + tag "modules_nfcore" + tag "megahit" + + test("sarscov2 - fastq - se") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. 
Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe - coassembly") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true)] , + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. 
Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/megahit/tests/main.nf.test.snap b/modules/nf-core/megahit/tests/main.nf.test.snap new file mode 100644 index 0000000..4677cc3 --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test.snap @@ -0,0 +1,172 @@ +{ + "sarscov2 - fastq - se": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:42.387947698" + }, + "sarscov2 - fastq - pe": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:48.679485983" + }, + "sarscov2 - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ], + "addi_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "k_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "kfinal_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "local_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:44:35.245399991" + }, + "sarscov2 - fastq - pe - coassembly": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:56.23363342" + } +} \ No newline at end of file diff --git a/modules/nf-core/megahit/tests/tags.yml b/modules/nf-core/megahit/tests/tags.yml new file 
mode 100644 index 0000000..9e86584 --- /dev/null +++ b/modules/nf-core/megahit/tests/tags.yml @@ -0,0 +1,2 @@ +megahit: + - "modules/nf-core/megahit/**" diff --git a/modules/nf-core/minimap2/align/minimap2-align.diff b/modules/nf-core/minimap2/align/minimap2-align.diff new file mode 100644 index 0000000..647611a --- /dev/null +++ b/modules/nf-core/minimap2/align/minimap2-align.diff @@ -0,0 +1,59 @@ +Changes in module 'nf-core/minimap2/align' +'modules/nf-core/minimap2/align/environment.yml' is unchanged +Changes in 'minimap2/align/main.nf': +--- modules/nf-core/minimap2/align/main.nf ++++ modules/nf-core/minimap2/align/main.nf +@@ -11,12 +11,14 @@ + input: + tuple val(meta), path(reads) + tuple val(meta2), path(reference) ++ val prefix2 + val bam_format + val bam_index_extension + val cigar_paf_format + val cigar_bam + + output: ++ tuple val(meta), path("*.minimap*") , optional: true, emit: filtered_fastq + tuple val(meta), path("*.paf") , optional: true, emit: paf + tuple val(meta), path("*.bam") , optional: true, emit: bam + tuple val(meta), path("*.bam.${bam_index_extension}"), optional: true, emit: index +@@ -32,7 +34,8 @@ + def args4 = task.ext.args4 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_index = bam_index_extension ? "${prefix}.bam##idx##${prefix}.bam.${bam_index_extension} --write-index" : "${prefix}.bam" +- def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus-1} -o ${bam_index} ${args2}" : "-o ${prefix}.paf" ++ def map_mode = "${meta.platform}" ? "-x map-${meta.platform}" : '' ++ def bam_output = bam_format ? "-a | samtools fastq -f 4 | gzip > ${prefix}.${prefix2}.minimap.fastq.gz" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + def bam_input = "${reads.extension}".matches('sam|bam|cram') +@@ -45,12 +48,12 @@ + minimap2 \\ + $args \\ + -t $task.cpus \\ ++ $map_mode \\ + $target \\ + $query \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output +- + + cat <<-END_VERSIONS > versions.yml + "${task.process}": +@@ -60,7 +63,7 @@ + """ + + stub: +- def prefix = task.ext.prefix ?: "${meta.id}" ++ def prefix = task.ext.prefix ?: c + def output_file = bam_format ? "${prefix}.bam" : "${prefix}.paf" + def bam_index = bam_index_extension ? 
"touch ${prefix}.bam.${bam_index_extension}" : "" + def bam_input = "${reads.extension}".matches('sam|bam|cram') + +'modules/nf-core/minimap2/align/meta.yml' is unchanged +'modules/nf-core/minimap2/align/tests/tags.yml' is unchanged +'modules/nf-core/minimap2/align/tests/main.nf.test.snap' is unchanged +'modules/nf-core/minimap2/align/tests/main.nf.test' is unchanged +************************************************************ diff --git a/modules/nf-core/multiqc/multiqc.diff b/modules/nf-core/multiqc/multiqc.diff new file mode 100644 index 0000000..1f02e13 --- /dev/null +++ b/modules/nf-core/multiqc/multiqc.diff @@ -0,0 +1,22 @@ +Changes in module 'nf-core/multiqc' +'modules/nf-core/multiqc/environment.yml' is unchanged +Changes in 'multiqc/main.nf': +--- modules/nf-core/multiqc/main.nf ++++ modules/nf-core/multiqc/main.nf +@@ -7,7 +7,8 @@ + 'biocontainers/multiqc:1.25.1--pyhdfd78af_0' }" + + input: +- path multiqc_files, stageAs: "?/*" ++ path(multiqc_files, stageAs: "?/*") ++ tuple val(meta), path(pipeline_files, stageAs: "?/*") + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + +'modules/nf-core/multiqc/meta.yml' is unchanged +'modules/nf-core/multiqc/tests/tags.yml' is unchanged +'modules/nf-core/multiqc/tests/nextflow.config' is unchanged +'modules/nf-core/multiqc/tests/main.nf.test.snap' is unchanged +'modules/nf-core/multiqc/tests/main.nf.test' is unchanged +************************************************************ diff --git a/modules/nf-core/seqkit/grep/seqkit-grep.diff b/modules/nf-core/seqkit/grep/seqkit-grep.diff index 30b7db0..60ca678 100644 --- a/modules/nf-core/seqkit/grep/seqkit-grep.diff +++ b/modules/nf-core/seqkit/grep/seqkit-grep.diff @@ -1,4 +1,6 @@ Changes in module 'nf-core/seqkit/grep' +'modules/nf-core/seqkit/grep/environment.yml' is unchanged +Changes in 'seqkit/grep/main.nf': --- modules/nf-core/seqkit/grep/main.nf +++ modules/nf-core/seqkit/grep/main.nf @@ -9,8 +9,7 @@ @@ -11,5 +13,21 @@ Changes in module 'nf-core/seqkit/grep' output: tuple val(meta), path("*.{fa,fq}.gz") , emit: filter +@@ -21,10 +20,13 @@ + + script: + def args = task.ext.args ?: '' +- def prefix = task.ext.prefix ?: "${meta.id}" + // fasta or fastq. Exact pattern match .fasta or .fa suffix with optional .gz (gzip) suffix + def suffix = task.ext.suffix ?: "${sequence}" ==~ /(.*f[astn]*a(.gz)?$)/ ? "fa" : "fq" + def pattern_file = pattern ? 
"-f ${pattern}" : "" ++ ++ def pattern_filename = pattern.getName() ++ def pattern_name = pattern_filename.split('\\.')[0] ++ def prefix = task.ext.prefix ?: "${meta.id}_${pattern_name}" + + """ + seqkit \\ +'modules/nf-core/seqkit/grep/meta.yml' is unchanged ************************************************************ diff --git a/nextflow.config b/nextflow.config index 3080801..e3bf5ff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,10 +6,6 @@ ---------------------------------------------------------------------------------------- */ -plugins { - id 'nf-schema' -} - // Global default params, used in configs params { @@ -108,11 +104,6 @@ params { help = false version = false - // Max resource options - // Defaults only, expecting to be overwritten - max_memory = '1.TB' - max_cpus = 32 - max_time = '168.h' // 7 days max_spades_retries = 3 max_megahit_retries = 3 @@ -126,7 +117,11 @@ params { validation { failUnrecognisedParams = true lenientMode = false - showHiddenParams = false + help { + enabled = true + showHidden = false + command = "nextflow run ebi-metagenomics/miassembler --samplesheet samplesheet.csv --outdir output" + } } // Load base.config by default for all pipelines @@ -243,7 +238,7 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { - id 'nf-schema@2.0.0' + id 'nf-schema@2.2.0' } // Export these variables to prevent local Python/R libraries from conflicting with those in the container @@ -287,7 +282,7 @@ manifest { homePage = 'https://github.com/ebi-metagenomics/miassembler' description = """Microbiome Informatics metagenomes assembly pipeline""" mainScript = 'main.nf' - nextflowVersion = '!>=23.04.0' + nextflowVersion = '!>=24.04.0' version = 'v1.0.0' doi = '' } @@ -314,36 +309,3 @@ def study_folder( meta = null ) { study_accession, ].join("/") } - -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" - return obj - } - } -} diff --git a/nextflow_schema.json b/nextflow_schema.json index d6f3d00..65d981c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -4,7 +4,7 @@ "title": "ebi-metagenomics/miassembler pipeline parameters", "description": "Microbiome Informatics metagenomes assembly pipeline", "type": "object", - "defs": { + "$defs": { "input_output_options": { "title": "Input/output options", "type": "object", @@ -51,7 +51,7 @@ "long_reads_assembler_config": { "type": "string", "description": "Configuration to use flye with. 
Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", - "default": "" + "default": null }, "single_end": { "type": "boolean", @@ -70,7 +70,7 @@ "platform": { "type": "string", "description": "Force the instrument_platform value for the study / reads", - "default": "ont" + "default": null }, "flye_version": { "type": "string", @@ -211,32 +211,6 @@ "description": "Set the top limit for requested resources for any single job.", "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 32, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "1.TB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "168.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" - }, "max_spades_retries": { "type": "integer", "description": "Maximum number of task attempt retries for (meta)spades assembly steps only.", @@ -341,16 +315,16 @@ }, "allOf": [ { - "$ref": "#/defs/input_output_options" + "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/defs/reads_qc" + "$ref": "#/$defs/reads_qc" }, { - "$ref": "#/defs/max_job_request_options" + "$ref": "#/$defs/max_job_request_options" }, { - "$ref": "#/defs/generic_options" + "$ref": "#/$defs/generic_options" } ] } diff --git a/nf-test.config b/nf-test.config index ac040d5..853c892 100644 --- a/nf-test.config +++ b/nf-test.config @@ -1,5 +1,4 @@ config { - testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" diff --git a/tests/main.nf.test b/tests/main.nf.test index fb6ab8f..bd389ab 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -19,7 +19,7 @@ nextflow_pipeline { then { with(workflow) { assert success - assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 1 + assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 2 assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 1 } } @@ -53,7 +53,7 @@ nextflow_pipeline { assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 0 assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 0 // This process should not have been called - assert trace.succeeded().count{ task -> task.name.contains("DOWNLOAD_FROM_FIRE") == 0} + assert trace.succeeded().count{ task -> task.name.contains("DOWNLOAD_FROM_FIRE") } == 0 } } @@ -216,11 +216,13 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" samplesheet = "${projectDir}/tests/samplesheet/test_mem.csv" assembly_memory = 0.5 - // will will be [0.5GB, 0.75GB, 1.13GB, ...] which rounds down to [0, 0, 1, ...] so should definitely fail twice before succeeding. after a few trys. + /* Memory jumping testing */ + /* will try with [0.5GB, 0.75GB, 1.13GB, ...] + /* which rounds down to [0, 0, 1, ...] + /* so should definitely fail twice before succeeding. after a few trys. 
+ /* ~~~ */ max_spades_retries = 5 } } @@ -247,8 +249,6 @@ nextflow_pipeline { params { outdir = "tests/results" assembler = "spades" - bwamem2_reference_genomes_folder = "${projectDir}/tests/human_phix/bwa2mem" - blast_reference_genomes_folder = "${projectDir}/tests/human_phix/blast" samplesheet = "${projectDir}/tests/samplesheet/test.csv" private_study = true } diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 2c61bac..d896ebd 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -4,13 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { - validateParameters ; - paramsSummaryLog ; - paramsSummaryMap ; - samplesheetToList ; - paramsHelp -} from 'plugin/nf-schema' +include { paramsSummaryLog; paramsSummaryMap; samplesheetToList } from 'plugin/nf-schema' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -67,13 +61,6 @@ workflow MIASSEMBLER { // Print parameter summary log to screen log.info(logo + paramsSummaryLog(workflow) + citation) - validateParameters() - - if (params.help) { - log.info(paramsHelp("nextflow run ebi-metagenomics/miassembler --help")) - exit(0) - } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES From ebb527862c10570b004e6a51c7c9d6304aa6d2a7 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 16:18:24 +0000 Subject: [PATCH 28/33] Adjust code based on feedback --- bin/s3fire_downloader.py | 20 ++++++-------------- modules/local/download_from_fire.nf | 2 +- nextflow_schema.json | 3 ++- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/bin/s3fire_downloader.py b/bin/s3fire_downloader.py index 09a7282..5291d3c 100755 --- a/bin/s3fire_downloader.py +++ b/bin/s3fire_downloader.py @@ -105,13 +105,8 @@ def download_files(ftp_paths: List[str], outdir: str, access_key: Optional[str], :type secret_key: Optional[str] """ for ftp_path in ftp_paths: - try: - s3_key, bucket = transform_ftp_to_s3(ftp_path) - download_file_from_fire(s3_key, bucket, outdir, access_key, secret_key) - except ValueError as ve: - logger.error(f"Skipping download due to error: {ve}") - except Exception as e: - logger.error(f"Unexpected error while downloading {ftp_path}: {e}") + s3_key, bucket = transform_ftp_to_s3(ftp_path) + download_file_from_fire(s3_key, bucket, outdir, access_key, secret_key) def main() -> None: @@ -119,7 +114,7 @@ def main() -> None: description="Download multiple files from FTP paths via FIRE S3 (supports public and private files)." 
) parser.add_argument( - "--ftp_paths", + "--ftp-paths", nargs="+", required=True, help="Space-separated list of FTP paths to download (e.g., ftp.sra.ebi.ac.uk/vol1/.../file1 ftp.sra.ebi.ac.uk/vol1/.../file2).", @@ -129,12 +124,9 @@ def main() -> None: parser.add_argument("--secret-key", required=False, help="S3 secret key (required for private files).") args = parser.parse_args() - try: - logger.info("Starting the file download process...") - download_files(args.ftp_paths, args.outdir, args.access_key, args.secret_key) - logger.info("All files have been processed.") - except Exception as e: - logger.error(f"Unexpected error: {e}") + logger.info("Starting the file download process...") + download_files(args.ftp_paths, args.outdir, args.access_key, args.secret_key) + logger.info("All files have been processed.") if __name__ == "__main__": diff --git a/modules/local/download_from_fire.nf b/modules/local/download_from_fire.nf index 7226f72..b8c7242 100644 --- a/modules/local/download_from_fire.nf +++ b/modules/local/download_from_fire.nf @@ -23,7 +23,7 @@ process DOWNLOAD_FROM_FIRE { s3fire_downloader.py \\ --access-key \${FIRE_ACCESS_KEY} \\ --secret-key \${FIRE_SECRET_KEY} \\ - --ftp_paths ${input_reads.join(" ")} \\ + --ftp-paths ${input_reads.join(" ")} \\ --outdir fastq_files cat <<-END_VERSIONS > versions.yml diff --git a/nextflow_schema.json b/nextflow_schema.json index 65d981c..4df56ed 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -50,7 +50,8 @@ }, "long_reads_assembler_config": { "type": "string", - "description": "Configuration to use flye with. Pick from nano-raw, nano-corr, nano-hq, pacbio-raw, pacbio-corr, pacbio-hifi", + "enum": ["nano-raw", "nano-corr", "nano-hq", "pacbio-raw", "pacbio-corr", "pacbio-hifi"], + "description": "Configuration to use flye with.", "default": null }, "single_end": { From 48cc7ee90026af1df2a6d8629dec0c453aa44550 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 16:18:45 +0000 Subject: [PATCH 29/33] Adjust code based on feedback --- tests/main.nf.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/main.nf.test b/tests/main.nf.test index bd389ab..644b429 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -139,7 +139,9 @@ nextflow_pipeline { } - test("MEGAHIT - single end - should fail") { + test("MEGAHIT - single end - should produce an empty contigs file") { + + // TODO: fix along with - https://github.com/EBI-Metagenomics/miassembler/pull/21 tag "ena-portal-api" @@ -154,7 +156,7 @@ nextflow_pipeline { then { with(workflow) { - assert !success + assert success assert trace.failed().count{ task -> task.name.contains("MEGAHIT") } == 1 } } From a244fa0bf249bf604a8198c292e6ac8139602d14 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 16:22:04 +0000 Subject: [PATCH 30/33] Add nf-core version to .nf-core.yml --- .nf-core.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.nf-core.yml b/.nf-core.yml index 6074a56..6c1e96b 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -51,3 +51,4 @@ lint: - custom_config readme: - nextflow_badge +nf_core_version: 3.0.2 From 5592bc48c65d1d9c530ea8f57c67c9c3ab0f1ebb Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 16:24:44 +0000 Subject: [PATCH 31/33] Bump version of nf-test for nf_tests github aciton --- .github/workflows/nf_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nf_tests.yml b/.github/workflows/nf_tests.yml index f29e7d8..70e3e58 100644 --- 
a/.github/workflows/nf_tests.yml +++ b/.github/workflows/nf_tests.yml @@ -31,6 +31,7 @@ jobs: uses: nf-core/setup-nf-test@v1 with: install-pdiff: true + version: 0.9.2 - name: Run pipeline with test data run: | From e42163902d45a7427ffea88ae2a3140866d5139d Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Thu, 21 Nov 2024 16:38:59 +0000 Subject: [PATCH 32/33] Adjust test - megahit with empty contigs will succedd --- tests/main.nf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/main.nf.test b/tests/main.nf.test index 644b429..f77f1f5 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -157,7 +157,7 @@ nextflow_pipeline { then { with(workflow) { assert success - assert trace.failed().count{ task -> task.name.contains("MEGAHIT") } == 1 + assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 1 } } } From 570059344637134ac8691da7442152b096b8bd12 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Tue, 10 Dec 2024 16:47:44 +0000 Subject: [PATCH 33/33] Retrofit some fixes --- conf/modules.config | 6 +++--- nf-test.config | 2 +- workflows/short_reads_assembler.nf | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7f4b2c1..4a2209e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -325,9 +325,9 @@ process { } withName: 'CALCULATE_ASSEMBLY_COVERAGE' { - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 100.MB * task.attempt, 'memory' ) } - time = { check_max( 30.m * task.attempt, 'time' ) } + cpus = { 1 * task.attempt } + memory = { 100.MB * task.attempt } + time = { 30.m * task.attempt } publishDir = [ [ path: "${params.outdir}", diff --git a/nf-test.config b/nf-test.config index c47d9bb..853c892 100644 --- a/nf-test.config +++ b/nf-test.config @@ -2,5 +2,5 @@ config { testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" - profile "test_ci,docker" + profile "test,docker" } diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 6929e41..2beb545 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -153,7 +153,8 @@ workflow SHORT_READS_ASSEMBLER { // Coverage // SHORT_READS_ASSEMBLY_COVERAGE( - SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false) + SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false), + SHORT_READS_QC.out.fastp_json ) ch_versions = ch_versions.mix(SHORT_READS_ASSEMBLY_COVERAGE.out.versions)
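
Editor's note on the conf/modules.config hunk above: with check_max() and the --max_cpus/--max_memory/--max_time parameters removed earlier in this series, per-process requests now scale directly with task.attempt. Below is a minimal sketch of the resulting pattern, assuming the old defaults (32 CPUs, 1 TB, 168 h) are reinstated as a site-level cap through Nextflow's resourceLimits directive (available from 24.04, the minimum version this series pins); the cap values are illustrative and not part of the patch.

    // Sketch only: retry-scaled requests plus an assumed global cap replacing check_max()
    process {
        // assumed cap, mirroring the removed --max_* defaults; not part of this patch series
        resourceLimits = [ cpus: 32, memory: 1.TB, time: 168.h ]

        withName: 'CALCULATE_ASSEMBLY_COVERAGE' {
            cpus   = { 1      * task.attempt }
            memory = { 100.MB * task.attempt }
            time   = { 30.m   * task.attempt }
        }
    }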
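
The reworked comment in tests/main.nf.test ("will try with [0.5GB, 0.75GB, 1.13GB, ...] which rounds down to [0, 0, 1, ...]") describes a 1.5x jump in requested assembly memory on every retry. The closure below is a hypothetical reconstruction of that behaviour for illustration only; the process name and directive layout are assumptions, and only the arithmetic comes from the test comment.

    // Hypothetical: 1.5x memory escalation per attempt, starting from params.assembly_memory (GB).
    // With assembly_memory = 0.5 the requests are 0.5, 0.75, 1.125, ... GB; an assembler flag that
    // only accepts whole GB rounds the first two down to 0, so the task fails twice before a retry succeeds.
    process {
        withName: 'SPADES' {                            // assumed process name
            errorStrategy = 'retry'
            maxRetries    = params.max_spades_retries   // 5 in the memory-jumping test shown above
            memory        = { "${params.assembly_memory * (1.5 ** (task.attempt - 1))} GB" as nextflow.util.MemoryUnit }
        }
    }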
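
On the minimap2/align patch earlier in this series: the changed bam_output line swaps the sorted-BAM output for samtools fastq -f 4, which keeps only reads carrying the SAM "unmapped" flag, so the emitted *.minimap.fastq.gz holds the reads that survived screening against the given reference. The runnable Groovy sketch below restates the two ternaries with illustrative stand-in values (sample id, prefix2 and file names are not taken from the patch); note that the patched module tests the interpolated string "${meta.platform}", which is still truthy when platform is null, whereas the sketch checks the raw value.

    // Illustrative stand-ins for the module's inputs
    def meta       = [id: 'sample1', platform: 'ont']
    def prefix     = meta.id
    def prefix2    = 'human_phix'      // assumed: name of the reference being screened against
    def bam_format = true

    def map_mode   = meta.platform ? "-x map-${meta.platform}" : ''   // e.g. -x map-ont
    def bam_output = bam_format
        ? "-a | samtools fastq -f 4 | gzip > ${prefix}.${prefix2}.minimap.fastq.gz"  // unmapped reads only
        : "-o ${prefix}.paf"

    println "minimap2 ${map_mode} reference.fa.gz reads.fastq.gz ${bam_output}"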
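
Finally, the updated seqkit/grep patch derives the output prefix from the decontamination pattern file rather than the bare sample id, presumably so outputs filtered against different pattern files do not overwrite each other. A worked example of that snippet, using illustrative names:

    // Worked example of the prefix logic added to nf-core/seqkit/grep (names are illustrative)
    def pattern_filename = 'human_phix.fa'                    // pattern.getName() in the module
    def pattern_name     = pattern_filename.split('\\.')[0]   // -> 'human_phix'
    def meta             = [id: 'sample1']
    def prefix           = "${meta.id}_${pattern_name}"       // -> 'sample1_human_phix'
    println prefix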