From d644d4f7b693d43c3eb7c97e099f728ea7697895 Mon Sep 17 00:00:00 2001 From: Ge94 Date: Tue, 17 Dec 2024 12:24:29 +0000 Subject: [PATCH 1/7] Fixed regex in modules.config + codon ref genome --- conf/modules.config | 6 +++--- subworkflows/local/long_reads_qc.nf | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4a2209e..b18e33d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,7 +12,7 @@ process { - withName: 'FETCHTOOL*' { + withName: 'FETCHTOOL_READS' { cpus = { 1 } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } @@ -20,7 +20,7 @@ process { ext.args = params.private_study ? "--private" : "" } - withName: 'FASTP*' { + withName: 'FASTP' { cpus = { 6 * task.attempt } memory = { 36.GB * task.attempt } time = { 8.h * task.attempt } @@ -99,7 +99,7 @@ process { ext.prefix = "decontaminated" } - withName: 'HUMAN*_DECONTAMINATION' { + withName: 'HUMAN_PHIX_DECONTAMINATION' { memory = { 64.GB * task.attempt } } diff --git a/subworkflows/local/long_reads_qc.nf b/subworkflows/local/long_reads_qc.nf index da0b059..6d83e0c 100644 --- a/subworkflows/local/long_reads_qc.nf +++ b/subworkflows/local/long_reads_qc.nf @@ -48,7 +48,7 @@ workflow LONG_READS_QC { // can we use the same flag, even if one has phix but not the other? // Check file extensions too - human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.fna", checkIfExists: true) + human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.f*a", checkIfExists: true) .collect().map { files -> [ ["id": params.human_fasta_prefix], files ] } From b940114ae8b199362cdd583c0f0b87eae7261db3 Mon Sep 17 00:00:00 2001 From: Germana Baldi Date: Tue, 17 Dec 2024 13:46:48 +0000 Subject: [PATCH 2/7] Update human ref in codon config --- conf/codon_slurm.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/codon_slurm.config b/conf/codon_slurm.config index c658798..8e4d063 100644 --- a/conf/codon_slurm.config +++ b/conf/codon_slurm.config @@ -4,6 +4,7 @@ params { blast_reference_genomes_folder = "/nfs/production/rdf/metagenomics/pipelines/prod/assembly-pipeline/blast_dbs/" human_phix_blast_index_name = "human_phix" human_phix_bwamem2_index_name = "human_phix" + human_fasta_prefix = "hg38" } executor { From f63e25ccdc595dff6ac9a13aa35f54d1e362abd7 Mon Sep 17 00:00:00 2001 From: Jennifer Mattock Date: Tue, 14 Jan 2025 10:20:18 +0000 Subject: [PATCH 3/7] bug fix for too few contigs and scientific notation errors --- README.md | 2 + bin/calculate_assembly_coverage.py | 4 +- nextflow.config | 1 + nextflow_schema.json | 5 +++ subworkflows/local/short_reads_assembly_qc.nf | 39 +++++++++++++++++-- workflows/miassembler.nf | 34 +++++++++------- workflows/short_reads_assembler.nf | 13 +++++-- 7 files changed, 75 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 39d05bc..aaefe36 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ Input/output options --human_phix_bwamem2_index_name [string] Combined Human and phiX bwa-mem2 index. [default: human_phix] --short_reads_min_contig_length [integer] Minimum contig length filter. [default: 500] --short_reads_min_contig_length_metat [integer] Minimum contig length filter for metaT. [default: 200] + --short_reads_contig_threshold [integer] Minimum number of contigs in final assembly. [default: 2] --assembly_memory [integer] Default memory allocated for the assembly process. [default: 100] --spades_only_assembler [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true] --outdir [string] The output directory where the results will be saved. You have to use absolute paths to storage on Cloud @@ -278,6 +279,7 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded | --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. | | `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | +| `short_reads_contig_threshold` | The minimum number of contigs allowed after host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. | #### Assembled Runs diff --git a/bin/calculate_assembly_coverage.py b/bin/calculate_assembly_coverage.py index fc90ec2..20be9ec 100755 --- a/bin/calculate_assembly_coverage.py +++ b/bin/calculate_assembly_coverage.py @@ -23,10 +23,10 @@ def get_assembled_base_pairs_and_length(jgi_summarize_coverage_file_gz: str) -> with gzip.open(jgi_summarize_coverage_file_gz, "rt") as file_handle: csv_reader = csv.DictReader(file_handle, delimiter="\t") for row in csv_reader: - contig_length_str = row["contigLen"] + contig_length_str = float(row["contigLen"]) total_avg_depth_str = row["totalAvgDepth"] - if not contig_length_str.isnumeric(): + if not contig_length_str.is_integer(): raise ValueError(f"The column 'contigLen' has an invalid value: {contig_length_str}") if int(contig_length_str) == 0: diff --git a/nextflow.config b/nextflow.config index 02daa11..3b87d49 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,6 +83,7 @@ params { spades_only_assembler = true short_reads_min_contig_length = 500 short_reads_min_contig_length_metat = 200 + short_reads_contig_threshold = 2 long_reads_assembler_config = null assembly_memory = 100 diff --git a/nextflow_schema.json b/nextflow_schema.json index 4df56ed..c6af2e5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -153,6 +153,11 @@ "default": 200, "description": "Minimum contig length filter for short reads metaT." }, + "short_reads_contig_threshold": { + "type": "integer", + "default": 2, + "description": "Minimum number of contigs in final assembly." + }, "assembly_memory": { "type": "number", "default": 100, diff --git a/subworkflows/local/short_reads_assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf index 5e273af..99beee3 100644 --- a/subworkflows/local/short_reads_assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -83,11 +83,44 @@ workflow SHORT_READS_ASSEMBLY_QC { ch_versions = ch_versions.mix(SEQKIT_GREP_HOST.out.versions) } + if(reference_genome == null) { + + cleaned_contigs = filtered_contigs + } + + /******************************************/ + /* Cleaned assemblies that fail the following rule: */ + /* - Less than 2 contigs */ + /******************************************/ + + extended_qc_assembly = cleaned_contigs.map { meta, assembly_fasta -> + { + def con_count = assembly_fasta.countFasta() + def assem_qc_meta = [ + "too_few_contigs": con_count < params.short_reads_contig_threshold, + "enough_contigs": con_count >= params.short_reads_contig_threshold + ] + return [meta + assem_qc_meta, assembly_fasta] + } + } + + extended_qc_assembly + .branch { meta, assembly_fasta -> + qc_failed: meta.too_few_contigs + qc_passed: meta.enough_contigs + } + .set { qc_filtered_assemblies } + + passed_cleaned_contigs = qc_filtered_assemblies.qc_passed.map { meta, assembly -> + [ meta - [enough_contigs: true] - [too_few_contigs: false] , assembly ] + } + PUBLISH_CLEANED_CONTIGS( - filtered_contigs + passed_cleaned_contigs ) emit: - filtered_contigs = filtered_contigs - versions = ch_versions + passed_cleaned_contigs = passed_cleaned_contigs // tuple(meta) + qc_assem_failed = qc_filtered_assemblies.qc_failed // tuple(meta) + versions = ch_versions } diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index d896ebd..0af97d8 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -1,15 +1,15 @@ /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRINT PARAMS SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ include { paramsSummaryLog; paramsSummaryMap; samplesheetToList } from 'plugin/nf-schema' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -21,9 +21,9 @@ include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT THE MAIN ENTRY POINT WORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -33,17 +33,17 @@ include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ @@ -309,16 +309,20 @@ workflow MIASSEMBLER { } .collectFile(name: "assembled_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) - // Short reads QC failed // - def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed.map { meta, __, extended_meta -> + // Short reads and assembly QC failed // + + def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_all_failed.map { meta, __ -> { - if (extended_meta.low_reads_count) { + if (meta.low_reads_count) { return "${meta.id},low_reads_count" } - if (extended_meta.filter_ratio_threshold_exceeded) { + if (meta.filter_ratio_threshold_exceeded) { return "${meta.id},filter_ratio_threshold_exceeded" } - error("Unexpected. meta: ${meta}, extended_meta: ${extended_meta}") + if (meta.too_few_contigs) { + return "${meta.id},too_few_contigs" + error("Unexpected. meta: ${meta}") + } } } diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 2beb545..a66e6fc 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -127,6 +127,7 @@ workflow SHORT_READS_ASSEMBLER { } .set { qc_filtered_reads } + /*********************/ /* Assembly */ /********************/ @@ -153,7 +154,7 @@ workflow SHORT_READS_ASSEMBLER { // Coverage // SHORT_READS_ASSEMBLY_COVERAGE( - SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false), + SHORT_READS_ASSEMBLY_QC.out.passed_cleaned_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false), SHORT_READS_QC.out.fastp_json ) @@ -162,16 +163,22 @@ workflow SHORT_READS_ASSEMBLER { // Stats // /* The QUAST module was modified to run metaQUAST instead */ QUAST( - SHORT_READS_ASSEMBLY_QC.out.filtered_contigs, + SHORT_READS_ASSEMBLY_QC.out.passed_cleaned_contigs, [[], []], [[], []] ) + // Quality results // + + qc_reads_failed = qc_filtered_reads.qc_failed.map { meta, reads, qc_meta -> [ meta + qc_meta, reads]} + + qc_all_failed = qc_reads_failed.concat(SHORT_READS_ASSEMBLY_QC.out.qc_assem_failed) + ch_versions = ch_versions.mix(QUAST.out.versions) emit: fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) - qc_failed = qc_filtered_reads.qc_failed // tuple(meta) + qc_all_failed = qc_all_failed // tuple(meta) fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) quast_results = QUAST.out.results // tuple(meta) From 2f1fc99597bec04b26bfec8bdc9bd0d0835b7e89 Mon Sep 17 00:00:00 2001 From: Jennifer Mattock Date: Fri, 17 Jan 2025 15:10:05 +0000 Subject: [PATCH 4/7] Fixed conflicts with dev, fixed test and added new test --- README.md | 2 +- nextflow_schema.json | 2 +- tests/main.nf.test | 30 ++++++++++++++++++++++++++ workflows/miassembler.nf | 20 +++++------------- workflows/short_reads_assembler.nf | 34 ++++++++++++------------------ 5 files changed, 50 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index aaefe36..449d460 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Input/output options --human_phix_bwamem2_index_name [string] Combined Human and phiX bwa-mem2 index. [default: human_phix] --short_reads_min_contig_length [integer] Minimum contig length filter. [default: 500] --short_reads_min_contig_length_metat [integer] Minimum contig length filter for metaT. [default: 200] - --short_reads_contig_threshold [integer] Minimum number of contigs in final assembly. [default: 2] + --short_reads_contig_threshold [integer] Minimum number of contigs in host cleaned assembly. [default: 2] --assembly_memory [integer] Default memory allocated for the assembly process. [default: 100] --spades_only_assembler [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true] --outdir [string] The output directory where the results will be saved. You have to use absolute paths to storage on Cloud diff --git a/nextflow_schema.json b/nextflow_schema.json index c6af2e5..273c5c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -156,7 +156,7 @@ "short_reads_contig_threshold": { "type": "integer", "default": 2, - "description": "Minimum number of contigs in final assembly." + "description": "Minimum number of contigs in host cleaned assembly." }, "assembly_memory": { "type": "number", diff --git a/tests/main.nf.test b/tests/main.nf.test index 80e5e0d..7731ff5 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -70,6 +70,7 @@ nextflow_pipeline { // Force the assembly short_reads_filter_ratio_threshold = 0.1 + short_reads_contig_threshold = 1 study_accession = "SRP115494" reads_accession = "SRR6180434" @@ -115,6 +116,35 @@ nextflow_pipeline { } + test("metaSPAdes - too few contigs") { + + tag "ena-portal-api" + + when { + + params { + outdir = "tests/results" + + // Force the assembly + short_reads_filter_ratio_threshold = 0.1 + + study_accession = "SRP115494" + reads_accession = "SRR6180434" + } + } + + then { + with (workflow) { + // Cleaned assembly should contain 1 contig which fails the contig threshold and the pipeline stops + assert success + assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 1 + assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 0 + assert trace.succeeded().size() == 11 + } + } + + } + test("metaSPAdes - single end - should fail") { tag "ena-portal-api" diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 8fa6d46..57c5a52 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PRINT PARAMS SUMMARY + IMPORT PLUGINS AND OTHER BITS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ @@ -46,6 +46,7 @@ include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + workflow MIASSEMBLER { /* @@ -321,8 +322,8 @@ workflow MIASSEMBLER { // Short reads and assembly QC failed // - def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_all_failed.map { meta, __ -> - { + def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_all_failed.map { + meta, __ -> { if (meta.low_reads_count) { return "${meta.id},low_reads_count" } @@ -337,16 +338,5 @@ workflow MIASSEMBLER { } short_reads_qc_failed_entries.collectFile(name: "qc_failed_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false) - - // Unassembled samples // - SHORT_READS_ASSEMBLER.out.unassembled_runs.map { - meta -> { - return "${meta.id},${meta.assembler},${meta.assembler_version}" - } - }.collectFile( - name: "unassembled_runs.csv", - storeDir: "${params.outdir}", - newLine: true, - cache: false - ) } + diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index c035ee3..8cc5abe 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -1,7 +1,7 @@ /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -15,9 +15,9 @@ include { SHORT_READS_ASSEMBLY_QC } from '../subworkflows/local/short_read include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_reads_assembly_coverage' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -30,9 +30,9 @@ include { MEGAHIT } from '../modules/nf-core/megahit/main' include { QUAST } from '../modules/nf-core/quast/main' /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow SHORT_READS_ASSEMBLER { @@ -141,15 +141,6 @@ workflow SHORT_READS_ASSEMBLER { MEGAHIT( qc_filtered_reads.megahit.map { meta, reads, __ -> [meta, reads] } ) - - /* MEGAHIT can report 0 contigs, and an empty file */ - MEGAHIT.out.contigs.branch { _meta, contigs -> - empty: contigs.countFasta() == 0 - assembled: contigs.countFasta() > 0 - }.set { - megahit_contigs - } - ch_versions = ch_versions.mix(MEGAHIT.out.versions) assembly = SPADES.out.contigs.mix(MEGAHIT.out.contigs) @@ -186,10 +177,11 @@ workflow SHORT_READS_ASSEMBLER { ch_versions = ch_versions.mix(QUAST.out.versions) emit: - fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) - qc_all_failed = qc_all_failed // tuple(meta) - fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) - assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) - quast_results = QUAST.out.results // tuple(meta) - versions = ch_versions + fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) + qc_all_failed = qc_all_failed // tuple(meta) + fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) + assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) + quast_results = QUAST.out.results // tuple(meta) + versions = ch_versions } + From f64a4fa7479ee7b428d28fbf992bd6a7527137e2 Mon Sep 17 00:00:00 2001 From: Jennifer Mattock Date: Wed, 22 Jan 2025 08:57:54 +0000 Subject: [PATCH 5/7] Tidying code --- README.md | 4 +- nextflow_schema.json | 2 +- subworkflows/local/short_reads_assembly_qc.nf | 38 +++++++------------ workflows/miassembler.nf | 4 +- workflows/short_reads_assembler.nf | 6 +-- 5 files changed, 22 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 449d460..42c28cf 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Input/output options --human_phix_bwamem2_index_name [string] Combined Human and phiX bwa-mem2 index. [default: human_phix] --short_reads_min_contig_length [integer] Minimum contig length filter. [default: 500] --short_reads_min_contig_length_metat [integer] Minimum contig length filter for metaT. [default: 200] - --short_reads_contig_threshold [integer] Minimum number of contigs in host cleaned assembly. [default: 2] + --short_reads_contig_threshold [integer] Minimum number of contigs in human+phiX+host cleaned assembly. [default: 2] --assembly_memory [integer] Default memory allocated for the assembly process. [default: 100] --spades_only_assembler [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true] --outdir [string] The output directory where the results will be saved. You have to use absolute paths to storage on Cloud @@ -279,7 +279,7 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded | --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. | | `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | -| `short_reads_contig_threshold` | The minimum number of contigs allowed after host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. | +| `short_reads_contig_threshold` | The minimum number of contigs allowed after human+phiX+host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. | #### Assembled Runs diff --git a/nextflow_schema.json b/nextflow_schema.json index 273c5c2..69b4f04 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -156,7 +156,7 @@ "short_reads_contig_threshold": { "type": "integer", "default": 2, - "description": "Minimum number of contigs in host cleaned assembly." + "description": "Minimum number of contigs in human+phiX+host cleaned assembly." }, "assembly_memory": { "type": "number", diff --git a/subworkflows/local/short_reads_assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf index 99beee3..6cc528a 100644 --- a/subworkflows/local/short_reads_assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -59,6 +59,9 @@ workflow SHORT_READS_ASSEMBLY_QC { ch_versions = ch_versions.mix(SEQKIT_GREP_HUMAN_PHIX.out.versions) } + + // The cleaned contigs are those that have been filtered, but they will be further cleaned if a reference genome is set. + cleaned_contigs = filtered_contigs if ( reference_genome != null ) { @@ -83,36 +86,22 @@ workflow SHORT_READS_ASSEMBLY_QC { ch_versions = ch_versions.mix(SEQKIT_GREP_HOST.out.versions) } - if(reference_genome == null) { - - cleaned_contigs = filtered_contigs - } - /******************************************/ /* Cleaned assemblies that fail the following rule: */ - /* - Less than 2 contigs */ + /* - Less than params.short_reads_contig_threshold (default is 2) contigs */ /******************************************/ - extended_qc_assembly = cleaned_contigs.map { meta, assembly_fasta -> - { - def con_count = assembly_fasta.countFasta() - def assem_qc_meta = [ - "too_few_contigs": con_count < params.short_reads_contig_threshold, - "enough_contigs": con_count >= params.short_reads_contig_threshold - ] - return [meta + assem_qc_meta, assembly_fasta] + cleaned_contigs.map { meta, assembly_fasta -> { + [meta , ["contigs_count": assembly_fasta.countFasta()], assembly_fasta] + } } - } - - extended_qc_assembly - .branch { meta, assembly_fasta -> - qc_failed: meta.too_few_contigs - qc_passed: meta.enough_contigs + .branch { meta, meta2, assembly_fasta -> + qc_failed: meta2.contigs_count < params.short_reads_contig_threshold + qc_passed: meta2.contigs_count >= params.short_reads_contig_threshold } .set { qc_filtered_assemblies } - - passed_cleaned_contigs = qc_filtered_assemblies.qc_passed.map { meta, assembly -> - [ meta - [enough_contigs: true] - [too_few_contigs: false] , assembly ] + passed_cleaned_contigs = qc_filtered_assemblies.qc_passed.map { meta, _meta2, assembly -> + [ meta, assembly ] } PUBLISH_CLEANED_CONTIGS( @@ -121,6 +110,7 @@ workflow SHORT_READS_ASSEMBLY_QC { emit: passed_cleaned_contigs = passed_cleaned_contigs // tuple(meta) - qc_assem_failed = qc_filtered_assemblies.qc_failed // tuple(meta) + qc_failed_assemblies = qc_filtered_assemblies.qc_failed.map { meta, _meta2, assembly -> + [meta + ["too_few_contigs": true], assembly] } // tuple(meta) versions = ch_versions } diff --git a/workflows/miassembler.nf b/workflows/miassembler.nf index 57c5a52..8526cdf 100644 --- a/workflows/miassembler.nf +++ b/workflows/miassembler.nf @@ -322,7 +322,7 @@ workflow MIASSEMBLER { // Short reads and assembly QC failed // - def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_all_failed.map { + def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed_all.map { meta, __ -> { if (meta.low_reads_count) { return "${meta.id},low_reads_count" @@ -332,8 +332,8 @@ workflow MIASSEMBLER { } if (meta.too_few_contigs) { return "${meta.id},too_few_contigs" - error("Unexpected. meta: ${meta}") } + error("Unexpected. meta: ${meta}") } } diff --git a/workflows/short_reads_assembler.nf b/workflows/short_reads_assembler.nf index 8cc5abe..4e188c4 100644 --- a/workflows/short_reads_assembler.nf +++ b/workflows/short_reads_assembler.nf @@ -170,15 +170,15 @@ workflow SHORT_READS_ASSEMBLER { // Quality results // - qc_reads_failed = qc_filtered_reads.qc_failed.map { meta, reads, qc_meta -> [ meta + qc_meta, reads]} + qc_failed_reads = qc_filtered_reads.qc_failed.map { meta, reads, qc_meta -> [ meta + qc_meta, reads]} - qc_all_failed = qc_reads_failed.concat(SHORT_READS_ASSEMBLY_QC.out.qc_assem_failed) + qc_failed_all = qc_failed_reads.concat(SHORT_READS_ASSEMBLY_QC.out.qc_failed_assemblies) ch_versions = ch_versions.mix(QUAST.out.versions) emit: fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta) - qc_all_failed = qc_all_failed // tuple(meta) + qc_failed_all = qc_failed_all // tuple(meta) fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta) assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta) quast_results = QUAST.out.results // tuple(meta) From 0b3831822183ad0ee596644f59f04c6c6daa7501 Mon Sep 17 00:00:00 2001 From: Germana Baldi Date: Thu, 23 Jan 2025 12:06:08 +0000 Subject: [PATCH 6/7] Reformatting short_reads_assembly_qc.nf --- subworkflows/local/short_reads_assembly_qc.nf | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/short_reads_assembly_qc.nf b/subworkflows/local/short_reads_assembly_qc.nf index 6cc528a..0e57f85 100644 --- a/subworkflows/local/short_reads_assembly_qc.nf +++ b/subworkflows/local/short_reads_assembly_qc.nf @@ -21,7 +21,7 @@ process PUBLISH_CLEANED_CONTIGS { workflow SHORT_READS_ASSEMBLY_QC { take: - assembly // [ val(meta), path(assembly_fasta) ] + assembly // [ val(meta), path(assembly_fasta) ] reference_genome // [ val(meta2), path(reference_genome) ] | meta2 contains the name of the reference genome main: @@ -86,10 +86,10 @@ workflow SHORT_READS_ASSEMBLY_QC { ch_versions = ch_versions.mix(SEQKIT_GREP_HOST.out.versions) } - /******************************************/ - /* Cleaned assemblies that fail the following rule: */ - /* - Less than params.short_reads_contig_threshold (default is 2) contigs */ - /******************************************/ + /***************************************************************************/ + /* Cleaned assemblies that fail the following rule: */ + /* - Less than params.short_reads_contig_threshold (default is 2) contigs */ + /***************************************************************************/ cleaned_contigs.map { meta, assembly_fasta -> { [meta , ["contigs_count": assembly_fasta.countFasta()], assembly_fasta] @@ -100,17 +100,21 @@ workflow SHORT_READS_ASSEMBLY_QC { qc_passed: meta2.contigs_count >= params.short_reads_contig_threshold } .set { qc_filtered_assemblies } + passed_cleaned_contigs = qc_filtered_assemblies.qc_passed.map { meta, _meta2, assembly -> [ meta, assembly ] } + qc_failed_assemblies = qc_filtered_assemblies.qc_failed.map { meta, _meta2, assembly -> + [meta + ["too_few_contigs": true], assembly] + } + PUBLISH_CLEANED_CONTIGS( passed_cleaned_contigs ) emit: passed_cleaned_contigs = passed_cleaned_contigs // tuple(meta) - qc_failed_assemblies = qc_filtered_assemblies.qc_failed.map { meta, _meta2, assembly -> - [meta + ["too_few_contigs": true], assembly] } // tuple(meta) - versions = ch_versions + qc_failed_assemblies = qc_failed_assemblies // tuple(meta) + versions = ch_versions } From 5bc16282f1651994a856be812aa84d8625e7fa27 Mon Sep 17 00:00:00 2001 From: Germana Baldi Date: Thu, 23 Jan 2025 12:22:21 +0000 Subject: [PATCH 7/7] Reformat README.md with prettier --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 42c28cf..0868a88 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,7 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded | --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. | | `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. | -| `short_reads_contig_threshold` | The minimum number of contigs allowed after human+phiX+host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. | +| `short_reads_contig_threshold` | The minimum number of contigs allowed after human+phiX+host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. | #### Assembled Runs