Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs in miassembler #31

Merged
merged 8 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Input/output options
--human_phix_bwamem2_index_name [string] Combined Human and phiX bwa-mem2 index. [default: human_phix]
--short_reads_min_contig_length [integer] Minimum contig length filter. [default: 500]
--short_reads_min_contig_length_metat [integer] Minimum contig length filter for metaT. [default: 200]
--short_reads_contig_threshold [integer] Minimum number of contigs in host cleaned assembly. [default: 2]
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
--assembly_memory [integer] Default memory allocated for the assembly process. [default: 100]
--spades_only_assembler [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true]
--outdir [string] The output directory where the results will be saved. You have to use absolute paths to storage on Cloud
Expand Down Expand Up @@ -278,6 +279,7 @@ SRR6180434,short_reads_filter_ratio_threshold_exceeded
| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `short_reads_filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. |
| `short_reads_low_reads_count_threshold` | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled. |
| `short_reads_contig_threshold` | The minimum number of contigs allowed after host cleaning. If below it flags a low contig count and the cleaned assembly isn't generated. |
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved

#### Assembled Runs

Expand Down
4 changes: 2 additions & 2 deletions bin/calculate_assembly_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ def get_assembled_base_pairs_and_length(jgi_summarize_coverage_file_gz: str) ->
with gzip.open(jgi_summarize_coverage_file_gz, "rt") as file_handle:
csv_reader = csv.DictReader(file_handle, delimiter="\t")
for row in csv_reader:
contig_length_str = row["contigLen"]
contig_length_str = float(row["contigLen"])
total_avg_depth_str = row["totalAvgDepth"]

if not contig_length_str.isnumeric():
if not contig_length_str.is_integer():
mberacochea marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(f"The column 'contigLen' has an invalid value: {contig_length_str}")

if int(contig_length_str) == 0:
Expand Down
1 change: 1 addition & 0 deletions conf/codon_slurm.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ params {
blast_reference_genomes_folder = "/nfs/production/rdf/metagenomics/pipelines/prod/assembly-pipeline/blast_dbs/"
human_phix_blast_index_name = "human_phix"
human_phix_bwamem2_index_name = "human_phix"
human_fasta_prefix = "hg38"
}

executor {
Expand Down
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@

process {

withName: 'FETCHTOOL*' {
withName: 'FETCHTOOL_READS' {
cpus = { 1 }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }

ext.args = params.private_study ? "--private" : ""
}

withName: 'FASTP*' {
withName: 'FASTP' {
cpus = { 6 * task.attempt }
memory = { 36.GB * task.attempt }
time = { 8.h * task.attempt }
Expand Down Expand Up @@ -99,7 +99,7 @@ process {
ext.prefix = "decontaminated"
}

withName: 'HUMAN*_DECONTAMINATION' {
withName: 'HUMAN_PHIX_DECONTAMINATION' {
memory = { 64.GB * task.attempt }
}

Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ params {
spades_only_assembler = true
short_reads_min_contig_length = 500
short_reads_min_contig_length_metat = 200
short_reads_contig_threshold = 2
long_reads_assembler_config = null
assembly_memory = 100

Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@
"default": 200,
"description": "Minimum contig length filter for short reads metaT."
},
"short_reads_contig_threshold": {
"type": "integer",
"default": 2,
"description": "Minimum number of contigs in host cleaned assembly."
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
},
"assembly_memory": {
"type": "number",
"default": 100,
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/long_reads_qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ workflow LONG_READS_QC {
// can we use the same flag, even if one has phix but not the other?
// Check file extensions too

human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.fna", checkIfExists: true)
human_reference = Channel.fromPath( "${params.reference_genomes_folder}/${params.human_fasta_prefix}.f*a", checkIfExists: true)
.collect().map {
files -> [ ["id": params.human_fasta_prefix], files ]
}
Expand Down
39 changes: 36 additions & 3 deletions subworkflows/local/short_reads_assembly_qc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,44 @@ workflow SHORT_READS_ASSEMBLY_QC {
ch_versions = ch_versions.mix(SEQKIT_GREP_HOST.out.versions)
}

if(reference_genome == null) {
mberacochea marked this conversation as resolved.
Show resolved Hide resolved

jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
cleaned_contigs = filtered_contigs
}

/******************************************/
/* Cleaned assemblies that fail the following rule: */
/* - Less than 2 contigs */
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
/******************************************/

extended_qc_assembly = cleaned_contigs.map { meta, assembly_fasta ->
{
def con_count = assembly_fasta.countFasta()
def assem_qc_meta = [
"too_few_contigs": con_count < params.short_reads_contig_threshold,
"enough_contigs": con_count >= params.short_reads_contig_threshold
]
return [meta + assem_qc_meta, assembly_fasta]
}
}

extended_qc_assembly
.branch { meta, assembly_fasta ->
qc_failed: meta.too_few_contigs
qc_passed: meta.enough_contigs
}
.set { qc_filtered_assemblies }

passed_cleaned_contigs = qc_filtered_assemblies.qc_passed.map { meta, assembly ->
[ meta - [enough_contigs: true] - [too_few_contigs: false] , assembly ]
}
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved

PUBLISH_CLEANED_CONTIGS(
filtered_contigs
passed_cleaned_contigs
)

emit:
filtered_contigs = filtered_contigs
versions = ch_versions
passed_cleaned_contigs = passed_cleaned_contigs // tuple(meta)
qc_assem_failed = qc_filtered_assemblies.qc_failed // tuple(meta)
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
versions = ch_versions
}
30 changes: 30 additions & 0 deletions tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ nextflow_pipeline {

// Force the assembly
short_reads_filter_ratio_threshold = 0.1
short_reads_contig_threshold = 1

study_accession = "SRP115494"
reads_accession = "SRR6180434"
Expand Down Expand Up @@ -115,6 +116,35 @@ nextflow_pipeline {

}

test("metaSPAdes - too few contigs") {
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved

tag "ena-portal-api"

when {

params {
outdir = "tests/results"

// Force the assembly
short_reads_filter_ratio_threshold = 0.1

study_accession = "SRP115494"
reads_accession = "SRR6180434"
}
}

then {
with (workflow) {
// Cleaned assembly should contain 1 contig which fails the contig threshold and the pipeline stops
assert success
assert trace.succeeded().count{ task -> task.name.contains("SPADES") } == 1
assert trace.succeeded().count{ task -> task.name.contains("MEGAHIT") } == 0
assert trace.succeeded().size() == 11
}
}

}

test("metaSPAdes - single end - should fail") {

tag "ena-portal-api"
Expand Down
50 changes: 22 additions & 28 deletions workflows/miassembler.nf
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT PLUGINS AND OTHER BITS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { validateParameters; paramsSummaryLog; paramsSummaryMap; samplesheetToList; paramsHelp } from 'plugin/nf-schema'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

//
Expand All @@ -21,9 +21,9 @@ include { MULTIQC as MULTIQC_RUN } from '../modules/nf-core/multiqc/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT THE MAIN ENTRY POINT WORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

//
Expand All @@ -33,19 +33,20 @@ include { SHORT_READS_ASSEMBLER } from '../workflows/short_reads_assembler
include { LONG_READS_ASSEMBLER } from '../workflows/long_reads_assembler'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

include { FETCHTOOL_READS } from '../modules/local/fetchtool_reads'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/


workflow MIASSEMBLER {

/*
Expand Down Expand Up @@ -319,30 +320,23 @@ workflow MIASSEMBLER {
}
.collectFile(name: "assembled_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false)

// Short reads QC failed //
def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_failed.map {
meta, __, extended_meta -> {
if (extended_meta.low_reads_count) {
// Short reads and assembly QC failed //

def short_reads_qc_failed_entries = SHORT_READS_ASSEMBLER.out.qc_all_failed.map {
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
meta, __ -> {
if (meta.low_reads_count) {
return "${meta.id},low_reads_count"
}
if (extended_meta.filter_ratio_threshold_exceeded) {
if (meta.filter_ratio_threshold_exceeded) {
return "${meta.id},filter_ratio_threshold_exceeded"
}
error("Unexpected. meta: ${meta}, extended_meta: ${extended_meta}")
if (meta.too_few_contigs) {
return "${meta.id},too_few_contigs"
error("Unexpected. meta: ${meta}")
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

short_reads_qc_failed_entries.collectFile(name: "qc_failed_runs.csv", storeDir: "${params.outdir}", newLine: true, cache: false)

// Unassembled samples //
SHORT_READS_ASSEMBLER.out.unassembled_runs.map {
meta -> {
return "${meta.id},${meta.assembler},${meta.assembler_version}"
}
}.collectFile(
name: "unassembled_runs.csv",
storeDir: "${params.outdir}",
newLine: true,
cache: false
)
}

36 changes: 17 additions & 19 deletions workflows/short_reads_assembler.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT LOCAL MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

//
Expand All @@ -15,9 +15,9 @@ include { SHORT_READS_ASSEMBLY_QC } from '../subworkflows/local/short_read
include { SHORT_READS_ASSEMBLY_COVERAGE } from '../subworkflows/local/short_reads_assembly_coverage'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT NF-CORE MODULES/SUBWORKFLOWS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

//
Expand All @@ -30,9 +30,9 @@ include { MEGAHIT } from '../modules/nf-core/megahit/main'
include { QUAST } from '../modules/nf-core/quast/main'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

workflow SHORT_READS_ASSEMBLER {
Expand Down Expand Up @@ -127,6 +127,7 @@ workflow SHORT_READS_ASSEMBLER {
}
.set { qc_filtered_reads }


/*********************/
/* Assembly */
/********************/
Expand All @@ -140,15 +141,6 @@ workflow SHORT_READS_ASSEMBLER {
MEGAHIT(
qc_filtered_reads.megahit.map { meta, reads, __ -> [meta, reads] }
)

/* MEGAHIT can report 0 contigs, and an empty file */
MEGAHIT.out.contigs.branch { _meta, contigs ->
empty: contigs.countFasta() == 0
assembled: contigs.countFasta() > 0
}.set {
megahit_contigs
}

ch_versions = ch_versions.mix(MEGAHIT.out.versions)

assembly = SPADES.out.contigs.mix(MEGAHIT.out.contigs)
Expand All @@ -162,7 +154,7 @@ workflow SHORT_READS_ASSEMBLER {

// Coverage //
SHORT_READS_ASSEMBLY_COVERAGE(
SHORT_READS_ASSEMBLY_QC.out.filtered_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false),
SHORT_READS_ASSEMBLY_QC.out.passed_cleaned_contigs.join(SHORT_READS_QC.out.qc_reads, remainder: false),
SHORT_READS_QC.out.fastp_json
)

Expand All @@ -171,19 +163,25 @@ workflow SHORT_READS_ASSEMBLER {
// Stats //
/* The QUAST module was modified to run metaQUAST instead */
QUAST(
SHORT_READS_ASSEMBLY_QC.out.filtered_contigs,
SHORT_READS_ASSEMBLY_QC.out.passed_cleaned_contigs,
[[], []],
[[], []]
)

// Quality results //

qc_reads_failed = qc_filtered_reads.qc_failed.map { meta, reads, qc_meta -> [ meta + qc_meta, reads]}

qc_all_failed = qc_reads_failed.concat(SHORT_READS_ASSEMBLY_QC.out.qc_assem_failed)
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved
jmattock5 marked this conversation as resolved.
Show resolved Hide resolved

ch_versions = ch_versions.mix(QUAST.out.versions)

emit:
fastqc_before_zip = FASTQC_BEFORE.out.zip // tuple(meta)
qc_failed = qc_filtered_reads.qc_failed // tuple(meta)
qc_all_failed = qc_all_failed // tuple(meta)
fastqc_after_zip = FASTQC_AFTER.out.zip // tuple(meta)
assembly_coverage_samtools_idxstats = SHORT_READS_ASSEMBLY_COVERAGE.out.samtools_idxstats // tuple(meta)
quast_results = QUAST.out.results // tuple(meta)
unassembled_runs = megahit_contigs.empty.map { map, _contigs -> map } // meta with the samples that Megahit/Metaspades couldn't assemble.
versions = ch_versions
}

Loading