diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf
new file mode 100644
index 00000000..25387af9
--- /dev/null
+++ b/subworkflows/local/bin_qc.nf
@@ -0,0 +1,146 @@
+/*
+ * BUSCO/CheckM/CheckM2: Quantitative measures for the assessment of genome assembly
+ */
+
+include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation'
+include { BUSCO                } from '../../modules/local/busco'
+include { BUSCO_SAVE_DOWNLOAD  } from '../../modules/local/busco_save_download'
+include { BUSCO_SUMMARY        } from '../../modules/local/busco_summary'
+include { CHECKM_QA            } from '../../modules/nf-core/checkm/qa/main'
+include { CHECKM_LINEAGEWF     } from '../../modules/nf-core/checkm/lineagewf/main'
+include { CHECKM2_PREDICT      } from '../../modules/nf-core/checkm2/predict/main'
+include { COMBINE_TSV          } from '../../modules/local/combine_tsv'
+
+/*
+ * Run the bin QC tool selected by `params.binqc_tool` ("busco", "checkm" or
+ * "checkm2") and emit its combined summary table.
+ *
+ * take:
+ *   bins       - channel: [ val(meta), path(bin) ]; workflows/mag.nf passes the transposed channel
+ *   checkm_db  - database passed to CHECKM_LINEAGEWF
+ *   checkm2_db - database passed to CHECKM2_PREDICT
+ *   busco_db   - BUSCO database (gz/tgz archive or directory); empty allows --auto-lineage
+ * emit:
+ *   summary    - BUSCO_SUMMARY or COMBINE_TSV output of the selected tool
+ *   checkm_tsv - CHECKM_QA output when binqc_tool is "checkm", otherwise []
+ *   multiqc    - BUSCO summary files when binqc_tool is "busco", otherwise []
+ *   versions   - versions channel of the processes that were invoked
+ */
+workflow BIN_QC {
+    take:
+    bins       // channel: [ val(meta), path(bin) ]
+    checkm_db
+    checkm2_db
+    busco_db
+
+    main:
+    ch_versions = Channel.empty()
+
+    if (params.binqc_tool == "busco") {
+        // BUSCO workflow
+        if (!busco_db.isEmpty()) {
+            if (busco_db.extension in ['gz', 'tgz']) {
+                // Expects to be tar.gz!
+                BUSCO_DB_PREPARATION(busco_db)
+                ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db ->
+                    [[id: meta, lineage: 'Y'], db]
+                }
+            }
+            else if (busco_db.isDirectory()) {
+                // Set meta to match expected channel cardinality for BUSCO
+                ch_db_for_busco = Channel
+                    .of(busco_db)
+                    .map { db ->
+                        def basename = db.getBaseName()
+                        def lineage = basename.contains('odb10') ? 'Y' : 'N'
+                        [[id: basename, lineage: lineage], db]
+                    }
+                    .collect()
+            }
+            else {
+                // Fail early instead of leaving ch_db_for_busco undefined
+                error("BUSCO database must be a .gz/.tgz archive or a directory: ${busco_db}")
+            }
+        }
+        else {
+            // Set BUSCO database to empty to allow for --auto-lineage
+            ch_db_for_busco = Channel
+                .of([])
+                .map { empty_db -> [[lineage: ''], []] }
+                .collect()
+        }
+
+        // BUSCO has to be invoked before any BUSCO.out channel is referenced
+        BUSCO(bins, ch_db_for_busco)
+
+        if (params.save_busco_db) {
+            // publish files downloaded by Busco
+            ch_downloads = BUSCO.out.busco_downloads
+                .groupTuple()
+                .map { lin, downloads -> downloads[0] }
+                .toSortedList()
+                .flatten()
+            BUSCO_SAVE_DOWNLOAD(ch_downloads)
+        }
+
+        BUSCO_SUMMARY(
+            BUSCO.out.summary_domain.map { it[1] }.collect().ifEmpty([]),
+            BUSCO.out.summary_specific.map { it[1] }.collect().ifEmpty([]),
+            BUSCO.out.failed_bin.map { it[1] }.collect().ifEmpty([])
+        )
+
+        multiqc_reports = BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { it[1] }
+        summary = BUSCO_SUMMARY.out.summary
+        ch_versions = ch_versions.mix(BUSCO.out.versions.first())
+    }
+    else if (params.binqc_tool == "checkm") {
+        // CheckM workflow: non-eukaryotic bins only, grouped per meta so that
+        // `fa` is a list of paths (`fa.extension.unique()` needs a collection)
+        ch_bins_for_checkmlineagewf = bins
+            .filter { meta, bin ->
+                meta.domain != "eukarya"
+            }
+            .groupTuple()
+            .multiMap { meta, fa ->
+                reads: [meta, fa]
+                ext: fa.extension.unique().join("")
+            }
+
+        CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db)
+        ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first())
+
+        ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output
+            .join(CHECKM_LINEAGEWF.out.marker_file)
+            .map { meta, dir, marker ->
+                [meta, dir, marker, []]
+            }
+
+        CHECKM_QA(ch_checkmqa_input, [])
+
+        COMBINE_TSV(CHECKM_QA.out.output.map { it[1] }.collect())
+
+        summary = COMBINE_TSV.out.combined
+        ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first())
+    }
+    else if (params.binqc_tool == "checkm2") {
+        // CheckM2 workflow: same bin selection and grouping as the CheckM branch,
+        // matching what the replaced CHECKM_QC call received
+        ch_bins_for_checkm2 = bins
+            .filter { meta, bin ->
+                meta.domain != "eukarya"
+            }
+            .groupTuple()
+
+        CHECKM2_PREDICT(ch_bins_for_checkm2, checkm2_db)
+
+        COMBINE_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect())
+
+        summary = COMBINE_TSV.out.combined
+        ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first())
+    }
+
+    emit:
+    summary    = summary
+    checkm_tsv = params.binqc_tool == "checkm" ? CHECKM_QA.out.output : []
+    multiqc    = params.binqc_tool == "busco" ? multiqc_reports : []
+    versions   = ch_versions
+}
diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf
deleted file mode 100644
index 73183d2d..00000000
--- a/subworkflows/local/checkm_qc.nf
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * CheckM/CheckM2: Quantitative measures for the assessment of genome assembly
- */
-
-include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main'
-include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main'
-include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main'
-include { COMBINE_TSV as COMBINE_CHECKM_TSV } from '../../modules/local/combine_tsv'
-
-workflow CHECKM_QC {
-    take:
-    bins // channel: [ val(meta), path(bin) ]
-    checkm_db
-    checkm2_db
-
-    main:
-    ch_versions = Channel.empty()
-
-    if (params.binqc_tool == "checkm") {
-        ch_bins_for_checkmlineagewf = bins.multiMap {
-            meta, fa ->
-                reads: [ meta, fa ]
-                ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine
-        }
-
-        CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db)
-        ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first())
-
-        ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output
-            .join(CHECKM_LINEAGEWF.out.marker_file)
-            .map{
-                meta, dir, marker ->
-                [ meta, dir, marker, []]
-            }
-
-        CHECKM_QA ( ch_checkmqa_input, [] )
-
-        ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first())
-
-        COMBINE_CHECKM_TSV(CHECKM_QA.out.output.map{it[1]}.collect())
-    }
-    if (params.binqc_tool == "checkm2") {
-        CHECKM2_PREDICT(bins, checkm2_db)
-
-        ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first())
-
-        COMBINE_CHECKM_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map{it[1]}.collect())
-    }
-
-    emit:
-    summary = COMBINE_CHECKM_TSV.out.combined
-    checkm_tsv = params.binqc_tool == "checkm" ? CHECKM_QA.out.output : []
-    versions = ch_versions
-}
diff --git a/workflows/mag.nf b/workflows/mag.nf
index 8d993340..0b0b1936 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -14,10 +14,9 @@ include { methodsDescriptionText } from '../subwo
 //
 include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation'
 include { BINNING } from '../subworkflows/local/binning'
+include { BIN_QC } from '../subworkflows/local/bin_qc'
 include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement'
-include { BUSCO_QC } from '../subworkflows/local/busco_qc'
 include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification'
-include { CHECKM_QC } from '../subworkflows/local/checkm_qc'
 include { GUNC_QC } from '../subworkflows/local/gunc_qc'
 include { GTDBTK } from '../subworkflows/local/gtdbtk'
 include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna'
@@ -185,11 +184,7 @@ workflow MAG {
         ch_metaeuk_db = Channel.empty()
     }
 
-    // Additional info for completion email and summary
-    def busco_failed_bins = [:]
-
     // Get checkM database if not supplied
-
     if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) {
         ARIA2_UNTAR(params.checkm_download_url)
         ch_checkm_db = ARIA2_UNTAR.out.downloaded_file
@@ -797,55 +792,30 @@ workflow MAG {
 
         ch_input_bins_for_qc = ch_input_for_postbinning_bins_unbins.transpose()
 
-        if (!params.skip_binqc && params.binqc_tool == 'busco') {
-            /*
-            * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly
-            */
-
-            BUSCO_QC(
-                ch_busco_db,
-                ch_input_bins_for_qc
-            )
-            ch_busco_summary = BUSCO_QC.out.summary
-            ch_versions = ch_versions.mix(BUSCO_QC.out.versions.first())
-            // process information if BUSCO analysis failed for individual bins due to no matching genes
-            BUSCO_QC.out.failed_bin
-                .splitCsv(sep: '\t')
-                .map { bin, error ->
-                    if (!bin.contains(".unbinned.")) {
-                        busco_failed_bins[bin] = error
-                    }
-                }
-        }
+        // Honour --skip_binqc, as the replaced BUSCO_QC/CHECKM_QC calls did
+        if (!params.skip_binqc) {
+            BIN_QC(
+                ch_input_bins_for_qc,
+                ch_checkm_db,
+                ch_checkm2_db,
+                ch_busco_db
+            )
 
-        if (!params.skip_binqc && params.binqc_tool in ['checkm', 'checkm2']) {
-            /*
-            * CheckM/CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly
-            */
+            ch_versions = ch_versions.mix(BIN_QC.out.versions)
+            // NOTE(review): ch_busco_summary/ch_checkm_summary are no longer set here - confirm consumers use BIN_QC.out.summary
+        }
 
-            ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins ->
+        if (params.run_gunc) {
+            ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter { meta, bins ->
                 meta.domain != "eukarya"
             }
 
-            CHECKM_QC(
-                ch_input_bins_for_checkm.groupTuple(),
-                ch_checkm_db,
-                ch_checkm2_db
+            GUNC_QC(
+                ch_input_bins_for_gunc,
+                ch_gunc_db,
+                (!params.skip_binqc && params.binqc_tool == 'checkm') ? BIN_QC.out.checkm_tsv : []
             )
 
-            ch_checkm_summary = CHECKM_QC.out.summary
-
-            ch_versions = ch_versions.mix(CHECKM_QC.out.versions)
-        }
-        if (params.run_gunc && params.binqc_tool == 'checkm') {
-            GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv)
-            ch_versions = ch_versions.mix(GUNC_QC.out.versions)
-        }
-        else if (params.run_gunc) {
-            ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter { meta, bins ->
-                meta.domain != "eukarya"
-            }
-            GUNC_QC(ch_input_bins_for_qc, ch_gunc_db, [])
             ch_versions = ch_versions.mix(GUNC_QC.out.versions)
         }
 
@@ -1075,7 +1045,7 @@ workflow MAG {
     }
 
     if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco') {
-        ch_multiqc_files = ch_multiqc_files.mix(BUSCO_QC.out.multiqc.collect().ifEmpty([]))
+        ch_multiqc_files = ch_multiqc_files.mix(BIN_QC.out.multiqc.collect().ifEmpty([]))
     }