From c5ae764bc009a1da641eae66bd4e96b044ff03ec Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Fri, 26 Jan 2024 14:41:34 +0100 Subject: [PATCH 01/10] Update workflows/mag.nf Co-authored-by: James A. Fellows Yates --- workflows/mag.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/mag.nf b/workflows/mag.nf index f162495e..a9333536 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -937,6 +937,7 @@ workflow MAG { ch_input_for_postbinning_bins_unbins, ch_cat_db ) + // Group all classification results for each sample in a single file ch_cat_summary = CAT.out.tax_classification_names .collectFile(keepHeader: true) { meta, classification -> From c3373eebd54032568d0720a65e013985a668d13b Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Fri, 26 Jan 2024 14:41:43 +0100 Subject: [PATCH 02/10] Update workflows/mag.nf Co-authored-by: James A. Fellows Yates --- workflows/mag.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/mag.nf b/workflows/mag.nf index a9333536..090e5232 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -943,6 +943,7 @@ workflow MAG { meta, classification -> ["${meta.id}.txt", classification] } + // Group all classification results for the whole run in a single file CAT_SUMMARY( ch_cat_summary.collect() ) From 2c41bab0d76dbb72cbf3b2dc41c07e46bd369cdf Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Fri, 26 Jan 2024 14:42:19 +0100 Subject: [PATCH 03/10] Update subworkflows/local/gtdbtk.nf Co-authored-by: James A. 
Fellows Yates --- subworkflows/local/gtdbtk.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 57c10508..f3d3ec4e 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -53,7 +53,7 @@ workflow GTDBTK { ch_filtered_bins = bins .transpose() .map { meta, bin -> [bin.getName(), bin, meta]} - .join(ch_bin_metrics, failOnDuplicate: true, failOnMismatch: false) + .join(ch_bin_metrics, failOnDuplicate: true) .map { bin_name, bin, meta, completeness, contamination -> [meta, bin, completeness, contamination] } .branch { passed: (it[2] != -1 && it[2] >= params.gtdbtk_min_completeness && it[3] != -1 && it[3] <= params.gtdbtk_max_contamination) From ea568df1491814a389a9157ac4e367b989f3f50a Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Fri, 26 Jan 2024 14:42:29 +0100 Subject: [PATCH 04/10] Update subworkflows/local/gtdbtk.nf Co-authored-by: James A. Fellows Yates --- subworkflows/local/gtdbtk.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index f3d3ec4e..370f3c4f 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -88,8 +88,6 @@ workflow GTDBTK { GTDBTK_CLASSIFYWF.out.summary.map{it[1]}.collect().ifEmpty([]), [], [] - // GTDBTK_CLASSIFYWF.out.filtered.map{it[1]}.collect().ifEmpty([]), - // GTDBTK_CLASSIFYWF.out.failed.map{it[1]}.collect().ifEmpty([]) ) emit: From 629e6672fa7d143945760008622fef3c7c10d0f9 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 26 Jan 2024 14:53:29 +0100 Subject: [PATCH 05/10] Update CLASSIFYWF to fix find glob issue --- modules.json | 2 +- modules/nf-core/gtdbtk/classifywf/environment.yml | 7 +++++++ modules/nf-core/gtdbtk/classifywf/main.nf | 4 ++-- modules/nf-core/gtdbtk/classifywf/meta.yml | 5 +++-- 4 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 modules/nf-core/gtdbtk/classifywf/environment.yml diff --git a/modules.json b/modules.json index e9162243..861a777d 100644 --- a/modules.json +++ b/modules.json @@ -118,7 +118,7 @@ }, "gtdbtk/classifywf": { "branch": "master", - "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5", + "git_sha": "9bbc6a88ce3004ae4bc9f84cef762484dc2c95e5", "installed_by": ["modules"] }, "gunc/downloaddb": { diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml new file mode 100644 index 00000000..8801269e --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/environment.yml @@ -0,0 +1,7 @@ +name: gtdbtk_classifywf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gtdbtk=2.3.2 diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 00da4459..6d9733ba 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -3,7 +3,7 @@ process GTDBTK_CLASSIFYWF { label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::gtdbtk=2.3.2" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' : 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }" @@ -61,7 +61,7 @@ process GTDBTK_CLASSIFYWF { mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" - find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing + find -name "gtdbtk.${prefix}.*.classify.tree" | xargs -r gzip # do not fail if .tree is missing cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml index 4319bc74..d85f9966 100644 --- a/modules/nf-core/gtdbtk/classifywf/meta.yml +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -16,7 +16,6 @@ tools: tool_dev_url: https://github.com/Ecogenomics/GTDBTk doi: "10.1093/bioinformatics/btz848" licence: ["GNU General Public v3 (GPL v3)"] - input: - meta: type: map @@ -35,7 +34,6 @@ input: type: file description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional) pattern: "*.msh" - output: - meta: type: map @@ -85,3 +83,6 @@ output: authors: - "@skrakau" - "@abhi18av" +maintainers: + - "@skrakau" + - "@abhi18av" From 3f3505a0bac1cbfeef0777f123d98e64cc8bf78f Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Fri, 26 Jan 2024 13:55:04 +0000 Subject: [PATCH 06/10] [automated] Fix linting with Prettier --- .devcontainer/devcontainer.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4ecfbfe3..4a9bc5c7 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" + "python.linting.pylintPath": "/opt/conda/bin/pylint", }, // Add the IDs of 
extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], + }, + }, } From 99f9aa8d918f3a3406321b3dcb8843b59f389b26 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 26 Jan 2024 15:17:28 +0100 Subject: [PATCH 07/10] feat: add CAT summary to bin_summary table --- bin/combine_tables.py | 45 +++++++++++++++++++++++++++++++++++- modules/local/bin_summary.nf | 3 +++ workflows/mag.nf | 3 ++- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 2feed698..57e83464 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -13,7 +13,7 @@ def parse_args(args=None): parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.") parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") - + parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") parser.add_argument( "-o", "--out", @@ -25,6 +25,43 @@ def parse_args(args=None): return parser.parse_args(args) +def parse_cat_table(cat_table): + """Parse CAT table. + + CAT table is trickier to parse than the other tables, because it has a variable number of columns, + depending on the number of ranks that are reported for the taxonomic assignation of each contig. + Therefore, we first parse the header to get the column names, and then parse the table, to get the + maximum number of columns. Then, we merge the columns containing the ranks into a single column. 
+ + Args: + cat_table (str): Path to CAT table + + Returns: + pd.DataFrame: parsed CAT table + """ + with open(cat_table, "r") as f: + next(f) # skip header + maxcol = 0 + for line in f: + maxcol = max(maxcol, len(line.split("\t"))) + + header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"] + + df = pd.read_table( + cat_table, + names=header + [f"rank_{i}" for i in range(maxcol - len(header))], + on_bad_lines="warn", + header=None, + skiprows=1, + ) + # merge all rank columns into a single column + df["CAT_rank"] = df.filter(regex=r"rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip() + # remove rank_* columns + df.drop(df.filter(regex=r"rank_\d+").columns, axis=1, inplace=True) + + return df + + def main(args=None): args = parse_args(args) @@ -93,6 +130,12 @@ def main(args=None): results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer" ) # assuming depths for all bins are given + if args.cat_table: + cat_results = parse_cat_table(args.cat_summary) + if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0: + sys.exit("Bins in CAT summary do not match bins in bin depths summary!") + results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer") + results.to_csv(args.out, sep="\t") diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index 4503502f..b387174c 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -11,6 +11,7 @@ process BIN_SUMMARY { path(checkm_sum) path(quast_sum) path(gtdbtk_sum) + path(cat_sum) output: path("bin_summary.tsv"), emit: summary @@ -21,12 +22,14 @@ process BIN_SUMMARY { def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : "" def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? 
"--gtdbtk_summary ${gtdbtk_sum}" : "" + def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" """ combine_tables.py --depths_summary ${bin_depths} \ $busco_summary \ $checkm_summary \ $quast_summary \ $gtdbtk_summary \ + $cat_summary \ --out bin_summary.tsv cat <<-END_VERSIONS > versions.yml diff --git a/workflows/mag.nf b/workflows/mag.nf index f162495e..44c5579c 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -982,7 +982,8 @@ workflow MAG { ch_busco_summary.ifEmpty([]), ch_checkm_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), - ch_gtdbtk_summary.ifEmpty([]) + ch_gtdbtk_summary.ifEmpty([]), + CAT_SUMMARY.out.summary.ifEmpty([]) ) } From d7985934de42d0831be24409f2323aab16b838dc Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 26 Jan 2024 15:24:28 +0100 Subject: [PATCH 08/10] chore: update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc6eed0d..4ef4ef2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor) + ### `Changed` ### `Fixed` From 6964daa88daffa1a79523cc1276ca12405f8bf78 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 26 Jan 2024 15:29:11 +0100 Subject: [PATCH 09/10] fix: CAT global summary channel --- workflows/mag.nf | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index 837833bf..63304cec 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -950,6 +950,13 @@ workflow MAG { ch_versions = ch_versions.mix(CAT.out.versions.first()) ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions) + // If CAT is not run, then the CAT global summary should be an empty channel + if ( params.cat_db_generate || params.cat_db) { + ch_cat_global_summary = CAT_SUMMARY.out.summary + } else { + 
ch_cat_global_summary = Channel.empty() + } + /* * GTDB-tk: taxonomic classifications using GTDB reference */ @@ -985,7 +992,7 @@ workflow MAG { ch_checkm_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), - CAT_SUMMARY.out.summary.ifEmpty([]) + ch_cat_global_summary.ifEmpty([]) ) } From 64ae23efd1f941558bc9736fd460f4b5a3d39998 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 26 Jan 2024 16:23:11 +0100 Subject: [PATCH 10/10] fix: cat_summary is the variable name --- bin/combine_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 57e83464..b867ed73 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -130,7 +130,7 @@ def main(args=None): results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer" ) # assuming depths for all bins are given - if args.cat_table: + if args.cat_summary: cat_results = parse_cat_table(args.cat_summary) if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0: sys.exit("Bins in CAT summary do not match bins in bin depths summary!")