def parse_cat_table(cat_table):
    """Parse a CAT table into a DataFrame with a single merged rank column.

    The CAT table has a variable number of tab-separated columns: the six
    fixed columns listed in ``header`` below, followed by zero or more extra
    columns holding the ranks reported for the taxonomic assignment. The
    file is therefore scanned once to find the widest data row, then read
    with enough ``rank_<i>`` placeholder columns to hold that row. Finally
    the placeholder columns are merged into one ``CAT_rank`` column.

    Args:
        cat_table (str): Path to the CAT table. The first line is treated as
            a header and skipped.

    Returns:
        pd.DataFrame: The six fixed columns plus ``CAT_rank``, which holds
        the non-missing rank values of each row joined with ``;`` (an empty
        string when a row has no rank values).
    """
    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]

    # First pass: the widest data row decides how many rank columns are needed.
    with open(cat_table, "r") as f:
        next(f, None)  # skip header
        maxcol = max((len(line.split("\t")) for line in f), default=0)

    # range() of a non-positive number is empty, so narrow tables get no rank columns.
    rank_cols = [f"rank_{i}" for i in range(maxcol - len(header))]

    df = pd.read_table(
        cat_table,
        names=header + rank_cols,
        # Read rank values as text so numeric-looking values can be joined verbatim.
        dtype=dict.fromkeys(rank_cols, str) or None,
        on_bad_lines="warn",
        header=None,
        skiprows=1,
    )

    if not rank_cols:
        # No row carries rank information: applying over zero columns would not
        # produce a string Series, so fill the column directly.
        df["CAT_rank"] = ""
        return df

    # Merge all rank columns into a single column, ignoring missing values.
    df["CAT_rank"] = df[rank_cols].apply(lambda row: ";".join(row.dropna()), axis=1).str.lstrip()

    # The individual rank_* columns are no longer needed.
    return df.drop(columns=rank_cols)
summary @@ -21,12 +22,14 @@ process BIN_SUMMARY { def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : "" def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" + def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" """ combine_tables.py --depths_summary ${bin_depths} \ $busco_summary \ $checkm_summary \ $quast_summary \ $gtdbtk_summary \ + $cat_summary \ --out bin_summary.tsv cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml new file mode 100644 index 00000000..8801269e --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/environment.yml @@ -0,0 +1,7 @@ +name: gtdbtk_classifywf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gtdbtk=2.3.2 diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 00da4459..6d9733ba 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -3,7 +3,7 @@ process GTDBTK_CLASSIFYWF { label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::gtdbtk=2.3.2" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' : 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }" @@ -61,7 +61,7 @@ process GTDBTK_CLASSIFYWF { mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" - find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing + find -name "gtdbtk.${prefix}.*.classify.tree" | xargs -r gzip # do not fail if .tree is missing cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml index 4319bc74..d85f9966 100644 --- a/modules/nf-core/gtdbtk/classifywf/meta.yml +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -16,7 +16,6 @@ tools: tool_dev_url: https://github.com/Ecogenomics/GTDBTk doi: "10.1093/bioinformatics/btz848" licence: ["GNU General Public v3 (GPL v3)"] - input: - meta: type: map @@ -35,7 +34,6 @@ input: type: file description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional) pattern: "*.msh" - output: - meta: type: map @@ -85,3 +83,6 @@ output: authors: - "@skrakau" - "@abhi18av" +maintainers: + - "@skrakau" + - "@abhi18av" diff --git a/workflows/mag.nf b/workflows/mag.nf index 5fb7caa9..ee33230e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -958,6 +958,13 @@ workflow MAG { ch_versions = ch_versions.mix(CAT.out.versions.first()) ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions) + // If CAT is not run, then the CAT global summary should be an empty channel + if ( params.cat_db_generate || params.cat_db) { + ch_cat_global_summary = CAT_SUMMARY.out.summary + } else { + ch_cat_global_summary = Channel.empty() + } + /* * GTDB-tk: taxonomic classifications using GTDB reference */ @@ -992,7 +999,8 @@ workflow MAG { ch_busco_summary.ifEmpty([]), ch_checkm_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), - ch_gtdbtk_summary.ifEmpty([]) + ch_gtdbtk_summary.ifEmpty([]), + ch_cat_global_summary.ifEmpty([]) ) }