Skip to content

Commit

Permalink
Merge pull request #562 from maxibor/add_cat_2_summary
Browse files Browse the repository at this point in the history
Add CAT summary into the global `bin_summary`
  • Loading branch information
jfy133 authored Feb 1, 2024
2 parents eb97cbd + 212d079 commit 15650d8
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor)

### `Changed`

### `Fixed`
Expand Down
45 changes: 44 additions & 1 deletion bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def parse_args(args=None):
parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")

parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-o",
"--out",
Expand All @@ -25,6 +25,43 @@ def parse_args(args=None):
return parser.parse_args(args)


def parse_cat_table(cat_table):
    """Parse a CAT table into a DataFrame with a single merged rank column.

    The CAT table is ragged: every row starts with the same six fixed columns,
    followed by a variable number of extra tab-separated fields (one per
    reported rank of the taxonomic assignment). The file's own header line is
    skipped and the fixed column names are hard-coded. The file is scanned once
    to find the widest row, so that every extra field can be given a temporary
    ``rank_<i>`` column name; those columns are then joined with ``;`` into one
    ``CAT_rank`` column and dropped.

    Args:
        cat_table (str): Path to CAT table.

    Returns:
        pd.DataFrame: The six fixed columns plus ``CAT_rank``. Rows without any
        extra rank field get an empty string in ``CAT_rank``.
    """
    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]

    # First pass: find the widest data row so pandas can be handed enough column names.
    with open(cat_table, "r") as f:
        next(f, None)  # skip header; tolerate a completely empty file
        maxcol = max((len(line.split("\t")) for line in f), default=0)

    # Names for the fields beyond the fixed header (empty when no row is wider than the header).
    rank_columns = [f"rank_{i}" for i in range(maxcol - len(header))]

    df = pd.read_table(
        cat_table,
        names=header + rank_columns,
        # Keep rank fields as text: a column inferred as numeric would break the string join below.
        dtype=dict.fromkeys(rank_columns, str) or None,
        on_bad_lines="warn",
        header=None,
        skiprows=1,
    )

    if rank_columns:
        # Merge all rank columns into a single column, ignoring the missing
        # trailing fields of shorter rows.
        df["CAT_rank"] = df[rank_columns].apply(lambda row: ";".join(row.dropna()), axis=1).str.lstrip()
        # The temporary rank_* columns are no longer needed.
        df = df.drop(columns=rank_columns)
    else:
        df["CAT_rank"] = ""

    return df


def main(args=None):
args = parse_args(args)

Expand Down Expand Up @@ -93,6 +130,12 @@ def main(args=None):
results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
) # assuming depths for all bins are given

if args.cat_summary:
cat_results = parse_cat_table(args.cat_summary)
if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")

results.to_csv(args.out, sep="\t")


Expand Down
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
},
"gtdbtk/classifywf": {
"branch": "master",
"git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
"git_sha": "9bbc6a88ce3004ae4bc9f84cef762484dc2c95e5",
"installed_by": ["modules"]
},
"gunc/downloaddb": {
Expand Down
3 changes: 3 additions & 0 deletions modules/local/bin_summary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process BIN_SUMMARY {
path(checkm_sum)
path(quast_sum)
path(gtdbtk_sum)
path(cat_sum)

output:
path("bin_summary.tsv"), emit: summary
Expand All @@ -21,12 +22,14 @@ process BIN_SUMMARY {
def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
"""
combine_tables.py --depths_summary ${bin_depths} \
$busco_summary \
$checkm_summary \
$quast_summary \
$gtdbtk_summary \
$cat_summary \
--out bin_summary.tsv
cat <<-END_VERSIONS > versions.yml
Expand Down
7 changes: 7 additions & 0 deletions modules/nf-core/gtdbtk/classifywf/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion workflows/mag.nf
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,13 @@ workflow MAG {
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)

// If CAT is not run, then the CAT global summary should be an empty channel
if ( params.cat_db_generate || params.cat_db) {
ch_cat_global_summary = CAT_SUMMARY.out.summary
} else {
ch_cat_global_summary = Channel.empty()
}

/*
* GTDB-tk: taxonomic classifications using GTDB reference
*/
Expand Down Expand Up @@ -992,7 +999,8 @@ workflow MAG {
ch_busco_summary.ifEmpty([]),
ch_checkm_summary.ifEmpty([]),
ch_quast_bins_summary.ifEmpty([]),
ch_gtdbtk_summary.ifEmpty([])
ch_gtdbtk_summary.ifEmpty([]),
ch_cat_global_summary.ifEmpty([])
)
}

Expand Down

0 comments on commit 15650d8

Please sign in to comment.