Skip to content

Commit

Permalink
Merge branch 'dev' into warn-when-gtdb-filter-empty
Browse files Browse the repository at this point in the history
  • Loading branch information
jfy133 authored Feb 1, 2024
2 parents dabc167 + 15650d8 commit 3ead481
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 8 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#548](https://github.com/nf-core/mag/pull/548) fixes to GTDB-Tk execution, CAT/QUAST/DEPTH bin summary file name collisions, BUSCO database parsing, correct CAT name files (reported by @maxibor, @PPpissar, @muniheart, @llborcard, fix by @maxibor)
- [#548](https://github.com/nf-core/mag/pull/548) - Fixes to the following (reported by @maxibor, @PPpissar, @muniheart, @llborcard, fix by @maxibor):
- GTDB-Tk execution
- CAT/QUAST/DEPTH bin summary file name collisions
- BUSCO database parsing
- Correct CAT name files
- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor)
- [#565](https://github.com/nf-core/mag/pull/565) - Add warning of empty GTDB-Tk results if no contigs pass completeness filter (fix by @jfy133 and @maxibor)

### `Changed`
Expand Down
45 changes: 44 additions & 1 deletion bin/combine_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def parse_args(args=None):
parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")

parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-o",
"--out",
Expand All @@ -25,6 +25,43 @@ def parse_args(args=None):
return parser.parse_args(args)


def parse_cat_table(cat_table):
    """Parse a CAT table into a DataFrame with one merged rank column.

    The CAT table is trickier to parse than the other summary tables because
    rows have a variable number of tab-separated fields, depending on how many
    ranks are reported for each taxonomic assignment. The six leading column
    names are fixed (the file's own header line is skipped, not parsed); the
    file is scanned once to find the widest row, every field beyond the sixth
    is read into a temporary ``rank_<i>`` column, and those columns are then
    joined with ``;`` into a single ``CAT_rank`` column.

    Args:
        cat_table (str): Path to the tab-separated CAT table.

    Returns:
        pd.DataFrame: The six fixed columns plus ``CAT_rank``. Rows without
        extra rank fields get an empty string in ``CAT_rank``. A table with
        no data rows (empty file or header only) yields an empty DataFrame
        with the same columns.
    """
    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]

    # First pass: width of the widest data row. UTF-8 matches the encoding
    # pandas uses by default when it reads the same file below.
    with open(cat_table, "r", encoding="utf-8") as f:
        next(f, None)  # skip header; the default keeps an empty file from raising StopIteration
        maxcol = max((line.count("\t") + 1 for line in f if line.strip("\r\n")), default=0)

    # No data rows at all: return the expected schema instead of failing later on.
    if maxcol == 0:
        return pd.DataFrame(columns=header + ["CAT_rank"])

    rank_cols = [f"rank_{i}" for i in range(maxcol - len(header))]

    df = pd.read_table(
        cat_table,
        names=header + rank_cols,
        on_bad_lines="warn",
        header=None,
        skiprows=1,
        # Keep rank fields as text so numeric-looking values can still be joined.
        dtype={col: str for col in rank_cols},
    )

    # Merge all rank columns into a single column; missing fields (NaN) are skipped.
    rank_values = df[rank_cols].to_numpy(dtype=object)
    df["CAT_rank"] = [
        ";".join(value for value in row if isinstance(value, str)).lstrip() for row in rank_values
    ]

    # Remove the temporary rank_* columns.
    return df.drop(columns=rank_cols)


def main(args=None):
args = parse_args(args)

Expand Down Expand Up @@ -93,6 +130,12 @@ def main(args=None):
results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
) # assuming depths for all bins are given

if args.cat_summary:
cat_results = parse_cat_table(args.cat_summary)
if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")

results.to_csv(args.out, sep="\t")


Expand Down
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
},
"gtdbtk/classifywf": {
"branch": "master",
"git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
"git_sha": "9bbc6a88ce3004ae4bc9f84cef762484dc2c95e5",
"installed_by": ["modules"]
},
"gunc/downloaddb": {
Expand Down
3 changes: 3 additions & 0 deletions modules/local/bin_summary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process BIN_SUMMARY {
path(checkm_sum)
path(quast_sum)
path(gtdbtk_sum)
path(cat_sum)

output:
path("bin_summary.tsv"), emit: summary
Expand All @@ -21,12 +22,14 @@ process BIN_SUMMARY {
def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : ""
def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : ""
def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : ""
def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : ""
"""
combine_tables.py --depths_summary ${bin_depths} \
$busco_summary \
$checkm_summary \
$quast_summary \
$gtdbtk_summary \
$cat_summary \
--out bin_summary.tsv
cat <<-END_VERSIONS > versions.yml
Expand Down
7 changes: 7 additions & 0 deletions modules/nf-core/gtdbtk/classifywf/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions modules/nf-core/gtdbtk/classifywf/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion workflows/mag.nf
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,13 @@ workflow MAG {
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)

// If CAT is not run, then the CAT global summary should be an empty channel
if ( params.cat_db_generate || params.cat_db) {
ch_cat_global_summary = CAT_SUMMARY.out.summary
} else {
ch_cat_global_summary = Channel.empty()
}

/*
* GTDB-tk: taxonomic classifications using GTDB reference
*/
Expand Down Expand Up @@ -992,7 +999,8 @@ workflow MAG {
ch_busco_summary.ifEmpty([]),
ch_checkm_summary.ifEmpty([]),
ch_quast_bins_summary.ifEmpty([]),
ch_gtdbtk_summary.ifEmpty([])
ch_gtdbtk_summary.ifEmpty([]),
ch_cat_global_summary.ifEmpty([])
)
}

Expand Down

0 comments on commit 3ead481

Please sign in to comment.