Merge pull request #548 from maxibor/patch_2_5_1
Patch of #534 into dev for v2.5.1
jfy133 authored Jan 26, 2024
2 parents c79c5d4 + 3a65193 commit 13212ab
Showing 8 changed files with 55 additions and 40 deletions.
2 changes: 1 addition & 1 deletion bin/combine_tables.py
@@ -87,7 +87,7 @@ def main(args=None):

if args.gtdbtk_summary:
gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t")
if not bins.equals(gtdbtk_results["user_genome"].sort_values().reset_index(drop=True)):
if len(set(gtdbtk_results["user_genome"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in GTDB-Tk summary do not match bins in bin depths summary!")
results = pd.merge(
results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
19 changes: 10 additions & 9 deletions modules/local/cat.nf
@@ -11,14 +11,14 @@ process CAT {
tuple val(db_name), path("database/*"), path("taxonomy/*")

output:
path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
path("raw/*.predicted_proteins.faa.gz") , emit: faa
path("raw/*.predicted_proteins.gff.gz") , emit: gff
path("raw/*.log") , emit: log
path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
path "versions.yml" , emit: versions
tuple val(meta), path("*.bin2classification.names.txt") , emit: tax_classification_names
path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
path("raw/*.predicted_proteins.faa.gz") , emit: faa
path("raw/*.predicted_proteins.gff.gz") , emit: gff
path("raw/*.log") , emit: log
path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
path "versions.yml" , emit: versions

script:
def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
@@ -31,12 +31,13 @@ process CAT {
mkdir raw
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
cp *.bin2classification.names.txt raw/
gzip "raw/${prefix}.ORF2LCA.txt" \
"raw/${prefix}.concatenated.predicted_proteins.faa" \
"raw/${prefix}.concatenated.predicted_proteins.gff" \
"raw/${prefix}.bin2classification.txt" \
"${prefix}.ORF2LCA.names.txt" \
"${prefix}.bin2classification.names.txt"
"raw/${prefix}.bin2classification.names.txt"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
3 changes: 0 additions & 3 deletions modules/local/cat_summary.nf
@@ -16,9 +16,6 @@ process CAT_SUMMARY {
script:
def prefix = task.ext.prefix ?: "cat_summary"
"""
# use find as sometimes these are empty and need to fail gracefully
find -L -type f -name "*bin2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c \$f > \${f%.*}; done' find-sh {} +
bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv
cat <<-END_VERSIONS > versions.yml
2 changes: 1 addition & 1 deletion modules/local/quast_bins.nf
@@ -11,7 +11,7 @@ process QUAST_BINS {

output:
path "QUAST/*", type: 'dir' , emit: dir
path "QUAST/*-quast_summary.tsv", emit: quast_bin_summaries
tuple val(meta), path("QUAST/*-quast_summary.tsv"), emit: quast_bin_summaries
path "versions.yml" , emit: versions

script:
1 change: 1 addition & 0 deletions subworkflows/local/binning_refinement.nf
@@ -62,6 +62,7 @@ workflow BINNING_REFINEMENT {

// Note: do not `failOnMismatch` on join here, in some cases e.g. MAXBIN2 will fail if no bins, so cannot join!
// Only want to join for DAS_Tool on bins that 'exist'

ch_input_for_dastool = ch_contigs_for_dastool.join(ch_fastatocontig2bin_for_dastool, by: 0)

ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_METABAT2.out.versions.first())
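
The note above relies on Nextflow's default `join` semantics: keys present in only one channel are silently discarded unless `failOnMismatch` is set. A minimal standalone sketch of that behaviour, with hypothetical sample IDs and file names rather than the pipeline's real channel contents:

// editor's sketch, not part of this commit
workflow {
    ch_contigs_for_dastool = Channel.of(
        [ [id: 'sample1'], 'sample1.contigs.fa' ],
        [ [id: 'sample2'], 'sample2.contigs.fa' ]
    )
    // sample2 produced no bins, so it has no contig2bin entry
    ch_fastatocontig2bin_for_dastool = Channel.of(
        [ [id: 'sample1'], 'sample1.contig2bin.tsv' ]
    )

    // without failOnMismatch the unmatched key (sample2) is simply dropped,
    // so DAS_Tool only receives samples whose bins actually exist
    ch_contigs_for_dastool
        .join( ch_fastatocontig2bin_for_dastool, by: 0 )
        .view()    // [[id:sample1], sample1.contigs.fa, sample1.contig2bin.tsv]
}
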
13 changes: 9 additions & 4 deletions subworkflows/local/depths.nf
@@ -19,9 +19,6 @@ workflow DEPTHS {
main:
ch_versions = Channel.empty()


depths.dump(tag: 'depths', pretty: true)

// Compute bin depths for different samples (according to `binning_map_mode`)
// Create a new meta combine key first, but copy meta so that
// we retain the information about binners and domain classification
@@ -62,7 +59,15 @@
}

MAG_DEPTHS_PLOT ( ch_mag_depths_plot, ch_sample_groups.collect() )
MAG_DEPTHS_SUMMARY ( MAG_DEPTHS.out.depths.map{it[1]}.collect() )

// Depth files coming from bins and from failed binning refinement are concatenated per meta
ch_mag_depth_out = MAG_DEPTHS.out.depths
.collectFile(keepHeader: true) {
meta, depth ->
[meta.id, depth]
}

MAG_DEPTHS_SUMMARY ( ch_mag_depth_out.collect() )
ch_versions = ch_versions.mix( MAG_DEPTHS_PLOT.out.versions )
ch_versions = ch_versions.mix( MAG_DEPTHS_SUMMARY.out.versions )

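
The `collectFile` call added above concatenates every depth table that shares the same `meta.id` into one file, keeping only the first header line. A minimal standalone sketch of the idiom, using a fabricated sample and depth values; the same per-meta `collectFile` pattern is reused further down for the QUAST_BINS and CAT outputs:

// editor's sketch, not part of this commit
workflow {
    // fabricate two small per-binner depth tables for the same sample
    def f1 = file('MAXBIN2-sample1-depths.tsv');  f1.text = "bin\tdepth\nbin1\t10.0\n"
    def f2 = file('METABAT2-sample1-depths.tsv'); f2.text = "bin\tdepth\nbin2\t12.5\n"

    Channel
        .of( [ [id: 'sample1'], f1 ], [ [id: 'sample1'], f2 ] )
        // one output file per meta.id; keepHeader keeps the header of the first
        // file and skips the first line of every subsequent file
        .collectFile(keepHeader: true) { meta, depth -> [ meta.id, depth ] }
        .view { it.text }
}
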
9 changes: 4 additions & 5 deletions subworkflows/local/gtdbtk.nf
@@ -25,7 +25,8 @@ workflow GTDBTK {
def completeness = -1
def contamination = -1
def missing, duplicated
if (params.busco_db.getBaseName().contains('odb10')) {
def busco_db = file(params.busco_db)
if (busco_db.getBaseName().contains('odb10')) {
missing = row.'%Missing (specific)' // TODO or just take '%Complete'?
duplicated = row.'%Complete and duplicated (specific)'
} else {
@@ -52,7 +53,7 @@
ch_filtered_bins = bins
.transpose()
.map { meta, bin -> [bin.getName(), bin, meta]}
.join(ch_bin_metrics, failOnDuplicate: true, failOnMismatch: true)
.join(ch_bin_metrics, failOnDuplicate: true)
.map { bin_name, bin, meta, completeness, contamination -> [meta, bin, completeness, contamination] }
.branch {
passed: (it[2] != -1 && it[2] >= params.gtdbtk_min_completeness && it[3] != -1 && it[3] <= params.gtdbtk_max_contamination)
@@ -84,11 +85,9 @@

GTDBTK_SUMMARY (
ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]),
GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]),
GTDBTK_CLASSIFYWF.out.summary.map{it[1]}.collect().ifEmpty([]),
[],
// GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]),
[]
// GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
)

emit:
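
The `file(params.busco_db)` change at the top of this subworkflow matters because `params.busco_db` comes in as a plain string; wrapping it with `file()` yields a path object on which helpers such as `getBaseName()` are available. A minimal sketch with a hypothetical database path:

// editor's sketch, not part of this commit
params.busco_db = '/dbs/bacteria_odb10.tar.gz'   // hypothetical path

workflow {
    def busco_db = file(params.busco_db)               // String -> path object
    // getBaseName() drops the directory and the last extension: 'bacteria_odb10.tar'
    println busco_db.getBaseName()
    println busco_db.getBaseName().contains('odb10')   // true -> the odb10 branch above is taken
}
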
46 changes: 29 additions & 17 deletions workflows/mag.nf
@@ -455,27 +455,30 @@ workflow MAG {
if ( !ch_centrifuge_db_file.isEmpty() ) {
if ( ch_centrifuge_db_file.extension in ['gz', 'tgz'] ) {
// Expects to be tar.gz!
ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ).db
ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file )
.db
.collect()
.map{
db ->
def db_name = db[0].getBaseName().split('\\.')[0]
[ db_name, db ]
}
} else if ( ch_centrifuge_db_file.isDirectory() ) {
ch_db_for_centrifuge = Channel
.fromPath( "${ch_centrifuge_db_file}/*.cf" )
.collect()
.map{
db ->
def db_name = db[0].getBaseName().split('\\.')[0]
[ db_name, db ]
}
} else {
ch_db_for_centrifuge = Channel.empty()
}
} else {
ch_db_for_centrifuge = Channel.empty()
}

// Centrifuge val(db_name) has to be the basename of any of the
// index files up to but not including the final .1.cf
ch_db_for_centrifuge = ch_db_for_centrifuge
.collect()
.map{
db ->
def db_name = db[0].getBaseName().split('\\.')[0]
[ db_name, db ]
}

CENTRIFUGE (
ch_short_reads,
ch_db_for_centrifuge
@@ -578,6 +581,7 @@
}

ch_assemblies = Channel.empty()

if (!params.skip_megahit){
MEGAHIT ( ch_short_reads_grouped )
ch_megahit_assemblies = MEGAHIT.out.assembly
@@ -791,8 +795,6 @@
[meta_new, bins]
}



// If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one
if ( params.refine_bins_dastool ) {
ch_prokarya_bins_dastool = ch_binning_results_bins
@@ -829,8 +831,6 @@
} else if ( params.postbinning_input == 'refined_bins_only' ) {
ch_input_for_postbinning_bins = ch_refined_bins
ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins)
// TODO REACTIVATE ONCE PR #489 IS READY!
// TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING
} else if ( params.postbinning_input == 'both' ) {
ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
ch_input_for_postbinning_bins = ch_all_bins
@@ -913,7 +913,12 @@

QUAST_BINS ( ch_input_for_quast_bins )
ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first())
QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() )
ch_quast_bin_summary = QUAST_BINS.out.quast_bin_summaries
.collectFile(keepHeader: true) {
meta, summary ->
["${meta.id}.tsv", summary]
}
QUAST_BINS_SUMMARY ( ch_quast_bin_summary.collect() )
ch_quast_bins_summary = QUAST_BINS_SUMMARY.out.summary
}

@@ -932,8 +937,15 @@
ch_input_for_postbinning_bins_unbins,
ch_cat_db
)
// Group all classification results for each sample in a single file
ch_cat_summary = CAT.out.tax_classification_names
.collectFile(keepHeader: true) {
meta, classification ->
["${meta.id}.txt", classification]
}
// Group all classification results for the whole run in a single file
CAT_SUMMARY(
CAT.out.tax_classification_names.collect()
ch_cat_summary.collect()
)
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)
