run taxonomic assignment for all hmq MAGs
alienzj committed Aug 31, 2023
1 parent bc54a48 commit 97b5f52
Showing 12 changed files with 296 additions and 66 deletions.
12 changes: 7 additions & 5 deletions metapi/checkmer.py
@@ -13,17 +13,19 @@ def checkm_prepare(gene_table, batch_num, mags_dir):
os.makedirs(mags_dir, exist_ok=True)

table_df = pd.read_csv(gene_table, sep="\t")
table_df = table_df.sort_values(by="bin_id",
key=lambda x: np.argsort(index_natsorted(table_df["bin_id"])))\
.reset_index(drop=True)
table_df = table_df.sort_values(
by="bin_id",
key=lambda x: np.argsort(
index_natsorted(table_df["bin_id"]))).reset_index(drop=True)

batchid = -1
if len(table_df) > 0:
for batch in range(0, len(table_df), batch_num):
batchid += 1
table_split = table_df.iloc[batch:batch+batch_num, ]
table_split.to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
sep="\t", index=False, header=None)
table_split.to_csv(
os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
sep="\t", index=False, header=None)
else:
subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True)

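A minimal standalone sketch (not part of the commit) of the batching that checkm_prepare performs: the gene table is natural-sorted by bin_id, then split into fixed-size batch files for CheckM. The tiny table, output directory, and batch size below are hypothetical.

    import os

    import numpy as np
    import pandas as pd
    from natsort import index_natsorted

    # Hypothetical gene table with bin ids that lexicographic sorting would misorder.
    table_df = pd.DataFrame({
        "bin_id": ["s1.bin.10", "s1.bin.2", "s1.bin.1"],
        "bin_file": ["s1.bin.10.fa.gz", "s1.bin.2.fa.gz", "s1.bin.1.fa.gz"],
    })

    # The natsort recipe used in checkmer.py: index_natsorted yields the natural
    # order (bin.1, bin.2, bin.10); np.argsort turns it into a sort key for pandas.
    table_df = table_df.sort_values(
        by="bin_id",
        key=lambda x: np.argsort(index_natsorted(table_df["bin_id"]))
    ).reset_index(drop=True)

    mags_dir = "mags_input_demo"
    batch_num = 2
    os.makedirs(mags_dir, exist_ok=True)

    # Write one headerless TSV per batch: mags_input.0.tsv, mags_input.1.tsv, ...
    for batchid, start in enumerate(range(0, len(table_df), batch_num)):
        table_df.iloc[start:start + batch_num, ].to_csv(
            os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
            sep="\t", index=False, header=None)
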
19 changes: 11 additions & 8 deletions metapi/classifier.py
@@ -13,7 +13,9 @@
def gtdbtk_prepare_from_mags(rep_table, batch_num, mags_dir):
os.makedirs(mags_dir, exist_ok=True)

table_df = pd.read_csv(rep_table, sep="\t")
table_df = pd.read_csv(rep_table, sep="\t")\
.query('MIMAG_quality_level!="low_quality"')\
.reset_index(drop=True)

batchid = -1
if len(table_df) > 0:
@@ -25,12 +27,13 @@ def gtdbtk_prepare_from_mags(rep_table, batch_num, mags_dir):
genome_index = columns_list.index("genome")
best_translation_table_index = columns_list.index("best_translation_table")

table_split = table_df.iloc[batch:batch+batch_num,
[bin_file_index, genome_index, best_translation_table_index]]
table_split = table_df.iloc[
batch:batch+batch_num,
[bin_file_index, genome_index, best_translation_table_index]]

table_split\
.to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
sep="\t", index=False, header=None)
table_split.to_csv(
os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
sep="\t", index=False, header=None)
else:
subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True)

@@ -54,12 +57,12 @@ def gtdbtk_prepare_from_genes(rep_table, batch_num, mags_dir):

table_split["pep_location"] = table_split.apply(lambda x: os.path.splitext(x["pep_file"])[0], axis=1)
table_split["pep_basename"] = table_split.apply(lambda x: os.path.basename(x["pep_location"]), axis=1)

table_split\
.loc[:, ["pep_location", "pep_basename", "best_translation_table"]]\
.to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
sep="\t", index=False, header=None)
pepcount = 0
pepcount = 0
for pep_file in table_split["pep_file"]:
pep_file_ = os.path.splitext(pep_file)[0]
if not os.path.exists(pep_file_):
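
The classifier.py change adds a quality filter before GTDB-Tk batch preparation. A minimal sketch (hypothetical rows) of that filter, which drops low-quality MAGs so that only high- and medium-quality ("hmq") genomes are classified:

    import pandas as pd

    # Hypothetical genomes-info table; the real one is read from rep_table.
    table_df = pd.DataFrame({
        "bin_file": ["a.bin.1.fa.gz", "a.bin.2.fa.gz", "a.bin.3.fa.gz"],
        "genome": ["a.bin.1", "a.bin.2", "a.bin.3"],
        "best_translation_table": [11, 11, 4],
        "MIMAG_quality_level": ["high_quality", "low_quality", "medium_quality"],
    })

    # Drop low-quality MAGs; the remaining rows feed the GTDB-Tk batch files.
    hmq_df = table_df.query('MIMAG_quality_level != "low_quality"').reset_index(drop=True)
    print(hmq_df[["bin_file", "genome", "best_translation_table"]])
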
4 changes: 2 additions & 2 deletions metapi/profiles/sge/cluster.yaml
@@ -507,8 +507,8 @@ dereplicate_vmags_postprocess:
taxonomic_gtdbtk:
#partition: "hugemem"
mem: "50G"
output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.o"
error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.e"
output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.o"
error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.e"

databases_bacteriome_refine_taxonomy:
mem: "1G"
4 changes: 2 additions & 2 deletions metapi/profiles/slurm/cluster.yaml
@@ -529,8 +529,8 @@ dereplicate_vmags_postprocess:
taxonomic_gtdbtk:
#partition: "hugemem"
mem: "50G"
output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.o"
error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.e"
output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.o"
error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.e"

databases_bacteriome_refine_taxonomy:
mem: "1G"
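
The two cluster.yaml edits rename the log-path placeholders from {wildcards.dereper} to {wildcards.binner_checkm} so they match the renamed wildcards of taxonomic_gtdbtk. A simplified illustration of how such a template is filled per job (not the profile's actual code; the values are hypothetical):

    # In this sketch, a template referencing a wildcard the rule no longer has
    # (e.g. dereper) would fail with an AttributeError at formatting time.
    class Wildcards:
        assembler = "megahit"
        binner_checkm = "metabat2"
        batchid = "0"

    template = ("logs/10.{rule}/{rule}.{wildcards.assembler}."
                "{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.o")
    print(template.format(rule="taxonomic_gtdbtk", wildcards=Wildcards, jobid=42))
    # logs/10.taxonomic_gtdbtk/taxonomic_gtdbtk.megahit.metabat2.0.42.o
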
6 changes: 4 additions & 2 deletions metapi/rules/checkm.smk
@@ -134,8 +134,10 @@ rule checkm_report:
gene_table = pd.read_csv(input.gene_table, sep="\t")
mags_report = metapi.extract_mags_report(input.mags_report)

genomes_info = pd.merge(mags_report, gene_table, how="inner", on=["bin_id", "bin_file"])\
.merge(checkm_table, how="inner", on="bin_id")
genomes_info = pd.merge(
mags_report, gene_table,
how="inner", on=["bin_id", "bin_file"])\
.merge(checkm_table, how="inner", on="bin_id")

genomes_info["genome"] = genomes_info["bin_id"] + ".fa.gz"
genomes_info.to_csv(output.genomes_info, sep='\t', index=False)
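
A small sketch (hypothetical one-row tables) of the join performed in checkm_report: the MAG report and gene table are inner-joined on bin_id and bin_file, then joined with the CheckM table on bin_id, and a genome file name is derived from bin_id.

    import pandas as pd

    mags_report = pd.DataFrame({"bin_id": ["bin.1"], "bin_file": ["bin.1.fa.gz"], "length": [2_100_000]})
    gene_table = pd.DataFrame({"bin_id": ["bin.1"], "bin_file": ["bin.1.fa.gz"], "best_translation_table": [11]})
    checkm_table = pd.DataFrame({"bin_id": ["bin.1"], "completeness": [97.2], "contamination": [1.1]})

    genomes_info = pd.merge(
        mags_report, gene_table,
        how="inner", on=["bin_id", "bin_file"]).merge(
            checkm_table, how="inner", on="bin_id")

    genomes_info["genome"] = genomes_info["bin_id"] + ".fa.gz"
    print(genomes_info)
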
10 changes: 5 additions & 5 deletions metapi/rules/databases.smk
@@ -1,11 +1,11 @@
rule databases_bacteriome_refine_taxonomy:
input:
genomes_info = os.path.join(
config["output"]["check"],
"report/checkm/checkm_table_genomes_info.{assembler}.all.tsv"),
rep_genomes_info = os.path.join(
config["output"]["dereplicate"],
"report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz"),
table_gtdb = os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.gtdb.tsv")
"report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv")
output:
taxonomy = os.path.join(
config["output"]["databases"],
@@ -25,7 +25,7 @@ rule databases_bacteriome_refine_taxonomy:
run:
import shutil

shutil.rmtree(params.out_dir)
shutil.rmtree(params.out_dir)

metapi.refine_taxonomy(
input.genomes_info,
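
One note on the run block above: shutil.rmtree raises FileNotFoundError when the target directory does not exist. A defensive variant (an illustration only, not what the rule does) ignores a missing directory:

    import shutil

    out_dir = "databases/bacteriome"  # hypothetical path

    # ignore_errors=True makes the call a no-op when out_dir is absent.
    shutil.rmtree(out_dir, ignore_errors=True)
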
2 changes: 1 addition & 1 deletion metapi/rules/predict_mags.smk
@@ -50,7 +50,7 @@ rule predict_mags_gene_prodigal_report:
checkm_df_list = []
for binner_checkm in BINNERS_CHECKM:
checkm_df = ASSEMBLY_GROUPS.copy()
checkm_df["binner_checkm"] = binner_checkm
checkm_df["binner_checkm"] = binner_checkm
checkm_df_list.append(checkm_df)
CHECKM_GROUPS = pd.concat(checkm_df_list, axis=0)

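A minimal sketch (hypothetical values) of how CHECKM_GROUPS is assembled above: one copy of the assembly-group table per CheckM-eligible binner, stacked into a single table so later rules can expand over both assembler and binner_checkm.

    import pandas as pd

    ASSEMBLY_GROUPS = pd.DataFrame({"assembly_group": ["g1", "g2"], "assembler": ["megahit", "megahit"]})
    BINNERS_CHECKM = ["metabat2", "concoct"]

    checkm_df_list = []
    for binner_checkm in BINNERS_CHECKM:
        checkm_df = ASSEMBLY_GROUPS.copy()
        checkm_df["binner_checkm"] = binner_checkm
        checkm_df_list.append(checkm_df)

    CHECKM_GROUPS = pd.concat(checkm_df_list, axis=0)
    print(CHECKM_GROUPS)  # four rows: every assembly group paired with every binner
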
83 changes: 56 additions & 27 deletions metapi/rules/taxonomic.smk
@@ -1,12 +1,12 @@
checkpoint taxonomic_gtdbtk_prepare:
input:
rep_genomes_info = os.path.join(
config["output"]["dereplicate"],
"report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz")
config["output"]["check"],
"report/checkm/checkm_table_{assembler}_{binner_checkm}.tsv.gz")
output:
mags_dir = directory(os.path.join(
config["output"]["taxonomic"],
"mags_input/{assembler}.{dereper}"))
"mags_input/{assembler}.{binner_checkm}"))
params:
batch_num = config["params"]["taxonomic"]["gtdbtk"]["batch_num"]
run:
@@ -21,26 +21,29 @@ rule taxonomic_gtdbtk:
input:
mags_input = os.path.join(
config["output"]["taxonomic"],
"mags_input/{assembler}.{dereper}/mags_input.{batchid}.tsv"),
"mags_input/{assembler}.{binner_checkm}/mags_input.{batchid}.tsv"),
gtdb_data_path = expand(os.path.join(
config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"], "{gtdbtk_dir}"),
gtdbtk_dir = [
"fastani", "markers", "masks", "metadata",
"mrca_red", "msa", "pplacer", "radii", "taxonomy"])
output:
os.path.join(
done = os.path.join(
config["output"]["taxonomic"],
"table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/gtdbtk_done"),
mash_db = os.path.join(
config["output"]["taxonomic"],
"table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done")
"table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/msh")
wildcard_constraints:
batchid = "\d+"
log:
os.path.join(
config["output"]["taxonomic"],
"logs/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.log")
"logs/taxonomic_gtdbtk/{assembler}.{binner_checkm}.{batchid}.log")
benchmark:
os.path.join(
config["output"]["taxonomic"],
"benchmark/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.txt")
wildcard_constraints:
batchid="\d+"
"benchmark/taxonomic_gtdbtk/{assembler}.{binner_checkm}.{batchid}.txt")
params:
gtdb_data_path = config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"],
pplacer_threads = config["params"]["taxonomic"]["gtdbtk"]["pplacer_threads"]
@@ -52,15 +55,20 @@ rule taxonomic_gtdbtk:
'''
export GTDBTK_DATA_PATH={params.gtdb_data_path}
outdir=$(dirname {output})
#rm -rf $outdir
outdir=$(dirname {output.done})
rm -rf $outdir
gtdbtk classify_wf \
--batchfile {input.mags_input} \
--out_dir $outdir \
--extension gz \
--cpus {threads} \
--pplacer_cpus {params.pplacer_threads} \
--mash_db {output.mash_db} \
--keep_intermediates \
--write_single_copy_genes \
--skip_ani_screen \
--force \
> {log} 2>&1
if [ -f $outdir/classify/gtdbtk.bac120.summary.tsv ];
@@ -95,13 +103,13 @@ rule taxonomic_gtdbtk:


def aggregate_gtdbtk_report_input(wildcards):
checkpoint_output = checkpoints.taxonomic_gtdbtk_prepare.get(**wildcards).output[0]
checkpoint_output = checkpoints.taxonomic_gtdbtk_prepare.get(**wildcards).output.mags_dir

return expand(os.path.join(
config["output"]["taxonomic"],
"table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done"),
"table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/gtdbtk_done"),
assembler=wildcards.assembler,
dereper=wildcards.dereper,
binner_checkm=wildcards.binner_checkm,
batchid=list(set([i.split("/")[0] \
for i in glob_wildcards(
os.path.join(checkpoint_output,
@@ -110,20 +118,17 @@ def aggregate_gtdbtk_report_input(wildcards):

rule taxonomic_gtdbtk_report:
input:
gtdb_done = aggregate_gtdbtk_report_input,
rep_genomes_info = os.path.join(
config["output"]["dereplicate"],
"report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz")
gtdb_done = aggregate_gtdbtk_report_input
output:
table_gtdb = os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.gtdb.tsv"),
"report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.gtdb.tsv"),
table_ncbi = os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.ncbi.tsv"),
"report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.ncbi.tsv"),
table_all = os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.all.tsv")
"report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.all.tsv")
params:
metadata_archaea = config["params"]["taxonomic"]["gtdbtk"]["metadata_archaea"],
metadata_bacteria = config["params"]["taxonomic"]["gtdbtk"]["metadata_bacteria"],
@@ -136,16 +141,40 @@ rule taxonomic_gtdbtk_report:
"../wrappers/gtdbtk_postprocess.py"


rule taxonomic_gtdbtk_report_merge:
input:
expand(os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.{{assembler}}.{binner_checkm}.gtdbtk.gtdb.tsv"),
binner_checkm=BINNERS_CHECKM)
output:
os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv")
shell:
'''
head -1 {input[0]} > {output}
for report in {input}
do
tail -q -n +2 $report >> {output}
done
'''
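
A rough Python equivalent (hypothetical file names, not part of the commit) of the head/tail merge in taxonomic_gtdbtk_report_merge: keep one header and append the data rows of every per-binner GTDB table.

    import pandas as pd

    inputs = [
        "MAGs_hmq.megahit.metabat2.gtdbtk.gtdb.tsv",
        "MAGs_hmq.megahit.concoct.gtdbtk.gtdb.tsv",
    ]

    # pd.concat aligns on column names, matching the shell version's assumption
    # that all per-binner tables share the same header.
    merged = pd.concat([pd.read_csv(f, sep="\t") for f in inputs], axis=0)
    merged.to_csv("MAGs_hmq_megahit_all_gtdbtk_gtdb.tsv", sep="\t", index=False)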


if config["params"]["taxonomic"]["gtdbtk"]["do"]:
rule taxonomic_gtdbtk_all:
input:
expand(os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.{system}.tsv"),
expand([
os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.{system}.tsv"),
os.path.join(
config["output"]["taxonomic"],
"report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv")],
system=["gtdb", "ncbi", "all"],
assembler=ASSEMBLERS,
dereper=DEREPERS,
binner_checkm=BINNERS_CHECKM),
binner_checkm=BINNERS_CHECKM)

else:
rule taxonomic_gtdbtk_all:
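
Looking back at aggregate_gtdbtk_report_input above, an approximate plain-Python illustration (hypothetical paths; the real code uses Snakemake's glob_wildcards and expand) of what it does once the prepare checkpoint has run: discover which batch files were written and return one expected gtdbtk_done path per batch id.

    import glob
    import os
    import re

    checkpoint_output = "taxonomic/mags_input/megahit.metabat2"

    # Collect the numeric batch ids of the batch files the checkpoint wrote.
    batchids = sorted({
        re.search(r"mags_input\.(\d+)\.tsv$", p).group(1)
        for p in glob.glob(os.path.join(checkpoint_output, "mags_input.*.tsv"))
    })

    # One gtdbtk_done sentinel is expected per batch.
    done_files = [
        f"taxonomic/table/gtdbtk/gtdbtk.out.megahit.metabat2.{batchid}/gtdbtk_done"
        for batchid in batchids
    ]
    print(done_files)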