diff --git a/metapi/checkmer.py b/metapi/checkmer.py index 98a720f..98edfdb 100755 --- a/metapi/checkmer.py +++ b/metapi/checkmer.py @@ -13,17 +13,19 @@ def checkm_prepare(gene_table, batch_num, mags_dir): os.makedirs(mags_dir, exist_ok=True) table_df = pd.read_csv(gene_table, sep="\t") - table_df = table_df.sort_values(by="bin_id", - key=lambda x: np.argsort(index_natsorted(table_df["bin_id"])))\ - .reset_index(drop=True) + table_df = table_df.sort_values( + by="bin_id", + key=lambda x: np.argsort( + index_natsorted(table_df["bin_id"]))).reset_index(drop=True) batchid = -1 if len(table_df) > 0: for batch in range(0, len(table_df), batch_num): batchid += 1 table_split = table_df.iloc[batch:batch+batch_num, ] - table_split.to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), - sep="\t", index=False, header=None) + table_split.to_csv( + os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), + sep="\t", index=False, header=None) else: subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True) diff --git a/metapi/classifier.py b/metapi/classifier.py index d77dea8..2829d03 100755 --- a/metapi/classifier.py +++ b/metapi/classifier.py @@ -13,7 +13,9 @@ def gtdbtk_prepare_from_mags(rep_table, batch_num, mags_dir): os.makedirs(mags_dir, exist_ok=True) - table_df = pd.read_csv(rep_table, sep="\t") + table_df = pd.read_csv(rep_table, sep="\t")\ + .query('MIMAG_quality_level!="low_quality"')\ + .reset_index(drop=True) batchid = -1 if len(table_df) > 0: @@ -25,12 +27,13 @@ def gtdbtk_prepare_from_mags(rep_table, batch_num, mags_dir): genome_index = columns_list.index("genome") best_translation_table_index = columns_list.index("best_translation_table") - table_split = table_df.iloc[batch:batch+batch_num, - [bin_file_index, genome_index, best_translation_table_index]] + table_split = table_df.iloc[ + batch:batch+batch_num, + [bin_file_index, genome_index, best_translation_table_index]] - table_split\ - .to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), - sep="\t", index=False, header=None) + table_split.to_csv( + os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), + sep="\t", index=False, header=None) else: subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True) @@ -54,12 +57,12 @@ def gtdbtk_prepare_from_genes(rep_table, batch_num, mags_dir): table_split["pep_location"] = table_split.apply(lambda x: os.path.splitext(x["pep_file"])[0], axis=1) table_split["pep_basename"] = table_split.apply(lambda x: os.path.basename(x["pep_location"]), axis=1) - + table_split\ .loc[:, ["pep_location", "pep_basename", "best_translation_table"]]\ .to_csv(os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), sep="\t", index=False, header=None) - pepcount = 0 + pepcount = 0 for pep_file in table_split["pep_file"]: pep_file_ = os.path.splitext(pep_file)[0] if not os.path.exists(pep_file_): diff --git a/metapi/profiles/sge/cluster.yaml b/metapi/profiles/sge/cluster.yaml index dbef32f..646cbc3 100644 --- a/metapi/profiles/sge/cluster.yaml +++ b/metapi/profiles/sge/cluster.yaml @@ -507,8 +507,8 @@ dereplicate_vmags_postprocess: taxonomic_gtdbtk: #partition: "hugemem" mem: "50G" - output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.o" - error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.e" + output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.o" + error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.e" databases_bacteriome_refine_taxonomy: mem: "1G" diff --git a/metapi/profiles/slurm/cluster.yaml b/metapi/profiles/slurm/cluster.yaml index a5f88e0..b69297a 100644 --- a/metapi/profiles/slurm/cluster.yaml +++ b/metapi/profiles/slurm/cluster.yaml @@ -529,8 +529,8 @@ dereplicate_vmags_postprocess: taxonomic_gtdbtk: #partition: "hugemem" mem: "50G" - output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.o" - error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.dereper}.{wildcards.batchid}.{jobid}.e" + output: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.o" + error: "logs/10.{rule}/{rule}.{wildcards.assembler}.{wildcards.binner_checkm}.{wildcards.batchid}.{jobid}.e" databases_bacteriome_refine_taxonomy: mem: "1G" diff --git a/metapi/rules/checkm.smk b/metapi/rules/checkm.smk index 009bd2d..27326b6 100644 --- a/metapi/rules/checkm.smk +++ b/metapi/rules/checkm.smk @@ -134,8 +134,10 @@ rule checkm_report: gene_table = pd.read_csv(input.gene_table, sep="\t") mags_report = metapi.extract_mags_report(input.mags_report) - genomes_info = pd.merge(mags_report, gene_table, how="inner", on=["bin_id", "bin_file"])\ - .merge(checkm_table, how="inner", on="bin_id") + genomes_info = pd.merge( + mags_report, gene_table, + how="inner", on=["bin_id", "bin_file"])\ + .merge(checkm_table, how="inner", on="bin_id") genomes_info["genome"] = genomes_info["bin_id"] + ".fa.gz" genomes_info.to_csv(output.genomes_info, sep='\t', index=False) diff --git a/metapi/rules/databases.smk b/metapi/rules/databases.smk index 0f51c17..b49982e 100644 --- a/metapi/rules/databases.smk +++ b/metapi/rules/databases.smk @@ -1,11 +1,11 @@ rule databases_bacteriome_refine_taxonomy: input: - genomes_info = os.path.join( - config["output"]["check"], - "report/checkm/checkm_table_genomes_info.{assembler}.all.tsv"), + rep_genomes_info = os.path.join( + config["output"]["dereplicate"], + "report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz"), table_gtdb = os.path.join( config["output"]["taxonomic"], - "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.gtdb.tsv") + "report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv") output: taxonomy = os.path.join( config["output"]["databases"], @@ -25,7 +25,7 @@ rule databases_bacteriome_refine_taxonomy: run: import shutil - shutil.rmtree(params.out_dir) + shutil.rmtree(params.out_dir) metapi.refine_taxonomy( input.genomes_info, diff --git a/metapi/rules/predict_mags.smk b/metapi/rules/predict_mags.smk index 6756820..079dfbc 100644 --- a/metapi/rules/predict_mags.smk +++ b/metapi/rules/predict_mags.smk @@ -50,7 +50,7 @@ rule predict_mags_gene_prodigal_report: checkm_df_list = [] for binner_checkm in BINNERS_CHECKM: checkm_df = ASSEMBLY_GROUPS.copy() - checkm_df["binner_checkm"] = binner_checkm + checkm_df["binner_checkm"] = binner_checkm checkm_df_list.append(checkm_df) CHECKM_GROUPS = pd.concat(checkm_df_list, axis=0) diff --git a/metapi/rules/taxonomic.smk b/metapi/rules/taxonomic.smk index a063228..e07e16d 100644 --- a/metapi/rules/taxonomic.smk +++ b/metapi/rules/taxonomic.smk @@ -1,12 +1,12 @@ checkpoint taxonomic_gtdbtk_prepare: input: rep_genomes_info = os.path.join( - config["output"]["dereplicate"], - "report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz") + config["output"]["check"], + "report/checkm/checkm_table_{assembler}_{binner_checkm}.tsv.gz") output: mags_dir = directory(os.path.join( config["output"]["taxonomic"], - "mags_input/{assembler}.{dereper}")) + "mags_input/{assembler}.{binner_checkm}")) params: batch_num = config["params"]["taxonomic"]["gtdbtk"]["batch_num"] run: @@ -21,26 +21,29 @@ rule taxonomic_gtdbtk: input: mags_input = os.path.join( config["output"]["taxonomic"], - "mags_input/{assembler}.{dereper}/mags_input.{batchid}.tsv"), + "mags_input/{assembler}.{binner_checkm}/mags_input.{batchid}.tsv"), gtdb_data_path = expand(os.path.join( config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"], "{gtdbtk_dir}"), gtdbtk_dir = [ "fastani", "markers", "masks", "metadata", "mrca_red", "msa", "pplacer", "radii", "taxonomy"]) output: - os.path.join( + done = os.path.join( + config["output"]["taxonomic"], + "table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/gtdbtk_done"), + mash_db = os.path.join( config["output"]["taxonomic"], - "table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done") + "table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/msh") + wildcard_constraints: + batchid = "\d+" log: os.path.join( config["output"]["taxonomic"], - "logs/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.log") + "logs/taxonomic_gtdbtk/{assembler}.{binner_checkm}.{batchid}.log") benchmark: os.path.join( config["output"]["taxonomic"], - "benchmark/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.txt") - wildcard_constraints: - batchid="\d+" + "benchmark/taxonomic_gtdbtk/{assembler}.{binner_checkm}.{batchid}.txt") params: gtdb_data_path = config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"], pplacer_threads = config["params"]["taxonomic"]["gtdbtk"]["pplacer_threads"] @@ -52,8 +55,8 @@ rule taxonomic_gtdbtk: ''' export GTDBTK_DATA_PATH={params.gtdb_data_path} - outdir=$(dirname {output}) - #rm -rf $outdir + outdir=$(dirname {output.done}) + rm -rf $outdir gtdbtk classify_wf \ --batchfile {input.mags_input} \ @@ -61,6 +64,11 @@ rule taxonomic_gtdbtk: --extension gz \ --cpus {threads} \ --pplacer_cpus {params.pplacer_threads} \ + --mash_db {output.mash_db} \ + --keep_intermediates \ + --write_single_copy_genes \ + --skip_ani_screen \ + --force \ > {log} 2>&1 if [ -f $outdir/classify/gtdbtk.bac120.summary.tsv ]; @@ -95,13 +103,13 @@ rule taxonomic_gtdbtk: def aggregate_gtdbtk_report_input(wildcards): - checkpoint_output = checkpoints.taxonomic_gtdbtk_prepare.get(**wildcards).output[0] + checkpoint_output = checkpoints.taxonomic_gtdbtk_prepare.get(**wildcards).output.mags_dir return expand(os.path.join( config["output"]["taxonomic"], - "table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done"), + "table/gtdbtk/gtdbtk.out.{assembler}.{binner_checkm}.{batchid}/gtdbtk_done"), assembler=wildcards.assembler, - dereper=wildcards.dereper, + binner_checkm=wildcards.binner_checkm, batchid=list(set([i.split("/")[0] \ for i in glob_wildcards( os.path.join(checkpoint_output, @@ -110,20 +118,17 @@ def aggregate_gtdbtk_report_input(wildcards): rule taxonomic_gtdbtk_report: input: - gtdb_done = aggregate_gtdbtk_report_input, - rep_genomes_info = os.path.join( - config["output"]["dereplicate"], - "report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz") + gtdb_done = aggregate_gtdbtk_report_input output: table_gtdb = os.path.join( config["output"]["taxonomic"], - "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.gtdb.tsv"), + "report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.gtdb.tsv"), table_ncbi = os.path.join( config["output"]["taxonomic"], - "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.ncbi.tsv"), + "report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.ncbi.tsv"), table_all = os.path.join( config["output"]["taxonomic"], - "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.all.tsv") + "report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.all.tsv") params: metadata_archaea = config["params"]["taxonomic"]["gtdbtk"]["metadata_archaea"], metadata_bacteria = config["params"]["taxonomic"]["gtdbtk"]["metadata_bacteria"], @@ -136,16 +141,40 @@ rule taxonomic_gtdbtk_report: "../wrappers/gtdbtk_postprocess.py" +rule taxonomic_gtdbtk_report_merge: + input: + expand(os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.{{assembler}}.{binner_checkm}.gtdbtk.gtdb.tsv"), + binner_checkm=BINNERS_CHECKM) + output: + os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv") + shell: + ''' + head -1 {input[0]} > {output} + + for report in {input} + do + tail -q -n +2 $report >> {output} + done + ''' + + if config["params"]["taxonomic"]["gtdbtk"]["do"]: rule taxonomic_gtdbtk_all: input: - expand(os.path.join( - config["output"]["taxonomic"], - "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.{system}.tsv"), + expand([ + os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.{assembler}.{binner_checkm}.gtdbtk.{system}.tsv"), + os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq_{assembler}_all_gtdbtk_gtdb.tsv")], system=["gtdb", "ncbi", "all"], assembler=ASSEMBLERS, - dereper=DEREPERS, - binner_checkm=BINNERS_CHECKM), + binner_checkm=BINNERS_CHECKM) else: rule taxonomic_gtdbtk_all: diff --git a/metapi/rules/taxonomic_rep.smk b/metapi/rules/taxonomic_rep.smk new file mode 100644 index 0000000..a063228 --- /dev/null +++ b/metapi/rules/taxonomic_rep.smk @@ -0,0 +1,199 @@ +checkpoint taxonomic_gtdbtk_prepare: + input: + rep_genomes_info = os.path.join( + config["output"]["dereplicate"], + "report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz") + output: + mags_dir = directory(os.path.join( + config["output"]["taxonomic"], + "mags_input/{assembler}.{dereper}")) + params: + batch_num = config["params"]["taxonomic"]["gtdbtk"]["batch_num"] + run: + shell("rm -rf {output.mags_dir}") + metapi.gtdbtk_prepare_from_mags( + input.rep_genomes_info, + params.batch_num, + output.mags_dir) + + +rule taxonomic_gtdbtk: + input: + mags_input = os.path.join( + config["output"]["taxonomic"], + "mags_input/{assembler}.{dereper}/mags_input.{batchid}.tsv"), + gtdb_data_path = expand(os.path.join( + config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"], "{gtdbtk_dir}"), + gtdbtk_dir = [ + "fastani", "markers", "masks", "metadata", + "mrca_red", "msa", "pplacer", "radii", "taxonomy"]) + output: + os.path.join( + config["output"]["taxonomic"], + "table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done") + log: + os.path.join( + config["output"]["taxonomic"], + "logs/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.log") + benchmark: + os.path.join( + config["output"]["taxonomic"], + "benchmark/taxonomic_gtdbtk/{assembler}.{dereper}.{batchid}.txt") + wildcard_constraints: + batchid="\d+" + params: + gtdb_data_path = config["params"]["taxonomic"]["gtdbtk"]["gtdb_data_path"], + pplacer_threads = config["params"]["taxonomic"]["gtdbtk"]["pplacer_threads"] + threads: + config["params"]["taxonomic"]["threads"] + conda: + config["envs"]["gtdbtk"] + shell: + ''' + export GTDBTK_DATA_PATH={params.gtdb_data_path} + + outdir=$(dirname {output}) + #rm -rf $outdir + + gtdbtk classify_wf \ + --batchfile {input.mags_input} \ + --out_dir $outdir \ + --extension gz \ + --cpus {threads} \ + --pplacer_cpus {params.pplacer_threads} \ + > {log} 2>&1 + + if [ -f $outdir/classify/gtdbtk.bac120.summary.tsv ]; + then + pushd $outdir + ln -s classify/gtdbtk.bac120.summary.tsv gtdbtk.bacteria.summary.tsv + popd + else + pushd $outdir + touch gtdbtk.bacteria.summary.tsv + popd + fi + + if [ -f $outdir/classify/gtdbtk.ar53.summary.tsv ]; + then + pushd $outdir + ln -s classify/gtdbtk.ar53.summary.tsv gtdbtk.archaea.summary.tsv + popd + elif [ -f $outdir/classify/gtdbtk.ar122.summary.tsv ]; + then + pushd $outdir + ln -s classify/gtdbtk.ar122.summary.tsv gtdbtk.archaea.summary.tsv + popd + else + pushd $outdir + touch gtdbtk.archaea.summary.tsv + popd + fi + + touch {output} + ''' + + +def aggregate_gtdbtk_report_input(wildcards): + checkpoint_output = checkpoints.taxonomic_gtdbtk_prepare.get(**wildcards).output[0] + + return expand(os.path.join( + config["output"]["taxonomic"], + "table/gtdbtk/gtdbtk.out.{assembler}.{dereper}.{batchid}/gtdbtk_done"), + assembler=wildcards.assembler, + dereper=wildcards.dereper, + batchid=list(set([i.split("/")[0] \ + for i in glob_wildcards( + os.path.join(checkpoint_output, + "mags_input.{batchid}.tsv")).batchid]))) + + +rule taxonomic_gtdbtk_report: + input: + gtdb_done = aggregate_gtdbtk_report_input, + rep_genomes_info = os.path.join( + config["output"]["dereplicate"], + "report/bacteriome/checkm_table_genomes_info.{assembler}.{dereper}.tsv.gz") + output: + table_gtdb = os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.gtdb.tsv"), + table_ncbi = os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.ncbi.tsv"), + table_all = os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.all.tsv") + params: + metadata_archaea = config["params"]["taxonomic"]["gtdbtk"]["metadata_archaea"], + metadata_bacteria = config["params"]["taxonomic"]["gtdbtk"]["metadata_bacteria"], + gtdb_to_ncbi_script = config["params"]["taxonomic"]["gtdbtk"]["gtdb_to_ncbi_script"] + threads: + config["params"]["taxonomic"]["threads"] + conda: + config["envs"]["gtdbtk"] + script: + "../wrappers/gtdbtk_postprocess.py" + + +if config["params"]["taxonomic"]["gtdbtk"]["do"]: + rule taxonomic_gtdbtk_all: + input: + expand(os.path.join( + config["output"]["taxonomic"], + "report/gtdbtk/MAGs_hmq.rep.{assembler}.{dereper}.gtdbtk.{system}.tsv"), + system=["gtdb", "ncbi", "all"], + assembler=ASSEMBLERS, + dereper=DEREPERS, + binner_checkm=BINNERS_CHECKM), + +else: + rule taxonomic_gtdbtk_all: + input: + + +#if config["params"]["taxonomic"]["genomad"]["do"]: +# rule taxonomic_genomad: +# input: +# rep_fna = os.path.join( +# config["output"]["dereplicate"], +# "genomes/virome/representative/vMAGs_hmq.{assembler}.rep.fa.gz"), +# db = config["params"]["taxonomic"]["genomad"]["genomad_db"] +# output: +# os.path.join(config["output"]["taxonomic"], "table/genomad/genomad.out.{assembler}/genomad_done") +# benchmark: +# +# log: +# +# params: +# min_score = config["params"]["taxonomic"]["genomad"]["min_score"], +# output_dir = os.path.join(config["output"]["taxonomic"], "table/genomad/genomad.out.{assembler}") +# conda: +# config["envs"]["genomad"] +# threads: +# config["params"]["taxonomic"]["threads"] +# shell: +# ''' +# genomad end-to-end \ +# --min-score {params.min_score} \ +# --cleanup \ +# --threads {threads}\ +# {input.rep_fna} \ +# {params.output_dir} \ +# {input.db} \ +# >{log} 2>&1 +# +# touch {output.genomad_done} +# ''' + + +rule taxonomic_all: + input: + rules.taxonomic_gtdbtk_all.input + + +localrules: + taxonomic_gtdbtk_prepare, + taxonomic_gtdbtk_report, + taxonomic_gtdbtk_all, + taxonomic_all \ No newline at end of file diff --git a/metapi/snakefiles/mag_wf.smk b/metapi/snakefiles/mag_wf.smk index e24c930..bc87df6 100644 --- a/metapi/snakefiles/mag_wf.smk +++ b/metapi/snakefiles/mag_wf.smk @@ -135,10 +135,10 @@ include: "../rules/predict_mags.smk" include: "../rules/annotation.smk" include: "../rules/checkm.smk" include: "../rules/checkv.smk" +include: "../rules/taxonomic.smk" include: "../rules/dereplicate_gene.smk" include: "../rules/dereplicate_mags.smk" include: "../rules/dereplicate_vmags.smk" -include: "../rules/taxonomic.smk" include: "../rules/databases.smk" include: "../rules/upload.smk" diff --git a/metapi/taxonomyer.py b/metapi/taxonomyer.py index a12eb53..7847108 100755 --- a/metapi/taxonomyer.py +++ b/metapi/taxonomyer.py @@ -119,7 +119,7 @@ def update_genomes(rep_info, scaftigs_info): rc_id = rc.id rc.id = contig_name rc.description = f"{genome_id}|original_contig_id={rc_id}|original_bin_id={bin_id}|gtdb_classification={clade_lineage}|completeness={completeness}|contamination={contamination}|strain_heterogeneity={strain_heterogeneity}|quality_score={quality_score}" - SeqIO.write(rc, oh2, "fasta") + SeqIO.write(rc, oh2, "fasta") handle.close() @@ -131,16 +131,16 @@ def refine_taxonomy(genomes_info_f, tax_info_f, map_name, rep_level, base_dir, o tax_info = pd.read_csv(tax_info_f, sep="\t").rename(columns={"user_genome": "genome"}) rep_info = pd.merge(genomes_info, tax_info, how="inner", on=["genome"])\ - .sort_values(["classification", "genome"])\ - .reset_index(drop=True) + .sort_values(["classification", "genome"])\ + .reset_index(drop=True) # step 1: set genomes rep_info = set_genomes(rep_info, map_name, base_dir) # step 2: set lineages rep_info[["kingdom", "phylum", "class", "order", - "family", "genus", "species", "strain", - "lineage"]] = rep_info.apply( + "family", "genus", "species", "strain", + "lineage"]] = rep_info.apply( lambda x: set_lineages( x["genome_id"], x["classification"], rep_level), axis = 1, result_type = "expand") diff --git a/metapi/wrappers/gtdbtk_postprocess.py b/metapi/wrappers/gtdbtk_postprocess.py index d04182e..307bb49 100755 --- a/metapi/wrappers/gtdbtk_postprocess.py +++ b/metapi/wrappers/gtdbtk_postprocess.py @@ -39,7 +39,6 @@ def merge(input_list, func, workers, **kwargs): threads = int(snakemake.threads) gtdb_done_list = snakemake.input["gtdb_done"] -rep_genomes_info = snakemake.input["rep_genomes_info"] gtdb_to_ncbi_script = snakemake.params["gtdb_to_ncbi_script"] metadata_archaea = snakemake.params["metadata_archaea"] @@ -105,12 +104,8 @@ def merge(input_list, func, workers, **kwargs): table_ncbi_df = table_ncbi_df.rename(columns={"Genome ID": "user_genome"}) pprint(table_ncbi_df) -table_rep_genomes_info = pd.read_csv(rep_genomes_info, sep="\t")\ - .rename(columns={"genome": "user_genome"}) -pprint(table_rep_genomes_info) - -table_all_df = pd.merge(table_gtdb_df, table_ncbi_df, how="inner", - on=["user_genome", "GTDB classification"])#\ - #.merge(table_rep_genomes_info, how="inner", on="user_genome") +table_all_df = pd.merge( + table_gtdb_df, table_ncbi_df, how="inner", + on=["user_genome", "GTDB classification"])#\ table_all_df.to_csv(table_all, sep="\t", index=False)