From c87e6c2cb2c8160d02617105732c58121a65285b Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 16 Jan 2024 20:32:48 -0600 Subject: [PATCH 1/8] Fix length error --- extend.py | 2 +- partition.py | 6 +++--- src/parsnp.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/extend.py b/extend.py index 73a28d6..104a6ab 100644 --- a/extend.py +++ b/extend.py @@ -68,7 +68,7 @@ def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index): for aln in tqdm(AlignIO.parse(xmfa_file, "mauve")): for seq in aln: # Skip reference for now... - aln_len = seq.annotations["end"] - seq.annotations["start"] + 1 + aln_len = seq.annotations["end"] - seq.annotations["start"] cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()] gid = index_to_gid[seq.name] diff --git a/partition.py b/partition.py index 52f622e..682e969 100644 --- a/partition.py +++ b/partition.py @@ -69,7 +69,7 @@ def get_interval(aln: MultipleSeqAlignment) -> Tuple[int, IntervalType]: """ seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*') seq = aln[0] - aln_len = seq.annotations["end"] - seq.annotations["start"] + 1 + aln_len = seq.annotations["end"] - seq.annotations["start"] cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()] if seq.annotations["strand"] == -1: @@ -122,7 +122,7 @@ def trim(aln: MultipleSeqAlignment, # Look for the record in the LCB that represents the reference genome for rec in aln: if int(rec.name) == seqidx: - aln_len = rec.annotations["end"] - rec.annotations["start"] + 1 + aln_len = rec.annotations["end"] - rec.annotations["start"] cluster_idx, contig_idx, super_startpos = [int(x) for x in seqid_parser.match(rec.id).groups()] if rec.annotations["strand"] == -1: @@ -185,7 +185,7 @@ def trim(aln: MultipleSeqAlignment, # orig_seq = copy.deepcopy(seq) new_rec = aln_seqs[seq_idx] - aln_len = rec.annotations["end"] - rec.annotations["start"] + 1 + aln_len = rec.annotations["end"] - rec.annotations["start"] cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(rec.id).groups()] left_bases_trim = 0 right_bases_trim = 0 diff --git a/src/parsnp.cpp b/src/parsnp.cpp index 004489a..019233f 100755 --- a/src/parsnp.cpp +++ b/src/parsnp.cpp @@ -976,18 +976,18 @@ void Aligner::writeOutput(string psnp,vector& coveragerow) if ( ct.mums.at(0).isforward.at(i) ) { - xmfafile << "> " << i+1 << ":" << ct.start.at(i)+1 << "-" << ct.end.at(i)-1 << " "; + xmfafile << "> " << i+1 << ":" << ct.start.at(i)+1 << "-" << ct.end.at(i) << " "; if (recomb_filter) { - clcbfile << "> " << i+1 << ":" << ct.start.at(i)+1 << "-" << ct.end.at(i)-1 << " "; + clcbfile << "> " << i+1 << ":" << ct.start.at(i)+1 << "-" << ct.end.at(i) << " "; } } else { - xmfafile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 << "-" << ct.mums.front().end.at(i)-1 << " "; + xmfafile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 << "-" << ct.mums.front().end.at(i) << " "; if (recomb_filter) { - clcbfile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 << "-" << ct.mums.front().end.at(i)-1 << " "; + clcbfile << "> " << i+1 << ":" << ct.mums.back().start.at(i)+1 << "-" << ct.mums.front().end.at(i) << " "; } } bool hit1 = false; From d322e9fc85213629e9da7fec318733a537572853 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 16 Jan 2024 20:35:20 -0600 Subject: [PATCH 2/8] Only remove tmp, log, and config directories from used output folder --- parsnp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/parsnp b/parsnp index 391cdf7..ce8cce5 100755 --- a/parsnp +++ b/parsnp @@ -685,15 +685,20 @@ def create_output_directory(output_dir): if os.path.exists(output_dir): logger.warning(f"Output directory {output_dir} exists, all results will be overwritten") - shutil.rmtree(output_dir) + if os.path.exists(output_dir + "/partition"): + shutil.rmtree(output_dir + "/partition/") + if os.path.exists(output_dir + "/config/"): + shutil.rmtree(output_dir + "/config/") + if os.path.exists(output_dir + "/log/"): + shutil.rmtree(output_dir + "/log/") elif output_dir == "[P_CURRDATE_CURRTIME]": today = datetime.datetime.now() timestamp = "P_" + today.isoformat().replace("-", "_").replace(".", "").replace(":", "").replace("T", "_") output_dir = os.getcwd() + os.sep + timestamp - os.makedirs(output_dir) - os.makedirs(os.path.join(output_dir, "tmp")) - os.makedirs(os.path.join(output_dir, "log")) - os.makedirs(os.path.join(output_dir, "config")) + os.makedirs(output_dir, exist_ok=True) + os.makedirs(os.path.join(output_dir, "tmp"), exist_ok=True) + os.makedirs(os.path.join(output_dir, "log"), exist_ok=True) + os.makedirs(os.path.join(output_dir, "config"), exist_ok=True) return output_dir From f456d3f3d7444e597abb7c7f8fed24169b64c1b1 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Tue, 16 Jan 2024 20:35:39 -0600 Subject: [PATCH 3/8] Working extension --- extend.py | 62 +++++++++++++++++++++++++++++++++++------------------- parsnp | 63 +++++++++++++++++++------------------------------------ 2 files changed, 62 insertions(+), 63 deletions(-) diff --git a/extend.py b/extend.py index 104a6ab..693c97a 100644 --- a/extend.py +++ b/extend.py @@ -8,6 +8,7 @@ import re import subprocess from collections import namedtuple, defaultdict, Counter +import bisect import os from Bio.Align import substitution_matrices from itertools import product, combinations @@ -15,7 +16,9 @@ from Bio.AlignIO.MafIO import MafWriter, MafIterator from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator from logger import logger +from tqdm import tqdm import time +import spoa #%% @@ -46,24 +49,33 @@ def parse_xmfa_header(xmfa_file): return index_to_gid, gid_to_index -def index_input_sequences(xmfa_file, input_dir): +def index_input_sequences(xmfa_file, file_list): + basename_to_path = {} + for f in file_list: + basename = str(Path(f).stem) + basename_to_path[basename] = f gid_to_records = {} gid_to_cid_to_index = {} + gid_to_index_to_cid = {} with open(xmfa_file) as parsnp_fd: for line in (line.strip() for line in parsnp_fd): if line[:2] == "##": if line.startswith("##SequenceFile"): - p = Path(os.path.join(input_dir + line.split(' ')[1])) - gid_to_records[p.stem] = {record.id: record for record in SeqIO.parse(str(p), "fasta")} - gid_to_cid_to_index[p.stem] = {idx+1: rec.id for (idx, rec) in enumerate(SeqIO.parse(str(p), "fasta"))} - return gid_to_records, gid_to_cid_to_index + basename = Path(line.split(' ')[1]).stem + p = Path(basename_to_path[basename]) + gid_to_records[p.stem] = {} + gid_to_cid_to_index[p.stem] = {} + gid_to_index_to_cid[p.stem] = {} + for idx, rec in enumerate(SeqIO.parse(str(p), "fasta")): + gid_to_records[p.stem][rec.id] = rec + gid_to_cid_to_index[p.stem][rec.id] = idx + 1 + gid_to_index_to_cid[p.stem][idx + 1] = rec.id + return gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid - -def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index): +def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_index_to_cid): seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*') idpair_to_segments = defaultdict(list) - idpair_to_tree = defaultdict(IntervalTree) cluster_to_named_segments = defaultdict(list) for aln in tqdm(AlignIO.parse(xmfa_file, "mauve")): for seq in aln: @@ -78,26 +90,25 @@ def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_cid_to_index): else: endpos = startpos + aln_len - idp = IdPair(gid, gid_to_cid_to_index[gid][contig_idx]) + idp = IdPair(gid, gid_to_index_to_cid[gid][contig_idx]) seg = Segment(idp, startpos, startpos + aln_len, seq.annotations["strand"]) idpair_to_segments[idp].append(seg) - idpair_to_tree[idp].addi(seg.start, seg.stop) cluster_to_named_segments[cluster_idx].append(seg) for idp in idpair_to_segments: idpair_to_segments[idp] = sorted(idpair_to_segments[idp]) - idpair_to_tree[idp].merge_overlaps() - return idpair_to_segments, idpair_to_tree, cluster_to_named_segments + return idpair_to_segments, cluster_to_named_segments def run_msa(downstream_segs_to_align, gid_to_records): keep_extending = True iteration = 0 - seq_len_desc = stats.describe([seg.stop - seg.start for seg in downstream_segs_to_align]) - longest_seq = seq_len_desc.minmax[1] + seq_lens = [seg.stop - seg.start for seg in downstream_segs_to_align] + longest_seq = max(seq_lens) + mean_seq_len = np.mean(seq_lens) if sum( - seq_len_desc.mean*(1 - length_window) <= (seg.stop - seg.start) <= seq_len_desc.mean*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop: - base_length = int(seq_len_desc.mean*(1 + length_window)) + mean_seq_len*(1 - length_window) <= (seg.stop - seg.start) <= mean_seq_len*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop: + base_length = int(mean_seq_len*(1 + length_window)) else: base_length = BASE_LENGTH @@ -131,7 +142,7 @@ def run_msa(downstream_segs_to_align, gid_to_records): return aligned_msa_seqs -def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segments, idpair_to_tree, cluster_to_named_segments, gid_to_records): +def extend_clusters(xmfa_file, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records): ret_lcbs = [] seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*') @@ -167,29 +178,36 @@ def extend_clusters(xmfa_file, index_to_gid, gid_to_cid_to_index, idpair_to_segm new_lcb = MultipleSeqAlignment([]) # Assumes alignments are always in the same order new_bp = [] - for seq_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)): + for seg_idx, (covered_seg, uncovered_seg, aln_str) in enumerate(zip(segs, downstream_segs_to_align, aligned_msa_seqs)): # Update segment in idpair_to_segments + if len(aln_str) < MIN_LEN: + continue new_bp_covered = len(aln_str) - aln_str.count("-") # print(f"Extending {covered_seg} by {new_bp_covered}") new_bp.append(new_bp_covered) new_seq = aln_str if covered_seg.strand == 1: new_seg = Segment(covered_seg.idp, uncovered_seg.start, uncovered_seg.start + new_bp_covered, covered_seg.strand) + if new_bp_covered > 0: + segs[seg_idx] = Segment(covered_seg.idp, covered_seg.start, new_seg.stop, covered_seg.strand) else: aln_str = Seq(aln_str).reverse_complement() new_seg = Segment(covered_seg.idp, covered_seg.start - new_bp_covered, covered_seg.start, covered_seg.strand) + if new_bp_covered > 0: + segs[seg_idx] = Segment(covered_seg.idp, new_seg.start, covered_seg.stop, covered_seg.strand) new_record = SeqRecord( seq=new_seq, - id=f"{covered_seg.idp.gid}#{covered_seg.idp.cid}", + id=f"cluster{cluster_idx} s{gid_to_cid_to_index[covered_seg.idp.gid][covered_seg.idp.cid]}:p{new_seg.start if new_seg.strand == 1 else new_seg.stop}", + name=gid_to_index[covered_seg.idp.gid], annotations={"start": new_seg.start, "end": new_seg.stop, "strand": new_seg.strand} ) + # if covered_seg.strand == 1: new_lcb.append(new_record) - if new_bp_covered > 0: - idpair_to_tree[covered_seg.idp].addi(new_seg.start, new_seg.stop) - ret_lcbs.append(new_lcb) + if len(new_lcb) > 0: + ret_lcbs.append(new_lcb) return ret_lcbs diff --git a/parsnp b/parsnp index ce8cce5..eebf303 100755 --- a/parsnp +++ b/parsnp @@ -1671,51 +1671,32 @@ SETTINGS: partition.merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads) - - run_lcb_trees = 0 + parsnp_output = f"{outputDir}/parsnp.xmfa" # This is the stuff for LCB extension: - annotation_dict = {} - #TODO always using xtrafast? - parsnp_output = f"{outputDir}/parsnp.xmfa" if args.extend_lcbs: - xmfa_file = f"{outputDir}/parsnp.xmfa" - with TemporaryDirectory() as temp_directory: - original_maf_file = f"{outputDir}/parsnp-original.maf" - extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa" - fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = ext.get_sequence_data( - ref, - finalfiles, - index_files=False) - fname_to_contigid_to_coords, fname_header_to_gcontigidx = ext.xmfa_to_maf( - xmfa_file, - original_maf_file, - fname_contigidx_to_header, - fname_contigid_to_length) - packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords) - fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result - cluster_files = glob(f"{temp_directory}/*.fasta") - clusterdir_expand, clusterdir_len = ext.get_new_extensions( - cluster_files, - args.match_score, - args.mismatch_penalty, - args.gap_penalty) - ext.write_extended_xmfa( - original_maf_file, - extended_xmfa_file, - temp_directory, - clusterdir_expand, - clusterdir_len, - fname_contigid_to_cluster_dir_to_length, - fname_contigid_to_cluster_dir_to_adjacent_cluster, - fname_header_to_gcontigidx, - fname_contigid_to_length, - args.extend_ani_cutoff, - args.extend_indel_cutoff, - threads) - parsnp_output = extended_xmfa_file - os.remove(original_maf_file) + import partition + import extend as ext + + orig_parsnp_xmfa = parsnp_output + extended_parsnp_xmfa = orig_parsnp_xmfa + ".extended" + + # Index input fasta files and original xmfa + index_to_gid, gid_to_index = ext.parse_xmfa_header(orig_parsnp_xmfa) + gid_to_records, gid_to_cid_to_index, gid_to_index_to_cid = ext.index_input_sequences(orig_parsnp_xmfa, finalfiles + [ref]) + + # Get covered regions of xmfa file + idpair_to_segments, cluster_to_named_segments = ext.xmfa_to_covered(orig_parsnp_xmfa, index_to_gid, gid_to_index_to_cid) + + # Extend clusters + logger.info(f"Extending LCBs with SPOA...") + new_lcbs = ext.extend_clusters(orig_parsnp_xmfa, gid_to_index, gid_to_cid_to_index, idpair_to_segments, cluster_to_named_segments, gid_to_records) + # Write output + partition.copy_header(orig_parsnp_xmfa, extended_parsnp_xmfa) + with open(extended_parsnp_xmfa, 'a') as out_f: + for lcb in new_lcbs: + partition.write_aln_to_xmfa(lcb, out_f) #add genbank here, if present if len(genbank_ref) != 0: From 0b8e5099b956168cffaf75a1b237c9252abad948 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Thu, 18 Jan 2024 15:39:15 -0600 Subject: [PATCH 4/8] Version updated to 2.0.2 --- parsnp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsnp b/parsnp index eebf303..7c3c16a 100755 --- a/parsnp +++ b/parsnp @@ -27,7 +27,7 @@ from pathlib import Path import extend as ext from tqdm import tqdm -__version__ = "2.0.1" +__version__ = "2.0.2" reroot_tree = True #use --midpoint-reroot random_seeded = random.Random(42) From 918aa2fa1353fe51ef631345ddaf6b0e07065c9a Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Thu, 18 Jan 2024 15:42:30 -0600 Subject: [PATCH 5/8] Add warning for extension module --- parsnp | 1 + 1 file changed, 1 insertion(+) diff --git a/parsnp b/parsnp index 7c3c16a..561c73e 100755 --- a/parsnp +++ b/parsnp @@ -1675,6 +1675,7 @@ SETTINGS: # This is the stuff for LCB extension: if args.extend_lcbs: + logger.warning("The LCB extension module is experimental. Runtime may be significantly increased and extended alignments may not be as high quality as the original core-genome. Extensions off of existing LCBs are in a separate xmfa file.") import partition import extend as ext From 038187fc0c1ba10d3bf22f6b093262d9f527559d Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 19 Jan 2024 00:01:00 -0600 Subject: [PATCH 6/8] Pass tqdm through logger --- extend.py | 11 ++++++++--- logger.py | 23 +++++++++++++++++++++++ parsnp | 8 ++++++-- partition.py | 13 ++++++++++--- 4 files changed, 47 insertions(+), 8 deletions(-) diff --git a/extend.py b/extend.py index 693c97a..632d8a8 100644 --- a/extend.py +++ b/extend.py @@ -3,6 +3,7 @@ from Bio.SeqIO import SeqRecord from Bio.Align import MultipleSeqAlignment from glob import glob +import logging import tempfile from pathlib import Path import re @@ -15,7 +16,7 @@ import numpy as np from Bio.AlignIO.MafIO import MafWriter, MafIterator from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator -from logger import logger +from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL from tqdm import tqdm import time import spoa @@ -77,7 +78,7 @@ def xmfa_to_covered(xmfa_file, index_to_gid, gid_to_index_to_cid): seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*') idpair_to_segments = defaultdict(list) cluster_to_named_segments = defaultdict(list) - for aln in tqdm(AlignIO.parse(xmfa_file, "mauve")): + for aln in AlignIO.parse(xmfa_file, "mauve"): for seq in aln: # Skip reference for now... aln_len = seq.annotations["end"] - seq.annotations["start"] @@ -146,7 +147,11 @@ def extend_clusters(xmfa_file, gid_to_index, gid_to_cid_to_index, idpair_to_segm ret_lcbs = [] seqid_parser = re.compile(r'^cluster(\d+) s(\d+):p(\d+)/.*') - for aln_idx, aln in enumerate(tqdm(AlignIO.parse(xmfa_file, "mauve"), total=len(cluster_to_named_segments))): + for aln_idx, aln in enumerate(tqdm( + AlignIO.parse(xmfa_file, "mauve"), + total=len(cluster_to_named_segments), + file=TqdmToLogger(logger, level=logging.INFO), + mininterval=MIN_TQDM_INTERVAL)): # validate_lcb(aln, gid_to_records, parsnp_header=True) seq = aln[0] cluster_idx, contig_idx, startpos = [int(x) for x in seqid_parser.match(seq.id).groups()] diff --git a/logger.py b/logger.py index d6d5e82..83579e8 100644 --- a/logger.py +++ b/logger.py @@ -1,4 +1,5 @@ import logging +import io ############################################# Logging ############################################## BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) #These are the sequences need to get colored ouput @@ -14,6 +15,28 @@ COLOR_SEQ = "\033[1;%dm" BOLD_SEQ = "\033[1m" +MIN_TQDM_INTERVAL=30 + + +# Logging redirect copied from https://stackoverflow.com/questions/14897756/python-progress-bar-through-logging-module +class TqdmToLogger(io.StringIO): + """ + Output stream for TQDM which will output to logger module instead of + the StdOut. + """ + logger = None + level = None + buf = '' + def __init__(self,logger,level=None): + super(TqdmToLogger, self).__init__() + self.logger = logger + self.level = level or logging.INFO + def write(self,buf): + self.buf = buf.strip('\r\n\t ') + def flush(self): + self.logger.log(self.level, self.buf) + + def formatter_message(message, use_color = True): if use_color: message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ) diff --git a/parsnp b/parsnp index 561c73e..2aaf441 100755 --- a/parsnp +++ b/parsnp @@ -13,7 +13,7 @@ import shlex from tempfile import TemporaryDirectory import re import logging -from logger import logger +from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL import multiprocessing import argparse import signal @@ -1650,7 +1650,11 @@ SETTINGS: logger.info("Running partitions...") good_chunks = set(chunk_labels) with Pool(args.threads) as pool: - return_codes = tqdm(pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), total=len(chunk_output_dirs)) + return_codes = tqdm( + pool.imap(run_parsnp_aligner, chunk_output_dirs, chunksize=1), + total=len(chunk_output_dirs), + file=TqdmToLogger(logger,level=logging.INFO), + mininterval=MIN_TQDM_INTERVAL) for cl, rc in zip(chunk_labels, return_codes): if rc != 0: logger.error(f"Partition {cl} failed...") diff --git a/partition.py b/partition.py index 682e969..365efa0 100644 --- a/partition.py +++ b/partition.py @@ -7,6 +7,7 @@ import os import copy import math +import logging from multiprocessing import Pool from functools import partial from collections import namedtuple, defaultdict, Counter @@ -18,7 +19,7 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Align import MultipleSeqAlignment -from logger import logger +from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL from tqdm import tqdm @@ -634,7 +635,11 @@ def trim_xmfas( trim_partial = partial(trim_single_xmfa, intersected_interval_dict=intersected_interval_dict) orig_xmfa_files = [f"{partition_output_dir}/{CHUNK_PREFIX}-{cl}-out/parsnp.xmfa" for cl in chunk_labels] with Pool(threads) as pool: - num_clusters_per_xmfa = list(tqdm(pool.imap_unordered(trim_partial, orig_xmfa_files), total=len(orig_xmfa_files))) + num_clusters_per_xmfa = list(tqdm( + pool.imap_unordered(trim_partial, orig_xmfa_files), + total=len(orig_xmfa_files), + file=TqdmToLogger(logger,level=logging.INFO), + mininterval=MIN_TQDM_INTERVAL)) #TODO clean up if not all(num_clusters_per_xmfa[0][1] == nc for xmfa, nc in num_clusters_per_xmfa): logger.critical("One of the partitions has a different number of clusters after trimming...") @@ -712,7 +717,9 @@ def merge_xmfas( with Pool(threads) as pool: tmp_xmfas = list(tqdm( pool.imap_unordered(merge_single_LCB_star_partial, enumerate(pairs_list)), - total=num_clusters) + total=num_clusters, + file=TqdmToLogger(logger,level=logging.INFO), + mininterval=MIN_TQDM_INTERVAL) ) with open(xmfa_out_f, 'a') as xmfa_out_handle: From feb7a2002f0311e9ce9ae068f79651a3d4d34b9c Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 19 Jan 2024 11:44:29 -0600 Subject: [PATCH 7/8] Start extension w/ shortest length --- extend.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/extend.py b/extend.py index 632d8a8..a971ea2 100644 --- a/extend.py +++ b/extend.py @@ -107,12 +107,13 @@ def run_msa(downstream_segs_to_align, gid_to_records): seq_lens = [seg.stop - seg.start for seg in downstream_segs_to_align] longest_seq = max(seq_lens) mean_seq_len = np.mean(seq_lens) - if sum( - mean_seq_len*(1 - length_window) <= (seg.stop - seg.start) <= mean_seq_len*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop: - base_length = int(mean_seq_len*(1 + length_window)) - else: - base_length = BASE_LENGTH + # if sum( + # mean_seq_len*(1 - length_window) <= (seg.stop - seg.start) <= mean_seq_len*(1 + length_window) for seg in downstream_segs_to_align) > len(downstream_segs_to_align)*window_prop: + # base_length = int(mean_seq_len*(1 + length_window)) + # else: + # base_length = BASE_LENGTH + base_length = BASE_LENGTH while keep_extending: seqs_to_align = ["A" + (str( gid_to_records[seg.idp.gid][seg.idp.cid].seq[seg.start:seg.stop] if seg.strand == 1 From 2f81014b383301a6381b6c9a448e5ebe66cf4c46 Mon Sep 17 00:00:00 2001 From: Bryce Lorenz Kille Date: Fri, 19 Jan 2024 12:02:30 -0600 Subject: [PATCH 8/8] happy pylint --- extend.py | 11 +-------- parsnp | 7 ++---- partition.py | 66 ---------------------------------------------------- 3 files changed, 3 insertions(+), 81 deletions(-) diff --git a/extend.py b/extend.py index a971ea2..94aa321 100644 --- a/extend.py +++ b/extend.py @@ -2,23 +2,14 @@ from Bio.Seq import Seq from Bio.SeqIO import SeqRecord from Bio.Align import MultipleSeqAlignment -from glob import glob import logging -import tempfile from pathlib import Path import re -import subprocess from collections import namedtuple, defaultdict, Counter import bisect -import os -from Bio.Align import substitution_matrices -from itertools import product, combinations import numpy as np -from Bio.AlignIO.MafIO import MafWriter, MafIterator -from Bio.AlignIO.MauveIO import MauveWriter, MauveIterator -from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL from tqdm import tqdm -import time +from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL import spoa #%% diff --git a/parsnp b/parsnp index 2aaf441..85fd9c9 100755 --- a/parsnp +++ b/parsnp @@ -5,19 +5,16 @@ ''' -import os, sys, string, getopt, random,subprocess, time,operator, math, datetime,numpy #pysam +import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam from collections import defaultdict -import csv import shutil import shlex from tempfile import TemporaryDirectory import re import logging from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL -import multiprocessing import argparse import signal -import inspect from multiprocessing import Pool from Bio import SeqIO from glob import glob @@ -149,7 +146,7 @@ def run_phipack(query,seqlen,workingdir): currdir = os.getcwd() os.chdir(workingdir) command = "Profile -o -v -n %d -w 100 -m 100 -f %s > %s.out"%(seqlen,query,query) - run_command(command,1, prepend_time=True) + run_command(command, 1) os.chdir(currdir) def run_fasttree(query,workingdir,recombination_sites): diff --git a/partition.py b/partition.py index 365efa0..3eb780d 100644 --- a/partition.py +++ b/partition.py @@ -729,69 +729,3 @@ def merge_xmfas( xmfa_out_handle.write(tx.read()) -if __name__ == "__main__": - args = parse_args() - os.makedirs(args.output_dir, exist_ok=True) - - query_files = list() - for arg_path in args.sequences: - if arg_path.endswith(".txt"): - with open(arg_path) as input_list_handle: - for line in input_list_handle: - query_files.append(line.strip()) - elif any(arg_path.endswith(suff) for suff in FASTA_SUFFIX_LIST): - query_files.append(arg_path) - elif os.path.isdir(arg_path): - for f in os.listdir(arg_path): - if any(f.endswith(suff) for suff in FASTA_SUFFIX_LIST): - query_files.append(f"{arg_path}/{f}") - - full_query_list_path = f"{args.output_dir}/input-list.txt" - with open(full_query_list_path, 'w') as input_list_handle: - for qf in query_files: - input_list_handle.write(qf + "\n") - - partition_output_dir = f"{args.output_dir}/partition" - partition_list_dir = f"{partition_output_dir}/input-lists" - os.makedirs(partition_list_dir, exist_ok=True) - run_command(f"split -l {args.partition_size} -a 5 --additional-suffix '.txt' {full_query_list_path} {partition_list_dir}/{CHUNK_PREFIX}-") - - chunk_label_parser = re.compile(f'{CHUNK_PREFIX}-(.*).txt') - chunk_labels = [] - for partition_list_file in os.listdir(partition_list_dir): - chunk_labels.append(chunk_label_parser.search(partition_list_file).groups()[0]) - - parsnp_commands = [] - for cl in chunk_labels: - chunk_output_dir = f"{partition_output_dir}/{CHUNK_PREFIX}-{cl}-out" - os.makedirs(chunk_output_dir, exist_ok=True) - chunk_command = f"./parsnp -d {partition_list_dir}/{CHUNK_PREFIX}-{cl}.txt -r {args.reference} -o {chunk_output_dir} " - chunk_command += args.parsnp_flags - chunk_command += f" > {chunk_output_dir}/parsnp.stdout 2> {chunk_output_dir}/parsnp.stderr" - parsnp_commands.append(chunk_command) - - good_chunks = set(chunk_labels) - run_command_nocheck = partial(run_command, check=False) - with Pool(args.threads) as pool: - return_codes = tqdm(pool.imap(run_command_nocheck, parsnp_commands, chunksize=1), total=len(parsnp_commands)) - for cl, rc in zip(chunk_labels, return_codes): - if rc != 0: - logger.error("Partition {cl} failed...") - good_chunks.remove(cl) - - chunk_labels = list(good_chunks) - - logger.info("Computing intersection of all partition LCBs...") - chunk_to_intvervaldict = get_chunked_intervals(partition_output_dir, chunk_labels) - intersected_interval_dict = get_intersected_intervals(chunk_to_intvervaldict) - - logger.info("Trimming partitioned XMFAs back to intersected intervals...") - num_clusters = trim_xmfas(partition_output_dir, chunk_labels, intersected_interval_dict, args.threads) - - os.makedirs(f"{args.output_dir}/merged-out/", exist_ok=True) - xmfa_out_f = f"{args.output_dir}/merged-out/parsnp.xmfa" - - logger.info(f"Merging trimmed XMFA files into a single XMFA at {xmfa_out_f}") - merge_xmfas(partition_output_dir, chunk_labels, xmfa_out_f, num_clusters, args.threads) - -