From 7fd9ace0cb7134dfb2679ed803650607de27a7bb Mon Sep 17 00:00:00 2001 From: Dom Laetsch Date: Sun, 17 Apr 2016 23:54:24 +0100 Subject: [PATCH] v0.9.14 - added new output format for bam2cov - parses readcov as well --- bam2cov.py | 76 ++++++++++++++++++++++++------------------ lib/BtCore.py | 91 ++++++++++++++++++++++++++------------------------- lib/BtIO.py | 52 ++++++++++++++++++++++------- lib/BtLog.py | 6 ++-- lib/BtPlot.py | 28 +++++++++++----- 5 files changed, 154 insertions(+), 99 deletions(-) diff --git a/bam2cov.py b/bam2cov.py index 5f38afa..4321b76 100644 --- a/bam2cov.py +++ b/bam2cov.py @@ -1,12 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -"""usage: blobtools bam2cov -i FASTA -b BAM [-h|--help] - +"""usage: blobtools bam2cov -i FASTA -b BAM [--mq MQ] [--no_base_cov] + [-h|--help] + Options: -h --help show this - -i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces. + -i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces. -b, --bam BAM file (requires samtools in $PATH) + --mq minimum Mapping Quality (MQ) [default: 1] + --no_base_cov only parse read coverage (faster, but ... + can only be used for "blobtools blobplot --noblobs") """ from __future__ import division @@ -18,11 +22,12 @@ class Fasta(): def __init__(self, name, seq): - self.name = name + self.name = name self.length = len(seq) self.n_count = seq.count('N') self.agct_count = self.length - self.n_count - self.cov = 0.0 + self.base_cov = 0.0 + self.read_cov = 0 def which(program): def is_exe(fpath): @@ -47,9 +52,9 @@ def runCmd(command): return iter(p.stdout.readline, b'') def readFasta(infile): - with open(infile) as fh: + with open(infile) as fh: header, seqs = '', [] - for l in fh: + for l in fh: if l[0] == '>': if (header): yield header, ''.join(seqs) @@ -86,49 +91,58 @@ def readBam(infile, fasta_headers): progress_unit = int(int(reads_total)/1000) base_cov_dict = {} cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's - # execute samtools to get only mapped reads - command = "samtools view -F 4 " + infile + + read_cov_dict = {} + # execute samtools to get only mapped reads from primary alignment + command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile # only one counter since only yields mapped reads - parsed_reads = 0 + parsed_reads = 0 for line in runCmd(command): match = line.split("\t") - if match >= 11: - seq_name = match[2] - base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])]) - if (base_cov): - parsed_reads += 1 - if seq_name not in fasta_headers: - print BtLog.warn_d['2'] % (seq_name, infile) - else: - base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov + seq_name = match[2] + if seq_name not in fasta_headers: + print BtLog.warn_d['2'] % (seq_name, infile) + else: + read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1 + if not (no_base_cov_flag): + base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])]) + if (base_cov): + base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov + parsed_reads += 1 BtLog.progress(parsed_reads, progress_unit, reads_total) BtLog.progress(reads_total, progress_unit, reads_total) - - if not int(reads_mapped) == int(parsed_reads): - print warn_d['3'] % (reads_mapped, parsed_reads) - return base_cov_dict, reads_total, parsed_reads + return base_cov_dict, read_cov_dict, reads_total, parsed_reads def parseBam(bam_f, fasta_dict): - base_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys())) + base_cov_dict, read_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys())) if reads_total == 0: print BtLog.warn_d['4'] % bam_f for name, base_cov in base_cov_dict.items(): - fasta_dict[name].cov = base_cov / fasta_dict[name].agct_count - return fasta_dict + fasta_dict[name].base_cov = base_cov / fasta_dict[name].agct_count + for name, read_cov in read_cov_dict.items(): + fasta_dict[name].read_cov = read_cov + return fasta_dict, reads_total, reads_mapped -def writeCov(fasta_dict, out_f): +def writeCov(fasta_dict, reads_total, reads_mapped, out_f): with open(out_f, 'w') as fh: + fh.write("# Total Reads = %s\n" % (reads_total)) + fh.write("# Mapped Reads = %s\n" % (reads_mapped)) + fh.write("# Unmapped Reads = %s\n" % (reads_total - reads_mapped)) + fh.write("# Parameters : MQ = %s, No_base_cov_flag = %s\n" % (mq, no_base_cov_flag)) + fh.write("# %s\t%s\t%s\n" % ("contig_id", "read_cov", "base_cov")) for name, fasta_obj in fasta_dict.items(): - fh.write("%s\t%s\n" % (name, fasta_obj.cov)) + fh.write("%s\t%s\t%s\n" % (name, fasta_obj.read_cov, fasta_obj.base_cov)) if __name__ == '__main__': args = docopt(__doc__) - + fasta_f = args['--infile'] bam_f = args['--bam'] out_f = os.path.basename(bam_f) + ".cov" + mq = int(args['--mq']) + no_base_cov_flag = args['--no_base_cov'] fasta_dict = parseFasta(fasta_f) - fasta_dict = parseBam(bam_f, fasta_dict) - writeCov(fasta_dict, out_f) + fasta_dict, reads_total, reads_mapped = parseBam(bam_f, fasta_dict) + writeCov(fasta_dict, reads_total, reads_mapped, out_f) diff --git a/lib/BtCore.py b/lib/BtCore.py index f78586e..82cdaec 100644 --- a/lib/BtCore.py +++ b/lib/BtCore.py @@ -14,14 +14,14 @@ class BlobDB holds all information parsed from files def __init__(self, title): self.title = title self.assembly_f = '' - self.dict_of_blobs = {} + self.dict_of_blobs = {} self.order_of_blobs = [] self.set_of_taxIds = set() self.lineages = {} self.length = 0 self.seqs = 0 self.n_count = 0 - self.covLibs = {} + self.covLibs = {} self.hitLibs = {} self.nodesDB_f = '' self.taxrules = [] @@ -36,8 +36,8 @@ def view(self, out_f, ranks, taxrule, hits_flag, seqs): header += "%s\n" % "\n".join("## " + hitLib['name'] + " : " + hitLib["f"] for hitLib in self.hitLibs.values()) header += "## nodesDB : %s\n" % self.nodesDB_f header += "## taxrule : %s\n" % taxrule - header += "##\n" - header += "# %s" % sep.join(map(str, [ "name", "length", "GC", "N" ])) + header += "##\n" + header += "# %s" % sep.join(map(str, [ "name", "length", "GC", "N" ])) header += "%s%s" % (sep, sep.join([cov_lib_name for cov_lib_name in self.covLibs])) if (len(self.covLibs)) > 1: header += "%s%s" % (sep, "cov_sum") @@ -59,10 +59,10 @@ def view(self, out_f, ranks, taxrule, hits_flag, seqs): else: with open(out_f, 'w') as fh: fh.write(header + body) - + def getViewLine(self, blob, taxrule, ranks, sep, hits_flag): line = '' - line += "\n%s" % sep.join(map(str, [ blob['name'], blob['length'], blob['gc'], blob['n_count'] ])) + line += "\n%s" % sep.join(map(str, [ blob['name'], blob['length'], blob['gc'], blob['n_count'] ])) line += sep line += "%s" % sep.join(map(str, [ blob['covs'][covLib] for covLib in self.covLibs])) if len(self.covLibs) > 1: @@ -81,7 +81,7 @@ def getViewLine(self, blob, taxrule, ranks, sep, hits_flag): line += "%s=" % tax_lib_name sum_dict = {} for hit in blob['hits'][tax_lib_name]: - tax_rank = self.lineages[hit['taxId']][rank] + tax_rank = self.lineages[hit['taxId']][rank] sum_dict[tax_rank] = sum_dict.get(tax_rank, 0.0) + hit['score'] line += "%s" % "|".join([":".join(map(str, [tax_rank, sum_dict[tax_rank]])) for tax_rank in sorted(sum_dict, key=sum_dict.get, reverse=True)]) else: @@ -110,7 +110,7 @@ def load(self, BlobDb_f): for k, v in blobDict.items(): setattr(self, k, v) self.set_of_taxIds = blobDict['lineages'].keys() - #for k, v in self.__dict__.items(): + #for k, v in self.__dict__.items(): # print k, type(v), v # this seems to work #self.title = blobDict['title'] @@ -119,7 +119,7 @@ def load(self, BlobDb_f): #self.lineages = blobDict['lineages'] #self.set_of_taxIds = blobDict['lineages'].keys() #self.order_of_blobs = blobDict['order_of_blobs'] - #self.dict_of_blobs = blobDict['dict_of_blobs'] + #self.dict_of_blobs = blobDict['dict_of_blobs'] #self.length = int(blobDict['length']) #self.seqs = int(blobDict['seqs']) #self.n_count = int(blobDict['n_count']) @@ -129,14 +129,14 @@ def load(self, BlobDb_f): # self.title = title # self.assembly_f = '' - # self.dict_of_blobs = {} + # self.dict_of_blobs = {} # self.order_of_blobs = [] # self.set_of_taxIds = set() # self.lineages = {} # self.length = 0 # self.seqs = 0 # self.n_count = 0 - # self.covLibs = {} + # self.covLibs = {} # self.hitLibs = {} # self.nodesDB_f = '' # self.taxrules = [] @@ -149,13 +149,13 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour cov_lib_dict = self.covLibs #print cov_lib_dict cov_lib_names_l = self.covLibs.keys() # does not include cov_sum - + if len(cov_lib_names_l) > 1: # more than one cov_lib, cov_sum_lib has to be created cov_lib_dict['sum'] = CovLibObj('sum', 'sum', None).__dict__ # ugly cov_lib_dict['sum']['reads_total'] = sum([cov_lib_dict[x]['reads_total'] for x in cov_lib_dict]) cov_lib_dict['sum']['reads_mapped'] = sum([cov_lib_dict[x]['reads_mapped'] for x in cov_lib_dict]) - + #print self.covLibs #cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()} #print cov_libs_reads_total # correct @@ -164,32 +164,32 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour for blob in self.dict_of_blobs.values(): name, gc, length, group = blob['name'], blob['gc'], blob['length'], '' - + if (catcolour_dict): # annotation with categories specified in catcolour group = str(catcolour_dict[name]) elif (c_index): # annotation with c_index instead of taxonomic group group = str(blob['taxonomy'][taxrule][rank]['c_index']) else: # annotation with taxonomic group group = str(blob['taxonomy'][taxrule][rank]['tax']) - - if not group in data_dict: + + if not group in data_dict: data_dict[group] = { - 'name' : [], - 'length' : [], - 'gc' : [], - 'covs' : {covLib : [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists + 'name' : [], + 'length' : [], + 'gc' : [], + 'covs' : {covLib : [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists 'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists 'count' : 0, 'count_hidden' : 0, 'count_visible' : 0, - 'span': 0, - 'span_hidden' : 0, + 'span': 0, + 'span_hidden' : 0, 'span_visible' : 0, } data_dict[group]['count'] = data_dict[group].get('count', 0) + 1 data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length) - + if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1 data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length) @@ -212,16 +212,16 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour max_cov = cov # add cov of blob to group - data_dict[group]['covs'][cov_lib].append(cov) + data_dict[group]['covs'][cov_lib].append(cov) cov_sum += cov # add readcov - if cov_lib in blob['read_cov']: + if cov_lib in blob['read_cov']: reads_mapped = blob['read_cov'][cov_lib] - data_dict[group]['reads_mapped'][cov_lib] += reads_mapped + data_dict[group]['reads_mapped'][cov_lib] += reads_mapped reads_mapped_sum += reads_mapped - + if len(cov_lib_names_l) > 1: if cov_sum < 0.02 : cov_sum = 0.02 @@ -232,8 +232,8 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum #if len(cov_lib_names_l) > 1: # for cov_lib, data in self.covLibs.items(): - # cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total'] - + # cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total'] + #for group in data_dict: # print "#", group @@ -246,7 +246,7 @@ def addCovLib(self, covLib): self.covLibs[covLib.name] = covLib for blObj in self.dict_of_blobs.values(): blObj.addCov(covLib.name, 0.0) - + def parseFasta(self, fasta_f, fasta_type): print BtLog.status_d['1'] % ('FASTA', fasta_f) self.assembly_f = abspath(fasta_f) @@ -260,7 +260,7 @@ def parseFasta(self, fasta_f, fasta_type): self.seqs += 1 self.length += blObj.length self.n_count += blObj.n_count - + if (fasta_type): cov = BtIO.parseCovFromHeader(fasta_type, blObj.name) self.covLibs[fasta_type].cov_sum += cov @@ -270,16 +270,15 @@ def parseFasta(self, fasta_f, fasta_type): self.dict_of_blobs[blObj.name] = blObj else: BtLog.error('5', blObj.name) - + if self.seqs == 0 or self.length == 0: BtLog.error('1') - + def parseCovs(self, covLibObjs): for covLib in covLibObjs: self.addCovLib(covLib) print BtLog.status_d['1'] % (covLib.name, covLib.f) if covLib.fmt == 'bam' or covLib.fmt == 'sam': - base_cov_dict = {} if covLib.fmt == 'bam': base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs)) @@ -305,17 +304,19 @@ def parseCovs(self, covLibObjs): self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name]) elif covLib.fmt == 'cov': - cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs)) - if not len(cov_dict) == self.seqs: + base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs)) + #cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs)) + if not len(base_cov_dict) == self.seqs: print BtLog.warn_d['4'] % covLib.f - for name, cov in cov_dict.items(): + for name, cov in base_cov_dict.items(): covLib.cov_sum += cov self.dict_of_blobs[name].addCov(covLib.name, cov) + self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name]) else: - pass + pass covLib.mean_cov = covLib.cov_sum/self.seqs if covLib.cov_sum == 0.0: - BtLog.error('26', covLib.name) + print BtLog.warn_d['6'] % (covLib.name) self.covLibs[covLib.name] = covLib @@ -327,10 +328,10 @@ def parseHits(self, hitLibs): for hitDict in BtIO.readTax(hitLib.f, set(self.dict_of_blobs)): if ";" in hitDict['taxId']: hitDict['taxId'] = hitDict['taxId'].split(";")[0] - print BtLog.warn['5'] % (hitDict['name'], hitLib) + print BtLog.warn_d['5'] % (hitDict['name'], hitLib) self.set_of_taxIds.add(hitDict['taxId']) self.dict_of_blobs[hitDict['name']].addHits(hitLib.name, hitDict) - + def computeTaxonomy(self, taxrules, nodesDB): tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB) self.lineages = BtTax.getLineages(tree_lists, nodesDB) @@ -344,14 +345,14 @@ def computeTaxonomy(self, taxrules, nodesDB): blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages) else: blObj.taxonomy[taxrule] = BtTax.noHit() - + def getBlobs(self): for blObj in [self.dict_of_blobs[key] for key in self.order_of_blobs]: yield blObj class BlObj(): def __init__(self, name, seq): - self.name = name + self.name = name self.length = len(seq) self.n_count = seq.count('N') self.agct_count = self.length - self.n_count @@ -360,7 +361,7 @@ def __init__(self, name, seq): self.read_cov = {} self.hits = {} self.taxonomy = {} - + def calculateGC(self, seq): return float((seq.count('G') + seq.count('C') ) / self.agct_count \ if self.agct_count > 0 else 0.0) @@ -393,4 +394,4 @@ def __init__(self, name, fmt, f): self.f = abspath(f) if (f) else '' if __name__ == '__main__': - pass \ No newline at end of file + pass diff --git a/lib/BtIO.py b/lib/BtIO.py index 3f61faa..94879f2 100644 --- a/lib/BtIO.py +++ b/lib/BtIO.py @@ -146,22 +146,52 @@ def parseCovFromHeader(fasta_type, header ): pass def readCov(infile, set_of_blobs): - cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)") - cov_dict = {} + old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)") + base_cov_dict = {} + + cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)") + reads_total = 0 + reads_mapped = 0 + read_cov_dict = {} + seqs_parsed = 0 progress_unit = 1 + old_format = 1 with open(infile) as fh: for line in fh: - match = cov_line_re.search(line) - if match: - seqs_parsed += 1 - name, cov = match.group(1), float(match.group(2)) - if name not in set_of_blobs: - print BtLog.warn_d['2'] % (name, infile) - cov_dict[name] = cov + if line.startswith("#"): + old_format = 0 + if old_format == 0: + if line.startswith("# Total Reads"): + reads_total = int(line.split(" = ")[1]) + elif line.startswith("# Mapped Reads"): + reads_mapped = int(line.split(" = ")[1]) + elif line.startswith("# Unmapped Reads"): + pass + elif line.startswith("# Parameters"): + pass + elif line.startswith("# contig_id"): + pass + else: + match = cov_line_re.search(line) + if match: + seqs_parsed += 1 + name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3)) + if name not in set_of_blobs: + print BtLog.warn_d['2'] % (name, infile) + read_cov_dict[name] = read_cov + base_cov_dict[name] = base_cov + else: + match = old_cov_line_re.search(line) + if match: + seqs_parsed += 1 + name, base_cov = match.group(1), float(match.group(2)) + if name not in set_of_blobs: + print BtLog.warn_d['2'] % (name, infile) + base_cov_dict[name] = base_cov BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs)) - BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs)) - return cov_dict + #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs)) + return base_cov_dict, reads_total, reads_mapped, read_cov_dict def checkCas(infile): print BtLog.status_d['12'] diff --git a/lib/BtLog.py b/lib/BtLog.py index c00c424..044e8e4 100644 --- a/lib/BtLog.py +++ b/lib/BtLog.py @@ -56,8 +56,7 @@ def progress(iteration, steps, max_value): '22' : '[ERROR:22]\t: Tax file %s seems to have no taxids.', '23' : '[ERROR:23]\t: Catcolour file %s does not seem to have the right format.', '24' : '[ERROR:24]\t: Catcolour file incompatible with c-index colouring.', - '25' : '[ERROR:25]\t: Cov file %s does not seem to have the right format.', - '26' : '[ERROR:26]\t: The cumulative coverage of cov lib %s is 0.0. Please check the mapping/coverage file.' + '25' : '[ERROR:25]\t: Cov file %s does not seem to have the right format.' } warn_d = { @@ -66,7 +65,8 @@ def progress(iteration, steps, max_value): '2' : '[WARN]\t: %s in file %s is not part of the assembly', '3' : '[WARN]\t: samtools flagstat reported %s mapped reads, %s mapped reads were parsed', '4' : '[WARN]\t: No coverage data found in %s', - '5' : '[WARN]\t: Hit for sequence %s in tax file %s has multiple taxIds, only first one is used. ' + '5' : '[WARN]\t: Hit for sequence %s in tax file %s has multiple taxIds, only first one is used.', + '6' : '[WARN]\t: The cumulative coverage of cov lib %s is 0.0. Please check the mapping/coverage file.' } status_d = { '1' : '[STATUS]\t: Parsing %s - %s', diff --git a/lib/BtPlot.py b/lib/BtPlot.py index 23c7e08..9ee8005 100644 --- a/lib/BtPlot.py +++ b/lib/BtPlot.py @@ -74,16 +74,26 @@ def parseCatColour(catcolour_f): def parseCovFile(cov_f): cov_dict = {} + old_format = 1 + seq_name = '' + cov = 0.0 with open(cov_f) as fh: for l in fh: - try: - seq_name, cov = l.rstrip("\n").split("\t") - if float(cov) < 0.02: - cov_dict[seq_name] = 0.02 - else: - cov_dict[seq_name] = float(cov) - except: - BtLog.error('25', cov_f) + if l.startswith("#"): + old_format = 0 + else: + try: + field = l.rstrip("\n").split("\t") + if not (old_format): + seq_name, cov = field[0], field[2] + else: + seq_name, cov = field[0], field[1] + if float(cov) < 0.02: + cov_dict[seq_name] = 0.02 + else: + cov_dict[seq_name] = float(cov) + except: + BtLog.error('25', cov_f) return cov_dict @@ -407,7 +417,7 @@ def plotScatterCov(self, cov_lib, cov_dict, info_flag, x_label, y_label, scale, for group in self.plot_order: i += 1 group_length_array = array(self.stats[group]['length']) - group_cov_y_array = array([cov_dict[name] for name in self.stats[group]['name']]) # fix it! + group_cov_y_array = array([cov_dict.get(name, 0.02) for name in self.stats[group]['name']]) group_cov_x_array = array(self.stats[group]['covs'][cov_lib]) # calculate values for legend if len(group_length_array) > 0: