Skip to content

Commit

Permalink
Merge pull request #11 from DRL/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
DRL committed Feb 16, 2016
2 parents 3be2abb + 05cd61f commit daff0b2
Show file tree
Hide file tree
Showing 10 changed files with 237 additions and 151 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
testing/
testing_fly/
*.json
*.pyc
*.txt
Expand Down
3 changes: 1 addition & 2 deletions comparecov.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@
-p, --plotgroups INT Number of (taxonomic) groups to plot, remaining
groups are placed in 'other' [default: 7]
-r, --rank RANK Taxonomic rank used for colouring of blobs [default: superkingdom]
(Supported: superkingdom)
-r, --rank RANK Taxonomic rank used for colouring of blobs [default: phylum]
-x, --taxrule TAXRULE Taxrule which has been used for computing taxonomy
(Supported: bestsum, bestsumorder) [default: bestsum]
--sort <ORDER> Sort order for plotting [default: span]
Expand Down
54 changes: 4 additions & 50 deletions create.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,64 +36,19 @@
import lib.BtCore as bt
import lib.BtLog as BtLog
import lib.BtIO as BtIO
import lib.BtInput as BtInput
import os.path


if __name__ == '__main__':
ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']

main_dir = os.path.dirname(__file__)
#print data_dir
args = docopt(__doc__)
#print args
fasta_f = args['--infile']
fasta_type = args['--type']

sam_fs = args['--sam']
bam_fs = args['--bam']
cov_fs = args['--cov']
cas_fs = args['--cas']
hit_fs = args['--taxfile']

out_f = args['--out']
if (out_f):
out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
else:
out_f = "%s" % ("BlobDB.json")
nodesDB_f = args['--db']
names_f = args['--names']
nodes_f = args['--nodes']
taxrules = args['--taxrule']
title = args['--title'] if (args['--title']) else out_f


# Do files exist ?
files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
for f in files:
if not os.path.isfile(f):
BtLog.error('0', f)

# Is taxonomy provided?
if nodesDB_f == "data/nodesDB.txt":
nodesDB_f = os.path.join(main_dir, nodesDB_f)
if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
BtLog.error('3')

if not (hit_fs):
BtLog.error('18')

# can FASTA parser deal with assemblies
if not fasta_type in ASSEMBLY_TYPES:
BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

# Is coverage provided?
if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
BtLog.error('1')
title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f = BtInput.validate_input_create(main_dir, args)

cov_libs = [bt.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
[bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
[bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
[bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

# Create BlobDB object
blobDb = bt.BlobDb(title)

Expand All @@ -103,8 +58,7 @@
blobDb.parseCovs(cov_libs)

# Parse Tax
hitLibs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]
blobDb.parseHits(hitLibs)
blobDb.parseHits(hit_libs)

# Parse nodesDB
nodesDB, nodesDB_f = BtIO.getNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f)
Expand Down
160 changes: 100 additions & 60 deletions lib/BtCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,26 +107,60 @@ def dump(self):

def load(self, BlobDb_f):
blobDict = BtIO.readJson(BlobDb_f)
self.title = blobDict['title']
self.assembly_f = blobDict['assembly_f']
self.nodesDB_f = blobDict['nodesDB_f']
self.lineages = blobDict['lineages']
for k, v in blobDict.items():
setattr(self, k, v)
self.set_of_taxIds = blobDict['lineages'].keys()
self.order_of_blobs = blobDict['order_of_blobs']
self.dict_of_blobs = blobDict['dict_of_blobs']
self.length = int(blobDict['length'])
self.seqs = int(blobDict['seqs'])
self.n_count = int(blobDict['n_count'])
self.covLibs = blobDict['covLibs']
self.hitLibs = blobDict['hitLibs']
self.taxrules = blobDict['taxrules']
#for k, v in self.__dict__.items():
# print k, type(v), v # this seems to work

#self.title = blobDict['title']
#self.assembly_f = blobDict['assembly_f']
#self.nodesDB_f = blobDict['nodesDB_f']
#self.lineages = blobDict['lineages']
#self.set_of_taxIds = blobDict['lineages'].keys()
#self.order_of_blobs = blobDict['order_of_blobs']
#self.dict_of_blobs = blobDict['dict_of_blobs']
#self.length = int(blobDict['length'])
#self.seqs = int(blobDict['seqs'])
#self.n_count = int(blobDict['n_count'])
#self.covLibs = blobDict['covLibs']
#self.hitLibs = blobDict['hitLibs']
#self.taxrules = blobDict['taxrules']

# self.title = title
# self.assembly_f = ''
# self.dict_of_blobs = {}
# self.order_of_blobs = []
# self.set_of_taxIds = set()
# self.lineages = {}
# self.length = 0
# self.seqs = 0
# self.n_count = 0
# self.covLibs = {}
# self.hitLibs = {}
# self.nodesDB_f = ''
# self.taxrules = []

def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
data_dict = {}
read_cov_dict = {}
max_cov = 0.0
cov_libs = self.covLibs.keys()
cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
print self.covLibs
cov_lib_dict = self.covLibs
#print cov_lib_dict
cov_lib_names_l = self.covLibs.keys() # does not include cov_sum

if len(cov_lib_names_l) > 1:
# more than one cov_lib, cov_sum_lib has to be created
cov_lib_dict['sum'] = CovLibObj('sum', 'sum', None).__dict__ # ugly
cov_lib_dict['sum']['reads_total'] = sum([cov_lib_dict[x]['reads_total'] for x in cov_lib_dict])
cov_lib_dict['sum']['reads_mapped'] = sum([cov_lib_dict[x]['reads_mapped'] for x in cov_lib_dict])

#print self.covLibs
#cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
#print cov_libs_reads_total # correct
#cov_libs_reads_mapped = {cov_lib : data['reads_mapped'] for cov_lib, data in self.covLibs.items()}
#print cov_libs_reads_mapped # correct

for blob in self.dict_of_blobs.values():
name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
Expand All @@ -143,21 +177,19 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
'name' : [],
'length' : [],
'gc' : [],
'covs' : {covLib : [] for covLib in cov_libs},
'reads_mapped' : {covLib : 0 for covLib in cov_libs},
'covs' : {covLib : [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
'count' : 0,
'count_hidden' : 0,
'count_visible' : 0,
'span': 0,
'span_hidden' : 0,
'span_visible' : 0,
}
if len(cov_libs) > 1:
data_dict[group]['covs']['cov_sum'] = []
data_dict[group]['reads_mapped']['cov_sum'] = 0

data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)

if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
Expand All @@ -170,35 +202,45 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour

cov_sum = 0.0
reads_mapped_sum = 0
for cov_lib in sorted(cov_libs):
cov = float(blob['covs'][cov_lib])
cov_sum += cov
cov = cov if cov > 0.02 else 0.02
for cov_lib in sorted(cov_lib_names_l):
cov = float(blob['covs'][cov_lib])
if cov < 0.02:
cov = 0.02
#cov = cov if cov > 0.02 else 0.02
# increase max_cov
if cov > max_cov:
max_cov = cov
data_dict[group]['covs'][cov_lib].append(cov)
if cov_lib in blob['read_cov']:

# add cov of blob to group
data_dict[group]['covs'][cov_lib].append(cov)

cov_sum += cov

# add readcov
if cov_lib in blob['read_cov']:
reads_mapped = blob['read_cov'][cov_lib]
reads_mapped_sum += reads_mapped
data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
reads_mapped_sum += reads_mapped

if len(cov_libs) > 1:
cov_sum = cov_sum if cov_sum > 0.02 else 0.02
data_dict[group]['covs']['cov_sum'].append(cov_sum)
if cov > max_cov:
max_cov = cov
if len(cov_lib_names_l) > 1:
if cov_sum < 0.02 :
cov_sum = 0.02
data_dict[group]['covs']['sum'].append(cov_sum)
if cov_sum > max_cov:
max_cov = cov_sum
if (reads_mapped_sum):
data_dict[group]['reads_mapped']['cov_sum'] += reads_mapped_sum

#data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
#data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
#if len(cov_lib_names_l) > 1:
# for cov_lib, data in self.covLibs.items():
# cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']


if len(cov_libs) > 1:
cov_libs.append('cov_sum')
for cov_lib, data in self.covLibs.items():
cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']
#for group in data_dict:
# print "#", group
# for cat in data_dict[group]:
# print cat, data_dict[group][cat]

return data_dict, max_cov, cov_libs, cov_libs_reads_total
return data_dict, max_cov, cov_lib_dict

def addCovLib(self, covLib):
self.covLibs[covLib.name] = covLib
Expand Down Expand Up @@ -237,26 +279,31 @@ def parseCovs(self, covLibObjs):
self.addCovLib(covLib)
print BtLog.status_d['1'] % (covLib.name, covLib.f)
if covLib.fmt == 'bam' or covLib.fmt == 'sam':

base_cov_dict = {}
if covLib.fmt == 'bam':
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
else:
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))

if covLib.reads_total == 0:
print BtLog.warn_d['4'] % covLib.f

for name, base_cov in base_cov_dict.items():
cov = base_cov / self.dict_of_blobs[name].agct_count
covLib.cov_sum += cov
self.dict_of_blobs[name].addCov(covLib.name, cov)
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])

elif covLib.fmt == 'cas':
cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
if covLib.reads_total == 0:
print BtLog.warn_d['4'] % covLib.f
for name, cov in cov_dict.items():
covLib.cov_sum += cov
self.dict_of_blobs[name].addCov(covLib.name, cov)
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])

elif covLib.fmt == 'cov':
cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
if not len(cov_dict) == self.seqs:
Expand All @@ -267,6 +314,8 @@ def parseCovs(self, covLibObjs):
else:
pass
covLib.mean_cov = covLib.cov_sum/self.seqs
if covLib.cov_sum == 0.0:
BtLog.error('26', covLib.name)
self.covLibs[covLib.name] = covLib


Expand Down Expand Up @@ -295,18 +344,6 @@ def computeTaxonomy(self, taxrules, nodesDB):
blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages)
else:
blObj.taxonomy[taxrule] = BtTax.noHit()

def counts(self):
count_dict = {
'seqs' : self.seqs,
'length' : self.length,
'Ns' : self.n_count,
'AvgCov' : {lib : round(covlibObj.cov_sum/self.seqs, 2) for lib, covlibObj in self.covLibs.items()},
'GC' : round(sum([blObj.gc for blObj in self.dict_of_blobs.values()])/self.seqs, 2),
'MappedReads' : {lib : (covlibObj.reads_mapped) for lib, covlibObj in self.covLibs.items()},
'TotalReads' : {lib : (covlibObj.reads_total) for lib, covlibObj in self.covLibs.items()}
}
print count_dict

def getBlobs(self):
for blObj in [self.dict_of_blobs[key] for key in self.order_of_blobs]:
Expand All @@ -328,8 +365,11 @@ def calculateGC(self, seq):
return float((seq.count('G') + seq.count('C') ) / self.agct_count \
if self.agct_count > 0 else 0.0)

def addCov(self, name, cov):
self.covs[name] = cov
def addCov(self, lib_name, cov):
self.covs[lib_name] = cov

def addReadCov(self, lib_name, read_cov):
self.read_cov[lib_name] = read_cov

def addHits(self, hitLibName, hitDict):
if not hitLibName in self.hits:
Expand All @@ -340,8 +380,8 @@ class CovLibObj():
def __init__(self, name, fmt, f):
self.name = name
self.fmt = fmt
self.f = abspath(f)
self.cov_sum = 0
self.f = abspath(f) if (f) else ''
self.cov_sum = 0.0
self.reads_total = 0
self.reads_mapped = 0
self.mean_cov = 0.0
Expand All @@ -350,7 +390,7 @@ class hitLibObj():
def __init__(self, name, fmt, f):
self.name = name
self.fmt = fmt
self.f = abspath(f)
self.f = abspath(f) if (f) else ''

if __name__ == '__main__':
pass
Loading

0 comments on commit daff0b2

Please sign in to comment.