Merge pull request #11 from DRL/development

Development
DRL · Feb 16, 2016 · daff0b2 · daff0b2
2 parents 3be2abb + 05cd61f
commit daff0b2
Show file tree

Hide file tree

Showing 10 changed files with 237 additions and 151 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 testing/
+testing_fly/
 *.json
 *.pyc
 *.txt

diff --git a/comparecov.py b/comparecov.py
@@ -21,8 +21,7 @@
 
         -p, --plotgroups INT        Number of (taxonomic) groups to plot, remaining 
                                      groups are placed in 'other' [default: 7]
-        -r, --rank RANK             Taxonomic rank used for colouring of blobs [default: superkingdom]
-                                     (Supported: superkingdom) 
+        -r, --rank RANK             Taxonomic rank used for colouring of blobs [default: phylum]
         -x, --taxrule TAXRULE       Taxrule which has been used for computing taxonomy 
                                      (Supported: bestsum, bestsumorder) [default: bestsum]
         --sort <ORDER>              Sort order for plotting [default: span]

diff --git a/create.py b/create.py
@@ -36,64 +36,19 @@
 import lib.BtCore as bt
 import lib.BtLog as BtLog
 import lib.BtIO as BtIO
+import lib.BtInput as BtInput
 import os.path
 
 
 if __name__ == '__main__':
-    ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']
+
     main_dir = os.path.dirname(__file__)
     #print data_dir
     args = docopt(__doc__)
     #print args
-    fasta_f = args['--infile']
-    fasta_type = args['--type']
 
-    sam_fs = args['--sam']
-    bam_fs = args['--bam']
-    cov_fs = args['--cov']
-    cas_fs = args['--cas']
-    hit_fs = args['--taxfile']
-
-    out_f = args['--out']
-    if (out_f):
-        out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
-    else:
-        out_f = "%s" % ("BlobDB.json")
-    nodesDB_f = args['--db']
-    names_f = args['--names']
-    nodes_f = args['--nodes']
-    taxrules = args['--taxrule']
-    title = args['--title'] if (args['--title']) else out_f
-
-
-    # Do files exist ?
-    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
-    for f in files:
-        if not os.path.isfile(f):
-            BtLog.error('0', f)
-
-    # Is taxonomy provided?
-    if nodesDB_f == "data/nodesDB.txt":
-        nodesDB_f = os.path.join(main_dir, nodesDB_f)
-    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
-        BtLog.error('3')
-
-    if not (hit_fs):
-        BtLog.error('18')
-
-    # can FASTA parser deal with assemblies
-    if not fasta_type in ASSEMBLY_TYPES:
-        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
-
-    # Is coverage provided?
-    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
-        BtLog.error('1')
+    title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f = BtInput.validate_input_create(main_dir, args)
 
-    cov_libs = [bt.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
-               [bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
-               [bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
-               [bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)] 
-
     # Create BlobDB object              
     blobDb = bt.BlobDb(title)
 
@@ -103,8 +58,7 @@
     blobDb.parseCovs(cov_libs)
 
     # Parse Tax
-    hitLibs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]
-    blobDb.parseHits(hitLibs)
+    blobDb.parseHits(hit_libs)
 
     # Parse nodesDB
     nodesDB, nodesDB_f = BtIO.getNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f)

diff --git a/lib/BtCore.py b/lib/BtCore.py
@@ -107,26 +107,60 @@ def dump(self):
 
     def load(self, BlobDb_f):
         blobDict = BtIO.readJson(BlobDb_f)
-        self.title = blobDict['title']
-        self.assembly_f = blobDict['assembly_f']
-        self.nodesDB_f = blobDict['nodesDB_f']
-        self.lineages = blobDict['lineages']
+        for k, v in blobDict.items():
+            setattr(self, k, v)
         self.set_of_taxIds = blobDict['lineages'].keys()
-        self.order_of_blobs = blobDict['order_of_blobs']
-        self.dict_of_blobs = blobDict['dict_of_blobs'] 
-        self.length = int(blobDict['length'])
-        self.seqs = int(blobDict['seqs'])
-        self.n_count = int(blobDict['n_count'])
-        self.covLibs = blobDict['covLibs']
-        self.hitLibs = blobDict['hitLibs']
-        self.taxrules = blobDict['taxrules']
+        #for k, v in self.__dict__.items(): 
+        #    print k, type(v), v # this seems to work
+
+        #self.title = blobDict['title']
+        #self.assembly_f = blobDict['assembly_f']
+        #self.nodesDB_f = blobDict['nodesDB_f']
+        #self.lineages = blobDict['lineages']
+        #self.set_of_taxIds = blobDict['lineages'].keys()
+        #self.order_of_blobs = blobDict['order_of_blobs']
+        #self.dict_of_blobs = blobDict['dict_of_blobs'] 
+        #self.length = int(blobDict['length'])
+        #self.seqs = int(blobDict['seqs'])
+        #self.n_count = int(blobDict['n_count'])
+        #self.covLibs = blobDict['covLibs']
+        #self.hitLibs = blobDict['hitLibs']
+        #self.taxrules = blobDict['taxrules']
+
+        # self.title = title
+        # self.assembly_f = ''
+        # self.dict_of_blobs = {}  
+        # self.order_of_blobs = []
+        # self.set_of_taxIds = set()
+        # self.lineages = {}
+        # self.length = 0
+        # self.seqs = 0
+        # self.n_count = 0
+        # self.covLibs = {} 
+        # self.hitLibs = {}
+        # self.nodesDB_f = ''
+        # self.taxrules = []
 
     def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
         data_dict = {}
         read_cov_dict = {}
         max_cov = 0.0
-        cov_libs = self.covLibs.keys()
-        cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
+        print self.covLibs
+        cov_lib_dict = self.covLibs
+        #print cov_lib_dict
+        cov_lib_names_l = self.covLibs.keys() # does not include cov_sum
+
+        if len(cov_lib_names_l) > 1:
+            # more than one cov_lib, cov_sum_lib has to be created
+            cov_lib_dict['sum'] = CovLibObj('sum', 'sum', None).__dict__ # ugly
+            cov_lib_dict['sum']['reads_total'] = sum([cov_lib_dict[x]['reads_total'] for x in cov_lib_dict])
+            cov_lib_dict['sum']['reads_mapped'] = sum([cov_lib_dict[x]['reads_mapped'] for x in cov_lib_dict])
+
+        #print self.covLibs
+        #cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
+        #print cov_libs_reads_total # correct
+        #cov_libs_reads_mapped = {cov_lib : data['reads_mapped'] for cov_lib, data in self.covLibs.items()}
+        #print cov_libs_reads_mapped # correct
 
         for blob in self.dict_of_blobs.values():
             name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
@@ -143,21 +177,19 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
                                     'name' : [], 
                                     'length' : [], 
                                     'gc' : [], 
-                                    'covs' : {covLib : [] for covLib in cov_libs}, 
-                                    'reads_mapped' : {covLib : 0 for covLib in cov_libs},
+                                    'covs' : {covLib : [] for covLib in cov_lib_dict.keys()},           # includes cov_sum if it exists 
+                                    'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()},    # includes cov_sum if it exists
                                     'count' : 0,
                                     'count_hidden' : 0,
                                     'count_visible' : 0,
                                     'span': 0, 
                                     'span_hidden' : 0, 
                                     'span_visible' : 0,
                                     }
-                if len(cov_libs) > 1:
-                    data_dict[group]['covs']['cov_sum'] = []
-                    data_dict[group]['reads_mapped']['cov_sum'] = 0
 
             data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
             data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
+
             if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
                 data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
                 data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
@@ -170,35 +202,45 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
 
                 cov_sum = 0.0
                 reads_mapped_sum = 0
-                for cov_lib in sorted(cov_libs):
-                    cov = float(blob['covs'][cov_lib]) 
-                    cov_sum += cov
-                    cov = cov if cov > 0.02 else 0.02
+                for cov_lib in sorted(cov_lib_names_l):
+                    cov = float(blob['covs'][cov_lib])
+                    if cov < 0.02:
+                        cov = 0.02
+                    #cov = cov if cov > 0.02 else 0.02
+                    # increase max_cov
                     if cov > max_cov:
                         max_cov = cov
-                    data_dict[group]['covs'][cov_lib].append(cov)
-                    if cov_lib in blob['read_cov']:
+
+                    # add cov of blob to group
+                    data_dict[group]['covs'][cov_lib].append(cov) 
+
+                    cov_sum += cov
+
+                    # add readcov
+                    if cov_lib in blob['read_cov']:                 
                         reads_mapped = blob['read_cov'][cov_lib]
-                        reads_mapped_sum += reads_mapped
                         data_dict[group]['reads_mapped'][cov_lib] += reads_mapped  
+                        reads_mapped_sum += reads_mapped
 
-                if len(cov_libs) > 1:
-                    cov_sum = cov_sum if cov_sum > 0.02 else 0.02
-                    data_dict[group]['covs']['cov_sum'].append(cov_sum)
-                    if cov > max_cov:
-                        max_cov = cov
+                if len(cov_lib_names_l) > 1:
+                    if cov_sum < 0.02 :
+                        cov_sum = 0.02
+                    data_dict[group]['covs']['sum'].append(cov_sum)
+                    if cov_sum > max_cov:
+                        max_cov = cov_sum
                     if (reads_mapped_sum):
-                        data_dict[group]['reads_mapped']['cov_sum'] += reads_mapped_sum
-
-                #data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
-                #data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
+                        data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
+        #if len(cov_lib_names_l) > 1:
+        #    for cov_lib, data in self.covLibs.items():
+        #        cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total'] 
+
 
-        if len(cov_libs) > 1:
-            cov_libs.append('cov_sum')
-            for cov_lib, data in self.covLibs.items():
-                cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total'] 
+        #for group in data_dict:
+        #    print "#", group
+        #    for cat in data_dict[group]:
+        #        print cat, data_dict[group][cat]
 
-        return data_dict, max_cov, cov_libs, cov_libs_reads_total
+        return data_dict, max_cov, cov_lib_dict
 
     def addCovLib(self, covLib):
         self.covLibs[covLib.name] = covLib
@@ -237,26 +279,31 @@ def parseCovs(self, covLibObjs):
             self.addCovLib(covLib)
             print BtLog.status_d['1'] % (covLib.name, covLib.f)
             if covLib.fmt == 'bam' or covLib.fmt == 'sam':
+
                 base_cov_dict = {}
                 if covLib.fmt == 'bam':
                     base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
                 else:
-                    base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))    
+                    base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
+
                 if covLib.reads_total == 0:
                     print BtLog.warn_d['4'] % covLib.f
+
                 for name, base_cov in base_cov_dict.items():
                     cov = base_cov / self.dict_of_blobs[name].agct_count
                     covLib.cov_sum += cov
                     self.dict_of_blobs[name].addCov(covLib.name, cov)
-                    self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
+                    self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])
+
             elif covLib.fmt == 'cas':
                 cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
                 if covLib.reads_total == 0:
                     print BtLog.warn_d['4'] % covLib.f
                 for name, cov in cov_dict.items():
                     covLib.cov_sum += cov
                     self.dict_of_blobs[name].addCov(covLib.name, cov)
-                    self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
+                    self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])
+
             elif covLib.fmt == 'cov':
                 cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
                 if not len(cov_dict) == self.seqs:
@@ -267,6 +314,8 @@ def parseCovs(self, covLibObjs):
             else:
                 pass        
             covLib.mean_cov = covLib.cov_sum/self.seqs
+            if covLib.cov_sum == 0.0:
+                BtLog.error('26', covLib.name)
             self.covLibs[covLib.name] = covLib
 
 
@@ -295,18 +344,6 @@ def computeTaxonomy(self, taxrules, nodesDB):
                     blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages)
                 else:
                     blObj.taxonomy[taxrule] = BtTax.noHit()
-
-    def counts(self):
-        count_dict = {
-            'seqs'     : self.seqs,
-            'length'   : self.length,
-            'Ns'       : self.n_count,
-            'AvgCov'   : {lib : round(covlibObj.cov_sum/self.seqs, 2) for lib, covlibObj in self.covLibs.items()},
-            'GC'       : round(sum([blObj.gc for blObj in self.dict_of_blobs.values()])/self.seqs, 2),
-            'MappedReads' : {lib : (covlibObj.reads_mapped) for lib, covlibObj in self.covLibs.items()},
-            'TotalReads' : {lib : (covlibObj.reads_total) for lib, covlibObj in self.covLibs.items()}
-        }
-        print count_dict
 
     def getBlobs(self):
         for blObj in [self.dict_of_blobs[key] for key in self.order_of_blobs]:
@@ -328,8 +365,11 @@ def calculateGC(self, seq):
         return float((seq.count('G') + seq.count('C') ) / self.agct_count \
                      if self.agct_count > 0 else 0.0)
 
-    def addCov(self, name, cov):
-        self.covs[name] = cov
+    def addCov(self, lib_name, cov):
+        self.covs[lib_name] = cov
+
+    def addReadCov(self, lib_name, read_cov):
+        self.read_cov[lib_name] = read_cov
 
     def addHits(self, hitLibName, hitDict):
         if not hitLibName in self.hits:
@@ -340,8 +380,8 @@ class CovLibObj():
     def __init__(self, name, fmt, f):
         self.name = name
         self.fmt = fmt
-        self.f = abspath(f)
-        self.cov_sum = 0
+        self.f = abspath(f) if (f) else ''
+        self.cov_sum = 0.0
         self.reads_total = 0
         self.reads_mapped = 0
         self.mean_cov = 0.0
@@ -350,7 +390,7 @@ class hitLibObj():
     def __init__(self, name, fmt, f):
         self.name = name
         self.fmt = fmt
-        self.f = abspath(f)
+        self.f = abspath(f) if (f) else ''
 
 if __name__ == '__main__':
     pass