diff --git a/metaphlan/metaphlan.py b/metaphlan/metaphlan.py
index d706c36c..6f00772e 100755
--- a/metaphlan/metaphlan.py
+++ b/metaphlan/metaphlan.py
@@ -3,8 +3,8 @@
               'Nicola Segata (nicola.segata@unitn.it), '
               'Duy Tin Truong, '
               'Francesco Asnicar (f.asnicar@unitn.it)')
-__version__ = '3.0.4'
-__date__ = '23 Sep 2020'
+__version__ = '3.0.5'
+__date__ = '10 Nov 2020'
 
 import sys
 try:
@@ -422,12 +422,18 @@ def run_bowtie2(fna_in, outfmt6_out, bowtie2_db, preset, nproc, min_mapq_val, fi
         p.communicate()
         read_fastx_stderr = readin.stderr.readlines()
         nreads = None
+        avg_read_length = None
         try:
-            nreads = int(read_fastx_stderr[0])
+            nreads, avg_read_length = list(map(float, read_fastx_stderr[0].decode().split()))
             if not nreads:
                 sys.stderr.write('Fatal error running MetaPhlAn. Total metagenome size was not estimated.\nPlease check your input files.\n')
                 sys.exit(1)
-            outf.write(lmybytes('#nreads\t{}'.format(nreads)))
+            if not avg_read_length:
+                sys.stderr.write('Fatal error running MetaPhlAn. The average read length was not estimated.\nPlease check your input files.\n')
+                sys.exit(1)
+
+            outf.write(lmybytes('#nreads\t{}\n'.format(int(nreads))))
+            outf.write(lmybytes('#avg_read_length\t{}'.format(avg_read_length)))
             outf.close()
         except ValueError:
             sys.stderr.write(b''.join(read_fastx_stderr).decode())
@@ -457,6 +463,7 @@ class TaxClade:
     perc_nonzero = None
     quantile = None
     avoid_disqm = False
+    avg_read_length = 1
 
     def __init__( self, name, tax_id, uncl = False):
         self.children, self.markers2nreads = {}, {}
@@ -500,7 +507,7 @@ def get_full_name( self ):
         return "|".join(fullname[1:])
 
     def get_normalized_counts( self ):
-        return [(m,float(n)*1000.0/self.markers2lens[m])
+        return [(m,float(n)*1000.0/(self.markers2lens[m] - self.avg_read_length +1) )
                     for m,n in self.markers2nreads.items()]
 
     def compute_mapped_reads( self ):
@@ -577,24 +584,24 @@ def compute_abundance( self ):
             elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']):
                 loc_ab = nrawreads / rat if rat >= 0 else 0.0
             elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']):
-                loc_ab = np.mean([float(n)/r for r,n in rat_nreads])
+                loc_ab = np.mean([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])
             elif self.stat == 'tavg_g':
-                wnreads = sorted([(float(n)/r,r,n) for r,n in rat_nreads], key=lambda x:x[0])
+                wnreads = sorted([(float(n)/(r-self.avg_read_length+1),(r - self.avg_read_length+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
                 den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
                 loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
             elif self.stat == 'tavg_l':
-                loc_ab = np.mean(sorted([float(n)/r for r,n in rat_nreads])[ql:qr])
+                loc_ab = np.mean(sorted([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])[ql:qr])
             elif self.stat == 'wavg_g':
                 vmin, vmax = nreads_v[ql], nreads_v[qr]
                 wnreads = [vmin]*qn+list(nreads_v[ql:qr])+[vmax]*qn
                 loc_ab = float(sum(wnreads)) / rat
             elif self.stat == 'wavg_l':
-                wnreads = sorted([float(n)/r for r,n in rat_nreads])
+                wnreads = sorted([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])
                 vmin, vmax = wnreads[ql], wnreads[qr]
                 wnreads = [vmin]*qn+list(wnreads[ql:qr])+[vmax]*qn
                 loc_ab = np.mean(wnreads)
             elif self.stat == 'med':
-                loc_ab = np.median(sorted([float(n)/r for r,n in rat_nreads])[ql:qr])
+                loc_ab = np.median(sorted([float(n)/(r - self.avg_read_length +1) for r,n in rat_nreads])[ql:qr])
 
         self.abundance = loc_ab
         if rat < self.min_cu_len and self.children:
@@ -628,6 +635,7 @@ def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ):
         TaxClade.markers2lens = self.markers2lens
         TaxClade.markers2exts = self.markers2exts
         TaxClade.taxa2clades = self.taxa2clades
+        self.avg_read_length = 1
 
         for clade, value in mpa['taxonomy'].items():
             clade = clade.strip().split("|")
@@ -674,11 +682,12 @@ def add_lens( node ):
     def set_min_cu_len( self, min_cu_len ):
         TaxClade.min_cu_len = min_cu_len
 
-    def set_stat( self, stat, quantile, perc_nonzero, avoid_disqm = False):
+    def set_stat( self, stat, quantile, perc_nonzero, avg_read_length, avoid_disqm = False):
         TaxClade.stat = stat
         TaxClade.perc_nonzero = perc_nonzero
         TaxClade.quantile = quantile
         TaxClade.avoid_disqm = avoid_disqm
+        TaxClade.avg_read_length = avg_read_length
 
     def add_reads( self, marker, n,
                    add_viruses = False,
@@ -794,11 +803,14 @@ def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=
 
     reads2markers = {}
     n_metagenome_reads = None
+    avg_read_length = 1 #Set to 1 if it is not calculated from read_fastx
 
     if input_type == 'bowtie2out':
         for r, c in ras(inpf):
             if r.startswith('#') and 'nreads' in r:
                 n_metagenome_reads = int(c)
+            if r.startswith('#') and 'avg_read_length' in r:
+                avg_read_length = float(c)
             else:
                 reads2markers[r] = c
     elif input_type == 'sam':
@@ -816,7 +828,7 @@ def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=
     for r, m in reads2markers.items():
         markers2reads[m].add(r)
 
-    return (markers2reads, n_metagenome_reads)
+    return (markers2reads, n_metagenome_reads, avg_read_length)
 
 def maybe_generate_biom_file(tree, pars, abundance_predictions):
     json_key = "MetaPhlAn"
@@ -994,9 +1006,11 @@ def main():
     REPORT_MERGED = mpa_pkl.get('merged_taxon',False)
     tree = TaxTree( mpa_pkl, ignore_markers )
     tree.set_min_cu_len( pars['min_cu_len'] )
-    tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], pars['avoid_disqm'])
+
+    markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])
+
+    tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
 
-    markers2reads, n_metagenome_reads = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])
     if no_map:
         os.remove( pars['inp'] )
 
diff --git a/metaphlan/utils/read_fastx.py b/metaphlan/utils/read_fastx.py
index b4c735cf..b3b248a9 100755
--- a/metaphlan/utils/read_fastx.py
+++ b/metaphlan/utils/read_fastx.py
@@ -55,75 +55,75 @@ def fopen(fn):
 
 def read_and_write_raw_int(fd, min_len=None):
     fmt = None
+    avg_read_length = 0
     nreads = 0
     discarded = 0
-    if min_len:
-        r = []
+    #if min_len:
+    r = []
 
-        while True:
-            l = fd.readline()
-
-            if not fmt:
-                fmt = fastx(l)
-                parser = FastqGeneralIterator if fmt == 'fastq' else SimpleFastaParser
-                readn = 4 if fmt == 'fastq' else 2
-
-            r.append(l)
-
-            if len(r) == readn:
-                break
-
-        for record in parser(uio.StringIO("".join(r))):
-            if readn == 4:
-                description, sequence, qual = record
-            else:
-                qual = None
-                description, sequence = record
-
-            if len(sequence) >= min_len:
-                description = ignore_spaces(description, forced=True)
-                _ = sys.stdout.write(print_record(description, sequence, qual, fmt))
-            else:
-                discarded = discarded + 1
-
-        for idx, record in enumerate(parser(fd),2):
-            if readn == 4:
-                description, sequence, qual = record
-            else:
-                qual = None
-                description, sequence = record
-
-            if len(sequence) >= min_len:
-                description = ignore_spaces(description, forced=True)
-                _ = sys.stdout.write(print_record(description, sequence, qual, fmt))
-            else:
-                discarded = discarded + 1
-    else:
-        for idx, l in enumerate(fd,1):
-            _ = sys.stdout.write(ignore_spaces(l))
-
-        #Read again the first line of the file to determine if is a fasta or a fastq
-        fd.seek(0)
+    while True:
         l = fd.readline()
-        readn = 4 if fastx(l) == 'fastq' else 2
-        idx = idx // readn
+
+        if not fmt:
+            fmt = fastx(l)
+            parser = FastqGeneralIterator if fmt == 'fastq' else SimpleFastaParser
+            readn = 4 if fmt == 'fastq' else 2
+
+        r.append(l)
+
+        if len(r) == readn:
+            break
+
+    for record in parser(uio.StringIO("".join(r))):
+        if readn == 4:
+            description, sequence, qual = record
+        else:
+            qual = None
+            description, sequence = record
+
+        if len(sequence) >= min_len:
+            description = ignore_spaces(description, forced=True)
+            avg_read_length = len(sequence) + avg_read_length
+            _ = sys.stdout.write(print_record(description, sequence, qual, fmt))
+        else:
+            discarded = discarded + 1
+
+    for idx, record in enumerate(parser(fd),2):
+        if readn == 4:
+            description, sequence, qual = record
+        else:
+            qual = None
+            description, sequence = record
+
+        if len(sequence) >= min_len:
+            avg_read_length = len(sequence) + avg_read_length
+            description = ignore_spaces(description, forced=True)
+            _ = sys.stdout.write(print_record(description, sequence, qual, fmt))
+        else:
+            discarded = discarded + 1
+    # else:
+    #     for idx, l in enumerate(fd,1):
+    #         avg_read_length = len(l) + avg_read_length
+    #         _ = sys.stdout.write(ignore_spaces(l))
 
     nreads = idx - discarded
-    return nreads
+    avg_read_length /= nreads
+    return (nreads, avg_read_length)
 
 def read_and_write_raw(fd, opened=False, min_len=None):
     if opened: #fd is stdin
-        nreads = read_and_write_raw_int(fd, min_len=min_len)
+        nreads, avg_read_length = read_and_write_raw_int(fd, min_len=min_len)
     else:
         with fopen(fd) as inf:
-            nreads = read_and_write_raw_int(inf, min_len=min_len)
-    return nreads
+            nreads, avg_read_length = read_and_write_raw_int(inf, min_len=min_len)
+    return (nreads, avg_read_length)
 
 def main():
-    min_len = None
+    min_len = 0
     args = []
     nreads = None
+    avg_read_length = None
 
     if len(sys.argv) > 1:
         for l in sys.argv[1:]:
@@ -140,10 +140,10 @@ def main():
             args.append(l)
 
     if len(args) == 0:
-        nreads = read_and_write_raw(sys.stdin, opened=True, min_len=min_len)
+        nreads, avg_read_length = read_and_write_raw(sys.stdin, opened=True, min_len=min_len)
     else:
         files = []
-        nreads = 0
+        avg_read_length, nreads = 0, 0
         for a in args:
             for f in a.split(','):
                 if os.path.isdir(f):
@@ -152,10 +152,13 @@ def main():
                     files += [f]
 
         for f in files:
-            nreads += read_and_write_raw(f, opened=False, min_len=min_len)
+            f_nreads, f_avg_read_length = read_and_write_raw(f, opened=False, min_len=min_len)
+            nreads += f_nreads
+            avg_read_length += f_avg_read_length
+        avg_read_length = avg_read_length / len(files)
 
-    if nreads:
-        sys.stderr.write(str(nreads))
+    if nreads and avg_read_length:
+        sys.stderr.write('{}\t{}'.format(nreads,avg_read_length) )
     else:
         exit(1)
 
diff --git a/setup.py b/setup.py
index aaf9c458..ec746f09 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 
 setuptools.setup(
     name='MetaPhlAn',
-    version='3.0.4',
+    version='3.0.5',
     author='Francesco Beghini',
     author_email='francesco.beghini@unitn.it',
     url='http://github.com/biobakery/MetaPhlAn/',
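
Note on the normalization introduced above (not part of the patch itself): throughout get_normalized_counts() and compute_abundance(), the 3.0.5 diff divides marker read counts by an effective marker length, marker_len - avg_read_length + 1, instead of the raw marker length, since a read of average length L can only start at that many positions along a marker. The average read length is computed by read_fastx.py, written into the bowtie2out file as an #avg_read_length header by run_bowtie2(), and read back by map2bbh(); it defaults to 1, which recovers the old raw-length behaviour. A minimal, self-contained sketch of the correction, with illustrative names (marker_depth, n_mapped, marker_len, avg_read_len) that are not MetaPhlAn's own:

    # Minimal sketch of the effective-length correction; names are
    # illustrative, not MetaPhlAn internals.
    def marker_depth(n_mapped, marker_len, avg_read_len=1.0):
        """Depth of a marker: mapped reads divided by the number of positions
        where a read of average length can start (marker_len - avg_read_len + 1).
        With avg_read_len == 1 this reduces to the pre-3.0.5 raw-length division."""
        effective_len = marker_len - avg_read_len + 1
        return float(n_mapped) / effective_len

    # Example: 120 reads of ~150 bp mapped to a 1200 bp marker.
    print(marker_depth(120, 1200, avg_read_len=150))  # ~0.114 (effective length 1051)
    print(marker_depth(120, 1200))                    # 0.100  (old behaviour)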