#!/usr/bin/env python3
""" Reimplementation of Jon's parseBlobTable.R in Python, because I can't be having with R.
Results should be byte-identical with Jon's old code.
See http://gitlab.genepool.private/production-team/qc_tools_python/blob/master/scripts/parseBlobTable.R
"""
import os, sys, re
import logging as L
from copy import deepcopy
from collections import Counter
from contextlib import suppress
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
def read_blob_table(fh, name_extractor):
"""Read the lines from the .blobplot.stats.txt, which is mostly a tab-separated file.
name_extractor must be a function that gets the library name from the full filename
Return (colnames, name_map, datalines)
"""
headerlines = []
colnames = []
datalines = []
for l in fh:
l = l.strip()
# Special case - if the first line simply says 'No data' then there was no data.
if l == 'No data' and not headerlines:
return ([], {}, [])
if not l:
# Not expecting blank lines, but skip them if seen.
continue
if l.startswith('## '):
headerlines.append(l[3:])
elif l.startswith('# '):
assert not colnames # Should only see one such line
colnames = l[2:].split("\t")
else:
datalines.append(l.split("\t"))
assert len(datalines[-1]) == len(colnames)
# Now extract a name mapping dict from the headerlines
# This gets the base of the filename...
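# (Illustrative assumption: a header line like "bam0=/path/to/libA.bam" would give a
#  name_map entry of {"bam0": <whatever name_extractor returns for that line>}.)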
name_map = { l.split('=')[0] : name_extractor(l)
for l in headerlines
if re.match(r'(bam|cov)[0-9]+=', l) }
# There should be no tab chars in any names
assert not any(re.search(r'\t', n) for n in name_map.keys())
assert not any(re.search(r'\t', n) for n in name_map.values())
# Coerce every value in a _read_map column to an integer, and each _read_map_p
# column to a float
for dl in datalines:
for i, colname in enumerate(colnames):
if colname.endswith('_read_map'):
dl[i] = int(dl[i].replace(',',''))
elif colname.endswith('_read_map_p'):
dl[i] = float(dl[i].replace('%',''))
# That's what we need!
return (colnames, name_map, datalines)
def name_extractor_regular(l):
return re.sub(r'(\+.*|\.[^.]*)$', '', l.split('/')[-1])
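# (Hypothetical example of the intent: a final path component of "libA+something.bam"
#  or "libA.bam" would both yield "libA" - everything from the first '+', or else the
#  last extension, is stripped.)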
def name_extractor_hesiod_old(l):
"""Can probably be removed - use to recreate old reports
"""
return '/'.join( l.split('=')[-1].split('/')[-3:-1] )
def name_extractor_hesiod(l):
path_parts = l.split('=')[-1].split('/')
name_parts = path_parts[-1].split('_')
# Now we need to deal with both old and new files, and there's not really a neat way,
# so look for a part which resembles a flowcell ID
if re.match(r"[A-Z]{3}[0-9]{5}$", name_parts[-3]):
return f"{path_parts[-3]} {name_parts[-3]}_{name_parts[-2]}"
else:
return f"{path_parts[-3]} {name_parts[-4]}_{name_parts[-3]} {name_parts[-2]}".rstrip(". ")
def main(args):
L.basicConfig(level=(L.DEBUG if args.debug else L.WARNING))
# Do all the parsing first. The tables won't be large so having everything in
# memory at once is fine.
all_tables = []
for f in args.statstxt:
with open(f) as fh:
ne = globals()['name_extractor_' + args.name_extractor]
all_tables.append(read_blob_table(fh, ne))
# Now I can have a master Matrix
mm = Matrix('taxon', 'lib', numsort=('taxon',))
total_reads_per_lib = Counter()
for colnames, name_map, datalines in all_tables:
# Empty file?
if not datalines: continue
# Since I'm not using a table datatype, make my own index of the column names
colidx = { n: i for i, n in enumerate(colnames) }
# (Comment from Jon)
# Get the actual read numbers so we can calculate a more precise percentage
# than Blobtools provides. Then we can see how close to 0 some of these 0
# percentages are.
#
# Blobtools' percentage values are relative to the original read counts
# (mapped and unmapped). If we want to re-calculate the percentages we need to
# re-infer that original read count. We can use the 'all' percent to do that.
# It won't be completely right due to rounding error in the 'all' percentage
# itself. But it should be close enough. Of course we might find it more
# useful in future to have percentages relative to the mapped reads.
# Having moved the data coercion into the read_blob_table() function, I can now
# just do that calculation. First estimate the total reads per lib based off the
# 'all' row.
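# Worked example of the back-calculation (made-up numbers): if the 'all' row shows
# 900 reads mapped for bam0 at 90.0%, the original total is estimated as
# 900 * 100 / 90.0 = 1000 reads, and the per-taxon percentages are then re-derived
# relative to that estimate.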
all_row, = [ dl for dl in datalines if dl[colidx['name']] == 'all' ]
total_reads_per_bam = Counter()
for n in name_map:
if all_row[colidx[n+'_read_map']] > 0:
total_reads_per_bam[n] = (all_row[colidx[n+'_read_map']] * 100) / all_row[colidx[n+'_read_map_p']]
else:
# This avoids division by zero possibility when there's no data
total_reads_per_bam[n] = 0
# We also keep a master count
total_reads_per_lib[name_map[n]] += total_reads_per_bam[n]
# We've now also verified that all names in the name map have corresponding columns,
# but are all columns present in the name_map?
for colname in colnames:
if colname.endswith('_read_map_p') and colname != 'covsum_read_map_p':
assert name_map[colname[:-len('_read_map_p')]]
# Strip out rows that have name in ['all', 'no-hit', 'undef'] (but not 'other')
datalines = [ dl for dl in datalines
if dl[colidx['name']] not in ['all', 'no-hit', 'undef'] ]
# Now recalculate all the percentages relative to this value, and pop them into
# my master matrix.
for dl in datalines:
for n in name_map:
total_reads = total_reads_per_bam[n]
mapped_reads = dl[colidx[n+'_read_map']]
new_pct = (mapped_reads * 100) / total_reads
# The new value should be close to the old one.
assert abs(dl[colidx[n+'_read_map_p']] - new_pct) < 0.1
# Into the matrix with ye!
# The mm.add() method will object if a value was already present.
mm.add(new_pct, taxon=dl[colidx['name']], lib=name_map[n])
# End of loop through all_tables
# Now prune out taxa with nothing that makes the cutoff - but first see what is
# the highest number. Due to our sorting strategy this will always be in the leftmost
# taxon column.
max_percent = "None"
for tax in mm.list_labels('taxon')[:1]:
max_percent = "{:.{}f}".format(max( mm.get_vector('taxon', tax) ), args.round)
mm.prune('taxon', lambda v: v >= args.cutoff)
# And print the result. As it's TSV and we've already checked for tabs this is safe.
if args.output == '-':
fh = sys.stdout
else:
fh = open(args.output, 'x')
if not mm.list_labels('taxon'):
# Jon had this so I'll (kinda) copy it...
# taxlevel <- unlist(strsplit(basename(files[1]), '\\.'))[2]
try:
taxlevel = args.statstxt[0].split('/')[-1].split('.')[-4]
except IndexError:
taxlevel = "taxon"
print('No {taxlevel} is represented by at least {limit}% of reads (max {max}%)'.format(
taxlevel = taxlevel,
limit = args.cutoff,
max = max_percent ),
file = fh)
else:
# Heading
print('\t'.join( [args.label] +
(["Total Reads"] if args.total_reads else []) +
mm.list_labels('taxon') ),
file=fh)
# Rows
for lib in mm.list_labels('lib'):
print('\t'.join( [lib] +
(["{:.0f}".format(total_reads_per_lib[lib])] if args.total_reads else []) +
[ "{:.{}f}".format(v, args.round) for v in mm.get_vector('lib', lib) ] ),
file=fh)
# And done
fh.close()
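# The resulting TSV looks roughly like this (hypothetical libraries, taxa and numbers;
# the "Total Reads" column only appears with --total_reads):
#   Library   Total Reads   Proteobacteria   Firmicutes
#   libA      1000000       12.34            1.05
#   libB      2000000       8.76             2.10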
class Matrix:
"""A lightweight 2D matrix suitable for my porpoises.
Yes I could use a Pandas DataFrame and copy the R logic but I don't see the need to
depend on Pandas (which is big) and also I'd still need to define the sort/prune logic
which is the trickiest bit of the code.
See test_blob_matrix.py for example usage featuring cheese.
"""
def __init__(self, colname='x', rowname='y', numsort=(), empty=0.0):
# What to call the axes, if not X and Y
self._colname = colname
self._rowname = rowname
self._col_numsort = self._colname in numsort
self._row_numsort = self._rowname in numsort
self._empty = empty
self._data = dict()
# We need to keep an explicit list of row labels or else pruning a column
# may cause a row to vanish and we don't want this.
self._rowlabels = set()
def copy(self):
# This works. I assume the stored values won't be mutable
# types.
return deepcopy(self)
def add_overwrite(self, val, **kwargs):
"""Add a value to the matrix
"""
if len(kwargs) > 2:
# All other cases will trigger a KeyError below.
raise IndexError("Too many kwargs")
if type(val) != type(self._empty):
raise TypeError("{} is not a {}".format(val, type(self._empty)))
# I tried a defaultdict but it causes problems - ie.
# if the caller tries to retrieve an unknown column it springs into
# existence.
self._data.setdefault(kwargs[self._colname], dict())[kwargs[self._rowname]] = val
# Add to _rowlabels
self._rowlabels.add(kwargs[self._rowname])
def add(self, val, **kwargs):
"""Add a value but check it's not already set.
This is normally what we want.
"""
if kwargs[self._rowname] in self._data.get(kwargs[self._colname], {}):
raise KeyError(f"already in matrix - {kwargs}")
self.add_overwrite(val, **kwargs)
def list_labels(self, axis):
"""List all the labels for either the rows or columns.
Sort order will depend on the options specified at object creation.
"""
if axis == self._colname:
return self._list_col_labels()
elif axis == self._rowname:
return self._list_row_labels()
else:
raise KeyError("The axis may be {} or {}.".format(self._colname, self._rowname))
def _list_col_labels(self):
# Always sort by name first.
res = sorted(self._data)
if self._col_numsort:
# The labels with the highest max values come first
res.sort( reverse = True,
key = lambda cl:
max(self._data[cl].get(rl, self._empty) for rl in self._rowlabels) )
return res
def _list_row_labels(self):
# Always sort by name first.
res = sorted(self._rowlabels)
if self._row_numsort:
res.sort( reverse = True,
key = lambda rl:
max(self._data[cl].get(rl, self._empty) for cl in self._data) )
return res
def prune(self, axis, func):
"""Prune out rows or columns where no item passes the test.
Note that empty cells are also checked.
func : a function on type(this.empty) => bool
"""
if axis == self._colname:
# Scan through columns
for cl in list(self._data):
if not any( func(self._data[cl].get(rl, self._empty))
for rl in self._rowlabels ):
del self._data[cl]
elif axis == self._rowname:
# Actually turns out to be simpler. Scan through rows
for rl in list(self._rowlabels):
if not any( func(self._data[cl].get(rl, self._empty))
for cl in self._data ):
for cl in self._data:
with suppress(KeyError):
del self._data[cl][rl]
self._rowlabels.remove(rl)
else:
raise KeyError("The axis may be {} or {}.".format(self._colname, self._rowname))
def get_vector(self, axis, label):
"""Get a whole row or column
"""
# Opposite to list_labels since to fetch a whole row I need the column
# labels and vice versa.
if axis == self._colname:
return [ self._data[label].get(rl, self._empty) for
rl in self._list_row_labels() ]
elif axis == self._rowname:
return [ self._data[cl].get(label, self._empty) for
cl in self._list_col_labels() ]
else:
raise KeyError("The axis may be {} or {}.".format(self._colname, self._rowname))
def parse_args(*args):
description = """ Takes one or more .blobplot.stats.txt files and makes a TSV table
of the most common taxa by BAM (or COV) file.
"""
argparser = ArgumentParser( description=description,
formatter_class = ArgumentDefaultsHelpFormatter )
argparser.add_argument("-o", "--output", default='-',
help="Where to save the result. Defaults to stdout.")
argparser.add_argument("-c", "--cutoff", type=float, default="1.0",
help="Taxa found at less than cutoff%% in all samples will not be shown.")
argparser.add_argument("-l", "--label", default="Library",
help="Label to put on the first column")
argparser.add_argument("-t", "--total_reads", action="store_true",
help="Add a 'Total Reads' column.")
argparser.add_argument("-r", "--round", type=int, default=2,
help="Number of decimals to keep in floats.")
argparser.add_argument("-n", "--name_extractor", default='hesiod',
help="Method to extract Library names from the header lines. May be regular or hesiod.")
argparser.add_argument("statstxt", nargs="+",
help="One or more input files to scan.")
argparser.add_argument("-d", "--debug", action="store_true",
help="Print more verbose debugging messages.")
return argparser.parse_args(*args)
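# Example invocation (hypothetical file names):
#   parse_blob_table.py -c 1.0 --total_reads -o blob_table.tsv \
#       libA.blobplot.stats.txt libB.blobplot.stats.txt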
if __name__ == "__main__":
main(parse_args())