-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbcl_direct_reader.py
374 lines (297 loc) · 15.4 KB
/
bcl_direct_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python3
"""
A module to grab sequence reads direct from the .bcl and .filter files
outputted by Illumina. The motivation is that for some QC tasks we want
to grab a small subsample of reads, and getting these from the FASTQ is
very inefficient, especially once everything is demultiplexed and zipped.
We already have a C implementation of this in the bcl2fastq source and
a Java implementation in Picard Tools, but the world needs Python.
We assume not only the BCL format but also the standard directory
layout as specified in
https://support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf
This module can take advantage of the fact that the BCL files have a fixed record length
and use seek() to jump to the location where the position of interest is stored.
Unfortunately for GZipped files this does not give much of an advantage, as the file
must be decompressed internally to perform the seek(). For reading several sequences
at once a simple load to memory turns out to be faster, so the reader will do
that.
For max efficiency you should call get_seqs() just once per tile with
all the locations you want to extract.
Synopsis:
proj = BCLReader("/your/project/dir")
tile = proj.get_tile(1, 1101)
all_seqs = tile.get_seqs([70657,70658,70659], start=20, end=40)
seq1, flag1 = all_seqs[70567]
seq2, flag2 = all_seqs[70568]
seq3, flag3 = all_seqs[70569]
how_many_valid = sum([flag1,flag2,flag3])
On 24th Oct 2017:
We'd also like this module to be able to read .cbcl files, which are concatenated BCL files
(aka. indexed gzip files). Reading these efficiently might require changing the API a little.
"""
__version__ = 1.2
__author__ = 'Tim Booth, Edinburgh Genomics <[email protected]>'
import os, sys, re
import struct
import gzip
# This now works only in Python3 - byte semantics are totally different
assert sys.version >= '3'
# Callers may find these constants useful when dealing with results.
SEQUENCE = 0
QUAL_FLAG = 1
class BCLReader(object):
def __init__(self, location="."):
"""Creates a BCLReader instance that reads from a single run.
location: The top level data directory for the run.
This should be the one that contains the Data directory and the
RunInfo.xml file.
"""
# Just check that we can read the expected files at this
# location.
basecalls_dir = os.listdir( os.path.join(location, "Data", "Intensities", "BaseCalls") )
self.lanes = [ d for d in basecalls_dir if re.match('L\d\d\d$', d) ]
self.location = location
def get_seq(self, lane, tile, cluster_index, start=0, end=None):
"""Fetches a single sequence from a specified tile.
lane: lane number (see get_tile)
tile: tile number (see get_tile)
cluster_index: cluster number counting from 0. To determine
this from the standard position co-ordinates you need to
parse the .locs file for the run.
** This is going to be very inefficient for fetching multiple sequences. **
Use get_tile(...).get_seqs([0,2,4,6]) instead.
"""
tile = self.get_tile(lane, tile)
result = tile.get_seqs([cluster_index], start, end)
# return nuc_string, flag
return result[cluster_index]
def get_tile(self, lane, tile, in_memory=False):
"""Opens a tile for reading. You can then call get_seqs() to actually
fetch the data.
Lane and tile should be specified as per the Illumina file structure,
so lanes are 1 to 8 and tiles are eg. [12][12]{01-28} (for HiSeq 4000).
"""
lane_dir = str(lane)
if lane_dir not in self.lanes:
lane_dir = 'L%03d' % int(lane_dir)
data_dir = os.path.join(self.location, "Data", "Intensities", "BaseCalls", lane_dir)
if in_memory:
raise RuntimeError("Preloading into memory not implemented yet")
return Tile(data_dir, tile)
class Tile(object):
def __init__(self, data_dir, tile):
"""Fetches sequences from a single tile.
You would not normally instantiate these directly. Create a
BCLReader and call get_tile() instead.
"""
# Find the file prefix I need to be looking at. Could infer it
# from the lane number but instead I'll do it by looking for the
# matching .filter file
self.bcl_filename = None
self.data_dir = data_dir
self.tile = tile
data_dir_listing = os.listdir(data_dir)
for filt in data_dir_listing:
amatch = re.match('(.+_%s).filter' % tile, filt)
if amatch:
self.bcl_filename = amatch.group(1) + '.bcl.gz'
self.filter_file = os.path.join(data_dir, filt)
break
if not self.bcl_filename:
raise RuntimeError("Cannot find a .filter file for tile %s" % tile)
# The CBCL filename is in the format "L00{lane}_{surface}", where the prefix
# should have the same name as the data_dir, and the surface is the first
# digit of the tile name.
self.cbcl_filename = "%s_%s.cbcl" % (os.path.basename(data_dir), str(tile)[0])
# Also work out the number of cycle folders. Should be 308
# for HiSeq 4000
cycle_dirs = [ f for f in data_dir_listing if re.match('C\d+.1$', f) ]
self.num_cycles = len(cycle_dirs)
# And also the number of clusters, which should be 4309650
# for HiSeq 4000. We need to snag this from the top of the .filter file
with open(self.filter_file, 'rb') as filt_fh:
filt_header = struct.unpack('<III', filt_fh.read(12))
# The first word should be 0, and the version byte is 3, at least
# on my test files.
assert tuple(filt_header[0:2]) == (0, 3)
self.num_clusters = filt_header[2]
# For now, don't read the rest of the .filter file
self.filter_offsets = None
self.passing_wells = None
def get_seqs(self, cluster_indices, start=0, end=None):
"""Collects the sequences specified by indices as a hash of pairs of seq+flag. Ie.
result = { idx1: ( 'ATCG...', True ), idx2: ('NNNG...', False), ... }
indices: An iterable that yields integers. The order is unimportant.
start: starting base.
end: ending base. Note this is as in Python's range(start,end), so
start=2 and end=10 will skip the first two bases and yield the
next 8.
"""
# To build the sequence we have to loop over all the .bcl.gz files for the selected tile
# in the cycle folders. These are all named C[num].1 where num is 1-308 (unpadded).
# The first proper base of the read will be in C1.1 as the read is set to start after the
# adapter, but there may still be an internal barcode (eg. for RAD) so high diversity
# is not assured.
# To understand the meaning of all the cycles and what reads they correspond to we do
# need to look at the run settings.
if end is None:
end = self.num_cycles
# We could use NumPy
# for more efficient storage but it seems overkill as the number of reads
# being extracted is comparatively small.
# This also ensures that all the indices are ints
seq_collector = { int(idx) : ['N'] * (end - start) for idx in cluster_indices }
flag_collector = { int(idx) : None for idx in cluster_indices }
sorted_keys = sorted(seq_collector.keys())
# Fail fast if a key is out of range
if sorted_keys[-1] >= self.num_clusters:
raise IndexError("Requested cluster %i is out of range. Highest on this tile is %i." %
(sorted_keys[-1], self.num_clusters-1) )
# And just to be sure, no key should be negative
if sorted_keys[0] < 0:
raise IndexError("Requested cluster %i is a negative number." % sorted_keys[0])
# Get the accept/reject flag from the .filter file
fo = self._get_filter_offsets()
for idx in sorted_keys:
flag_collector[idx] = (fo[idx] != -1)
# Now the actual basecalls
for cycle in range(start, end):
cycle_dir = os.path.join(self.data_dir, 'C%i.1' % (cycle + 1))
# Now are we looking at .bcl.gz files or NovaSeq .cbcl files??
cycle_file = os.path.join(cycle_dir, self.bcl_filename)
cbcl_file = os.path.join(cycle_dir, self.cbcl_filename)
try:
with gzip.open(cycle_file, 'rb') as bcl_fh:
self._get_seqs_from_bcl(bcl_fh, cycle - start, sorted_keys, seq_collector)
except FileNotFoundError:
# Try the cbcl file. If this fails allow the stack trace which will report both
# missing files.
# Note that this does result in opening the same CBCL file again and again
# for each tile, but each chunk is only unzipped once.
with open(cbcl_file, 'rb') as fh:
self._get_seqs_from_cbcl(fh, cycle - start, sorted_keys, seq_collector)
# Remap the arrays into strings
# return dict( idx : (nuc_string, flag) )
return { idx : ( ''.join(seq), flag_collector[idx] ) for idx, seq in seq_collector.items() }
def _get_filter_offsets(self):
""" Load the filter file, and convert it to a series of offsets. The actual
offsets are only used when reading excluded CBCL files but the -1 entries
will indicate the bad wells.
The file must exist as we opened it earlier when reading self.num_clusters
"""
# Lazy load
if self.filter_offsets:
return self.filter_offsets
with open(self.filter_file, 'rb') as filt_fh:
filt_header = filt_fh.read(12)
# We already saw this!
assert tuple(struct.unpack('<III', filt_header)) == (0, 3, self.num_clusters)
# Slurp the whole thing - file length should match the num_clusters value
# we already know.
filt_bytes = struct.unpack('<{}B'.format(self.num_clusters), filt_fh.read())
# Make the table of offsets
filt_offsets = [-1] * self.num_clusters
offset = 0
for n, flag in enumerate(filt_bytes):
if flag & 0b00000001:
filt_offsets[n] = offset
offset += 1
self.filter_offsets = filt_offsets
self.passing_wells = offset
return self.filter_offsets
def _get_seqs_from_cbcl(self, fh, cycle_idx, sorted_keys, seq_collector):
""" Reads from the fh to find the appropriate BCL block and then unpacks
it to extract the basecalls. Deals with excluded/unexcluded flag, requesting
the filter_offsets as necessary.
See cbcl_read.py for a more comprehensive version of CBCL reading code.
"""
# Assume that fh is positioned at the start and read the header...
# First 12 bytes are fixed fields.
header_bytes = fh.read(12)
h_version, h_size, h_basebits, h_qbits, h_bins = struct.unpack('<HIBBI', header_bytes)
assert h_version == 1
assert h_size > 32 #Should actually be 5681 for all the current CBCL files
assert h_basebits == 2
assert h_qbits == 2 #6 is valid but we don't support it!
assert h_bins == 4 #implied if h_qbits is 2
# We don't care about the quality binning info but we do need the tile count,
# even though we're pretty sure it will always be 352.
qbin_and_tc_bytes = fh.read((h_bins * 4 * 2) + 4)
tile_count, = struct.unpack('<I', qbin_and_tc_bytes[-4:])
# Now I can get all the file offsets which is what I really wanted.
# Plus the excluded_flag which is the final byte.
all_offset_bytes = fh.read( tile_count * 16 + 1 )
excluded_flag = bool(all_offset_bytes[-1])
t_bcl_offset = h_size
# A loop is necessary as I have to tot up the csize values to get the seek offset
tile_as_int = int(self.tile)
for t in range(tile_count):
t_number, t_clusters, t_usize, t_csize = struct.unpack('<IIII', all_offset_bytes[t*16:(t+1)*16])
if t_number == tile_as_int:
# Found it!
break
t_bcl_offset += t_csize
# We did find it, right?
assert t_number == tile_as_int
# Go to the start of the block of tile data and slurp it all (even if
# I only want 1 or 2 bases - seems pointless to try and optimise the
# bases < 10 case)
fh.seek(t_bcl_offset)
zipdata = gzip.GzipFile(fileobj=fh, mode='rb').read(t_usize)
if excluded_flag:
excluded_offsets = self._get_filter_offsets()
for welln in sorted_keys:
wellidx = welln
if excluded_flag:
wellidx = excluded_offsets[welln]
if wellidx == -1:
# Leave the read as an N
continue
if wellidx % 2:
# Take the high bits
base_byte = zipdata[wellidx//2] >> 4
else:
# Take the low bits
base_byte = zipdata[wellidx//2] & 0b00001111
# Finally it's the same as for old BCL.
if base_byte:
seq_collector[welln][cycle_idx] = ('A', 'C', 'G', 'T')[base_byte & 0b00000011]
def _get_seqs_from_bcl(self, fh, cycle_idx, sorted_keys, seq_collector):
""" Reads from the fh, which is presumably a gzip stream handle, and
adds the specified seqs to the seq_collector.
This is intended for internal use only.
And obviously it can only be called once per fh.
"""
bcl_header = fh.read(4)
# The BCL header should be a fixed length depending on the machine type.
# This assertion checks that it is at least consistent with the filter
# file for this tile.
assert struct.unpack('<I', bcl_header)[0] == self.num_clusters
# I envisaged a a cunning system where we would seek through the file,
# just reading the chunks we wanted. Turns out for more than, say,
# 10 reads, it's faster just to slurp the thing. For over 10000 it's
# considerably faster!
if len(sorted_keys) > 10:
slurped_file = fh.read()
for idx in sorted_keys:
base_byte = slurped_file[idx]
# base = 'N'
# qual = 0
if base_byte:
# The two lowest bits give us the base call
base = ('A', 'C', 'G', 'T')[base_byte & 0b00000011]
# And the high bits give us the quality, but we're not using
# it here, other than the above test which catches no-calls.
# qual = base_byte >> 2
#Collect the base
seq_collector[idx][cycle_idx] = base
else:
for idx in sorted_keys:
fh.seek(idx + 4)
# Is reading bytes 1 at a time slow? I'd imagine that internal
# cacheing negates any need for chunked reads at this level.
base_byte, = fh.read(1)
# Copy-paste-ahoy!
if base_byte:
base = ('A', 'C', 'G', 'T')[base_byte & 0b00000011]
seq_collector[idx][cycle_idx] = base