grab_bcl2fastq_stats.py
#!/usr/bin/env python3
from __future__ import division, print_function, absolute_import

# FIXME - this should be replaced by stats_json_aggregator.py which is doing the same job but
# in a better way.
# Eg. Try this:
# $ cd /lustre/fastqdata/<anyrun>
# $ grab_bcl2fastq_stats.py demultiplexing/lane1 180119_M01145_0008_000000000-B8RR9 1
# $ stats_json_aggregator.py demultiplexing/lane1/Stats/Stats.json

import os, sys
from glob import glob
import yaml, yamlloader
from collections import OrderedDict

from illuminatus.Formatters import rat

#I guess I want pstdev since I'm calculating variance over the whole run?
#Nope, to be compatible with Illumina we want regular sample stdev
#Note this requires Py3.4 or else the statistics package to be installed via Pip
from statistics import stdev, pstdev, mean
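
# Note on rat(): it comes from illuminatus.Formatters and is not documented here.
# Going by how it is called below, it is assumed to take (numerator, denominator)
# and return the ratio in a printable form. Treat that as an assumption, not a fact.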

def slurp(filename):
    """Grab a file as a list of strings.
    """
    res = []
    with open(filename) as fh:
        for line in fh:
            res.append(line.rstrip("\n"))
    return res

# Here is some logic which is specific to our drive layout.
def find_fastq_stats(basedir, lane):
    #There might be two stats files. One for Read1 and one for the complete read.
    # demux_stats, = ( f for f in
    #                  glob('/ifs/runqc/{}/*/Stats/DemuxSummaryF1L{}.txt'.format(runid, lane))
    #                  if ('_Read1/' not in f) )
    fastq_stats = [ f for f in
                    glob('{}/Stats/FastqSummaryF1L{}.txt'.format(basedir, lane))
                    if ('_Read1/' not in f) ]

    if len(fastq_stats) == 0:
        return None
    else:
        #This can happen if there were multiple index lengths in a single lane.
        #There is no good solution because both stats files will show a high level of
        #unassigned barcodes.
        #If you want, rename one of the .txt files and then the other will be used.
        assert len(fastq_stats) == 1, "multiple fastq_stats found: %s" % repr(fastq_stats)
        return fastq_stats[0]
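
# For illustration only: with the arguments from the usage example at the top of this
# file (basedir='demultiplexing/lane1', lane='1') the glob above would look for
# 'demultiplexing/lane1/Stats/FastqSummaryF1L1.txt'. The path is hypothetical and just
# shows how the pattern expands.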

def get_data_container():
    #Now, what do we actually want to save out?
    # total assigned reads raw
    # total assigned reads pf
    # total unassigned reads raw
    # total unassigned reads pf
    # assert $1 == $2
    # ($2 + $4) / ($1 + $3) is the pass rate. But can we do better?
    # total reads per sample
    # how to calculate the barcode balance? Standard deviation of reads per sample??
    # Apparently we want the coefficient of variation, which is the std deviation / mean,
    # or so Donald says, so blame him if it's wrong.
    return OrderedDict.fromkeys([
        "Number of Indexes",
        "Assigned Reads Raw",
        "Unassigned Reads Raw",
        "Assigned Reads PF",
        "Unassigned Reads PF",
        "Fraction PF",
        "Fraction Assigned",
        "Fraction Assigned Raw",
        "Mean Reads Per Sample",
        "Barcode Balance",
    ])
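
# For reference, gather_fastq_stats() below parses a whitespace-delimited
# FastqSummaryF1L<lane>.txt with the header asserted in the parser, something like
# (the numbers here are made up, not taken from any real run):
#
#   SampleNumber Tile NumberOfReadsRaw NumberOfReadsPF
#   0            1101 123456           110000
#   1            1101 654321           600000
#
# where SampleNumber 0 collects the unassigned (undetermined) reads.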

def gather_fastq_stats(fastq_stats_file):

    if fastq_stats_file is None:
        return None

    dc = get_data_container()
    dc['Assigned Reads Raw'] = 0
    dc['Assigned Reads PF'] = 0
    dc['Unassigned Reads Raw'] = 0
    dc['Unassigned Reads PF'] = 0

    # Don't use a defaultdict as we can't dump these as YAML
    reads_per_sample = dict()

    fastq_stats_header = 'SampleNumber Tile NumberOfReadsRaw NumberOfReadsPF'.split()

    for num, line in enumerate(slurp(fastq_stats_file)):
        if num == 0:
            assert line.split() == fastq_stats_header
        else:
            sample_num, tile, rr, rpf = line.split()
            if sample_num == '0':
                dc['Unassigned Reads Raw'] += int(rr)
                dc['Unassigned Reads PF'] += int(rpf)
            else:
                dc['Assigned Reads Raw'] += int(rr)
                dc['Assigned Reads PF'] += int(rpf)

                reads_per_sample.setdefault(sample_num, 0)
                reads_per_sample[sample_num] += int(rpf)

    total_reads_raw = dc['Assigned Reads Raw'] + dc['Unassigned Reads Raw']

    #If it didn't pass the filter, it wasn't assigned, right?
    #Apparently this only applies to 4000 and X runs!
    if dc['Assigned Reads Raw'] != dc['Assigned Reads PF']:
        dc['Fraction Assigned Raw'] = rat((dc['Assigned Reads Raw']), total_reads_raw)

    dc['Fraction PF'] = rat((dc['Assigned Reads PF'] + dc['Unassigned Reads PF']), total_reads_raw)
    dc['Fraction Assigned'] = rat((dc['Assigned Reads PF']), total_reads_raw)

    #Barcode balance... manual calculation of pop std dev
    #Mean Reads Per Sample = Assigned Reads PF / len(reads_per_sample)
    #mean_sq_reads_per_sample = sum( x**2 for x in reads_per_sample.values() ) / len(reads_per_sample)
    #stddev_reads_per_sample = (mean_sq_reads_per_sample - (Mean Reads Per Sample ** 2)) ** 0.5
    #Barcode Balance = stddev_reads_per_sample / Mean Reads Per Sample
    #However, it seems that Illumina use stdev and not pstdev, so I guess we should do the same?
    #Annoyingly I've already done a load of QC with the old values and I can't re-generate
    #them as the logs are gone.
    if reads_per_sample:
        dc['Mean Reads Per Sample'] = mean(reads_per_sample.values())
    else:
        #This shouldn't happen if the sample sheet is valid!
        dc['Mean Reads Per Sample'] = 0

    if len(reads_per_sample) <= 1:
        #With only one barcode in the lane the calculation is meaningless
        dc['Barcode Balance'] = 'NA'
    else:
        dc['Barcode Balance'] = rat(stdev(reads_per_sample.values()), dc['Mean Reads Per Sample'])

    dc['Number of Indexes'] = len(reads_per_sample)

    return dc
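
# Worked example of the derived metrics above, with made-up numbers (not from any
# real run): if Assigned Raw=1000, Assigned PF=950, Unassigned Raw=100, Unassigned PF=90
# then:
#   Fraction PF       = (950 + 90) / (1000 + 100) = 0.945...
#   Fraction Assigned =  950       / (1000 + 100) = 0.863...
# and for three samples with 400/300/250 PF reads each:
#   Mean Reads Per Sample = 316.67 ; sample stdev = 76.38
#   Barcode Balance       = 76.38 / 316.67 = 0.24  (ie. the coefficient of variation)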

def print_fastq_stats(runid, lane, dc, fh=sys.stdout):
    """Print out the data container to a file. As YAML, of course.
    """
    od = OrderedDict()
    od['Run'] = runid
    od['Lane'] = lane

    #dc can be None if file not found. Originally missing data would cause the
    #script to error, but I don't want Rapid QC to crash out just for this.
    if dc:
        od.update(dc)

    print(yaml.dump(od,
                    Dumper = yamlloader.ordereddict.CSafeDumper,
                    default_flow_style = False),
          file = fh,
          end = '')
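
# The output is plain block-style YAML, one mapping per lane. Roughly like this, with
# illustrative values only:
#
#   Run: 180119_M01145_0008_000000000-B8RR9
#   Lane: '1'
#   Number of Indexes: 3
#   Assigned Reads Raw: 1000
#   ...
#
# If the stats file was missing, only Run and Lane are reported.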

def dump_lane(basedir, runid, lane):
    #Find the stats file, extract the data and print it out.
    print_fastq_stats(runid, lane, gather_fastq_stats(find_fastq_stats(basedir, lane)))

def main():
    """We need to be given a base dir, run id and lane to work on.
    """
    if not len(sys.argv) == 4:
        exit("Please specify a base dir, run and lane to examine")
    else:
        dump_lane(*sys.argv[1:])

if __name__ == '__main__':
    main()