-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpf_vs_occupied.py
executable file
·237 lines (186 loc) · 9.39 KB
/
pf_vs_occupied.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python3
# This script reads InterOp for a run and generates GNUPlot
# instructions to make a plot of %Occupied vs. %Pass Filter,
# as requested by Matt.
# To make my life simpler, I'll use the Python bindings to read the InterOp
# files directly. (This is now a standard PyPi module - try
# 'python3 -m interop --test' to see if the install is good)
# The tutorial at https://github.com/Illumina/interop/blob/master/docs/src/Tutorial_01_Intro.ipynb
# is very pertinent.
import os, sys
from statistics import mean
from math import ceil
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import yaml
from interop import py_interop_run_metrics, py_interop_run
from interop.py_interop_metrics import index_out_of_bounds_exception
# This needs to be an integer- see comment below
DEF_MAX_DENSITY = 1000
def get_numbers_from_rundir(run_dir):
"""Returns a data structure with all the numbers I need
"""
res = dict()
# Depending on the type of flowcell we might have #s or %s
res['type'] = '?'
# The "Tile Metrics" and "Extended Tile Metrics" seem to be needed
metrics_handle = get_run_metrics_handle(run_dir)
tms = metrics_handle.tile_metric_set()
etms = metrics_handle.extended_tile_metric_set()
# I also want to get the FCID, which I may as well get directly from the InterOp
# Account for the 00000- prefix on MiSeq flowcells.
res['fcid'] = metrics_handle.run_info().flowcell_id().split('-')[-1]
# Now go lane by lane. This gets a tuple eg. (1, 2)
lanes = list(tms.lanes())
res['lanes'] = ['Lane {}'.format(alane) for alane in lanes]
for alane, lanelabel in zip(lanes, res['lanes']):
# Add to the overall dict
laneres = res[lanelabel] = list()
# Now go tile by tile and get the raw numbers
tiles_on_lane = tms.tile_numbers_for_lane(alane)
for atile in tiles_on_lane:
cluster_count = int(tms.get_metric(alane, atile).cluster_count())
clusters_pf = int(tms.get_metric(alane, atile).cluster_count_pf())
try:
# If this works we're on a patterned flowcell
clusters_occup = int(etms.get_metric(alane, atile).cluster_count_occupied())
pct_occup = etms.get_metric(alane, atile).percent_occupied()
pct_pf = tms.get_metric(alane, atile).percent_pf()
laneres.append( dict( tile = atile,
count = cluster_count,
occ = clusters_occup,
pf = clusters_pf,
pct_occup = pct_occup,
pct_pf = pct_pf) )
assert res['type'] != '#'
res['type'] = '%'
except index_out_of_bounds_exception:
# OK so we're on a MiSeq flowcell
cluster_density_k = tms.get_metric(alane, atile).cluster_density_k()
cluster_density_pf_k = tms.get_metric(alane, atile).cluster_density_pf_k()
laneres.append( dict( tile = atile,
count = cluster_count,
pf = clusters_pf,
density_k = cluster_density_k,
pf_density_k = cluster_density_pf_k ) )
assert res['type'] != '%'
res['type'] = '#'
# OK can we do something cunning with the cluster density to get some reasonable axis sizes
# for the MiSeq plots? Hmmm.
# Just eyeballing the numbers, I think having a plot where the max density is 1000k or maybe 900k
# makes sense.
return res
def get_run_metrics_handle(run_dir):
""" Load the goodies from the .bin files in the InterOp directory.
This black magic is copy-pasted straight out of the tutorial linked above!
"""
#print("Examining: {}".format(run_dir))
valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
for v2l in (py_interop_run.Tile, py_interop_run.ExtendedTile):
valid_to_load[v2l] = 1
run_metrics = py_interop_run_metrics.run_metrics()
run_metrics.read(run_dir, valid_to_load)
return run_metrics
def format_gnuplot(tds):
"""Generates a series of lines to plot Tile Data Structure in
GNUPlot.
"""
if tds['type'] == '%':
fname = "Occupancy-By-Tile"
xlabel = "% Occupied"
ylabel = "% Pass Filter"
xrange = yrange = 100.0
plot = ["pct_occup", "pct_pf"]
else:
fname = "Cluster-Density-By-Tile"
xlabel = "Cluster Density (kclusters/mm^2)"
ylabel = "PF Density (kclusters/mm^2)"
xrange = yrange = tds['density_max']
plot = ["density_k", "pf_density_k"]
gp_lines = [ "# generated by {}".format(' '.join(sys.argv)),
"set terminal pngcairo enhanced font 'sans,10'",
"set output '{}.png'".format(fname),
"set title '{}'".format(tds['fcid']),
"set xrange [0 : {} ]".format(xrange),
"set yrange [0 : {} ]".format(yrange),
"set xlabel '{}'".format(xlabel),
"set ylabel '{}'".format(ylabel),
"set boxwidth 0.3",
"set key outside right top",
"set grid",
"set datafile separator ','",
]
# Add a line to show x=y, because no point should ever be above this line
gp_lines.extend(["set style arrow 2 nohead linewidth 1.0 dashtype 3",
"set arrow from 0,0 rto {},{} as 2 lc 'grey'".format(xrange, yrange)])
# Add lines (using headless arrows) to show the means. And make them dotted (supported by pngcairo)
# Taking means of percentages is OK here as all the tiles are the same size.
gp_lines.extend(["set style arrow 1 nohead linewidth 1.2 dashtype 2"])
for n, l in enumerate(tds['lanes']):
mean_x = mean([atile[plot[0]] for atile in tds[l]])
mean_y = mean([atile[plot[1]] for atile in tds[l]])
gp_lines.append("set arrow from {},0 rto 0,{} as 1 lc {} front".format(mean_x, yrange, n+1))
gp_lines.append("set arrow from 0,{} rto {},0 as 1 lc {} front".format(mean_y, xrange, n+1))
# Now plot the actual datas. Start by declaring all tiles to plot on one line.
gp_lines.extend(["set style fill transparent solid 0.1 border",
"set style circle radius graph 0.01 noclip"])
plot_cmds = [ "'-' title '{}' with circles".format(l)
for l in tds['lanes'] ]
gp_lines.append("plot" + (" ,".join(plot_cmds)))
# Now the points, one tile per line, each lane terminated with an 'e'
for l in tds['lanes']:
laneres = tds[l]
for atile in laneres:
gp_lines.append("{},{}".format(*[atile[p] for p in plot]))
gp_lines.append('e')
return gp_lines
def get_density_range(tds, min_range, buffer=100):
"""Given a set of density numbers, work out the right limit of the plot, ie. the
max xrange/yrange. For MiSeq tiles the density will normally be <1000 but
for clustered flowcells (if you force plot a JiSeq with no occupancy info) then
it could be more like 2500.
"""
buffered_range = min_range
for alane in tds['lanes']:
for atile in tds[alane]:
dk = atile['density_k']
# See if dk is going to fit on a plot bounded by current min_range.
# The idea is that if res is 1000 and dk is 1020 we extend res to 1200
# because we round up to the nearest 100 then add 100
buffered_range = max(buffered_range, (ceil(dk/buffer) * buffer) + buffer)
return float(buffered_range)
def main(args):
if args.from_yaml:
with open(args.from_yaml) as yfh:
res = yaml.safe_load(yfh)
else:
res = get_numbers_from_rundir(args.run_dir)
# The density max range may be saved within the YAML but the command line option
# always takes precedence. Note we can tell if --density_max was specified on the
# command line because the arg will always be a float, whereas the default is an
# int. Yes this is a hack.
if res['type'] == '#':
if type(args.density_max) is float:
res['density_max'] = args.density_max
elif not res.get('density_max'):
res['density_max'] = get_density_range(res, args.density_max)
if args.dump_yaml:
print(yaml.safe_dump(res), end='')
else:
print(*format_gnuplot(res), sep='\n')
def parse_args(*argv):
"""Usual ArgumentParser
"""
desc = "Get tile density metrics from InterOp and format for GNUPlot"
parser = ArgumentParser( description = desc,
formatter_class = ArgumentDefaultsHelpFormatter )
parser.add_argument("run_dir", type=str, nargs='?', help="Run to be examined.")
parser.add_argument("--dump_yaml", action="store_true", help="Output YAML not GNUPlot commands.")
parser.add_argument("--from_yaml", help="Load from YAML not from run dir.")
parser.add_argument("--density_max", type=float, help="Override axis limits when plotting density.",
default=DEF_MAX_DENSITY)
args = parser.parse_args(*argv)
if args.from_yaml and args.run_dir:
sys.exit("Inspecting a run directly and loading from YAML are mutually exclusive.")
return args
if __name__ == '__main__':
main(parse_args())