#!/bin/bash
# vim: ft=python
## This workflow specifically runs the well dups scanner.
## This triggers before demultiplexing but contributes to the
## final QC report so it makes sense for it to be separate.
## For non-patterned flowcells this will just be a no-op.
# Contents >>>
# + Embedded BASH script to bootstrap the workflow
# + Initialisation and configuration
# + Helper functions
# + The rules specific to this workflow
# + More generic rules
"""true" ### Begin shell script part
set -euo pipefail
source "`dirname $0`"/shell_helper_functions.sh
export TOOLBOX="$(find_toolbox)"
snakerun_drmaa "$0" "$@"
"exit""" ### End of shell script part
#!/usr/bin/env snakemake
import os   # used for os.environ and os.path below
import sys  # used for sys.stderr below
import yaml
import re
import xml.etree.ElementTree as ET
from itertools import chain
from snakemake.utils import format
# See notes in Snakefile.qc regarding the toolbox.
# Here we need wd_get_cached_targets and wd_count_well_duplicates to be available.
TOOLBOX = 'env PATH="{}:$PATH"'.format(os.environ['TOOLBOX'])
def glob():
"""Regular glob() is useful but it can be improved like so.
"""
from glob import glob
return lambda p: sorted( (f.rstrip('/') for f in glob(os.path.expanduser(p))) )
glob = glob()
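# A quick sketch of what the wrapper adds (the paths here are hypothetical):
#   glob('~/seqdata/18*') -> ['/home/user/seqdata/180101_A', '/home/user/seqdata/180202_B']
# i.e. ~ is expanded, any trailing '/' is stripped, and the result is sorted.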
# Caller can set this, or else there needs to be a symlink, as with Snakefile.demux
RUNDIR = os.path.realpath(config.get('rundir', './seqdata'))
def get_wd_settings(rundir, run_info_root):
"""Settings for well duplicates scanning.
"""
wd_settings = dict( TARGETS_TO_SAMPLE = 2500,
READ_LENGTH = 50,
START_POS = 20,
LEVELS_TO_SCAN = 5,
REPORT_VERBOSE = True )
# We need to examine the run info. OK to do this on every cluster job and
# fail immediately if missing.
# We're now relying on LANES_IN_RUN being correct for the barcode check even
# if there is no WD check to be run.
wd_settings.update(dict( LANES_IN_RUN = [1],
LAST_LANE = '0',
END_POS = 0,
TILE_MATCH = dict() ))
if not run_info_root:
# Just return the defaults
return wd_settings
wd_settings['LAST_TILE'] = wd_settings['FIRST_TILE'] = '0000'
try:
wd_settings['LAST_LANE'], wd_settings['LAST_TILE'] = \
max(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_')
_, wd_settings['FIRST_TILE'] = \
min(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_')
except ValueError:
# If there are no tiles we can reasonably assume we're on a non-patterned
# flowcell (MiSeq or 2500). In this case, no well dups scan.
# Of course, an update to the instrument software may change the XML...
return wd_settings
# The number of lanes to sample is now variable since the arrival of the
# NovaSeq, so get it from RunInfo.xml...
wd_settings['LANES_IN_RUN'] = list(range(1, int(wd_settings['LAST_LANE']) + 1))
# Find the first read that has >READ_LENGTH cycles. Normally read1 will do.
# If not using read1, always set START_POS to the first cycle of that read.
# Also, scanning later reads will fail if triggered early while BCL files are
# still missing, but that's OK as the pipeline will simply retry once the run
# is complete.
all_read_lens = [ int(r.get('NumCycles')) for r in
sorted( run_info_root.findall("Run/Reads/Read"),
key = lambda r: int(r.get("Number")) ) ]
if max(all_read_lens) > wd_settings['READ_LENGTH']:
read_to_sample = [ n for n, l in enumerate(all_read_lens)
if l > wd_settings['READ_LENGTH'] ][0]
# Is it OK to start at START_POS or do we go from 0?
if ( read_to_sample == 0 and
all_read_lens[read_to_sample] > (wd_settings['READ_LENGTH'] +
wd_settings['START_POS'])
):
start_pos = wd_settings['START_POS']
else:
start_pos = 0
# Now we have the relative START_POS, calculate the absolute START_POS and
# overwrite the initial setting
wd_settings['START_POS'] = start_pos + sum( all_read_lens[:read_to_sample] )
else:
# We can't process this run
print(f"Cannot process run with short read lengths {all_read_lens}.", file=sys.stderr)
return wd_settings
wd_settings['END_POS'] = wd_settings['READ_LENGTH'] + wd_settings['START_POS']
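# Worked example, assuming a hypothetical 2x151bp run with two 8bp index reads
# (read lengths [151, 8, 8, 151]): read 0 is the first read longer than 50
# cycles, and 151 > 50 + 20, so read_to_sample = 0, START_POS stays at 20 and
# END_POS = 70. If read 0 were too short, START_POS would be reset to the first
# absolute cycle of the first sufficiently long read.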
# If there are 4 swaths per side (NovaSeq) then only do the even tiles
if wd_settings['LAST_TILE'] >= '2400':
wd_settings['TILE_MATCH'] = dict( T='1..[02468]', B='2..[02468]' )
else:
wd_settings['TILE_MATCH'] = dict( T='1...', B='2...' )
# On the new NovaSeq SP flowcells they seem to be disabling one surface, so:
if wd_settings['LAST_TILE'][0] == '1':
del wd_settings['TILE_MATCH']['B']
elif wd_settings['FIRST_TILE'][0] == '2':
del wd_settings['TILE_MATCH']['T']
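# These patterns rely on the standard Illumina tile numbering, where the first
# digit of the four-digit tile code is the surface (1=top, 2=bottom), the
# second is the swath and the last two are the tile number, so '1..[02468]'
# matches even-numbered tiles on the top surface. The string comparison against
# '2400' above works because the tile codes are fixed-width.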
# Unfortunately there's more - for slimmed-down runs we get a line in
# pipeline_settings.ini like this, indicating that most tiles are missing:
# --tiles: s_[$LANE]_1101
# I don't want to simply ignore tiles with missing BCL files, as those should
# rightly trigger a genuine validation failure.
# Rather than re-code the logic for what goes in pipeline_settings.ini I'm just going
# to scan the file and interpret the line.
# This is dodgy in the general case but OK for test runs!
try:
with open(rundir + "/pipeline_settings.ini") as fh:
for l in fh:
mo = re.match(r'''\s*--tiles:.*]_(\d{4})['"]?$''', l)
if mo:
tile = mo.group(1)
wd_settings['TILE_MATCH'] = ( dict( T=tile ) if tile[0] == '1'
else dict( B=tile ) )
except Exception:
# No matter - if the file is missing or unreadable, keep the TILE_MATCH from above.
pass
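# For example, a pipeline_settings.ini containing `--tiles: s_[$LANE]_1101`
# yields tile = '1101', and hence TILE_MATCH = dict(T='1101').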
return wd_settings
try:
run_info_root = ET.parse(RUNDIR + "/RunInfo.xml").getroot()
RUN = run_info_root.find('./Run').attrib['Id']
except FileNotFoundError:
# Take the run name from the CWD
run_info_root = None
RUN = os.path.basename(os.path.realpath(os.getcwd()))
# Set the returned dict items as global variables
globals().update(get_wd_settings(RUNDIR, run_info_root))
# === Driver rules ===
localrules: wd_main, bc_main
if not TILE_MATCH:
rule wd_main:
# No-op rule
shell: "# No tiles to be sampled. Assuming this is a MiSeq run, or read1 is too short."
else:
rule wd_main:
input:
summary = format('QC/welldups/{TARGETS_TO_SAMPLE}summary.yml')
# For the early demultiplex barcode check. The idea is that if this file is non-empty then the
# driver will trigger an alert message incorporating the text of the file; if the
# file is missing it will send a warning instead.
rule bc_main:
input:
summary = 'QC/bc_check/bc_check.msg'
# === Rules to invoke the duplicate counter ===
localrules: make_wd_summary, make_bc_summary, setup_bc_check
rule make_wd_summary:
output: 'QC/welldups/{targets}summary.yml'
input:
expand( "QC/welldups/{{targets}}targets_lane{lane}{side}.txt", lane = LANES_IN_RUN,
side = TILE_MATCH.keys() )
run:
from collections import deque
from time import sleep
sleep(2) # Avoids annoying re-running due to clock skew
# TODO - I really should break this into a function and make a unit test!
# Result will be of the form { lane: { side: { metric: 12.34 } } }
res = dict()
for f in input:
# Since file names end in e.g. 2T.txt, populate res[lane][side] from the name
lane = res.setdefault(f[-6], dict())
side = lane.setdefault(f[-5], dict())
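# e.g. f = 'QC/welldups/2500targets_lane1T.txt' gives f[-6] == '1' (the lane)
# and f[-5] == 'T' (the side).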
# Look at the last 3 lines of the file
with open(f) as fh:
for k, v in [ l.rstrip('\n%').split(':') for l in deque(fh, 3) ]:
if "Overall" in k:
side["raw"] = float(v)
elif "v1" in k:
side["v1"] = float(v)
elif "v2" in k:
side["v2"] = float(v)
# Now average things up.
for lane in res.values():
lane['mean'] = { metric: sum(side[metric] for side in lane.values()) / len(lane)
for metric in list(lane.values())[0].keys() }
# And save it out
with open(output[0], 'w') as yfh:
yaml.safe_dump(res, yfh)
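# The resulting YAML is, schematically (the numbers here are made up):
#   '1':
#     T: {raw: 12.34, v1: 10.1, v2: 9.8}
#     B: {raw: 12.56, v1: 10.3, v2: 9.9}
#     mean: {raw: 12.45, v1: 10.2, v2: 9.85}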
rule prep_wd_indices:
output: "QC/welldups/{targets}clusters.list"
input:
slocs = format("{RUNDIR}/Data/Intensities/s.locs")
shell:
"{TOOLBOX} wd_get_cached_targets {input.slocs} {TARGETS_TO_SAMPLE} {output}"
rule count_well_dupl:
output: "QC/welldups/{targets}targets_lane{lane}{side,.}.txt"
input:
targfile = "QC/welldups/{targets}clusters.list"
params:
verbose = '' if REPORT_VERBOSE else '-S',
tiles = lambda wc: TILE_MATCH[wc.side],
cycles = f'{START_POS}-{END_POS}'
shell:
r"""{TOOLBOX} wd_count_well_duplicates -f {input.targfile} -n {wildcards.targets} -s {LAST_TILE} \
-r {RUNDIR} -i {wildcards.lane} -l {LEVELS_TO_SCAN} --cycles {params.cycles:q} \
-t {params.tiles:q} {params.verbose} > {output}"""
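# With the default settings, the command for lane 1 side T expands to roughly
# (the run-specific values and paths here are hypothetical):
#   wd_count_well_duplicates -f QC/welldups/2500clusters.list -n 2500 -s 2478 \
#       -r ./seqdata -i 1 -l 5 --cycles 20-70 -t '1..[02468]' > QC/welldups/2500targets_lane1T.txt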
# === Rules to pre-demultiplex one tile to check the barcode balance ===
# This copies all the Stats.json for MultiQC to find, then calls assess_bc_check.py
# to make the summary over all Stats.json files. It also adds QC/{rep}/run_info.{r}.2.yml
# for each report, which provides some metadata for the top of the report.
# Note this rule has no wildcards, just a load of expansions.
rule make_bc_summary:
output:
summary = "QC/bc_check/bc_check.msg",
json = expand("QC/lane{l}/Stats.json", l=LANES_IN_RUN),
unass = expand("QC/lane{l}/{run}_{l}_unassigned_table.txt", l=LANES_IN_RUN, run=[RUN]),
ri2o = expand("QC/overview/run_info.{r}.2.yml", r=[RUN]),
ri2l = expand("QC/lane{l}/run_info.{r}.2.yml", l=LANES_IN_RUN, r=[RUN]),
input:
json = expand("QC/bc_check/lane{l}/Stats/Stats.json", l=LANES_IN_RUN),
unass = expand("QC/bc_check/lane{l}/unassigned_table.txt", l=LANES_IN_RUN),
opts = expand("QC/bc_check/lane{l}/bcl2fastq.opts", l=LANES_IN_RUN),
log = expand("QC/bc_check/lane{l}/bcl2fastq.log", l=LANES_IN_RUN),
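# (expand() just fills in the lists: with LANES_IN_RUN = [1, 2] the json input
# would be ['QC/bc_check/lane1/Stats/Stats.json', 'QC/bc_check/lane2/Stats/Stats.json'].)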
run:
for ij, oj in chain( zip(input.json, output.json),
zip(input.unass, output.unass) ):
shell("cp -v {ij} {oj}")
shell("assess_bc_check.py {input.json} > {output.summary}")
shell("summarize_post_bcl2fastq.py --subdir QC/bc_check > {output.ri2o}")
for n, l in enumerate(LANES_IN_RUN):
out_ri2l = output.ri2l[n]
shell("summarize_post_bcl2fastq.py --subdir QC/bc_check --lane {l} > {out_ri2l}")
# Read 1 demux uses a shadow directory as we don't want to keep the reports or the FASTQ files at all.
# Shallow shadow is tricky but here we can safely assume the QC dir already exists and bc_check_tmp
# does not.
rule bcl2fastq_bc_check:
output:
json = "QC/bc_check/lane{l}/Stats/Stats.json",
opts = "QC/bc_check/lane{l}/bcl2fastq.opts",
unass = "QC/bc_check/lane{l}/unassigned_table.txt"
log:
version = "QC/bc_check/lane{l}/bcl2fastq.version",
log = "QC/bc_check/lane{l}/bcl2fastq.log"
input:
ssheet = "QC/bc_check/lane{l}/SampleSheet.filtered.csv"
threads: 2
shadow: "shallow"
shell:
# Run do_demultiplex.sh for this lane. The silly stuff with f=0 is just to allow us to
# preserve the log even if the demux fails.
"""export PROCESSING_THREADS={threads}
mkdir bc_check_tmp
lanedir=bc_check_tmp/QC/bc_check/lane{wildcards.l}
mkdir -p "$lanedir"
f=0
{TOOLBOX} do_demultiplex.sh {RUNDIR} "$lanedir" {input.ssheet} {wildcards.l} || f=1
for outfile in {log} {output} ; do
mv bc_check_tmp/$outfile $outfile || true
done
unassigned_to_table.py {output.json} > {output.unass}
( exit $f )
"""
rule setup_bc_check:
output:
ssheet = "QC/bc_check/lane{l}/SampleSheet.filtered.csv"
input:
ssheet = RUNDIR + "/SampleSheet.csv"
shell:
# If the override file is missing or empty we'll apply auto revcomp logic.
# I considered using TILE_MATCH to configure which single tile to sample but this is not
# a good idea - just leave it to bcl2fastq_setup.py to work out a sensible option.
"""revcomp=$(cat {RUNDIR}/pipeline/index_revcomp.lane{wildcards.l}.OVERRIDE 2>/dev/null || true)
bcl2fastq_setup.py --bc_check --lane {wildcards.l} --revcomp "${{revcomp:-auto}}" {RUNDIR} > {output}
"""