-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile.demux
executable file
·133 lines (112 loc) · 4.85 KB
/
Snakefile.demux
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# vim: ft=python
## This workflow should be run in a fastqdata/demultiplexing directory to
## drive bcl2fastq. The reason for running here is to get the
## .snakemake and slurm_output dirs in a sensible place.
# Contents >>>
# + Embedded BASH script to bootstrap the workflow
# + Initialisation and configuration
# + Helper functions
# + The rules specific to this workflow
# + More generic rules
# The span between the "Begin" and "End" markers below is read by two
# interpreters. Bash (per the shebang) parses `"""true"` as the command `true`,
# runs the lines that follow, and stops at `"exit"""`. Python/Snakemake parses
# the very same span as one triple-quoted string and carries on after it.
# Do not add or remove quote characters inside this span.
# find_toolbox and snakerun_drmaa are presumably provided by
# shell_helper_functions.sh - TODO confirm.
"""true" ### Begin shell script part
set -euo pipefail
source "`dirname $0`"/shell_helper_functions.sh
# $PATH doesn't get passed to worker nodes on SLURM but I only need it
# for local rules to see programs in this directory. Toolbox path is a separate
# thing and will be added explicitly.
export TOOLBOX="$(find_toolbox)"
export PATH="${PATH}:$(dirname "$0")"
# Caller needs to supply --config lanes='? ? ?' rundir=?
# Apply -F to run all steps every time. Otherwise I could maybe depend on the
# SampleSheet.csv as an input?
snakerun_drmaa "$0" --keep-going -F "$@"
"exit""" ### End of shell script part
#!/usr/bin/env snakemake
import yaml
# do_demultiplex.sh needs to find bcl2fastq in the toolbox.
# TOOLBOX is a shell command prefix of the form  env PATH="<toolbox>:$PATH"
# built from the TOOLBOX environment variable (exported by the bootstrap
# script at the top of this file). It is placed in front of do_demultiplex.sh
# in the bcl2fastq rule. A missing TOOLBOX variable raises KeyError here.
TOOLBOX = 'env PATH="{}:$PATH"'.format(os.environ['TOOLBOX'])
def glob():
    """Build a friendlier replacement for the standard glob().

    The returned callable expands a leading '~' in the pattern, removes any
    trailing '/' from each match, and returns the matches as a sorted list.
    The stdlib function is imported locally so that the module-level name
    'glob' can be rebound to the wrapper below.
    """
    from glob import glob as _stdlib_glob

    def _sorted_glob(p):
        expanded = os.path.expanduser(p)
        return sorted(match.rstrip('/') for match in _stdlib_glob(expanded))

    return _sorted_glob

# Replace the factory with the wrapper it builds.
glob = glob()
# Config - must be supplied
# NOTE(review): the usage comment in the bootstrap section shows lanes as a
# space-separated string ('? ? ?'), yet LANES is handed straight to expand()
# in the postprocess rule - confirm it actually arrives here as a list.
LANES = config['lanes']
# rundir falls back to ./seqdata when not supplied; realpath resolves the symlink.
RUNDIR = os.path.realpath(config.get('rundir', './seqdata')) #symlink now made by driver.sh
def get_sequential_file(pattern):
    """Create and return the next free file name matching a pattern like foo???.log.

    The first run of '?' characters in the pattern stands for a zero-padded
    sequence number. Existing files matching the pattern are inspected to find
    the highest number already in use, then candidate names are tried from
    there upwards until one can be created exclusively. The new file is left
    empty.

    I've coded this to avoid race conditions but it's not a problem as
    Snakemake locks the working dir anyway.

    Args:
        pattern: File name (optionally with a directory part) containing a run
            of '?' placeholders, e.g. 'renames.log.???'.

    Returns:
        The name of the newly created, empty file.

    Raises:
        ValueError: If the pattern contains no '?' placeholder.
        OSError: If the file cannot be created for any reason other than the
            name already being taken (for example a missing directory).
    """
    mo = re.match(r'([^?]*)([?]+)([^?]*)', pattern)
    if mo is None:
        raise ValueError(
            "pattern {!r} must contain at least one '?' placeholder".format(pattern))
    pre, c, post = mo.groups()

    # Find the highest sequence number already present. Slice the prefix and
    # suffix off by length: str.lstrip()/rstrip() remove *sets of characters*,
    # so they would also eat leading digits of the number whenever the prefix
    # itself contains digits (e.g. 'run1.100' -> '00').
    seqnum = 0
    for existing in glob(pattern):
        if not (existing.startswith(pre) and existing.endswith(post)):
            continue
        middle = existing[len(pre):len(existing) - len(post)]
        if middle.isdecimal():
            seqnum = max(seqnum, int(middle))

    # Mode 'x' fails if the file exists, so creation doubles as the
    # availability check. Any other OSError propagates unchanged.
    while True:
        candidate = '{}{:0{len}d}{}'.format(pre, seqnum, post, len=len(c))
        try:
            with open(candidate, mode='x'):
                return candidate
        except FileExistsError:
            seqnum += 1
# bcl2fastq is run once per lane to produce laneN/Stats/DemuxSummaryF1LN.txt,
# laneN/Stats/DemultiplexingStats.xml and laneN/Stats/Stats.json.
# The postprocess rule is run just once to gather and rename all the fastq.gz files;
# its output is renames.log.XXX, where XXX starts at 000 (symlinked as renames.log).
# Setup and postprocess rules can run locally
localrules: setup, postprocess
# I accidentally put '../projects_ready.txt' as an output of this rule but then
# the file gets clobbered on a REDO, so don't do that again! This file should
# be modified by the post processor and never removed.
rule postprocess:
    # Runs once, after every lane has been demultiplexed.
    input:
        summary = expand("lane{l}/Stats/DemuxSummaryF1L{l}.txt", l=LANES),
        stats   = expand("lane{l}/Stats/Stats.json", l=LANES),
    output:
        'renames.log',
        'projects_ready.txt.bak'
    run:
        # Reserve the next numbered log file and point renames.log at it.
        log_file = get_sequential_file('renames.log.???')
        shell("ln -s {log_file} renames.log")

        # The post-processor is invoked on the parent directory.
        shell("BCL2FASTQPostprocessor.py ..")

        # ../projects_ready.txt must exist and be non-empty by now; keep a
        # copy here as the declared output rather than declaring the file
        # itself as an output.
        try:
            shell("test -s ../projects_ready.txt")
            shell("cp ../projects_ready.txt projects_ready.txt.bak")
        except subprocess.CalledProcessError:
            snakemake.logging.logger.error("No projects listed in projects_ready.txt after calling BCL2FASTQPostprocessor.py")
            raise
rule bcl2fastq:
    # One job per lane; the lane number is the wildcard {l}.
    input:
        ssheet = "lane{l}/SampleSheet.filtered.csv"
    output:
        summary = "lane{l}/Stats/DemuxSummaryF1L{l}.txt",
        stats   = "lane{l}/Stats/DemultiplexingStats.xml",
        json    = "lane{l}/Stats/Stats.json",
        opts    = "lane{l}/bcl2fastq.opts",
    log:
        version = "lane{l}/bcl2fastq.version",
        log     = "lane{l}/bcl2fastq.log"
    threads: 12 # if you edit this number, also edit the cluster.yml
    shell:
        # Hand this lane over to do_demultiplex.sh, with the toolbox on PATH.
        # The log ends up in "lane{l}/bcl2fastq.log"; a shell trap copies the
        # last 20 lines of it to stderr on exit, for convenience.
        """trap 'tail -v -n 20 lane{wildcards.l}/bcl2fastq.log >&2' exit
           echo "Log will be written to lane{wildcards.l}/bcl2fastq.log"
           export PROCESSING_THREADS={threads}
           {TOOLBOX} do_demultiplex.sh {RUNDIR} lane{wildcards.l} {input.ssheet} {wildcards.l}
        """
rule setup:
    # Writes the per-lane filtered sample sheet that the bcl2fastq rule consumes.
    output:
        ssheet = "lane{l}/SampleSheet.filtered.csv"
    shell:
        # Read the per-lane revcomp override from the run's pipeline directory.
        # If that file is missing or empty, revcomp falls back to "auto".
        """revcomp=$(cat {RUNDIR}/pipeline/index_revcomp.lane{wildcards.l}.OVERRIDE 2>/dev/null || true)
           bcl2fastq_setup.py --lane {wildcards.l} --revcomp "${{revcomp:-auto}}" {RUNDIR} > {output}
        """