Snakefile.demux

#!/bin/bash
# vim: ft=python

## This workflow should be run in a fastqdata/demultiplexing directory to
## drive bcl2fastq. The reason for running here is to get the
## .snakemake and slurm_output dirs in a sensible place.

# Contents >>>
#   + Embedded BASH script to bootstrap the workflow
#   + Initialisation and configuration
#   + Helper functions
#   + The rules specific to this workflow
#   + More generic rules

"""true" ### Begin shell script part
# Abort on any failing command, on use of an unset variable, and when any
# stage of a pipeline fails.
set -euo pipefail

# Load the helper functions that live alongside this script. find_toolbox and
# snakerun_drmaa are not defined in this file, so presumably they come from
# here. The path is quoted so that a script location containing whitespace
# still resolves correctly.
source "$(dirname "$0")"/shell_helper_functions.sh

# $PATH doesn't get passed to worker nodes on SLURM but I only need it
# for local rules to see programs in this directory. Toolbox path is a separate
# thing and will be added explicitly.
export TOOLBOX="$(find_toolbox)"
export PATH="${PATH}:$(dirname "$0")"

# Caller needs to supply --config lanes='? ? ?' rundir=?
# Apply -F to run all steps every time. Otherwise I could maybe depend on the
# SampleSheet.csv as an input?
# This file itself ("$0") is handed over as the workflow to run, and any
# arguments given by the caller are forwarded unchanged.
snakerun_drmaa "$0" --keep-going -F "$@"

"exit""" ### End of shell script part

#!/usr/bin/env snakemake
import yaml

# do_demultiplex.sh needs to find bcl2fastq in the toolbox.
# TOOLBOX is a shell command prefix ('env PATH="<toolbox>:$PATH"') that is
# interpolated in front of the do_demultiplex.sh call in the bcl2fastq rule.
# The toolbox directory is read from the TOOLBOX environment variable, which
# the embedded shell bootstrap exports; a KeyError is raised here if it is
# not set.
TOOLBOX = 'env PATH="{}:$PATH"'.format(os.environ['TOOLBOX'])

def glob():
    """Build the enhanced glob function used throughout this workflow.

    The returned callable takes a single pattern, expands a leading '~' via
    os.path.expanduser(), runs the standard library glob on it, removes any
    trailing '/' from each match and returns the matches as a sorted list.
    The factory is called once immediately below, so that the module-level
    name 'glob' ends up bound to the enhanced function.
    """
    import glob as _stdlib_glob

    def _sorted_glob(p):
        matches = _stdlib_glob.glob(os.path.expanduser(p))
        return sorted(m.rstrip('/') for m in matches)

    return _sorted_glob
glob = glob()

# Config - must be supplied
# 'lanes' is mandatory: a missing key raises KeyError here. It is used below
# to expand the per-lane input files of the postprocess rule.
LANES = config['lanes']
# 'rundir' falls back to './seqdata' when not given; either way the path is
# resolved to its real location, following symlinks.
RUNDIR = os.path.realpath(config.get('rundir', './seqdata')) #symlink now made by driver.sh

def get_sequential_file(pattern):
    """Given a filename like foo???.log, find the next available filename that
       matches the pattern. The file will be created and left empty.
       I've coded this to avoid race conditions but it's not a problem as
       Snakemake locks the working dir anyway.

       pattern: a file name containing one run of '?' characters that marks
                where the zero-padded sequence number goes. The number of '?'
                characters sets the minimum width of the number.

       Returns the name of the newly created, empty file. Numbering starts
       from the highest number found among existing files matching the
       pattern (or 0 if there are none) and moves up until a free name is
       found.

       Raises ValueError if the pattern contains no '?' at all. Any error from
       open() other than the file already existing is propagated unchanged.
    """
    parsed = re.match(r'([^?]*)([?]+)([^?]*)', pattern)
    if parsed is None:
        raise ValueError("Pattern {!r} contains no '?' placeholder".format(pattern))
    pre, c, post = parsed.groups()

    seqnum = 0

    for existing in glob(pattern):
        # Cut the number out by position. str.lstrip()/rstrip() remove any of
        # the given *characters*, not a prefix/suffix, so they would also eat
        # digits of the number whenever the prefix or suffix contains digits.
        if not (existing.startswith(pre) and existing.endswith(post)):
            continue
        digits = existing[len(pre):len(existing) - len(post)]
        try:
            # Use the maximum so that the result does not depend on the order
            # in which the matches are listed.
            seqnum = max(seqnum, int(digits))
        except ValueError:
            # '?' matches any character, so not every match is a number.
            pass

    while True:
        candidate = '{}{:0{len}d}{}'.format(pre, seqnum, post, len=len(c))
        try:
            # Mode 'x' creates the file exclusively, so an existing name is
            # never reused. The with-block closes the handle on every path
            # without touching an unbound name if open() itself fails.
            with open(candidate, mode='x') as fh:
                return fh.name
        except FileExistsError:
            seqnum += 1

# bcl2fastq is run for each LANE to produce laneN/Stats/DemuxSummaryF1LN.txt,
# laneN/Stats/DemultiplexingStats.xml and laneN/Stats/Stats.json

# The postprocess rule is run just once to gather and rename all the fastq.gz files.
# Its output is renames.log, a symlink to renames.log.XXX where XXX starts at 000.

# Setup and postprocess rules can run locally
# (i.e. Snakemake executes them itself instead of submitting them as cluster
# jobs; only the bcl2fastq rule is left for the cluster).
localrules: setup, postprocess

# I accidentally put '../projects_ready.txt' as an output of this rule but then
# the file gets clobbered on a REDO, so don't do that again! This file should
# be modified by the post processor and never removed.
rule postprocess:
    # Both outputs are created by the run block below: 'renames.log' as a
    # symlink to a newly numbered log file, and 'projects_ready.txt.bak' as a
    # copy of ../projects_ready.txt.
    output: 'renames.log', 'projects_ready.txt.bak'
    # One summary file and one Stats.json per configured lane. The run block
    # does not read them by name; they serve to order this rule after the
    # bcl2fastq jobs for every lane.
    input:
        summary = expand("lane{l}/Stats/DemuxSummaryF1L{l}.txt", l=LANES),
        stats   = expand("lane{l}/Stats/Stats.json", l=LANES),
    run:
        # Claim the next unused renames.log.NNN and point renames.log at it.
        log_file = get_sequential_file('renames.log.???')
        shell("ln -s {log_file} renames.log")
        # The post-processor is called with '..' as its only argument.
        # NOTE(review): presumably it writes its log via renames.log and
        # updates ../projects_ready.txt - confirm against the script.
        shell("BCL2FASTQPostprocessor.py ..")
        try:
            # 'test -s' fails unless the file exists and is non-empty, so an
            # empty or missing projects_ready.txt is treated as an error.
            shell("test -s ../projects_ready.txt")
            shell("cp ../projects_ready.txt projects_ready.txt.bak")
        except subprocess.CalledProcessError:
             # Log a readable explanation, then re-raise so the rule still fails.
             snakemake.logging.logger.error("No projects listed in projects_ready.txt after calling BCL2FASTQPostprocessor.py")
             raise

# Per-lane demultiplexing: runs do_demultiplex.sh once for each lane {l}.
rule bcl2fastq:
    # None of these paths is passed to the script explicitly.
    # NOTE(review): presumably do_demultiplex.sh derives them from the lane
    # directory argument - confirm against the script.
    output:
        summary = "lane{l}/Stats/DemuxSummaryF1L{l}.txt",
        stats   = "lane{l}/Stats/DemultiplexingStats.xml",
        json    = "lane{l}/Stats/Stats.json",
        opts    = "lane{l}/bcl2fastq.opts",
    # Declared as log files; the shell command below refers to the log by its
    # literal path rather than through {log}.
    log:
        version = "lane{l}/bcl2fastq.version",
        log     = "lane{l}/bcl2fastq.log"
    # The filtered sample sheet for this lane, as produced by the setup rule.
    input:
        ssheet  = "lane{l}/SampleSheet.filtered.csv"
    threads: 12 # if you edit this number, also edit the cluster.yml
    shell:
        # Run do_demultiplex.sh for this lane.
        # Log will go to "lane{l}/bcl2fastq.log"; for convenience, dump the tail of the log
        # to stderr using a shell trap.
        # The thread count is exported as PROCESSING_THREADS (presumably read
        # by do_demultiplex.sh - confirm), and {TOOLBOX} prefixes the call so
        # that the toolbox directory is first on PATH.
        """trap 'tail -v -n 20 lane{wildcards.l}/bcl2fastq.log >&2' exit
           echo "Log will be written to lane{wildcards.l}/bcl2fastq.log"
           export PROCESSING_THREADS={threads}
           {TOOLBOX} do_demultiplex.sh {RUNDIR} lane{wildcards.l} {input.ssheet} {wildcards.l}
        """

# Per-lane preparation: writes the filtered sample sheet that the bcl2fastq
# rule takes as its input.
rule setup:
    output:
        ssheet = "lane{l}/SampleSheet.filtered.csv"
    shell:
        # Override file being missing or empty results in auto revcomp
        # A failing 'cat' is silenced and ignored, leaving $revcomp empty, and
        # the shell default expansion then substitutes 'auto'. The doubled
        # braces stop Snakemake from treating the shell expansion as one of
        # its own placeholders. stdout of bcl2fastq_setup.py becomes the
        # output file.
        """revcomp=$(cat {RUNDIR}/pipeline/index_revcomp.lane{wildcards.l}.OVERRIDE 2>/dev/null || true)
           bcl2fastq_setup.py --lane {wildcards.l} --revcomp "${{revcomp:-auto}}" {RUNDIR} > {output}
        """