#!/bin/bash
# vim: ft=python
## This workflow specifically runs the well dups scanner.
## This triggers before demultiplexing but contributes to the
## final QC report so it makes sense for it to be separate.
## For non-patterned flowcells this will just be a no-op.
# Contents >>>
# + Embedded BASH script to bootstrap the workflow
# + Initialisation and configuration
# + Helper functions
# + The rules specific to this workflow
# + More generic rules
"""true" ### Begin shell script part
set -euo pipefail
source "`dirname $0`"/shell_helper_functions.sh
export TOOLBOX="$(find_toolbox)"
snakerun_drmaa "$0" "$@"
"exit""" ### End of shell script part
#!/usr/bin/env snakemake
import os   # used for os.environ and os.path below
import sys  # used for sys.stderr below
import yaml
import re
import xml.etree.ElementTree as ET
from itertools import chain
from snakemake.utils import format
# See notes in Snakefile.qc regarding the toolbox.
# Here we need wd_get_cached_targets and wd_count_well_duplicates to be available.
TOOLBOX = 'env PATH="{}:$PATH"'.format(os.environ['TOOLBOX'])
def glob():
"""Regular glob() is useful but it can be improved like so.
"""
from glob import glob
return lambda p: sorted( (f.rstrip('/') for f in glob(os.path.expanduser(p))) )
glob = glob()
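# A quick sketch of what the wrapper adds (the paths here are hypothetical):
#   glob('~/seqdata/18*') -> ['/home/user/seqdata/180101_A', '/home/user/seqdata/180202_B']
# i.e. ~ is expanded, any trailing '/' is stripped, and the result is sorted.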
# Caller can set this, or else there needs to be a symlink, as with Snakefile.demux
RUNDIR = os.path.realpath(config.get('rundir', './seqdata'))
def get_wd_settings(rundir, run_info_root):
"""Settings for well duplicates scanning.
"""
wd_settings = dict( TARGETS_TO_SAMPLE = 2500,
READ_LENGTH = 50,
START_POS = 20,
LEVELS_TO_SCAN = 5,
REPORT_VERBOSE = True )
# We need to examine the run info. OK to do this on every cluster job and
# fail immediately if missing.
# We're now relying on LANES_IN_RUN being correct for the barcode check even
# if there is no WD check to be run.
wd_settings.update(dict( LANES_IN_RUN = [1],
LAST_LANE = '0',
END_POS = 0,
TILE_MATCH = dict() ))
if not run_info_root:
# Just return the defaults
return wd_settings
wd_settings['LAST_TILE'] = wd_settings['FIRST_TILE'] = '0000'
try:
wd_settings['LAST_LANE'], wd_settings['LAST_TILE'] = \
max(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_')
_, wd_settings['FIRST_TILE'] = \
min(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_')
except ValueError:
# If there are no tiles we can reasonably assume we're on a non-patterned
# flowcell (MiSeq or 2500). In this case, no well dups scan.
# Of course, an update to the instrument software may change the XML...
return wd_settings
# The number of lanes to sample is now variable since the arrival of the
# NovaSeq, so get it from RunInfo.xml...
wd_settings['LANES_IN_RUN'] = list(range(1, int(wd_settings['LAST_LANE']) + 1))
# Find the first read that has >READ_LENGTH cycles. Normally read1 will do.
# If not using read1, always set START_POS to the first cycle of that read.
# Also, scanning later reads will fail if triggered early while BCL files are
# still missing, but that's OK as the pipeline will simply retry once the run
# is complete.
all_read_lens = [ int(r.get('NumCycles')) for r in
sorted( run_info_root.findall("Run/Reads/Read"),
key = lambda r: int(r.get("Number")) ) ]
if max(all_read_lens) > wd_settings['READ_LENGTH']:
read_to_sample = [ n for n, l in enumerate(all_read_lens)
if l > wd_settings['READ_LENGTH'] ][0]
# Is it OK to start at START_POS or do we go from 0?
if ( read_to_sample == 0 and
all_read_lens[read_to_sample] > (wd_settings['READ_LENGTH'] +
wd_settings['START_POS'])
):
start_pos = wd_settings['START_POS']
else:
start_pos = 0
# Now we have the relative START_POS, calculate the absolute START_POS and
# overwrite the initial setting
wd_settings['START_POS'] = start_pos + sum( all_read_lens[:read_to_sample] )
else:
# We can't process this run
print(f"Cannot process run with short read lengths {all_read_lens}.", file=sys.stderr)
return wd_settings
wd_settings['END_POS'] = wd_settings['READ_LENGTH'] + wd_settings['START_POS']
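# Worked example, assuming a hypothetical 2x151bp run with two 8bp index reads
# (read lengths [151, 8, 8, 151]): read 0 is the first read longer than 50
# cycles, and 151 > 50 + 20, so read_to_sample = 0, START_POS stays at 20 and
# END_POS = 70. If read 0 were too short, START_POS would be reset to the first
# absolute cycle of the first sufficiently long read.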
# If there are 4 swaths per side (NovaSeq) then only do the even tiles
if wd_settings['LAST_TILE'] >= '2400':
wd_settings['TILE_MATCH'] = dict( T='1..[02468]', B='2..[02468]' )
else:
wd_settings['TILE_MATCH'] = dict( T='1...', B='2...' )
# On the new NovaSeq SP flowcells they seem to be disabling one surface, so:
if wd_settings['LAST_TILE'][0] == '1':
del wd_settings['TILE_MATCH']['B']
elif wd_settings['FIRST_TILE'][0] == '2':
del wd_settings['TILE_MATCH']['T']
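# These patterns rely on the standard Illumina tile numbering, where the first
# digit of the four-digit tile code is the surface (1=top, 2=bottom), the
# second is the swath and the last two are the tile number, so '1..[02468]'
# matches even-numbered tiles on the top surface. The string comparison against
# '2400' above works because the tile codes are fixed-width.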
# Unfortunately there's more - for slimmed-down runs we get a line in
# pipeline_settings.ini like this, indicating that most tiles are missing:
# --tiles: s_[$LANE]_1101
# I don't want to simply ignore tiles with missing BCL files, as those should
# rightly trigger a genuine validation failure.
# Rather than re-code the logic for what goes in pipeline_settings.ini I'm just going
# to scan the file and interpret the line.
# This is dodgy in the general case but OK for test runs!
try:
with open(rundir + "/pipeline_settings.ini") as fh:
for l in fh:
mo = re.match(r'''\s*--tiles:.*]_(\d{4})['"]?$''', l)
if mo:
tile = mo.group(1)
wd_settings['TILE_MATCH'] = ( dict( T=tile ) if tile[0] == '1'
else dict( B=tile ) )
except Exception:
# No matter - if the file is missing or unreadable, keep the TILE_MATCH from above.
pass
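# For example, a pipeline_settings.ini containing `--tiles: s_[$LANE]_1101`
# yields tile = '1101', and hence TILE_MATCH = dict(T='1101').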
return wd_settings
try:
run_info_root = ET.parse(RUNDIR + "/RunInfo.xml").getroot()
RUN = run_info_root.find('./Run').attrib['Id']
except FileNotFoundError:
# Take the run name from the CWD
run_info_root = None
RUN = os.path.basename(os.path.realpath(os.getcwd()))
# Set the returned dict items as global variables
globals().update(get_wd_settings(RUNDIR, run_info_root))
# === Driver rules ===
localrules: wd_main, bc_main
if not TILE_MATCH:
rule wd_main:
# No-op rule
shell: "# No tiles to be sampled. Assuming this is a MiSeq run, or read1 is too short."
else:
rule wd_main:
input:
summary = format('QC/welldups/{TARGETS_TO_SAMPLE}summary.yml')
# For the early demultiplex barcode check. The idea is that if this file is non-empty then the
# driver will trigger an alert message incorporating the text of the file; if the
# file is missing it will send a warning instead.
rule bc_main:
input:
summary = 'QC/bc_check/bc_check.msg'
# === Rules to invoke the duplicate counter ===
localrules: make_wd_summary, make_bc_summary, setup_bc_check
rule make_wd_summary:
output: 'QC/welldups/{targets}summary.yml'
input:
expand( "QC/welldups/{{targets}}targets_lane{lane}{side}.txt", lane = LANES_IN_RUN,
side = TILE_MATCH.keys() )
run:
from collections import deque
from time import sleep
sleep(2) # Avoids annoying re-running due to clock skew
# TODO - I really should break this into a function and make a unit test!
# Result will be of the form { lane: { side: { metric: 12.34 } } }
res = dict()
for f in input:
# Since file names end in e.g. 2T.txt, populate res[lane][side] from the name
lane = res.setdefault(f[-6], dict())
side = lane.setdefault(f[-5], dict())
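# e.g. f = 'QC/welldups/2500targets_lane1T.txt' gives f[-6] == '1' (the lane)
# and f[-5] == 'T' (the side).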
# Look at the last 3 lines of the file
with open(f) as fh:
for k, v in [ l.rstrip('\n%').split(':') for l in deque(fh, 3) ]:
if "Overall" in k:
side["raw"] = float(v)
elif "v1" in k:
side["v1"] = float(v)
elif "v2" in k:
side["v2"] = float(v)
# Now average things up.
for lane in res.values():
lane['mean'] = { metric: sum(side[metric] for side in lane.values()) / len(lane)
for metric in list(lane.values())[0].keys() }
# And save it out
with open(output[0], 'w') as yfh:
yaml.safe_dump(res, yfh)
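# The resulting YAML is, schematically (the numbers here are made up):
#   '1':
#     T: {raw: 12.34, v1: 10.1, v2: 9.8}
#     B: {raw: 12.56, v1: 10.3, v2: 9.9}
#     mean: {raw: 12.45, v1: 10.2, v2: 9.85}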
rule prep_wd_indices:
output: "QC/welldups/{targets}clusters.list"
input:
slocs = format("{RUNDIR}/Data/Intensities/s.locs")
shell:
"{TOOLBOX} wd_get_cached_targets {input.slocs} {TARGETS_TO_SAMPLE} {output}"
rule count_well_dupl:
output: "QC/welldups/{targets}targets_lane{lane}{side,.}.txt"
input:
targfile = "QC/welldups/{targets}clusters.list"
params:
verbose = '' if REPORT_VERBOSE else '-S',
tiles = lambda wc: TILE_MATCH[wc.side],
cycles = f'{START_POS}-{END_POS}'
shell:
r"""{TOOLBOX} wd_count_well_duplicates -f {input.targfile} -n {wildcards.targets} -s {LAST_TILE} \
-r {RUNDIR} -i {wildcards.lane} -l {LEVELS_TO_SCAN} --cycles {params.cycles:q} \
-t {params.tiles:q} {params.verbose} > {output}"""
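# With the default settings, the command for lane 1 side T expands to roughly
# (the run-specific values and paths here are hypothetical):
#   wd_count_well_duplicates -f QC/welldups/2500clusters.list -n 2500 -s 2478 \
#       -r ./seqdata -i 1 -l 5 --cycles 20-70 -t '1..[02468]' > QC/welldups/2500targets_lane1T.txt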
# === Rules to pre-demultiplex one tile to check the barcode balance ===
# This copies all the Stats.json for MultiQC to find, then calls assess_bc_check.py
# to make the summary over all Stats.json files. It also adds QC/{rep}/run_info.{r}.2.yml
# for each report, which provides some metadata for the top of the report.
# Note this rule has no wildcards, just a load of expansions.
rule make_bc_summary:
output:
summary = "QC/bc_check/bc_check.msg",
json = expand("QC/lane{l}/Stats.json", l=LANES_IN_RUN),
unass = expand("QC/lane{l}/{run}_{l}_unassigned_table.txt", l=LANES_IN_RUN, run=[RUN]),
ri2o = expand("QC/overview/run_info.{r}.2.yml", r=[RUN]),
ri2l = expand("QC/lane{l}/run_info.{r}.2.yml", l=LANES_IN_RUN, r=[RUN]),
input:
json = expand("QC/bc_check/lane{l}/Stats/Stats.json", l=LANES_IN_RUN),
unass = expand("QC/bc_check/lane{l}/unassigned_table.txt", l=LANES_IN_RUN),
opts = expand("QC/bc_check/lane{l}/bcl2fastq.opts", l=LANES_IN_RUN),
log = expand("QC/bc_check/lane{l}/bcl2fastq.log", l=LANES_IN_RUN),
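# (expand() just fills in the lists: with LANES_IN_RUN = [1, 2] the json input
# would be ['QC/bc_check/lane1/Stats/Stats.json', 'QC/bc_check/lane2/Stats/Stats.json'].)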
run:
for ij, oj in chain( zip(input.json, output.json),
zip(input.unass, output.unass) ):
shell("cp -v {ij} {oj}")
shell("assess_bc_check.py {input.json} > {output.summary}")
shell("summarize_post_bcl2fastq.py --subdir QC/bc_check > {output.ri2o}")
for n, l in enumerate(LANES_IN_RUN):
out_ri2l = output.ri2l[n]
shell("summarize_post_bcl2fastq.py --subdir QC/bc_check --lane {l} > {out_ri2l}")
# Read 1 demux uses a shadow directory as we don't want to keep the reports or the FASTQ files at all.
# Shallow shadow is tricky but here we can safely assume the QC dir already exists and bc_check_tmp
# does not.
rule bcl2fastq_bc_check:
output:
json = "QC/bc_check/lane{l}/Stats/Stats.json",
opts = "QC/bc_check/lane{l}/bcl2fastq.opts",
unass = "QC/bc_check/lane{l}/unassigned_table.txt"
log:
version = "QC/bc_check/lane{l}/bcl2fastq.version",
log = "QC/bc_check/lane{l}/bcl2fastq.log"
input:
ssheet = "QC/bc_check/lane{l}/SampleSheet.filtered.csv"
threads: 2
shadow: "shallow"
shell:
# Run do_demultiplex.sh for this lane. The silly stuff with f=0 is just to allow us to
# preserve the log even if the demux fails.
"""export PROCESSING_THREADS={threads}
mkdir bc_check_tmp
lanedir=bc_check_tmp/QC/bc_check/lane{wildcards.l}
mkdir -p "$lanedir"
f=0
{TOOLBOX} do_demultiplex.sh {RUNDIR} "$lanedir" {input.ssheet} {wildcards.l} || f=1
for outfile in {log} {output} ; do
mv bc_check_tmp/$outfile $outfile || true
done
unassigned_to_table.py {output.json} > {output.unass}
( exit $f )
"""
rule setup_bc_check:
output:
ssheet = "QC/bc_check/lane{l}/SampleSheet.filtered.csv"
input:
ssheet = RUNDIR + "/SampleSheet.csv"
shell:
# If the override file is missing or empty we'll apply auto revcomp logic.
# I considered using TILE_MATCH to configure which single tile to sample but this is not
# a good idea - just leave it to bcl2fastq_setup.py to work out a sensible option.
"""revcomp=$(cat {RUNDIR}/pipeline/index_revcomp.lane{wildcards.l}.OVERRIDE 2>/dev/null || true)
bcl2fastq_setup.py --bc_check --lane {wildcards.l} --revcomp "${{revcomp:-auto}}" {RUNDIR} > {output}
"""