Commit
fixes
fabio-cunial committed Nov 6, 2023
1 parent 1acffcc commit 1fc3379
Showing 6 changed files with 331 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .dockstore.yml
@@ -3,3 +3,6 @@ workflows:
- name: HelloWorkflow
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/HelloWorkflow.wdl
- name: TruvariIntrasample
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/TruvariIntrasample.wdl
107 changes: 107 additions & 0 deletions docker/truvari_intrasample/Dockerfile
@@ -0,0 +1,107 @@
FROM continuumio/miniconda3
MAINTAINER Fabio Cunial
ARG work_dir=/truvari_intrasample
WORKDIR ${work_dir}


# --------------------------------- Versions -----------------------------------
ARG gcloud_version=429.0.0
ARG htslib_version=1.18
ARG samtools_version=1.18
ARG bcftools_version=1.18
# ------------------------------------------------------------------------------


# OS AND BASIC UTILITIES
RUN apt-get -qqy update --fix-missing \
&& apt-get -qqy dist-upgrade \
&& apt-get install -y --no-install-recommends \
zlib1g-dev \
liblzma-dev \
libbz2-dev \
libdeflate-dev \
libreadline-dev \
libsqlite3-dev \
libssl-dev \
libcurl4-openssl-dev \
libncurses5-dev \
libncursesw5-dev \
libffi-dev \
liblzma-dev \
libopenblas-dev \
apt-transport-https \
gawk \
ca-certificates \
tree \
gnupg \
ssh \
time \
curl \
wget \
autotools-dev \
autoconf \
automake \
make \
cmake \
gcc \
g++ \
gfortran \
build-essential \
git \
bc \
python3-pip \
xz-utils \
tk-dev \
python-dev \
bsdmainutils \
unzip \
python3-pycurl

# GSUTIL
RUN pip3 uninstall -y crcmod && pip3 install --no-cache-dir -U crcmod
RUN wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \
&& tar -xf google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \
&& rm -f google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \
&& yes | ./google-cloud-sdk/install.sh
ENV PATH=${work_dir}/google-cloud-sdk/bin:${PATH}

# HTSLIB
RUN wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 \
&& tar xjf htslib-${htslib_version}.tar.bz2 \
&& rm htslib-${htslib_version}.tar.bz2 \
&& cd htslib-${htslib_version} \
&& ./configure \
&& make \
&& make install \
&& cd ${work_dir} \
&& rm -rf htslib-${htslib_version} \
&& bgzip --help

# SAMTOOLS
RUN wget https://github.com/samtools/samtools/releases/download/${samtools_version}/samtools-${samtools_version}.tar.bz2 \
&& tar xjf samtools-${samtools_version}.tar.bz2 \
&& rm samtools-${samtools_version}.tar.bz2 \
&& cd samtools-${samtools_version} \
&& ./configure --without-curses \
&& make \
&& make install \
&& cd ${work_dir} \
&& rm -rf samtools-${samtools_version} \
&& samtools --help

# BCFTOOLS
RUN wget https://github.com/samtools/bcftools/releases/download/${bcftools_version}/bcftools-${bcftools_version}.tar.bz2 \
&& tar xjf bcftools-${bcftools_version}.tar.bz2 \
&& rm bcftools-${bcftools_version}.tar.bz2 \
&& cd bcftools-${bcftools_version} \
&& ./configure --without-curses \
&& make \
&& make install \
&& cd ${work_dir} \
&& rm -rf bcftools-${bcftools_version} \
&& bcftools --help

# TRUVARI
RUN pip3 install git+https://github.com/acenglish/[email protected] \
&& truvari --help
COPY ./resolve.py ${work_dir}
4 changes: 4 additions & 0 deletions docker/truvari_intrasample/pushDocker.sh
@@ -0,0 +1,4 @@
#!/bin/bash
#
docker build --progress=plain -t fcunial/truvari_intrasample .
docker push fcunial/truvari_intrasample
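
A quick local sanity check of the image before pushing could look like the following sketch (not part of the repository; it assumes the base image's default entrypoint simply executes the given command):

docker build --progress=plain -t fcunial/truvari_intrasample .
docker run --rm fcunial/truvari_intrasample bcftools --version
docker run --rm fcunial/truvari_intrasample truvari --help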
82 changes: 82 additions & 0 deletions docker/truvari_intrasample/resolve.py
@@ -0,0 +1,82 @@
"""
Given a VCF, fill in the <DEL>, <INV>, <DUP> ALT sequence
DEL and INV are easy. But I gotta check the DUPs' coordinates.
They'll have a span, but can I just insert that sequence?
Also, do I have to worry about dup-to-ins?
"""

import sys
import pysam
import truvari

REF_CLEAN = True # Set to false if you're working with the right reference
MAX_SV = 100_000 # Filter things smaller than this

RC = str.maketrans("ATCG", "TAGC")
def do_rc(s):
"""
Reverse complement a sequence
"""
return s.translate(RC)[::-1]

def resolve(entry, ref):
"""
"""
if entry.start > ref.get_reference_length(entry.chrom):
return None

seq = ref.fetch(entry.chrom, entry.start, entry.stop)
if entry.alts[0] == '<DEL>':
entry.ref = seq
entry.alts = [seq[0]]
elif entry.alts[0] == '<INV>':
entry.ref = seq
entry.alts = [do_rc(seq)]
elif entry.alts[0] == '<DUP>':
entry.info['SVTYPE'] = 'INS'
entry.ref = seq[0]
entry.alts = [seq]
entry.stop = entry.start + 1
entry.qual = 1
return entry

if __name__ == '__main__':
default_quals = {"pbsv": 3,
"sniffles": 2,
"pav": 4}

vcf = pysam.VariantFile(sys.argv[1])
ref = pysam.FastaFile(sys.argv[2])
n_header = vcf.header.copy()
d_qual = 1
for key, val in default_quals.items():
if key in sys.argv[1]:
d_qual = default_quals[key]
if REF_CLEAN:
for ctg in vcf.header.contigs.keys():
if ctg not in ref.references:
n_header.contigs[ctg].remove_header()

out = pysam.VariantFile("/dev/stdout", 'w', header=n_header)
for entry in vcf:
if REF_CLEAN and entry.chrom not in ref.references:
continue

if truvari.entry_size(entry) >= MAX_SV:
continue

entry.qual = d_qual
if entry.alts[0].startswith("<"):
entry = resolve(entry, ref)

if entry is None or set(entry.alts[0]) == {'N'}:
continue
# No more blank genotypes
n_gt = tuple([_ if _ is not None else 0 for _ in entry.samples[0]['GT']])
entry.samples[0]['GT'] = n_gt
entry.translate(n_header)
try:
out.write(entry)
except Exception:
sys.stderr.write(f"{entry}\n{type(entry)}\n")
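
For reference, a standalone invocation equivalent to Step 1 of the WDL below might look like the following sketch (file names are placeholders; the bcftools options are copied verbatim from the workflow). Note that resolve.py picks the default QUAL from the caller name found in the input file name (pbsv 3, sniffles 2, pav 4), so keeping the caller name in the file name matters:

python resolve.py HG002.pbsv.vcf.gz reference.fa \
    | bcftools norm --check-ref s --fasta-ref reference.fa -N -m-any \
    | bcftools view -i "SVTYPE != 'BND'" -O z -o preprocessed.HG002.pbsv.vcf.gz
tabix preprocessed.HG002.pbsv.vcf.gz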
9 changes: 6 additions & 3 deletions scripts/build_docker_images.sh
@@ -2,6 +2,8 @@

set -euxo pipefail

DOCKER_REGISTRY="us.gcr.io" # us-central1-docker.pkg.dev is not supported on RWB
GOOGLE_PROJECT="broad-dsp-lrma"
DOCKER_REPO="aou-lr"
LABEL=$1

@@ -11,8 +13,9 @@ do
DOCKER_NAME=$(basename $DIR_NAME)

VERSION=$(grep '^current_version' .bumpversion.cfg | cut -d' ' -f3)
TAG_VERSION="us-central1-docker.pkg.dev/broad-dsp-lrma/$DOCKER_REPO/$DOCKER_NAME:$VERSION"
TAG_LABEL="us-central1-docker.pkg.dev/broad-dsp-lrma/$DOCKER_REPO/$DOCKER_NAME:$LABEL"

TAG_VERSION="$DOCKER_REGISTRY/$GOOGLE_PROJECT/$DOCKER_REPO/$DOCKER_NAME:$VERSION"
TAG_LABEL="$DOCKER_REGISTRY/$GOOGLE_PROJECT/$DOCKER_REPO/$DOCKER_NAME:$LABEL"

if docker manifest inspect $TAG_VERSION > /dev/null; then
# Make sure the most recent version of the Docker image gets used as the cache
@@ -26,4 +29,4 @@

docker build -t $TAG_LABEL $DIR_NAME
docker push $TAG_LABEL
done
done
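
A hypothetical invocation (the label is the script's single positional argument); with the new variables, each image is tagged as $DOCKER_REGISTRY/$GOOGLE_PROJECT/$DOCKER_REPO/<docker_name>, i.e. us.gcr.io/broad-dsp-lrma/aou-lr/<docker_name>, once with the bumpversion version and once with the given label:

bash scripts/build_docker_images.sh test_label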
129 changes: 129 additions & 0 deletions wdl/pipelines/TruvariIntrasample.wdl
@@ -0,0 +1,129 @@
version 1.0


# Workflow for intra-sample merging for AoU.
#
workflow TruvariIntrasample {
input {
String sample_id
File pbsv_vcf_gz
File pbsv_vcf_gz_tbi
File sniffles_vcf_gz
File sniffles_vcf_gz_tbi
File pav_vcf_gz
File pav_vcf_gz_tbi
File reference_fa
}
parameter_meta {
}

call TruvariIntrasampleImpl {
input:
sample_id = sample_id,
pbsv_vcf_gz = pbsv_vcf_gz,
pbsv_vcf_gz_tbi = pbsv_vcf_gz_tbi,
sniffles_vcf_gz = sniffles_vcf_gz,
sniffles_vcf_gz_tbi = sniffles_vcf_gz_tbi,
pav_vcf_gz = pav_vcf_gz,
pav_vcf_gz_tbi = pav_vcf_gz_tbi,
reference_fa = reference_fa
}

output {
File truvari_collapsed = TruvariIntrasampleImpl.truvari_collapsed
File truvari_collapsed_idx = TruvariIntrasampleImpl.truvari_collapsed_idx
File bcftools_merged = TruvariIntrasampleImpl.bcftools_merged
File bcftools_merged_idx = TruvariIntrasampleImpl.bcftools_merged_idx
}
}


# Other intermediate files are created but are likely not useful for
# production:
# - preprocessed/ : a directory with each caller's cleaned result
# - removed.vcf.gz : variant representations removed during collapsing
#
task TruvariIntrasampleImpl {
input {
String sample_id
File pbsv_vcf_gz
File pbsv_vcf_gz_tbi
File sniffles_vcf_gz
File sniffles_vcf_gz_tbi
File pav_vcf_gz
File pav_vcf_gz_tbi
File reference_fa
}
parameter_meta {
}

Int disk_size_gb = 10*( ceil(size(pbsv_vcf_gz,"GB")) + ceil(size(sniffles_vcf_gz,"GB")) + ceil(size(pav_vcf_gz,"GB")) + ceil(size(reference_fa,"GB")) ) + 50
String docker_dir = "/truvari_intrasample"
String work_dir = "/cromwell_root/truvari_intrasample"

command <<<
set -euxo pipefail
mkdir -p ~{work_dir}
cd ~{work_dir}

TIME_COMMAND="/usr/bin/time --verbose"
N_SOCKETS="$(lscpu | grep '^Socket(s):' | awk '{print $NF}')"
N_CORES_PER_SOCKET="$(lscpu | grep '^Core(s) per socket:' | awk '{print $NF}')"
N_THREADS=$(( ${N_SOCKETS} * ${N_CORES_PER_SOCKET} ))

# Step 1 - clean up the VCFs
# - Assigns a quality score to each SV caller's result:
# pav 4, pbsv 3, sniffles 2.
# - Resolves any symbolic ALT (e.g. `<DEL>`) with the corresponding sequence
# from the reference; resolved symbolic variants get a quality score of 1.
# - Filters out variants of 100 kbp or larger (this filter may be removed
# later).
# - Fills in blank genotypes with `0`.
# - Filters out BND variants.
# The quality scores reflect which variant representations we believe are
# generally more accurate, with higher being better.
mkdir -p preprocessed
for in_vcf in ~{pav_vcf_gz} ~{pbsv_vcf_gz} ~{sniffles_vcf_gz}
do
outname=preprocessed/$(basename $in_vcf)
python ~{docker_dir}/resolve.py ${in_vcf} ~{reference_fa} \
| bcftools norm --check-ref s --fasta-ref ~{reference_fa} -N -m-any \
| bcftools view -i "SVTYPE != 'BND'" -O z -o ${outname}
tabix $outname
done

# Step 2 - merge
# Pastes the samples together in the order of the preferred genotypes,
# i.e. it creates a three-sample VCF whose sample columns come from pav,
# pbsv, and sniffles, in that order.
bcftools merge --threads ${N_THREADS} --merge none --force-samples -O z \
-o ~{sample_id}.bcftools_merged.vcf.gz \
preprocessed/$(basename ~{pav_vcf_gz}) \
preprocessed/$(basename ~{pbsv_vcf_gz}) \
preprocessed/$(basename ~{sniffles_vcf_gz})
tabix ~{sample_id}.bcftools_merged.vcf.gz

# Step 3 - collapse
truvari collapse -i ~{sample_id}.bcftools_merged.vcf.gz -c removed.vcf.gz -k maxqual --gt --intra \
--pctseq 0.90 --pctsize 0.90 --refdist 500 \
| bcftools sort -O z -o ~{sample_id}.truvari_collapsed.vcf.gz
tabix ~{sample_id}.truvari_collapsed.vcf.gz
>>>

output {
File truvari_collapsed = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz"
File truvari_collapsed_idx = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz.tbi"
File bcftools_merged = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz"
File bcftools_merged_idx = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz.tbi"
}
runtime {
docker: "fcunial/truvari_intrasample"
cpu: 1
memory: "16GB"
disks: "local-disk " + disk_size_gb + " HDD"
preemptible: 0
}
}
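
A sketch of a local Cromwell run (not part of the repository; the jar path, file names, and sample ID are placeholders, and the task's hard-coded /cromwell_root working directory assumes a Cromwell-style execution environment):

cat > inputs.json <<'EOF'
{
  "TruvariIntrasample.sample_id": "HG002",
  "TruvariIntrasample.pbsv_vcf_gz": "HG002.pbsv.vcf.gz",
  "TruvariIntrasample.pbsv_vcf_gz_tbi": "HG002.pbsv.vcf.gz.tbi",
  "TruvariIntrasample.sniffles_vcf_gz": "HG002.sniffles.vcf.gz",
  "TruvariIntrasample.sniffles_vcf_gz_tbi": "HG002.sniffles.vcf.gz.tbi",
  "TruvariIntrasample.pav_vcf_gz": "HG002.pav.vcf.gz",
  "TruvariIntrasample.pav_vcf_gz_tbi": "HG002.pav.vcf.gz.tbi",
  "TruvariIntrasample.reference_fa": "GRCh38.fa"
}
EOF
java -jar cromwell.jar run wdl/pipelines/TruvariIntrasample.wdl --inputs inputs.json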
