From 1fc337923bac73cc724159f2297edc6ee668a60c Mon Sep 17 00:00:00 2001 From: fcunial Date: Mon, 6 Nov 2023 11:56:06 -0500 Subject: [PATCH] fixes --- .dockstore.yml | 3 + docker/truvari_intrasample/Dockerfile | 107 +++++++++++++++++++ docker/truvari_intrasample/pushDocker.sh | 4 + docker/truvari_intrasample/resolve.py | 82 ++++++++++++++ scripts/build_docker_images.sh | 9 +- wdl/pipelines/TruvariIntrasample.wdl | 129 +++++++++++++++++++++++ 6 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 docker/truvari_intrasample/Dockerfile create mode 100755 docker/truvari_intrasample/pushDocker.sh create mode 100644 docker/truvari_intrasample/resolve.py create mode 100644 wdl/pipelines/TruvariIntrasample.wdl diff --git a/.dockstore.yml b/.dockstore.yml index a22917f..b28df87 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -3,3 +3,6 @@ workflows: - name: HelloWorkflow subclass: wdl primaryDescriptorPath: /wdl/pipelines/HelloWorkflow.wdl +- name: TruvariIntrasample + subclass: wdl + primaryDescriptorPath: /wdl/pipelines/TruvariIntrasample.wdl \ No newline at end of file diff --git a/docker/truvari_intrasample/Dockerfile b/docker/truvari_intrasample/Dockerfile new file mode 100644 index 0000000..a69e6dc --- /dev/null +++ b/docker/truvari_intrasample/Dockerfile @@ -0,0 +1,107 @@ +FROM continuumio/miniconda3 +MAINTAINER Fabio Cunial +ARG work_dir=/truvari_intrasample +WORKDIR ${work_dir} + + +# --------------------------------- Versions ----------------------------------- +ARG gcloud_version=429.0.0 +ARG htslib_version=1.18 +ARG samtools_version=1.18 +ARG bcftools_version=1.18 +# ------------------------------------------------------------------------------ + + +# OS AND BASIC UTILITIES +RUN apt-get -qqy update --fix-missing \ + && apt-get -qqy dist-upgrade \ + && apt-get install -y --no-install-recommends \ + zlib1g-dev \ + liblzma-dev \ + libbz2-dev \ + libdeflate-dev \ + libreadline-dev \ + libsqlite3-dev \ + libssl-dev \ + libcurl4-openssl-dev \ + 
libncurses5-dev \ + libncursesw5-dev \ + libffi-dev \ + liblzma-dev \ + libopenblas-dev \ + apt-transport-https \ + gawk \ + ca-certificates \ + tree \ + gnupg \ + ssh \ + time \ + curl \ + wget \ + autotools-dev \ + autoconf \ + automake \ + make \ + cmake \ + gcc \ + g++ \ + gfortran \ + build-essential \ + git \ + bc \ + python3-pip \ + xz-utils \ + tk-dev \ + python-dev \ + bsdmainutils \ + unzip \ + python3-pycurl + +# GSUTIL +RUN pip3 uninstall -y crcmod && pip3 install --no-cache-dir -U crcmod +RUN wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ + && tar -xf google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ + && rm -f google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ + && yes | ./google-cloud-sdk/install.sh +ENV PATH=${work_dir}/google-cloud-sdk/bin:${PATH} + +# HTSLIB +RUN wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 \ + && tar xjf htslib-${htslib_version}.tar.bz2 \ + && rm htslib-${htslib_version}.tar.bz2 \ + && cd htslib-${htslib_version} \ + && ./configure \ + && make \ + && make install \ + && cd ${work_dir} \ + && rm -rf htslib-${htslib_version} \ + && bgzip --help + +# SAMTOOLS +RUN wget https://github.com/samtools/samtools/releases/download/${samtools_version}/samtools-${samtools_version}.tar.bz2 \ + && tar xjf samtools-${samtools_version}.tar.bz2 \ + && rm samtools-${samtools_version}.tar.bz2 \ + && cd samtools-${samtools_version} \ + && ./configure --without-curses \ + && make \ + && make install \ + && cd ${work_dir} \ + && rm -rf samtools-${samtools_version} \ + && samtools --help + +# BCFTOOLS +RUN wget https://github.com/samtools/bcftools/releases/download/${bcftools_version}/bcftools-${bcftools_version}.tar.bz2 \ + && tar xjf bcftools-${bcftools_version}.tar.bz2 \ + && rm bcftools-${bcftools_version}.tar.bz2 \ + && cd bcftools-${bcftools_version} \ + && ./configure --without-curses \ + && make \ 
+ && make install \ + && cd ${work_dir} \ + && rm -rf bcftools-${bcftools_version} \ + && bcftools --help + +# TRUVARI +RUN pip3 install git+https://github.com/acenglish/truvari.git@v4.2.0-rc \ + && truvari --help +COPY ./resolve.py ${work_dir} diff --git a/docker/truvari_intrasample/pushDocker.sh b/docker/truvari_intrasample/pushDocker.sh new file mode 100755 index 0000000..eb3cd36 --- /dev/null +++ b/docker/truvari_intrasample/pushDocker.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# +docker build --progress=plain -t fcunial/truvari_intrasample . +docker push fcunial/truvari_intrasample diff --git a/docker/truvari_intrasample/resolve.py b/docker/truvari_intrasample/resolve.py new file mode 100644 index 0000000..4c3f088 --- /dev/null +++ b/docker/truvari_intrasample/resolve.py @@ -0,0 +1,82 @@ +""" +Given a VCF, fill in the <DEL>, <INV>, <DUP> ALT sequence + +DEL and INV are easy. But I gotta check the DUPs' coordinates. +They'll have a span, but can I just insert that sequence? +Also, do I have to worry about dup-to-ins?
+""" + +import sys +import pysam +import truvari + +REF_CLEAN = True # Set to false if you're working with the right reference +MAX_SV = 100_000 # Filter things smaller than this + +RC = str.maketrans("ATCG", "TAGC") +def do_rc(s): + """ + Reverse complement a sequence + """ + return s.translate(RC)[::-1] + +def resolve(entry, ref): + """ + """ + if entry.start > ref.get_reference_length(entry.chrom): + return None + + seq = ref.fetch(entry.chrom, entry.start, entry.stop) + if entry.alts[0] == '<DEL>': + entry.ref = seq + entry.alts = [seq[0]] + elif entry.alts[0] == '<INV>': + entry.ref = seq + entry.alts = [do_rc(seq)] + elif entry.alts[0] == '<DUP>': + entry.info['SVTYPE'] = 'INS' + entry.ref = seq[0] + entry.alts = [seq] + entry.stop = entry.start + 1 + entry.qual = 1 + return entry + +if __name__ == '__main__': + default_quals = {"pbsv": 3, + "sniffles": 2, + "pav": 4} + + vcf = pysam.VariantFile(sys.argv[1]) + ref = pysam.FastaFile(sys.argv[2]) + n_header = vcf.header.copy() + d_qual = 1 + for key, val in default_quals.items(): + if key in sys.argv[1]: + d_qual = default_quals[key] + if REF_CLEAN: + for ctg in vcf.header.contigs.keys(): + if ctg not in ref.references: + n_header.contigs[ctg].remove_header() + + out = pysam.VariantFile("/dev/stdout", 'w', header=n_header) + for entry in vcf: + if REF_CLEAN and entry.chrom not in ref.references: + continue + + if truvari.entry_size(entry) >= MAX_SV: + continue + + entry.qual = d_qual + if entry.alts[0].startswith("<"): + entry = resolve(entry, ref) + + if entry is None or set(entry.alts[0]) == {'N'}: + continue + # No more blank genotypes + n_gt = tuple([_ if _ is not None else 0 for _ in entry.samples[0]['GT']]) + entry.samples[0]['GT'] = n_gt + entry.translate(n_header) + try: + out.write(entry) + except Exception: + sys.stderr.write(f"{entry}\n{type(entry)}\n") diff --git a/scripts/build_docker_images.sh b/scripts/build_docker_images.sh index 7691320..b1e213c 100755 --- a/scripts/build_docker_images.sh +++
b/scripts/build_docker_images.sh @@ -2,6 +2,8 @@ set -euxo pipefail +DOCKER_REGISTRY="us.gcr.io" # us-central1-docker.pkg.dev is not supported on RWB +GOOGLE_PROJECT="broad-dsp-lrma" DOCKER_REPO="aou-lr" LABEL=$1 @@ -11,8 +13,9 @@ do DOCKER_NAME=$(basename $DIR_NAME) VERSION=$(grep '^current_version' .bumpversion.cfg | cut -d' ' -f3) - TAG_VERSION="us-central1-docker.pkg.dev/broad-dsp-lrma/$DOCKER_REPO/$DOCKER_NAME:$VERSION" - TAG_LABEL="us-central1-docker.pkg.dev/broad-dsp-lrma/$DOCKER_REPO/$DOCKER_NAME:$LABEL" + + TAG_VERSION="$DOCKER_REGISTRY/$GOOGLE_PROJECT/$DOCKER_REPO/$DOCKER_NAME:$VERSION" + TAG_LABEL="$DOCKER_REGISTRY/$GOOGLE_PROJECT/$DOCKER_REPO/$DOCKER_NAME:$LABEL" if docker manifest inspect $TAG_VERSION > /dev/null; then # Make sure the most recent version of the Docker image gets used as the cache @@ -26,4 +29,4 @@ do docker build -t $TAG_LABEL $DIR_NAME docker push $TAG_LABEL -done +done \ No newline at end of file diff --git a/wdl/pipelines/TruvariIntrasample.wdl b/wdl/pipelines/TruvariIntrasample.wdl new file mode 100644 index 0000000..462c4af --- /dev/null +++ b/wdl/pipelines/TruvariIntrasample.wdl @@ -0,0 +1,129 @@ +version 1.0 + + +# Workflow for intra-sample merging for AoU. 
+# +workflow TruvariIntrasample { + input { + String sample_id + File pbsv_vcf_gz + File pbsv_vcf_gz_tbi + File sniffles_vcf_gz + File sniffles_vcf_gz_tbi + File pav_vcf_gz + File pav_vcf_gz_tbi + File reference_fa + } + parameter_meta { + } + + call TruvariIntrasampleImpl { + input: + sample_id = sample_id, + pbsv_vcf_gz = pbsv_vcf_gz, + pbsv_vcf_gz_tbi = pbsv_vcf_gz_tbi, + sniffles_vcf_gz = sniffles_vcf_gz, + sniffles_vcf_gz_tbi = sniffles_vcf_gz_tbi, + pav_vcf_gz = pav_vcf_gz, + pav_vcf_gz_tbi = pav_vcf_gz_tbi, + reference_fa = reference_fa + } + + output { + File truvari_collapsed = TruvariIntrasampleImpl.truvari_collapsed + File truvari_collapsed_idx = TruvariIntrasampleImpl.truvari_collapsed_idx + File bcftools_merged = TruvariIntrasampleImpl.bcftools_merged + File bcftools_merged_idx = TruvariIntrasampleImpl.bcftools_merged_idx + } +} + + +# Other intermediate files created, but likely aren't useful for production are: +# - preprocessed/ directory for each caller's cleaned result +# - ~{sample_id}.removed.vcf.gz variant representations removed during +# collapsing +# +task TruvariIntrasampleImpl { + input { + String sample_id + File pbsv_vcf_gz + File pbsv_vcf_gz_tbi + File sniffles_vcf_gz + File sniffles_vcf_gz_tbi + File pav_vcf_gz + File pav_vcf_gz_tbi + File reference_fa + } + parameter_meta { + } + + Int disk_size_gb = 10*( ceil(size(pbsv_vcf_gz,"GB")) + ceil(size(sniffles_vcf_gz,"GB")) + ceil(size(pav_vcf_gz,"GB")) + ceil(size(reference_fa,"GB")) ) + 50 + String docker_dir = "/truvari_intrasample" + String work_dir = "/cromwell_root/truvari_intrasample" + + command <<< + set -euxo pipefail + mkdir -p ~{work_dir} + cd ~{work_dir} + + TIME_COMMAND="/usr/bin/time --verbose" + N_SOCKETS="$(lscpu | grep '^Socket(s):' | awk '{print $NF}')" + N_CORES_PER_SOCKET="$(lscpu | grep '^Core(s) per socket:' | awk '{print $NF}')" + N_THREADS=$(( ${N_SOCKETS} * ${N_CORES_PER_SOCKET} )) + + # Step 1 - clean up the VCFs + # - Assigns quality scores to each SV caller's 
result + # - pav 4 + # - pbsv 3 + # - sniffles 2 + # - Resolves any symbolic alts (e.g. `<DEL>` with the sequence from the + # reference) + # - symbolic variants are given quality score of 1 + # - Filters out variants greater than 100kbp (I might want to remove + # this) + # - Fills in blank genotypes with `0` + # - Filters out BND variants + # The quality scores are set based on which variant representations we + # believe are generally more accurate with higher being better. + mkdir -p preprocessed + for in_vcf in ~{pav_vcf_gz} ~{pbsv_vcf_gz} ~{sniffles_vcf_gz} + do + outname=preprocessed/$(basename $in_vcf) + python ~{docker_dir}/resolve.py ${in_vcf} ~{reference_fa} \ + | bcftools norm --check-ref s --fasta-ref ~{reference_fa} -N -m-any \ + | bcftools view -i "SVTYPE != 'BND'" -O z -o ${outname} + tabix $outname + done + + # Step 2 - merge + # Pastes the samples together in the order of the preferred genotypes. + # That is to say, this creates a three sample VCF with sample columns + # from pav_sv, pbsv, and sniffles.
+ bcftools merge --threads ${N_THREADS} --merge none --force-samples -O z \ + -o ~{sample_id}.bcftools_merged.vcf.gz \ + preprocessed/$(basename ~{pav_vcf_gz}) \ + preprocessed/$(basename ~{pbsv_vcf_gz}) \ + preprocessed/$(basename ~{sniffles_vcf_gz}) + tabix ~{sample_id}.bcftools_merged.vcf.gz + + # Step 3 - collapse + truvari collapse -i ~{sample_id}.bcftools_merged.vcf.gz -c removed.vcf.gz -k maxqual --gt --intra \ + --pctseq 0.90 --pctsize 0.90 --refdist 500 \ + | bcftools sort -O z -o ~{sample_id}.truvari_collapsed.vcf.gz + tabix ~{sample_id}.truvari_collapsed.vcf.gz + >>> + + output { + File truvari_collapsed = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz" + File truvari_collapsed_idx = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz.tbi" + File bcftools_merged = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz" + File bcftools_merged_idx = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz.tbi" + } + runtime { + docker: "fcunial/truvari_intrasample" + cpu: 1 + memory: "16GB" + disks: "local-disk " + disk_size_gb + " HDD" + preemptible: 0 + } +}