-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1acffcc
commit 1fc3379
Showing
6 changed files
with
331 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
FROM continuumio/miniconda3 | ||
MAINTAINER Fabio Cunial | ||
ARG work_dir=/truvari_intrasample | ||
WORKDIR ${work_dir} | ||
|
||
|
||
# --------------------------------- Versions ----------------------------------- | ||
ARG gcloud_version=429.0.0 | ||
ARG htslib_version=1.18 | ||
ARG samtools_version=1.18 | ||
ARG bcftools_version=1.18 | ||
# ------------------------------------------------------------------------------ | ||
|
||
|
||
# OS AND BASIC UTILITIES | ||
RUN apt-get -qqy update --fix-missing \ | ||
&& apt-get -qqy dist-upgrade \ | ||
&& apt-get install -y --no-install-recommends \ | ||
zlib1g-dev \ | ||
liblzma-dev \ | ||
libbz2-dev \ | ||
libdeflate-dev \ | ||
libreadline-dev \ | ||
libsqlite3-dev \ | ||
libssl-dev \ | ||
libcurl4-openssl-dev \ | ||
libncurses5-dev \ | ||
libncursesw5-dev \ | ||
libffi-dev \ | ||
liblzma-dev \ | ||
libopenblas-dev \ | ||
apt-transport-https \ | ||
gawk \ | ||
ca-certificates \ | ||
tree \ | ||
gnupg \ | ||
ssh \ | ||
time \ | ||
curl \ | ||
wget \ | ||
autotools-dev \ | ||
autoconf \ | ||
automake \ | ||
make \ | ||
cmake \ | ||
gcc \ | ||
g++ \ | ||
gfortran \ | ||
build-essential \ | ||
git \ | ||
bc \ | ||
python3-pip \ | ||
xz-utils \ | ||
tk-dev \ | ||
python-dev \ | ||
bsdmainutils \ | ||
unzip \ | ||
python3-pycurl | ||
|
||
# GSUTIL | ||
RUN pip3 uninstall -y crcmod && pip3 install --no-cache-dir -U crcmod | ||
RUN wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ | ||
&& tar -xf google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ | ||
&& rm -f google-cloud-cli-${gcloud_version}-linux-x86_64.tar.gz \ | ||
&& yes | ./google-cloud-sdk/install.sh | ||
ENV PATH=${work_dir}/google-cloud-sdk/bin:${PATH} | ||
|
||
# HTSLIB | ||
RUN wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 \ | ||
&& tar xjf htslib-${htslib_version}.tar.bz2 \ | ||
&& rm htslib-${htslib_version}.tar.bz2 \ | ||
&& cd htslib-${htslib_version} \ | ||
&& ./configure \ | ||
&& make \ | ||
&& make install \ | ||
&& cd ${work_dir} \ | ||
&& rm -rf htslib-${htslib_version} \ | ||
&& bgzip --help | ||
|
||
# SAMTOOLS | ||
RUN wget https://github.com/samtools/samtools/releases/download/${samtools_version}/samtools-${samtools_version}.tar.bz2 \ | ||
&& tar xjf samtools-${samtools_version}.tar.bz2 \ | ||
&& rm samtools-${samtools_version}.tar.bz2 \ | ||
&& cd samtools-${samtools_version} \ | ||
&& ./configure --without-curses \ | ||
&& make \ | ||
&& make install \ | ||
&& cd ${work_dir} \ | ||
&& rm -rf samtools-${samtools_version} \ | ||
&& samtools --help | ||
|
||
# BCFTOOLS | ||
RUN wget https://github.com/samtools/bcftools/releases/download/${bcftools_version}/bcftools-${bcftools_version}.tar.bz2 \ | ||
&& tar xjf bcftools-${bcftools_version}.tar.bz2 \ | ||
&& rm bcftools-${bcftools_version}.tar.bz2 \ | ||
&& cd bcftools-${bcftools_version} \ | ||
&& ./configure --without-curses \ | ||
&& make \ | ||
&& make install \ | ||
&& cd ${work_dir} \ | ||
&& rm -rf bcftools-${bcftools_version} \ | ||
&& bcftools --help | ||
|
||
# TRUVARI | ||
RUN pip3 install git+https://github.com/acenglish/[email protected] \ | ||
&& truvari --help | ||
COPY ./resolve.py ${work_dir} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash | ||
# | ||
docker build --progress=plain -t fcunial/truvari_intrasample . | ||
docker push fcunial/truvari_intrasample |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
Given a VCF, fill in the <DEL>, <INV>, <DUP> ALT sequence | ||
DEL and INV are easy. But I gotta check the DUPs' coordinates. | ||
They'll have a span, but can I just insert that sequence? | ||
Also, do I have to worry about dup-to-ins? | ||
""" | ||
|
||
import sys | ||
import pysam | ||
import truvari | ||
|
||
REF_CLEAN = True # Set to false if you're working with the right reference | ||
MAX_SV = 100_000 # Filter things smaller than this | ||
|
||
RC = str.maketrans("ATCG", "TAGC") | ||
def do_rc(s): | ||
""" | ||
Reverse complement a sequence | ||
""" | ||
return s.translate(RC)[::-1] | ||
|
||
def resolve(entry, ref): | ||
""" | ||
""" | ||
if entry.start > ref.get_reference_length(entry.chrom): | ||
return None | ||
|
||
seq = ref.fetch(entry.chrom, entry.start, entry.stop) | ||
if entry.alts[0] == '<DEL>': | ||
entry.ref = seq | ||
entry.alts = [seq[0]] | ||
elif entry.alts[0] == '<INV>': | ||
entry.ref = seq | ||
entry.alts = [do_rc(seq)] | ||
elif entry.alts[0] == '<DUP>': | ||
entry.info['SVTYPE'] = 'INS' | ||
entry.ref = seq[0] | ||
entry.alts = [seq] | ||
entry.stop = entry.start + 1 | ||
entry.qual = 1 | ||
return entry | ||
|
||
if __name__ == '__main__': | ||
default_quals = {"pbsv": 3, | ||
"sniffles": 2, | ||
"pav": 4} | ||
|
||
vcf = pysam.VariantFile(sys.argv[1]) | ||
ref = pysam.FastaFile(sys.argv[2]) | ||
n_header = vcf.header.copy() | ||
d_qual = 1 | ||
for key, val in default_quals.items(): | ||
if key in sys.argv[1]: | ||
d_qual = default_quals[key] | ||
if REF_CLEAN: | ||
for ctg in vcf.header.contigs.keys(): | ||
if ctg not in ref.references: | ||
n_header.contigs[ctg].remove_header() | ||
|
||
out = pysam.VariantFile("/dev/stdout", 'w', header=n_header) | ||
for entry in vcf: | ||
if REF_CLEAN and entry.chrom not in ref.references: | ||
continue | ||
|
||
if truvari.entry_size(entry) >= MAX_SV: | ||
continue | ||
|
||
entry.qual = d_qual | ||
if entry.alts[0].startswith("<"): | ||
entry = resolve(entry, ref) | ||
|
||
if entry is None or set(entry.alts[0]) == {'N'}: | ||
continue | ||
# No more blank genotypes | ||
n_gt = tuple([_ if _ is not None else 0 for _ in entry.samples[0]['GT']]) | ||
entry.samples[0]['GT'] = n_gt | ||
entry.translate(n_header) | ||
try: | ||
out.write(entry) | ||
except Exception: | ||
sys.stderr.write(f"{entry}\n{type(entry)}\n") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
version 1.0 | ||
|
||
|
||
# Workflow for intra-sample merging for AoU. | ||
# | ||
workflow TruvariIntrasample { | ||
input { | ||
String sample_id | ||
File pbsv_vcf_gz | ||
File pbsv_vcf_gz_tbi | ||
File sniffles_vcf_gz | ||
File sniffles_vcf_gz_tbi | ||
File pav_vcf_gz | ||
File pav_vcf_gz_tbi | ||
File reference_fa | ||
} | ||
parameter_meta { | ||
} | ||
|
||
call TruvariIntrasampleImpl { | ||
input: | ||
sample_id = sample_id, | ||
pbsv_vcf_gz = pbsv_vcf_gz, | ||
pbsv_vcf_gz_tbi = pbsv_vcf_gz_tbi, | ||
sniffles_vcf_gz = sniffles_vcf_gz, | ||
sniffles_vcf_gz_tbi = sniffles_vcf_gz_tbi, | ||
pav_vcf_gz = pav_vcf_gz, | ||
pav_vcf_gz_tbi = pav_vcf_gz_tbi, | ||
reference_fa = reference_fa | ||
} | ||
|
||
output { | ||
File truvari_collapsed = TruvariIntrasampleImpl.truvari_collapsed | ||
File truvari_collapsed_idx = TruvariIntrasampleImpl.truvari_collapsed_idx | ||
File bcftools_merged = TruvariIntrasampleImpl.bcftools_merged | ||
File bcftools_merged_idx = TruvariIntrasampleImpl.bcftools_merged_idx | ||
} | ||
} | ||
|
||
|
||
# Other intermediate files created, but likely aren't useful for production are: | ||
# - preprocessed/ directory for each caller's cleaned result | ||
# - ~{sample_id}.removed.vcf.gz variant representations removed during | ||
# collapsing | ||
# | ||
task TruvariIntrasampleImpl { | ||
input { | ||
String sample_id | ||
File pbsv_vcf_gz | ||
File pbsv_vcf_gz_tbi | ||
File sniffles_vcf_gz | ||
File sniffles_vcf_gz_tbi | ||
File pav_vcf_gz | ||
File pav_vcf_gz_tbi | ||
File reference_fa | ||
} | ||
parameter_meta { | ||
} | ||
|
||
Int disk_size_gb = 10*( ceil(size(pbsv_vcf_gz,"GB")) + ceil(size(sniffles_vcf_gz,"GB")) + ceil(size(pav_vcf_gz,"GB")) + ceil(size(reference_fa,"GB")) ) + 50 | ||
String docker_dir = "/truvari_intrasample" | ||
String work_dir = "/cromwell_root/truvari_intrasample" | ||
|
||
command <<< | ||
set -euxo pipefail | ||
mkdir -p ~{work_dir} | ||
cd ~{work_dir} | ||
|
||
TIME_COMMAND="/usr/bin/time --verbose" | ||
N_SOCKETS="$(lscpu | grep '^Socket(s):' | awk '{print $NF}')" | ||
N_CORES_PER_SOCKET="$(lscpu | grep '^Core(s) per socket:' | awk '{print $NF}')" | ||
N_THREADS=$(( ${N_SOCKETS} * ${N_CORES_PER_SOCKET} )) | ||
|
||
# Step 1 - clean up the VCFs | ||
# - Assigns quality scores to each SV caller's result | ||
# - pav 4 | ||
# - pbsv 3 | ||
# - sniffles 2 | ||
# - Resolves any symbolic alts (e.g. `<DEL>` with the sequence from the | ||
# reference) | ||
# - symbolic variants are given quality score of 1 | ||
# - Filters out variants greater than 100kbp (I might want to remove | ||
# this) | ||
# - Fills in blank genotypes with `0` | ||
# - Filters out BND variants | ||
# The quality scores are set based on which variant representations we | ||
# believe are generally more accurate with higher being better. | ||
mkdir -p preprocessed | ||
for in_vcf in ~{pav_vcf_gz} ~{pbsv_vcf_gz} ~{sniffles_vcf_gz} | ||
do | ||
outname=preprocessed/$(basename $in_vcf) | ||
python ~{docker_dir}/resolve.py ${in_vcf} ~{reference_fa} \ | ||
| bcftools norm --check-ref s --fasta-ref ~{reference_fa} -N -m-any \ | ||
| bcftools view -i "SVTYPE != 'BND'" -O z -o ${outname} | ||
tabix $outname | ||
done | ||
|
||
# Step 2 - merge | ||
# Pastes the samples together in the order of the preferred genotypes. | ||
# That is to say, this creates a three sample VCF with sample columns | ||
# from pav_sv, pbsv, and sniffles. | ||
bcftools merge --threads ${N_THREADS} --merge none --force-samples -O z \ | ||
-o ~{sample_id}.bcftools_merged.vcf.gz \ | ||
preprocessed/$(basename ~{pav_vcf_gz}) \ | ||
preprocessed/$(basename ~{pbsv_vcf_gz}) \ | ||
preprocessed/$(basename ~{sniffles_vcf_gz}) | ||
tabix ~{sample_id}.bcftools_merged.vcf.gz | ||
|
||
# Step 3 - collapse | ||
truvari collapse -i ~{sample_id}.bcftools_merged.vcf.gz -c removed.vcf.gz -k maxqual --gt --intra \ | ||
--pctseq 0.90 --pctsize 0.90 --refdist 500 \ | ||
| bcftools sort -O z -o ~{sample_id}.truvari_collapsed.vcf.gz | ||
tabix ~{sample_id}.truvari_collapsed.vcf.gz | ||
>>> | ||
|
||
output { | ||
File truvari_collapsed = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz" | ||
File truvari_collapsed_idx = "~{work_dir}/~{sample_id}.truvari_collapsed.vcf.gz.tbi" | ||
File bcftools_merged = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz" | ||
File bcftools_merged_idx = "~{work_dir}/~{sample_id}.bcftools_merged.vcf.gz.tbi" | ||
} | ||
runtime { | ||
docker: "fcunial/truvari_intrasample" | ||
cpu: 1 | ||
memory: "16GB" | ||
disks: "local-disk " + disk_size_gb + " HDD" | ||
preemptible: 0 | ||
} | ||
} |