Skip to content

Commit

Permalink
Added support for giving background samples to joint calling workflow. (
Browse files Browse the repository at this point in the history
#474)

This adds in the ability to specify an `Array[Array[File]]` containing GVCFs to include as background samples in a joint call in `SRJointCallGVCFsWithGenomicsDB`. This input is optional.
  • Loading branch information
jonn-smith authored Nov 7, 2024
1 parent 8feb5a4 commit cf08477
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ workflow SRJointCallGVCFsWithGenomicsDB {
shard_max_interval_size_bp: "Maximum size of the interval on each shard. This along with the given sequence dictionary determines how many shards there will be. To shard by contig, set to a very high number. Default is 999999999."

prefix: "Prefix to use for output files."

background_sample_gvcfs: "Array of GVCFs to use as background samples for joint calling."
background_sample_gvcf_indices: "Array of GVCF index files for `background_sample_gvcfs`. Order should correspond to that in `background_sample_gvcfs`."

gcs_out_root_dir: "GCS Bucket into which to finalize outputs. If no bucket is given, outputs will not be finalized and instead will remain in their native execution location."
}

Expand Down Expand Up @@ -103,6 +107,9 @@ workflow SRJointCallGVCFsWithGenomicsDB {

String prefix

Array[Array[File]]? background_sample_gvcfs
Array[Array[File]]? background_sample_gvcf_indices

String? gcs_out_root_dir
}

Expand All @@ -112,6 +119,7 @@ workflow SRJointCallGVCFsWithGenomicsDB {
call SRJOINT.CreateSampleNameMap as CreateSampleNameMap {
input:
gvcfs = gvcfs,
background_sample_gvcfs = if defined(background_sample_gvcfs) then flatten(select_first([background_sample_gvcfs])) else [],
prefix = prefix
}

Expand Down
45 changes: 45 additions & 0 deletions wdl/tasks/VariantCalling/SRJointGenotyping.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ task CreateSampleNameMap {
Array[File] gvcfs
String prefix

Array[File] background_sample_gvcfs = []

RuntimeAttr? runtime_attr_override
}
Expand All @@ -21,6 +23,10 @@ task CreateSampleNameMap {
help: "Array of single-sample GVCF files.",
localization_optional: true
}
background_sample_gvcfs: {
help: "Array of single-sample GVCF files to use as background samples for joint calling.",
localization_optional: true
}
}

Int disk_size_gb = 20
Expand All @@ -31,6 +37,8 @@ task CreateSampleNameMap {
# Every so often we should reauthorize so `bcftools` can continue to access our data:
Int re_auth_interval = 50

String has_background_samples = if length(background_sample_gvcfs) > 0 then "true" else "false"

command <<<
set -euxo pipefail

Expand Down Expand Up @@ -72,6 +80,43 @@ task CreateSampleNameMap {
fi
done < ${gvcf_file_list}

if [[ ~{has_background_samples} == "true" ]] ; then

    echo "Adding background samples to the sample name map..."

    # Add the background sample gvcfs to the sample name map:
    background_gvcf_file_list=~{write_lines(background_sample_gvcfs)}

    # Reset our access token.
    # Split assignment from `export` so a gcloud failure is not masked under `set -e`:
    GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
    export GCS_OAUTH_TOKEN

    i=1
    # IFS= and -r preserve leading whitespace and literal backslashes in paths:
    while IFS= read -r file_path ; do

        # Get our sample list from our file:
        bcftools query -l "${file_path}" > sample_names.txt

        # Make sure we only have one sample name:
        if [[ $(wc -l < sample_names.txt) -ne 1 ]] ; then
            echo "Incorrect number of sample names found in GVCF (there can be only one!): ${file_path}"
            exit 1
        fi

        # Make sure the samplename has an actual name.
        # NOTE: use grep's exit status directly -- `[ $(grep -q ...) ]` is always
        # false because quiet grep prints nothing, so the guard would never fire:
        if grep -iq "unnamedsample" sample_names.txt ; then
            echo "Sample name found to be unnamedsample in GVCF: ${file_path}"
            exit 1
        fi

        # Add the sample name and GVCF path to the sample name file:
        echo -e "$(cat sample_names.txt)\t${file_path}" >> ~{outfile_name}

        # Add the file size to the size file:
        gsutil du -sac "${file_path}" | tail -n1 | awk '{print $1}' >> "${size_file}"

        i=$((i+1))
        if [[ $i -gt ~{re_auth_interval} ]] ; then
            # Periodically we should update the token so we don't have problems with long file lists:
            GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
            export GCS_OAUTH_TOKEN
            i=0
        fi
    done < "${background_gvcf_file_list}"
fi

# Now calculate the final file size in GB:
# We include an additional GB in case we have a very small dataset:
awk '{s += $1}END{print int(1+s/(1024*1024*1024))}' ${size_file} > ~{size_file_gb}
Expand Down

0 comments on commit cf08477

Please sign in to comment.