Skip to content

Commit

Permalink
Added support for giving background samples to joint calling workflow. (
Browse files Browse the repository at this point in the history
#474)

This adds in the ability to specify an `Array[Array[File]]` containing GVCFs to include as background samples in a joint call in `SRJointCallGVCFsWithGenomicsDB`. This input is optional.
  • Loading branch information
jonn-smith authored Nov 7, 2024
1 parent 8feb5a4 commit cf08477
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ workflow SRJointCallGVCFsWithGenomicsDB {
shard_max_interval_size_bp: "Maximum size of the interval on each shard. This along with the given sequence dictionary determines how many shards there will be. To shard by contig, set to a very high number. Default is 999999999."

prefix: "Prefix to use for output files."

background_sample_gvcfs: "Array of GVCFs to use as background samples for joint calling."
background_sample_gvcf_indices: "Array of GVCF index files for `background_sample_gvcfs`. Order should correspond to that in `background_sample_gvcfs`."

gcs_out_root_dir: "GCS Bucket into which to finalize outputs. If no bucket is given, outputs will not be finalized and instead will remain in their native execution location."
}

Expand Down Expand Up @@ -103,6 +107,9 @@ workflow SRJointCallGVCFsWithGenomicsDB {

String prefix

Array[Array[File]]? background_sample_gvcfs
Array[Array[File]]? background_sample_gvcf_indices

String? gcs_out_root_dir
}

Expand All @@ -112,6 +119,7 @@ workflow SRJointCallGVCFsWithGenomicsDB {
call SRJOINT.CreateSampleNameMap as CreateSampleNameMap {
input:
gvcfs = gvcfs,
background_sample_gvcfs = if defined(background_sample_gvcfs) then flatten(select_first([background_sample_gvcfs])) else [],
prefix = prefix
}

Expand Down
45 changes: 45 additions & 0 deletions wdl/tasks/VariantCalling/SRJointGenotyping.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ task CreateSampleNameMap {
Array[File] gvcfs
String prefix

Array[File] background_sample_gvcfs = []

RuntimeAttr? runtime_attr_override
}
Expand All @@ -21,6 +23,10 @@ task CreateSampleNameMap {
help: "Array of single-sample GVCF files.",
localization_optional: true
}
background_sample_gvcfs: {
help: "Array of single-sample GVCF files to use as background samples for joint calling.",
localization_optional: true
}
}

Int disk_size_gb = 20
Expand All @@ -31,6 +37,8 @@ task CreateSampleNameMap {
# Every so often we should reauthorize so `bcftools` can continue to access our data:
Int re_auth_interval = 50

String has_background_samples = if length(background_sample_gvcfs) > 0 then "true" else "false"

command <<<
set -euxo pipefail

Expand Down Expand Up @@ -72,6 +80,43 @@ task CreateSampleNameMap {
fi
done < ${gvcf_file_list}

if [[ ~{has_background_samples} == "true" ]] ; then

    echo "Adding background samples to the sample name map..."

    # Add the background sample gvcfs to the sample name map:
    background_gvcf_file_list=~{write_lines(background_sample_gvcfs)}

    # Reset our access token.
    # Split assignment from `export` so a gcloud failure is not masked under `set -e`:
    GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
    export GCS_OAUTH_TOKEN

    i=1
    # IFS= and -r preserve leading whitespace and literal backslashes in paths:
    while IFS= read -r file_path ; do

        # Get our sample list from our file:
        bcftools query -l "${file_path}" > sample_names.txt

        # Make sure we only have one sample name:
        if [[ $(wc -l < sample_names.txt) -ne 1 ]] ; then
            echo "Incorrect number of sample names found in GVCF (there can be only one!): ${file_path}"
            exit 1
        fi

        # Make sure the samplename has an actual name.
        # NOTE: use grep's exit status directly -- `[ $(grep -q ...) ]` is always
        # false because quiet grep prints nothing, so the guard would never fire:
        if grep -iq "unnamedsample" sample_names.txt ; then
            echo "Sample name found to be unnamedsample in GVCF: ${file_path}"
            exit 1
        fi

        # Add the sample name and GVCF path to the sample name file:
        echo -e "$(cat sample_names.txt)\t${file_path}" >> ~{outfile_name}

        # Add the file size to the size file:
        gsutil du -sac "${file_path}" | tail -n1 | awk '{print $1}' >> "${size_file}"

        i=$((i+1))
        if [[ $i -gt ~{re_auth_interval} ]] ; then
            # Periodically we should update the token so we don't have problems with long file lists:
            GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
            export GCS_OAUTH_TOKEN
            i=0
        fi
    done < "${background_gvcf_file_list}"
fi

# Now calculate the final file size in GB:
# We include an additional GB in case we have a very small dataset:
awk '{s += $1}END{print int(1+s/(1024*1024*1024))}' ${size_file} > ~{size_file_gb}
Expand Down

0 comments on commit cf08477

Please sign in to comment.