uclahs-cds · Faizal-Eeman · Nov 26, 2024 · Nov 26, 2024 · Dec 7, 2024 · Dec 7, 2024
@@ -9,6 +9,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 ### Added
+- Add XY filtration
 - NFTest test case
 
 ---
@@ -152,7 +153,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Update reheadering to use -c option
 - Modularize workflows for different modes (single vs. paired, WGS vs targeted)
 - Update GATK to 4.2.4.0 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q)
-- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) 
+- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q)
 
 ---
 

@@ -78,7 +78,10 @@ Take the output from Step 6 as input, and apply the model in Step 5 to recalibra
 ### 8. Filter gSNP – Filter out ambiguous variants
 Use customized Perl script to filter out ambiguous variants.
 
-### 9. Generate sha512 checksum
+### 9. Adjust chrX and chrY genotypes based on sample sex from recalibrated VCF
+Apply the XY filtration workflow to the recalibrated VCF as described [here](docs/xy_filtration_workflow.md).
+
+### 10. Generate sha512 checksum
 Generate sha512 checksum for VCFs and GVCFs.
 
 ---
@@ -115,6 +118,8 @@ For normal-only or tumor-only samples, exclude the fields for the other state.
 |:----------------|:---------|:-----|:------------|
 | `dataset_id` | Yes | string | Dataset ID |
 | `blcds_registered_dataset` | Yes | boolean | Set to true when using BLCDS folder structure; use false for now |
+| `genome_build` | Yes | string | Genome build, GRCh37 or GRCh38 |
+| `sample_sex` | Yes | string | Sample Sex, XY or XX |
 | `output_dir` | Yes | string | Need to set if `blcds_registered_dataset = false` |
 | `save_intermediate_files` | Yes | boolean | Set to false to disable publishing of intermediate files; true otherwise; disabling option will delete intermediate files to allow for processing of large BAMs |
 | `cache_intermediate_pipeline_steps` | No | boolean | Set to true to enable process caching from Nextflow; defaults to false |
@@ -126,6 +131,7 @@ For normal-only or tumor-only samples, exclude the fields for the other state.
 | `bundle_hapmap_3p3_vcf_gz` | Yes | path | Absolute path to HapMap 3.3 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/hapmap_3.3.hg38.vcf.gz` |
 | `bundle_omni_1000g_2p5_vcf_gz` | Yes | path | Absolute path to 1000 genomes OMNI 2.5 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz` |
 | `bundle_phase1_1000g_snps_high_conf_vcf_gz` | Yes | path | Absolute path to 1000 genomes phase 1 high-confidence file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz` |
+| `par_bed` | Yes | path | Absolute path to Pseudo-autosomal Region (PAR) BED |
 | `work_dir` | optional | path | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With ucla_cds, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. |
 | `docker_container_registry` | optional | string | Registry containing tool Docker images. Default: `ghcr.io/uclahs-cds` |
 | `base_resource_update` | optional | namespace | Namespace of parameters to update base resource allocations in the pipeline. Usage and structure are detailed in `template.config` and below. |
@@ -199,6 +205,10 @@ base_resource_update {
 | `<GATK>_<dataset_id>_<patient_id>_indel.vcf.gz` | Filtered INDELs with non-germline and ambiguous variants removed |
 | `<GATK>_<dataset_id>_<patient_id>_indel.vcf.gz.tbi` | Filtered germline INDELs index |
 | `<GATK>_<dataset_id>_<patient_id>_indel.vcf.gz.sha512` | Filtered germline INDELs sha512 checksum |
+| `<Hail>_<GATK>_<dataset_id>_<patient_id>_<sample_sex>_filtered.vcf.bgz` | chrX/Y filtered SNP and INDEL recalibrated variants |
+| `<Hail>_<GATK>_<dataset_id>_<patient_id>_<sample_sex>_filtered.vcf.bgz.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants checksum |
+| `<Hail>_<GATK>_<dataset_id>_<patient_id>_<sample_sex>_filtered.vcf.bgz.tbi` | chrX/Y filtered SNP and INDEL recalibrated variants index |
+| `<Hail>_<GATK>_<dataset_id>_<patient_id>_<sample_sex>_filtered.vcf.bgz.tbi.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants index checksum |
 | `report.html`, `timeline.html` and `trace.txt` | Nextflow report, timeline and trace files |
 | `*.command.*` | Process specific logging files created by nextflow |
 

@@ -111,4 +111,14 @@ process {
             }
         }
     }
+    withName: filter_XY_Hail {
+        cpus = 1
+        memory = 2.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
 }
@@ -111,4 +111,14 @@ process {
             }
         }
     }
+    withName: filter_XY_Hail {
+        cpus = 2
+        memory = 4.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
 }
@@ -111,4 +111,14 @@ process {
             }
         }
     }
+    withName: filter_XY_Hail {
+        cpus = 2
+        memory = 6.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
 }
@@ -111,4 +111,14 @@ process {
             }
         }
     }
+    withName: filter_XY_Hail {
+        cpus = 4
+        memory = 10.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
 }
@@ -20,10 +20,12 @@ params {
     picard_version = "2.26.10"
     pipeval_version = "4.0.0-rc.2"
     gatkfilter_version = "v1.0.0"
+    hail_version = "0.2.133"
     docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}"
     docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}"
     docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"
     docker_image_gatkfilter = "${-> params.docker_container_registry}/gatk:${params.gatkfilter_version}"
+    docker_image_hail = "${-> params.docker_container_registry}/hail:${params.hail_version}"
 
     emit_all_confident_sites = false
 }
@@ -36,7 +38,7 @@ process {
     cache = true
 
     executor = 'local'
-    
+
     // Other directives or options that should apply for every process
 
     // total amount of resources available to the pipeline

@@ -3,10 +3,26 @@ patient_id:
   type: 'String'
   required: true
   help: 'Patient ID'
+sample_sex:
+  type: 'String'
+  required: true
+  help: 'Sample Sex'
+  choices:
+    - "XY"
+    - "XX"
 dataset_id:
   type: 'String'
   required: true
   help: 'Dataset ID'
+genome_build:
+  type: 'String'
+  required: true
+  help: 'Genome build, GRCh37 or GRCh38'
+  default:
+    - "GRCh38"
+  choices:
+    - "GRCh37"
+    - "GRCh38"
 output_dir:
   type: 'Path'
   mode: 'w'
@@ -62,6 +78,11 @@ bundle_phase1_1000g_snps_high_conf_vcf_gz:
   mode: 'r'
   required: true
   help: 'Absolute path to high-confidence 1000g SNPs VCF'
+par_bed:
+  type: 'Path'
+  mode: 'r'
+  required: true
+  help: 'Absolute path to Pseudo-autosomal Region (PAR) BED'
 base_resource_update:
   type: 'ResourceUpdateNamespace'
   required: false

@@ -11,6 +11,11 @@ params {
     dataset_id = ''
     blcds_registered_dataset = false // if you want the output to be registered
 
+    genome_build = "GRCh38"
+
+    // Input patient sex
+    sample_sex = '' // 'XY' or 'XX'
+
     output_dir = '/path/to/output/directory'
 
     // Set to false to disable the publish rule and delete intermediate files as they're no longer needed
@@ -43,6 +48,9 @@ params {
     bundle_omni_1000g_2p5_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz"
     bundle_phase1_1000g_snps_high_conf_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
 
+    // Specify BED file path for Pseudoautosomal Region (PAR)
+    par_bed = ""
+
     // Base resource allocation updater
     // See README for adding parameters to update the base resource allocations
 }

@@ -0,0 +1,26 @@
+# Filter XY calls from a germline VCF file
+
+## Steps:
+1. Extract autosomes and chrX/Y variants from input VCF
+2. Filter chrX/Y variants
+3. Merge autosomal and filtered chrX/Y variants
+
+## chrX/Y Filter Criteria:
+- Extract chrX/Y calls
+- Extract chrX/Y calls overlapping with Pseudo-Autosomal Regions (PARs)
+- For non-PAR chrX/Y calls
+    - if `sample_sex` is `XY`:
+        - Filter out heterozygous `GT` calls in chrX and chrY
+        - Transform homozygous `GT=1/1` to hemizygous `GT=1`
+    - if `sample_sex` is `XX`:
+        - Filter out `chrY` calls
+
+## Pseudo-Autosomal Regions (PARs)
+### GRCh38
+| CHROM | START | END | PAR | REGION | REFERENCE |
+|---|---|---|---|---|---|
+| chrX | 10001 | 2781479 | PAR1 | Xp22 | ENSEMBL |
+| chrX | 91434839 | 91438584 | PAR3/XTR | Xq21.3 | PMID:23708688 |
+| chrX | 155701383 | 156030895 | PAR2 | Xq28 | ENSEMBL |
+| chrY | 10001 | 10300000 | PAR1+PAR3/XTR | Yp11 | ENSEMBL + PMID:23708688 |
+| chrY | 56887903 | 57217415 | PAR2 | Yq12 | ENSEMBL |
@@ -68,6 +68,7 @@ include {
     } from './module/merge-vcf.nf'
 include { recalibrate_variants } from './module/workflow-recalibrate-variants.nf'
 include { filter_gSNP_GATK } from './module/filter-gsnp.nf'
+include { filter_XY_Hail } from './module/filter-xy.nf'
 include { calculate_sha512 } from './module/checksum.nf'
 
 // Returns the index file for the given bam or vcf
@@ -104,6 +105,12 @@ workflow {
         }
         .set{ input_ch_collected_files }
 
+    script_dir_ch = Channel.fromPath(
+        "$projectDir/script",
+        checkIfExists: true
+        )
+        .collect()
+
     /**
     *   Input validation
     */
@@ -248,13 +255,28 @@ workflow {
         recalibrate_variants.out.output_ch_recalibrated_variants
     )
 
+    // Keep only the first three elements of each recalibrated-variants item for the XY filter
+    filter_xy_ch = recalibrate_variants.out.output_ch_recalibrated_variants
+        .map { it -> [it[0], it[1], it[2]] }
+
+    // script_dir_ch is already created from "$projectDir/script" earlier in this workflow;
+    // it is reused here rather than being rebuilt with identical arguments.
+    filter_XY_Hail(
+        filter_xy_ch,
+        params.par_bed,
+        script_dir_ch
+        )
     /**
     *   Calculate checksums for output files
     */
     run_MergeVcfs_Picard_VCF.out.merged_vcf
         .mix(run_MergeVcfs_Picard_GVCF.out.merged_vcf)
         .mix(recalibrate_variants.out.output_ch_recalibrated_variants)
         .map{ [it[1], it[2]] }
+        .mix(filter_XY_Hail.out.xy_filtered_vqsr)
         .mix(filter_gSNP_GATK.out.germline_filtered)
         .flatten()
         .set{ input_ch_calculate_checksum }

@@ -0,0 +1,63 @@
+include { generate_standard_filename; sanitize_string } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
+
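+/*
+    Nextflow module for filtering chrX and chrY variant calls based on sample sex
+
+    input:
+        sample_id: identifier for sample
+        recalibrated_vcf: path to recalibrated VCF to filter
+        recalibrated_vcf_tbi: path to index of recalibrated VCF to filter
+        par_bed: path to Pseudo-autosomal Region (PAR) BED
+        script_dir: path to directory containing filter_xy_call.py
+
+    params:
+        params.output_dir_base: string(path)
+        params.log_output_dir: string(path)
+        params.docker_image_hail: string
+        params.hail_version: string
+        params.gatk_version: string
+        params.dataset_id: string
+        params.sample_sex: string
+        params.genome_build: string
+        params.par_bed: string(path)
+*/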
+
+process filter_XY_Hail {
+    // Run the task in the image named by params.docker_image_hail
+    container params.docker_image_hail
+
+    // Copy every task file matching '*.vcf.bgz*' into <output_dir_base>/output
+    publishDir path: "${params.output_dir_base}/output",
+      mode: "copy",
+      pattern: '*.vcf.bgz*'
+
+    // Copy the .command.* files into <log_output_dir>/process-log, placing each one under
+    // "<process name with ':' replaced by '/'>-<sample_id>/" with "log" prepended to its file name
+    publishDir path: "${params.log_output_dir}/process-log",
+      pattern: ".command.*",
+      mode: "copy",
+      saveAs: {
+        "${task.process.replace(':', '/')}-${sample_id}/log${file(it).getName()}"
+        }
+
+    input:
+    // Sample identifier together with the recalibrated VCF and its index
+    tuple val(sample_id), path(recalibrated_vcf), path(recalibrated_vcf_tbi)
+    // PAR BED file, forwarded to the script as --par_bed
+    path(par_bed)
+    // Directory from which filter_xy_call.py is invoked
+    path(script_dir)
+
+    output:
+    path(".command.*")
+    // NOTE(review): the "_XY_filtered" suffix is a fixed literal here, while the README lists
+    // "<sample_sex>_filtered" output names — confirm which name filter_xy_call.py actually writes
+    tuple path("${output_filename}_XY_filtered.vcf.bgz"), path("${output_filename}_XY_filtered.vcf.bgz.tbi"), emit: xy_filtered_vqsr
+
+    script:
+    // output_filename is assigned here and is also referenced by the output block above.
+    // It is built from the Hail version, dataset ID and sample ID, with GATK passed as an additional tool.
+    output_filename = generate_standard_filename(
+        "Hail-${params.hail_version}",
+        params.dataset_id,
+        sample_id,
+        [additional_tools:["GATK-${params.gatk_version}"]]
+        )
+    // The script saves the "##source=" header lines of the input VCF to vcf_source.txt and then
+    // runs filter_xy_call.py with the sample sex, PAR BED and genome build.
+    // NOTE(review): under `set -euo pipefail`, zgrep presumably exits non-zero when no
+    // "##source=" line is present, which would fail the task — confirm all inputs carry that header
+    """
+    set -euo pipefail
+
+    zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt
+
+    python ${script_dir}/filter_xy_call.py \
+        --sample_name ${output_filename} \
+        --input_vcf ${recalibrated_vcf} \
+        --vcf_source_file ./vcf_source.txt \
+        --sample_sex ${params.sample_sex} \
+        --par_bed ${par_bed} \
+        --genome_build ${params.genome_build} \
+        --output_dir .
+    """
+}