Skip to content

Commit

Permalink
Merge pull request #158 from uclahs-cds/sfitz-combine-gvcfs
Browse files Browse the repository at this point in the history
Use GVCFs for genotyping - run time/CPU hours substantially reduced (0.52)
  • Loading branch information
sorelfitzgibbon authored Jun 7, 2024
2 parents d36fc31 + e929f14 commit 3de3df6
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 114 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
---

## [Unreleased]
### Added
- Add workflow for genotyping from GVCFs
### Changed
- Standardize description
- Update GATK to 4.5.0.0

---

Expand Down
14 changes: 12 additions & 2 deletions config/F16.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process {
cpus = 1
memory = 1.GB
}
withName: run_HaplotypeCallerVCF_GATK {
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
Expand All @@ -21,7 +21,17 @@ process {
}
}
}
withName: run_HaplotypeCallerGVCF_GATK {
withName: run_CombineGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
operand = 2
}
}
}
withName: run_GenotypeGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
Expand Down
14 changes: 12 additions & 2 deletions config/F32.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process {
cpus = 1
memory = 1.GB
}
withName: run_HaplotypeCallerVCF_GATK {
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
Expand All @@ -21,7 +21,17 @@ process {
}
}
}
withName: run_HaplotypeCallerGVCF_GATK {
withName: run_CombineGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
operand = 2
}
}
}
withName: run_GenotypeGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
Expand Down
18 changes: 14 additions & 4 deletions config/F72.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process {
cpus = 1
memory = 1.GB
}
withName: run_HaplotypeCallerVCF_GATK {
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 3
memory = 7.GB
retry_strategy {
Expand All @@ -21,9 +21,19 @@ process {
}
}
}
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 3
memory = 7.GB
withName: run_CombineGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
operand = 2
}
}
}
withName: run_GenotypeGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
Expand Down
18 changes: 14 additions & 4 deletions config/M64.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process {
cpus = 1
memory = 1.GB
}
withName: run_HaplotypeCallerVCF_GATK {
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 3
memory = 7.GB
retry_strategy {
Expand All @@ -21,9 +21,19 @@ process {
}
}
}
withName: run_HaplotypeCallerGVCF_GATK {
cpus = 3
memory = 7.GB
withName: run_CombineGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
operand = 2
}
}
}
withName: run_GenotypeGVCFs_GATK {
cpus = 2
memory = 4.GB
retry_strategy {
memory {
strategy = 'exponential'
Expand Down
2 changes: 1 addition & 1 deletion config/default.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ params {

docker_container_registry = "ghcr.io/uclahs-cds"

gatk_version = "4.2.4.1"
gatk_version = "4.5.0.0"
picard_version = "2.26.10"
pipeval_version = "4.0.0-rc.2"
gatkfilter_version = "v1.0.0"
Expand Down
46 changes: 28 additions & 18 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Current Configuration:
bundle_omni_1000g_2p5_vcf_gz: ${params.bundle_omni_1000g_2p5_vcf_gz}
bundle_phase1_1000g_snps_high_conf_vcf_gz: ${params.bundle_phase1_1000g_snps_high_conf_vcf_gz}
- output:
- output:
output: ${params.output_dir}
output_dir_base: ${params.output_dir_base}
log_output_dir: ${params.log_output_dir}
Expand Down Expand Up @@ -58,9 +58,10 @@ include { extract_GenomeIntervals } from './external/pipeline-Nextflow-module/mo
]
)
include {
run_HaplotypeCallerVCF_GATK
run_HaplotypeCallerGVCF_GATK
} from './module/haplotypecaller.nf'
include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'
include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
include {
run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF
run_MergeVcfs_Picard as run_MergeVcfs_Picard_GVCF
Expand Down Expand Up @@ -147,51 +148,60 @@ workflow {
/**
* Haplotype calling
*/
input_ch_collected_files.combine(input_ch_intervals)

input_ch_samples_with_index.combine(input_ch_intervals)
.map{ it ->
[
it[0].bams,
it[0].indices,
it[0].id,
it[0].path,
it[0].index,
it[1].interval_path,
it[1].interval_id
]
}
.set{ input_ch_haplotypecallervcf }
.set{ input_ch_haplotypecallergvcf }

run_HaplotypeCallerVCF_GATK(
run_HaplotypeCallerGVCF_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
params.bundle_v0_dbsnp138_vcf_gz,
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
input_ch_haplotypecallervcf
input_ch_haplotypecallergvcf
)

input_ch_samples_with_index.combine(input_ch_intervals)
run_HaplotypeCallerGVCF_GATK.out.gvcfs
.groupTuple(by: 4) // Group by interval ID
.map{ it ->
[
it[0].id,
it[0].path,
it[0].index,
it[1].interval_path,
it[1].interval_id
it[1].flatten(), // GVCFs
it[2].flatten(), // Indices
it[3][0], // Interval path
it[4] // Interval ID
]
}
.set{ input_ch_haplotypecallergvcf }
.set { input_ch_combine_gvcfs }

run_HaplotypeCallerGVCF_GATK(
run_CombineGVCFs_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
input_ch_combine_gvcfs
)

run_GenotypeGVCFs_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
params.bundle_v0_dbsnp138_vcf_gz,
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
input_ch_haplotypecallergvcf
run_CombineGVCFs_GATK.out.combined_gvcf
)

/**
* Merge VCFs
*/
run_HaplotypeCallerVCF_GATK.out.vcfs
run_GenotypeGVCFs_GATK.out.vcfs
.reduce( ['vcfs': [], 'indices': []] ){ a, b ->
a.vcfs.add(b[0]);
a.indices.add(b[1]);
Expand Down
2 changes: 1 addition & 1 deletion metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ maintainers: "Boutros Lab Infrastructure <[email protected]
languages: ["Nextflow", "Docker"]
dependencies: ["Java", "Nextflow", "Docker"]
references: "https://uclahs-cds.atlassian.net/wiki/spaces/BOUTROSLAB/pages/3189620/Guide+to+Nextflow"
tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.2.4.1"]
tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.5.0.0"]
48 changes: 48 additions & 0 deletions module/combine-gvcfs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'

/*
Nextflow module for merging GVCFs for joint genotyping with GATK
*/
process run_CombineGVCFs_GATK {
container params.docker_image_gatk
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
mode: "copy",
enabled: params.save_intermediate_files,
pattern: '*g.vcf.gz*'
publishDir path: "${params.log_output_dir}/process-log",
pattern: ".command.*",
mode: "copy",
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }

input:
path(reference_fasta)
path(reference_fasta_fai)
path(reference_fasta_dict)
tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)

output:
path(".command.*")
tuple path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: combined_gvcf

script:
output_filename = generate_standard_filename(
"GATK-${params.gatk_version}",
params.dataset_id,
params.patient_id,
[
'additional_information': "${interval_id}.g.vcf.gz"
]
)
gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
"""
set -euo pipefail
gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
CombineGVCFs \
--reference ${reference_fasta} \
${gvcf_input_str} \
--output ${output_filename} \
--create-output-variant-index true \
--verbosity INFO
"""
}
55 changes: 55 additions & 0 deletions module/genotype-gvcfs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'

/*
Nextflow module for joint genotyping merged GVCFs with GATK
*/
process run_GenotypeGVCFs_GATK {
container params.docker_image_gatk
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
mode: "copy",
enabled: params.save_intermediate_files,
pattern: '*.vcf*'

publishDir path: "${params.log_output_dir}/process-log",
pattern: ".command.*",
mode: "copy",
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }

input:
path(reference_fasta)
path(reference_fasta_fai)
path(reference_fasta_dict)
path(dbsnp_bundle)
path(dbsnp_bundle_index)
tuple path(combined_gvcf), path(combined_gvcf_index), path(interval_path), val(interval_id)

output:
path(".command.*")
tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs

script:
output_filename = generate_standard_filename(
"GATK-${params.gatk_version}",
params.dataset_id,
params.patient_id,
[
'additional_information': "${interval_id}.vcf.gz"
]
)
interval_str = "--intervals ${interval_path}"
interval_padding = params.is_targeted ? "--interval-padding 100" : ""
"""
set -euo pipefail
gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
GenotypeGVCFs \
--variant ${combined_gvcf} \
--reference ${reference_fasta} \
--verbosity INFO \
--output ${output_filename} \
--dbsnp ${dbsnp_bundle} \
--standard-min-confidence-threshold-for-calling 50 \
${interval_str} \
${interval_padding}
"""
}
Loading

0 comments on commit 3de3df6

Please sign in to comment.