From 1a6f75d16bd11a59fbb2e714dd343dddb4674798 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Fri, 29 Mar 2024 12:46:05 -0700 Subject: [PATCH 01/15] use F72 config for F16 --- config/F16.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/F16.config b/config/F16.config index bca8986..406d896 100644 --- a/config/F16.config +++ b/config/F16.config @@ -12,8 +12,8 @@ process { memory = 1.GB } withName: run_HaplotypeCallerVCF_GATK { - cpus = 2 - memory = 4.GB + cpus = 3 + memory = 7.GB retry_strategy { memory { strategy = 'exponential' @@ -22,8 +22,8 @@ process { } } withName: run_HaplotypeCallerGVCF_GATK { - cpus = 2 - memory = 4.GB + cpus = 3 + memory = 7.GB retry_strategy { memory { strategy = 'exponential' From 2c7b3eb298a5c37f7a467b041e9595610ae9bf16 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Sat, 30 Mar 2024 16:57:48 -0700 Subject: [PATCH 02/15] add genotype from gvcfs --- main.nf | 85 +++++++++++++++++++++++++------------ module/genomicsdb-import.nf | 48 +++++++++++++++++++++ module/genotype-gvcfs.nf | 55 ++++++++++++++++++++++++ module/haplotypecaller.nf | 6 +-- 4 files changed, 163 insertions(+), 31 deletions(-) create mode 100644 module/genomicsdb-import.nf create mode 100644 module/genotype-gvcfs.nf diff --git a/main.nf b/main.nf index 2fd315a..966ad70 100644 --- a/main.nf +++ b/main.nf @@ -25,7 +25,7 @@ Current Configuration: bundle_omni_1000g_2p5_vcf_gz: ${params.bundle_omni_1000g_2p5_vcf_gz} bundle_phase1_1000g_snps_high_conf_vcf_gz: ${params.bundle_phase1_1000g_snps_high_conf_vcf_gz} - - output: + - output: output: ${params.output_dir} output_dir_base: ${params.output_dir_base} log_output_dir: ${params.log_output_dir} @@ -61,6 +61,8 @@ include { run_HaplotypeCallerVCF_GATK run_HaplotypeCallerGVCF_GATK } from './module/haplotypecaller.nf' +include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf' +include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf' include { run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF run_MergeVcfs_Picard as run_MergeVcfs_Picard_GVCF @@ -106,13 +108,13 @@ workflow { /** * Input validation */ - run_validate_PipeVal(input_ch_validate) - - run_validate_PipeVal.out.validation_result - .collectFile( - name: 'input_validation.txt', - storeDir: "${params.output_dir_base}/validation" - ) +// run_validate_PipeVal(input_ch_validate) +// +// run_validate_PipeVal.out.validation_result +// .collectFile( +// name: 'input_validation.txt', +// storeDir: "${params.output_dir_base}/validation" +// ) /** * Handle interval splitting based on targeted or WGS mode @@ -147,25 +149,25 @@ workflow { /** * Haplotype calling */ - input_ch_collected_files.combine(input_ch_intervals) - .map{ it -> - [ - it[0].bams, - it[0].indices, - it[1].interval_path, - it[1].interval_id - ] - } - .set{ input_ch_haplotypecallervcf } - - run_HaplotypeCallerVCF_GATK( - params.reference_fasta, - "${params.reference_fasta}.fai", - "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", - params.bundle_v0_dbsnp138_vcf_gz, - "${params.bundle_v0_dbsnp138_vcf_gz}.tbi", - input_ch_haplotypecallervcf - ) +// input_ch_collected_files.combine(input_ch_intervals) +// .map{ it -> +// [ +// it[0].bams, +// it[0].indices, +// it[1].interval_path, +// it[1].interval_id +// ] +// } +// .set{ input_ch_haplotypecallervcf } + +// run_HaplotypeCallerVCF_GATK( +// params.reference_fasta, +// "${params.reference_fasta}.fai", +// "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", +// params.bundle_v0_dbsnp138_vcf_gz, +// "${params.bundle_v0_dbsnp138_vcf_gz}.tbi", +// input_ch_haplotypecallervcf +// ) input_ch_samples_with_index.combine(input_ch_intervals) .map{ it -> @@ -188,10 +190,37 @@ workflow { input_ch_haplotypecallergvcf ) +run_HaplotypeCallerGVCF_GATK.out.gvcfs +// [sample, gvcf, index, interval_path, interval_id] + .groupTuple(by: 4) // Group by interval_path +// [interval_id, [[sample1, sample2, ...], [gvcf1, gvcf2, ...], [index1, index2, ...], interval_path, ]] + .map{ it -> + [ + it[1].flatten(), // GVCFs + it[2].flatten(), // Indices + it[3][0], // Interval path + it[4] // Interval ID + ] + } + .set { input_ch_genomicsdb } + + run_GenomicsDBImport_GATK( + input_ch_genomicsdb + ) + + run_GenotypeGVCFs_GATK( + params.reference_fasta, + "${params.reference_fasta}.fai", + "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", + params.bundle_v0_dbsnp138_vcf_gz, + "${params.bundle_v0_dbsnp138_vcf_gz}.tbi", + run_GenomicsDBImport_GATK.out.genomicsdb + ) + /** * Merge VCFs */ - run_HaplotypeCallerVCF_GATK.out.vcfs + run_GenotypeGVCFs_GATK.out.vcfs .reduce( ['vcfs': [], 'indices': []] ){ a, b -> a.vcfs.add(b[0]); a.indices.add(b[1]); diff --git a/module/genomicsdb-import.nf b/module/genomicsdb-import.nf new file mode 100644 index 0000000..48ec110 --- /dev/null +++ b/module/genomicsdb-import.nf @@ -0,0 +1,48 @@ +include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +/* + Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK +*/ +process run_GenomicsDBImport_GATK { + container params.docker_image_gatk + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + mode: "copy", + enabled: params.save_intermediate_files, + pattern: '*genomicsdb' + + publishDir path: "${params.log_output_dir}/process-log", + pattern: ".command.*", + mode: "copy", + saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } + + input: + tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id) + + output: + path(".command.*") + tuple path(output_filename), path(interval_path), val(interval_id), emit: genomicsdb + + script: + output_filename = generate_standard_filename( + "GATK-${params.gatk_version}", + params.dataset_id, + params.patient_id, + [ + 'additional_information': "${interval_id}.genomicsdb" + ] + ) + gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ') + interval_str = "--intervals ${interval_path}" + interval_padding = params.is_targeted ? "--interval-padding 100" : "" + """ + set -euo pipefail + + gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \ + GenomicsDBImport \ + ${gvcf_input_str} \ + --genomicsdb-workspace-path ${output_filename} \ + --verbosity INFO \ + ${interval_str} \ + ${interval_padding} + """ +} diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf new file mode 100644 index 0000000..8652014 --- /dev/null +++ b/module/genotype-gvcfs.nf @@ -0,0 +1,55 @@ +include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +/* + Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK +*/ +process run_GenotypeGVCFs_GATK { + container params.docker_image_gatk + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + mode: "copy", + enabled: params.save_intermediate_files, + pattern: '*.vcf*' + + publishDir path: "${params.log_output_dir}/process-log", + pattern: ".command.*", + mode: "copy", + saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } + + input: + path(reference_fasta) + path(reference_fasta_fai) + path(reference_fasta_dict) + path(dbsnp_bundle) + path(dbsnp_bundle_index) + tuple path(genomicsdb), path(interval), val(interval_id) + + output: + path(".command.*") + tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs + + script: + output_filename = generate_standard_filename( + "GATK-${params.gatk_version}", + params.dataset_id, + params.patient_id, + [ + 'additional_information': "${interval_id}.vcf.gz" + ] + ) + interval_str = "--intervals ${interval}" + interval_padding = params.is_targeted ? "--interval-padding 100" : "" + """ + set -euo pipefail + + gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \ + GenotypeGVCFs \ + --variant gendb://${genomicsdb} \ + --reference ${reference_fasta} \ + --verbosity INFO \ + --output ${output_filename} \ + --dbsnp ${dbsnp_bundle} \ + --standard-min-confidence-threshold-for-calling 50 \ + ${interval_str} \ + ${interval_padding} + """ +} diff --git a/module/haplotypecaller.nf b/module/haplotypecaller.nf index e6cb95f..a4ac95b 100644 --- a/module/haplotypecaller.nf +++ b/module/haplotypecaller.nf @@ -120,12 +120,12 @@ process run_HaplotypeCallerGVCF_GATK { path(reference_fasta_dict) path(dbsnp_bundle) path(dbsnp_bundle_index) - tuple val(sample_id), path(bam), path(bam_index), path(interval), val(interval_id) + tuple val(sample_id), path(bam), path(bam_index), path(interval_path), val(interval_id) output: path(".command.*") - tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), emit: gvcfs + tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: gvcfs script: output_filename = generate_standard_filename( @@ -136,7 +136,7 @@ process run_HaplotypeCallerGVCF_GATK { 'additional_information': "${interval_id}_raw_variants.g.vcf.gz" ] ) - interval_str = "--intervals ${interval}" + interval_str = "--intervals ${interval_path}" interval_padding = params.is_targeted ? "--interval-padding 100" : "" output_mode = params.emit_all_confident_sites ? "EMIT_ALL_CONFIDENT_SITES" : "EMIT_VARIANTS_ONLY" """ From 255beb4b68a0b347ffe4f53a5c6e8b201d11ce52 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Sat, 30 Mar 2024 16:58:26 -0700 Subject: [PATCH 03/15] tmp F16 changed to F72 values --- config/F16.config | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/config/F16.config b/config/F16.config index 406d896..088d7bf 100644 --- a/config/F16.config +++ b/config/F16.config @@ -31,6 +31,26 @@ process { } } } + withName: run_GenomicsDBImport_GATK { + cpus = 3 + memory = 7.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } + withName: run_GenotypeGVCFs_GATK { + cpus = 3 + memory = 7.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } withName: run_MergeVcfs_Picard_VCF { cpus = 1 memory = 15.GB From 460fcbdadf297c632e53ea6ec89cea569fcf0e8e Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Sat, 30 Mar 2024 21:53:30 -0700 Subject: [PATCH 04/15] add combine and genotype gvcfs --- main.nf | 12 ++++++---- module/combine-gvcfs.nf | 52 ++++++++++++++++++++++++++++++++++++++++ module/genotype-gvcfs.nf | 6 ++--- 3 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 module/combine-gvcfs.nf diff --git a/main.nf b/main.nf index 966ad70..cb2be93 100644 --- a/main.nf +++ b/main.nf @@ -62,6 +62,7 @@ include { run_HaplotypeCallerGVCF_GATK } from './module/haplotypecaller.nf' include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf' +include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf' include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf' include { run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF @@ -202,10 +203,13 @@ run_HaplotypeCallerGVCF_GATK.out.gvcfs it[4] // Interval ID ] } - .set { input_ch_genomicsdb } + .set { input_ch_combine_gvcfs } - run_GenomicsDBImport_GATK( - input_ch_genomicsdb + run_CombineGVCFs_GATK( + params.reference_fasta, + "${params.reference_fasta}.fai", + "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", + input_ch_combine_gvcfs ) run_GenotypeGVCFs_GATK( @@ -214,7 +218,7 @@ run_HaplotypeCallerGVCF_GATK.out.gvcfs "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", params.bundle_v0_dbsnp138_vcf_gz, "${params.bundle_v0_dbsnp138_vcf_gz}.tbi", - run_GenomicsDBImport_GATK.out.genomicsdb + run_CombineGVCFs_GATK.out.combined_gvcf ) /** diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf new file mode 100644 index 0000000..65b1ba2 --- /dev/null +++ b/module/combine-gvcfs.nf @@ -0,0 +1,52 @@ +include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +/* + Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK +*/ +process run_CombineGVCFs_GATK { + container params.docker_image_gatk + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + mode: "copy", + enabled: params.save_intermediate_files, + pattern: '*g.gvcf.gz*' + publishDir path: "${params.log_output_dir}/process-log", + pattern: ".command.*", + mode: "copy", + saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } + + input: + path(reference_fasta) + path(reference_fasta_fai) + path(reference_fasta_dict) + tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id) + + output: + path(".command.*") + tuple path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: combined_gvcf + + script: + output_filename = generate_standard_filename( + "GATK-${params.gatk_version}", + params.dataset_id, + params.patient_id, + [ + 'additional_information': "${interval_id}.g.vcf.gz" + ] + ) + gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ') + interval_str = "--intervals ${interval_path}" + interval_padding = params.is_targeted ? "--interval-padding 100" : "" + """ + set -euo pipefail + + gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \ + CombineGVCFs \ + --reference ${reference_fasta} \ + ${gvcf_input_str} \ + --output ${output_filename} \ + --create-output-variant-index true \ + --verbosity INFO \ + ${interval_str} \ + ${interval_padding} + """ +} diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf index 8652014..9586336 100644 --- a/module/genotype-gvcfs.nf +++ b/module/genotype-gvcfs.nf @@ -21,7 +21,7 @@ process run_GenotypeGVCFs_GATK { path(reference_fasta_dict) path(dbsnp_bundle) path(dbsnp_bundle_index) - tuple path(genomicsdb), path(interval), val(interval_id) + tuple path(combined_gvcf), path(combined_gvcf_index), path(interval_path), val(interval_id) output: path(".command.*") @@ -36,14 +36,14 @@ process run_GenotypeGVCFs_GATK { 'additional_information': "${interval_id}.vcf.gz" ] ) - interval_str = "--intervals ${interval}" + interval_str = "--intervals ${interval_path}" interval_padding = params.is_targeted ? "--interval-padding 100" : "" """ set -euo pipefail gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \ GenotypeGVCFs \ - --variant gendb://${genomicsdb} \ + --variant ${combined_gvcf} \ --reference ${reference_fasta} \ --verbosity INFO \ --output ${output_filename} \ From 4b130c5ff1c443697bd9955017eec053b7f03ed3 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Sat, 30 Mar 2024 21:54:40 -0700 Subject: [PATCH 05/15] tmp F72 to F16 config --- config/F16.config | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/F16.config b/config/F16.config index 088d7bf..d11fa47 100644 --- a/config/F16.config +++ b/config/F16.config @@ -41,6 +41,16 @@ process { } } } + withName: run_CombineGVCFs_GATK { + cpus = 3 + memory = 7.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } withName: run_GenotypeGVCFs_GATK { cpus = 3 memory = 7.GB From 5b5d8925cc9614164dd46fa62235b32e0be8729a Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Mon, 1 Apr 2024 08:47:23 -0700 Subject: [PATCH 06/15] tidy up scripts and replace input validation --- main.nf | 35 +++++++---------------------------- module/combine-gvcfs.nf | 2 +- module/genotype-gvcfs.nf | 2 +- 3 files changed, 9 insertions(+), 30 deletions(-) diff --git a/main.nf b/main.nf index cb2be93..7d49665 100644 --- a/main.nf +++ b/main.nf @@ -109,13 +109,13 @@ workflow { /** * Input validation */ -// run_validate_PipeVal(input_ch_validate) -// -// run_validate_PipeVal.out.validation_result -// .collectFile( -// name: 'input_validation.txt', -// storeDir: "${params.output_dir_base}/validation" -// ) + run_validate_PipeVal(input_ch_validate) + + run_validate_PipeVal.out.validation_result + .collectFile( + name: 'input_validation.txt', + storeDir: "${params.output_dir_base}/validation" + ) /** * Handle interval splitting based on targeted or WGS mode @@ -150,25 +150,6 @@ workflow { /** * Haplotype calling */ -// input_ch_collected_files.combine(input_ch_intervals) -// .map{ it -> -// [ -// it[0].bams, -// it[0].indices, -// it[1].interval_path, -// it[1].interval_id -// ] -// } -// .set{ input_ch_haplotypecallervcf } - -// run_HaplotypeCallerVCF_GATK( -// params.reference_fasta, -// "${params.reference_fasta}.fai", -// "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict", -// params.bundle_v0_dbsnp138_vcf_gz, -// "${params.bundle_v0_dbsnp138_vcf_gz}.tbi", -// input_ch_haplotypecallervcf -// ) input_ch_samples_with_index.combine(input_ch_intervals) .map{ it -> @@ -192,9 +173,7 @@ workflow { ) run_HaplotypeCallerGVCF_GATK.out.gvcfs -// [sample, gvcf, index, interval_path, interval_id] .groupTuple(by: 4) // Group by interval_path -// [interval_id, [[sample1, sample2, ...], [gvcf1, gvcf2, ...], [index1, index2, ...], interval_path, ]] .map{ it -> [ it[1].flatten(), // GVCFs diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf index 65b1ba2..142a9bc 100644 --- a/module/combine-gvcfs.nf +++ b/module/combine-gvcfs.nf @@ -1,7 +1,7 @@ include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' /* - Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK + Nextflow module for merging GVCFs for joint genotyping with GATK */ process run_CombineGVCFs_GATK { container params.docker_image_gatk diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf index 9586336..77655d6 100644 --- a/module/genotype-gvcfs.nf +++ b/module/genotype-gvcfs.nf @@ -1,7 +1,7 @@ include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' /* - Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK + Nextflow module for joint genotyping merged GVCFs with GATK */ process run_GenotypeGVCFs_GATK { container params.docker_image_gatk From 8407e29939e8876ae2be5b2ae0a8d356aa195924 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Mon, 1 Apr 2024 08:49:54 -0700 Subject: [PATCH 07/15] replace original F16.config --- config/F16.config | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/config/F16.config b/config/F16.config index d11fa47..bca8986 100644 --- a/config/F16.config +++ b/config/F16.config @@ -12,8 +12,8 @@ process { memory = 1.GB } withName: run_HaplotypeCallerVCF_GATK { - cpus = 3 - memory = 7.GB + cpus = 2 + memory = 4.GB retry_strategy { memory { strategy = 'exponential' @@ -22,38 +22,8 @@ process { } } withName: run_HaplotypeCallerGVCF_GATK { - cpus = 3 - memory = 7.GB - retry_strategy { - memory { - strategy = 'exponential' - operand = 2 - } - } - } - withName: run_GenomicsDBImport_GATK { - cpus = 3 - memory = 7.GB - retry_strategy { - memory { - strategy = 'exponential' - operand = 2 - } - } - } - withName: run_CombineGVCFs_GATK { - cpus = 3 - memory = 7.GB - retry_strategy { - memory { - strategy = 'exponential' - operand = 2 - } - } - } - withName: run_GenotypeGVCFs_GATK { - cpus = 3 - memory = 7.GB + cpus = 2 + memory = 4.GB retry_strategy { memory { strategy = 'exponential' From 694b6d23d13d0ec3953fbad5d9faeb97b6001819 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Mon, 1 Apr 2024 09:29:32 -0700 Subject: [PATCH 08/15] update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbd158c..6616b33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- ## [Unreleased] - +### Added +- Add workflow for genotyping from GVCFs --- ## [10.0.0] - 2024-03-08 From 9c3f9811e614923bac6d2ce395cc5047a2fe89fc Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Tue, 2 Apr 2024 09:16:35 -0700 Subject: [PATCH 09/15] update gatk --- CHANGELOG.md | 3 +++ config/default.config | 2 +- metadata.yaml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6616b33..49bd030 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] ### Added - Add workflow for genotyping from GVCFs + +### Changed +- Update GATK to 4.5.0.0 --- ## [10.0.0] - 2024-03-08 diff --git a/config/default.config b/config/default.config index bf96ba3..806d508 100644 --- a/config/default.config +++ b/config/default.config @@ -16,7 +16,7 @@ params { docker_container_registry = "ghcr.io/uclahs-cds" - gatk_version = "4.2.4.1" + gatk_version = "4.5.0.0" picard_version = "2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" diff --git a/metadata.yaml b/metadata.yaml index d31715d..f4e504a 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -5,4 +5,4 @@ maintainers: "Boutros Lab Infrastructure Date: Tue, 2 Apr 2024 09:17:07 -0700 Subject: [PATCH 10/15] update resources --- config/F72.config | 14 ++++++++++++-- main.nf | 20 ++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/config/F72.config b/config/F72.config index 406d896..f8edc02 100644 --- a/config/F72.config +++ b/config/F72.config @@ -11,7 +11,7 @@ process { cpus = 1 memory = 1.GB } - withName: run_HaplotypeCallerVCF_GATK { + withName: run_HaplotypeCallerGVCF_GATK { cpus = 3 memory = 7.GB retry_strategy { @@ -21,7 +21,17 @@ process { } } } - withName: run_HaplotypeCallerGVCF_GATK { + withName: run_CombineGVCFs_GATK { + cpus = 3 + memory = 7.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } + withName: run_GenotypeGVCFs_GATK { cpus = 3 memory = 7.GB retry_strategy { diff --git a/main.nf b/main.nf index 7d49665..990168d 100644 --- a/main.nf +++ b/main.nf @@ -172,16 +172,16 @@ workflow { input_ch_haplotypecallergvcf ) -run_HaplotypeCallerGVCF_GATK.out.gvcfs - .groupTuple(by: 4) // Group by interval_path - .map{ it -> - [ - it[1].flatten(), // GVCFs - it[2].flatten(), // Indices - it[3][0], // Interval path - it[4] // Interval ID - ] - } + run_HaplotypeCallerGVCF_GATK.out.gvcfs + .groupTuple(by: 4) // Group by interval_path + .map{ it -> + [ + it[1].flatten(), // GVCFs + it[2].flatten(), // Indices + it[3][0], // Interval path + it[4] // Interval ID + ] + } .set { input_ch_combine_gvcfs } run_CombineGVCFs_GATK( From edf5010e0ba2d7465c7b5fface581ae5d7eddba4 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 3 Apr 2024 11:55:21 -0700 Subject: [PATCH 11/15] removed unused genomicsdb-import --- main.nf | 1 - module/genomicsdb-import.nf | 48 ------------------------------------- 2 files changed, 49 deletions(-) delete mode 100644 module/genomicsdb-import.nf diff --git a/main.nf b/main.nf index 990168d..bee20bf 100644 --- a/main.nf +++ b/main.nf @@ -61,7 +61,6 @@ include { run_HaplotypeCallerVCF_GATK run_HaplotypeCallerGVCF_GATK } from './module/haplotypecaller.nf' -include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf' include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf' include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf' include { diff --git a/module/genomicsdb-import.nf b/module/genomicsdb-import.nf deleted file mode 100644 index 48ec110..0000000 --- a/module/genomicsdb-import.nf +++ /dev/null @@ -1,48 +0,0 @@ -include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' - -/* - Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK -*/ -process run_GenomicsDBImport_GATK { - container params.docker_image_gatk - publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - mode: "copy", - enabled: params.save_intermediate_files, - pattern: '*genomicsdb' - - publishDir path: "${params.log_output_dir}/process-log", - pattern: ".command.*", - mode: "copy", - saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } - - input: - tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id) - - output: - path(".command.*") - tuple path(output_filename), path(interval_path), val(interval_id), emit: genomicsdb - - script: - output_filename = generate_standard_filename( - "GATK-${params.gatk_version}", - params.dataset_id, - params.patient_id, - [ - 'additional_information': "${interval_id}.genomicsdb" - ] - ) - gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ') - interval_str = "--intervals ${interval_path}" - interval_padding = params.is_targeted ? "--interval-padding 100" : "" - """ - set -euo pipefail - - gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \ - GenomicsDBImport \ - ${gvcf_input_str} \ - --genomicsdb-workspace-path ${output_filename} \ - --verbosity INFO \ - ${interval_str} \ - ${interval_padding} - """ -} From 9cdd5fff921c22eba779f9d06358a878d1467842 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 3 Apr 2024 11:56:12 -0700 Subject: [PATCH 12/15] fix intermediate output --- module/combine-gvcfs.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf index 142a9bc..1df1dc3 100644 --- a/module/combine-gvcfs.nf +++ b/module/combine-gvcfs.nf @@ -8,7 +8,7 @@ process run_CombineGVCFs_GATK { publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", mode: "copy", enabled: params.save_intermediate_files, - pattern: '*g.gvcf.gz*' + pattern: '*g.vcf.gz*' publishDir path: "${params.log_output_dir}/process-log", pattern: ".command.*", mode: "copy", From 12976861a94d06e571b2c2747c46fd32b9f9b61e Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Wed, 3 Apr 2024 15:53:03 -0700 Subject: [PATCH 13/15] adjust resources --- config/F16.config | 14 ++++++++++++-- config/F32.config | 14 ++++++++++++-- config/F72.config | 8 ++++---- config/M64.config | 18 ++++++++++++++---- main.nf | 1 - 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/config/F16.config b/config/F16.config index bca8986..1fb00e0 100644 --- a/config/F16.config +++ b/config/F16.config @@ -11,7 +11,7 @@ process { cpus = 1 memory = 1.GB } - withName: run_HaplotypeCallerVCF_GATK { + withName: run_HaplotypeCallerGVCF_GATK { cpus = 2 memory = 4.GB retry_strategy { @@ -21,7 +21,17 @@ process { } } } - withName: run_HaplotypeCallerGVCF_GATK { + withName: run_CombineGVCFs_GATK { + cpus = 2 + memory = 4.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } + withName: run_GenotypeGVCFs_GATK { cpus = 2 memory = 4.GB retry_strategy { diff --git a/config/F32.config b/config/F32.config index bca8986..1fb00e0 100644 --- a/config/F32.config +++ b/config/F32.config @@ -11,7 +11,7 @@ process { cpus = 1 memory = 1.GB } - withName: run_HaplotypeCallerVCF_GATK { + withName: run_HaplotypeCallerGVCF_GATK { cpus = 2 memory = 4.GB retry_strategy { @@ -21,7 +21,17 @@ process { } } } - withName: run_HaplotypeCallerGVCF_GATK { + withName: run_CombineGVCFs_GATK { + cpus = 2 + memory = 4.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } + withName: run_GenotypeGVCFs_GATK { cpus = 2 memory = 4.GB retry_strategy { diff --git a/config/F72.config b/config/F72.config index f8edc02..b16f3db 100644 --- a/config/F72.config +++ b/config/F72.config @@ -22,8 +22,8 @@ process { } } withName: run_CombineGVCFs_GATK { - cpus = 3 - memory = 7.GB + cpus = 2 + memory = 4.GB retry_strategy { memory { strategy = 'exponential' @@ -32,8 +32,8 @@ process { } } withName: run_GenotypeGVCFs_GATK { - cpus = 3 - memory = 7.GB + cpus = 2 + memory = 4.GB retry_strategy { memory { strategy = 'exponential' diff --git a/config/M64.config b/config/M64.config index 406d896..b16f3db 100644 --- a/config/M64.config +++ b/config/M64.config @@ -11,7 +11,7 @@ process { cpus = 1 memory = 1.GB } - withName: run_HaplotypeCallerVCF_GATK { + withName: run_HaplotypeCallerGVCF_GATK { cpus = 3 memory = 7.GB retry_strategy { @@ -21,9 +21,19 @@ process { } } } - withName: run_HaplotypeCallerGVCF_GATK { - cpus = 3 - memory = 7.GB + withName: run_CombineGVCFs_GATK { + cpus = 2 + memory = 4.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } + withName: run_GenotypeGVCFs_GATK { + cpus = 2 + memory = 4.GB retry_strategy { memory { strategy = 'exponential' diff --git a/main.nf b/main.nf index bee20bf..7847361 100644 --- a/main.nf +++ b/main.nf @@ -58,7 +58,6 @@ include { extract_GenomeIntervals } from './external/pipeline-Nextflow-module/mo ] ) include { - run_HaplotypeCallerVCF_GATK run_HaplotypeCallerGVCF_GATK } from './module/haplotypecaller.nf' include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf' From e8836889763147528de67cb92b4cfca87c81bd92 Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Fri, 31 May 2024 13:55:03 -0700 Subject: [PATCH 14/15] remove intervals from combine-gvcfs --- module/combine-gvcfs.nf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf index 1df1dc3..70421fe 100644 --- a/module/combine-gvcfs.nf +++ b/module/combine-gvcfs.nf @@ -34,8 +34,6 @@ process run_CombineGVCFs_GATK { ] ) gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ') - interval_str = "--intervals ${interval_path}" - interval_padding = params.is_targeted ? "--interval-padding 100" : "" """ set -euo pipefail @@ -45,8 +43,6 @@ process run_CombineGVCFs_GATK { ${gvcf_input_str} \ --output ${output_filename} \ --create-output-variant-index true \ - --verbosity INFO \ - ${interval_str} \ - ${interval_padding} + --verbosity INFO """ } From e929f1459f125477a71d0fb1dffbc3ec6ac2cd1e Mon Sep 17 00:00:00 2001 From: Sorel Fitz-Gibbon Date: Fri, 31 May 2024 15:44:05 -0700 Subject: [PATCH 15/15] remove unused haplotypecallervcf --- main.nf | 2 +- module/haplotypecaller.nf | 79 --------------------------------------- 2 files changed, 1 insertion(+), 80 deletions(-) diff --git a/main.nf b/main.nf index 7847361..33e0937 100644 --- a/main.nf +++ b/main.nf @@ -171,7 +171,7 @@ workflow { ) run_HaplotypeCallerGVCF_GATK.out.gvcfs - .groupTuple(by: 4) // Group by interval_path + .groupTuple(by: 4) // Group by interval ID .map{ it -> [ it[1].flatten(), // GVCFs diff --git a/module/haplotypecaller.nf b/module/haplotypecaller.nf index a4ac95b..6aac72c 100644 --- a/module/haplotypecaller.nf +++ b/module/haplotypecaller.nf @@ -1,84 +1,5 @@ include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' -/* - Nextflow module for calling haplotypes in VCF mode - - input: - reference_fasta: path to reference genome fasta file - reference_fasta_fai: path to index for reference fasta - reference_fasta_dict: path to dictionary for reference fasta - dbsnp_bundle: path to dbSNP variants - dbsnp_bundle_index: path to index of dbSNP variants - bams: path to BAMs for calling - bam_indices: path to indices of BAM - interval: path to specific intervals for calling - interval_id: interval ID - - params: - params.output_dir_base: string(path) - params.log_output_dir: string(path) - params.save_intermediate_files: bool. - params.docker_image_gatk: string - params.is_targeted: bool. Indicator of whether in targeted exome mode or in WGS mode - params.gatk_command_mem_diff: float(memory) -*/ -process run_HaplotypeCallerVCF_GATK { - container params.docker_image_gatk - publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - mode: "copy", - enabled: params.save_intermediate_files, - pattern: '*.vcf*' - - publishDir path: "${params.log_output_dir}/process-log", - pattern: ".command.*", - mode: "copy", - saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" } - - input: - path(reference_fasta) - path(reference_fasta_fai) - path(reference_fasta_dict) - path(dbsnp_bundle) - path(dbsnp_bundle_index) - tuple path(bams), path(bam_indices), path(interval), val(interval_id) - - - output: - path(".command.*") - tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs - - script: - // Get split interval number to serve as task ID - interval_id = interval.baseName.split('-')[0] - output_filename = generate_standard_filename( - "GATK-${params.gatk_version}", - params.dataset_id, - params.patient_id, - [ - 'additional_information': "${interval_id}.vcf.gz" - ] - ) - interval_str = "--intervals ${interval}" - bam_input_str = bams.collect{ "--input '${it}'" }.join(' ') - interval_padding = params.is_targeted ? "--interval-padding 100" : "" - """ - set -euo pipefail - - gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true -Djava.io.tmpdir=${workDir}" \ - HaplotypeCaller \ - ${bam_input_str} \ - --output ${output_filename} \ - --reference ${reference_fasta} \ - --verbosity INFO \ - --output-mode EMIT_VARIANTS_ONLY \ - --dbsnp ${dbsnp_bundle} \ - --sample-ploidy 2 \ - --standard-min-confidence-threshold-for-calling 50 \ - ${interval_str} \ - ${interval_padding} - """ -} - /* Nextflow module for calling haplotypes in GVCF mode