From 1a6f75d16bd11a59fbb2e714dd343dddb4674798 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Fri, 29 Mar 2024 12:46:05 -0700
Subject: [PATCH 01/15] use F72 config for F16

---
 config/F16.config | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/F16.config b/config/F16.config
index bca8986..406d896 100644
--- a/config/F16.config
+++ b/config/F16.config
@@ -12,8 +12,8 @@ process {
         memory = 1.GB
     }
     withName: run_HaplotypeCallerVCF_GATK {
-        cpus = 2
-        memory = 4.GB
+        cpus = 3
+        memory = 7.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'
@@ -22,8 +22,8 @@ process {
         }
     }
     withName: run_HaplotypeCallerGVCF_GATK {
-        cpus = 2
-        memory = 4.GB
+        cpus = 3
+        memory = 7.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'

From 2c7b3eb298a5c37f7a467b041e9595610ae9bf16 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Sat, 30 Mar 2024 16:57:48 -0700
Subject: [PATCH 02/15] add genotype from gvcfs

---
 main.nf                     | 85 +++++++++++++++++++++++++------------
 module/genomicsdb-import.nf | 48 +++++++++++++++++++++
 module/genotype-gvcfs.nf    | 55 ++++++++++++++++++++++++
 module/haplotypecaller.nf   |  6 +--
 4 files changed, 163 insertions(+), 31 deletions(-)
 create mode 100644 module/genomicsdb-import.nf
 create mode 100644 module/genotype-gvcfs.nf

diff --git a/main.nf b/main.nf
index 2fd315a..966ad70 100644
--- a/main.nf
+++ b/main.nf
@@ -25,7 +25,7 @@ Current Configuration:
         bundle_omni_1000g_2p5_vcf_gz: ${params.bundle_omni_1000g_2p5_vcf_gz}
         bundle_phase1_1000g_snps_high_conf_vcf_gz: ${params.bundle_phase1_1000g_snps_high_conf_vcf_gz}
 
-    - output: 
+    - output:
         output: ${params.output_dir}
         output_dir_base: ${params.output_dir_base}
         log_output_dir: ${params.log_output_dir}
@@ -61,6 +61,8 @@ include {
     run_HaplotypeCallerVCF_GATK
     run_HaplotypeCallerGVCF_GATK
     } from './module/haplotypecaller.nf'
+include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf'
+include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
 include {
     run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF
     run_MergeVcfs_Picard as run_MergeVcfs_Picard_GVCF
@@ -106,13 +108,13 @@ workflow {
     /**
     *   Input validation
     */
-    run_validate_PipeVal(input_ch_validate)
-
-    run_validate_PipeVal.out.validation_result
-        .collectFile(
-            name: 'input_validation.txt',
-            storeDir: "${params.output_dir_base}/validation"
-        )
+//    run_validate_PipeVal(input_ch_validate)
+//
+//    run_validate_PipeVal.out.validation_result
+//        .collectFile(
+//            name: 'input_validation.txt',
+//            storeDir: "${params.output_dir_base}/validation"
+//        )
 
     /**
     *   Handle interval splitting based on targeted or WGS mode
@@ -147,25 +149,25 @@ workflow {
     /**
     *   Haplotype calling
     */
-    input_ch_collected_files.combine(input_ch_intervals)
-        .map{ it ->
-            [
-                it[0].bams,
-                it[0].indices,
-                it[1].interval_path,
-                it[1].interval_id
-            ]
-        }
-        .set{ input_ch_haplotypecallervcf }
-
-    run_HaplotypeCallerVCF_GATK(
-        params.reference_fasta,
-        "${params.reference_fasta}.fai",
-        "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
-        params.bundle_v0_dbsnp138_vcf_gz,
-        "${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
-        input_ch_haplotypecallervcf
-    )
+//    input_ch_collected_files.combine(input_ch_intervals)
+//        .map{ it ->
+//            [
+//                it[0].bams,
+//                it[0].indices,
+//                it[1].interval_path,
+//                it[1].interval_id
+//            ]
+//        }
+//        .set{ input_ch_haplotypecallervcf }
+
+//    run_HaplotypeCallerVCF_GATK(
+//        params.reference_fasta,
+//        "${params.reference_fasta}.fai",
+//        "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
+//        params.bundle_v0_dbsnp138_vcf_gz,
+//        "${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
+//        input_ch_haplotypecallervcf
+//    )
 
     input_ch_samples_with_index.combine(input_ch_intervals)
         .map{ it ->
@@ -188,10 +190,37 @@ workflow {
         input_ch_haplotypecallergvcf
     )
 
+run_HaplotypeCallerGVCF_GATK.out.gvcfs
+// [sample, gvcf, index, interval_path, interval_id]
+    .groupTuple(by: 4) // Group by interval_path
+// [interval_id, [[sample1, sample2, ...], [gvcf1, gvcf2, ...], [index1, index2, ...], interval_path, ]]
+    .map{ it ->
+        [
+            it[1].flatten(), // GVCFs
+            it[2].flatten(), // Indices
+            it[3][0], // Interval path
+            it[4] // Interval ID
+        ]
+    }
+    .set { input_ch_genomicsdb }
+
+    run_GenomicsDBImport_GATK(
+        input_ch_genomicsdb
+        )
+
+    run_GenotypeGVCFs_GATK(
+        params.reference_fasta,
+        "${params.reference_fasta}.fai",
+        "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
+        params.bundle_v0_dbsnp138_vcf_gz,
+        "${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
+        run_GenomicsDBImport_GATK.out.genomicsdb
+    )
+
     /**
     *   Merge VCFs
     */
-    run_HaplotypeCallerVCF_GATK.out.vcfs
+    run_GenotypeGVCFs_GATK.out.vcfs
         .reduce( ['vcfs': [], 'indices': []] ){ a, b ->
             a.vcfs.add(b[0]);
             a.indices.add(b[1]);
diff --git a/module/genomicsdb-import.nf b/module/genomicsdb-import.nf
new file mode 100644
index 0000000..48ec110
--- /dev/null
+++ b/module/genomicsdb-import.nf
@@ -0,0 +1,48 @@
+include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
+
+/*
+    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
+*/
+process run_GenomicsDBImport_GATK {
+    container params.docker_image_gatk
+    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+        mode: "copy",
+        enabled: params.save_intermediate_files,
+        pattern: '*genomicsdb'
+
+    publishDir path: "${params.log_output_dir}/process-log",
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }
+
+    input:
+    tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)
+
+    output:
+    path(".command.*")
+    tuple path(output_filename), path(interval_path), val(interval_id), emit: genomicsdb
+
+    script:
+    output_filename = generate_standard_filename(
+        "GATK-${params.gatk_version}",
+        params.dataset_id,
+        params.patient_id,
+        [
+            'additional_information': "${interval_id}.genomicsdb"
+        ]
+    )
+    gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
+    interval_str = "--intervals ${interval_path}"
+    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
+    """
+    set -euo pipefail
+
+    gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
+        GenomicsDBImport \
+        ${gvcf_input_str} \
+        --genomicsdb-workspace-path ${output_filename} \
+        --verbosity INFO \
+        ${interval_str} \
+        ${interval_padding}
+    """
+}
diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf
new file mode 100644
index 0000000..8652014
--- /dev/null
+++ b/module/genotype-gvcfs.nf
@@ -0,0 +1,55 @@
+include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
+
+/*
+    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
+*/
+process run_GenotypeGVCFs_GATK {
+    container params.docker_image_gatk
+    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+        mode: "copy",
+        enabled: params.save_intermediate_files,
+        pattern: '*.vcf*'
+
+    publishDir path: "${params.log_output_dir}/process-log",
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }
+
+    input:
+    path(reference_fasta)
+    path(reference_fasta_fai)
+    path(reference_fasta_dict)
+    path(dbsnp_bundle)
+    path(dbsnp_bundle_index)
+    tuple path(genomicsdb), path(interval), val(interval_id)
+
+    output:
+    path(".command.*")
+    tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs
+
+    script:
+    output_filename = generate_standard_filename(
+        "GATK-${params.gatk_version}",
+        params.dataset_id,
+        params.patient_id,
+        [
+            'additional_information': "${interval_id}.vcf.gz"
+        ]
+    )
+    interval_str = "--intervals ${interval}"
+    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
+    """
+    set -euo pipefail
+
+    gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
+        GenotypeGVCFs \
+        --variant gendb://${genomicsdb} \
+        --reference ${reference_fasta} \
+        --verbosity INFO \
+        --output ${output_filename} \
+        --dbsnp ${dbsnp_bundle} \
+        --standard-min-confidence-threshold-for-calling 50 \
+        ${interval_str} \
+        ${interval_padding}
+    """
+}
diff --git a/module/haplotypecaller.nf b/module/haplotypecaller.nf
index e6cb95f..a4ac95b 100644
--- a/module/haplotypecaller.nf
+++ b/module/haplotypecaller.nf
@@ -120,12 +120,12 @@ process run_HaplotypeCallerGVCF_GATK {
     path(reference_fasta_dict)
     path(dbsnp_bundle)
     path(dbsnp_bundle_index)
-    tuple val(sample_id), path(bam), path(bam_index), path(interval), val(interval_id)
+    tuple val(sample_id), path(bam), path(bam_index), path(interval_path), val(interval_id)
 
 
     output:
     path(".command.*")
-    tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), emit: gvcfs
+    tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: gvcfs
 
     script:
     output_filename = generate_standard_filename(
@@ -136,7 +136,7 @@ process run_HaplotypeCallerGVCF_GATK {
             'additional_information': "${interval_id}_raw_variants.g.vcf.gz"
         ]
     )
-    interval_str = "--intervals ${interval}"
+    interval_str = "--intervals ${interval_path}"
     interval_padding = params.is_targeted ? "--interval-padding 100" : ""
     output_mode = params.emit_all_confident_sites ? "EMIT_ALL_CONFIDENT_SITES" : "EMIT_VARIANTS_ONLY"
     """

From 255beb4b68a0b347ffe4f53a5c6e8b201d11ce52 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Sat, 30 Mar 2024 16:58:26 -0700
Subject: [PATCH 03/15] tmp F16 changed to F72 values

---
 config/F16.config | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/config/F16.config b/config/F16.config
index 406d896..088d7bf 100644
--- a/config/F16.config
+++ b/config/F16.config
@@ -31,6 +31,26 @@ process {
             }
         }
     }
+    withName: run_GenomicsDBImport_GATK {
+        cpus = 3
+        memory = 7.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
+    withName: run_GenotypeGVCFs_GATK {
+        cpus = 3
+        memory = 7.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
     withName: run_MergeVcfs_Picard_VCF {
         cpus = 1
         memory = 15.GB

From 460fcbdadf297c632e53ea6ec89cea569fcf0e8e Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Sat, 30 Mar 2024 21:53:30 -0700
Subject: [PATCH 04/15] add combine and genotype gvcfs

---
 main.nf                  | 12 ++++++----
 module/combine-gvcfs.nf  | 52 ++++++++++++++++++++++++++++++++++++++++
 module/genotype-gvcfs.nf |  6 ++---
 3 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 module/combine-gvcfs.nf

diff --git a/main.nf b/main.nf
index 966ad70..cb2be93 100644
--- a/main.nf
+++ b/main.nf
@@ -62,6 +62,7 @@ include {
     run_HaplotypeCallerGVCF_GATK
     } from './module/haplotypecaller.nf'
 include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf'
+include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'
 include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
 include {
     run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF
@@ -202,10 +203,13 @@ run_HaplotypeCallerGVCF_GATK.out.gvcfs
             it[4] // Interval ID
         ]
     }
-    .set { input_ch_genomicsdb }
+    .set { input_ch_combine_gvcfs }
 
-    run_GenomicsDBImport_GATK(
-        input_ch_genomicsdb
+    run_CombineGVCFs_GATK(
+        params.reference_fasta,
+        "${params.reference_fasta}.fai",
+        "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
+        input_ch_combine_gvcfs
         )
 
     run_GenotypeGVCFs_GATK(
@@ -214,7 +218,7 @@ run_HaplotypeCallerGVCF_GATK.out.gvcfs
         "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
         params.bundle_v0_dbsnp138_vcf_gz,
         "${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
-        run_GenomicsDBImport_GATK.out.genomicsdb
+        run_CombineGVCFs_GATK.out.combined_gvcf
     )
 
     /**
diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf
new file mode 100644
index 0000000..65b1ba2
--- /dev/null
+++ b/module/combine-gvcfs.nf
@@ -0,0 +1,52 @@
+include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
+
+/*
+    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
+*/
+process run_CombineGVCFs_GATK {
+    container params.docker_image_gatk
+    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+        mode: "copy",
+        enabled: params.save_intermediate_files,
+        pattern: '*g.gvcf.gz*'
+    publishDir path: "${params.log_output_dir}/process-log",
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }
+
+    input:
+    path(reference_fasta)
+    path(reference_fasta_fai)
+    path(reference_fasta_dict)
+    tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)
+
+    output:
+    path(".command.*")
+    tuple path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: combined_gvcf
+
+    script:
+    output_filename = generate_standard_filename(
+        "GATK-${params.gatk_version}",
+        params.dataset_id,
+        params.patient_id,
+        [
+            'additional_information': "${interval_id}.g.vcf.gz"
+        ]
+    )
+    gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
+    interval_str = "--intervals ${interval_path}"
+    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
+    """
+    set -euo pipefail
+
+    gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
+        CombineGVCFs \
+        --reference ${reference_fasta} \
+        ${gvcf_input_str} \
+        --output ${output_filename} \
+        --create-output-variant-index true \
+        --verbosity INFO \
+        ${interval_str} \
+        ${interval_padding}
+    """
+}
diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf
index 8652014..9586336 100644
--- a/module/genotype-gvcfs.nf
+++ b/module/genotype-gvcfs.nf
@@ -21,7 +21,7 @@ process run_GenotypeGVCFs_GATK {
     path(reference_fasta_dict)
     path(dbsnp_bundle)
     path(dbsnp_bundle_index)
-    tuple path(genomicsdb), path(interval), val(interval_id)
+    tuple path(combined_gvcf), path(combined_gvcf_index), path(interval_path), val(interval_id)
 
     output:
     path(".command.*")
@@ -36,14 +36,14 @@ process run_GenotypeGVCFs_GATK {
             'additional_information': "${interval_id}.vcf.gz"
         ]
     )
-    interval_str = "--intervals ${interval}"
+    interval_str = "--intervals ${interval_path}"
     interval_padding = params.is_targeted ? "--interval-padding 100" : ""
     """
     set -euo pipefail
 
     gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
         GenotypeGVCFs \
-        --variant gendb://${genomicsdb} \
+        --variant ${combined_gvcf} \
         --reference ${reference_fasta} \
         --verbosity INFO \
         --output ${output_filename} \

From 4b130c5ff1c443697bd9955017eec053b7f03ed3 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Sat, 30 Mar 2024 21:54:40 -0700
Subject: [PATCH 05/15] tmp F72 to F16 config

---
 config/F16.config | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/config/F16.config b/config/F16.config
index 088d7bf..d11fa47 100644
--- a/config/F16.config
+++ b/config/F16.config
@@ -41,6 +41,16 @@ process {
             }
         }
     }
+    withName: run_CombineGVCFs_GATK {
+        cpus = 3
+        memory = 7.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
     withName: run_GenotypeGVCFs_GATK {
         cpus = 3
         memory = 7.GB

From 5b5d8925cc9614164dd46fa62235b32e0be8729a Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Mon, 1 Apr 2024 08:47:23 -0700
Subject: [PATCH 06/15] tidy up scripts and replace input validation

---
 main.nf                  | 35 +++++++----------------------------
 module/combine-gvcfs.nf  |  2 +-
 module/genotype-gvcfs.nf |  2 +-
 3 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/main.nf b/main.nf
index cb2be93..7d49665 100644
--- a/main.nf
+++ b/main.nf
@@ -109,13 +109,13 @@ workflow {
     /**
     *   Input validation
     */
-//    run_validate_PipeVal(input_ch_validate)
-//
-//    run_validate_PipeVal.out.validation_result
-//        .collectFile(
-//            name: 'input_validation.txt',
-//            storeDir: "${params.output_dir_base}/validation"
-//        )
+    run_validate_PipeVal(input_ch_validate)
+
+    run_validate_PipeVal.out.validation_result
+        .collectFile(
+            name: 'input_validation.txt',
+            storeDir: "${params.output_dir_base}/validation"
+        )
 
     /**
     *   Handle interval splitting based on targeted or WGS mode
@@ -150,25 +150,6 @@ workflow {
     /**
     *   Haplotype calling
     */
-//    input_ch_collected_files.combine(input_ch_intervals)
-//        .map{ it ->
-//            [
-//                it[0].bams,
-//                it[0].indices,
-//                it[1].interval_path,
-//                it[1].interval_id
-//            ]
-//        }
-//        .set{ input_ch_haplotypecallervcf }
-
-//    run_HaplotypeCallerVCF_GATK(
-//        params.reference_fasta,
-//        "${params.reference_fasta}.fai",
-//        "${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
-//        params.bundle_v0_dbsnp138_vcf_gz,
-//        "${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
-//        input_ch_haplotypecallervcf
-//    )
 
     input_ch_samples_with_index.combine(input_ch_intervals)
         .map{ it ->
@@ -192,9 +173,7 @@ workflow {
     )
 
 run_HaplotypeCallerGVCF_GATK.out.gvcfs
-// [sample, gvcf, index, interval_path, interval_id]
     .groupTuple(by: 4) // Group by interval_path
-// [interval_id, [[sample1, sample2, ...], [gvcf1, gvcf2, ...], [index1, index2, ...], interval_path, ]]
     .map{ it ->
         [
             it[1].flatten(), // GVCFs
diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf
index 65b1ba2..142a9bc 100644
--- a/module/combine-gvcfs.nf
+++ b/module/combine-gvcfs.nf
@@ -1,7 +1,7 @@
 include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
 
 /*
-    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
+    Nextflow module for merging GVCFs for joint genotyping with GATK
 */
 process run_CombineGVCFs_GATK {
     container params.docker_image_gatk
diff --git a/module/genotype-gvcfs.nf b/module/genotype-gvcfs.nf
index 9586336..77655d6 100644
--- a/module/genotype-gvcfs.nf
+++ b/module/genotype-gvcfs.nf
@@ -1,7 +1,7 @@
 include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
 
 /*
-    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
+    Nextflow module for joint genotyping merged GVCFs with GATK
 */
 process run_GenotypeGVCFs_GATK {
     container params.docker_image_gatk

From 8407e29939e8876ae2be5b2ae0a8d356aa195924 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Mon, 1 Apr 2024 08:49:54 -0700
Subject: [PATCH 07/15] replace original F16.config

---
 config/F16.config | 38 ++++----------------------------------
 1 file changed, 4 insertions(+), 34 deletions(-)

diff --git a/config/F16.config b/config/F16.config
index d11fa47..bca8986 100644
--- a/config/F16.config
+++ b/config/F16.config
@@ -12,8 +12,8 @@ process {
         memory = 1.GB
     }
     withName: run_HaplotypeCallerVCF_GATK {
-        cpus = 3
-        memory = 7.GB
+        cpus = 2
+        memory = 4.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'
@@ -22,38 +22,8 @@ process {
         }
     }
     withName: run_HaplotypeCallerGVCF_GATK {
-        cpus = 3
-        memory = 7.GB
-        retry_strategy {
-            memory {
-                strategy = 'exponential'
-                operand = 2
-            }
-        }
-    }
-    withName: run_GenomicsDBImport_GATK {
-        cpus = 3
-        memory = 7.GB
-        retry_strategy {
-            memory {
-                strategy = 'exponential'
-                operand = 2
-            }
-        }
-    }
-    withName: run_CombineGVCFs_GATK {
-        cpus = 3
-        memory = 7.GB
-        retry_strategy {
-            memory {
-                strategy = 'exponential'
-                operand = 2
-            }
-        }
-    }
-    withName: run_GenotypeGVCFs_GATK {
-        cpus = 3
-        memory = 7.GB
+        cpus = 2
+        memory = 4.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'

From 694b6d23d13d0ec3953fbad5d9faeb97b6001819 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Mon, 1 Apr 2024 09:29:32 -0700
Subject: [PATCH 08/15] update changelog

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bbd158c..6616b33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ---
 
 ## [Unreleased]
-
+### Added
+- Add workflow for genotyping from GVCFs
 ---
 
 ## [10.0.0] - 2024-03-08

From 9c3f9811e614923bac6d2ce395cc5047a2fe89fc Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 2 Apr 2024 09:16:35 -0700
Subject: [PATCH 09/15] update gatk

---
 CHANGELOG.md          | 3 +++
 config/default.config | 2 +-
 metadata.yaml         | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6616b33..49bd030 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ## [Unreleased]
 ### Added
 - Add workflow for genotyping from GVCFs
+
+### Changed
+- Update GATK to 4.5.0.0
 ---
 
 ## [10.0.0] - 2024-03-08
diff --git a/config/default.config b/config/default.config
index bf96ba3..806d508 100644
--- a/config/default.config
+++ b/config/default.config
@@ -16,7 +16,7 @@ params {
 
     docker_container_registry = "ghcr.io/uclahs-cds"
 
-    gatk_version = "4.2.4.1"
+    gatk_version = "4.5.0.0"
     picard_version = "2.26.10"
     pipeval_version = "4.0.0-rc.2"
     gatkfilter_version = "v1.0.0"
diff --git a/metadata.yaml b/metadata.yaml
index d31715d..f4e504a 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -5,4 +5,4 @@ maintainers: "Boutros Lab Infrastructure <BoutrosLabInfrastructure@mednet.ucla.e
 languages: ["Nextflow", "Docker"]
 dependencies: ["Java", "Nextflow", "Docker"]
 references: "https://uclahs-cds.atlassian.net/wiki/spaces/BOUTROSLAB/pages/3189620/Guide+to+Nextflow"
-tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.2.4.1"]
+tools: ["Picard:2.26.10", "GATK:3.7.0", "GATK:4.5.0.0"]

From 46c9555155c59cf1ad9b442afa556863e4aec2a4 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Tue, 2 Apr 2024 09:17:07 -0700
Subject: [PATCH 10/15] update resources

---
 config/F72.config | 14 ++++++++++++--
 main.nf           | 20 ++++++++++----------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/config/F72.config b/config/F72.config
index 406d896..f8edc02 100644
--- a/config/F72.config
+++ b/config/F72.config
@@ -11,7 +11,7 @@ process {
         cpus = 1
         memory = 1.GB
     }
-    withName: run_HaplotypeCallerVCF_GATK {
+    withName: run_HaplotypeCallerGVCF_GATK {
         cpus = 3
         memory = 7.GB
         retry_strategy {
@@ -21,7 +21,17 @@ process {
             }
         }
     }
-    withName: run_HaplotypeCallerGVCF_GATK {
+    withName: run_CombineGVCFs_GATK {
+        cpus = 3
+        memory = 7.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
+    withName: run_GenotypeGVCFs_GATK {
         cpus = 3
         memory = 7.GB
         retry_strategy {
diff --git a/main.nf b/main.nf
index 7d49665..990168d 100644
--- a/main.nf
+++ b/main.nf
@@ -172,16 +172,16 @@ workflow {
         input_ch_haplotypecallergvcf
     )
 
-run_HaplotypeCallerGVCF_GATK.out.gvcfs
-    .groupTuple(by: 4) // Group by interval_path
-    .map{ it ->
-        [
-            it[1].flatten(), // GVCFs
-            it[2].flatten(), // Indices
-            it[3][0], // Interval path
-            it[4] // Interval ID
-        ]
-    }
+    run_HaplotypeCallerGVCF_GATK.out.gvcfs
+        .groupTuple(by: 4) // Group by interval_id
+        .map{ it ->
+            [
+                it[1].flatten(), // GVCFs
+                it[2].flatten(), // Indices
+                it[3][0], // Interval path
+                it[4] // Interval ID
+            ]
+        }
     .set { input_ch_combine_gvcfs }
 
     run_CombineGVCFs_GATK(

From edf5010e0ba2d7465c7b5fface581ae5d7eddba4 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 3 Apr 2024 11:55:21 -0700
Subject: [PATCH 11/15] removed unused genomicsdb-import

---
 main.nf                     |  1 -
 module/genomicsdb-import.nf | 48 -------------------------------------
 2 files changed, 49 deletions(-)
 delete mode 100644 module/genomicsdb-import.nf

diff --git a/main.nf b/main.nf
index 990168d..bee20bf 100644
--- a/main.nf
+++ b/main.nf
@@ -61,7 +61,6 @@ include {
     run_HaplotypeCallerVCF_GATK
     run_HaplotypeCallerGVCF_GATK
     } from './module/haplotypecaller.nf'
-include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf'
 include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'
 include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
 include {
diff --git a/module/genomicsdb-import.nf b/module/genomicsdb-import.nf
deleted file mode 100644
index 48ec110..0000000
--- a/module/genomicsdb-import.nf
+++ /dev/null
@@ -1,48 +0,0 @@
-include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
-
-/*
-    Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
-*/
-process run_GenomicsDBImport_GATK {
-    container params.docker_image_gatk
-    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
-        mode: "copy",
-        enabled: params.save_intermediate_files,
-        pattern: '*genomicsdb'
-
-    publishDir path: "${params.log_output_dir}/process-log",
-        pattern: ".command.*",
-        mode: "copy",
-        saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }
-
-    input:
-    tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)
-
-    output:
-    path(".command.*")
-    tuple path(output_filename), path(interval_path), val(interval_id), emit: genomicsdb
-
-    script:
-    output_filename = generate_standard_filename(
-        "GATK-${params.gatk_version}",
-        params.dataset_id,
-        params.patient_id,
-        [
-            'additional_information': "${interval_id}.genomicsdb"
-        ]
-    )
-    gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
-    interval_str = "--intervals ${interval_path}"
-    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
-    """
-    set -euo pipefail
-
-    gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
-        GenomicsDBImport \
-        ${gvcf_input_str} \
-        --genomicsdb-workspace-path ${output_filename} \
-        --verbosity INFO \
-        ${interval_str} \
-        ${interval_padding}
-    """
-}

From 9cdd5fff921c22eba779f9d06358a878d1467842 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 3 Apr 2024 11:56:12 -0700
Subject: [PATCH 12/15] fix intermediate output

---
 module/combine-gvcfs.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf
index 142a9bc..1df1dc3 100644
--- a/module/combine-gvcfs.nf
+++ b/module/combine-gvcfs.nf
@@ -8,7 +8,7 @@ process run_CombineGVCFs_GATK {
     publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
         mode: "copy",
         enabled: params.save_intermediate_files,
-        pattern: '*g.gvcf.gz*'
+        pattern: '*g.vcf.gz*'
     publishDir path: "${params.log_output_dir}/process-log",
         pattern: ".command.*",
         mode: "copy",

From 12976861a94d06e571b2c2747c46fd32b9f9b61e Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Wed, 3 Apr 2024 15:53:03 -0700
Subject: [PATCH 13/15] adjust resources

---
 config/F16.config | 14 ++++++++++++--
 config/F32.config | 14 ++++++++++++--
 config/F72.config |  8 ++++----
 config/M64.config | 18 ++++++++++++++----
 main.nf           |  1 -
 5 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/config/F16.config b/config/F16.config
index bca8986..1fb00e0 100644
--- a/config/F16.config
+++ b/config/F16.config
@@ -11,7 +11,7 @@ process {
         cpus = 1
         memory = 1.GB
     }
-    withName: run_HaplotypeCallerVCF_GATK {
+    withName: run_HaplotypeCallerGVCF_GATK {
         cpus = 2
         memory = 4.GB
         retry_strategy {
@@ -21,7 +21,17 @@ process {
             }
         }
     }
-    withName: run_HaplotypeCallerGVCF_GATK {
+    withName: run_CombineGVCFs_GATK {
+        cpus = 2
+        memory = 4.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
+    withName: run_GenotypeGVCFs_GATK {
         cpus = 2
         memory = 4.GB
         retry_strategy {
diff --git a/config/F32.config b/config/F32.config
index bca8986..1fb00e0 100644
--- a/config/F32.config
+++ b/config/F32.config
@@ -11,7 +11,7 @@ process {
         cpus = 1
         memory = 1.GB
     }
-    withName: run_HaplotypeCallerVCF_GATK {
+    withName: run_HaplotypeCallerGVCF_GATK {
         cpus = 2
         memory = 4.GB
         retry_strategy {
@@ -21,7 +21,17 @@ process {
             }
         }
     }
-    withName: run_HaplotypeCallerGVCF_GATK {
+    withName: run_CombineGVCFs_GATK {
+        cpus = 2
+        memory = 4.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
+    withName: run_GenotypeGVCFs_GATK {
         cpus = 2
         memory = 4.GB
         retry_strategy {
diff --git a/config/F72.config b/config/F72.config
index f8edc02..b16f3db 100644
--- a/config/F72.config
+++ b/config/F72.config
@@ -22,8 +22,8 @@ process {
         }
     }
     withName: run_CombineGVCFs_GATK {
-        cpus = 3
-        memory = 7.GB
+        cpus = 2
+        memory = 4.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'
@@ -32,8 +32,8 @@ process {
         }
     }
     withName: run_GenotypeGVCFs_GATK {
-        cpus = 3
-        memory = 7.GB
+        cpus = 2
+        memory = 4.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'
diff --git a/config/M64.config b/config/M64.config
index 406d896..b16f3db 100644
--- a/config/M64.config
+++ b/config/M64.config
@@ -11,7 +11,7 @@ process {
         cpus = 1
         memory = 1.GB
     }
-    withName: run_HaplotypeCallerVCF_GATK {
+    withName: run_HaplotypeCallerGVCF_GATK {
         cpus = 3
         memory = 7.GB
         retry_strategy {
@@ -21,9 +21,19 @@ process {
             }
         }
     }
-    withName: run_HaplotypeCallerGVCF_GATK {
-        cpus = 3
-        memory = 7.GB
+    withName: run_CombineGVCFs_GATK {
+        cpus = 2
+        memory = 4.GB
+        retry_strategy {
+            memory {
+                strategy = 'exponential'
+                operand = 2
+            }
+        }
+    }
+    withName: run_GenotypeGVCFs_GATK {
+        cpus = 2
+        memory = 4.GB
         retry_strategy {
             memory {
                 strategy = 'exponential'
diff --git a/main.nf b/main.nf
index bee20bf..7847361 100644
--- a/main.nf
+++ b/main.nf
@@ -58,7 +58,6 @@ include { extract_GenomeIntervals } from './external/pipeline-Nextflow-module/mo
         ]
     )
 include {
-    run_HaplotypeCallerVCF_GATK
     run_HaplotypeCallerGVCF_GATK
     } from './module/haplotypecaller.nf'
 include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'

From e8836889763147528de67cb92b4cfca87c81bd92 Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Fri, 31 May 2024 13:55:03 -0700
Subject: [PATCH 14/15] remove intervals from combine-gvcfs

---
 module/combine-gvcfs.nf | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/module/combine-gvcfs.nf b/module/combine-gvcfs.nf
index 1df1dc3..70421fe 100644
--- a/module/combine-gvcfs.nf
+++ b/module/combine-gvcfs.nf
@@ -34,8 +34,6 @@ process run_CombineGVCFs_GATK {
         ]
     )
     gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
-    interval_str = "--intervals ${interval_path}"
-    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
     """
     set -euo pipefail
 
@@ -45,8 +43,6 @@ process run_CombineGVCFs_GATK {
         ${gvcf_input_str} \
         --output ${output_filename} \
         --create-output-variant-index true \
-        --verbosity INFO \
-        ${interval_str} \
-        ${interval_padding}
+        --verbosity INFO
     """
 }

From e929f1459f125477a71d0fb1dffbc3ec6ac2cd1e Mon Sep 17 00:00:00 2001
From: Sorel Fitz-Gibbon <sfitzgibbon@mednet.ucla.edu>
Date: Fri, 31 May 2024 15:44:05 -0700
Subject: [PATCH 15/15] remove unused run_HaplotypeCallerVCF_GATK process

---
 main.nf                   |  2 +-
 module/haplotypecaller.nf | 79 ---------------------------------------
 2 files changed, 1 insertion(+), 80 deletions(-)

diff --git a/main.nf b/main.nf
index 7847361..33e0937 100644
--- a/main.nf
+++ b/main.nf
@@ -171,7 +171,7 @@ workflow {
     )
 
     run_HaplotypeCallerGVCF_GATK.out.gvcfs
-        .groupTuple(by: 4) // Group by interval_path
+        .groupTuple(by: 4) // Group by interval ID
         .map{ it ->
             [
                 it[1].flatten(), // GVCFs
diff --git a/module/haplotypecaller.nf b/module/haplotypecaller.nf
index a4ac95b..6aac72c 100644
--- a/module/haplotypecaller.nf
+++ b/module/haplotypecaller.nf
@@ -1,84 +1,5 @@
 include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
 
-/*
-    Nextflow module for calling haplotypes in VCF mode
-
-    input:
-        reference_fasta: path to reference genome fasta file
-        reference_fasta_fai: path to index for reference fasta
-        reference_fasta_dict: path to dictionary for reference fasta
-        dbsnp_bundle: path to dbSNP variants
-        dbsnp_bundle_index: path to index of dbSNP variants
-        bams: path to BAMs for calling
-        bam_indices: path to indices of BAM
-        interval: path to specific intervals for calling
-        interval_id: interval ID
-        
-    params:
-        params.output_dir_base: string(path)
-        params.log_output_dir: string(path)
-        params.save_intermediate_files: bool.
-        params.docker_image_gatk: string
-        params.is_targeted: bool. Indicator of whether in targeted exome mode or in WGS mode
-        params.gatk_command_mem_diff: float(memory)
-*/
-process run_HaplotypeCallerVCF_GATK {
-    container params.docker_image_gatk
-    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
-      mode: "copy",
-      enabled: params.save_intermediate_files,
-      pattern: '*.vcf*'
-
-    publishDir path: "${params.log_output_dir}/process-log",
-      pattern: ".command.*",
-      mode: "copy",
-      saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }
-
-    input:
-    path(reference_fasta)
-    path(reference_fasta_fai)
-    path(reference_fasta_dict)
-    path(dbsnp_bundle)
-    path(dbsnp_bundle_index)
-    tuple path(bams), path(bam_indices), path(interval), val(interval_id)
-
-
-    output:
-    path(".command.*")
-    tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs
-
-    script:
-    // Get split interval number to serve as task ID
-    interval_id = interval.baseName.split('-')[0]
-    output_filename = generate_standard_filename(
-        "GATK-${params.gatk_version}",
-        params.dataset_id,
-        params.patient_id,
-        [
-            'additional_information': "${interval_id}.vcf.gz"
-        ]
-    )
-    interval_str = "--intervals ${interval}"
-    bam_input_str = bams.collect{ "--input '${it}'" }.join(' ')
-    interval_padding = params.is_targeted ? "--interval-padding 100" : ""
-    """
-    set -euo pipefail
-
-    gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true -Djava.io.tmpdir=${workDir}" \
-        HaplotypeCaller \
-        ${bam_input_str} \
-        --output ${output_filename} \
-        --reference ${reference_fasta} \
-        --verbosity INFO \
-        --output-mode EMIT_VARIANTS_ONLY \
-        --dbsnp ${dbsnp_bundle} \
-        --sample-ploidy 2 \
-        --standard-min-confidence-threshold-for-calling 50 \
-        ${interval_str} \
-        ${interval_padding}
-    """
-}
-
 /*
     Nextflow module for calling haplotypes in GVCF mode