This repository has been archived by the owner on May 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathcram2filtered.wdl
160 lines (146 loc) · 7.53 KB
/
cram2filtered.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# This workflow takes an input CRAM to call variants with HaplotypeCaller
# Then filters the calls with the CNNVariant neural net tool
# The site-level scores are added to the INFO field of the VCF.
# The architecture arguments, info_key and tensor type arguments MUST be in agreement
# (e.g. 2D models must have tensor_type of read_tensor and info_key CNN_2D, 1D models have tensor_type reference and info_key CNN_1D)
# The INFO field key will be "CNN_1D" or "CNN_2D" depending on the neural net architecture used for inference.
# The architecture arguments specify pre-trained networks.
# New networks can be trained by the GATK tools: CNNVariantWriteTensors and CNNVariantTrain
# The CRAM could be generated by the single-sample pipeline
# (https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl)
# Also accepts a BAM as the input file in which case a BAM index is required as well.
#import "cnn_variant_common_tasks.wdl" as CNNTasks
import "https://raw.githubusercontent.com/gatk-workflows/gatk4-cnn-variant-filter/1.2.0/tasks/cnn_variant_common_tasks.wdl" as CNNTasks
workflow Cram2FilteredVcf {
# Pipeline stages: (optional CRAM->BAM) -> SplitIntervals -> scatter[HaplotypeCaller -> CNNScoreVariants]
# -> MergeVCFs -> FilterVariantTranches; per-shard bamouts are also merged via samtools.
File input_file # Aligned CRAM file or Aligned BAM files
File? input_file_index # Index for an aligned BAM file if that is the input, unneeded if input is a CRAM
File reference_fasta
File reference_dict
File reference_fasta_index
Array[File] resources # List of VCF file names of resources of known SNPs and INDELs, (e.g. mills, gnomAD)
Array[File] resources_index # List of VCF file indices of resources
File? architecture_json # Neural Net configuration for CNNScoreVariants
File? architecture_hd5 # Pre-Trained weights and architecture for CNNScoreVariants
Int? inference_batch_size # Batch size for python in CNNScoreVariants
Int? transfer_batch_size # Batch size for java in CNNScoreVariants
Int? intra_op_threads # Tensorflow threading within nodes
Int? inter_op_threads # Tensorflow threading between nodes
String output_prefix # Identifying string for this run will be used to name all output files
String? tensor_type # What kind of tensors the Neural Net expects (e.g. reference, read_tensor)
String info_key # The score key for the info field of the vcf (e.g. CNN_1D, CNN_2D)
String snp_tranches # Filtering threshold(s) for SNPs in terms of sensitivity to overlapping known variants in resources
String indel_tranches # Filtering threshold(s) for INDELs in terms of sensitivity to overlapping known variants in resources
File? gatk_override # GATK Jar file to over ride the one included in gatk_docker
String gatk_docker
File calling_intervals
Int scatter_count # Number of shards for parallelization of HaplotypeCaller and CNNScoreVariants
String extra_args # Extra arguments for HaplotypeCaller
# Runtime parameters
Int? mem_gb
Int? preemptible_attempts
Float? disk_space_gb
Int? cpu
Int? increase_disk_size
# Extra disk padding (GB) added to every task's disk estimate; defaults to 20.
Int additional_disk = select_first([increase_disk_size, 20])
Float ref_size = size(reference_fasta, "GB") + size(reference_fasta_index, "GB") + size(reference_dict, "GB")
# Clunky check to see if the input is a BAM or a CRAM
# basename(input_file, ".bam") strips a trailing ".bam" if present, so the two
# basenames are equal only when the input is NOT a .bam (i.e. it is a CRAM) and
# conversion is needed. For a .bam input this branch is skipped entirely.
if (basename(input_file) == basename(input_file, ".bam")){
call CNNTasks.CramToBam {
input:
reference_fasta = reference_fasta,
reference_dict = reference_dict,
reference_fasta_index = reference_fasta_index,
cram_file = input_file,
output_prefix = output_prefix,
disk_space_gb = round(4*size(input_file, "GB") + ref_size + additional_disk),
preemptible_attempts = preemptible_attempts
}
}
# Shard the calling intervals so HaplotypeCaller/CNNScoreVariants can run in parallel.
call CNNTasks.SplitIntervals {
input:
gatk_override = gatk_override,
scatter_count = scatter_count,
intervals = calling_intervals,
ref_fasta = reference_fasta,
ref_dict = reference_dict,
ref_fai = reference_fasta_index,
gatk_docker = gatk_docker,
disk_space = round(additional_disk + ref_size)
}
# If CramToBam ran, use its output; otherwise the input was already a BAM.
# (Declared as String; relies on draft-2 WDL's File-to-String coercion.)
String input_bam = select_first([CramToBam.output_bam, input_file])
Float bam_size = size(input_bam, "GB")
scatter (calling_interval in SplitIntervals.interval_files) {
# Per-shard variant calling; bamout is the assembled-haplotypes BAM needed by 2D CNN scoring.
call CNNTasks.RunHC4 {
input:
input_bam = input_bam,
input_bam_index = select_first([CramToBam.output_bam_index, input_file_index]),
reference_fasta = reference_fasta,
reference_dict = reference_dict,
reference_fasta_index = reference_fasta_index,
output_prefix = output_prefix,
interval_list = calling_interval,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
preemptible_attempts = preemptible_attempts,
extra_args = extra_args,
disk_space_gb = round(bam_size + ref_size + additional_disk)
}
# Annotate each shard's VCF with CNN scores in the INFO field (key = info_key).
call CNNTasks.CNNScoreVariants {
input:
input_vcf = RunHC4.raw_vcf,
input_vcf_index = RunHC4.raw_vcf_index,
bam_file = RunHC4.bamout,
bam_file_index = RunHC4.bamout_index,
architecture_json = architecture_json,
architecture_hd5 = architecture_hd5,
reference_fasta = reference_fasta,
tensor_type = tensor_type,
inference_batch_size = inference_batch_size,
transfer_batch_size = transfer_batch_size,
intra_op_threads = intra_op_threads,
inter_op_threads = inter_op_threads,
reference_dict = reference_dict,
reference_fasta_index = reference_fasta_index,
output_prefix = output_prefix,
interval_list = calling_interval,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
mem_gb = mem_gb,
# Each shard sees roughly bam_size/scatter_count of data, plus the reference.
disk_space_gb = round((bam_size/scatter_count) + ref_size + additional_disk)
}
}
# Gather the per-shard scored VCFs into one.
# NOTE(review): disk is sized only by additional_disk (default 20 GB); assumes the
# merged VCF fits — confirm for very large cohorts/intervals.
call CNNTasks.MergeVCFs as MergeVCF_HC4 {
input:
input_vcfs = CNNScoreVariants.cnn_annotated_vcf,
output_prefix = output_prefix,
gatk_override = gatk_override,
preemptible_attempts = preemptible_attempts,
gatk_docker = gatk_docker,
disk_space_gb = additional_disk
}
# Apply tranche filtering to the merged VCF using the CNN score (info_key) against
# the known-sites resources; snp_tranches/indel_tranches set the sensitivity cutoffs.
call CNNTasks.FilterVariantTranches {
input:
input_vcf = MergeVCF_HC4.merged_vcf,
input_vcf_index = MergeVCF_HC4.merged_vcf_index,
resources = resources,
resources_index = resources_index,
output_prefix = output_prefix,
snp_tranches = snp_tranches,
indel_tranches = indel_tranches,
info_key = info_key,
gatk_override = gatk_override,
preemptible_attempts = preemptible_attempts,
gatk_docker = gatk_docker,
disk_space_gb = additional_disk
}
# Merge the per-shard HaplotypeCaller bamouts into a single BAM.
call CNNTasks.SamtoolsMergeBAMs {
input:
input_bams = RunHC4.bamout,
output_prefix = output_prefix,
disk_space_gb = round(bam_size + ref_size + additional_disk)
}
output {
# Legacy draft-2 glob: exposes every output of the FilterVariantTranches call.
FilterVariantTranches.*
}
}