This repository has been archived by the owner on May 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpathseq-build-microbe-reference.wdl
229 lines (199 loc) · 6.79 KB
/
pathseq-build-microbe-reference.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
###############################################################
##
## PathSeq Microbe Reference Build WDL
##
###############################################################
##
## Builds a microbe reference for use with PathSeq
##
## For further info see the GATK Documentation for the PathSeqPipelineSpark tool:
## https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_pathseq_PathSeqPipelineSpark.php
##
###############################################################
##
## Input requirements :
## - FASTA file containing microbe sequences from NCBI RefSeq
##
## Output:
## - FASTA index and dictionary files
## - GATK BWA-MEM index image
## - PathSeq taxonomy file
##
###############################################################
# WORKFLOW DEFINITION
workflow PathSeqBuildMicrobeReferenceWorkflow {
  # ---- Required inputs ----
  # FASTA of microbe sequences that the reference is built from
  File microbe_fasta
  # Docker image handed to every task call below
  String gatk_docker

  # ---- Optional tool settings ----
  # Forwarded to BuildPathSeqTaxonomyFile only
  Int? min_non_virus_contig_length
  # Alternative GATK jar, forwarded to every task call
  File? gatk4_jar_override

  # ---- Optional per-task resource overrides ----
  # Each task falls back to its own default when a value is left unset
  Int? index_fasta_mem_gb
  Int? index_fasta_disk_gb
  Int? bwa_mem_index_mem_gb
  Int? bwa_mem_index_disk_gb
  Int? build_taxonomy_mem_gb
  Int? build_taxonomy_disk_gb
  # Shared by all three task calls
  Int? preemptible_attempts

  # Step 1: produce the FASTA index (.fai) and sequence dictionary
  call IndexFasta {
    input:
      fasta_file = microbe_fasta,
      gatk_docker = gatk_docker,
      gatk4_jar_override = gatk4_jar_override,
      mem_gb = index_fasta_mem_gb,
      disk_space_gb = index_fasta_disk_gb,
      preemptible_attempts = preemptible_attempts
  }

  # Step 2: build the BWA-MEM index image; consumes the .fai from step 1
  call BuildBwaMemIndexImage {
    input:
      fasta_file = microbe_fasta,
      fai_file = IndexFasta.output_fai_file,
      gatk_docker = gatk_docker,
      gatk4_jar_override = gatk4_jar_override,
      mem_gb = bwa_mem_index_mem_gb,
      disk_space_gb = bwa_mem_index_disk_gb,
      preemptible_attempts = preemptible_attempts
  }

  # Step 3: build the PathSeq taxonomy file; consumes the .fai and
  # dictionary from step 1
  call BuildPathSeqTaxonomyFile {
    input:
      fasta_file = microbe_fasta,
      fai_file = IndexFasta.output_fai_file,
      dict_file = IndexFasta.output_dict_file,
      min_non_virus_contig_length = min_non_virus_contig_length,
      gatk_docker = gatk_docker,
      gatk4_jar_override = gatk4_jar_override,
      mem_gb = build_taxonomy_mem_gb,
      disk_space_gb = build_taxonomy_disk_gb,
      preemptible_attempts = preemptible_attempts
  }

  # Workflow-level outputs re-export the products of the three tasks
  output {
    File output_fai_file = IndexFasta.output_fai_file
    File output_dict_file = IndexFasta.output_dict_file
    File output_img_file = BuildBwaMemIndexImage.output_img_file
    File output_taxonomy_file = BuildPathSeqTaxonomyFile.output_taxonomy_file
  }
}
# TASK DEFINITIONS
# Creates the samtools FASTA index (.fai) and the GATK sequence dictionary
# (.dict) for the given FASTA file.
#
# Inputs:
#   fasta_file            FASTA to index
#   gatk_docker           docker image the command runs in
#   gatk4_jar_override    optional GATK jar; defaults to /root/gatk.jar
#   mem_gb                optional machine memory in GB (default 7)
#   disk_space_gb         optional disk size in GB (default FASTA size + 20)
#   preemptible_attempts  optional preemptible retries (default 3)
# Outputs:
#   output_fai_file       <fasta name>.fai
#   output_dict_file      <fasta name without extension>.dict
task IndexFasta {
  # Inputs for this task
  File fasta_file
  String gatk_docker
  File? gatk4_jar_override
  Int? mem_gb
  Int? preemptible_attempts
  Int? disk_space_gb

  # Disk size: FASTA size rounded up to whole GB, plus 20 GB of headroom
  Int fasta_size_gb = ceil(size(fasta_file, "GB"))
  Int default_disk_space_gb = fasta_size_gb + 20

  # Mem is in units of GB but our command and memory runtime values are in MB
  Int default_mem_gb = 7
  Int machine_mem = if defined(mem_gb) then mem_gb*1000 else default_mem_gb*1000
  # Leave 1000 MB of the machine memory outside the JVM heap
  Int command_mem = machine_mem - 1000

  String fasta_filename = basename(fasta_file)
  String fai_path = fasta_filename + ".fai"
  # Strip a recognised FASTA extension, then always append ".dict".
  # The previous single substitution left the name untouched for any other
  # extension (e.g. ".fna"), so the dictionary path was the FASTA itself.
  # Results for ".fasta" and ".fa" are unchanged.
  String dict_path = sub(fasta_filename, "\\.(fasta|fa|fna)$", "") + ".dict"

  command <<<
    set -e
    mv ${fasta_file} .
    export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
    samtools faidx ${fasta_filename}
    gatk --java-options "-Xmx${command_mem}m" CreateSequenceDictionary -R ${fasta_filename} -O ${dict_path}
  >>>

  runtime {
    docker: gatk_docker
    memory: machine_mem + " MB"
    # Note that the space before SSD and HDD should be included.
    disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_fai_file = "${fai_path}"
    File output_dict_file = "${dict_path}"
  }
}
# Runs GATK BwaMemIndexImageCreator on the FASTA and returns the resulting
# "<fasta name>.img" file.
#
# Inputs:
#   fasta_file            FASTA to build the index image from
#   fai_file              FASTA index, moved next to the FASTA before running
#   gatk_docker           docker image the command runs in
#   gatk4_jar_override    optional GATK jar; defaults to /root/gatk.jar
#   mem_gb                optional machine memory in GB
#                         (default 4 x FASTA size in GB + 8)
#   disk_space_gb         optional disk size in GB
#                         (default 3 x FASTA size in GB + 20)
#   preemptible_attempts  optional preemptible retries (default 3)
task BuildBwaMemIndexImage {
  # Required inputs
  File fasta_file
  File fai_file
  String gatk_docker

  # Optional inputs
  File? gatk4_jar_override
  Int? mem_gb
  Int? disk_space_gb
  Int? preemptible_attempts

  # Output is named after the FASTA, with ".img" appended
  String fasta_filename = basename(fasta_file)
  String img_path = fasta_filename + ".img"

  # Both resource defaults scale with the FASTA size, rounded up to whole GB
  Int fasta_size_gb = ceil(size(fasta_file, "GB"))
  Int default_disk_space_gb = (fasta_size_gb * 3) + 20
  Int default_mem_gb = (fasta_size_gb * 4) + 8

  # Memory is requested in GB but the runtime and JVM values are in MB
  Int machine_mem = select_first([mem_gb, default_mem_gb]) * 1000
  # Keep 4000 MB of the machine memory outside the JVM heap
  Int command_mem = machine_mem - 4000

  command <<<
    set -e
    mv ${fasta_file} .
    mv ${fai_file} .
    export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
    gatk --java-options "-Xmx${command_mem}m" BwaMemIndexImageCreator -I ${fasta_filename}
  >>>

  runtime {
    docker: gatk_docker
    memory: machine_mem + " MB"
    # The disk type must be separated from the size by a space
    disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_img_file = "${img_path}"
  }
}
# Downloads the NCBI RefSeq release catalog and taxonomy dump, then runs
# GATK PathSeqBuildReferenceTaxonomy against the FASTA to produce
# "<fasta name>.db".
#
# Inputs:
#   fasta_file                   reference FASTA
#   fai_file, dict_file          FASTA index and sequence dictionary, moved
#                                next to the FASTA before running
#   min_non_virus_contig_length  optional; 5000 is passed when unset
#   gatk_docker                  docker image the command runs in
#   gatk4_jar_override           optional GATK jar; defaults to /root/gatk.jar
#   mem_gb                       optional machine memory in GB (default 30)
#   disk_space_gb                optional disk size in GB
#                                (default FASTA size + 20)
#   preemptible_attempts         optional preemptible retries (default 3)
task BuildPathSeqTaxonomyFile {
  # Required inputs
  File fasta_file
  File fai_file
  File dict_file
  String gatk_docker

  # Optional inputs
  Int? min_non_virus_contig_length
  File? gatk4_jar_override
  Int? mem_gb
  Int? disk_space_gb
  Int? preemptible_attempts

  # Working file names: the output database plus the two NCBI downloads
  String fasta_filename = basename(fasta_file)
  String taxonomy_file = fasta_filename + ".db"
  String catalog_file = "catalog.gz"
  String taxdump_file = "taxdump.tar.gz"

  # Default disk is the FASTA size rounded up to whole GB, plus 20 GB
  Int fasta_size_gb = ceil(size(fasta_file, "GB"))
  Int default_disk_space_gb = fasta_size_gb + 20

  # Memory is requested in GB but the runtime and JVM values are in MB
  Int default_mem_gb = 30
  Int machine_mem = select_first([mem_gb, default_mem_gb]) * 1000
  # Keep 4000 MB of the machine memory outside the JVM heap
  Int command_mem = machine_mem - 4000

  command <<<
    set -e
    mv ${fasta_file} .
    mv ${fai_file} .
    mv ${dict_file} .
    export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
    wget -O ${catalog_file} "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/RefSeq-release*.catalog.gz"
    wget -O ${taxdump_file} "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
    gatk --java-options "-Xmx${command_mem}m" \
      PathSeqBuildReferenceTaxonomy \
      --reference ${fasta_filename} \
      --output ${taxonomy_file} \
      --refseq-catalog ${catalog_file} \
      --tax-dump ${taxdump_file} \
      --min-non-virus-contig-length ${select_first([min_non_virus_contig_length, 5000])}
  >>>

  runtime {
    docker: gatk_docker
    memory: machine_mem + " MB"
    # The disk type must be separated from the size by a space
    disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_taxonomy_file = "${taxonomy_file}"
  }
}