-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathnextflow_schema.json
509 lines (509 loc) · 27.8 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/epi2me-labs/wf-16s/master/nextflow_schema.json",
"title": "epi2me-labs/wf-16s",
"workflow_title": "16s workflow",
"description": "Taxonomic classification of single reads from amplicon-targeted sequencing.",
"demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-16s/wf-16s-demo.tar.gz",
"aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-16s/wf-16s-demo/aws.nextflow.config",
"url": "https://github.com/epi2me-labs/wf-16s",
"type": "object",
"resources": {
"recommended": {
"cpus": 12,
"memory": "32GB"
},
"minimum": {
"cpus": 6,
"memory": "16GB"
},
"run_time": "~40min for 1 million reads in total (24 barcodes) using Minimap2 and the ncbi_16s_18s database.",
"arm_support": true
},
"definitions": {
"input_options": {
"title": "Input Options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Define where the pipeline should find input data and save output data.",
"properties": {
"fastq": {
"type": "string",
"format": "path",
"title": "FASTQ",
"description": "FASTQ files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`.",
"demo_data": "test_data"
},
"bam": {
"type": "string",
"format": "path",
"description": "BAM or unaligned BAM (uBAM) files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"classifier": {
"type": "string",
"default": "minimap2",
"title": "Classification method",
"description": "Kraken2 or Minimap2 workflow to be used for classification of reads.",
"enum": [
"kraken2",
"minimap2"
],
"help_text": "Use Kraken2 for fast classification and minimap2 for finer resolution, see Readme for further info."
},
"analyse_unclassified": {
"type": "boolean",
"default": false,
"title": "Analyse unclassified reads",
"description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
"help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
},
"exclude_host": {
"type": "string",
"format": "file-path",
"title": "Exclude host reads",
"description": "A FASTA or MMI file of the host reference. Reads that align with this reference will be excluded from the analysis."
}
},
"oneOf": [
{
"required": [
"fastq"
]
},
{
"required": [
"bam"
]
}
]
},
"real_time_analysis_options": {
"title": "Real Time Analysis Options",
"type": "object",
"description": "Options relating to the default real-time Kraken2 workflow.",
"overrides": {
"epi2mecloud": {
"hidden": true
}
},
"properties": {
"real_time": {
"type": "boolean",
"title": "Enable real time processing",
"default": false,
"description": "Enable to continuously watch the input directory for new input files. Reads will be analysed as they appear",
"help_text": "This option enables the use of Nextflow\u2019s directory watching feature to constantly monitor input directories for new files. As soon as files are written by an external process Nextflow will begin analysing these files. The workflow will accumulate data over time to produce an updating report."
},
"batch_size": {
"type": "integer",
"default": 0,
"title": "Size of each batch",
"description": "Maximum number of sequence records to process in a batch.",
"help_text": "Large files will be split such that batch_size records are processed together. Set to 0 to avoid rebatching input files. A value of 32000 is recommended to rebatch large files."
},
"read_limit": {
"type": "integer",
"title": "Number of reads analysed to stop the workflow",
"description": "Stop processing data when a particular number of reads have been analysed. By default the workflow will run indefinitely.",
"help_text": "Sets the upper bound on the number of reads that will be analysed before the workflow is automatically stopped and no more data is analysed."
},
"port": {
"type": "integer",
"default": 8080,
"title": "Network port",
"description": "Network port for communication between Kraken2 server and clients (available in real time pipeline).",
"help_text": "The workflow uses a server process to handle Kraken2 classification requests. This allows the workflow to persist the sequence database in memory throughout the duration of processing. The option specifies the local network port on which the server and clients will communicate."
},
"host": {
"type": "string",
"default": "localhost",
"title": "Network hostname (or IP address)",
"description": "Network hostname (or IP address) for communication between Kraken2 server and clients. (See also 'external_kraken2' parameter). (Available in real time pipeline).",
"help_text": "The workflow uses a server process to handle Kraken2 classification requests. This allows the workflow to persist the sequence database in memory throughout the duration of processing. The option specifies the local network hostname (or IP address) of the Kraken server."
},
"external_kraken2": {
"type": "boolean",
"default": false,
"title": "External Kraken2 server",
"description": "Whether a pre-existing Kraken2 server should be used, rather than creating one as part of the workflow. (Available in real time pipeline).",
"help_text": "By default the workflow assumes that it is running on a single host computer, and further that it should start its own Kraken2 server. It may be desirable to start a Kraken2 server outside of the workflow, in which case this option should be enabled. This option may be used in conjunction with the `host` option to specify that the Kraken2 server is running on a remote computer. "
},
"server_threads": {
"type": "integer",
"default": 2,
"title": "CPU threads used by the Kraken2 server",
"description": "Number of CPU threads used by the Kraken2 server for classifying reads. (Available in the real_time pipeline).",
"help_text": "For the real-time Kraken2 workflow, this is the number of CPU threads used by the Kraken2 server for classifying reads."
},
"kraken_clients": {
"type": "integer",
"default": 2,
"title": "Clients used by the Kraken2 server",
"description": "Number of clients that can connect at once to the Kraken-server for classifying reads. (Available in the real_time pipeline).",
"help_text": "For the real-time Kraken2 workflow, this is the number of clients sending reads to the server. It should not be set to more than 4 fewer than the executor CPU limit."
}
}
},
"sample_options": {
"title": "Sample Options",
"type": "object",
"default": "",
"properties": {
"sample_sheet": {
"type": "string",
"format": "file-path",
"title": "Sample sheet",
"description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. Disabled in the real time pipeline.",
"help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode`,`alias`. Extra columns are allowed."
},
"sample": {
"type": "string",
"title": "Sample name",
"description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. Disabled in the real time pipeline."
}
},
"description": "Parameters that relate to samples such as sample sheets and sample names."
},
"reference_options": {
"title": "Reference Options",
"type": "object",
"description": "Files will be downloaded as part of the first run of workflow and automatically stored for subsequent runs.",
"default": "",
"properties": {
"database_set": {
"type": "string",
"default": "ncbi_16s_18s",
"title": "Choose a database",
"description": "Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1']. Workflow will require memory available to be slightly higher than the size of the database.",
"enum": [
"ncbi_16s_18s",
"ncbi_16s_18s_28s_ITS",
"SILVA_138_1"
],
"help_text": "This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options."
},
"store_dir": {
"type": "string",
"format": "directory-path",
"title": "Store directory name",
"description": "Where to store initial download of database.",
"help_text": "database set selected will be downloaded as part of the workflow and saved in this location, on subsequent runs it will use this as the database. ",
"hidden": true,
"default": "store_dir"
},
"database": {
"type": "string",
"format": "path",
"title": "Kraken2 database",
"description": "Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory].",
"help_text": "By default uses database chosen in database_set parameter.",
"overrides": {
"epi2mecloud": {
"hidden": true
}
}
},
"taxonomy": {
"type": "string",
"format": "path",
"title": "Taxonomy database",
"description": "Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file [.tar.gz or directory].",
"help_text": "By default NCBI taxonomy file will be downloaded and used."
},
"reference": {
"type": "string",
"format": "file-path",
"title": "Minimap2 reference",
"description": "Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index.",
"help_text": "This option should be used in conjunction with the database parameter to specify a custom database."
},
"ref2taxid": {
"type": "string",
"format": "file-path",
"title": "File linking reference IDs to specific taxids",
"description": "Not required but can be used to specify a ref2taxid mapping. Format is .tsv (refname taxid), no header row.",
"help_text": "By default uses ref2taxid for option chosen in database_set parameter."
},
"taxonomic_rank": {
"type": "string",
"default": "G",
"title": "Taxonomic rank",
"description": "Returns results at the taxonomic rank chosen. In the Kraken2 pipeline, this sets the level that Bracken will estimate abundance at. Default: G (genus). Other possible options are P (phylum), C (class), O (order), F (family), and S (species).",
"enum": [
"S",
"G",
"F",
"O",
"C",
"P"
]
}
},
"dependencies": {
"reference": [
"ref2taxid"
],
"ref2taxid": [
"reference"
]
}
},
"kraken2_options": {
"title": "Kraken2 Options",
"type": "object",
"fa_icon": "fas fa-university",
"help_text": "Kraken2: It is possible to enable classification by Kraken2, disabling alignment, which is a faster but coarser method of classification reliant on the presence of a Kraken2 database.",
"properties": {
"bracken_length": {
"type": "integer",
"title": "Bracken length",
"description": "Set the length value Bracken will use",
"minimum": 1,
"help_text": "Should be set to the length used to generate the kmer distribution file supplied in the Kraken database input directory. For the default datasets these will be set automatically. ncbi_16s_18s = 1000 , ncbi_16s_18s_28s_ITS = 1000 , PlusPF-8 = 300"
},
"bracken_threshold": {
"type": "integer",
"title": "Bracken minimum read threshold",
"description": "Set the minimum read threshold Bracken will use to consider a taxon",
"default": 10,
"minimum": 0,
"help_text": "Bracken will only consider taxa with a read count greater than or equal to this value."
},
"kraken2_memory_mapping": {
"type": "boolean",
"default": false,
"title": "Enable memory mapping",
"description": "Avoids loading database into RAM",
"help_text": "Kraken 2 will by default load the database into process-local RAM; this flag will avoid doing so. It may be useful if the available RAM memory is lower than the size of the chosen database."
},
"kraken2_confidence": {
"type": "number",
"default": 0.0,
"title": "Confidence score threshold",
"description": "Kraken2 Confidence score threshold. Default: 0.0. Valid interval: 0-1",
"help_text": "Apply a threshold to determine if a sequence is classified or unclassified. See the [kraken2 manual section on confidence scoring](https://github.com/DerrickWood/kraken2/wiki/Manual#confidence-scoring) for further details about how it works."
}
},
"description": "Kraken2 classification options. Only relevant if classifier parameter is set to kraken2"
},
"minimap2_options": {
"title": "Minimap2 Options",
"type": "object",
"fa_icon": "fas fa-dna",
"properties": {
"minimap2filter": {
"type": "string",
"title": "Select reads belonging to the following taxonomy identifiers (taxids)",
"description": "Filter output of minimap2 by taxids inc. child nodes, E.g. \"9606,1404\"",
"help_text": "Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs."
},
"minimap2exclude": {
"type": "boolean",
"default": false,
"title": "Exclude reads from previous selected taxids",
"description": "Invert minimap2filter and exclude the given taxids instead",
"help_text": "Exclude a list of taxids from analysis outputs."
},
"keep_bam": {
"type": "boolean",
"title": "Enable keep BAM files",
"default": false,
"description": "Copy bam files into the output directory."
},
"minimap2_by_reference": {
"type": "boolean",
"default": false,
"title": "Compute coverage and sequencing depth of the references.",
"description": "Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report"
},
"min_percent_identity": {
"type": "number",
"default": 95,
"minimum": 0,
"maximum": 100,
"title": "Filter taxa based on the percent of identity with the references.",
"description": "Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified."
},
"min_ref_coverage": {
"type": "number",
"default": 90,
"minimum": 0,
"maximum": 100,
"title": "Filter taxa based on the percent of coverage with the reference.",
"description": "Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths."
}
},
"description": "Minimap2 classification options. Only relevant if classifier parameter is set to minimap2.",
"help_text": "Minimap2: The default strategy is using minimap2 to perform full alignments against .fasta-formatted references sequences."
},
"report_options": {
"title": "Report Options",
"type": "object",
"fa_icon": "fas fa-pills",
"properties": {
"abundance_threshold": {
"type": "number",
"default": 1,
"title": "Abundance threshold",
"description": "Remove those taxa whose abundance is equal or lower than the chosen value.",
"help_text": "To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1."
},
"n_taxa_barplot": {
"type": "integer",
"default": 9,
"title": "Number of taxa to be displayed in the barplot",
"description": "Number of most abundant taxa to be displayed in the barplot. The remaining taxa will be grouped under the \"Other\" category."
}
}
},
"output_options": {
"title": "Output Options",
"type": "object",
"description": "Parameters for saving and naming workflow outputs.",
"default": "",
"properties": {
"out_dir": {
"type": "string",
"format": "directory-path",
"default": "output",
"title": "Output folder name",
"description": "Directory for output of all user-facing files."
},
"igv": {
"type": "boolean",
"default": false,
"title": "IGV",
"description": "Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index."
},
"include_read_assignments": {
"type": "boolean",
"default": false,
"title": "Include Kraken2/Minimap2 taxonomy per read.",
"description": "A per-sample TSV file that indicates the taxonomy assigned to each sequence. The TSVs will only be published to the output on completion of the workflow, and therefore if running indefinitely using the real time option, these files will never be published."
}
}
},
"advanced_options": {
"title": "Advanced Options",
"type": "object",
"description": "Advanced options for configuring processes inside the workflow.",
"default": "",
"properties": {
"min_len": {
"type": "integer",
"default": 800,
"title": "Minimum read length",
"description": "Specify read length lower limit.",
"help_text": "Any reads shorter than this limit will not be included in the analysis."
},
"min_read_qual": {
"type": "number",
"title": "Minimum read quality",
"description": "Specify read quality lower limit.",
"help_text": "Any reads with a quality lower than this limit will not be included in the analysis."
},
"max_len": {
"type": "integer",
"title": "Maximum read length",
"default": 2000,
"description": "Specify read length upper limit",
"help_text": "Any reads longer than this limit will not be included in the analysis."
},
"threads": {
"type": "integer",
"default": 4,
"title": "Number of CPU threads per workflow task",
"description": "Maximum number of CPU threads to use in each parallel workflow task.",
"help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline."
}
}
},
"miscellaneous_options": {
"title": "Miscellaneous Options",
"type": "object",
"fa_icon": "fas fa-file-import",
"description": "Everything else.",
"help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs. Typically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
"properties": {
"disable_ping": {
"type": "boolean",
"title": "Disable ping",
"default": false,
"description": "Enable to prevent sending a workflow ping.",
"hidden": false
},
"help": {
"type": "boolean",
"title": "Display help text",
"default": false,
"fa_icon": "fas fa-question-circle",
"hidden": true
},
"version": {
"type": "boolean",
"title": "Display version",
"default": false,
"description": "Display version and exit.",
"fa_icon": "fas fa-question-circle",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input_options"
},
{
"$ref": "#/definitions/real_time_analysis_options"
},
{
"$ref": "#/definitions/sample_options"
},
{
"$ref": "#/definitions/reference_options"
},
{
"$ref": "#/definitions/kraken2_options"
},
{
"$ref": "#/definitions/minimap2_options"
},
{
"$ref": "#/definitions/output_options"
},
{
"$ref": "#/definitions/advanced_options"
},
{
"$ref": "#/definitions/miscellaneous_options"
},
{
"$ref": "#/definitions/report_options"
}
],
"properties": {
"aws_image_prefix": {
"type": "string",
"title": "AWS image prefix",
"hidden": true
},
"aws_queue": {
"type": "string",
"title": "AWS queue",
"hidden": true
},
"monochrome_logs": {
"type": "boolean"
},
"validate_params": {
"type": "boolean",
"default": true
},
"show_hidden_params": {
"type": "boolean"
}
}
}