nextflow_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema",
    "$id": "https://raw.githubusercontent.com/epi2me-labs/wf-16s/master/nextflow_schema.json",
    "title": "epi2me-labs/wf-16s",
    "workflow_title": "16s workflow",
    "description": "Taxonomic classification of single reads from amplicon-targeted sequencing.",
    "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-16s/wf-16s-demo.tar.gz",
    "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-16s/wf-16s-demo/aws.nextflow.config",
    "url": "https://github.com/epi2me-labs/wf-16s",
    "type": "object",
    "resources": {
        "recommended": {
            "cpus": 12,
            "memory": "32GB"
        },
        "minimum": {
            "cpus": 6,
            "memory": "16GB"
        },
        "run_time": "~40min for 1 million reads in total (24 barcodes) using Minimap2 and the ncbi_16s_18s database.",
        "arm_support": true
    },
    "definitions": {
        "input_options": {
            "title": "Input Options",
            "type": "object",
            "fa_icon": "fas fa-terminal",
            "description": "Define where the pipeline should find input data and save output data.",
            "properties": {
                "fastq": {
                    "type": "string",
                    "format": "path",
                    "title": "FASTQ",
                    "description": "FASTQ files to use in the analysis.",
                    "help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`.",
                    "demo_data": "test_data"
                },
                "bam": {
                    "type": "string",
                    "format": "path",
                    "description": "BAM or unaligned BAM (uBAM) files to use in the analysis.",
                    "help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
                },
                "classifier": {
                    "type": "string",
                    "default": "minimap2",
                    "title": "Classification method",
                    "description": "Kraken2 or Minimap2 workflow to be used for classification of reads.",
                    "enum": [
                        "kraken2",
                        "minimap2"
                    ],
                    "help_text": "Use Kraken2 for fast classification and Minimap2 for finer resolution; see the README for further information."
                },
                "analyse_unclassified": {
                    "type": "boolean",
                    "default": false,
                    "title": "Analyse unclassified reads",
                    "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
                    "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
                },
                "exclude_host": {
                    "type": "string",
                    "format": "file-path",
                    "title": "Exclude host reads",
                    "description": "A FASTA or MMI file of the host reference. Reads that align with this reference will be excluded from the analysis."
                }
            },
            "oneOf": [
                {
                    "required": [
                        "fastq"
                    ]
                },
                {
                    "required": [
                        "bam"
                    ]
                }
            ]
        },
        "real_time_analysis_options": {
            "title": "Real Time Analysis Options",
            "type": "object",
            "description": "Options relating to the real-time Kraken2 workflow.",
            "overrides": {
                "epi2mecloud": {
                    "hidden": true
                }
            },
            "properties": {
                "real_time": {
                    "type": "boolean",
                    "title": "Enable real time processing",
                    "default": false,
                    "description": "Enable to continuously watch the input directory for new input files. Reads will be analysed as they appear.",
                    "help_text": "This option enables the use of Nextflow\u2019s directory watching feature to constantly monitor input directories for new files. As soon as files are written by an external process Nextflow will begin analysing these files. The workflow will accumulate data over time to produce an updating report."
                },
                "batch_size": {
                    "type": "integer",
                    "default": 0,
                    "title": "Size of each batch",
                    "description": "Maximum number of sequence records to process in a batch.",
                    "help_text": "Large files will be split such that batch_size records are processed together. Set to 0 to avoid rebatching input files. A value of 32000 is recommended to rebatch large files."
                },
                "read_limit": {
                    "type": "integer",
                    "title": "Number of reads analysed to stop the workflow",
                    "description": "Stop processing data when a particular number of reads have been analysed. By default the workflow will run indefinitely.",
                    "help_text": "Sets the upper bound on the number of reads that will be analysed before the workflow is automatically stopped and no more data is analysed."
                },
                "port": {
                    "type": "integer",
                    "default": 8080,
                    "title": "Network port",
                    "description": "Network port for communication between Kraken2 server and clients (available in the real time pipeline).",
                    "help_text": "The workflow uses a server process to handle Kraken2 classification requests. This allows the workflow to persist the sequence database in memory throughout the duration of processing. The option specifies the local network port on which the server and clients will communicate."
                },
                "host": {
                    "type": "string",
                    "default": "localhost",
                    "title": "Network hostname (or IP address)",
                    "description": "Network hostname (or IP address) for communication between Kraken2 server and clients. (See also 'external_kraken2' parameter). (Available in the real time pipeline).",
                    "help_text": "The workflow uses a server process to handle Kraken2 classification requests. This allows the workflow to persist the sequence database in memory throughout the duration of processing. The option specifies the local network hostname (or IP address) of the Kraken server."
                },
                "external_kraken2": {
                    "type": "boolean",
                    "default": false,
                    "title": "External Kraken2 server",
                    "description": "Whether a pre-existing Kraken2 server should be used, rather than creating one as part of the workflow. (Available in the real time pipeline).",
                    "help_text": "By default the workflow assumes that it is running on a single host computer, and further that it should start its own Kraken2 server. It may be desirable to start a Kraken2 server outside of the workflow, in which case this option should be enabled. This option may be used in conjunction with the `host` option to specify that the Kraken2 server is running on a remote computer."
                },
                "server_threads": {
                    "type": "integer",
                    "default": 2,
                    "title": "CPU threads used by the Kraken2 server",
                    "description": "Number of CPU threads used by the Kraken2 server for classifying reads. (Available in the real_time pipeline).",
                    "help_text": "For the real-time Kraken2 workflow, this is the number of CPU threads used by the Kraken2 server for classifying reads."
                },
                "kraken_clients": {
                    "type": "integer",
                    "default": 2,
                    "title": "Clients used by the Kraken2 server",
                    "description": "Number of clients that can connect at once to the Kraken2 server for classifying reads. (Available in the real_time pipeline).",
                    "help_text": "For the real-time Kraken2 workflow, this is the number of clients sending reads to the server. It should be set to at most the executor CPU limit minus 4."
                }
            }
        },
        "sample_options": {
            "title": "Sample Options",
            "type": "object",
            "default": "",
            "properties": {
                "sample_sheet": {
                    "type": "string",
                    "format": "file-path",
                    "title": "Sample sheet",
                    "description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. Disabled in the real time pipeline.",
                    "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode`,`alias`. Extra columns are allowed."
                },
                "sample": {
                    "type": "string",
                    "title": "Sample name",
                    "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. Disabled in the real time pipeline."
                }
            },
            "description": "Parameters that relate to samples such as sample sheets and sample names."
        },
        "reference_options": {
            "title": "Reference Options",
            "type": "object",
            "description": "Files will be downloaded as part of the first run of the workflow and automatically stored for subsequent runs.",
            "default": "",
            "properties": {
                "database_set": {
                    "type": "string",
                    "default": "ncbi_16s_18s",
                    "title": "Choose a database",
                    "description": "Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s', 'ncbi_16s_18s_28s_ITS', 'SILVA_138_1']. The workflow requires available memory to be slightly higher than the size of the database.",
                    "enum": [
                        "ncbi_16s_18s",
                        "ncbi_16s_18s_28s_ITS",
                        "SILVA_138_1"
                    ],
                    "help_text": "This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options."
                },
                "store_dir": {
                    "type": "string",
                    "format": "directory-path",
                    "title": "Store directory name",
                    "description": "Where to store initial download of database.",
                    "help_text": "The selected database set will be downloaded as part of the workflow and saved in this location; on subsequent runs the workflow will use this stored copy as the database.",
                    "hidden": true,
                    "default": "store_dir"
                },
                "database": {
                    "type": "string",
                    "format": "path",
                    "title": "Kraken2 database",
                    "description": "Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory].",
                    "help_text": "By default uses database chosen in database_set parameter.",
                    "overrides": {
                        "epi2mecloud": {
                            "hidden": true
                        }
                    }
                },
                "taxonomy": {
                    "type": "string",
                    "format": "path",
                    "title": "Taxonomy database",
                    "description": "Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file [.tar.gz or directory].",
                    "help_text": "By default NCBI taxonomy file will be downloaded and used."
                },
                "reference": {
                    "type": "string",
                    "format": "file-path",
                    "title": "Minimap2 reference",
                    "description": "Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index.",
                    "help_text": "This option should be used in conjunction with the database parameter to specify a custom database."
                },
                "ref2taxid": {
                    "type": "string",
                    "format": "file-path",
                    "title": "File linking reference IDs to specific taxids",
                    "description": "Not required but can be used to specify a ref2taxid mapping. Format is .tsv (refname  taxid), no header row.",
                    "help_text": "By default uses ref2taxid for option chosen in database_set parameter."
                },
                "taxonomic_rank": {
                    "type": "string",
                    "default": "G",
                    "title": "Taxonomic rank",
                    "description": "Returns results at the taxonomic rank chosen. In the Kraken2 pipeline, this sets the level that Bracken will estimate abundance at. Default: G (genus). Other possible options are P (phylum), C (class), O (order), F (family), and S (species).",
                    "enum": [
                        "S",
                        "G",
                        "F",
                        "O",
                        "C",
                        "P"
                    ]
                }
            },
            "dependencies": {
                "reference": [
                    "ref2taxid"
                ],
                "ref2taxid": [
                    "reference"
                ]
            }
        },
        "kraken2_options": {
            "title": "Kraken2 Options",
            "type": "object",
            "fa_icon": "fas fa-university",
            "help_text": "Kraken2: It is possible to enable classification by Kraken2, disabling alignment, which is a faster but coarser method of classification reliant on the presence of a Kraken2 database.",
            "properties": {
                "bracken_length": {
                    "type": "integer",
                    "title": "Bracken length",
                    "description": "Set the length value Bracken will use",
                    "minimum": 1,
                    "help_text": "Should be set to the length used to generate the kmer distribution file supplied in the Kraken database input directory. For the default datasets these will be set automatically. ncbi_16s_18s = 1000, ncbi_16s_18s_28s_ITS = 1000, PlusPF-8 = 300."
                },
                "bracken_threshold": {
                    "type": "integer",
                    "title": "Bracken minimum read threshold",
                    "description": "Set the minimum read threshold Bracken will use to consider a taxon",
                    "default": 10,
                    "minimum": 0,
                    "help_text": "Bracken will only consider taxa with a read count greater than or equal to this value."
                },
                "kraken2_memory_mapping": {
                    "type": "boolean",
                    "default": false,
                    "title": "Enable memory mapping",
                    "description": "Avoids loading database into RAM",
                    "help_text": "Kraken2 will by default load the database into process-local RAM; this flag will avoid doing so. It may be useful if the available RAM is lower than the size of the chosen database."
                },
                "kraken2_confidence": {
                    "type": "number",
                    "default": 0.0,
                    "title": "Confidence score threshold",
                    "description": "Kraken2 Confidence score threshold. Default: 0.0. Valid interval: 0-1",
                    "help_text": "Apply a threshold to determine if a sequence is classified or unclassified. See the [kraken2 manual section on confidence scoring](https://github.com/DerrickWood/kraken2/wiki/Manual#confidence-scoring) for further details about how it works."
                }
            },
            "description": "Kraken2 classification options. Only relevant if classifier parameter is set to kraken2"
        },
        "minimap2_options": {
            "title": "Minimap2 Options",
            "type": "object",
            "fa_icon": "fas fa-dna",
            "properties": {
                "minimap2filter": {
                    "type": "string",
                    "title": "Select reads belonging to the following taxonomy identifiers (taxids)",
                    "description": "Filter output of minimap2 by taxids, including child nodes, e.g. \"9606,1404\".",
                    "help_text": "Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs."
                },
                "minimap2exclude": {
                    "type": "boolean",
                    "default": false,
                    "title": "Exclude reads from previously selected taxids",
                    "description": "Invert minimap2filter and exclude the given taxids instead",
                    "help_text": "Exclude a list of taxids from analysis outputs."
                },
                "keep_bam": {
                    "type": "boolean",
                    "title": "Keep BAM files",
                    "default": false,
                    "description": "Copy bam files into the output directory."
                },
                "minimap2_by_reference": {
                    "type": "boolean",
                    "default": false,
                    "title": "Compute coverage and sequencing depth of the references.",
                    "description": "Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report"
                },
                "min_percent_identity": {
                    "type": "number",
                    "default": 95,
                    "minimum": 0,
                    "maximum": 100,
                    "title": "Filter taxa based on the percent of identity with the references.",
                    "description": "Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified."
                },
                "min_ref_coverage": {
                    "type": "number",
                    "default": 90,
                    "minimum": 0,
                    "maximum": 100,
                    "title": "Filter taxa based on the percent of coverage with the reference.",
                    "description": "Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths."
                }
            },
            "description": "Minimap2 classification options. Only relevant if classifier parameter is set to minimap2.",
            "help_text": "Minimap2: The default strategy is using minimap2 to perform full alignments against .fasta-formatted reference sequences."
        },
        "report_options": {
            "title": "Report Options",
            "type": "object",
            "fa_icon": "fas fa-pills",
            "properties": {
                "abundance_threshold": {
                    "type": "number",
                    "default": 1,
                    "title": "Abundance threshold",
                    "description": "Remove those taxa whose abundance is equal to or lower than the chosen value.",
                    "help_text": "To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than or equal to 1."
                },
                "n_taxa_barplot": {
                    "type": "integer",
                    "default": 9,
                    "title": "Number of taxa to be displayed in the barplot",
                    "description": "Number of most abundant taxa to be displayed in the barplot. The remaining taxa will be grouped under the \"Other\" category."
                }
            }
        },
        "output_options": {
            "title": "Output Options",
            "type": "object",
            "description": "Parameters for saving and naming workflow outputs.",
            "default": "",
            "properties": {
                "out_dir": {
                    "type": "string",
                    "format": "directory-path",
                    "default": "output",
                    "title": "Output folder name",
                    "description": "Directory for output of all user-facing files."
                },
                "igv": {
                    "type": "boolean",
                    "default": false,
                    "title": "IGV",
                    "description": "Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index."
                },
                "include_read_assignments": {
                    "type": "boolean",
                    "default": false,
                    "title": "Include Kraken2/Minimap2 taxonomy per read.",
                    "description": "A per-sample TSV file that indicates the taxonomy assigned to each sequence. The TSVs will only be published to the output on completion of the workflow, and therefore if running indefinitely using the real time option, these files will never be published."
                }
            }
        },
        "advanced_options": {
            "title": "Advanced Options",
            "type": "object",
            "description": "Advanced options for configuring processes inside the workflow.",
            "default": "",
            "properties": {
                "min_len": {
                    "type": "integer",
                    "default": 800,
                    "title": "Minimum read length",
                    "description": "Specify read length lower limit.",
                    "help_text": "Any reads shorter than this limit will not be included in the analysis."
                },
                "min_read_qual": {
                    "type": "number",
                    "title": "Minimum read quality",
                    "description": "Specify read quality lower limit.",
                    "help_text": "Any reads with a quality lower than this limit will not be included in the analysis."
                },
                "max_len": {
                    "type": "integer",
                    "title": "Maximum read length",
                    "default": 2000,
                    "description": "Specify read length upper limit",
                    "help_text": "Any reads longer than this limit will not be included in the analysis."
                },
                "threads": {
                    "type": "integer",
                    "default": 4,
                    "title": "Number of CPU threads per workflow task",
                    "description": "Maximum number of CPU threads to use in each parallel workflow task.",
                    "help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See the server_threads parameter for Kraken2-specific threads in the real_time pipeline."
                }
            }
        },
        "miscellaneous_options": {
            "title": "Miscellaneous Options",
            "type": "object",
            "fa_icon": "fas fa-file-import",
            "description": "Everything else.",
            "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs. Typically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
            "properties": {
                "disable_ping": {
                    "type": "boolean",
                    "title": "Disable ping",
                    "default": false,
                    "description": "Enable to prevent sending a workflow ping.",
                    "hidden": false
                },
                "help": {
                    "type": "boolean",
                    "title": "Display help text",
                    "default": false,
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true
                },
                "version": {
                    "type": "boolean",
                    "title": "Display version",
                    "default": false,
                    "description": "Display version and exit.",
                    "fa_icon": "fas fa-question-circle",
                    "hidden": true
                }
            }
        }
    },
    "allOf": [
        {
            "$ref": "#/definitions/input_options"
        },
        {
            "$ref": "#/definitions/real_time_analysis_options"
        },
        {
            "$ref": "#/definitions/sample_options"
        },
        {
            "$ref": "#/definitions/reference_options"
        },
        {
            "$ref": "#/definitions/kraken2_options"
        },
        {
            "$ref": "#/definitions/minimap2_options"
        },
        {
            "$ref": "#/definitions/output_options"
        },
        {
            "$ref": "#/definitions/advanced_options"
        },
        {
            "$ref": "#/definitions/miscellaneous_options"
        },
        {
            "$ref": "#/definitions/report_options"
        }
    ],
    "properties": {
        "aws_image_prefix": {
            "type": "string",
            "title": "AWS image prefix",
            "hidden": true
        },
        "aws_queue": {
            "type": "string",
            "title": "AWS queue",
            "hidden": true
        },
        "monochrome_logs": {
            "type": "boolean"
        },
        "validate_params": {
            "type": "boolean",
            "default": true
        },
        "show_hidden_params": {
            "type": "boolean"
        }
    }
}