diff --git a/CHANGELOG.md b/CHANGELOG.md index 86196de0..1de2f2a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#266](https://github.com/genomic-medicine-sweden/nallo/pull/266) - Added CADD to dynamically calculate indel CADD-scores - [#270](https://github.com/genomic-medicine-sweden/nallo/pull/270) - Added SNV phasing stats to MultiQC - [#271](https://github.com/genomic-medicine-sweden/nallo/pull/271) - Added a `--skip_aligned_read_qc` parameter to skip the qc aligned reads subworkflow +- [#314](https://github.com/genomic-medicine-sweden/nallo/pull/314) - Added a `--vep_plugin_files` parameter to separate VEP plugins from cache ### `Changed` @@ -89,6 +90,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | `--split_fastq` | `--parallel_alignments` | | `--extra_gvcfs` | | | `--extra_snfs` | | +| | `--vep_plugin_files` | > [!NOTE] > Parameter has been updated if both old and new parameter information is present. 
diff --git a/assets/vep_plugin_files_schema.json b/assets/vep_plugin_files_schema.json new file mode 100644 index 00000000..d904317b --- /dev/null +++ b/assets/vep_plugin_files_schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/vep_plugin_files_schema.json", + "title": "Schema for VEP plugin files and their indices", + "description": "Schema for VEP plugin files and their indices", + "type": "array", + "items": { + "type": "object", + "properties": { + "vep_files": { + "type": "string", + "anyOf": [ + { + "format": "file-path" + }, + { + "format": "directory-path" + } + ], + "exists": true, + "description": "Path to vep plugin files and their indices" + } + }, + "required": ["vep_files"] + } +} diff --git a/conf/modules/snv_annotation.config b/conf/modules/snv_annotation.config index 393e4cfd..34e6748e 100644 --- a/conf/modules/snv_annotation.config +++ b/conf/modules/snv_annotation.config @@ -39,10 +39,10 @@ process { withName: '.*:SNV_ANNOTATION:ENSEMBLVEP_VEP' { ext.prefix = { "${meta.id}_vep" } ext.args = { [ - "--dir_plugins ${cache}/Plugins", - "--plugin LoFtool,${cache}/LoFtool_scores.txt", - "--plugin pLI,${cache}/pLI_values.txt", - "--plugin SpliceAI,snv=${cache}/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=${cache}/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz", + "--dir_plugins .", + "--plugin LoFtool,LoFtool_scores.txt", + "--plugin pLI,pLI_values.txt", + "--plugin SpliceAI,snv=spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=spliceai_21_scores_raw_indel_-v1.3-.vcf.gz", '--distance 5000', '--buffer_size 20000', '--format vcf --max_sv_size 248387328', diff --git a/conf/test.config b/conf/test.config index 7c96ecd8..2d91812b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -40,8 +40,9 @@ params { variant_catalog = params.pipelines_testdata_base_path + 'nallo/reference/variant_catalog_grch38.json' // SNV Annotation - vep_cache = 
params.pipelines_testdata_base_path + 'nallo/reference/vep_cache_test_data.tar.gz' - snp_db = params.pipelines_testdata_base_path + 'nallo/testdata/snp_dbs.csv' + vep_cache = params.pipelines_testdata_base_path + 'nallo/reference/vep_cache_test_data.tar.gz' + vep_plugin_files = params.pipelines_testdata_base_path + 'nallo/reference/vep_plugin_files.csv' + snp_db = params.pipelines_testdata_base_path + 'nallo/testdata/snp_dbs.csv' // Rank variants reduced_penetrance = params.pipelines_testdata_base_path + 'nallo/reference/reduced_penetrance.tsv' diff --git a/docs/usage.md b/docs/usage.md index 4b80d336..1fa1b629 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -112,6 +112,23 @@ Some workflows require additional files: - If running without `--skip_repeat_annotation`, download a json variant catalog, (e.g. [variant_catalog_grch38.json](https://github.com/Clinical-Genomics/stranger/raw/main/stranger/resources/variant_catalog_grch38.json)) matching your reference genome to supply with `--variant_catalog`. - If running without `--skip_snv_annotation`, download [VEP cache](https://ftp.ensembl.org/pub/release-110/variation/vep/homo_sapiens_vep_110_GRCh38.tar.gz) to supply with `--vep_cache` and prepare a samplesheet with annotation databases ([`echtvar encode`](https://github.com/brentp/echtvar)) to supply with `--snp_db`: +- If running without `--skip_snv_annotation`, you will also need to download VEP plugin files to supply with `--vep_plugin_files` (see [example](https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugin_files.csv)). pLI, LoFtool and SpliceAI are required. 
+ +``` +vep_files +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/SpliceAI.pm +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/LoFtool.pm +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/dbNSFP.pm +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/plugin_config.txt +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz.tbi +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz.tbi +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/pLI_values.txt +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/pLI.pm +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/MaxEntScan.pm +https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_plugins/LoFtool_scores.txt +``` ``` sample,file @@ -247,6 +264,7 @@ Different processes may need extra input files | `trgt_repeats` | BED-file for repeats to be genotyped | `string` | | | | | `snp_db` | Extra echtvar-databases to annotate SNVs with | `string` | | | | | `vep_cache` | Path to directory of vep_cache | `string` | | | | +| `vep_plugin_files` | A csv file with paths to VEP plugin files. Files for pLI, LoFtool and SpliceAI are required. 
| `string` | | | | | `bed` | BED file with regions of interest | `string` | | | | | `hificnv_xy` | | `string` | | | | | `hificnv_xx` | | `string` | | | | diff --git a/nextflow.config b/nextflow.config index 1f3fd481..2eb33c30 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { snp_db = null variant_consequences_snv = null vep_cache = null + vep_plugin_files = null hificnv_xy = null hificnv_xx = null hificnv_exclude = null diff --git a/nextflow_schema.json b/nextflow_schema.json index aea8e485..b2ba20b5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -383,6 +383,12 @@ "default": 110, "description": "VEP cache version" }, + "vep_plugin_files": { + "type": "string", + "mimetype": "text/csv", + "description": "A csv file with paths to VEP plugin files. Files for pLI, LoFtool and SpliceAI are required.", + "schema": "assets/vep_plugin_files_schema.json" + }, "deepvariant_model_type": { "type": "string", "default": "PACBIO", diff --git a/subworkflows/local/snv_annotation/main.nf b/subworkflows/local/snv_annotation/main.nf index 16994f41..1bcc22e7 100644 --- a/subworkflows/local/snv_annotation/main.nf +++ b/subworkflows/local/snv_annotation/main.nf @@ -13,6 +13,7 @@ workflow SNV_ANNOTATION { ch_fai // channel: [mandatory] [ val(meta), path(fai) ] ch_vep_cache // channel: [mandatory] [ path(cache) ] val_vep_cache_version // string: [mandatory] default: 110 + ch_vep_extra_files // channel: [mandatory] [ path(files) ] val_annotate_cadd // bool: [mandatory] ch_cadd_header // channel: [mandatory] [ path(txt) ] ch_cadd_resources // channel: [mandatory] [ path(annotation) ] @@ -59,7 +60,7 @@ workflow SNV_ANNOTATION { val_vep_cache_version, ch_vep_cache, ch_fasta, - [] + ch_vep_extra_files ) ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions) diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf index 8e1c2697..716ded53 100644 --- 
a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf @@ -73,7 +73,7 @@ def workflowDependencies = [ def fileDependencies = [ mapping : ["fasta", "somalier_sites"], assembly : ["fasta", "dipcall_par"], // The assembly workflow should be split into two - assembly and variant calling (requires ref) - snv_annotation : ["snp_db", "vep_cache", "reduced_penetrance", "score_config_snv", "variant_consequences_snv"], + snv_annotation : ["snp_db", "vep_cache", "vep_plugin_files", "reduced_penetrance", "score_config_snv", "variant_consequences_snv"], cnv_calling : ["hificnv_xy", "hificnv_xx", "hificnv_exclude"], repeat_calling : ["trgt_repeats"], repeat_annotation: ["variant_catalog"], diff --git a/tests/main.nf.test b/tests/main.nf.test index ab94bb8c..240e9485 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -23,6 +23,7 @@ nextflow_pipeline { trgt_repeats = params.pipelines_testdata_base_path + 'nallo/reference/pathogenic_repeats.hg38.bed' variant_catalog = params.pipelines_testdata_base_path + 'nallo/reference/variant_catalog_grch38.json' vep_cache = params.pipelines_testdata_base_path + 'nallo/reference/vep_cache_test_data.tar.gz' + vep_plugin_files = params.pipelines_testdata_base_path + 'nallo/reference/vep_plugin_files.csv' snp_db = params.pipelines_testdata_base_path + 'nallo/testdata/snp_dbs.csv' somalier_sites = params.pipelines_testdata_base_path + 'nallo/reference/somalier_sites.vcf.gz' reduced_penetrance = params.pipelines_testdata_base_path + 'nallo/reference/reduced_penetrance.tsv' @@ -155,6 +156,7 @@ nextflow_pipeline { trgt_repeats = params.pipelines_testdata_base_path + 'nallo/reference/pathogenic_repeats.hg38.bed' variant_catalog = params.pipelines_testdata_base_path + 'nallo/reference/variant_catalog_grch38.json' vep_cache = params.pipelines_testdata_base_path + 'nallo/reference/vep_cache_test_data.tar.gz' + vep_plugin_files = params.pipelines_testdata_base_path + 
'nallo/reference/vep_plugin_files.csv' snp_db = params.pipelines_testdata_base_path + 'nallo/testdata/snp_dbs.csv' somalier_sites = params.pipelines_testdata_base_path + 'nallo/reference/somalier_sites.vcf.gz' reduced_penetrance = params.pipelines_testdata_base_path + 'nallo/reference/reduced_penetrance.tsv' diff --git a/workflows/nallo.nf b/workflows/nallo.nf index 620af098..5403fa41 100644 --- a/workflows/nallo.nf +++ b/workflows/nallo.nf @@ -91,6 +91,8 @@ workflow NALLO { : Channel.value([]) ch_vep_cache_unprocessed = params.vep_cache ? Channel.fromPath(params.vep_cache).map { it -> [ [ id:'vep_cache' ], it ] }.collect() : Channel.value([[],[]]) + ch_vep_extra_files_unsplit = params.vep_plugin_files ? Channel.fromPath(params.vep_plugin_files).collect() + : '' ch_expected_xy_bed = params.hificnv_xy ? Channel.fromPath(params.hificnv_xy).collect() : '' ch_expected_xx_bed = params.hificnv_xx ? Channel.fromPath(params.hificnv_xx).collect() @@ -121,6 +123,21 @@ workflow NALLO { .collect() .set { ch_pedfile } + // Read and store paths in the vep_plugin_files file + if (params.vep_plugin_files) { + ch_vep_extra_files_unsplit.splitCsv ( header:true ) + .map { row -> + def f = file(row.vep_files[0]) + if(f.isFile() || f.isDirectory()){ + return [f] + } else { + error("\nVep database file ${f} does not exist.") + } + } + .collect() + .set {ch_vep_extra_files} + } + // // Convert BAM files to FASTQ and vice versa // @@ -341,6 +358,7 @@ workflow NALLO { fai.map { name, fai -> [ [ id: name ], fai ] }, ch_vep_cache, params.vep_cache_version, + ch_vep_extra_files, (params.cadd_resources && params.cadd_prescored), ch_cadd_header, ch_cadd_resources,