nf-core · svarona · Apr 22, 2024 · Oct 23, 2023 · Oct 23, 2023 · Oct 23, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -68,6 +68,7 @@ jobs:
           - "--spades_mode corona"
           - "--spades_mode metaviral"
           - "--skip_plasmidid false --skip_asciigenome"
+          - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz"
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
@@ -86,6 +87,11 @@ jobs:
           wget -qO- get.nextflow.io | bash
           sudo mv nextflow /usr/local/bin/
 
+      # Runs only for the matrix entry that passes --additional_annotation. wget keeps the
+      # remote file name, which is the name that entry's relative path refers to.
+      - name: Download GTF for additional annotation
+        if: contains(matrix.parameters, 'additional_annotation')
+        run: |
+          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/858/895/GCA_009858895.3_ASM985889v3/GCA_009858895.3_ASM985889v3_genomic.gtf.gz
+
       - name: Run pipeline with various parameters
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.parameters }} --publish_dir_mode link --outdir ./results
@@ -120,6 +126,7 @@ jobs:
       matrix:
         parameters:
           - "--gff false"
+          - "--additional_annotation ./GCA_009858895.3_ASM985889v3_genomic.gtf.gz"
           - "--input false"
           - "--min_barcode_reads 10000"
           - "--min_guppyplex_reads 10000"
@@ -133,7 +140,14 @@ jobs:
         run: |
           wget -qO- get.nextflow.io | bash
           sudo mv nextflow /usr/local/bin/
+
+      # Runs only for the matrix entry that passes --additional_annotation. wget keeps the
+      # remote file name, which is the name that entry's relative path refers to.
+      - name: Download GTF for additional annotation
+        if: contains(matrix.parameters, 'additional_annotation')
+        run: |
+          wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/858/895/GCA_009858895.3_ASM985889v3/GCA_009858895.3_ASM985889v3_genomic.gtf.gz
+
       - name: Download medaka model
+        if: contains(matrix.parameters, 'r941_min_high_g360_model.hdf5')
         run: |
           wget https://github.com/nanoporetech/medaka/raw/master/medaka/data/r941_min_high_g360_model.hdf5
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,17 +22,18 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [[PR #393](https://github.com/nf-core/viralrecon/pull/393)] - Changed primer set to params
 - [[PR #405](https://github.com/nf-core/viralrecon/pull/412)] - Including parameter `depthcutoff` to freyja demix and boot
 - [[PR #413](https://github.com/nf-core/viralrecon/pull/413)] - Update multiqc module & include freyja in report
+- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation
 
 ### Parameters
 
-| Old parameter          | New parameter |
-| ---------------------- | ------------- |
-| `--skip_freyja`        |               |
-| `--freyja_repeats`     |               |
-| `--freyja_db_name`     |               |
-| `--freyja_barcodes`    |               |
-| `--freyja_lineages`    |               |
-| `--freyja_depthcutoff` |               |
+| Old parameter          | New parameter             |
+| ---------------------- | ------------------------- |
+| `--skip_freyja`        |                           |
+| `--freyja_repeats`     |                           |
+| `--freyja_db_name`     |                           |
+| `--freyja_barcodes`    |                           |
+| `--freyja_lineages`    |                           |
+| `--freyja_depthcutoff` |                           |
+|                        | `--additional_annotation` |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present.
 > **NB:** Parameter has been **added** if just the new parameter information is present.

diff --git a/assets/headers/ivar_variants_header_mqc.txt b/assets/headers/ivar_variants_header_mqc.txt
@@ -1,7 +1,7 @@
 #id: 'ivar_variants'
 #section_name: 'VARIANTS: Total variants (iVar)'
-#description: "is calculated from the total number of variants called by
-#              <a href='https://andersen-lab.github.io/ivar/html/manualpage.html' target='_blank'>iVar</a>."
+#description: "Is calculated from the total number of variants called by
+#              <a href='https://andersen-lab.github.io/ivar/html/manualpage.html' target='_blank'>iVar</a> (Defaults: 0.25 allele frequency, minimum quality score = 20 and minimum position depth = 10)."
 #plot_type: 'bargraph'
 #anchor: 'ivar_variants'
 #pconfig:

diff --git a/assets/multiqc_config_illumina.yml b/assets/multiqc_config_illumina.yml
@@ -75,7 +75,7 @@ module_order:
         - "./variants/*.txt"
   - snpeff:
       name: "VARIANTS: SnpEff"
-      info: "This section of the report shows SnpEff results for the called variants."
+      info: "This section of the report shows SnpEff results for the called variants passing filters (Defaults: 0.25 allele frequency, minimum quality score = 20 and minimum position depth = 10). Some variants may have more than one annotation respect to genomic region, impact or effect, leading to differences in the number of variants respect to the vcf file."
       path_filters:
         - "./variants/*.csv"
   - quast:

diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py
@@ -236,11 +236,7 @@ def snpsift_to_table(snpsift_file):
     new_colnames = [x.replace("ANN[*].", "") for x in old_colnames]
     table.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True)
     table = table.loc[:, ["CHROM", "POS", "REF", "ALT", "GENE", "EFFECT", "HGVS_C", "HGVS_P"]]
-
-    ## Split by comma and get first value in cols = ['ALT','GENE','EFFECT','HGVS_C','HGVS_P']
-    for i in range(len(table)):
-        for j in range(3, 8):
-            table.iloc[i, j] = str(table.iloc[i, j]).split(",")[0]
+    table = one_effect_per_line(table)
 
     ## Amino acid substitution
     aa = []
@@ -252,6 +248,51 @@ def snpsift_to_table(snpsift_file):
     return table
 
 
+def one_effect_per_line(table):
+    one_effect_per_line_table = pd.DataFrame()
+    for i in range(len(table)):
+        gene_list = table.iloc[i, 4].split(",")
+        effect_list = table.iloc[i, 5].split(",")
+        hgvs_c_list = table.iloc[i, 6].split(",")
+        hgvs_p_list = table.iloc[i, 7].split(",")
+
+        count = 0
+        for j in range(len(gene_list)):
+            if "upstream" in effect_list[j] or "downstream" in effect_list[j]:
+                count += 1
+        for j in range(len(gene_list)):
+            if len(effect_list) == count:
+                row = {
+                    "CHROM": table.iloc[i, 0],
+                    "POS": table.iloc[i, 1],
+                    "REF": table.iloc[i, 2],
+                    "ALT": table.iloc[i, 3],
+                    "GENE": gene_list[0],
+                    "EFFECT": effect_list[0],
+                    "HGVS_C": hgvs_c_list[0],
+                    "HGVS_P": hgvs_p_list[0],
+                }
+                one_effect_per_line_table = pd.concat(
+                    [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True
+                )
+            else:
+                if not "upstream" in effect_list[j] and not "downstream" in effect_list[j]:
+                    row = {
+                        "CHROM": table.iloc[i, 0],
+                        "POS": table.iloc[i, 1],
+                        "REF": table.iloc[i, 2],
+                        "ALT": table.iloc[i, 3],
+                        "GENE": gene_list[j],
+                        "EFFECT": effect_list[j],
+                        "HGVS_C": hgvs_c_list[j],
+                        "HGVS_P": hgvs_p_list[j],
+                    }
+                    one_effect_per_line_table = pd.concat(
+                        [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True
+                    )
+    return one_effect_per_line_table
+
+
 def main(args=None):
     args = parser_args(args)
 

diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
@@ -572,6 +572,14 @@ if (!params.skip_variants) {
                         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
                     ]
                 }
+                // Long table for the additional annotation: the output file is named
+                // 'additional_variants_long_table.csv' and published under variants/<variant_caller>/
+                withName: 'MAKE_VARIANTS_LONG_TABLE_ADDITIONAL' {
+                    ext.args = "--variant_caller ${variant_caller} --output_file 'additional_variants_long_table.csv'"
+                    publishDir = [
+                        path: { "${params.outdir}/variants/${variant_caller}" },
+                        mode: params.publish_dir_mode,
+                        // Returning null keeps versions.yml out of the published files
+                        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                    ]
+                }
             }
         }
     }

diff --git a/conf/modules_nanopore.config b/conf/modules_nanopore.config
@@ -361,6 +361,14 @@ if (!params.skip_snpeff) {
                     saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
                 ]
             }
+            // Long table for the additional annotation: the output file is named
+            // 'additional_variants_long_table.csv' and published under variants/<artic_minion_caller>/
+            withName: 'MAKE_VARIANTS_LONG_TABLE_ADDITIONAL' {
+                ext.args = "--variant_caller ${params.artic_minion_caller} --output_file 'additional_variants_long_table.csv'"
+                publishDir = [
+                    path: { "${params.outdir}/variants/${params.artic_minion_caller}" },
+                    mode: params.publish_dir_mode,
+                    // Returning null keeps versions.yml out of the published files
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                ]
+            }
         }
     }
 }

diff --git a/docs/output.md b/docs/output.md
@@ -291,12 +291,13 @@ As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs
 
 - `<CALLER>/`
   - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis.
+  - `additional_variants_long_table.csv`: Long format table similar to `variants_long_table.csv` for additional annotation file with overlapping annotation features.
 
 **NB:** The value of `<CALLER>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
 
 </details>
 
-Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)).
+Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)). The variants used for this table are the ones passing the artic minion quality filters (`*.pass.unique.vcf.gz`), as described above in the [Nanopore: artic minion](#nanopore-artic-minion) output files.
 
 The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table are shown below:
 
@@ -308,6 +309,25 @@ SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.27
 SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1
 ```
 
+Table columns:
+
+- SAMPLE: sample name
+- CHROM: Reference/fragment ID
+- POS: Position of the variant with respect to the reference genome
+- REF: Reference allele
+- ALT: Alternative allele
+- FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. If not, the name of the filter that wasn't passed will appear.
+- DP: Position read depth
+- REF_DP: Reference allele depth
+- ALT_DP: Alternative allele depth
+- AF: Alternative allele frequency
+- GENE: Gene name in annotation file
+- EFFECT: Effect of the variant
+- HGVS_C: Position annotation at CDS level
+- HGVS_P: Position annotation at protein level
+- HGVS_P_1LETTER: Position annotation at protein level with the amino acid annotation in 1-letter format
+- CALLER: Variant caller used
+- LINEAGE: Lineage assigned by Pangolin
+
 ## Nanopore: Workflow reporting
 
 ### Nanopore: MultiQC
@@ -756,12 +776,31 @@ Phylogenetic Assignment of Named Global Outbreak LINeages ([Pangolin](https://gi
 
 - `variants/<VARIANT_CALLER>/`
   - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis.
+  - `additional_variants_long_table.csv`: Long format table similar to `variants_long_table.csv` for additional annotation file with overlapping annotation features.
 
 **NB:** The value of `<VARIANT_CALLER>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
 
 </details>
 
-Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)).
+Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)). The variants used for this table are the ones passing variant caller filters (`variants/<VARIANT_CALLER>/*.vcf.gz`):
+
+- For ivar, the default filters are:
+  - Allele frequency threshold >= 0.25
+  - Minimum quality score threshold = 20
+  - Minimum position depth = 10
+  - If using the metagenomic protocol, a strand bias filter is also applied in `ivar_variants_to_vcf.py`
+- For bcftools, the default filters are:
+  - Minimum quality score threshold = 20
+  - Minimum position depth = 10
+
+To filter variants included in the consensus genome from the variants long table file, the following filters should be applied:
+
+- AF >= 0.75
+
+Additionally, to filter variants included in the consensus genome that are missense variants from the variants long table file, the following filters should be applied:
+
+- AF >= 0.75
+- EFFECT == missense_variant
 
 The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table are shown below:
 
@@ -773,6 +812,25 @@ SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.27
 SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1
 ```
 
+Table columns:
+
+- SAMPLE: sample name
+- CHROM: Reference/fragment ID
+- POS: Position of the variant with respect to the reference genome
+- REF: Reference allele
+- ALT: Alternative allele
+- FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. If not, the name of the filter that wasn't passed will appear.
+- DP: Position read depth
+- REF_DP: Reference allele depth
+- ALT_DP: Alternative allele depth
+- AF: Alternative allele frequency
+- GENE: Gene name in annotation file
+- EFFECT: Effect of the variant
+- HGVS_C: Position annotation at CDS level
+- HGVS_P: Position annotation at protein level
+- HGVS_P_1LETTER: Position annotation at protein level with the amino acid annotation in 1-letter format
+- CALLER: Variant caller used
+- LINEAGE: Lineage assigned by Pangolin
+
 ## Illumina: De novo assembly
 
 A file called `summary_assembly_metrics_mqc.csv` containing a selection of read alignment and _de novo_ assembly related metrics will be saved in the `multiqc/` results directory. The same metrics will also be added to the top of the MultiQC report.

diff --git a/modules/local/snpeff_build.nf b/modules/local/snpeff_build.nf
@@ -20,7 +20,14 @@ process SNPEFF_BUILD {
     task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args ?: ''
     def basename = fasta.baseName
+    def extension = gff.getExtension()
+    if (extension == "gtf") {
+        format = "gtf22"
+    } else {
+        format = "gff3"
+    }
 
     def avail_mem = 4
     if (!task.memory) {
@@ -36,7 +43,7 @@ process SNPEFF_BUILD {
     cd ../../
     mkdir -p snpeff_db/${basename}/
     cd snpeff_db/${basename}/
-    ln -s ../../$gff genes.gff
+    ln -s ../../$gff genes.$extension
 
     cd ../../
     echo "${basename}.genome : ${basename}" > snpeff.config
@@ -46,7 +53,8 @@ process SNPEFF_BUILD {
         build \\
         -config snpeff.config \\
         -dataDir ./snpeff_db \\
-        -gff3 \\
+        -${format} \\
+        $args \\
         -v \\
         ${basename}
 

diff --git a/nextflow.config b/nextflow.config
@@ -22,6 +22,7 @@ params {
     primer_left_suffix         = '_LEFT'
     primer_right_suffix        = '_RIGHT'
     save_reference             = false
+    additional_annotation      = null
 
     // Nanopore options
     fastq_dir                  = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -79,6 +79,14 @@
                     "description": "Full path to GFF annotation file.",
                     "fa_icon": "fas fa-file-invoice"
                 },
+                "additional_annotation": {
+                    "type": "string",
+                    "format": "file-path",
+                    "mimetype": "text/plain",
+                    "pattern": "^\\S+(\\.gff|\\.gtf)(\\.gz)?$",
+                    "description": "Full path to additional annotation file in GTF or GFF format.",
+                    "fa_icon": "fas fa-file-invoice"
+                },
                 "bowtie2_index": {
                     "type": "string",
                     "format": "path",