EBI-Metagenomics · mberacochea · Sep 2, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -0,0 +1,80 @@
+# Lints the pipeline with pre-commit and nf-core tools.
+name: nf-core linting
+# Runs on pushes to dev, on every pull request, and when a release is published.
+on:
+  push:
+    branches: [dev]
+  pull_request:
+  release:
+    types:
+      - published
+
+jobs:
+  # Runs every configured pre-commit hook against the whole repository.
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      # Actions are pinned to a full commit SHA; the trailing comment records the tag.
+      - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
+        with:
+          python-version: "3.12"
+
+      - name: Install pre-commit
+        run: |-
+          pip install pre-commit
+
+      # --all-files checks the full tree, not only the files touched by the commit.
+      - name: Run pre-commit
+        run: |-
+          pre-commit run --all-files
+
+  # Runs `nf-core pipelines lint` with the nf-core/tools version pinned in .nf-core.yml
+  # and uploads the lint log, the markdown report and the PR number as an artifact.
+  nf-core:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v2
+
+      - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
+        with:
+          python-version: "3.12"
+          architecture: "x64"
+
+      # Reads .nf-core.yml so that later steps can use its values (nf_core_version below).
+      - name: read .nf-core.yml
+        uses: pietrobolcato/action-read-yaml@1.1.0
+        id: read_yml
+        with:
+          config: ${{ github.workspace }}/.nf-core.yml
+
+      # Installs exactly the nf-core/tools version declared in .nf-core.yml.
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }}
+
+      # Regular lint: used whenever the run is not a pull request targeting main.
+      - name: Run nf-core pipelines lint
+        if: ${{ github.base_ref != 'main' }}
+        env:
+          GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
+        run: nf-core -l lint_log.txt pipelines lint --dir "${GITHUB_WORKSPACE}" --markdown lint_results.md
+
+      # Release lint (--release): used only for pull requests targeting main.
+      - name: Run nf-core pipelines lint --release
+        if: ${{ github.base_ref == 'main' }}
+        env:
+          GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
+        run: nf-core -l lint_log.txt pipelines lint --release --dir "${GITHUB_WORKSPACE}" --markdown lint_results.md
+
+      # always() keeps the two steps below running even when a lint step failed,
+      # so the log and report are still uploaded.
+      - name: Save PR number
+        if: ${{ always() }}
+        run: echo ${{ github.event.pull_request.number }} > PR_number.txt
+
+      - name: Upload linting log file artifact
+        if: ${{ always() }}
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4
+        with:
+          name: linting-logs
+          path: |
+            lint_log.txt
+            lint_results.md
+            PR_number.txt
diff --git a/.github/workflows/ci.yml → .github/workflows/nf_tests.yml b/.github/workflows/ci.yml → .github/workflows/nf_tests.yml
@@ -1,11 +1,9 @@
 name: nf-test CI
 on:
-  push:
-    branches:
-      - dev
   pull_request:
   release:
     types: [published]
+  workflow_dispatch:
 
 env:
   NXF_ANSI_LOG: false
@@ -15,22 +13,25 @@ jobs:
     name: Run pipeline with test data
     runs-on: ubuntu-latest
 
+    strategy:
+      matrix:
+        # Nextflow versions: check pipeline minimum and current latest
+        NXF_VER: ["24.04.0"]
+
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v4
 
-      - uses: actions/setup-java@99b8673ff64fbf99d8d325f52d9a5bdedb8483e9 # v4
-        with:
-          distribution: "temurin"
-          java-version: "17"
-
       - name: Setup Nextflow
-        uses: nf-core/setup-nextflow@v2
+        uses: nf-core/setup-nextflow@v2.0.0
+        with:
+          version: "${{ matrix.NXF_VER }}"
 
       - name: Install nf-test
         uses: nf-core/setup-nf-test@v1
         with:
-          version: 0.9.0
+          install-pdiff: true
+          version: 0.9.2
 
       - name: Run pipeline with test data
         run: |

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -20,6 +20,7 @@ lint:
     - .github/workflows/ci.yml
     - .github/workflows/linting_comment.yml
     - .github/workflows/linting.yml
+    - .github/workflows/ci.yml
     - conf/test_full.config
     - lib/Utils.groovy
     - lib/WorkflowMain.groovy
@@ -32,18 +33,22 @@ lint:
     - docs/images/nf-core-miassembler_logo_light.png
     - docs/images/nf-core-miassembler_logo_dark.png
     - .github/ISSUE_TEMPLATE/bug_report.yml
+    - .github/PULL_REQUEST_TEMPLATE.md
     - .github/CONTRIBUTING.md
+    - .github/workflows/linting.yml
     - LICENSE
     - docs/README.md
     - .gitignore
   multiqc_config:
     - report_comment
-  nextflow_config: False
+  nextflow_config:
     - params.input
     - params.validationSchemaIgnoreParams
     - params.custom_config_version
     - params.custom_config_base
     - manifest.name
     - manifest.homePage
+    - custom_config
   readme:
     - nextflow_badge
+nf_core_version: 3.0.2
diff --git a/README.md b/README.md
@@ -15,9 +15,6 @@ This pipeline is still in early development. It's mostly a direct port of the mi
 
 ## Usage
 
-> [!WARNING]
-> It only runs in Codon using Slurm ATM.
-
 Pipeline help:
 
 ```bash
@@ -28,27 +25,31 @@ Typical pipeline command:
 Input/output options
   --study_accession                       [string]  The ENA Study secondary accession
   --reads_accession                       [string]  The ENA Run primary accession
-  --private_study                         [boolean] To use if the ENA study is private
+  --private_study                         [boolean] To use if the ENA study is private, *this feature only works on EBI infrastructure at the moment*
   --samplesheet                           [string]  Path to comma-separated file containing information about the raw reads with the prefix to be used.
   --assembler                             [string]  The short reads assembler (accepted: spades, metaspades, megahit)
   --single_end                            [boolean] Force the single_end value for the study / reads
   --library_strategy                      [string]  Force the library_strategy value for the study / reads (accepted: metagenomic, metatranscriptomic,
                                                     genomic, transcriptomic, other)
   --library_layout                        [string]  Force the library_layout value for the study / reads (accepted: single, paired)
+  --platform                              [string]  Force the sequencing_platform value for the study / reads
   --spades_version                        [string]  null [default: 3.15.5]
   --megahit_version                       [string]  null [default: 1.2.9]
-  --reference_genome                      [string]  The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics
+  --flye_version                          [string]  null [default: 2.9]
+  --reference_genome                      [string]  The genome to be used to clean the assembly, the genome will be taken from the Microbiome Informatics
                                                     internal directory (accepted: chicken.fna, salmon.fna, cod.fna, pig.fna, cow.fna, mouse.fna,
                                                     honeybee.fna, rainbow_trout.fna, ...)
   --blast_reference_genomes_folder        [string]  The folder with the reference genome blast indexes, defaults to the Microbiome Informatics internal
                                                     directory.
   --bwamem2_reference_genomes_folder      [string]  The folder with the reference genome bwa-mem2 indexes, defaults to the Microbiome Informatics internal
+                                                    directory.
+  --reference_genomes_folder              [string]  The folder with reference genomes, defaults to the Microbiome Informatics internal
                                                     directory.
   --remove_human_phix                     [boolean] Remove human and phiX reads pre assembly, and contigs matching those genomes. [default: true]
   --human_phix_blast_index_name           [string]  Combined Human and phiX BLAST db. [default: human_phix]
   --human_phix_bwamem2_index_name         [string]  Combined Human and phiX bwa-mem2 index. [default: human_phix]
-  --min_contig_length                     [integer] Minimum contig length filter. [default: 500]
-  --min_contig_length_metatranscriptomics [integer] Minimum contig length filter for metaT. [default: 200]
+  --short_reads_min_contig_length         [integer] Minimum contig length filter. [default: 500]
+  --short_reads_min_contig_length_metat   [integer] Minimum contig length filter for metaT. [default: 200]
   --assembly_memory                       [integer] Default memory allocated for the assembly process. [default: 100]
   --spades_only_assembler                 [boolean] Run SPAdes/metaSPAdes without the error correction step. [default: true]
   --outdir                                [string]  The output directory where the results will be saved. You have to use absolute paths to storage on Cloud
@@ -72,6 +73,37 @@ nextflow run ebi-metagenomics/miassembler \
   --reads_accession SRR1631361
 ```
 
+### Required DBs:
+
+- `--reference_genome`: reference genome in FASTA format
+- `--blast_reference_genomes_folder`: the mandatory **human_phiX** database is provided on the [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/)
+- `--bwamem2_reference_genomes_folder`: the mandatory **human_phiX** index is provided on the [FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/)
+
+BLAST and bwa-mem2 reference databases can be generated for any reference genome you want to polish the input sequences with.
+
+#### BWA-MEM2
+
+As explained in [bwa-mem2's README](https://github.com/bwa-mem2/bwa-mem2?tab=readme-ov-file#getting-started):
+
+```bash
+# Use precompiled binaries (recommended)
+curl -L https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 \
+  | tar jxf -
+
+# Index your reference genome with
+bwa-mem2-2.2.1_x64-linux/bwa-mem2 index ref.fa
+```
+
+This will generate multiple index files in a folder. The folder containing them is the one to use as `bwamem2_reference_genomes_folder`.
+
+#### BLAST
+
+```bash
+makeblastdb -in <ref.fa> -dbtype nucl -out <my_db_file>
+```
+
+As with bwa-mem2, numerous files will be generated in the same folder, which should be used for `blast_reference_genomes_folder`.
+
 ### Samplesheet
 
 The samplesheet is a comma-separated file (.csv) with the following columns:
@@ -115,6 +147,18 @@ PRJ1,ERR1,/path/to/reads/ERR1_1.fq.gz,/path/to/reads/ERR1_2.fq.gz,paired,metagen
 PRJ2,ERR2,/path/to/reads/ERR2.fq.gz,,single,genomic,megahit,32
 ```
 
+### ENA Private Data
+
+The pipeline includes a module to download private data from ENA using the EMBL-EBI FIRE (File Replication) system. This system is restricted for use within the EMBL-EBI network and will not work unless connected to that network.
+
+If you have private data to assemble, you must provide the full path to the files on a system that Nextflow can access.
+
+#### Microbiome Informatics Team
+
+To process private data, the pipeline should be launched with the `--private_study` flag, and the samplesheet must include the private FTP (transfer services) paths. The `download_from_fire` module will be utilized to download the files.
+
+This module uses [Nextflow secrets](https://www.nextflow.io/docs/latest/secrets.html#how-it-works). Specifically, it requires the `FIRE_ACCESS_KEY` and `FIRE_SECRET_KEY` secrets to authenticate and download the files.
+
 ## Outputs
 
 The outputs of the pipeline are organized as follows:
@@ -225,15 +269,15 @@ Runs that fail QC checks are excluded from the assembly process. These runs are
 Example:
 
 ```csv
-SRR6180434,filter_ratio_threshold_exceeded
+SRR6180434,short_reads_filter_ratio_threshold_exceeded
 ```
 
 ##### Runs exclusion messages
 
-| Exclusion Message                 | Description                                                                                                                                                                                                                                                              |
-| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `filter_ratio_threshold_exceeded` | The maximum fraction of reads that are allowed to be filtered out. If exceeded, it flags excessive filtering. The default value is 0.9, meaning that if more than 90% of the reads are filtered out, the threshold is considered exceeded, and the run is not assembled. |
-| `low_reads_count_threshold`       | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled.                                                                                                                                                 |
+| Exclusion Message                             | Description                                                                                                                                                                                                                                                                          |
+| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `short_reads_filter_ratio_threshold_exceeded` | The minimum fraction of reads that must be retained after filtering. Otherwise, it flags excessive filtering. The default value is 0.1, meaning that if less than 10% of the reads are retained after filtering, the threshold is considered exceeded, and the run is not assembled. |
+| `short_reads_low_reads_count_threshold`       | The minimum number of reads required after filtering. If below, it flags a low read count, and the run is not assembled.                                                                                                                                                             |
 
 #### Assembled Runs
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -3,12 +3,12 @@ report_comment: >
   analysis pipeline.
 
 report_section_order:
-  "software_versions":
-    order: -1000
   "ebi-metagenomics-miassembler-methods-description":
     order: -1001
-  "ebi-metagenomics-miassembler-summary":
+  "software_versions":
     order: -1002
+  "ebi-metagenomics-miassembler-summary":
+    order: -1003
 
 export_plots: true
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -47,6 +47,9 @@
                 "enum": ["metagenomic", "metatranscriptomic", "genomic", "transcriptomic", "other"],
                 "errorMessage": "library strategy should be only value from list: 'metagenomic', 'metatranscriptomic', 'genomic', 'transcriptomic', 'other'"
             },
+            "platform": {
+                "type": "string"
+            },
             "assembler": {
                 "type": "string",
                 "enum": ["spades", "metaspades", "megahit"],
@@ -57,6 +60,9 @@
                 "type": "integer",
                 "default": null,
                 "description": "Default memory (in GB) allocated for the assembly process for the run."
+            },
+            "assembler_config": {
+                "type": "string"
             }
         },
         "required": ["study_accession", "reads_accession", "fastq_1", "library_layout", "library_strategy"]