Merge branch 'master' into catpack-reads

nf-core · Jan 25, 2025 · c052bab · c052bab
2 parents 6e25635 + 5b33c12
commit c052bab
Show file tree

Hide file tree

Showing 11 changed files with 379 additions and 230 deletions.
diff --git a/modules/nf-core/simpleaf/index/environment.yml b/modules/nf-core/simpleaf/index/environment.yml
@@ -1,8 +1,9 @@
 channels:
-  - conda-forge
   - bioconda
+  - conda-forge
 
 dependencies:
-  - bioconda::alevin-fry=0.8.2
-  - bioconda::salmon=1.10.2
-  - bioconda::simpleaf=0.15.1
+  - bioconda::alevin-fry=0.11.1
+  - bioconda::piscem=0.11.0
+  - bioconda::salmon=1.10.3
+  - bioconda::simpleaf=0.18.4
diff --git a/modules/nf-core/simpleaf/index/main.nf b/modules/nf-core/simpleaf/index/main.nf
@@ -1,37 +1,40 @@
+// NOTE: the default indexer, piscem, frequently reads and writes a large number of intermediate files. If the CPU and the storage are not physically connected in your setup, we recommend either passing `--work-dir /path/to/a/local/dir` through `ext.args` in nextflow.config, or setting `scratch = true`, to avoid runtime issues.
 process SIMPLEAF_INDEX {
-    tag "$genome_fasta $transcript_fasta"
+    tag "${meta.id ?: meta2.id}"
     label 'process_high'
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/simpleaf:0.15.1--h4ac6f70_0':
-        'biocontainers/simpleaf:0.15.1--h4ac6f70_0' }"
+        'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1':
+        'biocontainers/simpleaf:0.18.4--ha6fb395_1' }"
 
     input:
-    tuple val(meta), path(genome_fasta)
-    tuple val(meta2), path(genome_gtf)
-    tuple val(meta3), path(transcript_fasta)
+    tuple val(meta),  path(genome_fasta), path(genome_gtf)
+    tuple val(meta2), path(transcript_fasta)
 
     output:
-    tuple val(meta), path("${prefix}/index")              , emit: index
-    tuple val(meta), path("${prefix}/ref/t2g_3col.tsv")   , emit: transcript_tsv, optional: true
-    tuple val(meta), path("${prefix}")                    , emit: salmon
-    path "versions.yml"                                   , emit: versions
+    tuple val(meta), path("${prefix}/index")                    , emit: index
+    tuple val(meta), path("${prefix}/ref")                      , emit: ref, optional: true
+    tuple val(meta), path("${prefix}/ref/{t2g,t2g_3col}.tsv")   , emit: t2g, optional: true
+    path "versions.yml"                                         , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
-    def seq_inputs = (transcript_fasta) ? "--refseq $transcript_fasta" : "--gtf $genome_gtf --fasta $genome_fasta"
+    def seq_inputs = input_args(genome_fasta, genome_gtf, transcript_fasta)//, probes_csv, features_csv)
 
     // Output meta needs to correspond to the input used
-    meta = (transcript_fasta) ? meta3 : meta
+    meta = (transcript_fasta) ? meta2 : meta
     prefix = task.ext.prefix ?: "${meta.id}"
     """
     # export required var
     export ALEVIN_FRY_HOME=.
 
+    # set maximum number of file descriptors for temp files
+    ulimit -n 2048
+
     # prep simpleaf
     simpleaf set-paths
 
@@ -45,26 +48,49 @@ process SIMPLEAF_INDEX {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2)
+        alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g")
+        piscem: \$(piscem --version | sed -e "s/piscem //g")
         salmon: \$(salmon --version | sed -e "s/salmon //g")
+        simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g")
     END_VERSIONS
     """
 
     stub:
     def args = task.ext.args ?: ''
-    prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta3.id}")
+    prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta2.id}")
+
     """
     mkdir -p ${prefix}/index
     mkdir -p ${prefix}/ref
-    touch ${prefix}/index/ctg_offsets.bin
-    touch ${prefix}/index/duplicate_clusters.tsv
-    touch ${prefix}/index/mphf.bin
+    touch ${prefix}/index/piscem_idx_cfish.json
+    touch ${prefix}/index/piscem_idx.ectab
+    touch ${prefix}/index/piscem_idx.sshash
     touch ${prefix}/ref/t2g_3col.tsv
+    touch ${prefix}/ref/roers_ref.fa
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2)
+        alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g")
+        piscem: \$(piscem --version | sed -e "s/piscem //g")
         salmon: \$(salmon --version | sed -e "s/salmon //g")
+        simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g")
     END_VERSIONS
     """
 }
+
+def input_args(genome_fasta, genome_gtf, transcript_fasta) { //, probes_csv, features_csv) {
+    // if (probes_csv) {
+    //     args = "--probe_csv ${probes_csv}"
+    // } else if (features_csv) {
+    //     args = "--feature_csv ${features_csv}"
+    // } else
+    if (transcript_fasta) {
+        return "--ref-seq ${transcript_fasta}"
+    } else if (genome_fasta && genome_gtf) {
+        return "--fasta ${genome_fasta} --gtf ${genome_gtf}"
+    } else {
+        error "No valid input provided; please provide either a genome fasta + gtf set or a transcript fasta file. ${genome_fasta} ${genome_gtf} ${transcript_fasta}"
+        // error "No valid input provided; please provide one of the following: (i) a genome fasta + gtf set, (ii) a transcript fasta file, (iii) a probes csv file, or (iv) a features csv file."
+    }
+
+}
diff --git a/modules/nf-core/simpleaf/index/meta.yml b/modules/nf-core/simpleaf/index/meta.yml
@@ -1,4 +1,3 @@
-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
 name: simpleaf_index
 description: Indexing of transcriptome for gene expression quantification using SimpleAF
 keywords:
@@ -17,58 +16,59 @@ input:
   - - meta:
         type: map
         description: |
-          Groovy Map containing information on genome_fasta
+          Groovy Map containing information on genome_fasta and genome_gtf
     - genome_fasta:
         type: file
         description: |
-          FASTA file containing the genome sequence
-  - - meta2:
-        type: map
-        description: |
-          Groovy Map containing information on genome_gtf
+          FASTA file containing the genome sequence.
+          It conflicts with transcript_fasta.
+          When transcript_fasta is provided, it must be empty (provided as []).
+          When transcript_fasta is empty, it must be provided together with its corresponding genome_gtf file.
     - genome_gtf:
         type: file
         description: |
-          GTF file containing transcript annotations. Optional if transcript FASTA file is provided.
-  - - meta3:
+          GTF file containing gene annotations.
+          It conflicts with transcript_fasta.
+          When transcript_fasta is provided, it must be empty (provided as []).
+          When transcript_fasta is empty, it must be provided together with its corresponding genome_fasta file.
+  - - meta2:
         type: map
         description: |
           Groovy Map containing information on transcript_fasta
     - transcript_fasta:
         type: file
         description: |
-          FASTA file containing the transcript sequences. Optional if transcript GTF file is provided.
+          FASTA file containing the transcript sequences on which the index is built directly.
+          It conflicts with genome_gtf and genome_fasta.
+          When genome_gtf and genome_fasta are provided, it must be empty (provided as []).
 output:
   - index:
       - meta:
           type: map
           description: |
-            Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
+            Groovy Map containing information on the index generated by simpleaf
       - ${prefix}/index:
-          type: directory
+          type: directory
           description: |
-            Folder containing the Salmon index files
-          pattern: "salmon/index"
-  - transcript_tsv:
+            Directory containing the index files generated by simpleaf
+  - ref:
       - meta:
           type: map
           description: |
-            Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
-      - ${prefix}/ref/t2g_3col.tsv:
-          type: file
+            Groovy Map containing information on the transcriptomic reference constructed by simpleaf.
+      - ${prefix}/ref:
+          type: directory
           description: |
-            Transcript-to-gene mapping file in 3-column TSV format
-          pattern: "salmon/ref/*_t2g_3col.tsv"
-  - salmon:
+            Directory containing the transcriptomic reference constructed by simpleaf.
+  - t2g:
       - meta:
-          type: map
+          type: map
           description: |
-            Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
-      - ${prefix}:
-          type: directory
+            Groovy Map containing information on the transcript-to-gene mapping file generated by simpleaf.
+      - ${prefix}/ref/{t2g,t2g_3col}.tsv:
+          type: file
           description: |
-            Folder containing the Salmon files
-          pattern: "salmon"
+            Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant.
   - versions:
       - versions.yml:
           type: file
@@ -81,9 +81,11 @@ authors:
   - "@Khajidu"
   - "@apeltzer"
   - "@pinin4fjords"
+  - "@dongzehe"
 maintainers:
   - "@fmalmeida"
   - "@maxulysse"
   - "@Khajidu"
   - "@apeltzer"
   - "@pinin4fjords"
+  - "@dongzehe"
diff --git a/modules/nf-core/simpleaf/index/tests/main.nf.test b/modules/nf-core/simpleaf/index/tests/main.nf.test
@@ -9,6 +9,7 @@ nextflow_process {
     tag "simpleaf"
     tag "simpleaf/index"
 
+    // test piscem
     test("Homo sapiens - genome index - expanded - fasta + gtf") {
 
         when {
@@ -18,23 +19,28 @@ nextflow_process {
                 gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)
                 meta = [ 'id': 'human_genome']
 
-                input[0] = Channel.of([ meta, genome_fasta ])
-                input[1] = Channel.of([ meta, gtf ])
-                input[2] = Channel.of([[],[]])
-
+                input[0] = Channel.of([ meta, genome_fasta, gtf ])
+                input[1] = Channel.of([[],[]])
                 """
             }
         }
 
         then {
             assertAll(
                 { assert process.success },
-                { assert snapshot(
-                    path("${process.out.index[0][1]}/ctg_offsets.bin"),
-                    path("${process.out.index[0][1]}/duplicate_clusters.tsv"),
-                    path("${process.out.index[0][1]}/mphf.bin"),
-                    process.out.versions)
-                    .match() }
+                { assert snapshot(process.out.versions).match() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() },
+                { assert file("${process.out.ref.get(0).get(1)}/roers_ref.fa").exists() },
+                { assert file("${process.out.ref.get(0).get(1)}/t2g_3col.tsv").exists() },
+                { assert file("${process.out.ref.get(0).get(1)}/gene_id_to_name.tsv").exists() },
+                { assert file("${process.out.ref.get(0).get(1)}/roers_make-ref.json").exists() },
+                { assert file("${process.out.t2g.get(0).get(1)}").exists() },
             )
         }
 
@@ -48,22 +54,29 @@ nextflow_process {
                 transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)
                 meta = [ 'id': 'human_transcriptome']
 
-                input[0] = Channel.of([[],[]])
-                input[1] = Channel.of([[],[]])
-                input[2] = Channel.of([ meta, transcriptome_fasta ])
+                input[0] = Channel.of([[],[],[]])
+                input[1] = Channel.of([ meta, transcriptome_fasta ])
                 """
             }
         }
 
         then {
             assertAll(
                 { assert process.success },
-                { assert snapshot(
-                    path("${process.out.index[0][1]}/ctg_offsets.bin"),
-                    path("${process.out.index[0][1]}/duplicate_clusters.tsv"),
-                    path("${process.out.index[0][1]}/mphf.bin"),
-                    process.out.versions)
-                    .match() }
+                { assert snapshot(process.out.versions).match() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() },
+                { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() }
+                // { assert snapshot(
+                //     path("${process.out.index.get(0).get(1)}/piscem_idx.ctab"),
+                //     path("${process.out.index.get(0).get(1)}/piscem_idx.json"),
+                //     path("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json"),
+                //     process.out.versions)
+                //     .match() }
             )
         }
     }
@@ -76,9 +89,8 @@ nextflow_process {
                 transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)
                 meta = [ 'id': 'human_transcriptome']
 
-                input[0] = Channel.of([[],[]])
-                input[1] = Channel.of([[],[]])
-                input[2] = Channel.of([ meta, transcriptome_fasta ])
+                input[0] = Channel.of([[],[],[]])
+                input[1] = Channel.of([ meta, transcriptome_fasta ])
                 """
             }
         }
@@ -90,5 +102,4 @@ nextflow_process {
             )
         }
     }
-
 }