Skip to content

Commit

Permalink
Merge branch 'master' into catpack-reads
Browse files Browse the repository at this point in the history
  • Loading branch information
sateeshperi authored Jan 25, 2025
2 parents 6e25635 + 5b33c12 commit c052bab
Show file tree
Hide file tree
Showing 11 changed files with 379 additions and 230 deletions.
9 changes: 5 additions & 4 deletions modules/nf-core/simpleaf/index/environment.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
channels:
- conda-forge
- bioconda
- conda-forge

dependencies:
- bioconda::alevin-fry=0.8.2
- bioconda::salmon=1.10.2
- bioconda::simpleaf=0.15.1
- bioconda::alevin-fry=0.11.1
- bioconda::piscem=0.11.0
- bioconda::salmon=1.10.3
- bioconda::simpleaf=0.18.4
62 changes: 44 additions & 18 deletions modules/nf-core/simpleaf/index/main.nf
Original file line number Diff line number Diff line change
@@ -1,37 +1,40 @@
// NOTE: The default indexer, piscem, frequently reads and writes a large number of intermediate files.
// If the CPU and the storage are not physically connected in your setup, we recommend setting
// `--work-dir /path/to/a/local/dir` (for example through `ext.args` in nextflow.config) or `scratch = true`
// to avoid runtime issues.
process SIMPLEAF_INDEX {
tag "$genome_fasta $transcript_fasta"
tag "${meta.id ?: meta2.id}"
label 'process_high'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/simpleaf:0.15.1--h4ac6f70_0':
'biocontainers/simpleaf:0.15.1--h4ac6f70_0' }"
'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1':
'biocontainers/simpleaf:0.18.4--ha6fb395_1' }"

input:
tuple val(meta), path(genome_fasta)
tuple val(meta2), path(genome_gtf)
tuple val(meta3), path(transcript_fasta)
tuple val(meta), path(genome_fasta), path(genome_gtf)
tuple val(meta2), path(transcript_fasta)

output:
tuple val(meta), path("${prefix}/index") , emit: index
tuple val(meta), path("${prefix}/ref/t2g_3col.tsv") , emit: transcript_tsv, optional: true
tuple val(meta), path("${prefix}") , emit: salmon
path "versions.yml" , emit: versions
tuple val(meta), path("${prefix}/index") , emit: index
tuple val(meta), path("${prefix}/ref") , emit: ref, optional: true
tuple val(meta), path("${prefix}/ref/{t2g,t2g_3col}.tsv") , emit: t2g, optional: true
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def seq_inputs = (transcript_fasta) ? "--refseq $transcript_fasta" : "--gtf $genome_gtf --fasta $genome_fasta"
def seq_inputs = input_args(genome_fasta, genome_gtf, transcript_fasta)//, probes_csv, features_csv)

// Output meta needs to correspond to the input used
meta = (transcript_fasta) ? meta3 : meta
meta = (transcript_fasta) ? meta2 : meta
prefix = task.ext.prefix ?: "${meta.id}"
"""
# export required var
export ALEVIN_FRY_HOME=.
# set maximum number of file descriptors for temp files
ulimit -n 2048
# prep simpleaf
simpleaf set-paths
Expand All @@ -45,26 +48,49 @@ process SIMPLEAF_INDEX {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2)
alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g")
piscem: \$(piscem --version | sed -e "s/piscem //g")
salmon: \$(salmon --version | sed -e "s/salmon //g")
simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g")
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta3.id}")
prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta2.id}")

"""
mkdir -p ${prefix}/index
mkdir -p ${prefix}/ref
touch ${prefix}/index/ctg_offsets.bin
touch ${prefix}/index/duplicate_clusters.tsv
touch ${prefix}/index/mphf.bin
touch ${prefix}/index/piscem_idx_cfish.json
touch ${prefix}/index/piscem_idx.ectab
touch ${prefix}/index/piscem_idx.sshash
touch ${prefix}/ref/t2g_3col.tsv
touch ${prefix}/ref/roers_ref.fa
cat <<-END_VERSIONS > versions.yml
"${task.process}":
simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2)
alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g")
piscem: \$(piscem --version | sed -e "s/piscem //g")
salmon: \$(salmon --version | sed -e "s/salmon //g")
simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g")
END_VERSIONS
"""
}

// Assemble the reference-input flags that are interpolated into the simpleaf command line.
//
// Selection order:
//   1. transcript_fasta is truthy          -> "--ref-seq <transcript_fasta>"
//   2. genome_fasta AND genome_gtf truthy  -> "--fasta <genome_fasta> --gtf <genome_gtf>"
//   3. anything else                       -> `error` is called with a message listing the three inputs
//
// A truthy transcript_fasta wins even when the genome pair is also supplied.
// NOTE(review): `error` is not defined in this block; presumably the Nextflow built-in that aborts the run — confirm.
def input_args(genome_fasta, genome_gtf, transcript_fasta) { //, probes_csv, features_csv) {
    // Disabled for now: probe / feature CSV inputs would be checked before the branches below.
    // if (probe_csv) {
    //     args = "--probe_csv ${probe_csv}"
    // } else if (feature_csv) {
    //     args = "--feature_csv ${feature_csv}"
    // }

    // Direct transcriptome reference takes precedence.
    if (transcript_fasta) {
        return "--ref-seq ${transcript_fasta}"
    }

    // Otherwise both halves of the genome pair are required.
    def hasGenomePair = genome_fasta && genome_gtf
    if (hasGenomePair) {
        return "--fasta ${genome_fasta} --gtf ${genome_gtf}"
    }

    // Message to use once the probe / feature CSV inputs are enabled:
    // error "No valid input provided; please provide one of the following: (i) a genome fasta + gtf set, (ii) a transcript fasta file, (iii) a probes csv file (iv) a features csv file."
    error "No valid input provided; please provide either a genome fasta + gtf set or a transcript fasta file. ${genome_fasta} ${genome_gtf} ${transcript_fasta}"
}
56 changes: 29 additions & 27 deletions modules/nf-core/simpleaf/index/meta.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
name: simpleaf_index
description: Indexing of transcriptome for gene expression quantification using SimpleAF
keywords:
Expand All @@ -17,58 +16,59 @@ input:
- - meta:
type: map
description: |
Groovy Map containing information on genome_fasta
Groovy Map containing information on genome_fasta and genome_gtf
- genome_fasta:
type: file
description: |
FASTA file containing the genome sequence
- - meta2:
type: map
description: |
Groovy Map containing information on genome_gtf
FASTA file containing the genome sequence.
It conflicts with transcript_fasta.
When transcript_fasta is provided, it must be empty (provided as []).
When transcript_fasta is empty, it must be provided together with its corresponding genome_gtf file.
- genome_gtf:
type: file
description: |
GTF file containing transcript annotations. Optional if transcript FASTA file is provided.
- - meta3:
GTF file containing gene annotations.
It conflicts with transcript_fasta.
When transcript_fasta is provided, it must be empty (provided as []).
When transcript_fasta is empty, it must be provided together with its corresponding genome_fasta file.
- - meta2:
type: map
description: |
Groovy Map containing information on transcript_fasta
- transcript_fasta:
type: file
description: |
FASTA file containing the transcript sequences. Optional if transcript GTF file is provided.
FASTA file containing the transcript sequences to build index directly on.
It conflicts with genome_gtf and genome_fasta.
When genome_gtf and genome_fasta are provided, it must be empty (provided as []).
output:
- index:
- meta:
type: map
description: |
Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
Groovy Map containing information on the index generated by simpleaf
- ${prefix}/index:
type: directory
type: map
description: |
Folder containing the Salmon index files
pattern: "salmon/index"
- transcript_tsv:
Groovy Map containing information on the index generated by simpleaf
- ref:
- meta:
type: map
description: |
Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
- ${prefix}/ref/t2g_3col.tsv:
type: file
Groovy Map containing information on the transcriptomic reference constructed by simpleaf.
- ${prefix}/ref:
type: map
description: |
Transcript-to-gene mapping file in 3-column TSV format
pattern: "salmon/ref/*_t2g_3col.tsv"
- salmon:
Groovy Map containing information on the transcriptomic reference constructed by simpleaf.
- t2g:
- meta:
type: map
type: file
description: |
Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used)
- ${prefix}:
type: directory
Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant.
- ${prefix}/ref/{t2g,t2g_3col}.tsv:
type: file
description: |
Folder containing the Salmon files
pattern: "salmon"
Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant.
- versions:
- versions.yml:
type: file
Expand All @@ -81,9 +81,11 @@ authors:
- "@Khajidu"
- "@apeltzer"
- "@pinin4fjords"
- "@dongzehe"
maintainers:
- "@fmalmeida"
- "@maxulysse"
- "@Khajidu"
- "@apeltzer"
- "@pinin4fjords"
- "@dongzehe"
57 changes: 34 additions & 23 deletions modules/nf-core/simpleaf/index/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ nextflow_process {
tag "simpleaf"
tag "simpleaf/index"

// test piscem
test("Homo sapiens - genome index - expanded - fasta + gtf") {

when {
Expand All @@ -18,23 +19,28 @@ nextflow_process {
gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true)
meta = [ 'id': 'human_genome']
input[0] = Channel.of([ meta, genome_fasta ])
input[1] = Channel.of([ meta, gtf ])
input[2] = Channel.of([[],[]])
input[0] = Channel.of([ meta, genome_fasta, gtf ])
input[1] = Channel.of([[],[]])
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(
path("${process.out.index[0][1]}/ctg_offsets.bin"),
path("${process.out.index[0][1]}/duplicate_clusters.tsv"),
path("${process.out.index[0][1]}/mphf.bin"),
process.out.versions)
.match() }
{ assert snapshot(process.out.versions).match() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() },
{ assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() },
{ assert file("${process.out.ref.get(0).get(1)}/roers_ref.fa").exists() },
{ assert file("${process.out.ref.get(0).get(1)}/t2g_3col.tsv").exists() },
{ assert file("${process.out.ref.get(0).get(1)}/gene_id_to_name.tsv").exists() },
{ assert file("${process.out.ref.get(0).get(1)}/roers_make-ref.json").exists() },
{ assert file("${process.out.t2g.get(0).get(1)}").exists() },
)
}

Expand All @@ -48,22 +54,29 @@ nextflow_process {
transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)
meta = [ 'id': 'human_transcriptome']
input[0] = Channel.of([[],[]])
input[1] = Channel.of([[],[]])
input[2] = Channel.of([ meta, transcriptome_fasta ])
input[0] = Channel.of([[],[],[]])
input[1] = Channel.of([ meta, transcriptome_fasta ])
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(
path("${process.out.index[0][1]}/ctg_offsets.bin"),
path("${process.out.index[0][1]}/duplicate_clusters.tsv"),
path("${process.out.index[0][1]}/mphf.bin"),
process.out.versions)
.match() }
{ assert snapshot(process.out.versions).match() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() },
{ assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() },
{ assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() }
// { assert snapshot(
// path("${process.out.index.get(0).get(1)}/piscem_idx.ctab"),
// path("${process.out.index.get(0).get(1)}/piscem_idx.json"),
// path("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json"),
// process.out.versions)
// .match() }
)
}
}
Expand All @@ -76,9 +89,8 @@ nextflow_process {
transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true)
meta = [ 'id': 'human_transcriptome']
input[0] = Channel.of([[],[]])
input[1] = Channel.of([[],[]])
input[2] = Channel.of([ meta, transcriptome_fasta ])
input[0] = Channel.of([[],[],[]])
input[1] = Channel.of([ meta, transcriptome_fasta ])
"""
}
}
Expand All @@ -90,5 +102,4 @@ nextflow_process {
)
}
}

}
Loading

0 comments on commit c052bab

Please sign in to comment.