diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d13bc92c..5b5e0e7ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## v3.13.0dev - [date]
+## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17
### Credits
@@ -30,8 +30,12 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
- [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
- [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification
-- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8)
+- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
- [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting
+- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom
+- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py`
+- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
+ Ensembl genome
### Software dependencies
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 100536f4e..4085d95e3 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
report_comment: >
- This report has been generated by the nf-core/rnaseq
+ This report has been generated by the nf-core/rnaseq
analysis pipeline. For information about how to interpret these results, please see the
- documentation.
+ documentation.
report_section_order:
"nf-core-rnaseq-methods-description":
order: -1000
diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py
new file mode 100755
index 000000000..3d09bd859
--- /dev/null
+++ b/bin/filter_gtf.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import logging
+import argparse
+import re
+import statistics
+from typing import Set
+
+# Create a logger
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("fasta_gtf_filter")
+logger.setLevel(logging.INFO)
+
+
+def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
+ """Extracts the sequence names from a FASTA file."""
+ with open(fasta_name) as fasta:
+ return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")}
+
+
+def tab_delimited(file: str) -> float:
+ """Check if file is tab-delimited and return median number of tabs."""
+ with open(file, "r") as f:
+ data = f.read(1024)
+ return statistics.median(line.count("\t") for line in data.split("\n"))
+
+
+def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
+ """Filter GTF file based on FASTA sequence names."""
+ if tab_delimited(gtf_in) != 8:
+ raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.")
+
+ seq_names_in_genome = extract_fasta_seq_names(fasta)
+ logger.info(f"Extracted chromosome sequence names from {fasta}")
+ logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
+
+ seq_names_in_gtf = set()
+ try:
+ with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
+ line_count = 0
+ for line in gtf:
+ seq_name = line.split("\t")[0]
+ seq_names_in_gtf.add(seq_name) # Add sequence name to the set
+
+ if seq_name in seq_names_in_genome:
+ if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
+ out.write(line)
+ line_count += 1
+
+ if line_count == 0:
+ raise ValueError("All GTF lines removed by filters")
+
+ except IOError as e:
+ logger.error(f"File operation failed: {e}")
+ return
+
+ logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
+ logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
+ parser.add_argument("--gtf", type=str, required=True, help="GTF file")
+ parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
+ parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
+ parser.add_argument(
+ "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
+ )
+
+ args = parser.parse_args()
+ filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check)
diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py
deleted file mode 100755
index 9f876eaa0..000000000
--- a/bin/filter_gtf_for_genes_in_genome.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-import logging
-from itertools import groupby
-import argparse
-
-# Create a logger
-logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
-logger = logging.getLogger(__file__)
-logger.setLevel(logging.INFO)
-
-
-def is_header(line):
- return line[0] == ">"
-
-
-def extract_fasta_seq_names(fasta_name):
- """
- modified from Brent Pedersen
- Correct Way To Parse A Fasta File In Python
- given a fasta file. yield tuples of header, sequence
- from https://www.biostars.org/p/710/
- """
- # first open the file outside
- fh = open(fasta_name)
-
- # ditch the boolean (x[0]) and just keep the header or sequence since
- # we know they alternate.
- faiter = (x[1] for x in groupby(fh, is_header))
-
- for i, header in enumerate(faiter):
- line = next(header)
- if is_header(line):
- # drop the ">"
- headerStr = line[1:].strip().split()[0]
- yield headerStr
-
-
-def extract_genes_in_genome(fasta, gtf_in, gtf_out):
- seq_names_in_genome = set(extract_fasta_seq_names(fasta))
- logger.info("Extracted chromosome sequence names from : %s" % fasta)
- logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome)))
- seq_names_in_gtf = set([])
-
- n_total_lines = 0
- n_lines_in_genome = 0
- with open(gtf_out, "w") as f:
- with open(gtf_in) as g:
- for line in g.readlines():
- n_total_lines += 1
- seq_name_gtf = line.split("\t")[0]
- seq_names_in_gtf.add(seq_name_gtf)
- if seq_name_gtf in seq_names_in_genome:
- n_lines_in_genome += 1
- f.write(line)
- logger.info(
- "Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta)
- )
- logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf)))
-
- logger.info("Wrote matching lines to %s" % gtf_out)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""")
- parser.add_argument("--gtf", type=str, help="GTF file")
- parser.add_argument("--fasta", type=str, help="Genome fasta file")
- parser.add_argument(
- "-o",
- "--output",
- dest="output",
- default="genes_in_genome.gtf",
- type=str,
- help="GTF features on fasta genome sequences",
- )
-
- args = parser.parse_args()
- extract_genes_in_genome(args.fasta, args.gtf, args.output)
diff --git a/conf/modules.config b/conf/modules.config
index 5b46659cf..6e28f549b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -117,7 +117,8 @@ process {
]
}
- withName: 'GTF_GENE_FILTER' {
+ withName: 'GTF_FILTER' {
+ ext.args = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' }
publishDir = [
path: { "${params.outdir}/genome" },
mode: params.publish_dir_mode,
diff --git a/docs/usage.md b/docs/usage.md
index 0f3050907..524acba9b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -198,6 +198,10 @@ Notes:
- As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).
+### GTF filtering
+
+By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
+
## Running the pipeline
The typical command for running the pipeline is as follows:
diff --git a/modules.json b/modules.json
index d06dfe193..84d48d87f 100644
--- a/modules.json
+++ b/modules.json
@@ -71,8 +71,8 @@
"installed_by": ["modules"]
},
"kallisto/quant": {
- "branch": "kallisto_updates",
- "git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84",
+ "branch": "master",
+ "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad",
"installed_by": ["modules"]
},
"picard/markduplicates": {
diff --git a/modules/local/gtf_gene_filter/main.nf b/modules/local/gtf_filter/main.nf
similarity index 67%
rename from modules/local/gtf_gene_filter/main.nf
rename to modules/local/gtf_filter/main.nf
index cd8e16adb..d14e8ff42 100644
--- a/modules/local/gtf_gene_filter/main.nf
+++ b/modules/local/gtf_filter/main.nf
@@ -1,4 +1,4 @@
-process GTF_GENE_FILTER {
+process GTF_FILTER {
tag "$fasta"
conda "conda-forge::python=3.9.5"
@@ -11,18 +11,18 @@ process GTF_GENE_FILTER {
path gtf
output:
- path "*.gtf" , emit: gtf
- path "versions.yml", emit: versions
+ path "*.filtered.gtf", emit: genome_gtf
+ path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
- script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
+ script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
"""
- filter_gtf_for_genes_in_genome.py \\
+ filter_gtf.py \\
--gtf $gtf \\
--fasta $fasta \\
- -o ${fasta.baseName}_genes.gtf
+ --prefix ${fasta.baseName}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf
index 7ecc28f87..a59d8a533 100644
--- a/modules/local/multiqc/main.nf
+++ b/modules/local/multiqc/main.nf
@@ -3,8 +3,8 @@ process MULTIQC {
conda "bioconda::multiqc=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
- 'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' :
+ 'biocontainers/multiqc:1.17--pyhdfd78af_1' }"
input:
path multiqc_config
diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf
index 2ad0bb78b..f5444d791 100644
--- a/modules/nf-core/kallisto/quant/main.nf
+++ b/modules/nf-core/kallisto/quant/main.nf
@@ -59,7 +59,7 @@ process KALLISTO_QUANT {
-o $prefix \\
${reads} 2> >(tee -a ${prefix}/kallisto_quant.log >&2)
- cp ${prefix}/kallisto_quant.log ${prefix}.log
+ cp ${prefix}/kallisto_quant.log ${prefix}.log
cp ${prefix}/run_info.json ${prefix}.run_info.json
cat <<-END_VERSIONS > versions.yml
diff --git a/nextflow.config b/nextflow.config
index 2c8b29691..ab539161f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,8 @@ params {
splicesites = null
gtf_extra_attributes = 'gene_name'
gtf_group_features = 'gene_id'
+ skip_gtf_filter = false
+ skip_gtf_transcript_filter = false
featurecounts_feature_type = 'exon'
featurecounts_group_type = 'gene_biotype'
gencode = false
@@ -315,7 +317,7 @@ manifest {
description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control."""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
- version = '3.13.0dev'
+ version = '3.13.0'
doi = 'https://doi.org/10.5281/zenodo.1400710'
}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ddbe77de4..60e6585cf 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -520,6 +520,17 @@
"fa_icon": "fas fa-fast-forward",
"description": "Options to skip various steps within the workflow.",
"properties": {
+ "skip_gtf_filter": {
+ "type": "boolean",
+ "fa_icon": "fas fa-forward",
+ "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.",
+ "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option."
+ },
+ "skip_gtf_transcript_filter": {
+ "type": "boolean",
+ "fa_icon": "fas fa-forward",
+ "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline."
+ },
"skip_bbsplit": {
"type": "boolean",
"default": true,
diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
index ca598a199..0be947954 100644
--- a/subworkflows/local/prepare_genome/main.nf
+++ b/subworkflows/local/prepare_genome/main.nf
@@ -30,7 +30,7 @@ include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../..
include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode'
include { GTF2BED } from '../../../modules/local/gtf2bed'
include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta'
-include { GTF_GENE_FILTER } from '../../../modules/local/gtf_gene_filter'
+include { GTF_FILTER } from '../../../modules/local/gtf_filter'
include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes'
workflow PREPARE_GENOME {
@@ -53,6 +53,7 @@ workflow PREPARE_GENOME {
is_aws_igenome // boolean: whether the genome files are from AWS iGenomes
biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file
prepare_tool_indices // list: tools to prepare indices for
+ filter_gtf // boolean: whether to filter GTF file
main:
@@ -71,22 +72,30 @@ workflow PREPARE_GENOME {
//
// Uncompress GTF annotation file or create from GFF3 if required
//
- if (gtf) {
- if (gtf.endsWith('.gz')) {
- ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
- ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
- } else {
- ch_gtf = Channel.value(file(gtf))
+ if (gtf || gff) {
+ if (gtf) {
+ if (gtf.endsWith('.gz')) {
+ ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
+ ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
+ } else {
+ ch_gtf = Channel.value(file(gtf))
+ }
+ } else if (gff) {
+ if (gff.endsWith('.gz')) {
+ ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
+ ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
+ } else {
+ ch_gff = Channel.value(file(gff))
+ }
+ ch_gtf = GFFREAD ( ch_gff ).gtf
+ ch_versions = ch_versions.mix(GFFREAD.out.versions)
}
- } else if (gff) {
- if (gff.endsWith('.gz')) {
- ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
- ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
- } else {
- ch_gff = Channel.value(file(gff))
+
+ if (filter_gtf) {
+ GTF_FILTER ( ch_fasta, ch_gtf )
+ ch_gtf = GTF_FILTER.out.genome_gtf
+ ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
}
- ch_gtf = GFFREAD ( ch_gff ).gtf
- ch_versions = ch_versions.mix(GFFREAD.out.versions)
}
//
@@ -136,9 +145,8 @@ workflow PREPARE_GENOME {
ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
}
} else {
- ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf
- ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta
- ch_versions = ch_versions.mix(GTF_GENE_FILTER.out.versions)
+ ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
+ ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
}
@@ -293,6 +301,5 @@ workflow PREPARE_GENOME {
hisat2_index = ch_hisat2_index // channel: path(hisat2/index/)
salmon_index = ch_salmon_index // channel: path(salmon/index/)
kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ]
-
versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
}
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
index 66ee061f5..403194ac2 100755
--- a/workflows/rnaseq.nf
+++ b/workflows/rnaseq.nf
@@ -39,6 +39,25 @@ if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' }
if (!params.skip_alignment) { prepareToolIndices << params.aligner }
if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner }
+// Determine whether to filter the GTF or not
+def filterGtf =
+ ((
+ // Condition 1: Alignment is required and aligner is set
+ !params.skip_alignment && params.aligner
+ ) ||
+ (
+ // Condition 2: Pseudoalignment is required and pseudoaligner is set
+ !params.skip_pseudo_alignment && params.pseudo_aligner
+ ) ||
+ (
+ // Condition 3: Transcript FASTA file is not provided
+ !params.transcript_fasta
+ )) &&
+ (
+ // Condition 4: --skip_gtf_filter is not provided
+ !params.skip_gtf_filter
+ )
+
// Get RSeqC modules to run
def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : []
if (params.bam_csi_index) {
@@ -175,7 +194,8 @@ workflow RNASEQ {
params.gencode,
is_aws_igenome,
biotype,
- prepareToolIndices
+ prepareToolIndices,
+ filterGtf
)
ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)