Skip to content

Commit

Permalink
Merge pull request #1107 from nf-core/stringtie_gtf
Browse files Browse the repository at this point in the history
Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
  • Loading branch information
drpatelh authored Nov 15, 2023
2 parents ab9df9a + b67558d commit 7a04c5d
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 115 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v3.13.0dev - [date]
## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17

### Credits

Expand All @@ -30,8 +30,12 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
- [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
- [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification
- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8)
- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
- [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting
- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom
- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py`
- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
Ensembl genome

### Software dependencies

Expand Down
4 changes: 2 additions & 2 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
report_comment: >
This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/dev" target="_blank">nf-core/rnaseq</a>
This report has been generated by the <a href="https://github.com/nf-core/rnaseq/releases/tag/3.13.0" target="_blank">nf-core/rnaseq</a>
analysis pipeline. For information about how to interpret these results, please see the
<a href="https://nf-co.re/rnaseq/dev/docs/output" target="_blank">documentation</a>.
<a href="https://nf-co.re/rnaseq/3.13.0/docs/output" target="_blank">documentation</a>.
report_section_order:
"nf-core-rnaseq-methods-description":
order: -1000
Expand Down
70 changes: 70 additions & 0 deletions bin/filter_gtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python
import logging
import argparse
import re
import statistics
from typing import Set

# Create a logger
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("fasta_gtf_filter")
logger.setLevel(logging.INFO)


def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
"""Extracts the sequence names from a FASTA file."""
with open(fasta_name) as fasta:
return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")}


def tab_delimited(file: str) -> float:
"""Check if file is tab-delimited and return median number of tabs."""
with open(file, "r") as f:
data = f.read(1024)
return statistics.median(line.count("\t") for line in data.split("\n"))


def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
"""Filter GTF file based on FASTA sequence names."""
if tab_delimited(gtf_in) != 8:
raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.")

seq_names_in_genome = extract_fasta_seq_names(fasta)
logger.info(f"Extracted chromosome sequence names from {fasta}")
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))

seq_names_in_gtf = set()
try:
with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
line_count = 0
for line in gtf:
seq_name = line.split("\t")[0]
seq_names_in_gtf.add(seq_name) # Add sequence name to the set

if seq_name in seq_names_in_genome:
if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
out.write(line)
line_count += 1

if line_count == 0:
raise ValueError("All GTF lines removed by filters")

except IOError as e:
logger.error(f"File operation failed: {e}")
return

logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
parser.add_argument("--gtf", type=str, required=True, help="GTF file")
parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
parser.add_argument(
"--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
)

args = parser.parse_args()
filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check)
78 changes: 0 additions & 78 deletions bin/filter_gtf_for_genes_in_genome.py

This file was deleted.

3 changes: 2 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ process {
]
}

withName: 'GTF_GENE_FILTER' {
withName: 'GTF_FILTER' {
ext.args = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' }
publishDir = [
path: { "${params.outdir}/genome" },
mode: params.publish_dir_mode,
Expand Down
4 changes: 4 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ Notes:

- As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)).

### GTF filtering

By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.

## Running the pipeline

The typical command for running the pipeline is as follows:
Expand Down
4 changes: 2 additions & 2 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@
"installed_by": ["modules"]
},
"kallisto/quant": {
"branch": "kallisto_updates",
"git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84",
"branch": "master",
"git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad",
"installed_by": ["modules"]
},
"picard/markduplicates": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
process GTF_GENE_FILTER {
process GTF_FILTER {
tag "$fasta"

conda "conda-forge::python=3.9.5"
Expand All @@ -11,18 +11,18 @@ process GTF_GENE_FILTER {
path gtf

output:
path "*.gtf" , emit: gtf
path "versions.yml", emit: versions
path "*.filtered.gtf", emit: genome_gtf
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
"""
filter_gtf_for_genes_in_genome.py \\
filter_gtf.py \\
--gtf $gtf \\
--fasta $fasta \\
-o ${fasta.baseName}_genes.gtf
--prefix ${fasta.baseName}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 2 additions & 2 deletions modules/local/multiqc/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ process MULTIQC {

conda "bioconda::multiqc=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' :
'biocontainers/multiqc:1.17--pyhdfd78af_1' }"

input:
path multiqc_config
Expand Down
2 changes: 1 addition & 1 deletion modules/nf-core/kallisto/quant/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ params {
splicesites = null
gtf_extra_attributes = 'gene_name'
gtf_group_features = 'gene_id'
skip_gtf_filter = false
skip_gtf_transcript_filter = false
featurecounts_feature_type = 'exon'
featurecounts_group_type = 'gene_biotype'
gencode = false
Expand Down Expand Up @@ -315,7 +317,7 @@ manifest {
description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control."""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '3.13.0dev'
version = '3.13.0'
doi = 'https://doi.org/10.5281/zenodo.1400710'
}

Expand Down
11 changes: 11 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,17 @@
"fa_icon": "fas fa-fast-forward",
"description": "Options to skip various steps within the workflow.",
"properties": {
"skip_gtf_filter": {
"type": "boolean",
"fa_icon": "fas fa-forward",
"description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.",
"help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option."
},
"skip_gtf_transcript_filter": {
"type": "boolean",
"fa_icon": "fas fa-forward",
"description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline."
},
"skip_bbsplit": {
"type": "boolean",
"default": true,
Expand Down
45 changes: 26 additions & 19 deletions subworkflows/local/prepare_genome/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../..
include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode'
include { GTF2BED } from '../../../modules/local/gtf2bed'
include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta'
include { GTF_GENE_FILTER } from '../../../modules/local/gtf_gene_filter'
include { GTF_FILTER } from '../../../modules/local/gtf_filter'
include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes'

workflow PREPARE_GENOME {
Expand All @@ -53,6 +53,7 @@ workflow PREPARE_GENOME {
is_aws_igenome // boolean: whether the genome files are from AWS iGenomes
biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file
prepare_tool_indices // list: tools to prepare indices for
filter_gtf // boolean: whether to filter GTF file

main:

Expand All @@ -71,22 +72,30 @@ workflow PREPARE_GENOME {
//
// Uncompress GTF annotation file or create from GFF3 if required
//
if (gtf) {
if (gtf.endsWith('.gz')) {
ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
} else {
ch_gtf = Channel.value(file(gtf))
if (gtf || gff) {
if (gtf) {
if (gtf.endsWith('.gz')) {
ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
} else {
ch_gtf = Channel.value(file(gtf))
}
} else if (gff) {
if (gff.endsWith('.gz')) {
ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
} else {
ch_gff = Channel.value(file(gff))
}
ch_gtf = GFFREAD ( ch_gff ).gtf
ch_versions = ch_versions.mix(GFFREAD.out.versions)
}
} else if (gff) {
if (gff.endsWith('.gz')) {
ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
} else {
ch_gff = Channel.value(file(gff))

if (filter_gtf) {
GTF_FILTER ( ch_fasta, ch_gtf )
ch_gtf = GTF_FILTER.out.genome_gtf
ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
}
ch_gtf = GFFREAD ( ch_gff ).gtf
ch_versions = ch_versions.mix(GFFREAD.out.versions)
}

//
Expand Down Expand Up @@ -136,9 +145,8 @@ workflow PREPARE_GENOME {
ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
}
} else {
ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf
ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta
ch_versions = ch_versions.mix(GTF_GENE_FILTER.out.versions)
ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
}

Expand Down Expand Up @@ -293,6 +301,5 @@ workflow PREPARE_GENOME {
hisat2_index = ch_hisat2_index // channel: path(hisat2/index/)
salmon_index = ch_salmon_index // channel: path(salmon/index/)
kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ]

versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
}
Loading

0 comments on commit 7a04c5d

Please sign in to comment.