nf-core · drpatelh · Nov 15, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Special thanks to the following for their contributions to the release:
 - [Júlia Mir Pedrol](https://github.com/mirpedrol)
 - [Matthias Zepper](https://github.com/MatthiasZepper)
 - [Maxime Garcia](https://github.com/maxulysse)
+- [Jonathan Manning](https://github.com/pinin4fjords)
 
 Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form.
 
@@ -28,6 +29,10 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders
 - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10
 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability
+- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage
+- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for filter_gtf_for_genes_in_genome.py
+- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields
+- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom Ensembl genome
 
 ### Software dependencies
 

diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+from __future__ import print_function
+import logging
+from itertools import groupby
+import argparse
+import re
+
+# Create a logger named after this script's path; it reports INFO and above in the format configured here
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger(__file__)
+logger.setLevel(logging.INFO)
+
+
+def is_header(line: str) -> bool:
+    """Returns True if the given line is a FASTA header line, i.e. starts with ">"; empty lines are not headers."""
+    return line.startswith(">")
+
+
+def extract_fasta_seq_names(fasta_name: str) -> set:
+    """Extracts the sequence names from a FASTA file.
+
+    The name of a sequence is the first whitespace-delimited token after the
+    ">" of its header line. Each header line is inspected on its own, so
+    records without sequence lines are still reported; a bare ">" is skipped.
+    Originally adapted from https://www.biostars.org/p/710/ (Brent Pedersen).
+
+    Args:
+      fasta_name: The path to the FASTA file.
+
+    Returns:
+      A set of the sequence names in the FASTA file.
+    """
+
+    seq_names = set()
+
+    # the context manager closes the handle even if reading fails part-way
+    with open(fasta_name) as fh:
+        for line in fh:
+            if not line.startswith(">"):
+                continue
+            # drop the ">" and keep only the ID, not the free-text description
+            fields = line[1:].split()
+            if fields:
+                seq_names.add(fields[0])
+
+    return seq_names
+
+
+def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None:
+    """Extracts the genes in the genome from a GTF file.
+
+    Blank and "#" comment lines are not records; the tab check uses the first record.
+
+    Args:
+      fasta: The path to the FASTA file.
+      gtf_in: The path to the input GTF file.
+      gtf_out: The path to the output GTF file.
+
+    Raises:
+      ValueError: If no overlap is found or if the GTF file is not tab delimited.
+    """
+
+    def is_record(line):  # blank lines and "#" comment/pragma lines hold no feature
+        return bool(line.strip()) and not line.startswith("#")
+
+    with open(gtf_in) as gtf:
+        if "\t" not in next(filter(is_record, gtf), ""):
+            raise ValueError("The GTF file is not tab delimited.")
+        gtf.seek(0)  # Reset file pointer to the start of the file
+        seq_names_in_gtf = {line.split("\t")[0] for line in gtf if is_record(line)}
+
+    seq_names_in_genome = set(extract_fasta_seq_names(fasta))
+    logger.info(f"Extracted chromosome sequence names from {fasta}")
+    logger.info("All chromosome names: " + ", ".join(sorted(seq_names_in_genome)))
+
+    overlap = seq_names_in_genome & seq_names_in_gtf
+    if not overlap:  # raised before gtf_out is opened, so no empty output is left behind
+        raise ValueError("No overlapping scaffolds found.")
+
+    with open(gtf_in) as gtf, open(gtf_out, "w") as out:
+        out.writelines(line for line in gtf if line.split("\t")[0] in overlap)
+
+    logger.info(f"Extracted {len(overlap)} matching sequences from {gtf_in} into {gtf_out}")
+    logger.info("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
+    logger.info(f"Wrote matching lines to {gtf_out}")
+
+
+def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None:
+    """
+    Copies a GTF file, keeping only the lines that carry a non-empty transcript_id attribute.
+
+    Args:
+      gtf_in: Path to the input GTF file.
+      gtf_out: The path to the output GTF file.
+    """
+
+    # a line is kept only when transcript_id is followed by a non-empty quoted value
+    has_transcript_id = re.compile(r'transcript_id "([^"]+)"').search
+
+    with open(gtf_in, "r") as source, open(gtf_out, "w") as kept:
+        kept.writelines(filter(has_transcript_id, source))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""")
+    # Both inputs are mandatory: without them the filters would fail on a None path,
+    # so let argparse reject the call with a usage message instead.
+    parser.add_argument("--gtf", type=str, required=True, help="GTF file")
+    parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
+    parser.add_argument(
+        "-p", "--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files"
+    )
+
+    args = parser.parse_args()
+    genome_gtf = args.prefix + "_in_genome.gtf"
+
+    # The second filter reads the output of the first, so both output files are always produced.
+    extract_genes_in_genome(args.fasta, args.gtf, genome_gtf)
+    remove_features_without_transcript_id(genome_gtf, args.prefix + "_with_transcript_ids.gtf")
diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py
diff --git a/conf/modules.config b/conf/modules.config
@@ -108,7 +108,7 @@ process {
         ]
     }
 
-    withName: 'GTF_GENE_FILTER' {
+    withName: 'GTF_FILTER' {
         publishDir = [
             path: { "${params.outdir}/genome" },
             mode: params.publish_dir_mode,
@@ -155,13 +155,15 @@ process {
         ext.args   = '--record-count 1000000 --seed 1'
         ext.prefix = { "${meta.id}.subsampled" }
         publishDir = [
+            mode: params.publish_dir_mode,
             enabled: false
         ]
     }
 
     withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' {
         ext.args   = '--skipQuant'
         publishDir = [
+            mode: params.publish_dir_mode,
             enabled: false
         ]
     }

diff --git a/modules/local/gtf_gene_filter/main.nf → modules/local/gtf_filter/main.nf b/modules/local/gtf_gene_filter/main.nf → modules/local/gtf_filter/main.nf
@@ -1,4 +1,4 @@
-process GTF_GENE_FILTER {
+process GTF_FILTER {
     tag "$fasta"
 
     conda "conda-forge::python=3.9.5"
@@ -11,18 +11,19 @@ process GTF_GENE_FILTER {
     path gtf
 
     output:
-    path "*.gtf"       , emit: gtf
-    path "versions.yml", emit: versions
+    path "*_in_genome.gtf"           , emit: genome_gtf
+    path "*_with_transcript_ids.gtf" , emit: transcript_id_gtf
+    path "versions.yml"              , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
-    script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/
+    script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/
     """
-    filter_gtf_for_genes_in_genome.py \\
+    filter_gtf.py \\
         --gtf $gtf \\
         --fasta $fasta \\
-        -o ${fasta.baseName}_genes.gtf
+        --prefix ${fasta.baseName}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf
@@ -1,10 +1,10 @@
 process MULTIQC {
     label 'process_medium'
 
-    conda "bioconda::multiqc=1.15"
+    conda "bioconda::multiqc=1.17"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' :
-        'biocontainers/multiqc:1.15--pyhdfd78af_0' }"
+        'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' :
+        'biocontainers/multiqc:1.17--pyhdfd78af_0' }"
 
     input:
     path multiqc_config

diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
@@ -28,7 +28,7 @@ include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA       } from '../../..
 include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode'
 include { GTF2BED                              } from '../../../modules/local/gtf2bed'
 include { CAT_ADDITIONAL_FASTA                 } from '../../../modules/local/cat_additional_fasta'
-include { GTF_GENE_FILTER                      } from '../../../modules/local/gtf_gene_filter'
+include { GTF_FILTER                           } from '../../../modules/local/gtf_filter'
 include { STAR_GENOMEGENERATE_IGENOMES         } from '../../../modules/local/star_genomegenerate_igenomes'
 
 workflow PREPARE_GENOME {
@@ -68,22 +68,31 @@ workflow PREPARE_GENOME {
     //
     // Uncompress GTF annotation file or create from GFF3 if required
     //
-    if (gtf) {
-        if (gtf.endsWith('.gz')) {
-            ch_gtf      = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
-        } else {
-            ch_gtf = Channel.value(file(gtf))
-        }
-    } else if (gff) {
-        if (gff.endsWith('.gz')) {
-            ch_gff      = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
-            ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
-        } else {
-            ch_gff = Channel.value(file(gff))
+    if (gtf || gff) {
+        if (gtf) {
+            if (gtf.endsWith('.gz')) {
+                ch_gtf      = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
+            } else {
+                ch_gtf = Channel.value(file(gtf))
+            }
+        } else if (gff) {
+            if (gff.endsWith('.gz')) {
+                ch_gff      = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
+            } else {
+                ch_gff = Channel.value(file(gff))
+            }
+            ch_gtf      = GFFREAD ( ch_gff ).gtf
+            ch_versions = ch_versions.mix(GFFREAD.out.versions)
         }
-        ch_gtf      = GFFREAD ( ch_gff ).gtf
-        ch_versions = ch_versions.mix(GFFREAD.out.versions)
+
+        //
+        // Apply filtering we may need for GTFs
+        //
+        GTF_FILTER ( ch_fasta, ch_gtf )
+        ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf
+        ch_gtf_genome = GTF_FILTER.out.genome_gtf
     }
 
     //
@@ -133,9 +142,8 @@ workflow PREPARE_GENOME {
             ch_versions         = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
         }
     } else {
-        ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf
-        ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta
-        ch_versions         = ch_versions.mix(GTF_GENE_FILTER.out.versions)
+        ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf_genome ).transcript_fasta
+        ch_versions         = ch_versions.mix(GTF_FILTER.out.versions)
         ch_versions         = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
     }
 
@@ -259,18 +267,19 @@ workflow PREPARE_GENOME {
     }
 
     emit:
-    fasta            = ch_fasta                  // channel: path(genome.fasta)
-    gtf              = ch_gtf                    // channel: path(genome.gtf)
-    fai              = ch_fai                    // channel: path(genome.fai)
-    gene_bed         = ch_gene_bed               // channel: path(gene.bed)
-    transcript_fasta = ch_transcript_fasta       // channel: path(transcript.fasta)
-    chrom_sizes      = ch_chrom_sizes            // channel: path(genome.sizes)
-    splicesites      = ch_splicesites            // channel: path(genome.splicesites.txt)
-    bbsplit_index    = ch_bbsplit_index          // channel: path(bbsplit/index/)
-    star_index       = ch_star_index             // channel: path(star/index/)
-    rsem_index       = ch_rsem_index             // channel: path(rsem/index/)
-    hisat2_index     = ch_hisat2_index           // channel: path(hisat2/index/)
-    salmon_index     = ch_salmon_index           // channel: path(salmon/index/)
+    fasta                   = ch_fasta                   // channel: path(genome.fasta)
+    gtf                     = ch_gtf_genome              // channel: path(genome.gtf)
+    fai                     = ch_fai                     // channel: path(genome.fai)
+    gene_bed                = ch_gene_bed                // channel: path(gene.bed)
+    gtf_with_transcript_ids = ch_gtf_with_transcript_ids // channel: path(gtf)
+    transcript_fasta        = ch_transcript_fasta        // channel: path(transcript.fasta)
+    chrom_sizes             = ch_chrom_sizes             // channel: path(genome.sizes)
+    splicesites             = ch_splicesites             // channel: path(genome.splicesites.txt)
+    bbsplit_index           = ch_bbsplit_index           // channel: path(bbsplit/index/)
+    star_index              = ch_star_index              // channel: path(star/index/)
+    rsem_index              = ch_rsem_index              // channel: path(rsem/index/)
+    hisat2_index            = ch_hisat2_index            // channel: path(hisat2/index/)
+    salmon_index            = ch_salmon_index            // channel: path(salmon/index/)
 
-    versions         = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
+    versions                = ch_versions.ifEmpty(null)  // channel: [ versions.yml ]
 }