From 688d19b74fd88e5ea34b7140870a8497412fe72a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 7 Nov 2023 18:37:33 +0000 Subject: [PATCH 01/43] Add special GTF handling for stringtie --- modules/local/gtf_for_stringtie/main.nf | 30 +++++++++++++++++++++++ subworkflows/local/prepare_genome/main.nf | 13 ++++++++++ workflows/rnaseq.nf | 2 +- 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 modules/local/gtf_for_stringtie/main.nf diff --git a/modules/local/gtf_for_stringtie/main.nf b/modules/local/gtf_for_stringtie/main.nf new file mode 100644 index 000000000..b3b1db1a8 --- /dev/null +++ b/modules/local/gtf_for_stringtie/main.nf @@ -0,0 +1,30 @@ +process GTF_FOR_STRINGTIE { + tag "$gtf" + label 'process_low' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + path gtf + + output: + path '*_for_stringtie.gtf' , emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + """ + # Remove gene entries from a GTF, if they don't contain a transcript_id + # attribute, which is common in Ensembl GTFs + awk '$3 == "\gene" { print; next } /transcript_id/ && \$0 ~ /transcript_id "[^"]+"/' $gtf > ${gtf}_for_stringtie.gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //')) + END_VERSIONS + """ +} diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index eceb2c1ce..8381fc545 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -30,6 +30,7 @@ include { GTF2BED } from '../../../modules/local/gt include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta' include { GTF_GENE_FILTER } from '../../../modules/local/gtf_gene_filter' include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' +include { GTF_FOR_STRINGTIE } from '../../../modules/local/gtf_for_stringtie' workflow PREPARE_GENOME { take: @@ -117,6 +118,17 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(GTF2BED.out.versions) } + // + // Prepare a GTF for StringTie + // + + ch_gtf_for_stringtie = Channel.empty() + if (!params.skip_alignment && !params.skip_stringtie) { + GTF_FOR_STRINGTIE( ch_gtf ) + ch_gtf_for_stringtie = GTF_FOR_STRINGTIE.out.gtf + ch_versions = ch_versions.mix(GTF_FOR_STRINGTIE.out.versions) + } + // // Uncompress transcript fasta file / create if required // @@ -263,6 +275,7 @@ workflow PREPARE_GENOME { gtf = ch_gtf // channel: path(genome.gtf) fai = ch_fai // channel: path(genome.fai) gene_bed = ch_gene_bed // channel: path(gene.bed) + gtf_for_stringtie= ch_gtf_for_stringtie // channel: path(gtf) transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) splicesites = ch_splicesites // channel: path(genome.splicesites.txt) diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 955a727c3..4a13cb3ee 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -648,7 +648,7 @@ workflow RNASEQ { if (!params.skip_alignment && !params.skip_stringtie) { STRINGTIE_STRINGTIE ( ch_genome_bam, - PREPARE_GENOME.out.gtf + PREPARE_GENOME.out.gtf_for_stringtie ) ch_versions = ch_versions.mix(STRINGTIE_STRINGTIE.out.versions.first()) } From c6d354df5f66cb8a48bd06dae0d8ed0ffba806dc Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 7 Nov 2023 18:39:30 +0000 Subject: [PATCH 02/43] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a235dd00f..e64510937 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability +- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Prepare a GTF for stringtie ### Software dependencies From 82b67e16202685ee7907536773fa78b825cd5e3f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 7 Nov 2023 18:40:00 +0000 Subject: [PATCH 03/43] appease eclint --- modules/local/gtf_for_stringtie/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/gtf_for_stringtie/main.nf b/modules/local/gtf_for_stringtie/main.nf index b3b1db1a8..dd25b632d 100644 --- a/modules/local/gtf_for_stringtie/main.nf +++ b/modules/local/gtf_for_stringtie/main.nf @@ -16,7 +16,7 @@ process GTF_FOR_STRINGTIE { when: task.ext.when == null || task.ext.when - + """ # Remove gene entries from a GTF, if they don't contain a transcript_id # attribute, which is common in Ensembl GTFs From 5dbbe0c96c97ee14db3bc1b2f480f2b974bad102 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 7 Nov 2023 18:41:04 +0000 Subject: [PATCH 04/43] Comment dollar --- modules/local/gtf_for_stringtie/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/gtf_for_stringtie/main.nf b/modules/local/gtf_for_stringtie/main.nf index dd25b632d..2dadbc24f 100644 --- a/modules/local/gtf_for_stringtie/main.nf +++ b/modules/local/gtf_for_stringtie/main.nf @@ -20,7 +20,7 @@ process GTF_FOR_STRINGTIE { """ # Remove gene entries from a GTF, if they don't contain a transcript_id # attribute, which is common in Ensembl GTFs - awk '$3 == "\gene" { print; next } /transcript_id/ && \$0 ~ /transcript_id "[^"]+"/' $gtf > ${gtf}_for_stringtie.gtf + awk '\$3 == "gene" { print; next } /transcript_id/ && \$0 ~ /transcript_id "[^"]+"/' $gtf > ${gtf}_for_stringtie.gtf cat <<-END_VERSIONS > versions.yml "${task.process}": From 124c5701d472f0b29af6298b46d664073a9ac76f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 3 Nov 2023 10:31:10 +0000 Subject: [PATCH 05/43] Fix multiqc to avoid imp module error --- modules/local/multiqc/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf index 44565a9f7..06fbb9b29 100644 --- a/modules/local/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.15" + conda "bioconda::multiqc=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_config From 57d4bde66d329271ca02fa3e8c6ba9792ff3585d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 12:58:22 +0000 Subject: [PATCH 06/43] Actually, unify GTF filtering --- bin/filter_gtf_for_genes_in_genome.py | 56 +++++++++++++++++------ conf/test.config | 2 +- modules/local/gtf_for_stringtie/main.nf | 30 ------------ modules/local/gtf_gene_filter/main.nf | 5 +- subworkflows/local/prepare_genome/main.nf | 50 +++++++++----------- workflows/rnaseq.nf | 4 +- 6 files changed, 71 insertions(+), 76 deletions(-) delete mode 100644 modules/local/gtf_for_stringtie/main.nf diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py index 9f876eaa0..cb2002a02 100755 --- a/bin/filter_gtf_for_genes_in_genome.py +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -9,18 +9,25 @@ logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) - -def is_header(line): +def is_header(line: str) -> bool: + """Returns True if the given line is a header line in a FASTA file.""" return line[0] == ">" - -def extract_fasta_seq_names(fasta_name): - """ +def extract_fasta_seq_names(fasta_name: str) -> set: + """Extracts the sequence names from a FASTA file. + modified from Brent Pedersen Correct Way To Parse A Fasta File In Python given a fasta file. yield tuples of header, sequence from https://www.biostars.org/p/710/ + + Args: + fasta_name: The path to the FASTA file. + + Returns: + A set of the sequence names in the FASTA file. """ + # first open the file outside fh = open(fasta_name) @@ -35,8 +42,15 @@ def extract_fasta_seq_names(fasta_name): headerStr = line[1:].strip().split()[0] yield headerStr +def extract_genes_in_genome(fasta: str, gtf_in: str, prefix: str) -> None: + """Extracts the genes in the genome from a GTF file. -def extract_genes_in_genome(fasta, gtf_in, gtf_out): + Args: + fasta: The path to the FASTA file. + gtf_in: The path to the input GTF file. + prefix: Prefix for output GTF + """ + gtf_out = prefix + '_in_genome.gtf' seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info("Extracted chromosome sequence names from : %s" % fasta) logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) @@ -60,19 +74,35 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out): logger.info("Wrote matching lines to %s" % gtf_out) +def remove_features_without_transcript_id(gtf_in, prefix): + """ + Removes gene rows with absent or empty transcript_id attributes from a GTF file. + + Args: + gtf_in: Path to the input GTF file. + prefix: Path to the output GTF file. + """ + gtf_out = prefix + '_with_transcript_ids.gtf' + + with open(gtf_in, "r") as f_in, open(gtf_out, "w") as f_out: + for line in f_in: + transcript_id = line.split("\t")[8].split(" transcript_id ")[1].split(";")[0].replace('"', '') + if transcript_id and transcript_id.isalnum(): + f_out.write(line) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""") + parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""") parser.add_argument("--gtf", type=str, help="GTF file") parser.add_argument("--fasta", type=str, help="Genome fasta file") parser.add_argument( - "-o", - "--output", - dest="output", - default="genes_in_genome.gtf", + "-p", + "--prefix", + dest="prefix", + default="genes", type=str, - help="GTF features on fasta genome sequences", + help="Prefix for output GTF files", ) args = parser.parse_args() - extract_genes_in_genome(args.fasta, args.gtf, args.output) + extract_genes_in_genome(args.fasta, args.gtf, args.prefix) + remove_features_without_transcript_id(args.gtf, args.prefix) diff --git a/conf/test.config b/conf/test.config index d0430eacb..f78e4e567 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,7 +21,7 @@ params { // Input data - input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_test.csv" + input = "/Users/jonathan.manning/projects/rnaseq/samplesheet_test.csv" // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fasta" diff --git a/modules/local/gtf_for_stringtie/main.nf b/modules/local/gtf_for_stringtie/main.nf deleted file mode 100644 index 2dadbc24f..000000000 --- a/modules/local/gtf_for_stringtie/main.nf +++ /dev/null @@ -1,30 +0,0 @@ -process GTF_FOR_STRINGTIE { - tag "$gtf" - label 'process_low' - - conda "conda-forge::gawk=5.1.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'biocontainers/gawk:5.1.0' }" - - input: - path gtf - - output: - path '*_for_stringtie.gtf' , emit: gtf - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - """ - # Remove gene entries from a GTF, if they don't contain a transcript_id - # attribute, which is common in Ensembl GTFs - awk '\$3 == "gene" { print; next } /transcript_id/ && \$0 ~ /transcript_id "[^"]+"/' $gtf > ${gtf}_for_stringtie.gtf - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //')) - END_VERSIONS - """ -} diff --git a/modules/local/gtf_gene_filter/main.nf b/modules/local/gtf_gene_filter/main.nf index cd8e16adb..b4e0578d7 100644 --- a/modules/local/gtf_gene_filter/main.nf +++ b/modules/local/gtf_gene_filter/main.nf @@ -11,8 +11,9 @@ process GTF_GENE_FILTER { path gtf output: - path "*.gtf" , emit: gtf - path "versions.yml", emit: versions + path "*_in_genome.gtf" , emit: genome_gtf + path "*_with_transcript_ids.gtf" , emit: transcript_id_gtf + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 8381fc545..cec93e86b 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -30,7 +30,6 @@ include { GTF2BED } from '../../../modules/local/gt include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta' include { GTF_GENE_FILTER } from '../../../modules/local/gtf_gene_filter' include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' -include { GTF_FOR_STRINGTIE } from '../../../modules/local/gtf_for_stringtie' workflow PREPARE_GENOME { take: @@ -87,6 +86,13 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(GFFREAD.out.versions) } + // + // Apply filtering we may need for GTFs + // + GTF_GENE_FILTER ( ch_fasta, ch_gtf ) + ch_gtf_with_transcript_ids = GTF_GENE_FILTER.out.transcript_id_gtf + ch_gtf_genome = GTF_GENE_FILTER.out.genome_gtf + // // Uncompress additional fasta file and concatenate with reference fasta and gtf files // @@ -118,17 +124,6 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(GTF2BED.out.versions) } - // - // Prepare a GTF for StringTie - // - - ch_gtf_for_stringtie = Channel.empty() - if (!params.skip_alignment && !params.skip_stringtie) { - GTF_FOR_STRINGTIE( ch_gtf ) - ch_gtf_for_stringtie = GTF_FOR_STRINGTIE.out.gtf - ch_versions = ch_versions.mix(GTF_FOR_STRINGTIE.out.versions) - } - // // Uncompress transcript fasta file / create if required // @@ -145,8 +140,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { - ch_filter_gtf = GTF_GENE_FILTER ( ch_fasta, ch_gtf ).gtf - ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filter_gtf ).transcript_fasta + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf_genome ).transcript_fasta ch_versions = ch_versions.mix(GTF_GENE_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -271,19 +265,19 @@ workflow PREPARE_GENOME { } emit: - fasta = ch_fasta // channel: path(genome.fasta) - gtf = ch_gtf // channel: path(genome.gtf) - fai = ch_fai // channel: path(genome.fai) - gene_bed = ch_gene_bed // channel: path(gene.bed) - gtf_for_stringtie= ch_gtf_for_stringtie // channel: path(gtf) - transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) - chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) - splicesites = ch_splicesites // channel: path(genome.splicesites.txt) - bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) - star_index = ch_star_index // channel: path(star/index/) - rsem_index = ch_rsem_index // channel: path(rsem/index/) - hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) - salmon_index = ch_salmon_index // channel: path(salmon/index/) + fasta = ch_fasta // channel: path(genome.fasta) + gtf = ch_gtf // channel: path(genome.gtf) + fai = ch_fai // channel: path(genome.fai) + gene_bed = ch_gene_bed // channel: path(gene.bed) + gtf_with_transcript_ids = ch_gtf_with_transcript_ids // channel: path(gtf) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) + splicesites = ch_splicesites // channel: path(genome.splicesites.txt) + bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) + star_index = ch_star_index // channel: path(star/index/) + rsem_index = ch_rsem_index // channel: path(rsem/index/) + hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) + salmon_index = ch_salmon_index // channel: path(salmon/index/) - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 4a13cb3ee..09de86260 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -648,7 +648,7 @@ workflow RNASEQ { if (!params.skip_alignment && !params.skip_stringtie) { STRINGTIE_STRINGTIE ( ch_genome_bam, - PREPARE_GENOME.out.gtf_for_stringtie + PREPARE_GENOME.out.gtf_with_transcript_ids ) ch_versions = ch_versions.mix(STRINGTIE_STRINGTIE.out.versions.first()) } @@ -798,7 +798,7 @@ workflow RNASEQ { ch_filtered_reads, PREPARE_GENOME.out.salmon_index, ch_dummy_file, - PREPARE_GENOME.out.gtf, + PREPARE_GENOME.out.gtf_with_transcript_ids, false, params.salmon_quant_libtype ?: '' ) From 7908ca3e01c6faa0ceaeba1a4773c55147dc3fd7 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 13:02:23 +0000 Subject: [PATCH 07/43] Reverse test profile change --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index f78e4e567..d0430eacb 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,7 +21,7 @@ params { // Input data - input = "/Users/jonathan.manning/projects/rnaseq/samplesheet_test.csv" + input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.10/samplesheet_test.csv" // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fasta" From f791e8af6a9d35acd06636bb15e29a2b5b1dca14 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 13:58:31 +0000 Subject: [PATCH 08/43] misc fixes --- ...filter_gtf_for_genes_in_genome.py => filter_gtf.py} | 0 modules/local/gtf_gene_filter/main.nf | 6 +++--- subworkflows/local/prepare_genome/main.nf | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) rename bin/{filter_gtf_for_genes_in_genome.py => filter_gtf.py} (100%) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf.py similarity index 100% rename from bin/filter_gtf_for_genes_in_genome.py rename to bin/filter_gtf.py diff --git a/modules/local/gtf_gene_filter/main.nf b/modules/local/gtf_gene_filter/main.nf index b4e0578d7..c12cf4116 100644 --- a/modules/local/gtf_gene_filter/main.nf +++ b/modules/local/gtf_gene_filter/main.nf @@ -1,4 +1,4 @@ -process GTF_GENE_FILTER { +process GTF_FILTER { tag "$fasta" conda "conda-forge::python=3.9.5" @@ -20,10 +20,10 @@ process GTF_GENE_FILTER { script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/ """ - filter_gtf_for_genes_in_genome.py \\ + filter_gtf.py \\ --gtf $gtf \\ --fasta $fasta \\ - -o ${fasta.baseName}_genes.gtf + --prefix ${fasta.baseName} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index cec93e86b..906590402 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -28,7 +28,7 @@ include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from '../../.. include { PREPROCESS_TRANSCRIPTS_FASTA_GENCODE } from '../../../modules/local/preprocess_transcripts_fasta_gencode' include { GTF2BED } from '../../../modules/local/gtf2bed' include { CAT_ADDITIONAL_FASTA } from '../../../modules/local/cat_additional_fasta' -include { GTF_GENE_FILTER } from '../../../modules/local/gtf_gene_filter' +include { GTF_FILTER } from '../../../modules/local/gtf_filter' include { STAR_GENOMEGENERATE_IGENOMES } from '../../../modules/local/star_genomegenerate_igenomes' workflow PREPARE_GENOME { @@ -89,9 +89,9 @@ workflow PREPARE_GENOME { // // Apply filtering we may need for GTFs // - GTF_GENE_FILTER ( ch_fasta, ch_gtf ) - ch_gtf_with_transcript_ids = GTF_GENE_FILTER.out.transcript_id_gtf - ch_gtf_genome = GTF_GENE_FILTER.out.genome_gtf + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf + ch_gtf_genome = GTF_FILTER.out.genome_gtf // // Uncompress additional fasta file and concatenate with reference fasta and gtf files @@ -141,7 +141,7 @@ workflow PREPARE_GENOME { } } else { ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf_genome ).transcript_fasta - ch_versions = ch_versions.mix(GTF_GENE_FILTER.out.versions) + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } From 19a72f6f168f004e5c6cd15fd5367a86ecfb0344 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 14:00:34 +0000 Subject: [PATCH 09/43] Blackify --- bin/filter_gtf.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index cb2002a02..055bc5197 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -9,13 +9,15 @@ logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) + def is_header(line: str) -> bool: """Returns True if the given line is a header line in a FASTA file.""" return line[0] == ">" + def extract_fasta_seq_names(fasta_name: str) -> set: """Extracts the sequence names from a FASTA file. - + modified from Brent Pedersen Correct Way To Parse A Fasta File In Python given a fasta file. yield tuples of header, sequence @@ -42,6 +44,7 @@ def extract_fasta_seq_names(fasta_name: str) -> set: headerStr = line[1:].strip().split()[0] yield headerStr + def extract_genes_in_genome(fasta: str, gtf_in: str, prefix: str) -> None: """Extracts the genes in the genome from a GTF file. @@ -50,7 +53,7 @@ def extract_genes_in_genome(fasta: str, gtf_in: str, prefix: str) -> None: gtf_in: The path to the input GTF file. prefix: Prefix for output GTF """ - gtf_out = prefix + '_in_genome.gtf' + gtf_out = prefix + "_in_genome.gtf" seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info("Extracted chromosome sequence names from : %s" % fasta) logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) @@ -74,6 +77,7 @@ def extract_genes_in_genome(fasta: str, gtf_in: str, prefix: str) -> None: logger.info("Wrote matching lines to %s" % gtf_out) + def remove_features_without_transcript_id(gtf_in, prefix): """ Removes gene rows with absent or empty transcript_id attributes from a GTF file. @@ -82,13 +86,14 @@ def remove_features_without_transcript_id(gtf_in, prefix): gtf_in: Path to the input GTF file. prefix: Path to the output GTF file. """ - gtf_out = prefix + '_with_transcript_ids.gtf' + gtf_out = prefix + "_with_transcript_ids.gtf" with open(gtf_in, "r") as f_in, open(gtf_out, "w") as f_out: for line in f_in: - transcript_id = line.split("\t")[8].split(" transcript_id ")[1].split(";")[0].replace('"', '') - if transcript_id and transcript_id.isalnum(): - f_out.write(line) + transcript_id = line.split("\t")[8].split(" transcript_id ")[1].split(";")[0].replace('"', "") + if transcript_id and transcript_id.isalnum(): + f_out.write(line) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""") From 2cfea9d97fe8cce5f0a7eb618f81be2e350021c5 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 14:03:59 +0000 Subject: [PATCH 10/43] move module to right path --- modules/local/{gtf_gene_filter => gtf_filter}/main.nf | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/local/{gtf_gene_filter => gtf_filter}/main.nf (100%) diff --git a/modules/local/gtf_gene_filter/main.nf b/modules/local/gtf_filter/main.nf similarity index 100% rename from modules/local/gtf_gene_filter/main.nf rename to modules/local/gtf_filter/main.nf From 9b4e1406391c53c9e1fd16e611b84f4aaf79d891 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 14:05:25 +0000 Subject: [PATCH 11/43] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e64510937..a850a5353 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability -- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Prepare a GTF for stringtie +- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required ### Software dependencies From ec6f80e76694b44056a312ff6baa87531d0d426e Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 14:26:18 +0000 Subject: [PATCH 12/43] Correct selector --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index eaa35f259..abaf220f9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -116,7 +116,7 @@ process { ] } - withName: 'GTF_GENE_FILTER' { + withName: 'GTF_FILTER' { publishDir = [ path: { "${params.outdir}/genome" }, mode: params.publish_dir_mode, From 25324ba729fd692da432368f43b7eeafc39d27b4 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 14:45:48 +0000 Subject: [PATCH 13/43] Make gtf filtering dependent on gtf --- subworkflows/local/prepare_genome/main.nf | 46 ++++++++++++----------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 906590402..d82f65b07 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -68,30 +68,32 @@ workflow PREPARE_GENOME { // // Uncompress GTF annotation file or create from GFF3 if required // - if (gtf) { - if (gtf.endsWith('.gz')) { - ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } else { - ch_gtf = Channel.value(file(gtf)) - } - } else if (gff) { - if (gff.endsWith('.gz')) { - ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } - ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) - } else { - ch_gff = Channel.value(file(gff)) + if (gtf || gff) { + if (gtf) { + if (gtf.endsWith('.gz')) { + ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } else { + ch_gtf = Channel.value(file(gtf)) + } + } else if (gff) { + if (gff.endsWith('.gz')) { + ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] } + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) + } else { + ch_gff = Channel.value(file(gff)) + } + ch_gtf = GFFREAD ( ch_gff ).gtf + ch_versions = ch_versions.mix(GFFREAD.out.versions) } - ch_gtf = GFFREAD ( ch_gff ).gtf - ch_versions = ch_versions.mix(GFFREAD.out.versions) - } - // - // Apply filtering we may need for GTFs - // - GTF_FILTER ( ch_fasta, ch_gtf ) - ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf - ch_gtf_genome = GTF_FILTER.out.genome_gtf + // + // Apply filtering we may need for GTFs + // + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf + ch_gtf_genome = GTF_FILTER.out.genome_gtf + } // // Uncompress additional fasta file and concatenate with reference fasta and gtf files From 26e9c1c28884e1aeea23039f1674e101bf2e71d4 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 3 Nov 2023 10:31:10 +0000 Subject: [PATCH 14/43] Fix multiqc to avoid imp module error --- modules/local/multiqc/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf index 44565a9f7..06fbb9b29 100644 --- a/modules/local/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda "bioconda::multiqc=1.15" + conda "bioconda::multiqc=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_config From 251e368543cb823cca23c0635fb46655bb5ca9c5 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 15:35:29 +0000 Subject: [PATCH 15/43] Fix transcript_id pattern --- bin/filter_gtf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 055bc5197..13b7360ff 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -3,6 +3,7 @@ import logging from itertools import groupby import argparse +import re # Create a logger logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") @@ -90,11 +91,10 @@ def remove_features_without_transcript_id(gtf_in, prefix): with open(gtf_in, "r") as f_in, open(gtf_out, "w") as f_out: for line in f_in: - transcript_id = line.split("\t")[8].split(" transcript_id ")[1].split(";")[0].replace('"', "") - if transcript_id and transcript_id.isalnum(): + transcript_id_match = re.search(r'transcript_id "([^"]+)"', line) + if transcript_id_match: f_out.write(line) - if __name__ == "__main__": parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""") parser.add_argument("--gtf", type=str, help="GTF file") From 6a3ea7ee8725b4c1c8dbaf6af31b385f73b733c1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 15:47:00 +0000 Subject: [PATCH 16/43] blackify --- bin/filter_gtf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 13b7360ff..3d4aa1fc9 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -95,6 +95,7 @@ def remove_features_without_transcript_id(gtf_in, prefix): if transcript_id_match: f_out.write(line) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""") parser.add_argument("--gtf", type=str, help="GTF file") From bb47808c550e3c0324200c1b341141fc2195a35d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 17:11:24 +0000 Subject: [PATCH 17/43] emit chromosome-filtered gtf from genome preparation --- subworkflows/local/prepare_genome/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index d82f65b07..337d65615 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -268,7 +268,7 @@ workflow PREPARE_GENOME { emit: fasta = ch_fasta // channel: path(genome.fasta) - gtf = ch_gtf // channel: path(genome.gtf) + gtf = ch_gtf_genome // channel: path(genome.gtf) fai = ch_fai // channel: path(genome.fai) gene_bed = ch_gene_bed // channel: path(gene.bed) gtf_with_transcript_ids = ch_gtf_with_transcript_ids // channel: path(gtf) From 047f7494a256b6b28aa6f98cfcd4dac18bb2ba93 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 17:31:16 +0000 Subject: [PATCH 18/43] [skip ci] fix changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e64510937..b38303f49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability -- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Prepare a GTF for stringtie +- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage ### Software dependencies From a8010136cb064dd43269f8af8afada30db4ebff0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 9 Nov 2023 18:04:29 +0000 Subject: [PATCH 19/43] Thank myself to poke CI --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b38303f49..66afef72f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Special thanks to the following for their contributions to the release: - [Júlia Mir Pedrol](https://github.com/mirpedrol) - [Matthias Zepper](https://github.com/MatthiasZepper) - [Maxime Garcia](https://github.com/maxulysse) +- [Jonathan Manning](https://github.com/pinin4fjords) Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. From 16430c2f27c2f596930136181e492de1a51b80c4 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 10 Nov 2023 09:40:57 +0000 Subject: [PATCH 20/43] filter_gtf improvements --- bin/filter_gtf.py | 64 ++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 3d4aa1fc9..0f87dc615 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -45,49 +45,51 @@ def extract_fasta_seq_names(fasta_name: str) -> set: headerStr = line[1:].strip().split()[0] yield headerStr - -def extract_genes_in_genome(fasta: str, gtf_in: str, prefix: str) -> None: +def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: """Extracts the genes in the genome from a GTF file. Args: fasta: The path to the FASTA file. gtf_in: The path to the input GTF file. - prefix: Prefix for output GTF - """ - gtf_out = prefix + "_in_genome.gtf" - seq_names_in_genome = set(extract_fasta_seq_names(fasta)) - logger.info("Extracted chromosome sequence names from : %s" % fasta) - logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) - seq_names_in_gtf = set([]) - - n_total_lines = 0 - n_lines_in_genome = 0 - with open(gtf_out, "w") as f: - with open(gtf_in) as g: - for line in g.readlines(): - n_total_lines += 1 - seq_name_gtf = line.split("\t")[0] - seq_names_in_gtf.add(seq_name_gtf) - if seq_name_gtf in seq_names_in_genome: - n_lines_in_genome += 1 - f.write(line) - logger.info( - "Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta) - ) - logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf))) + gtf_out: The path to the output GTF file. - logger.info("Wrote matching lines to %s" % gtf_out) + Raises: + ValueError: If no overlap is found or if the GTF file is not tab delimited. + """ + def is_tab_delimited(file): + with open(file, 'r') as f: + return '\t' in f.readline() + if not is_tab_delimited(gtf_in): + raise ValueError("The GTF file is not tab delimited.") -def remove_features_without_transcript_id(gtf_in, prefix): + seq_names_in_genome = set(extract_fasta_seq_names(fasta)) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.info("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) + + with open(gtf_in) as gtf, open(gtf_out, "w") as out: + seq_names_in_gtf = {line.split("\t")[0] for line in gtf if line.strip()} + overlap = seq_names_in_genome & seq_names_in_gtf + if not overlap: + raise ValueError("No overlapping scaffolds found.") + + gtf.seek(0) # Reset file pointer to the start of the file + for line in gtf: + if line.split("\t")[0] in overlap: + out.write(line) + + logger.info(f"Extracted {len(overlap)} matching sequences from {gtf_in} into {gtf_out}") + logger.info("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info(f"Wrote matching lines to {gtf_out}") + +def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None: """ Removes gene rows with absent or empty transcript_id attributes from a GTF file. Args: gtf_in: Path to the input GTF file. - prefix: Path to the output GTF file. + gtf_out: The path to the output GTF file. """ - gtf_out = prefix + "_with_transcript_ids.gtf" with open(gtf_in, "r") as f_in, open(gtf_out, "w") as f_out: for line in f_in: @@ -110,5 +112,5 @@ def remove_features_without_transcript_id(gtf_in, prefix): ) args = parser.parse_args() - extract_genes_in_genome(args.fasta, args.gtf, args.prefix) - remove_features_without_transcript_id(args.gtf, args.prefix) + extract_genes_in_genome(args.fasta, args.gtf, args.prefix + "_in_genome.gtf" ) + remove_features_without_transcript_id(args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") From 65ac5ef9f4a3333f8ab788598dab299d63c39c25 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 10 Nov 2023 09:46:56 +0000 Subject: [PATCH 21/43] Blackify --- bin/filter_gtf.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 0f87dc615..cb7a85dd1 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -45,6 +45,7 @@ def extract_fasta_seq_names(fasta_name: str) -> set: headerStr = line[1:].strip().split()[0] yield headerStr + def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: """Extracts the genes in the genome from a GTF file. @@ -56,9 +57,10 @@ def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: Raises: ValueError: If no overlap is found or if the GTF file is not tab delimited. """ + def is_tab_delimited(file): - with open(file, 'r') as f: - return '\t' in f.readline() + with open(file, "r") as f: + return "\t" in f.readline() if not is_tab_delimited(gtf_in): raise ValueError("The GTF file is not tab delimited.") @@ -66,7 +68,7 @@ def is_tab_delimited(file): seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info(f"Extracted chromosome sequence names from {fasta}") logger.info("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) - + with open(gtf_in) as gtf, open(gtf_out, "w") as out: seq_names_in_gtf = {line.split("\t")[0] for line in gtf if line.strip()} overlap = seq_names_in_genome & seq_names_in_gtf @@ -82,6 +84,7 @@ def is_tab_delimited(file): logger.info("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) logger.info(f"Wrote matching lines to {gtf_out}") + def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None: """ Removes gene rows with absent or empty transcript_id attributes from a GTF file. @@ -112,5 +115,5 @@ def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None: ) args = parser.parse_args() - extract_genes_in_genome(args.fasta, args.gtf, args.prefix + "_in_genome.gtf" ) - remove_features_without_transcript_id(args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") + extract_genes_in_genome(args.fasta, args.gtf, args.prefix + "_in_genome.gtf") + remove_features_without_transcript_id(args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") From c4823eaae012c9f67681ca98f622e53da87a33d0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 10 Nov 2023 10:01:38 +0000 Subject: [PATCH 22/43] [skip ci] Add resolved issues to changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66afef72f..e370c60cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,9 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability - [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage +- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for filter_gtf_for_genes_in_genome.py +- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields +- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom Ensembl genome ### Software dependencies From ecb3a1480136af13a4d3a64af7a0156ef619aa83 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 10 Nov 2023 10:08:45 +0000 Subject: [PATCH 23/43] reinstate publish modes --- conf/modules.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 0c6bc646d..aabe7bfe7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -155,6 +155,7 @@ process { ext.args = '--record-count 1000000 --seed 1' ext.prefix = { "${meta.id}.subsampled" } publishDir = [ + mode: params.publish_dir_mode, enabled: false ] } @@ -162,6 +163,7 @@ process { withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' { ext.args = '--skipQuant' publishDir = [ + mode: params.publish_dir_mode, enabled: false ] } From 1c23a4a0991e7f71faba1c64cde39956594f255f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 09:53:52 +0000 Subject: [PATCH 24/43] [skip ci] Reduce logging in GTF filtering --- bin/filter_gtf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index cb7a85dd1..10cb646b9 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -10,7 +10,6 @@ logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) - def is_header(line: str) -> bool: """Returns True if the given line is a header line in a FASTA file.""" return line[0] == ">" @@ -67,7 +66,7 @@ def is_tab_delimited(file): seq_names_in_genome = set(extract_fasta_seq_names(fasta)) logger.info(f"Extracted chromosome sequence names from {fasta}") - logger.info("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) + logger.debug("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) with open(gtf_in) as gtf, open(gtf_out, "w") as out: seq_names_in_gtf = {line.split("\t")[0] for line in gtf if line.strip()} @@ -81,7 +80,7 @@ def is_tab_delimited(file): out.write(line) logger.info(f"Extracted {len(overlap)} matching sequences from {gtf_in} into {gtf_out}") - logger.info("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) logger.info(f"Wrote matching lines to {gtf_out}") From 0a70d58289bf4031355844b51b1e6f336ae0135f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 09:56:39 +0000 Subject: [PATCH 25/43] [skip ci] Apply suggestions from code review Co-authored-by: Matthias Zepper <6963520+MatthiasZepper@users.noreply.github.com> --- bin/filter_gtf.py | 37 +++++++++++++++++------------------ modules/local/multiqc/main.nf | 4 ++-- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 10cb646b9..cafc14c39 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -29,20 +29,12 @@ def extract_fasta_seq_names(fasta_name: str) -> set: Returns: A set of the sequence names in the FASTA file. """ - - # first open the file outside - fh = open(fasta_name) - - # ditch the boolean (x[0]) and just keep the header or sequence since - # we know they alternate. - faiter = (x[1] for x in groupby(fh, is_header)) - - for i, header in enumerate(faiter): - line = next(header) - if is_header(line): - # drop the ">" - headerStr = line[1:].strip().split()[0] - yield headerStr + seqnames = set() + with open(fasta_name) as fasta: + for line in fasta: + if line[0] == ">": + seqnames.add(line[1:].split(None, 1)[0]) + return seqnames def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: @@ -57,14 +49,21 @@ def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: ValueError: If no overlap is found or if the GTF file is not tab delimited. """ - def is_tab_delimited(file): + def tab_delimited(file) -> float: + + import statistics. # put to the top + with open(file, "r") as f: - return "\t" in f.readline() + data = f.read(1024) + lines = data.split("\n") + # most lines should have 9 tab-separated columns + return statistics.median([line.count("\t") for line in lines]) - if not is_tab_delimited(gtf_in): - raise ValueError("The GTF file is not tab delimited.") + num_sep = tab_delimited(gtf_in) + if num_sep != 8: + raise ValueError("No valid tab-delimited GTF file.") - seq_names_in_genome = set(extract_fasta_seq_names(fasta)) + seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") logger.debug("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf index 06fbb9b29..6ab3a38d0 100644 --- a/modules/local/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -3,8 +3,8 @@ process MULTIQC { conda "bioconda::multiqc=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_1' : + 'biocontainers/multiqc:1.17--pyhdfd78af_1' }" input: path multiqc_config From 996abcc5c6ed9a23079acefc91c957035c072dea Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 10:04:02 +0000 Subject: [PATCH 26/43] [skip ci] post-suggestion integration work --- bin/filter_gtf.py | 51 +++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index cafc14c39..11cfbdc30 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -37,13 +37,15 @@ def extract_fasta_seq_names(fasta_name: str) -> set: return seqnames -def extract_genes_in_genome(fasta: str, gtf_in: str, gtf_out: str) -> None: +def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_out: str) -> None: """Extracts the genes in the genome from a GTF file. Args: fasta: The path to the FASTA file. gtf_in: The path to the input GTF file. - gtf_out: The path to the output GTF file. + gtf_in_genome_out: The path to the output GTF file with scaffolds filtered. + gtf_transcript_out: The path to the output GTF file with scaffolds + filtered and rows without transcript ID removed. Raises: ValueError: If no overlap is found or if the GTF file is not tab delimited. @@ -67,36 +69,24 @@ def tab_delimited(file) -> float: logger.info(f"Extracted chromosome sequence names from {fasta}") logger.debug("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) - with open(gtf_in) as gtf, open(gtf_out, "w") as out: - seq_names_in_gtf = {line.split("\t")[0] for line in gtf if line.strip()} - overlap = seq_names_in_genome & seq_names_in_gtf - if not overlap: - raise ValueError("No overlapping scaffolds found.") - - gtf.seek(0) # Reset file pointer to the start of the file + with open(gtf_in) as gtf, open(gtf_in_genome_out, "w") as out, open( + gtf_transcript_out, "w" + ) as out2: + line_count_all = 0 + line_count_transcript = 0 for line in gtf: - if line.split("\t")[0] in overlap: + if ( + line.split("\t")[0] in seq_names_in_genome + and line.count("\t") == num_sep + ): out.write(line) + line_count_all += 1 + if re.search(r'transcript_id "([^"]+)"', line): + out2.write(line) + line_count_transcript += 1 + if line_count_all == 0: + raise ValueError("No overlapping scaffolds found.") - logger.info(f"Extracted {len(overlap)} matching sequences from {gtf_in} into {gtf_out}") - logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) - logger.info(f"Wrote matching lines to {gtf_out}") - - -def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None: - """ - Removes gene rows with absent or empty transcript_id attributes from a GTF file. - - Args: - gtf_in: Path to the input GTF file. - gtf_out: The path to the output GTF file. - """ - - with open(gtf_in, "r") as f_in, open(gtf_out, "w") as f_out: - for line in f_in: - transcript_id_match = re.search(r'transcript_id "([^"]+)"', line) - if transcript_id_match: - f_out.write(line) if __name__ == "__main__": @@ -113,5 +103,4 @@ def remove_features_without_transcript_id(gtf_in: str, gtf_out: str) -> None: ) args = parser.parse_args() - extract_genes_in_genome(args.fasta, args.gtf, args.prefix + "_in_genome.gtf") - remove_features_without_transcript_id(args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") + filter_gtf(args.fasta, args.gtf, args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") From 01e05f74a96d86d9ea4fcc8c8ee9655420ada095 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 10:11:43 +0000 Subject: [PATCH 27/43] [skip ci] more post-review tidy-up to GTF filtering script --- bin/filter_gtf.py | 117 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 84 deletions(-) mode change 100755 => 100644 bin/filter_gtf.py diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py old mode 100755 new mode 100644 index 11cfbdc30..8a6836e4e --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -1,106 +1,55 @@ #!/usr/bin/env python -from __future__ import print_function import logging -from itertools import groupby import argparse import re +import statistics +from typing import Set # Create a logger logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") -logger = logging.getLogger(__file__) +logger = logging.getLogger("fasta_gtf_filter") logger.setLevel(logging.INFO) -def is_header(line: str) -> bool: - """Returns True if the given line is a header line in a FASTA file.""" - return line[0] == ">" - - -def extract_fasta_seq_names(fasta_name: str) -> set: - """Extracts the sequence names from a FASTA file. - - modified from Brent Pedersen - Correct Way To Parse A Fasta File In Python - given a fasta file. yield tuples of header, sequence - from https://www.biostars.org/p/710/ - - Args: - fasta_name: The path to the FASTA file. - - Returns: - A set of the sequence names in the FASTA file. - """ - seqnames = set() +def extract_fasta_seq_names(fasta_name: str) -> Set[str]: + """Extracts the sequence names from a FASTA file.""" with open(fasta_name) as fasta: - for line in fasta: - if line[0] == ">": - seqnames.add(line[1:].split(None, 1)[0]) - return seqnames + return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")} +def tab_delimited(file: str) -> float: + """Check if file is tab-delimited and return median number of tabs.""" + with open(file, "r") as f: + data = f.read(1024) + return statistics.median(line.count("\t") for line in data.split("\n")) def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_out: str) -> None: - """Extracts the genes in the genome from a GTF file. - - Args: - fasta: The path to the FASTA file. - gtf_in: The path to the input GTF file. - gtf_in_genome_out: The path to the output GTF file with scaffolds filtered. - gtf_transcript_out: The path to the output GTF file with scaffolds - filtered and rows without transcript ID removed. - - Raises: - ValueError: If no overlap is found or if the GTF file is not tab delimited. - """ - - def tab_delimited(file) -> float: - - import statistics. # put to the top - - with open(file, "r") as f: - data = f.read(1024) - lines = data.split("\n") - # most lines should have 9 tab-separated columns - return statistics.median([line.count("\t") for line in lines]) - - num_sep = tab_delimited(gtf_in) - if num_sep != 8: - raise ValueError("No valid tab-delimited GTF file.") + """Filter GTF file based on FASTA sequence names.""" + if tab_delimited(gtf_in) != 8: + raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") - logger.debug("All chromosome names: " + ", ".join(sorted(seq_names_in_genome))) - - with open(gtf_in) as gtf, open(gtf_in_genome_out, "w") as out, open( - gtf_transcript_out, "w" - ) as out2: - line_count_all = 0 - line_count_transcript = 0 - for line in gtf: - if ( - line.split("\t")[0] in seq_names_in_genome - and line.count("\t") == num_sep - ): - out.write(line) - line_count_all += 1 - if re.search(r'transcript_id "([^"]+)"', line): - out2.write(line) - line_count_transcript += 1 - if line_count_all == 0: - raise ValueError("No overlapping scaffolds found.") - + try: + with open(gtf_in) as gtf, open(gtf_in_genome_out, "w") as out, open(gtf_transcript_out, "w") as out2: + line_count_all, line_count_transcript = 0, 0 + for line in gtf: + if line.split("\t")[0] in seq_names_in_genome: + out.write(line) + line_count_all += 1 + if re.search(r'transcript_id "([^"]+)"', line): + out2.write(line) + line_count_transcript += 1 + if line_count_all == 0: + raise ValueError("No overlapping scaffolds found.") + except IOError as e: + logger.error(f"File operation failed: {e}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Filter GTF for various reasons""") - parser.add_argument("--gtf", type=str, help="GTF file") - parser.add_argument("--fasta", type=str, help="Genome fasta file") - parser.add_argument( - "-p", - "--prefix", - dest="prefix", - default="genes", - type=str, - help="Prefix for output GTF files", - ) + parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") + parser.add_argument("--gtf", type=str, required=True, help="GTF file") + parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") + parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") args = parser.parse_args() filter_gtf(args.fasta, args.gtf, args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") + From 556af914e30289533b0d851d2b49b9ccaf2c90fb Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 10:52:39 +0000 Subject: [PATCH 28/43] [skip ci] reinstate logging --- bin/filter_gtf.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) mode change 100644 => 100755 bin/filter_gtf.py diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py old mode 100644 new mode 100755 index 8a6836e4e..18bbcb2d4 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -29,11 +29,15 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") + seq_names_in_gtf = set() try: with open(gtf_in) as gtf, open(gtf_in_genome_out, "w") as out, open(gtf_transcript_out, "w") as out2: line_count_all, line_count_transcript = 0, 0 for line in gtf: - if line.split("\t")[0] in seq_names_in_genome: + seq_name = line.split("\t")[0] + seq_names_in_gtf.add(seq_name) # Add sequence name to the set + + if seq_name in seq_names_in_genome: out.write(line) line_count_all += 1 if re.search(r'transcript_id "([^"]+)"', line): @@ -43,6 +47,11 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o raise ValueError("No overlapping scaffolds found.") except IOError as e: logger.error(f"File operation failed: {e}") + return + + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info(f"Extracted {line_count_all} matching sequences from {gtf_in} into {gtf_in_genome_out}") + logger.info(f"Wrote {line_count_transcript} lines with transcript IDs to {gtf_transcript_out}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") From ad998c4be6b73c97b4b2280132cac514c9e15215 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 10:59:27 +0000 Subject: [PATCH 29/43] [skip ci] reinstate logging --- bin/filter_gtf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 18bbcb2d4..f815a8d52 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -28,6 +28,7 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome))) seq_names_in_gtf = set() try: From 80a95430aa2ebe617dc3a49b91a2eba4f8b48498 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 11:09:20 +0000 Subject: [PATCH 30/43] [skip ci] Fix salmon subworkflow call for gtf --- workflows/rnaseq.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 09de86260..0912c661c 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -473,7 +473,7 @@ workflow RNASEQ { ch_transcriptome_bam, ch_dummy_file, PREPARE_GENOME.out.transcript_fasta, - PREPARE_GENOME.out.gtf, + PREPARE_GENOME.out.gtf_with_transcript_ids, true, params.salmon_quant_libtype ?: '' ) From 519e42adb361239232d36aac91daaec12a08bf87 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 11:18:47 +0000 Subject: [PATCH 31/43] [skip ci] also raise value for no transcript ids --- bin/filter_gtf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index f815a8d52..316813fa0 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -46,6 +46,9 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o line_count_transcript += 1 if line_count_all == 0: raise ValueError("No overlapping scaffolds found.") + if line_count_transcript == 0: + raise ValueError("No transcript_id values found in the GTF file.") + except IOError as e: logger.error(f"File operation failed: {e}") return From 5e1378e23cafb7bd79b9fbb09ee68cf75f6eb4de Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 14 Nov 2023 12:16:24 +0000 Subject: [PATCH 32/43] [skip ci] @drpatelh feedback: Try to avoid filtering process where not required --- bin/filter_gtf.py | 15 ++++++++---- conf/modules.config | 1 + nextflow.config | 2 ++ nextflow_schema.json | 11 +++++++++ subworkflows/local/prepare_genome/main.nf | 30 ++++++++++++++++++++--- 5 files changed, 51 insertions(+), 8 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 316813fa0..345d4a68e 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -21,7 +21,7 @@ def tab_delimited(file: str) -> float: data = f.read(1024) return statistics.median(line.count("\t") for line in data.split("\n")) -def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_out: str) -> None: +def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") @@ -41,12 +41,16 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o if seq_name in seq_names_in_genome: out.write(line) line_count_all += 1 - if re.search(r'transcript_id "([^"]+)"', line): - out2.write(line) + + if not skip_transcript_id_check and re.search(r'transcript_id "([^"]+)"', line): + with open(gtf_transcript_out, "a") as out2: + out2.write(line) line_count_transcript += 1 + if line_count_all == 0: raise ValueError("No overlapping scaffolds found.") - if line_count_transcript == 0: + + if not skip_transcript_id_check and line_count_transcript == 0: raise ValueError("No transcript_id values found in the GTF file.") except IOError as e: @@ -62,7 +66,8 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o parser.add_argument("--gtf", type=str, required=True, help="GTF file") parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") + parser.add_argument("--skip_transcript_id_check", action='store_true', help="Skip checking for transcript IDs in the GTF file") args = parser.parse_args() - filter_gtf(args.fasta, args.gtf, args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf") + filter_gtf(args.fasta, args.gtf, args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf", args.skip_transcript_id_check) diff --git a/conf/modules.config b/conf/modules.config index aabe7bfe7..683ccb1db 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -109,6 +109,7 @@ process { } withName: 'GTF_FILTER' { + ext.args = { params.skip_gtf_transcript_filter ?: '--skip_transcript_id_check' } publishDir = [ path: { "${params.outdir}/genome" }, mode: params.publish_dir_mode, diff --git a/nextflow.config b/nextflow.config index f786d1daa..ce4188962 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,8 @@ params { splicesites = null gtf_extra_attributes = 'gene_name' gtf_group_features = 'gene_id' + skip_gtf_filter = false + skip_gtf_transcript_filter = false featurecounts_feature_type = 'exon' featurecounts_group_type = 'gene_biotype' gencode = false diff --git a/nextflow_schema.json b/nextflow_schema.json index a22baecef..412e3b398 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -85,6 +85,17 @@ "description": "Path to GFF3 annotation file.", "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified." }, + "skip_gtf_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", + "help_text": "If you're confident on the validity of the GTF with respect to the genome file, or wish to disregard failures thriggered by the filtering module, activate this option." + }, + "skip_gtf_transcript_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip just the transcript_id check of GTF filtering." + }, "gene_bed": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 337d65615..e743c1dab 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -90,9 +90,33 @@ workflow PREPARE_GENOME { // // Apply filtering we may need for GTFs // - GTF_FILTER ( ch_fasta, ch_gtf ) - ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf - ch_gtf_genome = GTF_FILTER.out.genome_gtf + + filtering_useful = + ( + // Condition 1: Alignment is required and aligner is set to 'star_salmon' + !params.skip_alignment && params.aligner == 'star_salmon' + ) || + ( + // Condition 2: Pseudo-alignment is required and pseudo-aligner is set to 'salmon' + !params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon' + ) || + ( + // Condition 3: Neither alignment nor stringtie are to be skipped + !params.skip_alignment && !params.skip_stringtie + ) || + ( + // Condition 4: Transcript FASTA file is not provided + !transcript_fasta + ) + + if (params.skip_gtf_filter || ! filtering_useful){ + ch_gtf_with_transcript_ids = ch_gtf + ch_gtf_genome = ch_gtf + } else { + GTF_FILTER ( ch_fasta, ch_gtf ) + ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf + ch_gtf_genome = GTF_FILTER.out.genome_gtf + } } // From fb70fdf3d9191d4f80899a3e7927d3959288deb9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 15 Nov 2023 10:30:15 +0000 Subject: [PATCH 33/43] Simplify GTF filtering --- bin/filter_gtf.py | 28 ++++++++--------------- modules/local/gtf_filter/main.nf | 3 +-- subworkflows/local/prepare_genome/main.nf | 16 ++++++------- workflows/rnaseq.nf | 8 +++---- 4 files changed, 22 insertions(+), 33 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 345d4a68e..dd8e1866e 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -21,7 +21,7 @@ def tab_delimited(file: str) -> float: data = f.read(1024) return statistics.median(line.count("\t") for line in data.split("\n")) -def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_out: str, skip_transcript_id_check: bool) -> None: +def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") @@ -32,34 +32,26 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o seq_names_in_gtf = set() try: - with open(gtf_in) as gtf, open(gtf_in_genome_out, "w") as out, open(gtf_transcript_out, "w") as out2: - line_count_all, line_count_transcript = 0, 0 + with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out: + line_count = 0 for line in gtf: seq_name = line.split("\t")[0] seq_names_in_gtf.add(seq_name) # Add sequence name to the set if seq_name in seq_names_in_genome: - out.write(line) - line_count_all += 1 + if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line): + out.write(line) + line_count += 1 - if not skip_transcript_id_check and re.search(r'transcript_id "([^"]+)"', line): - with open(gtf_transcript_out, "a") as out2: - out2.write(line) - line_count_transcript += 1 - - if line_count_all == 0: - raise ValueError("No overlapping scaffolds found.") - - if not skip_transcript_id_check and line_count_transcript == 0: - raise ValueError("No transcript_id values found in the GTF file.") + if line_count == 0: + raise ValueError("All GTF lines removed by filters") except IOError as e: logger.error(f"File operation failed: {e}") return logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) - logger.info(f"Extracted {line_count_all} matching sequences from {gtf_in} into {gtf_in_genome_out}") - logger.info(f"Wrote {line_count_transcript} lines with transcript IDs to {gtf_transcript_out}") + logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") @@ -69,5 +61,5 @@ def filter_gtf(fasta: str, gtf_in: str, gtf_in_genome_out: str, gtf_transcript_o parser.add_argument("--skip_transcript_id_check", action='store_true', help="Skip checking for transcript IDs in the GTF file") args = parser.parse_args() - filter_gtf(args.fasta, args.gtf, args.prefix + "_in_genome.gtf", args.prefix + "_with_transcript_ids.gtf", args.skip_transcript_id_check) + filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check) diff --git a/modules/local/gtf_filter/main.nf b/modules/local/gtf_filter/main.nf index c12cf4116..cf3536f95 100644 --- a/modules/local/gtf_filter/main.nf +++ b/modules/local/gtf_filter/main.nf @@ -11,8 +11,7 @@ process GTF_FILTER { path gtf output: - path "*_in_genome.gtf" , emit: genome_gtf - path "*_with_transcript_ids.gtf" , emit: transcript_id_gtf + path "*.filtered.gtf" , emit: genome_gtf path "versions.yml" , emit: versions when: diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index e743c1dab..d639adf6c 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -90,6 +90,8 @@ workflow PREPARE_GENOME { // // Apply filtering we may need for GTFs // + + ch_filtered_gtf = ch_gtf filtering_useful = ( @@ -109,13 +111,9 @@ workflow PREPARE_GENOME { !transcript_fasta ) - if (params.skip_gtf_filter || ! filtering_useful){ - ch_gtf_with_transcript_ids = ch_gtf - ch_gtf_genome = ch_gtf - } else { + if (filtering_useful){ GTF_FILTER ( ch_fasta, ch_gtf ) - ch_gtf_with_transcript_ids = GTF_FILTER.out.transcript_id_gtf - ch_gtf_genome = GTF_FILTER.out.genome_gtf + ch_filtered_gtf = GTF_FILTER.out.genome_gtf } } @@ -166,7 +164,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { - ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf_genome ).transcript_fasta + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filtered_gtf ).transcript_fasta ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -292,10 +290,10 @@ workflow PREPARE_GENOME { emit: fasta = ch_fasta // channel: path(genome.fasta) - gtf = ch_gtf_genome // channel: path(genome.gtf) + gtf = ch_gtf // channel: path(genome.gtf) + filtered_gtf = ch_filtered_gtf // channel: path(genome.gtf) fai = ch_fai // channel: path(genome.fai) gene_bed = ch_gene_bed // channel: path(gene.bed) - gtf_with_transcript_ids = ch_gtf_with_transcript_ids // channel: path(gtf) transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) splicesites = ch_splicesites // channel: path(genome.splicesites.txt) diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 0912c661c..529d8e7c5 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -383,7 +383,7 @@ workflow RNASEQ { ALIGN_STAR ( ch_filtered_reads, PREPARE_GENOME.out.star_index.map { [ [:], it ] }, - PREPARE_GENOME.out.gtf.map { [ [:], it ] }, + PREPARE_GENOME.out.filtered_gtf.map { [ [:], it ] }, params.star_ignore_sjdbgtf, '', params.seq_center ?: '', @@ -473,7 +473,7 @@ workflow RNASEQ { ch_transcriptome_bam, ch_dummy_file, PREPARE_GENOME.out.transcript_fasta, - PREPARE_GENOME.out.gtf_with_transcript_ids, + PREPARE_GENOME.out.filtered_gtf, true, params.salmon_quant_libtype ?: '' ) @@ -648,7 +648,7 @@ workflow RNASEQ { if (!params.skip_alignment && !params.skip_stringtie) { STRINGTIE_STRINGTIE ( ch_genome_bam, - PREPARE_GENOME.out.gtf_with_transcript_ids + PREPARE_GENOME.out.filtered_gtf ) ch_versions = ch_versions.mix(STRINGTIE_STRINGTIE.out.versions.first()) } @@ -798,7 +798,7 @@ workflow RNASEQ { ch_filtered_reads, PREPARE_GENOME.out.salmon_index, ch_dummy_file, - PREPARE_GENOME.out.gtf_with_transcript_ids, + PREPARE_GENOME.out.filtered_gtf, false, params.salmon_quant_libtype ?: '' ) From 5c213c5d4f675283297fcfb44a41680ae77f0bf9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 15 Nov 2023 10:33:02 +0000 Subject: [PATCH 34/43] Blackify --- bin/filter_gtf.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index dd8e1866e..3d09bd859 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -10,18 +10,21 @@ logger = logging.getLogger("fasta_gtf_filter") logger.setLevel(logging.INFO) + def extract_fasta_seq_names(fasta_name: str) -> Set[str]: """Extracts the sequence names from a FASTA file.""" with open(fasta_name) as fasta: return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")} + def tab_delimited(file: str) -> float: """Check if file is tab-delimited and return median number of tabs.""" with open(file, "r") as f: data = f.read(1024) return statistics.median(line.count("\t") for line in data.split("\n")) -def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: + +def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") @@ -53,13 +56,15 @@ def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_ logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.") parser.add_argument("--gtf", type=str, required=True, help="GTF file") parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file") parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files") - parser.add_argument("--skip_transcript_id_check", action='store_true', help="Skip checking for transcript IDs in the GTF file") + parser.add_argument( + "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file" + ) args = parser.parse_args() filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check) - From 6d55142efacc07ee3ac173045e766d971c8e713f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 15 Nov 2023 10:54:57 +0000 Subject: [PATCH 35/43] Document GTF filtering --- docs/usage.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 9748aa5de..3df13b073 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -91,9 +91,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{6})(?P.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | @@ -196,6 +196,10 @@ Notes: - As of v3.7 of the pipeline, if you are using a genome downloaded from AWS iGenomes and using `--aligner star_salmon` (default) the version of STAR to use for the alignment will be auto-detected (see [#808](https://github.com/nf-core/rnaseq/issues/808)). +### GTF filtering + +By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` flag. The transcript identifer filter can be disabled specifically using `skip_gtf_transcript_filter`. + ## Running the pipeline The typical command for running the pipeline is as follows: From 6e1454b363e1d9fb599d30047cb97a80e6782873 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 15 Nov 2023 10:56:28 +0000 Subject: [PATCH 36/43] [automated] Fix linting with Prettier --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 3df13b073..82319c466 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -91,9 +91,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{6})(?P.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | From aff1d2e02717247831644769fc3ba84868c3fdde Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 15 Nov 2023 13:06:09 +0000 Subject: [PATCH 37/43] Bump modules config for merged module --- modules.json | 4 ++-- modules/nf-core/kallisto/quant/main.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index d06dfe193..84d48d87f 100644 --- a/modules.json +++ b/modules.json @@ -71,8 +71,8 @@ "installed_by": ["modules"] }, "kallisto/quant": { - "branch": "kallisto_updates", - "git_sha": "bc4719dcd079fcdb650ddeac05739c2f7dd58c84", + "branch": "master", + "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad", "installed_by": ["modules"] }, "picard/markduplicates": { diff --git a/modules/nf-core/kallisto/quant/main.nf b/modules/nf-core/kallisto/quant/main.nf index 2ad0bb78b..f5444d791 100644 --- a/modules/nf-core/kallisto/quant/main.nf +++ b/modules/nf-core/kallisto/quant/main.nf @@ -59,7 +59,7 @@ process KALLISTO_QUANT { -o $prefix \\ ${reads} 2> >(tee -a ${prefix}/kallisto_quant.log >&2) - cp ${prefix}/kallisto_quant.log ${prefix}.log + cp ${prefix}/kallisto_quant.log ${prefix}.log cp ${prefix}/run_info.json ${prefix}.run_info.json cat <<-END_VERSIONS > versions.yml From f7d078cc159e3ae0a2b722c465ae0a20fcfe66e6 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Nov 2023 14:19:01 +0000 Subject: [PATCH 38/43] Bump pipeline version to 3.13.0 --- CHANGELOG.md | 13 ++++++------- assets/multiqc_config.yml | 4 ++-- nextflow.config | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 357dd24db..80c51ea61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v3.13.0dev - [date] +## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17 ### Credits @@ -14,7 +14,6 @@ Special thanks to the following for their contributions to the release: - [Júlia Mir Pedrol](https://github.com/mirpedrol) - [Matthias Zepper](https://github.com/MatthiasZepper) - [Maxime Garcia](https://github.com/maxulysse) -- [Jonathan Manning](https://github.com/pinin4fjords) Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. @@ -30,13 +29,13 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1083](https://github.com/nf-core/rnaseq/pull/1083) - Move local modules and subworkflows to subfolders - [PR #1088](https://github.com/nf-core/rnaseq/pull/1088) - Updates contributing and code of conduct documents with nf-core template 2.10 - [PR #1091](https://github.com/nf-core/rnaseq/pull/1091) - Reorganise parameters in schema for better usability -- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage -- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for filter_gtf_for_genes_in_genome.py -- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields -- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom Ensembl genome - [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification -- [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - MultiQC [version bump](https://github.com/nf-core/rnaseq/pull/1106/commits/aebad067a10a45510a2b421da852cb436ae65fd8) +- [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage - [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting +- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom +- [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py` +- [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields +Ensembl genome ### Software dependencies diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 100536f4e..4085d95e3 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/rnaseq + This report has been generated by the nf-core/rnaseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-rnaseq-methods-description": order: -1000 diff --git a/nextflow.config b/nextflow.config index 7837c0a29..ab539161f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -317,7 +317,7 @@ manifest { description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control.""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '3.13.0dev' + version = '3.13.0' doi = 'https://doi.org/10.5281/zenodo.1400710' } From 1c6012ecbb087014ea4b8f0f3d39b874850277a8 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Nov 2023 14:38:26 +0000 Subject: [PATCH 39/43] [skip ci] Changes from local review --- conf/modules.config | 2 -- docs/usage.md | 2 +- modules/local/gtf_filter/main.nf | 6 ++--- nextflow_schema.json | 4 +-- subworkflows/local/prepare_genome/main.nf | 30 +++++++++++------------ 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5b4742df0..6e28f549b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -165,7 +165,6 @@ process { ext.args = '--record-count 1000000 --seed 1' ext.prefix = { "${meta.id}.subsampled" } publishDir = [ - mode: params.publish_dir_mode, enabled: false ] } @@ -173,7 +172,6 @@ process { withName: '.*:FASTQ_SUBSAMPLE_FQ_SALMON:SALMON_QUANT' { ext.args = '--skipQuant' publishDir = [ - mode: params.publish_dir_mode, enabled: false ] } diff --git a/docs/usage.md b/docs/usage.md index dd4cb12b4..524acba9b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -200,7 +200,7 @@ Notes: ### GTF filtering -By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` flag. The transcript identifer filter can be disabled specifically using `skip_gtf_transcript_filter`. +By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter. ## Running the pipeline diff --git a/modules/local/gtf_filter/main.nf b/modules/local/gtf_filter/main.nf index cf3536f95..d14e8ff42 100644 --- a/modules/local/gtf_filter/main.nf +++ b/modules/local/gtf_filter/main.nf @@ -11,13 +11,13 @@ process GTF_FILTER { path gtf output: - path "*.filtered.gtf" , emit: genome_gtf - path "versions.yml" , emit: versions + path "*.filtered.gtf", emit: genome_gtf + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when - script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/ + script: // filter_gtf.py is bundled with the pipeline, in nf-core/rnaseq/bin/ """ filter_gtf.py \\ --gtf $gtf \\ diff --git a/nextflow_schema.json b/nextflow_schema.json index d350a90fa..f7c04d5ab 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -89,12 +89,12 @@ "type": "boolean", "fa_icon": "fas fa-forward", "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", - "help_text": "If you're confident on the validity of the GTF with respect to the genome file, or wish to disregard failures thriggered by the filtering module, activate this option." + "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option." }, "skip_gtf_transcript_filter": { "type": "boolean", "fa_icon": "fas fa-forward", - "description": "Skip just the transcript_id check of GTF filtering." + "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline." }, "gene_bed": { "type": "string", diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index 1df79b107..e86612001 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -310,20 +310,20 @@ workflow PREPARE_GENOME { } emit: - fasta = ch_fasta // channel: path(genome.fasta) - gtf = ch_gtf // channel: path(genome.gtf) - filtered_gtf = ch_filtered_gtf // channel: path(genome.gtf) - fai = ch_fai // channel: path(genome.fai) - gene_bed = ch_gene_bed // channel: path(gene.bed) - transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) - chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) - splicesites = ch_splicesites // channel: path(genome.splicesites.txt) - bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) - star_index = ch_star_index // channel: path(star/index/) - rsem_index = ch_rsem_index // channel: path(rsem/index/) - hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) - salmon_index = ch_salmon_index // channel: path(salmon/index/) - kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] + fasta = ch_fasta // channel: path(genome.fasta) + gtf = ch_gtf // channel: path(genome.gtf) + filtered_gtf = ch_filtered_gtf // channel: path(genome.gtf) + fai = ch_fai // channel: path(genome.fai) + gene_bed = ch_gene_bed // channel: path(gene.bed) + transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) + chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes) + splicesites = ch_splicesites // channel: path(genome.splicesites.txt) + bbsplit_index = ch_bbsplit_index // channel: path(bbsplit/index/) + star_index = ch_star_index // channel: path(star/index/) + rsem_index = ch_rsem_index // channel: path(rsem/index/) + hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) + salmon_index = ch_salmon_index // channel: path(salmon/index/) + kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } From e7126c80966e6d2999cc5db3f9b5bbe0e98a9ba5 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Nov 2023 15:53:32 +0000 Subject: [PATCH 40/43] Move --skip_* parameters to the same section --- nextflow_schema.json | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f7c04d5ab..60e6585cf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -85,17 +85,6 @@ "description": "Path to GFF3 annotation file.", "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified." }, - "skip_gtf_filter": { - "type": "boolean", - "fa_icon": "fas fa-forward", - "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", - "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option." - }, - "skip_gtf_transcript_filter": { - "type": "boolean", - "fa_icon": "fas fa-forward", - "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline." - }, "gene_bed": { "type": "string", "format": "file-path", @@ -531,6 +520,17 @@ "fa_icon": "fas fa-fast-forward", "description": "Options to skip various steps within the workflow.", "properties": { + "skip_gtf_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip filtering of GTF for valid scaffolds and/ or transcript IDs.", + "help_text": "If you're confident on the validity of the GTF with respect to the genome fasta file, or wish to disregard failures thriggered by the filtering module, activate this option." + }, + "skip_gtf_transcript_filter": { + "type": "boolean", + "fa_icon": "fas fa-forward", + "description": "Skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline." + }, "skip_bbsplit": { "type": "boolean", "default": true, From 13cc0e98b957a2b842857c75ef3fdf51977a6697 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Nov 2023 16:00:19 +0000 Subject: [PATCH 41/43] [skip ci] Tidy up code block --- subworkflows/local/prepare_genome/main.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index e86612001..b81005b8a 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -93,9 +93,7 @@ workflow PREPARE_GENOME { // // Apply filtering we may need for GTFs // - - ch_filtered_gtf = ch_gtf - + ch_filtered_gtf = ch_gtf filtering_useful = ( // Condition 1: Alignment is required and aligner is set to 'star_salmon' @@ -114,9 +112,10 @@ workflow PREPARE_GENOME { !transcript_fasta ) - if (filtering_useful){ + if (filtering_useful) { GTF_FILTER ( ch_fasta, ch_gtf ) ch_filtered_gtf = GTF_FILTER.out.genome_gtf + ch_versions = ch_versions.mix(GTF_FILTER.out.versions) } } From 3ac4a3afc7aac982866e56cbc92b329c0bcd312c Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Nov 2023 17:58:18 +0000 Subject: [PATCH 42/43] Move filter_gtf logic into main workflow --- subworkflows/local/prepare_genome/main.nf | 31 +++-------------------- workflows/rnaseq.nf | 30 ++++++++++++++++++---- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf index b81005b8a..0be947954 100644 --- a/subworkflows/local/prepare_genome/main.nf +++ b/subworkflows/local/prepare_genome/main.nf @@ -53,6 +53,7 @@ workflow PREPARE_GENOME { is_aws_igenome // boolean: whether the genome files are from AWS iGenomes biotype // string: if additional fasta file is provided biotype value to use when appending entries to GTF file prepare_tool_indices // list: tools to prepare indices for + filter_gtf // boolean: whether to filter GTF file main: @@ -90,31 +91,9 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(GFFREAD.out.versions) } - // - // Apply filtering we may need for GTFs - // - ch_filtered_gtf = ch_gtf - filtering_useful = - ( - // Condition 1: Alignment is required and aligner is set to 'star_salmon' - !params.skip_alignment && params.aligner == 'star_salmon' - ) || - ( - // Condition 2: Pseudo-alignment is required and pseudo-aligner is set to 'salmon' - !params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon' - ) || - ( - // Condition 3: Neither alignment nor stringtie are to be skipped - !params.skip_alignment && !params.skip_stringtie - ) || - ( - // Condition 4: Transcript FASTA file is not provided - !transcript_fasta - ) - - if (filtering_useful) { + if (filter_gtf) { GTF_FILTER ( ch_fasta, ch_gtf ) - ch_filtered_gtf = GTF_FILTER.out.genome_gtf + ch_gtf = GTF_FILTER.out.genome_gtf ch_versions = ch_versions.mix(GTF_FILTER.out.versions) } } @@ -166,7 +145,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions) } } else { - ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_filtered_gtf ).transcript_fasta + ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta ch_versions = ch_versions.mix(GTF_FILTER.out.versions) ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions) } @@ -311,7 +290,6 @@ workflow PREPARE_GENOME { emit: fasta = ch_fasta // channel: path(genome.fasta) gtf = ch_gtf // channel: path(genome.gtf) - filtered_gtf = ch_filtered_gtf // channel: path(genome.gtf) fai = ch_fai // channel: path(genome.fai) gene_bed = ch_gene_bed // channel: path(gene.bed) transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta) @@ -323,6 +301,5 @@ workflow PREPARE_GENOME { hisat2_index = ch_hisat2_index // channel: path(hisat2/index/) salmon_index = ch_salmon_index // channel: path(salmon/index/) kallisto_index = ch_kallisto_index // channel: [ meta, path(kallisto/index/) ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index 95a6b783d..403194ac2 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -39,6 +39,25 @@ if (!params.skip_bbsplit) { prepareToolIndices << 'bbsplit' } if (!params.skip_alignment) { prepareToolIndices << params.aligner } if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } +// Determine whether to filter the GTF or not +def filterGtf = + (( + // Condition 1: Alignment is required and aligner is set + !params.skip_alignment && params.aligner + ) || + ( + // Condition 2: Pseudoalignment is required and pseudoaligner is set + !params.skip_pseudo_alignment && params.pseudo_aligner + ) || + ( + // Condition 3: Transcript FASTA file is not provided + !params.transcript_fasta + )) && + ( + // Condition 4: --skip_gtf_filter is not provided + !params.skip_gtf_filter + ) + // Get RSeqC modules to run def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : [] if (params.bam_csi_index) { @@ -175,7 +194,8 @@ workflow RNASEQ { params.gencode, is_aws_igenome, biotype, - prepareToolIndices + prepareToolIndices, + filterGtf ) ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) @@ -384,7 +404,7 @@ workflow RNASEQ { ALIGN_STAR ( ch_filtered_reads, PREPARE_GENOME.out.star_index.map { [ [:], it ] }, - PREPARE_GENOME.out.filtered_gtf.map { [ [:], it ] }, + PREPARE_GENOME.out.gtf.map { [ [:], it ] }, params.star_ignore_sjdbgtf, '', params.seq_center ?: '', @@ -474,7 +494,7 @@ workflow RNASEQ { ch_transcriptome_bam, ch_dummy_file, PREPARE_GENOME.out.transcript_fasta, - PREPARE_GENOME.out.filtered_gtf, + PREPARE_GENOME.out.gtf, 'salmon', true, params.salmon_quant_libtype ?: '', @@ -652,7 +672,7 @@ workflow RNASEQ { if (!params.skip_alignment && !params.skip_stringtie) { STRINGTIE_STRINGTIE ( ch_genome_bam, - PREPARE_GENOME.out.filtered_gtf + PREPARE_GENOME.out.gtf ) ch_versions = ch_versions.mix(STRINGTIE_STRINGTIE.out.versions.first()) } @@ -810,7 +830,7 @@ workflow RNASEQ { ch_filtered_reads, ch_pseudo_index, ch_dummy_file, - PREPARE_GENOME.out.filtered_gtf, + PREPARE_GENOME.out.gtf, params.pseudo_aligner, false, params.salmon_quant_libtype ?: '', From b67558d2757cf5c1f05528b328d2381a1f42d7cb Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 15 Nov 2023 18:06:06 +0000 Subject: [PATCH 43/43] [automated] Fix linting with Prettier --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80c51ea61..5b5e0e7ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,10 +32,10 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #1106](https://github.com/nf-core/rnaseq/pull/1106) - Kallisto quantification - [PR #1107](https://github.com/nf-core/rnaseq/pull/1107) - Expand GTF filtering to remove rows with empty transcript ID when required, fix STAR GTF usage - [#1050](https://github.com/nf-core/rnaseq/issues/1050) - Provide custom prefix/suffix for summary files to avoid overwriting -- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom +- [#1074](https://github.com/nf-core/rnaseq/issues/1074) - Enable quantification using StringTie AND a custom - [#1082](https://github.com/nf-core/rnaseq/issues/1082) - More informative error message for `filter_gtf_for_genes_in_genome.py` - [#1102](https://github.com/nf-core/rnaseq/issues/1102) - gene entries with empty transcript_id fields -Ensembl genome + Ensembl genome ### Software dependencies