diff --git a/bin/get_fasta_largest_isoform.TrinityMS.pl b/bin/get_fasta_largest_isoform.TrinityMS.pl index 2269273..fd57b3a 100755 --- a/bin/get_fasta_largest_isoform.TrinityMS.pl +++ b/bin/get_fasta_largest_isoform.TrinityMS.pl @@ -4,9 +4,14 @@ use FindBin; use lib ("$FindBin::RealBin/PerlLib"); use Getopt::Std; +use Bio::SeqIO; +# This script reads in a fasta file and produces the longest transcript for each gene (if present in ensembl, trinity or basic headers are present). +# This is to stop multiple isoforms of the same gene going through. +# If you wish all genes/isoforms to go through, use "basic" +# It then prints out the longest isoform for each gene, or all genes. -die "Please specify (1)fasta file (2) nucl type (ensembl, trinity)\n" unless(@ARGV==2); +die "Please specify (1)fasta file (2) nucl type (ensembl, trinity, basic)\n" unless(@ARGV==2); my $fastafile = $ARGV[0]; my $nucl_type = $ARGV[1]; @@ -19,7 +24,6 @@ if ($nucl_type eq "trinity"){ -use Bio::SeqIO; my $seqio = Bio::SeqIO->new(-file => "$fastafile", '-format' => 'Fasta'); my %fastadictionary=(); my @headersplit=(); @@ -49,7 +53,7 @@ foreach my $key ( sort keys %fastadictionary){ if ($fastadictionary{$key} eq "Sequenceunavailable"){ - + #Do nothing } else{ print $outhandle ">$key\n$fastadictionary{$key}\n"; @@ -61,7 +65,6 @@ if ($nucl_type eq "ensembl"){ -use Bio::SeqIO; my $seqio = Bio::SeqIO->new(-file => "$fastafile", '-format' => 'Fasta'); my %fastadictionary=(); my @headersplit=(); @@ -87,20 +90,53 @@ } -print "Now print new fasta with one main protein per gene.\n"; +#print "Now print new fasta with one main protein per gene.\n"; foreach my $key ( sort keys %fastadictionary){ if ($fastadictionary{$key} eq "Sequenceunavailable"){ - + #Do nothing } else{ print $outhandle ">$key\n$fastadictionary{$key}\n"; - } - + } +} + +} + + +if ($nucl_type eq "basic"){ + +my $seqio = Bio::SeqIO->new(-file => "$fastafile", '-format' => 'Fasta'); +my %fastadictionary=(); +my @headersplit=(); +while (my $seq = $seqio->next_seq){ ## selects one sequence at a time + ## set variables for THIS sequence + my $id = $seq->display_id; + my $string = $seq->seq; + my $len=length($string); + if ($fastadictionary{$id}){ + my $len_old=length($fastadictionary{$id}); + if ($len >= $len_old){ + $fastadictionary{$id}=$string; + } + } + else{ + $fastadictionary{$id}=$string; + } } +print "Now print new fasta with one main protein per gene.\n"; + +foreach my $key ( sort keys %fastadictionary){ + if ($fastadictionary{$key} eq "Sequenceunavailable"){ + #Do nothing + } + else{ + print $outhandle ">$key\n$fastadictionary{$key}\n"; + } } +} print "Finished: input lines, output lines\n"; diff --git a/bin/get_fasta_largest_isoform.ensembl.pl b/bin/get_fasta_largest_isoform.ensembl.pl deleted file mode 100755 index f4a3905..0000000 --- a/bin/get_fasta_largest_isoform.ensembl.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -use warnings; -use strict; -use FindBin; -use lib ("$FindBin::RealBin/PerlLib"); -use Getopt::Std; - - -die "Please specify (1)fasta file\n" unless(@ARGV==1); - -my $fastafile = $ARGV[0]; -my $outfile="$fastafile\.largestIsoform"; - -#open(my $fastahandle, "<", $fastafile) or die "Could not open $fastafile \n"; no need to open here for bioperl -#open(my $fastafile, "<", $fastafile) or die "Could not open $fastafile \n"; -open(my $outhandle, ">", $outfile) or die "Could not open $outfile \n"; - - -use Bio::SeqIO; -my $seqio = Bio::SeqIO->new(-file => "$fastafile", '-format' => 'Fasta'); -my %fastadictionary=(); -my @headersplit=(); 
-while (my $seq = $seqio->next_seq){ ## selects one sequence at a time - ## set variables for THIS sequence - my $id = $seq->display_id; - my $string = $seq->seq; - my @split=split(/\:/, $id); - #my @spl2=split("\_", $split[0]); - #my $waste=pop @spl2; - my $new_id=$split[1]; - my $len=length($string); - #print "length = $len\n"; - if ($fastadictionary{$new_id}){ - my $len_old=length($fastadictionary{$new_id}); - if ($len >= $len_old){ - $fastadictionary{$new_id}=$string; - } - } - else{ - $fastadictionary{$new_id}=$string; - } - -} - -print "Now print new fasta with one main protein per gene.\n"; - -foreach my $key ( sort keys %fastadictionary){ - if ($fastadictionary{$key} eq "Sequenceunavailable"){ - - } - else{ - print $outhandle ">$key\n$fastadictionary{$key}\n"; - } - -} - -print "Finished: input lines, output lines\n"; - diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..5d6af80 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,67 @@ +// Setting default labels with associated resource allocation +process { + + // Default resources are same as single label for any unlabelled processes that may be added + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + time = { check_max( 1.h * task.attempt, 'time' ) } + + // If error is due to timeout or exceeding resource limits then retry + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + // Do not allow any errors when retrying + maxErrors = '-1' + + withLabel:process_single { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + time = { check_max( 1.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 24.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 64.GB * task.attempt, 'memory' ) } + } + withLabel:process_med_memory { + memory = { check_max( 20.GB * task.attempt, 'memory' ) } + } + withLabel:download_nr { + memory = 12.GB + cpus = 2 + time = 24.h + } + withLabel:blastdb { + memory = 40.GB + cpus = 4 + time = 8.h + } + withLabel:blast { + memory = 40.GB + cpus = 4 + time = 8.h + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } + +} diff --git a/main.firstattempt.nf b/main.firstattempt.nf deleted file mode 100755 index 977b37a..0000000 --- a/main.firstattempt.nf +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2021 - */ - - - /* - * Authors: - * - Chris Wyatt - */ - -params.proteins= false -params.nucleotide = false -params.nucl_type = "ensembl" -params.predownloaded= false -params.numhits = 1 -params.outdir = "results" -params.names = false -params.nodes = false -params.level = "family" -params.sensitivity= "fast" - -log.info """\ - =================================== - proteins : ${params.proteins} - nucleotides : 
${params.nucleotide} - out directory : ${params.outdir} - """ - -//================================================================================ -// Include modules -//================================================================================ - -include { DOWNLOAD } from './modules/download.nf' -include { MAKE_DB } from './modules/make_blast_db.nf' -include { DIAMOND_BLAST } from './modules/diamond_blast.nf' -include { PLOT_PIE } from './modules/plot_taxonomy_pie.nf' -include { T_DECODER } from './modules/transdecoder.nf' - - -workflow { - if ( params.proteins ){ - input_target_proteins = channel - .fromPath(params.proteins) - .splitCsv() - .ifEmpty { error "Cannot find the list of protein files: ${params.proteins}" } - } - else if( params.nucleotide ){ - input_target_nucleotide = channel - .fromPath(params.nucleotide) - .splitCsv() - .ifEmpty { error "Cannot find the list of nucleotide files: ${params.nucleotide}" } - .view() - T_DECODER ( input_target_nucleotide ) - T_DECODER.out.protein.set{ input_target_proteins } - } - else{ - println "You have not set the input protein or nucleotide option\n" - } - - - if ( !params.predownloaded ){ - // If not predownloaded then wget all from NCBI. - println "Downloading a new nr database with taxonomy information\n" - DOWNLOAD () - MAKE_DB ( DOWNLOAD.out.database , DOWNLOAD.out.accession2taxid , DOWNLOAD.out.tax_nodes , DOWNLOAD.out.tax_names ) - DIAMOND_BLAST ( input_target_proteins , MAKE_DB.out.blast_database ) - PLOT_PIE ( DOWNLOAD.out.tax_nodes , DOWNLOAD.out.tax_names , DIAMOND_BLAST.out.blast_hits ) - - } - else{ - //input_database = channel - // .fromPath(params.predownloaded) - // .ifEmpty { error "Cannot find the blast database : ${params.predownloaded}" } - //input_names = channel - // .fromPath(params.names) - // .ifEmpty { error "Cannot find the blast database : ${params.names}" } - //input_nodes = channel - // .fromPath(params.nodes) - // .ifEmpty { error "Cannot find the blast database : ${params.nodes}" } - DIAMOND_BLAST ( input_target_proteins , params.predownloaded ) - PLOT_PIE ( params.nodes , params.names , DIAMOND_BLAST.out.blast_hits ) - } -} - -workflow.onComplete { - println ( workflow.success ? "\nDone! Open your report in your browser --> $params.outdir/report.html (if you added -with-report flag)\n" : "Hmmm .. 
something went wrong" ) -} diff --git a/main.nf b/main.nf index cd88022..f632b3c 100755 --- a/main.nf +++ b/main.nf @@ -1,8 +1,9 @@ /* * Copyright (c) 2021 */ + -/* + /* * Authors: * - Chris Wyatt */ @@ -10,22 +11,20 @@ params.proteins= false params.nucleotide = false params.predownloaded= false +params.numhits = 1 params.outdir = "results" params.names = false params.nodes = false -params.numhits = null -params.tophits = null params.level = "family" -params.sensitivity = "fast" -params.horizontal = false -params.xml = false +params.sensitivity= "fast" +params.nucl_type = "trinity" log.info """\ -=================================== -proteins : ${params.proteins} -nucleotides : ${params.nucleotide} -out directory : ${params.outdir} -""" + =================================== + proteins : ${params.proteins} + nucleotides : ${params.nucleotide} + out directory : ${params.outdir} + """ //================================================================================ // Include modules @@ -33,55 +32,22 @@ out directory : ${params.outdir} include { DOWNLOAD } from './modules/download.nf' include { MAKE_DB } from './modules/make_blast_db.nf' +include { DIAMOND_BLAST } from './modules/diamond_blast.nf' include { PLOT_PIE } from './modules/plot_taxonomy_pie.nf' include { T_DECODER } from './modules/transdecoder.nf' -include { DIAMOND_BLAST } from './modules/diamond_blast.nf' -include { DIAMOND_HORIZONTAL } from './modules/diamond_horizontal.nf' -include { DIAMOND_XML } from './modules/diamond_xml.nf' - -workflow { - input_database = Channel.empty() - input_names = Channel.empty() - input_nodes = Channel.empty() - - if ( !params.predownloaded ){ - // If not predownloaded then wget all from NCBI. - println "Downloading a new nr database with taxonomy information\n" - DOWNLOAD () - MAKE_DB ( DOWNLOAD.out.database , DOWNLOAD.out.accession2taxid , DOWNLOAD.out.tax_nodes , DOWNLOAD.out.tax_names ) - MAKE_DB.out.blast_database.first().set{ input_database } - DOWNLOAD.out.tax_names.first().set{ input_names } - DOWNLOAD.out.tax_nodes.first().set{ input_nodes } - } - else{ - input_database = channel - .fromPath(params.predownloaded) - .ifEmpty { error "Cannot find the blast database : ${params.predownloaded}" } - .first() - input_names = channel - .fromPath(params.names) - .ifEmpty { error "Cannot find the blast database : ${params.names}" } - .first() - input_nodes = channel - .fromPath(params.nodes) - .ifEmpty { error "Cannot find the blast database : ${params.nodes}" } - .first() - } - input_target_proteins = Channel.empty() +workflow { if ( params.proteins ){ input_target_proteins = channel .fromPath(params.proteins) - .splitCsv() .ifEmpty { error "Cannot find the list of protein files: ${params.proteins}" } } else if( params.nucleotide ){ input_target_nucleotide = channel .fromPath(params.nucleotide) - .splitCsv() .ifEmpty { error "Cannot find the list of protein files: ${params.nucleotide}" } - T_DECODER ( input_target_nucleotide ) + T_DECODER ( input_target_nucleotide , params.nucl_type ) T_DECODER.out.protein.set{ input_target_proteins } } else{ @@ -89,16 +55,30 @@ workflow { } - DIAMOND_BLAST ( input_target_proteins , input_database ) - - PLOT_PIE ( input_nodes , input_names , DIAMOND_BLAST.out.blast_hits ) - - //if ( params.horizontal ){ - // DIAMOND_HORIZONTAL ( input_target_proteins , input_database ) - //} - - if ( params.xml ){ - DIAMOND_XML ( input_target_proteins , input_database ) + if ( !params.predownloaded ){ + // If not predownloaded then wget all from NCBI. 
+ println "Downloading a new nr database with taxonomy information\n" + DOWNLOAD () + MAKE_DB ( DOWNLOAD.out.database , DOWNLOAD.out.accession2taxid , DOWNLOAD.out.tax_nodes , DOWNLOAD.out.tax_names ) + DIAMOND_BLAST ( input_target_proteins , MAKE_DB.out.blast_database ) + PLOT_PIE ( DOWNLOAD.out.tax_names , DOWNLOAD.out.tax_nodes , DIAMOND_BLAST.out.blast_hits ) + + } + else{ + input_database = channel + .fromPath(params.predownloaded) + .ifEmpty { error "Cannot find the blast database : ${params.predownloaded}" } + .first() + input_names = channel + .fromPath(params.names) + .ifEmpty { error "Cannot find the blast database : ${params.names}" } + .first() + input_nodes = channel + .fromPath(params.nodes) + .ifEmpty { error "Cannot find the blast database : ${params.nodes}" } + .first() + DIAMOND_BLAST ( input_target_proteins , input_database ) + PLOT_PIE ( input_nodes , input_names , DIAMOND_BLAST.out.blast_hits ) } } diff --git a/main.secondattempt.nf b/main.secondattempt.nf deleted file mode 100755 index d8a815a..0000000 --- a/main.secondattempt.nf +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2021 - */ - - - /* - * Authors: - * - Chris Wyatt - */ - -params.proteins= false -params.nucleotide = false -params.predownloaded= false -params.numhits = 1 -params.outdir = "results" -params.names = false -params.nodes = false -params.level = "family" -params.sensitivity= "fast" - -log.info """\ - =================================== - proteins : ${params.proteins} - nucleotides : ${params.nucleotide} - out directory : ${params.outdir} - """ - -//================================================================================ -// Include modules -//================================================================================ - -include { DOWNLOAD } from './modules/download.nf' -include { MAKE_DB } from './modules/make_blast_db.nf' -include { DIAMOND_BLAST } from './modules/diamond_blast.nf' -include { PLOT_PIE } from './modules/plot_taxonomy_pie.nf' -include { T_DECODER } from './modules/transdecoder.nf' - - -workflow { - if ( params.proteins ){ - input_target_proteins = channel - .fromPath(params.proteins) - .ifEmpty { error "Cannot find the list of protein files: ${params.proteins}" } - } - else if( params.nucleotide ){ - input_target_nucleotide = channel - .fromPath(params.nucleotide) - .ifEmpty { error "Cannot find the list of protein files: ${params.nucleotide}" } - T_DECODER ( input_target_nucleotide ) - T_DECODER.out.protein.set{ input_target_proteins } - } - else{ - println "You have not set the input protein or nucleotide option\n" - } - - - if ( !params.predownloaded ){ - // If not predownloaded then wget all from NCBI. 
- println "Downloading a new nr database with taxonomy information\n" - DOWNLOAD () - MAKE_DB ( DOWNLOAD.out.database , DOWNLOAD.out.accession2taxid , DOWNLOAD.out.tax_nodes , DOWNLOAD.out.tax_names ) - DIAMOND_BLAST ( input_target_proteins , MAKE_DB.out.blast_database ) - PLOT_PIE ( DOWNLOAD.out.tax_names , DOWNLOAD.out.tax_nodes , DIAMOND_BLAST.out.blast_hits ) - - } - else{ - input_database = channel - .fromPath(params.predownloaded) - .ifEmpty { error "Cannot find the blast database : ${params.predownloaded}" } - .first() - input_names = channel - .fromPath(params.names) - .ifEmpty { error "Cannot find the blast database : ${params.names}" } - .first() - input_nodes = channel - .fromPath(params.nodes) - .ifEmpty { error "Cannot find the blast database : ${params.nodes}" } - .first() - DIAMOND_BLAST ( input_target_proteins , input_database ) - PLOT_PIE ( input_nodes , input_names , DIAMOND_BLAST.out.blast_hits ) - } -} - -workflow.onComplete { - println ( workflow.success ? "\nDone! Open your report in your browser --> $params.outdir/report.html (if you added -with-report flag)\n" : "Hmmm .. something went wrong" ) -} diff --git a/modules/diamond_blast.nf b/modules/diamond_blast.nf index 8b25e72..bfae37b 100755 --- a/modules/diamond_blast.nf +++ b/modules/diamond_blast.nf @@ -1,7 +1,7 @@ process DIAMOND_BLAST { label 'blast' + container = 'chriswyatt/diamond' publishDir "$params.outdir/Blast_results/", mode:'copy' - //stageInMode 'copy' input: path proteins diff --git a/modules/diamond_horizontal.nf b/modules/diamond_horizontal.nf index 862cd4b..9b08d83 100755 --- a/modules/diamond_horizontal.nf +++ b/modules/diamond_horizontal.nf @@ -1,7 +1,7 @@ process DIAMOND_HORIZONTAL { label 'blast' + container = 'chriswyatt/diamond' publishDir "$params.outdir/Blast_results/", mode:'copy' - //stageInMode 'copy' input: path proteins diff --git a/modules/diamond_xml.nf b/modules/diamond_xml.nf index ae16ae6..e80c0fb 100755 --- a/modules/diamond_xml.nf +++ b/modules/diamond_xml.nf @@ -1,7 +1,7 @@ process DIAMOND_XML { label 'blast' + container = 'chriswyatt/diamond' publishDir "$params.outdir/Blast_results/", mode:'copy' - //stageInMode 'copy' input: path proteins diff --git a/modules/download.nf b/modules/download.nf index 05236a4..3b51266 100755 --- a/modules/download.nf +++ b/modules/download.nf @@ -1,11 +1,13 @@ process DOWNLOAD { - label 'download' - + label 'download_nr' + time 24.h + container 'quay.io/ecoflowucl/ncbi_download:v16.1.2-arm64' output: path("nr.gz") , emit: database - path("prot.accession2taxid.FULL") , emit: accession2taxid + path("prot.accession2taxid.FULL") , emit: accession2taxid path("nodes.dmp") , emit: tax_nodes path("names.dmp") , emit: tax_names + script: """ wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz diff --git a/modules/make_blast_db.nf b/modules/make_blast_db.nf index e44f2d7..e2e83d4 100755 --- a/modules/make_blast_db.nf +++ b/modules/make_blast_db.nf @@ -1,6 +1,7 @@ process MAKE_DB { label 'blastdb' - //stageInMode 'copy' + container = 'chriswyatt/diamond' + input: path nr_gz path prot_accession2taxid diff --git a/modules/plot_taxonomy_pie.nf b/modules/plot_taxonomy_pie.nf index e0ab058..e7529d8 100755 --- a/modules/plot_taxonomy_pie.nf +++ b/modules/plot_taxonomy_pie.nf @@ -1,5 +1,6 @@ process PLOT_PIE { label 'perl_pie' + container = 'chriswyatt/perl_r_e1071' publishDir "$params.outdir/Blast_results/", mode:'copy', pattern: '*top.tsv' publishDir "$params.outdir/Taxo_figure/", mode:'copy', pattern: '*.pdf' publishDir "$params.outdir/Taxo_summary/", 
mode:'copy', pattern: '*_summary.tsv' diff --git a/modules/transdecoder.nf b/modules/transdecoder.nf index 18b990b..fc41fcb 100755 --- a/modules/transdecoder.nf +++ b/modules/transdecoder.nf @@ -1,16 +1,18 @@ process T_DECODER { - label 'transdecoder' + label 'process_low' publishDir "$params.outdir/Prot/", mode:'copy', pattern: '*.prot.fa' + container = 'chriswyatt/transdecoder' input: - path fasta_file - + path fasta_file + val type + output: path("${fasta_file}.prot.fa") , emit: protein script: """ - get_fasta_largest_isoform.TrinityMS.pl $fasta_file $params.nucl_type + get_fasta_largest_isoform.TrinityMS.pl $fasta_file $type TransDecoder.LongOrfs -t ${fasta_file}.largestIsoform --output_dir Output cp Output/longest_orfs.pep ${fasta_file}.prot.fa """ diff --git a/myriad.config b/myriad.config new file mode 100644 index 0000000..9e63200 --- /dev/null +++ b/myriad.config @@ -0,0 +1,15 @@ +executor { + name = 'sge' +} + +apptainer.runOptions = "-B ${HOME},${PWD}" + +process { + + //NEED TO SET PARALLEL ENVIRONMENT TO SMP SO MULTIPLE CPUS CAN BE SUBMITTED + penv = 'smp' + + //PROVIDE EXTRA PARAMETERS AS CLUSTER OPTIONS + clusterOptions = { "-S /bin/bash -l mem=${task.memory.mega}M" } + //clusterOptions = { "-S /bin/bash-l h_rt=${task.time.toHours()}h" } +} diff --git a/nextflow.config b/nextflow.config index 509bce0..4d1e359 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,4 +15,55 @@ profiles { cscluster { includeConfig 'conf/cscluster.config' } + apptainer { + apptainer.enabled = true + // Set where singularity cached images are saved + apptainer.cacheDir = "apptainer/cachedir" + // set registry to quay.io + //apptainer.registry = "quay.io" + // Ensure other container engines are unset + docker.enabled = false + singularity.enabled = false + } +} + +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' + + +// Function to ensure that resource requirements don't go beyond a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } + } +} + +params { + max_cpus = 4 // or any valid integer + max_memory = '32 GB' }
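
Note on the check_max helper introduced in nextflow.config above: it reads params.max_time in addition to params.max_cpus and params.max_memory, but the new params block only defines the latter two, so the time branch will always fall into its catch, print the "is not valid" warning, and fall back to the requested value. A minimal sketch of the params block with a time cap included — the 96.h figure is an assumed placeholder, not a value taken from this pipeline:

params {
    max_cpus   = 4         // or any valid integer
    max_memory = '32 GB'
    max_time   = '96.h'    // assumed placeholder; any value parseable as nextflow.util.Duration works
}

With max_time set, a label such as process_long requesting 24.h * task.attempt is capped at that ceiling instead of triggering the fallback warning on every task.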