forked from angelovangel/nxf-fastqc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.nf
312 lines (256 loc) · 10.2 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// FASTQC pipeline
/*
NXF ver 19.08+ needed because of the use of tuple instead of set
*/
if( !nextflow.version.matches('>=19.08') ) {
println "This workflow requires Nextflow version 19.08 or greater and you are running version $nextflow.version"
exit 1
}
/*
* ANSI escape codes to color output messages, get date to use in results folder name
*/
ANSI_GREEN = "\033[1;32m"
ANSI_RED = "\033[1;31m"
ANSI_RESET = "\033[0m"
// date needed to prefix results dir
DATE = new java.util.Date()
sdf = new java.text.SimpleDateFormat("yyyy-MM-dd")
fdate = sdf.format(DATE)
//println sdf.format(DATE)
// needed to pretty print read/bases counts
import java.text.DecimalFormat
df = new DecimalFormat("###,###") //TODO add symbols to fix US locale, http://tutorials.jenkov.com/java-internationalization/decimalformat.html#creating-a-decimalformat-for-a-specific-locale
//println df.format( 5.13e+23.toBigInteger() )
/*
in bash this works OK:
LC_NUMERIC=en_GB.UTF-8 printf "%'.0f\n" 1.345e+12
or in ksh (only separately, not as part of a pipe or with xargs)
printf '%#d\n' 105000000
gives
105M directly! works for G, T, P ...why not in bash?
even this works!!!
printf "%#d" 12e+12
so no need for java classes
*/
/*
* pipeline input parameters
*/
params.readsdir = "fastq"
params.outdir = "${workflow.launchDir}/results-fastqc"
params.fqpattern = "*R{1,2}*.fastq.gz"
params.ontreads = false
//params.threads = 2 //makes no sense I think, to be removed
params.multiqc_config = "$baseDir/multiqc_config.yml" //custom config mainly for sample names
params.title = "Summarized nxf-fastqc report"
params.help = ""
mqc_config = file(params.multiqc_config) // this is needed, otherwise the multiqc config file is not available in the docker image
if (params.help) {
helpMessage()
exit(0)
}
log.info """
===========================================
F A S T Q C P I P E L I N E
Used parameters:
-------------------------------------------
--readsdir : ${params.readsdir}
--fqpattern : ${params.fqpattern}
--outdir : ${params.outdir}
--ontreads : ${params.ontreads}
--multiqc_config : ${params.multiqc_config}
--title : ${params.title}
Runtime data:
-------------------------------------------
Running with profile: ${ANSI_GREEN}${workflow.profile}${ANSI_RESET}
Running as user: ${ANSI_GREEN}${workflow.userName}${ANSI_RESET}
Launch dir: ${ANSI_GREEN}${workflow.launchDir}${ANSI_RESET}
Base dir: ${ANSI_GREEN}${baseDir}${ANSI_RESET}
"""
.stripIndent()
def helpMessage() {
log.info """
===========================================
F A S T Q C P I P E L I N E
Usage:
-------------------------------------------
--readsdir : directory with fastq files, default is "fastq"
--fqpattern : regex pattern to match fastq files, default is "*R{1,2}*.fastq.gz"
--outdir : where results will be saved, default is "results-fastp"
--ontreads : use this parameter for Nanopore reads
--multiqc_config : path to config file for MultiQC, default is "multiqc_config.yml"
--title : MultiQC report title, default is "Summarized fastp report"
===========================================
"""
.stripIndent()
}
//just in case trailing slash in readsdir not provided...
readsdir_repaired = "${params.readsdir}".replaceFirst(/$/, "/")
//println(readsdir_repaired)
// build search pattern for fastq files in input dir
reads = readsdir_repaired + params.fqpattern
// get counts of found fastq files
readcounts = file(reads)
println " Reads found: ${ANSI_GREEN}${readcounts.size()}${ANSI_RESET}"
// channel for read pairs --> fastp
Channel
.fromFilePairs( reads, checkIfExists: true, size: -1 ) // default is 2, so set to -1 to allow any number of files
.ifEmpty { error "Can not find any reads matching ${reads}" }
.set{ read_pairs_ch }
// channel for reads --> seqtools
Channel
.fromPath( reads, checkIfExists: true )
.set { reads_ch }
// fastp trimmed files are published, json are only sent in the channel and used only by multiqc
process fastp {
tag "fastp on $sample_id"
//echo true
publishDir params.outdir, mode: 'copy', pattern: 'fastp_trimmed/*' // publish only trimmed fastq files
input:
tuple sample_id, file(x) from read_pairs_ch
output:
file("${sample_id}_fastp.json") into fastp_ch
file('fastp_trimmed/trim_*')
file("${sample_id}_fastp.json") into fastp_ch2
val seqmode into seqmode_ch
script:
def single = x instanceof Path // this is from Paolo: https://groups.google.com/forum/#!topic/nextflow/_ygESaTlCXg
def qscore_cutoff = params.ontreads ? 7 : 15 //here ontreads matters
if ( !single ) {
seqmode = "PE"
"""
mkdir fastp_trimmed
fastp \
-q $qscore_cutoff \
-i ${x[0]} -I ${x[1]} \
-o fastp_trimmed/trim_${x[0]} -O fastp_trimmed/trim_${x[1]} \
-j ${sample_id}_fastp.json
"""
}
else {
seqmode = "SE"
"""
mkdir fastp_trimmed
fastp \
-q $qscore_cutoff \
-i ${x} -o fastp_trimmed/trim_${x} \
-j ${sample_id}_fastp.json
"""
}
}
//=========================
/*
* This process reads the json files from fastp (using jq)
* sums the reads/bases from all read files!
* and sends the stdout in the channels total_reads and total_bases. The stdout is in this case a single value.
* These are used later in multiqc (via the --cl_config parameter)
* to add the numbers as section comments
*/
process summary {
input:
file x from fastp_ch2.collect()
output:
stdout into total_reads //the channel contains 4 values, sep by new line
script:
"""
#!/usr/bin/env ksh
# check if jq installed, relevant only in local envs
command -v jq >/dev/null 2>&1 || \
{ echo >&2 "jq required but not installed, install it or use -profile docker or conda. Aborting."; exit 1;}
# prefer to keep the tmp files in the scratch for reference
jq '.summary.before_filtering.total_reads' $x | awk '{sum+=\$0} END{print sum}' > treads-before.tmp
jq '.summary.after_filtering.total_reads' $x | awk '{sum+=\$0} END{print sum}' > treads-after.tmp
jq '.summary.before_filtering.total_bases' $x | awk '{sum+=\$0} END{print sum}' > tbases-before.tmp
jq '.summary.after_filtering.total_bases' $x | awk '{sum+=\$0} END{print sum}' > tbases-after.tmp
cat treads-before.tmp treads-after.tmp tbases-before.tmp tbases-after.tmp | xargs siformat.sh
"""
}
//=========================
process multiqc {
publishDir params.outdir, mode: 'copy'
when:
!params.ontreads // multiqc fails on fastp json from ont files! too big
input:
file x from fastp_ch.collect()
file mqc_config
val y from total_reads // y is a string with 4 values sep by new line now, but still length 1
val seqmode from seqmode_ch.last() // PE or SE, see process fastp, last() because it emits one entry for each fastq file
output:
file('multiqc_report.html')
file('multiqc_report_data/multiqc_general_stats.txt')
// when using --title, make sure that the --filename is explicit, otherwise
// multiqc uses the title string as output filename
script:
// currently y is of length 1, the split gets the values in a list (see string split() method in groovy)
def splitstring = y.split()
"""
multiqc --force --interactive \
--title "${params.title}" \
--filename "multiqc_report.html" \
--config $mqc_config \
--cl_config "section_comments:
{ fastp: '*This is ${ seqmode } data *<br>
Total reads before filter: ** ${ splitstring[0] } ** <br>
Total reads after filter: ** ${ splitstring[1] } ** <br>
Total bases before filter: ** ${ splitstring[2] } ** <br>
Total bases after filter: ** ${ splitstring[3] } ** <br> <br>
See also the report:
<a href="fastq-stats-report.html" target="_blank">fastq-stats-report.html</a>,
which contains per-read data about Phred-score and k-mer distribution, N50 etc. <br>
The raw data from both reports (as *.csv files) can be found in the results folder.'
}
" \
${x}
"""
}
//=============================
Channel.fromPath("${baseDir}/bin/fastq-stats-report.Rmd").set{ fastq_stats_report_ch }
Channel.fromPath("${baseDir}/bin/fastq-stats-report-ont.Rmd").set{ fastq_stats_report_ont_ch }
if (!params.ontreads) {
process fastq_stats_ilmn {
publishDir params.outdir, mode: 'copy'
input:
file x from reads_ch.collect()
file 'fastq-stats-report.Rmd' from fastq_stats_report_ch
output:
file 'fastq-stats-report.html'
file "fastq-stats.csv"
file "fastq-stats.xlsx"
script:
"""
seqtools.R $x
"""
}
} else {
process fastq_stats_ont {
publishDir params.outdir, mode: 'copy'
input:
file x from reads_ch.collect()
file 'fastq-stats-report-ont.Rmd' from fastq_stats_report_ont_ch
output:
file 'fastq-stats-report-ont.html'
file "fastq-stats.csv"
file "fastq-stats.xlsx"
script:
"""
seqtools-ont.R $x
"""
}
}
//=============================
workflow.onComplete {
if (workflow.success) {
log.info """
===========================================
${ANSI_GREEN}Finished in ${workflow.duration}
Processed ${ readcounts.size() } fastq files
See the report here ==> ${ANSI_RESET}$params.outdir/multiqc_report.html
"""
.stripIndent()
} else {
log.info """
===========================================
${ANSI_RED}Finished with errors!${ANSI_RESET}
"""
.stripIndent()
}
}