Merge pull request #117 from rki-mf1/pacbio

add pacbio specific minimap2 parameter and input option
rki-mf1 · Dec 12, 2024 · 9012f7a · 9012f7a
2 parents cf29de1 + b89c99a
commit 9012f7a
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -15,11 +15,11 @@ Technologies ([DNA CS (DCS)](https://assets.ctfassets.net/hkzaxo8a05x5/2IX56YmF5
 
 ## What this workflow does for you
 
-With this workflow you can screen and clean your Illumina, Nanopore or any FASTA-formated sequence data. The results are the clean sequences and the sequences identified as contaminated. Per default [minimap2](https://github.com/lh3/minimap2) is used for aligning your sequences to reference sequences (with the `map-ont` settings for Nanopore data and `sr` settings for short-read data activated automatically). However, for short-read data, you may want to switch to [BWA](https://github.com/lh3/bwa) (`--bwa`). As another alternative, we provide `bbduk`, part of [BBTools](https://github.com/BioInfoTools/BBMap), as a kmer-based approach (`--bbduk`). However, no mapping file will be produced with `bbduk` and thus some subsequent statistics are not calculated. 
+With this workflow you can screen and clean your Illumina, Nanopore, PacBio CLR or any FASTA-formatted sequence data. The results are the clean sequences and the sequences identified as contaminated. Per default [minimap2](https://github.com/lh3/minimap2) is used for aligning your sequences to reference sequences (with the `map-ont` settings for Nanopore data, `map-pb` for PacBio CLR data, and `sr` settings for short-read data activated automatically). However, for short-read data, you may want to switch to [BWA](https://github.com/lh3/bwa) (`--bwa`). As another alternative, we provide `bbduk`, part of [BBTools](https://github.com/BioInfoTools/BBMap), as a kmer-based approach (`--bbduk`). However, no mapping file will be produced with `bbduk` and thus some subsequent statistics are not calculated.
 
 You can simply specify provided hosts and controls for the cleanup or use your own FASTA files. The reads are then mapped (or kmer-based compared in case of `bbduk`) against the specified host, control, and user defined FASTA files. All reads that match are considered as contamination. In case of Illumina paired-end reads, both mates need to be aligned (singleton files will be produced otherwise).
 
-The read input is defined via `--input_type nano` for Nanopore and `--input_type illumina` or `--input_type illumina_single_end` for Illumina reads. Additional control(s) for decontamination can be defined via `--control`. If controls are defined, they are selectively concatenated with the host and potential own FASTA files for decontamination. We provide auto-download for the following controls: `dcs` for Nanopore DNA-Seq, `eno` for Nanopore RNA-Seq, and `phix` from Illumina data. In general, specified host, control, and user defined FASTA files are concatenated for decontamination.
+The read input is defined via `--input_type nano` for Nanopore, `--input_type pacbio` for PacBio CLR, and `--input_type illumina` or `--input_type illumina_single_end` for Illumina reads. Additional control(s) for decontamination can be defined via `--control`. If controls are defined, they are selectively concatenated with the host and any user-supplied FASTA files for decontamination. We provide auto-download for the following controls: `dcs` for Nanopore DNA-Seq, `eno` for Nanopore RNA-Seq, and `phix` for Illumina data. In general, specified host, control, and user defined FASTA files are concatenated for decontamination.
 
 ### Filter soft-clipped contamination reads
 

diff --git a/clean.nf b/clean.nf
@@ -26,7 +26,7 @@ if (params.input.contains('.clean.') ) {
 /*
 Comment section: First part is a terminal print for additional user information,
 followed by some help statements (e.g. missing input) Second part is file
-channel input. This allows via --list to alter the input of --nano & --illumina
+channel input. With --list, the value of --input is read as a CSV file instead:
 one line per sample, name,path or name,pathR1,pathR2 in case of paired-end illumina
 */
 
@@ -80,7 +80,7 @@ if ( workflow.profile.contains('singularity') ) {
 
 Set controls = ['phix', 'dcs', 'eno']
 Set hosts = ['hsa', 'mmu', 'cli', 'csa', 'gga', 'eco', 'sc2', 't2t']
-Set input_types = ['nano', 'illumina', 'illumina_single_end', 'fasta']
+Set input_types = ['nano', 'illumina', 'illumina_single_end', 'fasta', 'pacbio']
 
 // Guard against the common typo of passing Nextflow's -profile with two dashes.
 if ( params.profile ) { exit 1, "--profile is wrong, use -profile" }
 // Both --input and --input_type are mandatory. Compare input_type directly against the empty
 // string: the previous `!params.input_type == ''` negated the value first and then compared a
 // boolean with '', which is never true, so a missing --input_type was not caught here.
 if ( params.input == '' || params.input_type == '' ) { exit 1, "Missing required input parameters [--input] and [--input_type]" }
@@ -233,9 +233,10 @@ def helpMSG() {
 
     Workflow: Decontamination
 
-    Clean your Illumina, Nanopore or any FASTA-formated sequence date. The output are the clean
+    Clean your Illumina, Nanopore, PacBio or any FASTA-formated sequence date. The output are the clean
     and as contaminated identified sequences. Per default minimap2 is used for aligning your sequences
-    to a host but we recommend using the ${c_dim}--bbduk${c_reset} flag to switch to bbduk to clean short-read data.
+    to a host but we recommend using BWA for mapping short reads ${c_dim}--bwa${c_reset} or the ${c_dim}--bbduk${c_reset} flag 
+    to switch to bbduk to clean short-read data.
 
     Use the ${c_dim}--host${c_reset} and ${c_dim}--control${c_reset} flag to download a host database or specify your ${c_dim}--own${c_reset} FASTA.
 
@@ -248,11 +249,12 @@ def helpMSG() {
 
     ${c_yellow}Input:${c_reset}
     ${c_green}--input_type nano                --input${c_reset} '*.fasta' or '*.fastq.gz'   -> one sample per file
+    ${c_green}--input_type pacbio              --input${c_reset} '*.fasta' or '*.fastq.gz'   -> one sample per file (for PacBio CLR reads)
     ${c_green}--input_type illumina            --input${c_reset} '*.R{1,2}.fastq.gz'         -> file pairs
     ${c_green}--input_type illumina_single_end --input${c_reset} '*.fastq.gz'                -> one sample per file
     ${c_green}--input_type fasta               --input${c_reset} '*.fasta.gz'                -> one sample per file
     ${c_dim} ...read above input from csv files:${c_reset} ${c_green}--list ${c_reset}
-                         ${c_dim}required format: name,path for --input_type nano and --input_type fasta; name,pathR1,pathR2 for --illumina input_type; name,path for --input_type illumina_single_end${c_reset}
+                         ${c_dim}required format: name,path for --input_type nano, --input_type pacbio, and --input_type fasta; name,pathR1,pathR2 for --illumina input_type; name,path for --input_type illumina_single_end${c_reset}
 
     ${c_yellow}Decontamination options:${c_reset}
     ${c_green}--host${c_reset}         Comma separated list of reference genomes for decontamination, downloaded based on this parameter [default: $params.host]

diff --git a/modules/minimap2.nf b/modules/minimap2.nf
@@ -15,6 +15,11 @@ process minimap2 {
     """
     minimap2 ${params} -N 5 --split-prefix tmp --secondary=no -t ${task.cpus} ${db} ${input} | samtools view -bhS -@ ${task.cpus} > ${name}.bam
     """
+  } else if ( params.input_type == 'pacbio' ) {
+    params = params.reads_rna ? "-ax splice -k14" : "-ax map-pb"
+    """
+    minimap2 ${params} -N 5 --split-prefix tmp --secondary=no -t ${task.cpus} ${db} ${input} | samtools view -bhS -@ ${task.cpus} > ${name}.bam
+    """
   } else if ( params.input_type.contains('illumina') ) {
     """
     minimap2 -ax sr -N 5 --split-prefix tmp --secondary=no -t ${task.cpus} ${db} ${input} | samtools view -bhS -@ ${task.cpus} > ${name}.bam

diff --git a/nextflow.config b/nextflow.config
@@ -15,7 +15,7 @@ params {
 
   // input - reads
   input = ''
-  input_type = '' // nano illumina illumina_single_end fasta
+  input_type = '' // nano illumina illumina_single_end fasta pacbio
   list = false
 
   // parameters

diff --git a/workflows/qc_wf.nf b/workflows/qc_wf.nf
@@ -12,7 +12,7 @@ workflow qc {
     if ( input_type == 'fasta' ){
       quast(input)
       report = quast.out.report_tsv
-    } else if ( input_type == 'nano' ) {
+    } else if ( input_type == 'nano' || input_type == 'pacbio' ) {
       nanoplot(input)
       format_nanoplot_report(nanoplot.out.html)
       report = format_nanoplot_report.out