From 455992936b39ccc8d0e5c94e76c0af382ad8436c Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 15:16:24 +0300 Subject: [PATCH 01/16] remove operational parameters from scientific configuration --- .../TOPMed_RNAseq_pipeline/markduplicates.cwl | 14 ++--- .../TOPMed_RNAseq_pipeline/rna_seqc.cwl | 17 ++---- .../rnaseq_pipeline_fastq.cwl | 56 ++++++++----------- .../TOPMed_RNAseq_pipeline/rsem.cwl | 10 ++-- 4 files changed, 39 insertions(+), 58 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl index c31c9d6..53475b6 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_MarkDuplicates.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_MarkDuplicates.py) @@ -9,12 +10,12 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-MarkDuplicates" label: "run-MarkDuplicates" baseCommand: ["python3", "-u", "/src/run_MarkDuplicates.py"] requirements: - - class: DockerRequirement + InlineJavascriptRequirement: {} + DockerRequirement: dockerPull: heliumdatacommons/topmed-rnaseq:latest inputs: @@ -26,11 +27,10 @@ inputs: type: string inputBinding: position: 2 - memory: - type: int - inputBinding: - position: 3 - prefix: --memory + +arguments: + - prefix: --memory + valueFrom: ${runtime.mem / 1024} outputs: bam_file: diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl index 7d09bf9..80412c0 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_rnaseqc.py](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/src/run_rnaseqc.py) duplicated from [run_rnaseqc.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_rnaseqc.py) with minor modifications. @@ -9,12 +10,12 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-seqc" label: "run-seqc" # run_rnaseqc.py is not an executable file in the docker container. baseCommand: ["python3", "/src/run_rnaseqc.py"] requirements: + InlineJavascriptRequirement: {} DockerRequirement: dockerPull: heliumdatacommons/topmed-rnaseq:latest @@ -40,16 +41,6 @@ inputs: type: string inputBinding: position: 4 - java_path: - type: string - inputBinding: - position: 5 - prefix: --java - memory: - type: int - inputBinding: - position: 6 - prefix: --memory rnaseqc_flags: type: type: array @@ -70,6 +61,10 @@ inputs: # position: 8 # prefix: --gatk_flags +arguments: + - prefix: --memory + valueFrom: ${runtime.ram / 1024} + outputs: gene_rpkm: type: File diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index 589cd2e..bafdfbe 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -15,12 +15,11 @@ doc: | cwlVersion: v1.0 class: Workflow -id: "TOPMed_RNA-seq" label: "TOPMed_RNA-seq" requirements: - - class: SubworkflowFeatureRequirement - - class: ResourceRequirement + SubworkflowFeatureRequirement: {} + ResourceRequirement: coresMin: 4 ramMin: 16 tmpdirMin: 51200 @@ -32,10 +31,6 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir: type: Directory max_frag_len: @@ -53,8 +48,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -64,64 +57,64 @@ inputs: # items: string outputs: - - id: star_output_bam + star_output_bam: outputSource: run_star/bam_file type: File - - id: star_output_bam_index + star_output_bam_index: outputSource: run_star/bam_index type: File - - id: star_output_transcriptome_bam + star_output_transcriptome_bam: outputSource: run_star/transcriptome_bam type: File - - id: star_output_chimeric_junctions + star_output_chimeric_junctions: outputSource: run_star/chimeric_junctions type: File - - id: star_output_chimeric_bam_file + star_output_chimeric_bam_file: outputSource: run_star/chimeric_bam_file type: File - - id: star_output_chimeric_bam_index + star_output_chimeric_bam_index: outputSource: run_star/chimeric_bam_index type: File - - id: star_output_read_counts + star_output_read_counts: outputSource: run_star/read_counts type: File - - id: star_output_junctions + star_output_junctions: outputSource: run_star/junctions type: File - - id: star_output_junctions_pass1 + star_output_junctions_pass1: outputSource: run_star/junctions_pass1 type: File - - id: star_output_logs + star_output_logs: outputSource: run_star/logs type: File[] - - id: markduplicates_output_bam + markduplicates_output_bam: outputSource: run_markduplicates/bam_file type: File - - id: markduplicates_output_metrics + markduplicates_output_metrics: outputSource: run_markduplicates/metrics type: File - - id: markduplicates_bam_index + markduplicates_bam_index: outputSource: run_index_markduplicates_bam/bam_index type: File - - id: rsem_output_gene_results + rsem_output_gene_results: outputSource: run_rsem/gene_results type: File - - id: rsem_output_isoforms_results + rsem_output_isoforms_results: outputSource: run_rsem/isoforms_results type: File - - id: rna-seqc_output_gene_rpkm + rna-seqc_output_gene_rpkm: outputSource: run_rna-seqc/gene_rpkm type: File - - id: rna-seqc_output_gene_counts + rna-seqc_output_gene_counts: outputSource: run_rna-seqc/gene_counts type: File - - id: rna-seqc_output_exon_counts + rna-seqc_output_exon_counts: outputSource: run_rna-seqc/exon_counts type: File - - id: rna-seqc_output_count_metrics + rna-seqc_output_count_metrics: outputSource: run_rna-seqc/count_metrics type: File - - id: rna-seqc_output_count_outputs + rna-seqc_output_count_outputs: outputSource: run_rna-seqc/count_outputs type: File @@ -137,7 +130,6 @@ steps: star_index: star_index fastqs: fastqs prefix_str: prefix_str - threads: threads out: [ bam_file, @@ -157,7 +149,6 @@ steps: in: input_bam: run_star/bam_file prefix_str: prefix_str - memory: memory out: [ bam_file, @@ -180,7 +171,6 @@ steps: estimate_rspd: estimate_rspd is_stranded: is_stranded paired_end: paired_end - threads: threads out: [ gene_results, @@ -194,8 +184,6 @@ steps: genes_gtf: genes_gtf genome_fasta: genome_fasta prefix_str: prefix_str - java_path: java_path - memory: memory rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl index 5de090a..d06ca83 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl @@ -7,7 +7,6 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-rsem" label: "run-rsem" baseCommand: /src/run_RSEM.py @@ -50,11 +49,10 @@ inputs: inputBinding: position: 7 prefix: --paired_end - threads: - type: int - inputBinding: - position: 8 - prefix: --threads + +arguments: + - prefix: --threads + valueFrom: $(runtime.cores) outputs: gene_results: From a43e0ff4727b80e12143b2f2ac7a7e0bcae76e72 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 16:05:01 +0300 Subject: [PATCH 02/16] use a plain STAR container --- .../rnaseq_pipeline_fastq.cwl | 54 +++++--- .../TOPMed_RNAseq_pipeline/star.cwl | 118 ++++++++++++++---- 2 files changed, 131 insertions(+), 41 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index bafdfbe..b25bf8e 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -58,11 +58,11 @@ inputs: outputs: star_output_bam: - outputSource: run_star/bam_file - type: File - star_output_bam_index: - outputSource: run_star/bam_index + outputSource: sort_bam/output_file type: File + # star_output_bam_index: + # outputSource: run_star/bam_index + # type: File star_output_transcriptome_bam: outputSource: run_star/transcriptome_bam type: File @@ -70,11 +70,11 @@ outputs: outputSource: run_star/chimeric_junctions type: File star_output_chimeric_bam_file: - outputSource: run_star/chimeric_bam_file - type: File - star_output_chimeric_bam_index: - outputSource: run_star/chimeric_bam_index + outputSource: sort_chimeras/output_file type: File + # star_output_chimeric_bam_index: + # outputSource: run_star/chimeric_bam_index + # type: File star_output_read_counts: outputSource: run_star/read_counts type: File @@ -129,25 +129,51 @@ steps: in: star_index: star_index fastqs: fastqs - prefix_str: prefix_str + prefix: prefix_str out: [ - bam_file, - bam_index, + bam, transcriptome_bam, chimeric_junctions, - chimeric_bam_file, - chimeric_bam_index, + chimeric_bam, read_counts, junctions, junctions_pass1, logs ] + sort_bam: + run: https://dockstore.org:8443/api/ga4gh/v2/tools/quay.io%2Fcancercollaboratory%2Fdockstore-tool-samtools-sort/versions/1.0/plain-CWL/descriptor/%2FDockstore.cwl + in: + threads: + valueFrom: $(runtime.cores) + memory: + valueFrom: $(runtime.ram)M + input: + source: run_star/bam + output_name: + source: prefix_str + valueFrom: $(self).Aligned.sortedByCoord.out.bam + out: [ output_file ] + + sort_chimeras: + run: https://dockstore.org:8443/api/ga4gh/v2/tools/quay.io%2Fcancercollaboratory%2Fdockstore-tool-samtools-sort/versions/1.0/plain-CWL/descriptor/%2FDockstore.cwl + in: + threads: + valueFrom: $(runtime.cores) + memory: + valueFrom: $(runtime.ram)M + input: + source: run_star/chimeric_bam + output_name: + source: prefix_str + valueFrom: $(self).Chimeric.out.sorted.bam + out: [ output_file ] + run_markduplicates: run: markduplicates.cwl in: - input_bam: run_star/bam_file + input_bam: sort_bam/output_file prefix_str: prefix_str out: [ diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl index ddddabf..7ffb7de 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl @@ -9,21 +9,18 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-star" label: "run-star" -baseCommand: /src/run_STAR.py +baseCommand: STAR -requirements: +hints: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + dockerPull: quay.io/biocontainers/star:2.5.3a--0 inputs: star_index: type: Directory - default: - type: Directory inputBinding: - position: 1 + prefix: --genomeDir fastqs: type: type: array @@ -31,26 +28,89 @@ inputs: inputBinding: itemSeparator: "," inputBinding: - position: 2 - prefix_str: + prefix: --readFilesIn + prefix: type: string inputBinding: - position: 3 - threads: - type: int? - inputBinding: - position: 5 - prefix: --threads + prefix: --outFileNamePrefix + valueFrom: $(runtime.outdir)/$(self). + +arguments: + - prefix: --runMode + valueFrom: alignReads + - prefix: --runThreadN + valueFrom: $(runtime.cores) + - prefix: --twopassMode + valueFrom: Basic + - prefix: --outFilterMultimapNmax + valueFrom: "20" + - prefix: --alignSJoverhangMin + valueFrom: "8" + - prefix: --alignSJDBoverhangMin + valueFrom: "1" + - prefix: --outFilterMismatchNmax + valueFrom: "999" + - prefix: --outFilterMismatchNoverLmax + valueFrom: "0.1" + - prefix: --alignIntronMin + valueFrom: "20" + - prefix: --alignIntronMax + valueFrom: "1000000" + - prefix: --outFilterMismatchNoverLmax + valueFrom: "0.1" + - prefix: --alignMatesGapMax + valueFrom: "1000000" + - prefix: --outFilterType + valueFrom: BySJout + - prefix: --outFilterScoreMinOverLread + valueFrom: "0.33" + - prefix: --outFilterMatchNminOverLread + valueFrom: "0.33" + - prefix: --limitSjdbInsertNsj + valueFrom: "1200000" + - prefix: --readFilesCommand + valueFrom: zcat + - prefix: --outSAMstrandField + valueFrom: introMotif + - prefix: --outFilterIntronMotifs + valueFrom: None + - prefix: --alignSoftClipAtReferenceEnds + valueFrom: Yes + - prefix: --quantMode + valueFrom: "TranscriptomeSAM GeneCounts" + - prefix: --outSAMtype + valueFrom: "BAM Unsorted" + - prefix: --outSAMunmapped + valueFrom: Within + - prefix: --genomeLoad + valueFrom: NoSharedMemory + - prefix: --chimSegmentMin + valueFrom: "15" + - prefix: --chimJunctionOverhangMin + valueFrom: "15" + - prefix: --chimOutType + valueFrom: "WithinBAM SoftClip" + - prefix: --chimMainSegmentMultNmax + valueFrom: "1" + - prefix: --outSAMattributes + valueFrom: "NH HI AS nM NM ch" + - prefix: --outSAMattrRGline + valueFrom: "ID:rg1 SM:sm1" + outputs: - bam_file: + bam: type: File outputBinding: - glob: "*.Aligned.sortedByCoord.out.bam" - bam_index: - type: File - outputBinding: - glob: "*.Aligned.sortedByCoord.out.bam.bai" + glob: $(inputs.prefix).Aligned.out.bam + # bam_file: + # type: File + # outputBinding: + # glob: "*.Aligned.sortedByCoord.out.bam" + # bam_index: + # type: File + # outputBinding: + # glob: "*.Aligned.sortedByCoord.out.bam.bai" transcriptome_bam: type: File outputBinding: @@ -59,14 +119,18 @@ outputs: type: File outputBinding: glob: "*.Chimeric.out.junction" - chimeric_bam_file: - type: File - outputBinding: - glob: "*.Chimeric.out.sorted.bam" - chimeric_bam_index: + chimeric_bam: type: File outputBinding: - glob: "*.Chimeric.out.sorted.bam.bai" + glob: $(inputs.prefix).Chimeric.out.sam + # chimeric_bam_file: + # type: File + # outputBinding: + # glob: "*.Chimeric.out.sorted.bam" + # chimeric_bam_index: + # type: File + # outputBinding: + # glob: "*.Chimeric.out.sorted.bam.bai" read_counts: type: File outputBinding: From ec9d74a8668150390c293dac742d7067fa47c653 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 17:09:09 +0300 Subject: [PATCH 03/16] index the sorted bams --- .../rnaseq_pipeline_fastq.cwl | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index b25bf8e..b8d7981 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -60,9 +60,9 @@ outputs: star_output_bam: outputSource: sort_bam/output_file type: File - # star_output_bam_index: - # outputSource: run_star/bam_index - # type: File + star_output_bam_index: + outputSource: index_bam/bam_index + type: File star_output_transcriptome_bam: outputSource: run_star/transcriptome_bam type: File @@ -72,9 +72,9 @@ outputs: star_output_chimeric_bam_file: outputSource: sort_chimeras/output_file type: File - # star_output_chimeric_bam_index: - # outputSource: run_star/chimeric_bam_index - # type: File + star_output_chimeric_bam_index: + outputSource: index_chimeras/bas_index + type: File star_output_read_counts: outputSource: run_star/read_counts type: File @@ -170,6 +170,18 @@ steps: valueFrom: $(self).Chimeric.out.sorted.bam out: [ output_file ] + index_bam: + run: indexbam.cwl + in: + input_bam: sort_bam/output_file + out: [bam_index] + + index_chimeras: + run: indexbam.cwl + in: + input_bam: sort_chimeras/output_file + out: [bam_index] + run_markduplicates: run: markduplicates.cwl in: From b6216064c347811d58bc82e0dbb8f934720dfb8c Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 17:23:05 +0300 Subject: [PATCH 04/16] use a plain biocontainer for picard --- .../TOPMed_RNAseq_pipeline/indexbam.cwl | 15 +++++------ .../TOPMed_RNAseq_pipeline/markduplicates.cwl | 27 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl index beb4be4..b1c55fa 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl @@ -3,23 +3,22 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-index-bam" label: "run-index-bam" -baseCommand: ["samtools", "index"] +baseCommand: [ samtools, index ] requirements: -- class: InlineJavascriptRequirement -- class: DockerRequirement - dockerPull: heliumdatacommons/topmed-rnaseq:latest -- class: InitialWorkDirRequirement - listing: - - $(inputs.input_bam) + DockerRequirement: + dockerPull: heliumdatacommons/topmed-rnaseq:latest + InitialWorkDirRequirement: + listing: + - $(inputs.input_bam) inputs: input_bam: type: File inputBinding: position: 1 + valueFrom: $(self.basename) outputs: bam_index: diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl index 53475b6..340e171 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl @@ -11,33 +11,36 @@ doc: | cwlVersion: v1.0 class: CommandLineTool label: "run-MarkDuplicates" -baseCommand: ["python3", "-u", "/src/run_MarkDuplicates.py"] +baseCommand: [java, picard.cmdline.PicardCommandLine] -requirements: - InlineJavascriptRequirement: {} +hints: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + dockerPull: quay.io/biocontainers/picard:2.9.2--2 inputs: input_bam: type: File - inputBinding: - position: 1 prefix_str: type: string - inputBinding: - position: 2 arguments: - - prefix: --memory - valueFrom: ${runtime.mem / 1024} + - prefix: -Xmx + valueFrom: $(runtime.ram)M + - picard.cmdline.PicardCommandLine + - MarkFuplicates + - I=$(inputs.input_bam.path) + - O=$(runtime.outdir)/$(inputs.input_bam.nameroot).md.bam + - M=$(runtime.outdir)/$(inputs.prefix_str).marked_dup_metrics.txt + - ASSUME_SORT_ORDER=coordinate + - OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 outputs: bam_file: type: File outputBinding: - glob: "*.md.bam" + glob: $(runtime.outdir)/$(inputs.input_bam.nameroot).md.bam metrics: type: File outputBinding: - glob: "*.marked_dup_metrics.txt" + glob: $(runtime.outdir)/$(inputs.prefix_str).marked_dup_metrics.txt + From 12f4ee5639120f0726c34f5bc50cb194fde7b111 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 17:24:52 +0300 Subject: [PATCH 05/16] use a plain biocontainer for samtools --- topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl index b1c55fa..ced26a4 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl @@ -8,7 +8,7 @@ baseCommand: [ samtools, index ] requirements: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + dockerPull: quay.io/biocontainers/samtools:1.8--4 InitialWorkDirRequirement: listing: - $(inputs.input_bam) From c6b26e53020562e6dd1181a30f831e0ace393479 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 17:56:22 +0300 Subject: [PATCH 06/16] use simple biocontainer for rsem --- .../input-examples/rsem-example.yml | 8 ++-- .../TOPMed_RNAseq_pipeline/rsem.cwl | 42 +++++++++---------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml index fab64df..9398d89 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml @@ -1,13 +1,13 @@ rsem_ref_dir: class: Directory - location: /rsem_ref/ + location: ../test-data/rsem_reference transcriptome_bam: { class: File, path: LC_C13_cRNA.Aligned.toTranscriptome.out.bam } prefix_str: "LC_C13_cRNA" max_frag_len: 1000 -estimate_rspd: "true" -is_stranded: "true" -paired_end: "true" +estimate_rspd: true +is_stranded: true +paired_end: true threads: 4 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl index d06ca83..6b2ba1c 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_RSEM.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_RSEM.py) @@ -8,58 +9,55 @@ doc: | cwlVersion: v1.0 class: CommandLineTool label: "run-rsem" -baseCommand: /src/run_RSEM.py +baseCommand: rsem-calculate-expression -requirements: +hints: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + dockerPull: quay.io/biocontainers/rsem:1.3.0--boost1.64_3 inputs: rsem_ref_dir: type: Directory - default: - type: Directory inputBinding: position: 1 + valueFrom: $(self.path)/rsem_reference transcriptome_bam: type: File inputBinding: - position: 2 + prefix: --bam prefix_str: type: string - inputBinding: - position: 3 max_frag_len: type: int inputBinding: - position: 4 - prefix: --max_frag_len + prefix: --fragment-length-max estimate_rspd: - type: string + type: boolean inputBinding: - position: 5 - prefix: --estimate_rspd + prefix: --estimate-rspd is_stranded: - type: string + type: boolean inputBinding: - position: 6 - prefix: --is_stranded + prefix: "--forward-prob 0.0" paired_end: - type: string + type: boolean inputBinding: - position: 7 - prefix: --paired_end + prefix: --paired-end arguments: - - prefix: --threads + - prefix: --num-threads valueFrom: $(runtime.cores) + - prefix: --fragment-length-max + valueFrom: "1000" + - --no-bam-output + - $(inputs.prefix_str).rsem outputs: gene_results: type: File outputBinding: - glob: "*.rsem.genes.results" + glob: $(inputs.prefix_str).rsem.genes.results isoforms_results: type: File outputBinding: - glob: "*.rsem.isoforms.results" + glob: $(inputs.prefix_str).rsem.isoforms.results From ce0ef69c63d6a68721ae4d40afce089cf78d65d9 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 17:56:53 +0300 Subject: [PATCH 07/16] add #!/usr/bin/env cwl-runner everywhere --- .../TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl | 1 + .../TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl | 1 + .../checker-workflows/components/calc_md5.cwl | 1 + .../checker-workflows/components/compare_bams.cwl | 1 + .../checker-workflows/components/read_file.cwl | 1 + .../checker-workflows/components/string_comparison.cwl | 1 + .../checker-workflows/components/untar_dir.cwl | 1 + .../checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl | 1 + .../checker-workflows/rnaseq_pipeline_fastq_checker.cwl | 1 + topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl | 1 + .../TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl | 2 ++ topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl | 1 + 12 files changed, 13 insertions(+) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl index 77743a6..3b1ab51 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Compare 2 input BAM files and report results. Exit 0 if sucess. diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl index 471900b..e4a00bc 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Calculates the MD5 hash of the input file and compares it to the input MD5 hash. If hashes match: Exit 0 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl index 9d2a724..e72cf7c 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Calculate the MD5 hash for the input file. diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl index c65ba33..afa8ff3 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Compare 2 input BAM files using [BamUtil diff](https://genome.sph.umich.edu/wiki/BamUtil:_diff) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl index d453a93..c24f2df 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner cwlVersion: v1.0 class: ExpressionTool diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl index 7420cf0..88b8a02 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner cwlVersion: v1.0 class: CommandLineTool baseCommand: [] diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl index d5ad119..e4d50b8 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Extract all files from archive.tar and filter through gzip diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl index ff3d050..9d77366 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A workflow to verify the proper execution of [TOPMed RNA-seq Workflow](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl index 53a1989..e08d4da 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A workflow to verify the proper execution of [TOPMed RNA-seq Workflow](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl index ced26a4..1451c81 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A wrapper for running `samtools index `. diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index b8d7981..0eff973 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | TOPMed RNA-seq CWL workflow. Documentation on the workflow can be found [here](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/README.md). Example input files: [Dockstore.json](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/Dockstore.json) and [rnaseq_pipeline_fastq-example.yml](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml). @@ -19,6 +20,7 @@ label: "TOPMed_RNA-seq" requirements: SubworkflowFeatureRequirement: {} +hints: ResourceRequirement: coresMin: 4 ramMin: 16 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl index 7ffb7de..6b76c73 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_STAR.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_STAR.py) From c27f6b846428c553ce0c42c33a3cf5ae8653c311 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Aug 2018 18:00:18 +0300 Subject: [PATCH 08/16] two small errors --- .../checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl | 2 -- .../TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl index 9d77366..2721831 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl @@ -73,8 +73,6 @@ inputs: # type: string # hash_exon_counts: # type: string - hash_count_metrics: - type: string # hash_count_outputs: # type: string checker_star_output_bam: diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index 0eff973..29e4d28 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -75,7 +75,7 @@ outputs: outputSource: sort_chimeras/output_file type: File star_output_chimeric_bam_index: - outputSource: index_chimeras/bas_index + outputSource: index_chimeras/bam_index type: File star_output_read_counts: outputSource: run_star/read_counts From 04b4ecb0f48d337e2bc9fd5cbe05d0bfee6c107f Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 25 Aug 2018 08:47:15 -0700 Subject: [PATCH 09/16] switch star container --- .../rnaseq_pipeline_fastq-example.yml | 41 +++++++--------- .../rnaseq_pipeline_fastq.cwl | 11 ++--- .../TOPMed_RNAseq_pipeline/star.cwl | 49 ++++++++++++++----- 3 files changed, 56 insertions(+), 45 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml index 967e8e9..bfba969 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml @@ -1,30 +1,23 @@ star_index: class: Directory - location: /star_index/ -fastqs: [ - {class: File, - path: LC_C13_cRNA_sequence_R1.fastq.gz}, - {class: File, - path: LC_C13_cRNA_sequence_R2.fastq.gz} -] + location: ../test-data/star_chr12 +fastqs: + - class: File + path: ../test-data/sample/LC_C13_cRNA_sequence_R1_sub.fastq.gz + - class: File + path: ../test-data/sample/LC_C13_cRNA_sequence_R2_sub.fastq.gz rsem_ref_dir: class: Directory - location: /rsem_ref/ + location: ../test-data/rsem_reference max_frag_len: 1000 -estimate_rspd: "true" -is_stranded: "true" -paired_end: "true" -genes_gtf: { - class: File, - path: gencode.v26.annotation.withTranscriptID.gtf -} -genome_fasta: { - class: File, - path: Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta -} -java_path: /usr/lib/jvm/java-1.7.0-openjdk-amd64/bin/java -memory: 8 +estimate_rspd: true +is_stranded: true +paired_end: true +genes_gtf: + class: File + path: ../test-data/gencode.v26.annotation.withTranscriptID.gtf +genome_fasta: + class: File + path: ../test-data/Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta rnaseqc_flags: ["noDoC", "strictMode"] -# gatk_flags: [] -prefix_str: "LC_C13_cRNA" -threads: 4 +prefix_str: LC_C13_cRNA diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index 29e4d28..9093504 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -38,11 +38,11 @@ inputs: max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -122,11 +122,6 @@ outputs: steps: run_star: - requirements: - ResourceRequirement: - coresMin: 4 - ramMin: 16 - tmpdirMin: 51200 run: star.cwl in: star_index: star_index diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl index 6b76c73..ade57a2 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl @@ -15,7 +15,23 @@ baseCommand: STAR hints: DockerRequirement: - dockerPull: quay.io/biocontainers/star:2.5.3a--0 + #dockerPull: quay.io/biocontainers/star:2.5.3a--0 + dockerFile: | + FROM biocontainers/biocontainers:debian-stretch-backports + MAINTAINER biocontainers + LABEL software="rna-star" \ + container="rna-star" \ + about.summary="ultrafast universal RNA-seq aligner" \ + about.home="https://github.com/alexdobin/STAR/" \ + software.version="2.6.1adfsg-1-deb" \ + version="1" \ + about.copyright="2009-2015 Alexander Dobin " \ + about.license="GPL-3+" \ + about.license_file="/usr/share/doc/rna-star/copyright" \ + extra.binaries="/usr/bin/STAR" \ + about.tags="biology::nucleic-acids, field::biology, field::biology:bioinformatics,:c++, role::program, use::analysing,:biological-sequence" + ENV DEBIAN_FRONTEND noninteractive + RUN apt-get update && apt-get install -y rna-star && apt-get clean && apt-get purge && rm -rf /var/lib/apt/lists/* /tmp/* inputs: star_index: @@ -57,8 +73,6 @@ arguments: valueFrom: "20" - prefix: --alignIntronMax valueFrom: "1000000" - - prefix: --outFilterMismatchNoverLmax - valueFrom: "0.1" - prefix: --alignMatesGapMax valueFrom: "1000000" - prefix: --outFilterType @@ -77,10 +91,12 @@ arguments: valueFrom: None - prefix: --alignSoftClipAtReferenceEnds valueFrom: Yes - - prefix: --quantMode - valueFrom: "TranscriptomeSAM GeneCounts" - - prefix: --outSAMtype - valueFrom: "BAM Unsorted" + - --quantMode + - TranscriptomeSAM + - GeneCounts + - --outSAMtype + - BAM + - Unsorted - prefix: --outSAMunmapped valueFrom: Within - prefix: --genomeLoad @@ -89,14 +105,21 @@ arguments: valueFrom: "15" - prefix: --chimJunctionOverhangMin valueFrom: "15" - - prefix: --chimOutType - valueFrom: "WithinBAM SoftClip" + - --chimOutType + - WithinBAM + - SoftClip - prefix: --chimMainSegmentMultNmax valueFrom: "1" - - prefix: --outSAMattributes - valueFrom: "NH HI AS nM NM ch" - - prefix: --outSAMattrRGline - valueFrom: "ID:rg1 SM:sm1" + - --outSAMattributes + - NH + - HI + - AS + - nM + - NM + - ch + - --outSAMattrRGline + - ID:rg1 + - SM:sm1 outputs: From 2818383766b8843b198d25d36609f1385ea003a2 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 25 Aug 2018 09:52:50 -0700 Subject: [PATCH 10/16] use local samtools-sort --- .../rnaseq_pipeline_fastq.cwl | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index 9093504..3f7bbf0 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -20,11 +20,12 @@ label: "TOPMed_RNA-seq" requirements: SubworkflowFeatureRequirement: {} -hints: - ResourceRequirement: - coresMin: 4 - ramMin: 16 - tmpdirMin: 51200 + StepInputExpressionRequirement: {} +# hints: +# ResourceRequirement: +# coresMin: 4 +# ramMin: 16 +# #tmpdirMin: 51200 inputs: star_index: @@ -140,12 +141,8 @@ steps: ] sort_bam: - run: https://dockstore.org:8443/api/ga4gh/v2/tools/quay.io%2Fcancercollaboratory%2Fdockstore-tool-samtools-sort/versions/1.0/plain-CWL/descriptor/%2FDockstore.cwl + run: samtools-sort.cwl in: - threads: - valueFrom: $(runtime.cores) - memory: - valueFrom: $(runtime.ram)M input: source: run_star/bam output_name: @@ -154,12 +151,8 @@ steps: out: [ output_file ] sort_chimeras: - run: https://dockstore.org:8443/api/ga4gh/v2/tools/quay.io%2Fcancercollaboratory%2Fdockstore-tool-samtools-sort/versions/1.0/plain-CWL/descriptor/%2FDockstore.cwl + run: samtools-sort.cwl in: - threads: - valueFrom: $(runtime.cores) - memory: - valueFrom: $(runtime.ram)M input: source: run_star/chimeric_bam output_name: @@ -231,7 +224,7 @@ steps: ] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf From d86c0cd5f7e730f444bbd7980d8882b5d69152f5 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 25 Aug 2018 09:53:08 -0700 Subject: [PATCH 11/16] fix STAR container --- .../TOPMed_RNAseq_pipeline/star.cwl | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl index ade57a2..21d8b8e 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl @@ -17,13 +17,13 @@ hints: DockerRequirement: #dockerPull: quay.io/biocontainers/star:2.5.3a--0 dockerFile: | - FROM biocontainers/biocontainers:debian-stretch-backports + FROM debian:buster-slim MAINTAINER biocontainers LABEL software="rna-star" \ container="rna-star" \ about.summary="ultrafast universal RNA-seq aligner" \ about.home="https://github.com/alexdobin/STAR/" \ - software.version="2.6.1adfsg-1-deb" \ + software.version="2.5.3adfsg-1-deb" \ version="1" \ about.copyright="2009-2015 Alexander Dobin " \ about.license="GPL-3+" \ @@ -31,7 +31,15 @@ hints: extra.binaries="/usr/bin/STAR" \ about.tags="biology::nucleic-acids, field::biology, field::biology:bioinformatics,:c++, role::program, use::analysing,:biological-sequence" ENV DEBIAN_FRONTEND noninteractive - RUN apt-get update && apt-get install -y rna-star && apt-get clean && apt-get purge && rm -rf /var/lib/apt/lists/* /tmp/* + RUN echo \ + 'deb http://snapshot.debian.org/archive/debian/20170906/ buster main' > /etc/apt/sources.list \ + && printf "Package: r-*\nPin: origin snapshot.debian.org\nPin-Priority: 990\n" > /etc/apt/preferences.d/snapshot \ + && apt-get -o Acquire::Check-Valid-Until=false update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + --allow-unauthenticated rna-star=2.5.3a+dfsg-3 \ + && apt-get clean && apt-get purge && rm -rf /var/lib/apt/lists/* /tmp/* + dockerImageId: star inputs: star_index: @@ -127,14 +135,6 @@ outputs: type: File outputBinding: glob: $(inputs.prefix).Aligned.out.bam - # bam_file: - # type: File - # outputBinding: - # glob: "*.Aligned.sortedByCoord.out.bam" - # bam_index: - # type: File - # outputBinding: - # glob: "*.Aligned.sortedByCoord.out.bam.bai" transcriptome_bam: type: File outputBinding: From b981acb5688f83a883b3b92eabc36ed45a600818 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 25 Aug 2018 09:53:18 -0700 Subject: [PATCH 12/16] improve rsem --- .../TOPMed_RNAseq_pipeline/rsem.cwl | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl index 6b2ba1c..514dad9 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl @@ -11,6 +11,12 @@ class: CommandLineTool label: "run-rsem" baseCommand: rsem-calculate-expression +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: + - entry: $(inputs.rsem_ref_dir) + writable: true hints: DockerRequirement: dockerPull: quay.io/biocontainers/rsem:1.3.0--boost1.64_3 @@ -19,14 +25,17 @@ inputs: rsem_ref_dir: type: Directory inputBinding: - position: 1 - valueFrom: $(self.path)/rsem_reference + position: 2 + valueFrom: $(self.basename)/rsem_reference transcriptome_bam: type: File inputBinding: - prefix: --bam + position: 1 prefix_str: type: string + inputBinding: + position: 3 + valueFrom: $(self).rsem max_frag_len: type: int inputBinding: @@ -37,8 +46,6 @@ inputs: prefix: --estimate-rspd is_stranded: type: boolean - inputBinding: - prefix: "--forward-prob 0.0" paired_end: type: boolean inputBinding: @@ -50,7 +57,8 @@ arguments: - prefix: --fragment-length-max valueFrom: "1000" - --no-bam-output - - $(inputs.prefix_str).rsem + - --bam + - ${if (inputs.is_stranded) { return ["--forward-prob", "0.0"];} } outputs: gene_results: From a4a69f69504d7bc77f499f20c01e3a77b6931367 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sat, 25 Aug 2018 10:24:51 -0700 Subject: [PATCH 13/16] fix markduplicates --- .../TOPMed_RNAseq_pipeline/markduplicates.cwl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl index 340e171..60086b5 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl @@ -11,9 +11,13 @@ doc: | cwlVersion: v1.0 class: CommandLineTool label: "run-MarkDuplicates" -baseCommand: [java, picard.cmdline.PicardCommandLine] +baseCommand: [java] -hints: +requirements: # turn back into a hint when the biocontainer has its classpath + # updated + EnvVarRequirement: + envDef: + CLASSPATH: /usr/local/share/picard-2.9.2-2/picard.jar DockerRequirement: dockerPull: quay.io/biocontainers/picard:2.9.2--2 @@ -26,8 +30,9 @@ inputs: arguments: - prefix: -Xmx valueFrom: $(runtime.ram)M + separate: false - picard.cmdline.PicardCommandLine - - MarkFuplicates + - MarkDuplicates - I=$(inputs.input_bam.path) - O=$(runtime.outdir)/$(inputs.input_bam.nameroot).md.bam - M=$(runtime.outdir)/$(inputs.prefix_str).marked_dup_metrics.txt @@ -38,9 +43,9 @@ outputs: bam_file: type: File outputBinding: - glob: $(runtime.outdir)/$(inputs.input_bam.nameroot).md.bam + glob: $(inputs.input_bam.nameroot).md.bam metrics: type: File outputBinding: - glob: $(runtime.outdir)/$(inputs.prefix_str).marked_dup_metrics.txt + glob: $(inputs.prefix_str).marked_dup_metrics.txt From 68d12917692cfad47313bf2973de904873f359e4 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Sun, 26 Aug 2018 23:57:10 -0700 Subject: [PATCH 14/16] final validation --- .../rnaseq_pipeline_fastq-example.yml | 32 +++++++++++-------- .../input-examples/rsem-example.yml | 2 +- .../TOPMed_RNAseq_pipeline/rna_seqc.cwl | 5 +-- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml index bfba969..e6d6b22 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml @@ -1,23 +1,27 @@ star_index: class: Directory - location: ../test-data/star_chr12 -fastqs: - - class: File - path: ../test-data/sample/LC_C13_cRNA_sequence_R1_sub.fastq.gz - - class: File - path: ../test-data/sample/LC_C13_cRNA_sequence_R2_sub.fastq.gz + location: /star_index/ +fastqs: [ + {class: File, + path: LC_C13_cRNA_sequence_R1.fastq.gz}, + {class: File, + path: LC_C13_cRNA_sequence_R2.fastq.gz} +] rsem_ref_dir: class: Directory - location: ../test-data/rsem_reference + location: /rsem_ref/ max_frag_len: 1000 estimate_rspd: true is_stranded: true paired_end: true -genes_gtf: - class: File - path: ../test-data/gencode.v26.annotation.withTranscriptID.gtf -genome_fasta: - class: File - path: ../test-data/Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta +genes_gtf: { + class: File, + path: gencode.v26.annotation.withTranscriptID.gtf +} +genome_fasta: { + class: File, + path: Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta +} rnaseqc_flags: ["noDoC", "strictMode"] -prefix_str: LC_C13_cRNA +# gatk_flags: [] +prefix_str: "LC_C13_cRNA" diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml index 9398d89..73fa82b 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml @@ -1,6 +1,6 @@ rsem_ref_dir: class: Directory - location: ../test-data/rsem_reference + location: /rsem_ref/ transcriptome_bam: { class: File, path: LC_C13_cRNA.Aligned.toTranscriptome.out.bam diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl index 80412c0..206536d 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl @@ -63,8 +63,9 @@ inputs: arguments: - prefix: --memory - valueFrom: ${runtime.ram / 1024} - + valueFrom: $(runtime.ram / 1024) + - prefix: --java + valueFrom: /usr/lib/jvm/java-1.7.0-openjdk-amd64/bin/java outputs: gene_rpkm: type: File From ad671727c9a9bd060f263ed72187ae682b8e158e Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Mon, 27 Aug 2018 00:49:29 -0700 Subject: [PATCH 15/16] add missing samtools sort --- .../TOPMed_RNAseq_pipeline/samtools-sort.cwl | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl new file mode 100644 index 0000000..9980e15 --- /dev/null +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl @@ -0,0 +1,91 @@ +#!/usr/bin/env cwl-runner + +# adapted from https://github.com/common-workflow-language/workflows/tree/master/tools + +class: CommandLineTool + +$namespaces: + dct: http://purl.org/dc/terms/ +$schemas: [ http://dublincore.org/2012/06/14/dcterms.rdf ] +dct:contributor: + foaf:name: Andy Yang + foaf:mbox: mailto:ayang@oicr.on.ca +dct:creator: + '@id': http://orcid.org/0000-0001-9102-5681 + foaf:name: Andrey Kartashov + foaf:mbox: mailto:Andrey.Kartashov@cchmc.org +dct:description: 'Developed at Cincinnati Children’s Hospital Medical Center for the + CWL consortium http://commonwl.org/ Original URL: https://github.com/common-workflow-language/workflows' +cwlVersion: v1.0 + + +requirements: +- class: DockerRequirement + dockerPull: quay.io/cancercollaboratory/dockstore-tool-samtools-sort:1.0 +inputs: + compression_level: + type: int? + inputBinding: + prefix: -l + doc: | + Set compression level, from 0 (uncompressed) to 9 (best) + + doc: Set number of sorting and compression threads [1] + input: + type: File + inputBinding: + position: 1 + + doc: Input bam file. + output_name: + type: string + inputBinding: + position: 2 + prefix: -o + + doc: Desired output filename. + sort_by_name: + type: boolean? + inputBinding: + prefix: -n + + doc: Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates. +outputs: + output_file: + type: File + outputBinding: + glob: $(inputs.output_name) + +arguments: + - prefix: -@ + valueFrom: $(runtime.cores) + - prefix: -m + valueFrom: $(runtime.ram)M + +baseCommand: [samtools, sort] +doc: | + Sort alignments by leftmost coordinates, or by read name when -n is used. An appropriate @HD-SO sort order header tag will be added or an existing one updated if necessary. + + Usage: samtools sort [-l level] [-m maxMem] [-o out.bam] [-O format] [-n] -T out.prefix [-@ threads] [in.bam] + + Options: + -l INT + Set the desired compression level for the final output file, ranging from 0 (uncompressed) or 1 (fastest but minimal compression) to 9 (best compression but slowest to write), similarly to gzip(1)'s compression level setting. + + If -l is not used, the default compression level will apply. + + + -n + Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates. + + -o FILE + Write the final sorted output to FILE, rather than to standard output. + + -O FORMAT + Write the final output as sam, bam, or cram. + + By default, samtools tries to select a format based on the -o filename extension; if output is to standard output or no format can be deduced, -O must be used. + + -T PREFIX + Write temporary files to PREFIX.nnnn.bam. This option is required. + From e578e5f9cb99770c423083da54c6756469e1eba6 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Mon, 27 Aug 2018 10:51:24 +0300 Subject: [PATCH 16/16] --validate clean on CWL in topmed-workflows/ --- .../rnaseq_pipeline_fastq_checker-tar.cwl | 17 ++++------------- .../rnaseq_pipeline_fastq_checker.cwl | 19 ++++--------------- .../TOPMed_RNAseq_pipeline/samtools-sort.cwl | 5 +---- 3 files changed, 9 insertions(+), 32 deletions(-) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl index 2721831..32626ba 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl @@ -21,20 +21,16 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir_tar: type: File max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -42,8 +38,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -156,8 +150,6 @@ steps: star_index: untar_star_index/untarred_dir fastqs: fastqs prefix_str: prefix_str - threads: threads - memory: memory rsem_ref_dir: untar_rsem_reference/untarred_dir max_frag_len: max_frag_len estimate_rspd: estimate_rspd @@ -165,7 +157,6 @@ steps: paired_end: paired_end genes_gtf: genes_gtf genome_fasta: genome_fasta - java_path: java_path rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: @@ -309,7 +300,7 @@ steps: # out: [out_hash_string] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl index e08d4da..718247b 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl @@ -19,20 +19,16 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir: type: Directory max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -40,8 +36,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -71,8 +65,6 @@ inputs: # type: string # hash_exon_counts: # type: string - hash_count_metrics: - type: string # hash_count_outputs: # type: string checker_star_output_bam: @@ -144,8 +136,6 @@ steps: star_index: star_index fastqs: fastqs prefix_str: prefix_str - threads: threads - memory: memory rsem_ref_dir: rsem_ref_dir max_frag_len: max_frag_len estimate_rspd: estimate_rspd @@ -153,7 +143,6 @@ steps: paired_end: paired_end genes_gtf: genes_gtf genome_fasta: genome_fasta - java_path: java_path rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: @@ -297,7 +286,7 @@ steps: # out: [out_hash_string] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl index 9980e15..887518d 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl @@ -27,10 +27,7 @@ inputs: type: int? inputBinding: prefix: -l - doc: | - Set compression level, from 0 (uncompressed) to 9 (best) - - doc: Set number of sorting and compression threads [1] + doc: Set compression level, from 0 (uncompressed) to 9 (best) input: type: File inputBinding: