diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl index 77743a6..3b1ab51 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_bams_wf.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Compare 2 input BAM files and report results. Exit 0 if sucess. diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl index 471900b..e4a00bc 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/check_md5_wf.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Calculates the MD5 hash of the input file and compares it to the input MD5 hash. If hashes match: Exit 0 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl index 9d2a724..e72cf7c 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/calc_md5.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Calculate the MD5 hash for the input file. 
diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl index c65ba33..afa8ff3 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/compare_bams.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Compare 2 input BAM files using [BamUtil diff](https://genome.sph.umich.edu/wiki/BamUtil:_diff) diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl index d453a93..c24f2df 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/read_file.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner cwlVersion: v1.0 class: ExpressionTool diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl index 7420cf0..88b8a02 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/string_comparison.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner cwlVersion: v1.0 class: CommandLineTool baseCommand: [] diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl index d5ad119..e4d50b8 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/components/untar_dir.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | Extract all files from 
archive.tar and filter through gzip diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl index ff3d050..32626ba 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker-tar.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A workflow to verify the proper execution of [TOPMed RNA-seq Workflow](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl) @@ -20,20 +21,16 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir_tar: type: File max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -41,8 +38,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -72,8 +67,6 @@ inputs: # type: string # hash_exon_counts: # type: string - hash_count_metrics: - type: string # hash_count_outputs: # type: string checker_star_output_bam: @@ -157,8 +150,6 @@ steps: star_index: untar_star_index/untarred_dir fastqs: fastqs prefix_str: prefix_str - threads: threads - memory: memory rsem_ref_dir: untar_rsem_reference/untarred_dir max_frag_len: max_frag_len estimate_rspd: estimate_rspd @@ -166,7 +157,6 @@ steps: paired_end: paired_end genes_gtf: genes_gtf genome_fasta: genome_fasta - java_path: java_path rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: @@ -310,7 +300,7 @@ steps: # out: [out_hash_string] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf diff --git 
a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl index 53a1989..718247b 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/checker-workflows/rnaseq_pipeline_fastq_checker.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A workflow to verify the proper execution of [TOPMed RNA-seq Workflow](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl) @@ -18,20 +19,16 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir: type: Directory max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -39,8 +36,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -70,8 +65,6 @@ inputs: # type: string # hash_exon_counts: # type: string - hash_count_metrics: - type: string # hash_count_outputs: # type: string checker_star_output_bam: @@ -143,8 +136,6 @@ steps: star_index: star_index fastqs: fastqs prefix_str: prefix_str - threads: threads - memory: memory rsem_ref_dir: rsem_ref_dir max_frag_len: max_frag_len estimate_rspd: estimate_rspd @@ -152,7 +143,6 @@ steps: paired_end: paired_end genes_gtf: genes_gtf genome_fasta: genome_fasta - java_path: java_path rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: @@ -296,7 +286,7 @@ steps: # out: [out_hash_string] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl index 
beb4be4..1451c81 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/indexbam.cwl @@ -1,25 +1,25 @@ +#!/usr/bin/env cwl-runner doc: | A wrapper for running `samtools index `. cwlVersion: v1.0 class: CommandLineTool -id: "run-index-bam" label: "run-index-bam" -baseCommand: ["samtools", "index"] +baseCommand: [ samtools, index ] requirements: -- class: InlineJavascriptRequirement -- class: DockerRequirement - dockerPull: heliumdatacommons/topmed-rnaseq:latest -- class: InitialWorkDirRequirement - listing: - - $(inputs.input_bam) + DockerRequirement: + dockerPull: quay.io/biocontainers/samtools:1.8--4 + InitialWorkDirRequirement: + listing: + - $(inputs.input_bam) inputs: input_bam: type: File inputBinding: position: 1 + valueFrom: $(self.basename) outputs: bam_index: diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml index 967e8e9..e6d6b22 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml @@ -11,9 +11,9 @@ rsem_ref_dir: class: Directory location: /rsem_ref/ max_frag_len: 1000 -estimate_rspd: "true" -is_stranded: "true" -paired_end: "true" +estimate_rspd: true +is_stranded: true +paired_end: true genes_gtf: { class: File, path: gencode.v26.annotation.withTranscriptID.gtf @@ -22,9 +22,6 @@ genome_fasta: { class: File, path: Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta } -java_path: /usr/lib/jvm/java-1.7.0-openjdk-amd64/bin/java -memory: 8 rnaseqc_flags: ["noDoC", "strictMode"] # gatk_flags: [] prefix_str: "LC_C13_cRNA" -threads: 4 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml index fab64df..73fa82b 100644 
--- a/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rsem-example.yml @@ -7,7 +7,7 @@ transcriptome_bam: { } prefix_str: "LC_C13_cRNA" max_frag_len: 1000 -estimate_rspd: "true" -is_stranded: "true" -paired_end: "true" +estimate_rspd: true +is_stranded: true +paired_end: true threads: 4 diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl index c31c9d6..60086b5 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/markduplicates.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_MarkDuplicates.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_MarkDuplicates.py) @@ -9,35 +10,42 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-MarkDuplicates" label: "run-MarkDuplicates" -baseCommand: ["python3", "-u", "/src/run_MarkDuplicates.py"] +baseCommand: [java] -requirements: - - class: DockerRequirement - dockerPull: heliumdatacommons/topmed-rnaseq:latest +requirements: # turn back into a hint when the biocontainer has its classpath + # updated + EnvVarRequirement: + envDef: + CLASSPATH: /usr/local/share/picard-2.9.2-2/picard.jar + DockerRequirement: + dockerPull: quay.io/biocontainers/picard:2.9.2--2 inputs: input_bam: type: File - inputBinding: - position: 1 prefix_str: type: string - inputBinding: - position: 2 - memory: - type: int - inputBinding: - position: 3 - prefix: --memory + +arguments: + - prefix: -Xmx + valueFrom: $(runtime.ram)M + separate: false + - picard.cmdline.PicardCommandLine + - MarkDuplicates + - I=$(inputs.input_bam.path) + - O=$(runtime.outdir)/$(inputs.input_bam.nameroot).md.bam + - M=$(runtime.outdir)/$(inputs.prefix_str).marked_dup_metrics.txt + - ASSUME_SORT_ORDER=coordinate + - OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 outputs: bam_file: type: File 
outputBinding: - glob: "*.md.bam" + glob: $(inputs.input_bam.nameroot).md.bam metrics: type: File outputBinding: - glob: "*.marked_dup_metrics.txt" + glob: $(inputs.prefix_str).marked_dup_metrics.txt + diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl index 7d09bf9..206536d 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rna_seqc.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_rnaseqc.py](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/src/run_rnaseqc.py) duplicated from [run_rnaseqc.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_rnaseqc.py) with minor modifications. @@ -9,12 +10,12 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-seqc" label: "run-seqc" # run_rnaseqc.py is not an executable file in the docker container. baseCommand: ["python3", "/src/run_rnaseqc.py"] requirements: + InlineJavascriptRequirement: {} DockerRequirement: dockerPull: heliumdatacommons/topmed-rnaseq:latest @@ -40,16 +41,6 @@ inputs: type: string inputBinding: position: 4 - java_path: - type: string - inputBinding: - position: 5 - prefix: --java - memory: - type: int - inputBinding: - position: 6 - prefix: --memory rnaseqc_flags: type: type: array @@ -70,6 +61,11 @@ inputs: # position: 8 # prefix: --gatk_flags +arguments: + - prefix: --memory + valueFrom: $(Math.max(1, Math.floor(runtime.ram / 1024))) + - prefix: --java + valueFrom: /usr/lib/jvm/java-1.7.0-openjdk-amd64/bin/java outputs: gene_rpkm: type: File diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl index 589cd2e..3f7bbf0 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rnaseq_pipeline_fastq.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env 
cwl-runner doc: | TOPMed RNA-seq CWL workflow. Documentation on the workflow can be found [here](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/README.md). Example input files: [Dockstore.json](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/Dockstore.json) and [rnaseq_pipeline_fastq-example.yml](https://github.com/heliumdatacommons/cwl_workflows/blob/master/topmed-workflows/TOPMed_RNAseq_pipeline/input-examples/rnaseq_pipeline_fastq-example.yml). @@ -15,15 +16,16 @@ doc: | cwlVersion: v1.0 class: Workflow -id: "TOPMed_RNA-seq" label: "TOPMed_RNA-seq" requirements: - - class: SubworkflowFeatureRequirement - - class: ResourceRequirement - coresMin: 4 - ramMin: 16 - tmpdirMin: 51200 + SubworkflowFeatureRequirement: {} + StepInputExpressionRequirement: {} +# hints: +# ResourceRequirement: +# coresMin: 4 +# ramMin: 16 +# #tmpdirMin: 51200 inputs: star_index: @@ -32,20 +34,16 @@ inputs: type: File[] prefix_str: type: string - threads: - type: int - memory: - type: int rsem_ref_dir: type: Directory max_frag_len: type: int estimate_rspd: - type: string + type: boolean is_stranded: - type: string + type: boolean paired_end: - type: string + type: boolean genes_gtf: type: File genome_fasta: @@ -53,8 +51,6 @@ inputs: secondaryFiles: - .fai - ^.dict - java_path: - type: string rnaseqc_flags: type: string[] # gatk_flags: @@ -64,100 +60,123 @@ inputs: # items: string outputs: - - id: star_output_bam - outputSource: run_star/bam_file + star_output_bam: + outputSource: sort_bam/output_file type: File - - id: star_output_bam_index - outputSource: run_star/bam_index + star_output_bam_index: + outputSource: index_bam/bam_index type: File - - id: star_output_transcriptome_bam + star_output_transcriptome_bam: outputSource: run_star/transcriptome_bam type: File - - id: star_output_chimeric_junctions + star_output_chimeric_junctions: outputSource: 
run_star/chimeric_junctions type: File - - id: star_output_chimeric_bam_file - outputSource: run_star/chimeric_bam_file + star_output_chimeric_bam_file: + outputSource: sort_chimeras/output_file type: File - - id: star_output_chimeric_bam_index - outputSource: run_star/chimeric_bam_index + star_output_chimeric_bam_index: + outputSource: index_chimeras/bam_index type: File - - id: star_output_read_counts + star_output_read_counts: outputSource: run_star/read_counts type: File - - id: star_output_junctions + star_output_junctions: outputSource: run_star/junctions type: File - - id: star_output_junctions_pass1 + star_output_junctions_pass1: outputSource: run_star/junctions_pass1 type: File - - id: star_output_logs + star_output_logs: outputSource: run_star/logs type: File[] - - id: markduplicates_output_bam + markduplicates_output_bam: outputSource: run_markduplicates/bam_file type: File - - id: markduplicates_output_metrics + markduplicates_output_metrics: outputSource: run_markduplicates/metrics type: File - - id: markduplicates_bam_index + markduplicates_bam_index: outputSource: run_index_markduplicates_bam/bam_index type: File - - id: rsem_output_gene_results + rsem_output_gene_results: outputSource: run_rsem/gene_results type: File - - id: rsem_output_isoforms_results + rsem_output_isoforms_results: outputSource: run_rsem/isoforms_results type: File - - id: rna-seqc_output_gene_rpkm + rna-seqc_output_gene_rpkm: outputSource: run_rna-seqc/gene_rpkm type: File - - id: rna-seqc_output_gene_counts + rna-seqc_output_gene_counts: outputSource: run_rna-seqc/gene_counts type: File - - id: rna-seqc_output_exon_counts + rna-seqc_output_exon_counts: outputSource: run_rna-seqc/exon_counts type: File - - id: rna-seqc_output_count_metrics + rna-seqc_output_count_metrics: outputSource: run_rna-seqc/count_metrics type: File - - id: rna-seqc_output_count_outputs + rna-seqc_output_count_outputs: outputSource: run_rna-seqc/count_outputs type: File steps: run_star: - requirements: - 
ResourceRequirement: - coresMin: 4 - ramMin: 16 - tmpdirMin: 51200 run: star.cwl in: star_index: star_index fastqs: fastqs - prefix_str: prefix_str - threads: threads + prefix: prefix_str out: [ - bam_file, - bam_index, + bam, transcriptome_bam, chimeric_junctions, - chimeric_bam_file, - chimeric_bam_index, + chimeric_bam, read_counts, junctions, junctions_pass1, logs ] + sort_bam: + run: samtools-sort.cwl + in: + input: + source: run_star/bam + output_name: + source: prefix_str + valueFrom: $(self).Aligned.sortedByCoord.out.bam + out: [ output_file ] + + sort_chimeras: + run: samtools-sort.cwl + in: + input: + source: run_star/chimeric_bam + output_name: + source: prefix_str + valueFrom: $(self).Chimeric.out.sorted.bam + out: [ output_file ] + + index_bam: + run: indexbam.cwl + in: + input_bam: sort_bam/output_file + out: [bam_index] + + index_chimeras: + run: indexbam.cwl + in: + input_bam: sort_chimeras/output_file + out: [bam_index] + run_markduplicates: run: markduplicates.cwl in: - input_bam: run_star/bam_file + input_bam: sort_bam/output_file prefix_str: prefix_str - memory: memory out: [ bam_file, @@ -180,7 +199,6 @@ steps: estimate_rspd: estimate_rspd is_stranded: is_stranded paired_end: paired_end - threads: threads out: [ gene_results, @@ -194,8 +212,6 @@ steps: genes_gtf: genes_gtf genome_fasta: genome_fasta prefix_str: prefix_str - java_path: java_path - memory: memory rnaseqc_flags: rnaseqc_flags # gatk_flags: gatk_flags out: @@ -208,7 +224,7 @@ steps: ] $namespaces: - s: https://schema.org/ + s: http://schema.org/ $schemas: - http://dublincore.org/2012/06/14/dcterms.rdf diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl index 5de090a..514dad9 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/rsem.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for 
[run_RSEM.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_RSEM.py) @@ -7,61 +8,62 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-rsem" label: "run-rsem" -baseCommand: /src/run_RSEM.py +baseCommand: rsem-calculate-expression requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: + - entry: $(inputs.rsem_ref_dir) + writable: true +hints: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + dockerPull: quay.io/biocontainers/rsem:1.3.0--boost1.64_3 inputs: rsem_ref_dir: type: Directory - default: - type: Directory inputBinding: - position: 1 + position: 2 + valueFrom: $(self.basename)/rsem_reference transcriptome_bam: type: File inputBinding: - position: 2 + position: 1 prefix_str: type: string inputBinding: position: 3 + valueFrom: $(self).rsem max_frag_len: type: int inputBinding: - position: 4 - prefix: --max_frag_len + prefix: --fragment-length-max estimate_rspd: - type: string + type: boolean inputBinding: - position: 5 - prefix: --estimate_rspd + prefix: --estimate-rspd is_stranded: - type: string - inputBinding: - position: 6 - prefix: --is_stranded + type: boolean paired_end: - type: string + type: boolean inputBinding: - position: 7 - prefix: --paired_end - threads: - type: int - inputBinding: - position: 8 - prefix: --threads + prefix: --paired-end + +arguments: + - prefix: --num-threads + valueFrom: $(runtime.cores) + - --no-bam-output + - --bam + - ${if (inputs.is_stranded) { return ["--forward-prob", "0.0"];} return null; } outputs: gene_results: type: File outputBinding: - glob: "*.rsem.genes.results" + glob: $(inputs.prefix_str).rsem.genes.results isoforms_results: type: File outputBinding: - glob: "*.rsem.isoforms.results" + glob: $(inputs.prefix_str).rsem.isoforms.results diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl new file mode 
100644 index 0000000..887518d --- /dev/null +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/samtools-sort.cwl @@ -0,0 +1,90 @@ +#!/usr/bin/env cwl-runner + +# adapted from https://github.com/common-workflow-language/workflows/tree/master/tools + +class: CommandLineTool + +$namespaces: + dct: http://purl.org/dc/terms/ + foaf: http://xmlns.com/foaf/0.1/ +$schemas: [ http://dublincore.org/2012/06/14/dcterms.rdf ] +dct:contributor: + foaf:name: Andy Yang + foaf:mbox: mailto:ayang@oicr.on.ca +dct:creator: + '@id': http://orcid.org/0000-0001-9102-5681 + foaf:name: Andrey Kartashov + foaf:mbox: mailto:Andrey.Kartashov@cchmc.org +dct:description: 'Developed at Cincinnati Children’s Hospital Medical Center for the + CWL consortium http://commonwl.org/ Original URL: https://github.com/common-workflow-language/workflows' +cwlVersion: v1.0 + + +requirements: +- class: InlineJavascriptRequirement +- class: DockerRequirement + dockerPull: quay.io/cancercollaboratory/dockstore-tool-samtools-sort:1.0 +inputs: + compression_level: + type: int? + inputBinding: + prefix: -l + doc: Set compression level, from 0 (uncompressed) to 9 (best) + input: + type: File + inputBinding: + position: 1 + + doc: Input bam file. + output_name: + type: string + inputBinding: + position: 2 + prefix: -o + + doc: Desired output filename. + sort_by_name: + type: boolean? + inputBinding: + prefix: -n + + doc: Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates. +outputs: + output_file: + type: File + outputBinding: + glob: $(inputs.output_name) + +arguments: + - prefix: -@ + valueFrom: $(runtime.cores) + - prefix: -m + valueFrom: $(Math.max(1, Math.floor(runtime.ram / runtime.cores)))M + +baseCommand: [samtools, sort] +doc: | + Sort alignments by leftmost coordinates, or by read name when -n is used. An appropriate @HD-SO sort order header tag will be added or an existing one updated if necessary. 
+ + Usage: samtools sort [-l level] [-m maxMem] [-o out.bam] [-O format] [-n] -T out.prefix [-@ threads] [in.bam] + + Options: + -l INT + Set the desired compression level for the final output file, ranging from 0 (uncompressed) or 1 (fastest but minimal compression) to 9 (best compression but slowest to write), similarly to gzip(1)'s compression level setting. + + If -l is not used, the default compression level will apply. + + + -n + Sort by read names (i.e., the QNAME field) rather than by chromosomal coordinates. + + -o FILE + Write the final sorted output to FILE, rather than to standard output. + + -O FORMAT + Write the final output as sam, bam, or cram. + + By default, samtools tries to select a format based on the -o filename extension; if output is to standard output or no format can be deduced, -O must be used. + + -T PREFIX + Write temporary files to PREFIX.nnnn.bam. This option is required. + diff --git a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl index ddddabf..21d8b8e 100644 --- a/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl +++ b/topmed-workflows/TOPMed_RNAseq_pipeline/star.cwl @@ -1,3 +1,4 @@ +#!/usr/bin/env cwl-runner doc: | A CWL wrapper for [run_STAR.py](https://github.com/broadinstitute/gtex-pipeline/blob/master/rnaseq/src/run_STAR.py) @@ -9,21 +10,42 @@ doc: | cwlVersion: v1.0 class: CommandLineTool -id: "run-star" label: "run-star" -baseCommand: /src/run_STAR.py +baseCommand: STAR -requirements: +hints: DockerRequirement: - dockerPull: heliumdatacommons/topmed-rnaseq:latest + #dockerPull: quay.io/biocontainers/star:2.5.3a--0 + dockerFile: | + FROM debian:buster-slim + MAINTAINER biocontainers + LABEL software="rna-star" \ + container="rna-star" \ + about.summary="ultrafast universal RNA-seq aligner" \ + about.home="https://github.com/alexdobin/STAR/" \ + software.version="2.5.3adfsg-1-deb" \ + version="1" \ + about.copyright="2009-2015 Alexander Dobin " \ + about.license="GPL-3+" \ + 
about.license_file="/usr/share/doc/rna-star/copyright" \ + extra.binaries="/usr/bin/STAR" \ + about.tags="biology::nucleic-acids, field::biology, field::biology:bioinformatics,:c++, role::program, use::analysing,:biological-sequence" + ENV DEBIAN_FRONTEND noninteractive + RUN echo \ + 'deb http://snapshot.debian.org/archive/debian/20170906/ buster main' > /etc/apt/sources.list \ + && printf "Package: r-*\nPin: origin snapshot.debian.org\nPin-Priority: 990\n" > /etc/apt/preferences.d/snapshot \ + && apt-get -o Acquire::Check-Valid-Until=false update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + --allow-unauthenticated rna-star=2.5.3a+dfsg-3 \ + && apt-get clean && apt-get purge && rm -rf /var/lib/apt/lists/* /tmp/* + dockerImageId: star inputs: star_index: type: Directory - default: - type: Directory inputBinding: - position: 1 + prefix: --genomeDir fastqs: type: type: array @@ -31,26 +53,88 @@ inputs: inputBinding: itemSeparator: "," inputBinding: - position: 2 - prefix_str: + prefix: --readFilesIn + prefix: type: string inputBinding: - position: 3 - threads: - type: int? - inputBinding: - position: 5 - prefix: --threads + prefix: --outFileNamePrefix + valueFrom: $(runtime.outdir)/$(self). 
+ +arguments: + - prefix: --runMode + valueFrom: alignReads + - prefix: --runThreadN + valueFrom: $(runtime.cores) + - prefix: --twopassMode + valueFrom: Basic + - prefix: --outFilterMultimapNmax + valueFrom: "20" + - prefix: --alignSJoverhangMin + valueFrom: "8" + - prefix: --alignSJDBoverhangMin + valueFrom: "1" + - prefix: --outFilterMismatchNmax + valueFrom: "999" + - prefix: --outFilterMismatchNoverLmax + valueFrom: "0.1" + - prefix: --alignIntronMin + valueFrom: "20" + - prefix: --alignIntronMax + valueFrom: "1000000" + - prefix: --alignMatesGapMax + valueFrom: "1000000" + - prefix: --outFilterType + valueFrom: BySJout + - prefix: --outFilterScoreMinOverLread + valueFrom: "0.33" + - prefix: --outFilterMatchNminOverLread + valueFrom: "0.33" + - prefix: --limitSjdbInsertNsj + valueFrom: "1200000" + - prefix: --readFilesCommand + valueFrom: zcat + - prefix: --outSAMstrandField + valueFrom: intronMotif + - prefix: --outFilterIntronMotifs + valueFrom: None + - prefix: --alignSoftClipAtReferenceEnds + valueFrom: "Yes" + - --quantMode + - TranscriptomeSAM + - GeneCounts + - --outSAMtype + - BAM + - Unsorted + - prefix: --outSAMunmapped + valueFrom: Within + - prefix: --genomeLoad + valueFrom: NoSharedMemory + - prefix: --chimSegmentMin + valueFrom: "15" + - prefix: --chimJunctionOverhangMin + valueFrom: "15" + - --chimOutType + - WithinBAM + - SoftClip + - prefix: --chimMainSegmentMultNmax + valueFrom: "1" + - --outSAMattributes + - NH + - HI + - AS + - nM + - NM + - ch + - --outSAMattrRGline + - ID:rg1 + - SM:sm1 + outputs: - bam_file: + bam: type: File outputBinding: - glob: "*.Aligned.sortedByCoord.out.bam" - bam_index: - type: File - outputBinding: - glob: "*.Aligned.sortedByCoord.out.bam.bai" + glob: $(inputs.prefix).Aligned.out.bam transcriptome_bam: type: File outputBinding: @@ -59,14 +143,18 @@ outputs: type: File outputBinding: glob: "*.Chimeric.out.junction" - chimeric_bam_file: - type: File - outputBinding: - glob: "*.Chimeric.out.sorted.bam" - 
chimeric_bam_index: + chimeric_bam: type: File outputBinding: - glob: "*.Chimeric.out.sorted.bam.bai" + glob: $(inputs.prefix).Chimeric.out.sam + # chimeric_bam_file: + # type: File + # outputBinding: + # glob: "*.Chimeric.out.sorted.bam" + # chimeric_bam_index: + # type: File + # outputBinding: + # glob: "*.Chimeric.out.sorted.bam.bai" read_counts: type: File outputBinding: