Merge branch 'dev' into olgabot/sourmash-sig-merge
pranathivemuri authored Jan 6, 2021
2 parents 6bce013 + b4afeca commit 544bba5
Showing 5 changed files with 82 additions and 29 deletions.
35 changes: 27 additions & 8 deletions CHANGELOG.md
@@ -21,23 +21,42 @@ barcode fastq
* Add version printing for sencha, bam2fasta, and sourmash in Dockerfile, update versions in environment.yml
* For the translate and sourmash compute processes, set cpus = 1 since they are serial-only ([#107](https://github.com/nf-core/kmermaid/pull/107))
* Add `sourmash sig merge` for aligned/unaligned signatures from bam files, and add a `--skip_sig_merge` option to turn it off
* Add `--protein_fastas` option for creating sketches of already-translated protein sequences
* Add `--skip_compare` option to skip the `sourmash_compare_sketches` process

### `Fixed`

#### Resources

* Increase CPUs in `high_memory_long` profile from 1 to 10

#### Naming

* Rename splitkmer to `split_kmer`

#### Per-cell fastqs and bams

* Remove the `one_signature_per_record` flag and use `bam2fasta count_umis_percell` and `make_fastqs_percell` instead of the bam2fasta sharding method
* Update renamed `khtools` commands to `sencha`
* Use ripgrep instead of bam2fasta to make per-cell fastqs, which will hopefully make resuming long-running pipelines on bams much faster
* Make sure `samtools_fastq_aligned` outputs ALL aligned reads, regardless of mapping quality or primary alignment status
* Add `--protein_fastas` option for translated protein input
* Rename splitkmer to `split_kmer` and add `--skip_compare` option to skip the `sourmash_compare_sketches` process
* Increase CPUs in the `high_memory_long` profile from 1 to 10

#### Sourmash

* Add `--skip_compute` option to skip the `sourmash_compute_sketch_*` processes
* Add option to write non-coding nucleotide sequence fasta files during `sencha translate`
* Use `.combine()` instead of `each` to take the Cartesian product of all possible molecules, ksizes, and sketch values (see the sketch after this list)
* Use ripgrep instead of bam2fasta to make per-cell fastqs, which will hopefully make resuming long-running pipelines on bams much faster
* Fix the use of the `skip_multiqc` flag by guarding with an `if` condition instead of `when`
* Update to sencha=1.0.3 to fix memory errors, likely caused by the numpy array of unique filenames ([PR #96 on sencha](https://github.com/czbiohub/leaftea/pull/96))
* Run `sourmash compute` on all input ksizes and all peptide molecule types at once to save disk reads and writes
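
For context, a minimal sketch of the `.combine()` pattern mentioned above; the channel contents and variable names here are illustrative, not the pipeline's actual values:

    // Illustrative Nextflow (DSL1-style) sketch: `.combine()` with no `by:` emits
    // every combination of items, i.e. the Cartesian product of the channels.
    ksizes        = Channel.from(21, 27, 33)
    molecules     = Channel.from('dna', 'protein', 'dayhoff')
    sketch_values = Channel.from('10', '100')

    ksizes
        .combine(molecules)
        .combine(sketch_values)
        .set { ch_sketch_params }  // emits e.g. [21, 'dna', '10'], [21, 'dna', '100'], ...

A single combined channel carries one parameter tuple per task, whereas `each` repeats the process over the values of one input at a time.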

#### Translate

* Update to sencha=1.0.3 to fix memory errors, likely caused by the numpy array of unique filenames ([PR #96 on sencha](https://github.com/czbiohub/leaftea/pull/96))
* Add option to write non-coding nucleotide sequence fasta files during `sencha translate`
* Don't save translate CSVs and JSONs by default; add separate `--save_translate_json` and `--save_translate_csv` flags
* Update `sencha translate` default parameters to `--ksize 8 --jaccard-threshold 0.05`, as these were the most successful
* Update renamed `khtools` commands to `sencha`

#### MultiQC

* Fix the use of the `skip_multiqc` flag by guarding with an `if` condition instead of `when` (see the sketch below)
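
As an illustration of the change described above, a minimal DSL1-style sketch of guarding a process with `if` rather than a `when:` block; the process name and body are placeholders, not the pipeline's real MultiQC step:

    // Hypothetical sketch: skip defining the process entirely when the flag is set,
    // instead of relying on a `when:` guard inside the process.
    params.skip_multiqc = false

    if (!params.skip_multiqc) {
        process multiqc_placeholder {
            output:
            file 'multiqc_report_placeholder.txt'

            script:
            """
            touch multiqc_report_placeholder.txt
            """
        }
    }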

### `Dependencies`

2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,4 +14,4 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de

# Instruct R processes to use these empty files instead of clashing with a local version
RUN touch .Rprofile
RUN touch .Renviron
RUN touch .Renviron
54 changes: 36 additions & 18 deletions main.nf
@@ -440,6 +440,10 @@ Channel
.map { row -> file(row) }
.set { sortmerna_fasta }

// --- Parse Translate parameters ---
save_translate_csv = params.save_translate_csv
save_translate_json = params.save_translate_json


// --- Parse the Sourmash parameters ----
ksizes = params.ksizes?.toString().tokenize(',')
@@ -1178,7 +1182,14 @@ if (!params.remove_ribo_rna) {
process translate {
tag "${sample_id}"
label "low_memory_long"
publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode
publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode,
saveAs: {
filename ->
if (save_translate_csv && filename.indexOf(".csv") > 0) "description/$filename"
if (save_translate_json && filename.indexOf(".json") > 0) "description/$filename"
else if (filename.indexOf(".sig") > 0) "sigs/$filename"
else null
}

input:
set bloom_id, molecule, file(bloom_filter) from ch_sencha_bloom_filter.collect()
@@ -1189,23 +1200,30 @@ if (!params.remove_ribo_rna) {
set val(sample_id), file("${sample_id}__noncoding_reads_nucleotides.fasta") into ch_noncoding_nucleotides_potentially_empty
set val(sample_id), file("${sample_id}__coding_reads_peptides.fasta") into ch_translated_protein_seqs
set val(sample_id), file("${sample_id}__coding_reads_nucleotides.fasta") into ch_translatable_nucleotide_seqs
set val(sample_id), file("${sample_id}__coding_scores.csv") into ch_coding_scores_csv
set val(sample_id), file("${sample_id}__coding_summary.json") into ch_coding_scores_json
set val(sample_id), file(translate_csv) into ch_coding_scores_csv
set val(sample_id), file(translate_json) into ch_coding_scores_json

script:
"""
sencha translate \\
--molecule ${molecule} \\
--coding-nucleotide-fasta ${sample_id}__coding_reads_nucleotides.fasta \\
--noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\
--csv ${sample_id}__coding_scores.csv \\
--json-summary ${sample_id}__coding_summary.json \\
--jaccard-threshold ${jaccard_threshold} \\
--peptide-ksize ${peptide_ksize} \\
--peptides-are-bloom-filter \\
${bloom_filter} \\
${reads} > ${sample_id}__coding_reads_peptides.fasta
"""
script:
translate_json = "${sample_id}__coding_summary.json"
translate_csv = "${sample_id}__coding_scores.csv"
csv_flag = save_translate_csv ? "--csv ${translate_csv}" : ''
json_flag = save_translate_json ? "--json-summary ${translate_json}" : ''

"""
sencha translate \\
--molecule ${molecule} \\
--coding-nucleotide-fasta ${sample_id}__coding_reads_nucleotides.fasta \\
--noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\
${csv_flag} \\
${json_flag} \\
--jaccard-threshold ${jaccard_threshold} \\
--peptide-ksize ${peptide_ksize} \\
--peptides-are-bloom-filter \\
${bloom_filter} \\
${reads} > ${sample_id}__coding_reads_peptides.fasta
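# Ensure the declared CSV/JSON outputs exist even when the corresponding save flags are off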
touch ${translate_csv}
touch ${translate_json}
"""
}

// Remove empty files
@@ -1743,4 +1761,4 @@ def checkHostname() {
}
}
}
}
}
6 changes: 5 additions & 1 deletion nextflow.config
@@ -21,7 +21,7 @@ params {

// Parsing 10x bam files
tenx_tgz = false
tenx_tags = "CB,XC,UB,XM,XB,RG,GN,GX,TX"
tenx_tags = "CB,CR,CY,XC,UB,UR,UY,AN,TR,XM,XB,RG,GN,GX,TX,NH,HI,AS,nM,RE,MM,pa,xf,fb,fr,fq,fx"
tenx_cell_barcode_pattern = '(CB|XC):Z:([ACGT]+)(\\-1)?'
tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)'
tenx_min_umi_per_cell = 1000
@@ -51,6 +51,10 @@ params {
translate_jaccard_threshold = 0.05
reference_proteome_fasta = false
bloomfilter_tablesize = '1e8'
// Saving the translate results for each dataset makes the run take much longer
// Recommended for debugging purposes only
save_translate_csv = false
save_translate_json = false


// Ribosomal RNA removal
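
As a usage note, the two new parameters can be switched on from a user config passed with `-c`; this is a sketch only, the file name `my_params.config` is illustrative and just the parameter names come from this commit:

    // my_params.config -- hypothetical user config enabling the translate outputs
    params {
        save_translate_csv  = true
        save_translate_json = true
    }

Equivalently, they can be enabled on the command line as `--save_translate_csv` and `--save_translate_json`.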
14 changes: 13 additions & 1 deletion nextflow_schema.json
@@ -484,5 +484,17 @@
{
"$ref": "#/definitions/generic_options"
}
]
],
"properties": {
"save_translate_csv": {
"type": "string",
"description": "Path to save the coding scores as a csv",
"default": "False"
},
"save_translate_json": {
"type": "string",
"description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json",
"default": "False"
}
}
}
