From df31610c2fddd399813495aa9bea6385f9f9c076 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 28 Oct 2020 09:00:56 -0700
Subject: [PATCH 01/49] Update extract_per_cell_fastqs to not say
 __aligned__aligned and retry if run out of memory

---
 main.nf | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index b0066f0d..4dc0ad10 100644
--- a/main.nf
+++ b/main.nf
@@ -894,9 +894,9 @@ if (params.tenx_tgz || params.bam) {
     .set{ tenx_reads_with_good_barcodes_ch }
 
   process extract_per_cell_fastqs {
-    tag "${is_aligned_channel_id}__${cell_barcode}"
+    tag "${fastq_id}"
     label "low_memory"
-    errorStrategy 'ignore'
+    errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'ignore' }
     publishDir "${params.outdir}/10x-fastqs/per-cell/${channel_id}/", mode: 'copy', pattern: '*.fastq.gz', saveAs: { filename -> "${filename.replace("|", "-")}"}
 
     input:
@@ -909,10 +909,8 @@ if (params.tenx_tgz || params.bam) {
     set val(fastq_id), val(cell_id), val(is_aligned) into ch_fastq_id_to_cell_id_is_aligned
 
     script:
-    is_aligned_channel_id = "${channel_id}__${is_aligned}"
-    processes = "--processes ${task.cpus}"
     this_cell_barcode = tenx_cell_barcode_pattern.replace('([ACGT]+)', cell_barcode)
-    fastq_id = "${is_aligned_channel_id}__${is_aligned}__${cell_barcode}"
+    fastq_id = "${channel_id}__${is_aligned}__${cell_barcode}"
     cell_id = "${channel_id}__${cell_barcode}"
     this_cell_fastq_gz = "${fastq_id}.fastq.gz"
     """

From 700272de1ce66e884cc681f09b2012dba4830cc5 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 28 Oct 2020 09:06:21 -0700
Subject: [PATCH 02/49] Initial commit for adding sourmash sig merge on
 aligned/unaligned from single cells

---
 main.nf | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 4dc0ad10..e0646685 100644
--- a/main.nf
+++ b/main.nf
@@ -1274,7 +1274,7 @@ if (!params.remove_ribo_rna) {
 
       output:
       file(csv) into ch_sourmash_sig_describe_nucleotides
-      set val(sketch_id), val("dna"), val(ksize), val(sketch_value), file(sig) into sourmash_sketches_all_nucleotide
+      set val(sample_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide
 
       script:
       // Don't calculate DNA signature if this is protein, to minimize disk,
@@ -1299,7 +1299,7 @@ if (!params.remove_ribo_rna) {
         sourmash sig describe --csv ${csv} ${sig}
       """
     }
-    sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[4].size() > 0 }
+    sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[3].size() > 0 }
   }
 } else {
   sourmash_sketches_nucleotide = Channel.empty()
@@ -1342,7 +1342,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
-    set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches_all_peptide
+    set val(sample_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide
 
     script:
     sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
@@ -1367,11 +1367,68 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
       sourmash sig describe --csv ${csv} ${sig}
     """
     }
-  sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[4].size() > 0 }
+  sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[3].size() > 0 }
 } else {
   sourmash_sketches_peptide = Channel.empty()
 }
 
+if (params.bam || params.tenx_tgz) {
+  // Merge signatures from same sample id and sketch id
+
+  sourmash_sketches_nucleotide
+    .mix ( sourmash_sketches_peptide )
+    .set { ch_sourmash_sketches_mixed}
+
+  ch_fastq_id_to_cell_id_is_aligned
+    .combine ( ch_sourmash_sketches_mixed )
+    .dump( tag: 'fastq_id_to_cells__join__sketches' )
+    .groupTuple( by: 1 )
+    .dump( tag: 'fastq_id_to_cells__join__sketches__grouptuple' )
+    .set { ch_sourmash_sketches_to_merge }
+
+  process sourmash_sig_merge {
+    tag "${sig_id}"
+    label "low_memory"
+    publishDir "${params.outdir}/sketches_merged/${sketch_id}", mode: "${params.publish_dir_mode}",
+        saveAs: {filename ->
+            if (filename.indexOf(".csv") > 0) "description/$filename"
+            else if (filename.indexOf(".sig") > 0) "sigs/$filename"
+            else null
+        }
+
+    input:
+    set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketches_to_merge
+
+    output:
+    file(csv) into ch_sourmash_sig_describe_merged
+    set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches
+
+    script:
+    // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
+    sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value)
+    track_abundance_flag = track_abundance ? '--track-abundance' : ''
+    processes = "--processes ${task.cpus}"
+    sig_id = "${sample_id}__${sketch_id}"
+    sig = "${sig_id}.sig"
+    csv = "${sig_id}.csv"
+    """
+    sourmash compute \\
+        ${sketch_value_flag} \\
+        --ksizes $ksize \\
+        --input-is-protein \\
+        --$molecule \\
+        --name '${sample_id}' \\
+        --no-dna \\
+        $processes \\
+        $track_abundance_flag \\
+        --output ${sig} \\
+        $reads
+      sourmash sig describe --csv ${csv} ${sig}
+    """
+  }
+
+}
+
 if (params.split_kmer){
      process ska_compare_sketches {
     tag "${sketch_id}"
@@ -1395,7 +1452,6 @@ if (params.split_kmer){
 if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
-    sourmash_sketches = sourmash_sketches_peptide.concat(sourmash_sketches_nucleotide)
     tag "${sketch_id}"
     publishDir "${params.outdir}/compare_sketches", mode: 'copy'
 

From c4f9521908e1bdf444050879b390d4a1e19e628d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 28 Oct 2020 09:06:50 -0700
Subject: [PATCH 03/49] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a2f13f7..11ea7608 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ Initial release of nf-core/kmermaid, created with the [nf-core](http://nf-co.re/
 barcode fastq
 * Add version printing for sencha, bam2fasta, and sourmash in Dockerfile, update versions in environment.yml
 * For processes translate, sourmash compute  add cpus=1 as they are only serial ([#107](https://github.com/nf-core/kmermaid/pull/107))
+* Add `sourmash sig merge` for aligned/unaligned signatures
 
 ### `Fixed`
 

From 7d3e21508957c6a4a5227dd98d413104303453e1 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 28 Oct 2020 09:16:05 -0700
Subject: [PATCH 04/49] Try to get grouptuple to work

---
 main.nf | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/main.nf b/main.nf
index e0646685..8732e37b 100644
--- a/main.nf
+++ b/main.nf
@@ -1274,7 +1274,7 @@ if (!params.remove_ribo_rna) {
 
       output:
       file(csv) into ch_sourmash_sig_describe_nucleotides
-      set val(sample_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide
+      set val(sample_id), val(sketch_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide
 
       script:
       // Don't calculate DNA signature if this is protein, to minimize disk,
@@ -1342,7 +1342,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
-    set val(sample_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide
+    set val(sample_id), val(sketch_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide
 
     script:
     sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
@@ -1380,10 +1380,18 @@ if (params.bam || params.tenx_tgz) {
     .set { ch_sourmash_sketches_mixed}
 
   ch_fastq_id_to_cell_id_is_aligned
-    .combine ( ch_sourmash_sketches_mixed )
-    .dump( tag: 'fastq_id_to_cells__join__sketches' )
-    .groupTuple( by: 1 )
-    .dump( tag: 'fastq_id_to_cells__join__sketches__grouptuple' )
+    .combine ( ch_sourmash_sketches_mixed, on: 0 )
+    .dump( tag: 'fastq_id_to_cells__combine__sketches' )
+    // [DUMP: fastq_id_to_cells__join__sketches] [
+      // mouse_lung__aligned__AAATGCCCAAACTGCT, mouse_lung__AAATGCCCAAACTGCT, 'aligned',
+      // mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT, 
+      // molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false, 'dayhoff', '3', /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/7c/7c4f8e3e3f2db074502e754a3fcc29/mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false.sig]
+    // it[0]: fastq_id, e.g. "mouse_lung__aligned__AAATGCCCAAACTGCT" (contains aligned/unaligned)
+    // it[1]: cell_id, e.g. "mouse_lung__AAATGCCCAAACTGCT"
+    // it[2]: is_aligned, e.g. "aligned"
+    // it[3]: 
+    .groupTuple( by: [1, 4] )
+    .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
     .set { ch_sourmash_sketches_to_merge }
 
   process sourmash_sig_merge {
@@ -1405,9 +1413,6 @@ if (params.bam || params.tenx_tgz) {
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
-    sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value)
-    track_abundance_flag = track_abundance ? '--track-abundance' : ''
-    processes = "--processes ${task.cpus}"
     sig_id = "${sample_id}__${sketch_id}"
     sig = "${sig_id}.sig"
     csv = "${sig_id}.csv"

From 533dc94e7cf04b8745399f22d02df19edd866a6d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:05:22 -0700
Subject: [PATCH 05/49] Set minimum UMI per cell to be a default of 1000

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 99603843..8e5713a6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -26,7 +26,7 @@ params {
   tenx_tags = "CB,XC,UB,XM,XB,RG,GN,GX,TX"
   tenx_cell_barcode_pattern = '(CB|XC):Z:([ACGT]+)(\\-1)?'
   tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)'
-  tenx_min_umi_per_cell = 0
+  tenx_min_umi_per_cell = 1000
 
   // Creating sketches
   molecules ='dna,protein,dayhoff'

From da84ba4399f3f5e145909a573c0262777e2a19db Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:05:33 -0700
Subject: [PATCH 06/49] Set test min UMI per cell as 5

---
 conf/test_translate_bam.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index e9907698..c15c96a6 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -26,7 +26,7 @@ params {
   write_barcode_meta_csv = "metadata.csv"
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
-  tenx_min_umi_per_cell = 10
+  tenx_min_umi_per_cell = 5
   shard_size = 350
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'

From 6d30732988a2f65fbb08aa23249f9597697e3f7e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:07:36 -0700
Subject: [PATCH 07/49] Remove unused --shard_size option

---
 conf/test_translate_bam.config | 1 -
 main.nf                        | 1 -
 nextflow.config                | 1 -
 3 files changed, 3 deletions(-)

diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index c15c96a6..9063e1e5 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -27,7 +27,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 5
-  shard_size = 350
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e8'
diff --git a/main.nf b/main.nf
index 8732e37b..a06aa9ad 100644
--- a/main.nf
+++ b/main.nf
@@ -111,7 +111,6 @@ def helpMessage() {
                                     based on tenx_min_umi_per_cell
       --tenx_min_umi_per_cell         A barcode is only considered a valid barcode read
                                     and its signature is written if number of umis are greater than tenx_min_umi_per_cell
-      --shard_size                  Number of alignment to contain in each sharded bam file
       --barcodes_file               For bam files, Optional absolute path to a .tsv barcodes file if the input is unfiltered 10x bam file
       --rename_10x_barcodes         For bam files, Optional absolute path to a .tsv Tab-separated file mapping 10x barcode name
                                     to new name, e.g. with channel or cell annotation label
diff --git a/nextflow.config b/nextflow.config
index 8e5713a6..167b4a22 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -73,7 +73,6 @@ params {
   // https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc
   barcodes_file = false
   rename_10x_barcodes = false
-  shard_size = 350
 
   // Variables for testing
   read_paths = false

From 8fc0c33f5166ee389620d0940ea20467ab69ff69 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:18:42 -0700
Subject: [PATCH 08/49] Add option for skipping sig merge

---
 main.nf         | 1 +
 nextflow.config | 1 +
 2 files changed, 2 insertions(+)

diff --git a/main.nf b/main.nf
index a06aa9ad..c11eac3c 100644
--- a/main.nf
+++ b/main.nf
@@ -86,6 +86,7 @@ def helpMessage() {
       --skip_trimming               If provided, skip fastp trimming of reads
       --skip_compare                If provided, skip comparison of hashes using sourmash compare
       --skip_compute                If provided, skip computing of signatures using sourmash compute
+      --skip_sig_merge              If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files
 
      Sketch size options:
       --sketch_num_hashes           Number of hashes to use for making the sketches.
diff --git a/nextflow.config b/nextflow.config
index 167b4a22..37c1847c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -37,6 +37,7 @@ params {
   sketch_num_hashes_log2 = false
   sketch_scaled = false
   sketch_scaled_log2 = false
+  skip_sig_merge = false
 
   // Comparing sketches
   skip_compare = false

From 63273e5accaf4b67f7d5f4406fcc32e2b2321d8b Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:19:04 -0700
Subject: [PATCH 09/49] Update Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index d667dd57..f92421ca 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nfcore/base:1.10.2
+FROM nfcore/base:1.11
 LABEL authors="Olga Botvinnik" \
       description="Docker image containing all software requirements for the nf-core/kmermaid pipeline"
 

From 054d8b3398eb49d32c307fe3d3294c8256fd4fee Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 29 Oct 2020 10:20:11 -0700
Subject: [PATCH 10/49] Add test for --skip_sig_merge

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cdfe0f83..d8f405b7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,6 +74,7 @@ jobs:
           - "test_tenx_tgz"
           - "test_translate"
           - "test_translate_bam"
+          - "test_translate_bam --skip_sig_merge"
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2

From f1304caee9877edb264fa093fb5821125e087eb4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 4 Jan 2021 16:26:15 -0800
Subject: [PATCH 11/49] Update changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11ea7608..f4467176 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@ Initial release of nf-core/kmermaid, created with the [nf-core](http://nf-co.re/
 barcode fastq
 * Add version printing for sencha, bam2fasta, and sourmash in Dockerfile, update versions in environment.yml
 * For processes translate, sourmash compute  add cpus=1 as they are only serial ([#107](https://github.com/nf-core/kmermaid/pull/107))
-* Add `sourmash sig merge` for aligned/unaligned signatures
+* Add `sourmash sig merge` for aligned/unaligned signatures from bam files, and add `--skip_sig_merge` option to turn it off
 
 ### `Fixed`
 

From a5791756871e821456e462801e8c23a4272e04f0 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 4 Jan 2021 16:26:31 -0800
Subject: [PATCH 12/49] Use more realistic scales and ksizes

---
 conf/test_translate_bam.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index 9063e1e5..1e29b02d 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -18,8 +18,8 @@ params {
   bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  ksizes = '15,21'
+  sketch_scaled = '2,5'
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"

From ef43907fc76de7aea3bf60e3cac2174c46cecb60 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 4 Jan 2021 16:35:55 -0800
Subject: [PATCH 13/49] regular test doesn't fail anymore

---
 main.nf | 58 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/main.nf b/main.nf
index c11eac3c..4d7efbda 100644
--- a/main.nf
+++ b/main.nf
@@ -921,11 +921,11 @@ if (params.tenx_tgz || params.bam) {
       '${this_cell_barcode}' \\
       ${reads} \\
       | gzip -c - \\
-      > ${this_cell_fastq_gz}
+      > ${this_cell_fastq_gz} || touch ${this_cell_fastq_gz}
     """
   }
   per_cell_fastqs_ch_possibly_empty
-    // gzipped files are 20 bytes
+    // Empty gzipped files are 20 bytes
     .filter { it -> it[1].size() > 20 }
     .set { per_cell_fastqs_ch }
   // // Make per-cell fastqs into a flat channel that matches the read channels of yore
@@ -1372,25 +1372,34 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
   sourmash_sketches_peptide = Channel.empty()
 }
 
-if (params.bam || params.tenx_tgz) {
+if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) {
   // Merge signatures from same sample id and sketch id
 
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
-    .set { ch_sourmash_sketches_mixed}
+    .set { ch_sourmash_sketches_mixed }
 
   ch_fastq_id_to_cell_id_is_aligned
-    .combine ( ch_sourmash_sketches_mixed, on: 0 )
+    .combine ( ch_sourmash_sketches_mixed, by: 0 )
     .dump( tag: 'fastq_id_to_cells__combine__sketches' )
-    // [DUMP: fastq_id_to_cells__join__sketches] [
-      // mouse_lung__aligned__AAATGCCCAAACTGCT, mouse_lung__AAATGCCCAAACTGCT, 'aligned',
-      // mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT, 
-      // molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false, 'dayhoff', '3', /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/7c/7c4f8e3e3f2db074502e754a3fcc29/mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false.sig]
-    // it[0]: fastq_id, e.g. "mouse_lung__aligned__AAATGCCCAAACTGCT" (contains aligned/unaligned)
-    // it[1]: cell_id, e.g. "mouse_lung__AAATGCCCAAACTGCT"
+    // [DUMP: fastq_id_to_cells__combine__sketches] 
+    // ['mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT', 
+    //   mouse_brown_fat_ptprc_plus_unaligned__CTGAAGTCAATGGTCT, 
+    //   'aligned', 
+    //   molecule-dna__ksize-3__num_hashes-4__track_abundance-false, 
+    //   'dna', 
+    //   '3', 
+    //   mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig]
+    // it[0]: fastq_id, e.g. "mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT" (contains aligned/unaligned)
+    // it[1]: cell_id, e.g. "mouse_brown_fat_ptprc_plus_unaligned__CTGAAGTCAATGGTCT"
     // it[2]: is_aligned, e.g. "aligned"
-    // it[3]: 
-    .groupTuple( by: [1, 4] )
+    // it[3]: sketch_id, e.g. molecule-dna__ksize-3__num_hashes-4__track_abundance-false
+    // it[4]: molecule, e.g. 'dna'
+    // it[5]: ksize, e.g. '3'
+    // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig
+    .groupTuple( by: [1, 3, 4, 5] )
+    // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] 
+    // [['mouse_lung__aligned__AAAGATGCAGATCTGT', 'mouse_lung__aligned__AAAGATGCAGATCTGT'], mouse_lung__AAAGATGCAGATCTGT, ['aligned', 'aligned'], molecule-dna__ksize-15__scaled-5__track_abundance-false, 'dna', '15', [/Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/36/b23ceef8424a2aeb8e7187f3b30d8b/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/8b/1a1aae2c00520dab66dadb63edbd77/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]]
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
     .set { ch_sourmash_sketches_to_merge }
 
@@ -1413,25 +1422,20 @@ if (params.bam || params.tenx_tgz) {
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
-    sig_id = "${sample_id}__${sketch_id}"
-    sig = "${sig_id}.sig"
+    sig_id = "${cell_id}__${sketch_id}"
     csv = "${sig_id}.csv"
+    output_sig = "${sig_id}.sig"
     """
-    sourmash compute \\
-        ${sketch_value_flag} \\
-        --ksizes $ksize \\
-        --input-is-protein \\
-        --$molecule \\
-        --name '${sample_id}' \\
-        --no-dna \\
-        $processes \\
-        $track_abundance_flag \\
-        --output ${sig} \\
-        $reads
-      sourmash sig describe --csv ${csv} ${sig}
+    sourmash sig merge -o merged.sig ${sigs}
+    sourmash sig rename -o ${output_sig} '${cell_id}'
+    sourmash sig describe --csv ${csv} ${output_sig}
     """
   }
 
+} else {
+  sourmash_sketches_nucleotide
+    .mix ( sourmash_sketches_peptide )
+    .set { sourmash_sketches }
 }
 
 if (params.split_kmer){

From d0bec5c52a2d7de6de49360e4f063ce29cce7078 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 5 Jan 2021 17:39:44 -0800
Subject: [PATCH 14/49] Update bam config

---
 conf/test_bam.config | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/conf/test_bam.config b/conf/test_bam.config
index e46f6568..ac4e8038 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -19,8 +19,8 @@ params {
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  ksizes = '15,21'
+  sketch_scaled = '2,5'
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
@@ -28,6 +28,5 @@ params {
   write_barcode_meta_csv = "metadata.csv"
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
-  tenx_min_umi_per_cell = 5
-  shard_size = 350
+  tenx_min_umi_per_cell = 2
 }

From 000d6ca003f00fc1e8904b2b0441cd613fabd908 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 5 Jan 2021 17:39:52 -0800
Subject: [PATCH 15/49] Add dump ch_sourmash_sketches_mixed

---
 main.nf | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 96a30c02..67a1e6c1 100644
--- a/main.nf
+++ b/main.nf
@@ -1385,6 +1385,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) {
 
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
+    .dump ( tag: 'ch_sourmash_sketches_mixed' )
     .set { ch_sourmash_sketches_mixed }
 
   ch_fastq_id_to_cell_id_is_aligned
@@ -1407,7 +1408,16 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) {
     // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig
     .groupTuple( by: [1, 3, 4, 5] )
     // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] 
-    // [['mouse_lung__aligned__AAAGATGCAGATCTGT', 'mouse_lung__aligned__AAAGATGCAGATCTGT'], mouse_lung__AAAGATGCAGATCTGT, ['aligned', 'aligned'], molecule-dna__ksize-15__scaled-5__track_abundance-false, 'dna', '15', [/Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/36/b23ceef8424a2aeb8e7187f3b30d8b/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/8b/1a1aae2c00520dab66dadb63edbd77/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]]
+    // [ 
+    //   ['mouse_lung__aligned__AAAGATGCAGATCTGT', 
+    //    'mouse_lung__aligned__AAAGATGCAGATCTGT'], 
+    //   mouse_lung__AAAGATGCAGATCTGT, 
+    //   ['aligned', 'aligned'], 
+    //   molecule-dna__ksize-15__scaled-5__track_abundance-false, 
+    //   'dna', '15', 
+    //   [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, 
+    //    mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]
+    // ]
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
     .set { ch_sourmash_sketches_to_merge }
 
@@ -1422,7 +1432,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) {
         }
 
     input:
-    set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketches_to_merge
+    set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged

From 6bce013f5cc6a57638817043032efb647f6f249a Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 6 Jan 2021 10:45:31 -0800
Subject: [PATCH 16/49] Update schema

---
 nextflow_schema.json | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index aa93d85c..711e29ca 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -224,11 +224,6 @@
                     "description": "A barcode is only considered a valid barcode read and its signature is written if number of umis are greater than tenx_min_umi_per_cell",
                     "fa_icon": "fas fa-percentage"
                 },
-                "shard_size": {
-                    "type": "integer",
-                    "description": "Number of alignment to contain in each sharded bam file",
-                    "fa_icon": "fas fa-percentage"
-                },
                 "barcodes_file": {
                     "type": "string",
                     "description": "For bam files, Optional absolute path to a .tsv barcodes file if the input is unfiltered 10x bam file",
@@ -287,6 +282,10 @@
                     "description": "Skip sourmash compare.",
                     "fa_icon": "fas fa-fast-forward"
                 },
+                "skip_sig_merge": {
+                    "type": "boolean",
+                    "description": "Skip merging aligned+unaligned reads per cell (keep aligned/unaligned separate)"
+                },
                 "skip_multiqc": {
                     "type": "boolean",
                     "description": "Skip MultiQC.",

From ba765ffd56ddabe629492a49b27ffb897cf102cd Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 08:58:31 -0800
Subject: [PATCH 17/49] Add params.ksizes to sketch output

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 53a74979..8cc7672e 100644
--- a/main.nf
+++ b/main.nf
@@ -1303,7 +1303,7 @@ if (!params.remove_ribo_rna) {
 
       output:
       file(csv) into ch_sourmash_sig_describe_nucleotides
-      set val(sample_id), val(sketch_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide
+      set val(sample_id), val(sketch_id), val("dna"), val(params.ksizes), file(sig) into sourmash_sketches_all_nucleotide
 
       script:
       // Don't calculate DNA signature if this is protein, to minimize disk,
@@ -1369,7 +1369,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
-    set val(sample_id), val(sketch_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide
+    set val(sample_id), val(sketch_id), val(molecule), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
 
 
     script:

From ed5e72b69b654fe4a6f160ab8a386bde313b53a0 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 10:04:25 -0800
Subject: [PATCH 18/49] Add peptide_molecules

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 8cc7672e..da603034 100644
--- a/main.nf
+++ b/main.nf
@@ -1369,7 +1369,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
-    set val(sample_id), val(sketch_id), val(molecule), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
+    set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
 
 
     script:

From 17185024332e8c3ef1c60b22ef099fb654f6c9f9 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 10:28:32 -0800
Subject: [PATCH 19/49] add check for skip_compute in sig merge logic

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index da603034..98850b24 100644
--- a/main.nf
+++ b/main.nf
@@ -1398,7 +1398,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
   sourmash_sketches_peptide = Channel.empty()
 }
 
-if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) {
+if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_merge) {
   // Merge signatures from same sample id and sketch id
 
   sourmash_sketches_nucleotide

From 0029b51a8f924ca0cb8f6c2fc413fc7681d9cd32 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 10:28:57 -0800
Subject: [PATCH 20/49] Add header

---
 main.nf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 98850b24..ec5eb7f8 100644
--- a/main.nf
+++ b/main.nf
@@ -1398,8 +1398,10 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
   sourmash_sketches_peptide = Channel.empty()
 }
 
+// -------------
+// Merge signatures from same sample id and sketch id
+// -------------
 if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_merge) {
-  // Merge signatures from same sample id and sketch id
 
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )

From c04a5a1d63ee04b68f33604eecfc71e4786c7ccf Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 10:31:26 -0800
Subject: [PATCH 21/49] Only mix sketches if not skip_compute

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index ec5eb7f8..1ef63b13 100644
--- a/main.nf
+++ b/main.nf
@@ -1470,7 +1470,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     """
   }
 
-} else {
+} else if (!param.skip_compute) {
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
     .set { sourmash_sketches }

From 5893884a43bdf783b7c6e0fd91bbd8930b431811 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 7 Jan 2021 10:32:55 -0800
Subject: [PATCH 22/49] param --> params

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 1ef63b13..25dd1426 100644
--- a/main.nf
+++ b/main.nf
@@ -1470,7 +1470,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     """
   }
 
-} else if (!param.skip_compute) {
+} else if (!params.skip_compute) {
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
     .set { sourmash_sketches }

From 4118c6ccd687dfbcfca7ea87bc71fc9069840e02 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 8 Jan 2021 17:54:24 -0800
Subject: [PATCH 23/49] Add some projectdir stuff

---
 main.nf | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 25dd1426..07fb1be2 100644
--- a/main.nf
+++ b/main.nf
@@ -1610,6 +1610,8 @@ workflow.onComplete {
     if (!workflow.success) {
         subject = "[nf-core/kmermaid] FAILED: $workflow.runName"
     }
+    projectDir = workflow.projectDir
+
     def email_fields = [:]
     email_fields['version'] = workflow.manifest.version
     email_fields['runName'] = custom_runName ?: workflow.runName
@@ -1656,18 +1658,18 @@ workflow.onComplete {
 
     // Render the TXT template
     def engine = new groovy.text.GStringTemplateEngine()
-    def tf = new File("$projectDir/assets/email_template.txt")
+    def tf = new File("${workflow.projectDir}/assets/email_template.txt")
     def txt_template = engine.createTemplate(tf).make(email_fields)
     def email_txt = txt_template.toString()
 
     // Render the HTML template
-    def hf = new File("$projectDir/assets/email_template.html")
+    def hf = new File("${workflow.projectDir}/assets/email_template.html")
     def html_template = engine.createTemplate(hf).make(email_fields)
     def email_html = html_template.toString()
 
     // Render the sendmail template
     def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
-    def sf = new File("$projectDir/assets/sendmail_template.txt")
+    def sf = new File("$workflow.projectDir/assets/sendmail_template.txt")
     def sendmail_template = engine.createTemplate(sf).make(smail_fields)
     def sendmail_html = sendmail_template.toString()
 

From baa96f8f8455f8c93d66e77fcc206b2c0267340c Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 08:18:53 -0800
Subject: [PATCH 24/49] More projectDir fixes

---
 main.nf | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 07fb1be2..ee236232 100644
--- a/main.nf
+++ b/main.nf
@@ -543,10 +543,11 @@ if (workflow.profile.contains('awsbatch')) {
 }
 
 // Stage config files
-ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true)
+projectDir = workflow.projectDir
+ch_multiqc_config = file("$workflow.projectDir/assets/multiqc_config.yaml", checkIfExists: true)
 ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
-ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true)
-ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true)
+ch_output_docs = file("$workflow.projectDir/docs/output.md", checkIfExists: true)
+ch_output_docs_images = file("$workflow.projectDir/docs/images/", checkIfExists: true)
 
 
 // Header log info
@@ -1668,7 +1669,7 @@ workflow.onComplete {
     def email_html = html_template.toString()
 
     // Render the sendmail template
-    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
+    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$workflow.projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
     def sf = new File("$workflow.projectDir/assets/sendmail_template.txt")
     def sendmail_template = engine.createTemplate(sf).make(smail_fields)
     def sendmail_html = sendmail_template.toString()

From 8ba5db1bb8e7f615eb60e6282bcb017ef9cb90b4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 08:30:07 -0800
Subject: [PATCH 25/49] Do per-ksize sourmash sig merge

---
 main.nf | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index ee236232..958460e4 100644
--- a/main.nf
+++ b/main.nf
@@ -1453,19 +1453,24 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
         }
 
     input:
-    set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge
+    set val(molecule), val(ksizes), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged
-    set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches
+    set val(sketch_id), val(molecule), val(ksizes), val(sketch_value), file(sig) into sourmash_sketches
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
     sig_id = "${cell_id}__${sketch_id}"
     csv = "${sig_id}.csv"
     output_sig = "${sig_id}.sig"
+    KSIZES = ksizes.split(',')
     """
-    sourmash sig merge -o merged.sig ${sigs}
+    for KSIZE in ${KSIZES}; do
+      # Can only merge one kize at a time
+      sourmash sig merge -k ${KSIZE} -o merged_\$KSIZE.sig ${sigs}
+    done
+    sourmash sig cat merged*.sig > ${output_sig}
     sourmash sig rename -o ${output_sig} '${cell_id}'
     sourmash sig describe --csv ${csv} ${output_sig}
     """
@@ -1559,6 +1564,8 @@ if (!params.skip_multiqc){
       publishDir "${params.outdir}/MultiQC", mode: "${params.publish_dir_mode}"
       input:
       file multiqc_config from ch_multiqc_config
+      file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([])
+      file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([])
       file ('fastp/*') from ch_fastp_results.collect().ifEmpty([])
       file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([])
       file ('software_versions/*') from ch_software_versions_yaml.collect()

From c27b2b41a21ce6ccae64f7bc3555e6271ebd0704 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 08:34:48 -0800
Subject: [PATCH 26/49] Add sourmash describe csvs to multiqc

---
 main.nf | 56 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/main.nf b/main.nf
index 958460e4..3dadcef0 100644
--- a/main.nf
+++ b/main.nf
@@ -1397,6 +1397,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
   sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[3].size() > 0 }
 } else {
   sourmash_sketches_peptide = Channel.empty()
+  ch_sourmash_sig_describe_peptides = Channel.empty()
 }
 
 // -------------
@@ -1503,30 +1504,30 @@ if (params.split_kmer){
   }
 // If skip_compute is true, skip compare must be specified as true as well
 if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
-  // Combine peptide and nucleotide sketches
-  sourmash_sketches_nucleotide
-    .collect()
-    // Set as a list so that combine does cartesian product of all signatures
-    .map { it -> [it] }
-    .combine( ch_ksizes_for_compare_nucleotide )
-    .dump( tag: 'sourmash_sketches_nucleotide__ksizes' )
-    .map { x -> [x[0], x[1], 'dna'] }
-    .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' )
-    .set { sourmash_sketches_nucleotide_for_compare }
-
-  sourmash_sketches_peptide
-    .collect()
-    // Set as a list so that combine does cartesian product of all signatures
-    .map { it -> [it] }
-    .combine( ch_ksizes_for_compare_petide )
-    .dump( tag: 'sourmash_sketches_peptide__ksizes' )
-    .combine( ch_peptide_molecules )
-    .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' )
-    .set { sourmash_sketches_peptide_for_compare }
-
-  sourmash_sketches_peptide_for_compare
-    .mix ( sourmash_sketches_nucleotide_for_compare )
-    .set { ch_sourmash_sketches_to_compare }
+  // // Combine peptide and nucleotide sketches
+  // sourmash_sketches_nucleotide
+  //   .collect()
+  //   // Set as a list so that combine does cartesian product of all signatures
+  //   .map { it -> [it] }
+  //   .combine( ch_ksizes_for_compare_nucleotide )
+  //   .dump( tag: 'sourmash_sketches_nucleotide__ksizes' )
+  //   .map { x -> [x[0], x[1], 'dna'] }
+  //   .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' )
+  //   .set { sourmash_sketches_nucleotide_for_compare }
+
+  // sourmash_sketches_peptide
+  //   .collect()
+  //   // Set as a list so that combine does cartesian product of all signatures
+  //   .map { it -> [it] }
+  //   .combine( ch_ksizes_for_compare_petide )
+  //   .dump( tag: 'sourmash_sketches_peptide__ksizes' )
+  //   .combine( ch_peptide_molecules )
+  //   .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' )
+  //   .set { sourmash_sketches_peptide_for_compare }
+
+  // sourmash_sketches_peptide_for_compare
+  //   .mix ( sourmash_sketches_nucleotide_for_compare )
+  //   .set { ch_sourmash_sketches_to_compare }
 
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
@@ -1534,7 +1535,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
     publishDir "${params.outdir}/compare_sketches", mode: 'copy'
 
     input:
-    set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare
+    set file ("*.sig"), val(ksize), val(molecule) from sourmash_sketches
 
     output:
     file(csv)
@@ -1564,8 +1565,9 @@ if (!params.skip_multiqc){
       publishDir "${params.outdir}/MultiQC", mode: "${params.publish_dir_mode}"
       input:
       file multiqc_config from ch_multiqc_config
-      file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([])
-      file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([])
+      file ("sourmash_describe_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([])
+      file ("sourmash_describe_peptides/") from ch_sourmash_sig_describe_peptides.collect().ifEmpty([])
+      file ("sourmash_describe_nucleotides/") from ch_sourmash_sig_describe_nucleotides.collect().ifEmpty([])
       file ('fastp/*') from ch_fastp_results.collect().ifEmpty([])
       file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([])
       file ('software_versions/*') from ch_software_versions_yaml.collect()

From 05f6702df07b2b5afe8669b335eb3b8c00005b8e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 10:09:05 -0800
Subject: [PATCH 27/49] Update ProjectDir

---
 main.nf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 3dadcef0..709a175f 100644
--- a/main.nf
+++ b/main.nf
@@ -544,10 +544,10 @@ if (workflow.profile.contains('awsbatch')) {
 
 // Stage config files
 projectDir = workflow.projectDir
-ch_multiqc_config = file("$workflow.projectDir/assets/multiqc_config.yaml", checkIfExists: true)
+ch_multiqc_config = file("${workflow.projectDir}/assets/multiqc_config.yaml", checkIfExists: true)
 ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
-ch_output_docs = file("$workflow.projectDir/docs/output.md", checkIfExists: true)
-ch_output_docs_images = file("$workflow.projectDir/docs/images/", checkIfExists: true)
+ch_output_docs = file("${workflow.projectDir}/docs/output.md", checkIfExists: true)
+ch_output_docs_images = file("${workflow.projectDir}/docs/images/", checkIfExists: true)
 
 
 // Header log info
@@ -1678,8 +1678,8 @@ workflow.onComplete {
     def email_html = html_template.toString()
 
     // Render the sendmail template
-    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$workflow.projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
-    def sf = new File("$workflow.projectDir/assets/sendmail_template.txt")
+    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
+    def sf = new File("${workflow.projectDir}/assets/sendmail_template.txt")
     def sendmail_template = engine.createTemplate(sf).make(smail_fields)
     def sendmail_html = sendmail_template.toString()
 

From 2fe45236fbe8fd8be64cf2e313ed2aa87f88ca7d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 10:09:25 -0800
Subject: [PATCH 28/49] Properly save translate output

---
 main.nf | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 709a175f..b09ab48f 100644
--- a/main.nf
+++ b/main.nf
@@ -1186,10 +1186,9 @@ if (!params.remove_ribo_rna) {
       publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode,
         saveAs: {
           filename ->
-              if (save_translate_csv && filename.indexOf(".csv") > 0) "description/$filename"
-              if (save_translate_json && filename.indexOf(".json") > 0) "description/$filename"
-              else if (filename.indexOf(".sig") > 0) "sigs/$filename"
-              else null
+              if (save_translate_csv && filename.indexOf(".csv") > 0) "$filename"
+              if (save_translate_json && filename.indexOf(".json") > 0) "$filename"
+              else "$filename"
           }
 
       input:

From 27d95f0381d321d5e234500b3193c4ab7f07301e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 10:09:44 -0800
Subject: [PATCH 29/49] Add dump of sourmash sketches

---
 main.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index b09ab48f..a9cddb4e 100644
--- a/main.nf
+++ b/main.nf
@@ -1326,7 +1326,10 @@ if (!params.remove_ribo_rna) {
         sourmash sig describe --csv ${csv} ${sig}
       """
     }
-    sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[3].size() > 0 }
+    sourmash_sketches_all_nucleotide
+      .filter{ it[3].size() > 0 }
+      .dump ( tag: "sourmash_sketches_all_nucleotide" )
+      .set { sourmash_sketches_nucleotide }
   }
 } else {
   sourmash_sketches_nucleotide = Channel.empty()

From e68db006f15ea0b3f403c4d409b850256f8f6297 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 10:10:08 -0800
Subject: [PATCH 30/49] Fixing sourmash sig merge

---
 main.nf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index a9cddb4e..6a190bb1 100644
--- a/main.nf
+++ b/main.nf
@@ -1456,11 +1456,11 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
         }
 
     input:
-    set val(molecule), val(ksizes), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge
+    set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(molecule), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged
-    set val(sketch_id), val(molecule), val(ksizes), val(sketch_value), file(sig) into sourmash_sketches
+    set val(cell_id), val(sketch_id), val(molecule), val(ksizes), file(output_sig) into sourmash_sketches
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
@@ -1471,7 +1471,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     """
     for KSIZE in ${KSIZES}; do
       # Can only merge one kize at a time
-      sourmash sig merge -k ${KSIZE} -o merged_\$KSIZE.sig ${sigs}
+      sourmash sig merge -k \$KSIZE -o merged_\$KSIZE.sig ${sigs}
     done
     sourmash sig cat merged*.sig > ${output_sig}
     sourmash sig rename -o ${output_sig} '${cell_id}'
@@ -1483,6 +1483,9 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
     .set { sourmash_sketches }
+  ch_sourmash_sig_describe_merged = Channel.empty()
+} else {
+  ch_sourmash_sig_describe_merged = Channel.empty()
 }
 
 if (params.split_kmer){

From 7195eae37965d9a02743f1d4039a001c6ca540b9 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 12:09:25 -0800
Subject: [PATCH 31/49] Add ch_sourmash_sig_describe_nucleotides

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index 6a190bb1..a3561797 100644
--- a/main.nf
+++ b/main.nf
@@ -1335,6 +1335,7 @@ if (!params.remove_ribo_rna) {
   sourmash_sketches_nucleotide = Channel.empty()
   ch_protein_fastas
     .set { ch_protein_seq_to_sketch }
+  ch_sourmash_sig_describe_nucleotides = Channel.empty()
 }
 
 

From 9d090b4e4312e5248c6ae8c4af42d2c23ce1fae2 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 11 Jan 2021 12:13:16 -0800
Subject: [PATCH 32/49] more if/else

---
 main.nf | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/main.nf b/main.nf
index a3561797..4db39184 100644
--- a/main.nf
+++ b/main.nf
@@ -1330,6 +1330,11 @@ if (!params.remove_ribo_rna) {
       .filter{ it[3].size() > 0 }
       .dump ( tag: "sourmash_sketches_all_nucleotide" )
       .set { sourmash_sketches_nucleotide }
+  } else {
+  sourmash_sketches_nucleotide = Channel.empty()
+  ch_protein_fastas
+    .set { ch_protein_seq_to_sketch }
+  ch_sourmash_sig_describe_nucleotides = Channel.empty()
   }
 } else {
   sourmash_sketches_nucleotide = Channel.empty()

From 5bea4c9673520b1724e6dd344c1e61a182a1eed3 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 15 Jan 2021 17:32:21 -0800
Subject: [PATCH 33/49] Update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0699d039..8ca6377a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ barcode fastq
 * Add `sourmash sig merge` for aligned/unaligned signatures from bam files, and add `--skip_sig_merge` option to turn it off
 * Add `--protein_fastas` option for creating sketches of already-translated protein sequences
 * Add `--skip_compare option` to skip `sourmash_compare_sketches` process
+* Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117))
 
 ### `Fixed`
 
@@ -61,3 +62,5 @@ barcode fastq
 ### `Dependencies`
 
 ### `Deprecated`
+
+* Removed ability to specify multiple `--scaled` or `--num-hashes` values to enable merging of signatures
\ No newline at end of file

From 9682b03ee7f52abdf5b73494b4707518726df954 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 15 Jan 2021 18:11:22 -0800
Subject: [PATCH 34/49] Getting "sig merge" to finally run

---
 main.nf | 171 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 100 insertions(+), 71 deletions(-)

diff --git a/main.nf b/main.nf
index 4db39184..9d51b32b 100644
--- a/main.nf
+++ b/main.nf
@@ -475,33 +475,37 @@ if (!have_sketch_value && !params.split_kmer) {
 
 
 
-def make_sketch_id (molecule, ksizes, sketch_value, track_abundance, sketch_style) {
-  if (sketch_style == 'size') {
-    sketch_value = "num_hashes-${sketch_value}"
+// added "_for_id" to all variables to avoid variable scoping errors
+def make_sketch_id (
+  molecule_for_id, ksizes_for_id, sketch_value_for_id, track_abundance_for_id, sketch_style_for_id
+  ) {
+  if (sketch_style_for_id == 'size') {
+    style_value = "num_hashes-${sketch_value_for_id}"
   } else {
-    sketch_value = "scaled-${sketch_value}"
+    style_value = "scaled-${sketch_value_for_id}"
   }
 
-  sketch_id = "molecule-${molecule}__ksize-${ksizes}__${sketch_value}__track_abundance-${track_abundance}"
-  return sketch_id
+  this_sketch_id = "molecule-${molecule_for_id}__ksize-${ksizes_for_id}__${style_value}__track_abundance-${track_abundance_for_id}"
+  return this_sketch_id
 }
 
 // Create the --num-hashes or --scaled flag for sourmash
-def make_sketch_value_flag(sketch_style, sketch_value) {
-  if (sketch_style == "size") {
-    number_flag = "--num-hashes ${sketch_value}"
-  } else if (sketch_style == "scaled" ) {
-    number_flag = "--scaled ${sketch_value}"
+// added "_for_flag" to all variables to avoid variable scoping errors
+def make_sketch_value_flag(sketch_style_for_flag, sketch_value_for_flag) {
+  if (sketch_style_for_flag == "size") {
+    number_flag = "--num-hashes ${sketch_value_for_flag}"
+  } else if (sketch_style_for_flag == "scaled" ) {
+    number_flag = "--scaled ${sketch_value_for_flag}"
   } else {
-    exit 1, "${sketch_style} is not a valid sketch counting style! Only 'scaled' and 'size' are valid"
+    exit 1, "${sketch_style_for_flag} is not a valid sketch counting style! Only 'scaled' and 'size' are valid"
   }
   return number_flag
 }
 
 int bloomfilter_tablesize = Math.round(Float.valueOf(params.bloomfilter_tablesize))
 
-peptide_ksize = params.translate_peptide_ksize
-peptide_molecule = params.translate_peptide_molecule
+translate_peptide_ksize = params.translate_peptide_ksize
+translate_peptide_molecule = params.translate_peptide_molecule
 jaccard_threshold = params.translate_jaccard_threshold
 track_abundance = params.track_abundance
 
@@ -673,7 +677,7 @@ if ( !params.split_kmer && have_sketch_value ) {
   /*
    * Validate sketch sizes
    */
-  process validate_sketch_values {
+  process validate_sketch_value {
       publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode,
       saveAs: {filename ->
           if (filename.indexOf(".txt") > 0) filename
@@ -686,60 +690,53 @@ if ( !params.split_kmer && have_sketch_value ) {
       val sketch_scaled_log2
 
       output:
-      file sketch_values into ch_sketch_values_unparsed
+      file sketch_value into ch_sketch_value_unparsed
       file sketch_style into ch_sketch_style_unparsed
 
       script:
       sketch_style = "sketch_style.txt"
-      sketch_values = 'sketch_values.txt'
+      sketch_value = 'sketch_value.txt'
       """
-      validate_sketch_values.py \\
+      validate_sketch_value.py \\
         --sketch_num_hashes ${sketch_num_hashes} \\
         --sketch_num_hashes_log2 ${sketch_num_hashes_log2} \\
         --sketch_scaled ${sketch_scaled} \\
         --sketch_scaled_log2 ${sketch_scaled_log2} \\
-        --output ${sketch_values} \\
+        --output ${sketch_value} \\
         --sketch_style ${sketch_style}
       """
   }
 
   // Parse sketch style into value
-  ch_sketch_style_unparsed
+  sketch_style_parsed = ch_sketch_style_unparsed
     .splitText()
     .dump ( tag: 'ch_sketch_style' )
     .map { it -> it.replaceAll('\\n', '' ) }
-    // .first()
+    .first()
     .dump ( tag: 'sketch_style_parsed' )
-    .into { ch_sketch_style_for_nucleotides; ch_sketch_style_for_proteins }
+    .collect ()
+  // get first item of returned array from .collect()
+  // sketch_style_parsed = sketch_style_parsed[0]
+    // .into { ch_sketch_style_for_nucleotides; ch_sketch_style_for_proteins }
   // sketch_style = sketch_styles[0]
   // println "sketch_style_parsed: ${sketch_style_parsed}"
   // println "sketch_style: ${sketch_style}"
 
   // Parse file into values
-  ch_sketch_values_unparsed
+  sketch_value_parsed = ch_sketch_value_unparsed
     .splitText()
     .map { it -> it.replaceAll('\\n', '')}
-    .dump ( tag : 'sketch_values_parsed' )
-    .into { ch_sketch_values_for_proteins; ch_sketch_values_for_dna }
+    .first()
+    .dump ( tag : 'sketch_value_parsed' )
+    .collect()
+  // get first item of returned array from .collect()
+  // sketch_value_parsed = sketch_value_parsed[0]
+    // .into { ch_sketch_value_for_proteins; ch_sketch_value_for_dna }
 
 }
 
 // Combine sketch values with ksize and molecule types
 
-// ch_peptide_molecules
-  // .combine ( ch_ksizes_for_proteins )
-ch_sketch_style_for_proteins
-  // .combine ( ch_sketch_style_for_proteins )
-  .combine ( ch_sketch_values_for_proteins )
-  .set { ch_sourmash_protein_sketch_params }
-
-
-// ch_ksizes_for_dna
-  // .combine ( ch_sketch_style_for_nucleotides )
-ch_sketch_style_for_nucleotides
-  .combine ( ch_sketch_values_for_dna )
-  .set { ch_sourmash_dna_sketch_params }
-
 
 
 if (params.reference_proteome_fasta){
@@ -751,19 +748,19 @@ if (params.reference_proteome_fasta){
 
     input:
     file(peptides) from ch_reference_proteome_fasta
-    peptide_ksize
-    peptide_molecule
+    translate_peptide_ksize
+    translate_peptide_molecule
 
     output:
     set val(bloom_id), val(peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter
 
     script:
-    bloom_id = "molecule-${peptide_molecule}_ksize-${peptide_ksize}"
+    bloom_id = "molecule-${peptide_molecule}_ksize-${translate_peptide_ksize}"
     """
     sencha index \\
       --tablesize ${bloomfilter_tablesize} \\
-      --molecule ${peptide_molecule} \\
-      --peptide-ksize ${peptide_ksize} \\
+      --molecule ${translate_peptide_molecule} \\
+      --peptide-ksize ${translate_peptide_ksize} \\
       --save-as ${peptides.simpleName}__${bloom_id}.bloomfilter \\
       ${peptides}
     """
@@ -1282,10 +1279,6 @@ if (!params.remove_ribo_rna) {
 
       }
   } else if (!params.skip_compute) {
-    ch_sourmash_dna_sketch_params
-      .combine ( ch_reads_to_sketch )
-      .dump ( tag: 'ch_sourmash_dna_params_with_reads' )
-      .set { ch_sourmash_sketch_params_with_reads }
 
     process sourmash_compute_sketch_fastx_nucleotide {
       tag "${sig_id}"
@@ -1298,8 +1291,10 @@ if (!params.remove_ribo_rna) {
           }
 
       input:
-      set val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketch_params_with_reads
       val track_abundance
+      val sketch_value_parsed
+      val sketch_style_parsed
+      set val(sample_id), file(reads) from ch_reads_to_sketch
 
       output:
       file(csv) into ch_sourmash_sig_describe_nucleotides
@@ -1308,8 +1303,14 @@ if (!params.remove_ribo_rna) {
       script:
       // Don't calculate DNA signature if this is protein, to minimize disk,
       // memory and IO requirements in the future
-      sketch_id = make_sketch_id("dna", params.ksizes, sketch_value, track_abundance, sketch_style)
-      sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value)
+      sketch_id = make_sketch_id(
+        "dna", 
+        params.ksizes, 
+        sketch_value_parsed[0], 
+        track_abundance, 
+        sketch_style_parsed[0]
+      )
+      sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
       track_abundance_flag = track_abundance ? '--track-abundance' : ''
       sig_id = "${sample_id}__${sketch_id}"
       sig = "${sig_id}.sig"
@@ -1357,10 +1358,6 @@ if (!have_nucleotide_input) {
 
 
 if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
-  ch_sourmash_protein_sketch_params
-    .combine ( ch_protein_seq_to_sketch )
-    .dump ( tag: 'ch_sourmash_protein_params_with_reads' )
-    .set { ch_sourmash_protein_sketch_params_with_reads }
 
   process sourmash_compute_sketch_fastx_peptide {
     tag "${sig_id}"
@@ -1374,16 +1371,24 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     input:
     val track_abundance
-    set val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads
+    val sketch_value_parsed
+    val sketch_style_parsed
+    set val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
     set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
 
-
     script:
-    sketch_id = make_sketch_id(peptide_molecules_comma_separated, params.ksizes, sketch_value, track_abundance, sketch_style)
-    sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value)
+    sketch_id = make_sketch_id(
+      peptide_molecules_comma_separated, 
+      params.ksizes, 
+      sketch_value_parsed[0], 
+      track_abundance, 
+      sketch_style_parsed[0]
+    )
+
+    sketch_value_flag = make_sketch_value_flag(sketch_style[0], sketch_value[0])
     track_abundance_flag = track_abundance ? '--track-abundance' : ''
     sig_id = "${sample_id}__${sketch_id}"
     sig = "${sig_id}.sig"
@@ -1419,7 +1424,9 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     .set { ch_sourmash_sketches_mixed }
 
   ch_fastq_id_to_cell_id_is_aligned
+    .dump( tag: 'ch_fastq_id_to_cell_id_is_aligned' )
     .combine ( ch_sourmash_sketches_mixed, by: 0 )
+    .unique()
     .dump( tag: 'fastq_id_to_cells__combine__sketches' )
     // [DUMP: fastq_id_to_cells__combine__sketches] 
     // ['mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT', 
@@ -1439,16 +1446,19 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     .groupTuple( by: [1, 3, 4, 5] )
     // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] 
     // [ 
-    //   ['mouse_lung__aligned__AAAGATGCAGATCTGT', 
-    //    'mouse_lung__aligned__AAAGATGCAGATCTGT'], 
-    //   mouse_lung__AAAGATGCAGATCTGT, 
-    //   ['aligned', 'aligned'], 
-    //   molecule-dna__ksize-15__scaled-5__track_abundance-false, 
-    //   'dna', '15', 
-    //   [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, 
+    //  it[0]: ['mouse_lung__aligned__AAAGATGCAGATCTGT', 
+    //          'mouse_lung__aligned__AAAGATGCAGATCTGT'], 
+    //  it[1]: mouse_lung__AAAGATGCAGATCTGT, 
+    //  it[2]: ['aligned', 'aligned'], 
+    //  it[3]: molecule-dna__ksize-15__scaled-5__track_abundance-false, 
+    //  it[4]: 'dna', 
+    //  it[5]: '15', 
+    //  it[6]: [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, 
     //    mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]
     // ]
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
+    .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6].unique() ] }
+    .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' )
     .set { ch_sourmash_sketches_to_merge }
 
   process sourmash_sig_merge {
@@ -1472,15 +1482,34 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
     sig_id = "${cell_id}__${sketch_id}"
     csv = "${sig_id}.csv"
+    concatenated_sig = 'concatenated.sig'
     output_sig = "${sig_id}.sig"
-    KSIZES = ksizes.split(',')
     """
-    for KSIZE in ${KSIZES}; do
+    # Parse ksizes in bash
+    KSIZES=\$(echo ${ksizes} | tr ',' ' ')
+
+    # Iterate over ksizes
+    for KSIZE in \$KSIZES; do
       # Can only merge one kize at a time
-      sourmash sig merge -k \$KSIZE -o merged_\$KSIZE.sig ${sigs}
+      sourmash sig merge \
+          --${molecule} \
+          -k \$KSIZE \
+          -o merged_\$KSIZE.sig \
+          ${sigs}
+
+      # "sig merge" gives default automatic name of md5sum which isn't helpful
+      # --> rename to cell id
+      sourmash sig rename \
+          --${molecule} \
+          -k \$KSIZE \
+          -o merged_renamed_\$KSIZE.sig \
+          merged_\$KSIZE.sig '${cell_id}'
     done
-    sourmash sig cat merged*.sig > ${output_sig}
-    sourmash sig rename -o ${output_sig} '${cell_id}'
+
+    # Concatenate all ksizes together
+    sourmash sig cat merged_renamed*.sig > ${output_sig}
+
+    # Add csv showing number of hashes at each ksize
     sourmash sig describe --csv ${csv} ${output_sig}
     """
   }

From 76b2ed7c2e1df0cbcf06b30c0652b8fe4af84a82 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 15 Jan 2021 18:11:33 -0800
Subject: [PATCH 35/49] Add option to skip sig merge

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e5d219fc..4c5634ee 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,6 +67,7 @@ jobs:
           - "test --sketch_scaled 2,4 --sketch_num_hashes_log2 false"
           - "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false"
           - "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false"
+          - "test_bam --skip_sig_merge"
           - "test_bam --write_barcodes_meta_csv false"
           - "test_bam --barcodes_file false --rename_10x_barcodes false"
           - "test_bam --rename_10x_barcodes false"

From 3c95f82b53dfddd5fb8167de83dbce4ac7d3874e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 15 Jan 2021 18:11:52 -0800
Subject: [PATCH 36/49] Update validate_sketch_value to only allow a single
 value

---
 bin/validate_sketch_value.py  | 144 ++++++++++++++++++++++++++++++++++
 bin/validate_sketch_values.py |  93 ----------------------
 2 files changed, 144 insertions(+), 93 deletions(-)
 create mode 100755 bin/validate_sketch_value.py
 delete mode 100755 bin/validate_sketch_values.py

diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py
new file mode 100755
index 00000000..c9e3fe06
--- /dev/null
+++ b/bin/validate_sketch_value.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from collections import OrderedDict, defaultdict, Counter
+import logging
+import argparse
+import glob
+import os
+import sys
+
+
+# Create a logger
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger(__file__)
+logger.setLevel(logging.INFO)
+
+
+def get_sketch_value(value, value_log2):
+    try:
+        if value:
+            sketch_value = int(value)
+        else:
+            sketch_value = 2 ** int(value_log2)
+    except ValueError:
+        logger.exception(
+            "Can only supply a single value to --sketch_num_hashes, "
+            "--sketch_num_hashes_log2, --sketch_scaled, "
+            "--sketch_scaled_log2 "
+        )
+    return sketch_value
+
+
+def value_or_bool(value):
+    if value == "false":
+        return False
+    if value == "true":
+        logger.exception(
+            "Must set a value for the --sketch_num_hashes, "
+            "--sketch_num_hashes_log2, --sketch_scaled, "
+            "--sketch_scaled_log2 options! Cannot simply set the "
+            "flag. E.g. '--sketch_num_hashes 5' is valid but "
+            "'--sketch_num_hashes' on its own is not"
+        )
+        sys.exit(1)
+    else:
+        return value
+
+
+def main(
+    sketch_num_hashes,
+    sketch_num_hashes_log2,
+    sketch_scaled,
+    sketch_scaled_log2,
+    out,
+    sketch_style,
+):
+    sketch_num_hashes = value_or_bool(sketch_num_hashes)
+    sketch_num_hashes_log2 = value_or_bool(sketch_num_hashes_log2)
+    sketch_scaled = value_or_bool(sketch_scaled)
+    sketch_scaled_log2 = value_or_bool(sketch_scaled_log2)
+
+    using_size = sketch_num_hashes or sketch_num_hashes_log2
+    using_scaled = sketch_scaled or sketch_scaled_log2
+
+    if using_size and using_scaled:
+        logger.exception(
+            "Cannot specify both sketch scales and sizes! Can only"
+            " use one of --sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, "
+            "--sketch_scaled_log2. Exiting."
+        )
+        sys.exit(1)
+
+    if using_size:
+        if sketch_num_hashes and sketch_num_hashes_log2:
+            logger.exception(
+                "Cannot specify both --sketch_num_hashes and --sketch_num_hashes_log2! Exiting."
+            )
+            sys.exit(1)
+        sketch_value = get_sketch_value(sketch_num_hashes, sketch_num_hashes_log2)
+        with open(sketch_style, "w") as f:
+            f.write("size")
+    elif using_scaled:
+        if sketch_scaled and sketch_scaled_log2:
+            logger.exception(
+                "Cannot specify both --sketch_scaled and --sketch_scaled_log2! Exiting."
+            )
+            sys.exit(1)
+        sketch_value = get_sketch_value(sketch_scaled, sketch_scaled_log2)
+        with open(sketch_style, "w") as f:
+            f.write("scaled")
+
+    else:
+        logger.info(
+            "Did not specify a sketch size or scale with any of "
+            "--sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, --sketch_scaled_log2! "
+            "Falling back on sourmash's default of --sketch_scaled 500"
+        )
+        sketch_value = 500
+
+    with open(out, "w") as f:
+        f.write(f"{sketch_value}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""Ensure that sketch sizes/scaleds provided are valid"""
+    )
+    parser.add_argument(
+        "--sketch_num_hashes", type=str, help="Flat size of the sketches"
+    )
+    parser.add_argument(
+        "--sketch_num_hashes_log2", type=str, help="Flat size of the sketches, log2"
+    )
+    parser.add_argument(
+        "--sketch_scaled", type=str, help="Fraction of total observed hashes to observe"
+    )
+    parser.add_argument(
+        "--sketch_scaled_log2",
+        type=str,
+        help="Fraction of total observed hashes to observe, log2",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        dest="output",
+        default="sketch_value.txt",
+        type=str,
+        help="file with output",
+    )
+    parser.add_argument(
+        "--sketch_style",
+        default="sketch_style.txt",
+        type=str,
+        help="file indicating 'size' or 'scaled'",
+    )
+
+    args = parser.parse_args()
+    main(
+        args.sketch_num_hashes,
+        args.sketch_num_hashes_log2,
+        args.sketch_scaled,
+        args.sketch_scaled_log2,
+        args.output,
+        args.sketch_style,
+    )
diff --git a/bin/validate_sketch_values.py b/bin/validate_sketch_values.py
deleted file mode 100755
index 1310c94f..00000000
--- a/bin/validate_sketch_values.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-from collections import OrderedDict, defaultdict, Counter
-import logging
-import argparse
-import glob
-import os
-import sys
-
-
-# Create a logger
-logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
-logger = logging.getLogger(__file__)
-logger.setLevel(logging.INFO)
-
-
-def get_sketch_values(values, values_log2):
-    if values:
-        sketch_values = [int(x) for x in values.split(',')]
-    else:
-        sketch_values = [2 ** int(x) for x in values_log2.split(',')]
-    return sketch_values
-
-
-def value_or_bool(value):
-    if value == 'false':
-        return False
-    if value == 'true':
-        logger.exception("Must set a value for the --sketch_num_hashes, "
-                        "--sketch_num_hashes_log2, --sketch_scaled, "
-                        "--sketch_scaled_log2 options! Cannot simply set the "
-                        "flag. E.g. '--sketch_num_hashes 5' is valid but "
-                        "'--sketch_num_hashes' on its own is not")
-        sys.exit(1)
-    else:
-        return value
-
-
-def main(sketch_num_hashes, sketch_num_hashes_log2, sketch_scaled, sketch_scaled_log2, out,
-         sketch_style):
-    sketch_num_hashes = value_or_bool(sketch_num_hashes)
-    sketch_num_hashes_log2 = value_or_bool(sketch_num_hashes_log2)
-    sketch_scaled = value_or_bool(sketch_scaled)
-    sketch_scaled_log2 = value_or_bool(sketch_scaled_log2)
-
-    using_size = sketch_num_hashes or sketch_num_hashes_log2
-    using_scaled = sketch_scaled or sketch_scaled_log2
-
-    if using_size and using_scaled:
-        logger.exception("Cannot specify both sketch scales and sizes! Can only"
-                         " use one of --sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, "
-                         "--sketch_scaled_log2. Exiting.")
-        sys.exit(1)
-
-
-    if using_size:
-        if sketch_num_hashes and sketch_num_hashes_log2:
-            logger.exception("Cannot specify both --sketch_num_hashes and --sketch_num_hashes_log2! Exiting.")
-            sys.exit(1)
-        sketch_values = get_sketch_values(sketch_num_hashes, sketch_num_hashes_log2)
-        with open(sketch_style, 'w') as f:
-            f.write('size')
-    elif using_scaled:
-        if sketch_scaled and sketch_scaled_log2:
-            logger.exception("Cannot specify both --sketch_scaled and --sketch_scaled_log2! Exiting.")
-            sys.exit(1)
-        sketch_values = get_sketch_values(sketch_scaled, sketch_scaled_log2)
-        with open(sketch_style, 'w') as f:
-            f.write('scaled')
-
-    else:
-        logger.info("Did not specify a sketch size or scale with any of "
-                    "--sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, --sketch_scaled_log2! "
-                    "Falling back on sourmash's default of --sketch_scaled 500")
-        sketch_values = [500]
-
-    with open(out, 'w') as f:
-        for number in sketch_values:
-            f.write(f"{number}\n")
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="""Ensure that sketch sizes/scaleds provided are valid""")
-    parser.add_argument("--sketch_num_hashes", type=str, help="Flat size of the sketches")
-    parser.add_argument("--sketch_num_hashes_log2", type=str, help="Flat size of the sketches, log2")
-    parser.add_argument("--sketch_scaled", type=str, help="Fraction of total observed hashes to observe")
-    parser.add_argument("--sketch_scaled_log2", type=str, help="Fraction of total observed hashes to observe, log2")
-    parser.add_argument("-o", "--output", dest='output', default='sketch_values.txt',
-        type=str, help="file with output")
-    parser.add_argument("--sketch_style", default='sketch_style.txt', type=str, help="file indicating 'size' or 'scaled'")
-
-    args = parser.parse_args()
-    main(args.sketch_num_hashes, args.sketch_num_hashes_log2, args.sketch_scaled,
-         args.sketch_scaled_log2, args.output, args.sketch_style)

From 1321968a22433573b9667fb639634a88d50be7f0 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Fri, 15 Jan 2021 18:12:06 -0800
Subject: [PATCH 37/49] Change sketch values to single value

---
 conf/test.config                | 2 +-
 conf/test_bam.config            | 2 +-
 conf/test_fastas.config         | 2 +-
 conf/test_full.config           | 2 +-
 conf/test_protein_fastas.config | 2 +-
 conf/test_remove_ribo.config    | 2 +-
 conf/test_tenx_tgz.config       | 2 +-
 conf/test_translate.config      | 2 +-
 conf/test_translate_bam.config  | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index f4b841c7..a4c9e73a 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -18,7 +18,7 @@ params {
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
diff --git a/conf/test_bam.config b/conf/test_bam.config
index ac4e8038..02f53393 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -20,7 +20,7 @@ params {
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
   ksizes = '15,21'
-  sketch_scaled = '2,5'
+  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index 7aede988..83e31e90 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -18,7 +18,7 @@ params {
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_num_hashes_log2 = 4
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
diff --git a/conf/test_full.config b/conf/test_full.config
index 7d612b9d..69bb60f6 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -13,7 +13,7 @@ params {
 
   // Input data for full size test
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_num_hashes_log2 = 4
   molecules = 'dna,protein,dayhoff'
   input_paths = [
     ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index c3f19571..fa11209e 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -27,7 +27,7 @@ params {
 
   // Sketch Parameters
   ksizes = '9,15'
-  sketch_num_hashes = '2,4'
+  sketch_num_hashes = 4
   molecules = 'protein,dayhoff,hp'
   read_pairs = false
 
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index dad49362..246a8e40 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -18,7 +18,7 @@ params {
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_num_hashes_log2 = 4
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index cfa823e3..d1ee07e0 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -21,7 +21,7 @@ params {
   ]
   // Sketch Parameters
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_num_hashes_log2 = 4
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
diff --git a/conf/test_translate.config b/conf/test_translate.config
index 5f241d1c..20e04f3b 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -18,7 +18,7 @@ params {
   fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa"
   // Sketch Parameters
   ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
+  sketch_num_hashes_log2 = 4
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
 
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index 1e29b02d..d8a1d82c 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -19,7 +19,7 @@ params {
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
   ksizes = '15,21'
-  sketch_scaled = '2,5'
+  sketch_scaled = 5
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"

From dc0ecdd1b4507275d255e385c55d731add332d91 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Sat, 16 Jan 2021 23:41:29 -0800
Subject: [PATCH 38/49] peptide_molecule --> translate_peptide_molecule

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 9d51b32b..7ac47075 100644
--- a/main.nf
+++ b/main.nf
@@ -752,10 +752,10 @@ if (params.reference_proteome_fasta){
     translate_peptide_molecule
 
     output:
-    set val(bloom_id), val(peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter
+    set val(bloom_id), val(translate_peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter
 
     script:
-    bloom_id = "molecule-${peptide_molecule}_ksize-${translate_peptide_ksize}"
+    bloom_id = "molecule-${translate_peptide_molecule}_ksize-${translate_peptide_ksize}"
     """
     sencha index \\
       --tablesize ${bloomfilter_tablesize} \\

From 1fe32b1dceeae558dd98c6c7480033a43e1d94dd Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 18 Jan 2021 11:36:28 -0800
Subject: [PATCH 39/49] add "translate_" to peptide ksize and jaccard threshold

---
 .vscode/settings.json | 3 +++
 main.nf               | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..de288e1e
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 7ac47075..9675e7a6 100644
--- a/main.nf
+++ b/main.nf
@@ -506,7 +506,7 @@ int bloomfilter_tablesize = Math.round(Float.valueOf(params.bloomfilter_tablesiz
 
 translate_peptide_ksize = params.translate_peptide_ksize
 translate_peptide_molecule = params.translate_peptide_molecule
-jaccard_threshold = params.translate_jaccard_threshold
+translate_jaccard_threshold = params.translate_jaccard_threshold
 track_abundance = params.track_abundance
 
 // Tenx parameters
@@ -1213,8 +1213,8 @@ if (!params.remove_ribo_rna) {
         --noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\
         ${csv_flag} \\
         ${json_flag} \\
-        --jaccard-threshold ${jaccard_threshold} \\
-        --peptide-ksize ${peptide_ksize} \\
+        --jaccard-threshold ${translate_jaccard_threshold} \\
+        --peptide-ksize ${translate_peptide_ksize} \\
         --peptides-are-bloom-filter \\
         ${bloom_filter} \\
         ${reads} > ${sample_id}__coding_reads_peptides.fasta

From 1297886c640bd2d25bd5cf43b44269246f0bb0e4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 8 Mar 2021 17:05:02 -0800
Subject: [PATCH 40/49] Do sig merge on individual moltypes

---
 main.nf | 114 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 78 insertions(+), 36 deletions(-)

diff --git a/main.nf b/main.nf
index 9675e7a6..bf489b03 100644
--- a/main.nf
+++ b/main.nf
@@ -1228,18 +1228,22 @@ if (!params.remove_ribo_rna) {
     // it[1] = sequence fasta file
     ch_translated_protein_seqs
       .mix ( ch_protein_fastas )
+      .dump ( tag: 'ch_protein_seq_to_sketch_from_translate' )
       .set { ch_protein_seq_to_sketch }
     // Remove empty files
     // it[0] = sample bloom id
     // it[1] = sequence fasta file
     ch_noncoding_nucleotides_nonempty =  ch_noncoding_nucleotides_potentially_empty.filter{ it[1].size() > 0 }
-    ch_translatable_nucleotide_seqs.filter{ it[1].size() > 0 }
-      .mix( ch_noncoding_nucleotides_nonempty)
+    ch_translatable_nucleotide_seqs
+      .dump( tag: 'ch_translatable_nucleotide_seqs' )
+      .filter{ it[1].size() > 0 }
+      .dump ( tag: 'ch_reads_to_sketch__from_translate' )
       .set { ch_reads_to_sketch }
 
   } else {
     // Send reads directly into coding/noncoding
     ch_reads_to_translate
+      .dump ( tag: 'ch_reads_to_sketch__no_translation' )
       .set{ ch_reads_to_sketch }
   }
 
@@ -1373,11 +1377,11 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
     val track_abundance
     val sketch_value_parsed
     val sketch_style_parsed
-    set val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads
+    set val(sample_id), file(reads) from ch_protein_seq_to_sketch
 
     output:
     file(csv) into ch_sourmash_sig_describe_peptides
-    set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
+    set val(sample_id), val(sketch_id), val(peptide_molecules_comma_separated), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide
 
     script:
     sketch_id = make_sketch_id(
@@ -1388,7 +1392,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
       sketch_style_parsed[0]
     )
 
-    sketch_value_flag = make_sketch_value_flag(sketch_style[0], sketch_value[0])
+    sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
     track_abundance_flag = track_abundance ? '--track-abundance' : ''
     sig_id = "${sample_id}__${sketch_id}"
     sig = "${sig_id}.sig"
@@ -1445,22 +1449,37 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig
     .groupTuple( by: [1, 3, 4, 5] )
     // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] 
-    // [ 
-    //  it[0]: ['mouse_lung__aligned__AAAGATGCAGATCTGT', 
-    //          'mouse_lung__aligned__AAAGATGCAGATCTGT'], 
-    //  it[1]: mouse_lung__AAAGATGCAGATCTGT, 
-    //  it[2]: ['aligned', 'aligned'], 
-    //  it[3]: molecule-dna__ksize-15__scaled-5__track_abundance-false, 
-    //  it[4]: 'dna', 
-    //  it[5]: '15', 
-    //  it[6]: [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, 
-    //    mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]
+    // [
+    //   ['mouse_brown_fat_ptprc_plus_unaligned__aligned__GCGCAGTCATGCCTTC'], 
+    //   mouse_brown_fat_ptprc_plus_unaligned__GCGCAGTCATGCCTTC, 
+    //   ['aligned'], 
+    //   molecule-dna__ksize-3,9__scaled-2__track_abundance-false, 
+    //   'dna', 
+    //    '3,9', 
+    //    [mouse_brown_fat_ptprc_plus_unaligned__aligned__GCGCAGTCATGCCTTC__molecule-dna__ksize-3,9__scaled-2__track_abundance-false.sig]
     // ]
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
-    .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6].unique() ] }
+    .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6]] }
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' )
+    .branch {
+      to_merge: it[1].length() > 1
+      // only one of aligned or unaligned reads here
+      // No merging necessary
+      single: it[1].length() == 1
+    }
+    .set { ch_sourmash_sketches_branched }
+
+  ch_sourmash_sketches_branched
+    .to_merge
+    .dump ( tag: 'ch_sourmash_sketches_to_merge' )
     .set { ch_sourmash_sketches_to_merge }
 
+  ch_sourmash_sketches_branched
+    .single
+    .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] }
+    .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' )
+    .set { ch_sourmash_sketches_to_mix_with_merged }
+
   process sourmash_sig_merge {
     tag "${sig_id}"
     label "low_memory"
@@ -1472,38 +1491,47 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
         }
 
     input:
-    set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(molecule), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge
+    set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(moltypes), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged
-    set val(cell_id), val(sketch_id), val(molecule), val(ksizes), file(output_sig) into sourmash_sketches
+    set val(cell_id), val(sketch_id), val(molecules), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
     sig_id = "${cell_id}__${sketch_id}"
     csv = "${sig_id}.csv"
-    concatenated_sig = 'concatenated.sig'
     output_sig = "${sig_id}.sig"
+    println(moltypes)
     """
+    ## --- Doing this big crazy loop in bash because the sigfiles are so small, it's not --- ##
+    ## --- worth spinning up docker images for each one, so doing within bash instead    --- ##
     # Parse ksizes in bash
     KSIZES=\$(echo ${ksizes} | tr ',' ' ')
 
+    # Parse molecule types in bash
+    MOLTYPES=\$(echo ${moltypes} | tr ',' ' ')
+
+
     # Iterate over ksizes
-    for KSIZE in \$KSIZES; do
-      # Can only merge one kize at a time
-      sourmash sig merge \
-          --${molecule} \
-          -k \$KSIZE \
-          -o merged_\$KSIZE.sig \
-          ${sigs}
-
-      # "sig merge" gives default automatic name of md5sum which isn't helpful
-      # --> rename to cell id
-      sourmash sig rename \
-          --${molecule} \
-          -k \$KSIZE \
-          -o merged_renamed_\$KSIZE.sig \
-          merged_\$KSIZE.sig '${cell_id}'
+    for MOLTYPE in \$MOLTYPES; do
+      MOLTYPE_FLAG=\$(echo -n "--\$MOLTYPE")
+      for KSIZE in \$KSIZES; do
+        # Can only merge one kize at a time
+        sourmash sig merge \\
+            \$MOLTYPE_FLAG \\
+            -k \$KSIZE \\
+            -o merged_\$MOLTYPE\_\$KSIZE.sig \\
+            ${sigs}
+
+        # "sig merge" gives default automatic name of md5sum which isn't helpful
+        # --> rename to cell id
+        sourmash sig rename \\
+            \$MOLTYPE_FLAG \\
+            -k \$KSIZE \\
+            -o merged_renamed_\$MOLTYPE\_\$KSIZE.sig \\
+            merged_\$MOLTYPE\_\$KSIZE.sig '${cell_id}'
+      done
     done
 
     # Concatenate all ksizes together
@@ -1517,7 +1545,8 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
 } else if (!params.skip_compute) {
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
-    .set { sourmash_sketches }
+    .dump ( tag: 'skip_merge__ch_sourmash_sketches_to_comapre' )
+    .set { ch_sourmash_sketches_to_comapre }
   ch_sourmash_sig_describe_merged = Channel.empty()
 } else {
   ch_sourmash_sig_describe_merged = Channel.empty()
@@ -1569,13 +1598,26 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   //   .mix ( sourmash_sketches_nucleotide_for_compare )
   //   .set { ch_sourmash_sketches_to_compare }
 
+  ch_sourmash_sketches_merged
+    // Drop first index which is the cell id
+    .map { [it[1], it[2].split(","), it[3].split(","), it[4]] }
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
+    .groupTuple(by: [0, 1, 2])
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple' )
+    .transpose()
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple__transpose' )
+    .dump(tag: 'ch_sourmash_sketches_to_compare' )
+    .set { ch_sourmash_sketches_to_compare }
+
+  ch_sourmash_sketches_merged_to_view
+    .dump( tag: "ch_sourmash_sketches_to_view" )
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
     tag "${sketch_id}"
     publishDir "${params.outdir}/compare_sketches", mode: 'copy'
 
     input:
-    set file ("*.sig"), val(ksize), val(molecule) from sourmash_sketches
+    set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare
 
     output:
     file(csv)

From d8e764b8ae68335d7c990a3708ae17341ab47a0d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 8 Mar 2021 17:05:19 -0800
Subject: [PATCH 41/49] Add test_sig_merge

---
 conf/test_sig_merge.config | 33 +++++++++++++++++++++++++++++++++
 nextflow.config            |  1 +
 2 files changed, 34 insertions(+)
 create mode 100644 conf/test_sig_merge.config

diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
new file mode 100644
index 00000000..3bd3f29d
--- /dev/null
+++ b/conf/test_sig_merge.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------------------------
+ *  Nextflow config file for running tests with bam, translate
+ * -----------------------------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test_translate_bam.config
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+  // Input data
+  bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
+    'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
+  // Sketch Parameters
+  ksizes = '3,9'
+  sketch_scaled = 2
+  molecules = 'dna,protein,dayhoff'
+  read_pairs = false
+  save_fastas = "fastas"
+  write_barcode_meta_csv = "metadata.csv"
+  // For bam, each fasta record represents each barcode and each should have a signature
+  // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
+  tenx_min_umi_per_cell = 2
+
+  reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
+  bloomfilter_tablesize = '1e6'
+}
diff --git a/nextflow.config b/nextflow.config
index 500b4095..857fa62e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -151,6 +151,7 @@ profiles {
   test_fastas { includeConfig 'conf/test_fastas.config' }
   test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' }
   test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' }
+  test_sig_merge { includeConfig 'conf/test_sig_merge.config' }
   test_tenx_tgz { includeConfig 'conf/test_tenx_tgz.config' }
   test_translate { includeConfig 'conf/test_translate.config' }
   test_translate_bam { includeConfig 'conf/test_translate_bam.config' }

From 232ac0d2e1c4a1dc9284f1354be1d32a2fe8d4b3 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 8 Mar 2021 17:05:47 -0800
Subject: [PATCH 42/49] Add test_sig_merge to CI

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4c5634ee..c41b9629 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,6 +74,7 @@ jobs:
           - "test_fastas"
           - "test_protein_fastas"
           - "test_remove_ribo"
+          - "test_sig_merge"
           - "test_tenx_tgz"
           - "test_translate"
           - "test_translate_bam"

From 253fa83cab1e9c6042e52c5ba7b844edb877b1f4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 8 Mar 2021 17:05:59 -0800
Subject: [PATCH 43/49] Don't allow multiple sketch values

---
 bin/validate_sketch_value.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py
index c9e3fe06..d96cff7f 100755
--- a/bin/validate_sketch_value.py
+++ b/bin/validate_sketch_value.py
@@ -17,8 +17,18 @@
 def get_sketch_value(value, value_log2):
     try:
         if value:
+            if "," in value:
+                logger.exception(
+                    f"Can only provide a single number to --sketch_num_hashes or"
+                    f" --sketch_scaled. Provided '{value}"
+                )
             sketch_value = int(value)
         else:
+            if "," in value_log2:
+                logger.exception(
+                    f"Can only provide a single number to --sketch_num_hashes_log2 or"
+                    f" --sketch_scaled_log2. Provided '{value_log2}"
+                )
             sketch_value = 2 ** int(value_log2)
     except ValueError:
         logger.exception(

From 67aec5eac11e8381a1082a868f9168025b48ca06 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 8 Mar 2021 17:06:17 -0800
Subject: [PATCH 44/49] Reduce bloom filter table size

---
 conf/test_translate_bam.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index d8a1d82c..33254ef3 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -29,7 +29,7 @@ params {
   tenx_min_umi_per_cell = 5
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
-  bloomfilter_tablesize = '1e8'
+  bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
 }

From 8c99f9f635d48db5452d1eae472bed21c0fe0846 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 10:39:11 -0800
Subject: [PATCH 45/49] Sig merge is working!

---
 main.nf | 74 ++++++++++++++++-----------------------------------------
 1 file changed, 21 insertions(+), 53 deletions(-)

diff --git a/main.nf b/main.nf
index bf489b03..bb09aa4e 100644
--- a/main.nf
+++ b/main.nf
@@ -1461,24 +1461,18 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' )
     .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6]] }
     .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' )
-    .branch {
-      to_merge: it[1].length() > 1
-      // only one of aligned or unaligned reads here
-      // No merging necessary
-      single: it[1].length() == 1
-    }
-    .set { ch_sourmash_sketches_branched }
-
-  ch_sourmash_sketches_branched
-    .to_merge
-    .dump ( tag: 'ch_sourmash_sketches_to_merge' )
     .set { ch_sourmash_sketches_to_merge }
 
-  ch_sourmash_sketches_branched
-    .single
-    .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] }
-    .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' )
-    .set { ch_sourmash_sketches_to_mix_with_merged }
+  // ch_sourmash_sketches_branched
+  //   .to_merge
+  //   .dump ( tag: 'ch_sourmash_sketches_to_merge' )
+  //   .set { ch_sourmash_sketches_to_merge }
+
+  // ch_sourmash_sketches_branched
+  //   .single
+  //   .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] }
+  //   .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' )
+  //   .set { ch_sourmash_sketches_to_mix_with_merged }
 
   process sourmash_sig_merge {
     tag "${sig_id}"
@@ -1495,47 +1489,20 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged
-    set val(cell_id), val(sketch_id), val(molecules), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view
+    set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
-    sig_id = "${cell_id}__${sketch_id}"
+    sig_id = "${cell_id}---${sketch_id}"
     csv = "${sig_id}.csv"
     output_sig = "${sig_id}.sig"
-    println(moltypes)
     """
-    ## --- Doing this big crazy loop in bash because the sigfiles are so small, it's not --- ##
-    ## --- worth spinning up docker images for each one, so doing within bash instead    --- ##
-    # Parse ksizes in bash
-    KSIZES=\$(echo ${ksizes} | tr ',' ' ')
-
-    # Parse molecule types in bash
-    MOLTYPES=\$(echo ${moltypes} | tr ',' ' ')
-
-
-    # Iterate over ksizes
-    for MOLTYPE in \$MOLTYPES; do
-      MOLTYPE_FLAG=\$(echo -n "--\$MOLTYPE")
-      for KSIZE in \$KSIZES; do
-        # Can only merge one kize at a time
-        sourmash sig merge \\
-            \$MOLTYPE_FLAG \\
-            -k \$KSIZE \\
-            -o merged_\$MOLTYPE\_\$KSIZE.sig \\
-            ${sigs}
-
-        # "sig merge" gives default automatic name of md5sum which isn't helpful
-        # --> rename to cell id
-        sourmash sig rename \\
-            \$MOLTYPE_FLAG \\
-            -k \$KSIZE \\
-            -o merged_renamed_\$MOLTYPE\_\$KSIZE.sig \\
-            merged_\$MOLTYPE\_\$KSIZE.sig '${cell_id}'
-      done
-    done
-
-    # Concatenate all ksizes together
-    sourmash sig cat merged_renamed*.sig > ${output_sig}
+    merge_rename_sigs.py \\
+        --ksizes ${ksizes} \\
+        --moltypes ${moltypes} \\
+        --name '${cell_id}' \\
+        --outsig ${output_sig} \\
+        ${sigs}
 
     # Add csv showing number of hashes at each ksize
     sourmash sig describe --csv ${csv} ${output_sig}
@@ -1597,6 +1564,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   // sourmash_sketches_peptide_for_compare
   //   .mix ( sourmash_sketches_nucleotide_for_compare )
   //   .set { ch_sourmash_sketches_to_compare }
+  ch_sourmash_sketches_merged_to_view
+    .dump( tag: "ch_sourmash_sketches_to_view" )
 
   ch_sourmash_sketches_merged
     // Drop first index which is the cell id
@@ -1609,8 +1578,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
     .dump(tag: 'ch_sourmash_sketches_to_compare' )
     .set { ch_sourmash_sketches_to_compare }
 
-  ch_sourmash_sketches_merged_to_view
-    .dump( tag: "ch_sourmash_sketches_to_view" )
+
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
     tag "${sketch_id}"

From fccec83ca190d7e1ea33be9e26222468084c3419 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 10:39:20 -0800
Subject: [PATCH 46/49] Make test params more realistic

---
 conf/test_sig_merge.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 3bd3f29d..52b46efc 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -18,7 +18,7 @@ params {
   bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  ksizes = '3,9'
+  ksizes = '21,30,51'
   sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false

From 8de30e30b8a33859d22366ec2c03fe866889d3a6 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 10:39:41 -0800
Subject: [PATCH 47/49] Update default ksizes, add track abundance true

---
 bin/merge_rename_sigs.py | 112 +++++++++++++++++++++++++++++++++++++++
 nextflow.config          |   5 +-
 2 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100755 bin/merge_rename_sigs.py

diff --git a/bin/merge_rename_sigs.py b/bin/merge_rename_sigs.py
new file mode 100755
index 00000000..30a0c809
--- /dev/null
+++ b/bin/merge_rename_sigs.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+import argparse
+import itertools
+import shutil
+
+import sourmash
+
+
+def merge(filenames, ksize, moltype, name=None, outsig=None):
+    """
+    merge one or more signatures.
+
+    Adapted from 'sourmash sig merge' command
+    """
+
+    first_sig = None
+    mh = None
+    total_loaded = 0
+
+    for sigfile in filenames:
+        this_n = 0
+
+        for sigobj in sourmash.load_file_as_signatures(
+            sigfile, ksize=ksize, select_moltype=moltype
+        ):
+
+            if sigobj is None:
+                error(
+                    "No signature in file {}",
+                    sigfile,
+                )
+
+            # first signature? initialize a bunch of stuff
+            if first_sig is None:
+                first_sig = sigobj
+                mh = first_sig.minhash.copy_and_clear()
+
+            try:
+                sigobj_mh = sigobj.minhash
+
+                mh.merge(sigobj_mh)
+            except:
+                error(
+                    "ERROR when merging signature '{}' ({}) from file {}",
+                    sigobj.name(),
+                    sigobj.md5sum()[:8],
+                    sigfile,
+                )
+                raise
+
+            this_n += 1
+            total_loaded += 1
+
+    merged_sigobj = sourmash.SourmashSignature(mh)
+    if name is not None:
+        merged_sigobj._name = name
+
+    if outsig is not None:
+        with open(outsig, "wt") as f:
+            sourmash.save_signatures([merged_sigobj], fp=f)
+
+    return merged_sigobj
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""Merge signatures with same ksize and moltype"""
+    )
+    # Signature files
+    parser.add_argument("sigfiles", nargs="+")
+    parser.add_argument(
+        "--moltypes",
+        type=str,
+        help="Molecule types, comma-separated. e,g. 'protein,dayhoff'",
+        required=True,
+    )
+    parser.add_argument(
+        "--ksizes", type=str, help="K-mer sizes to combine", required=True
+    )
+
+    parser.add_argument(
+        "-o", "--outsig", type=str, help="Signature file to output to", required=True
+    )
+    parser.add_argument(
+        "-n",
+        "--name",
+        type=str,
+        help="Name of the signature to use",
+    )
+
+    args = parser.parse_args()
+
+    # Only iterate and read over the sigfiles if there is really something to merge
+    # "something to merge" = there is more than one sigfile. otherwise there's no point
+    # in reading in the files only to make the same file again
+    if len(args.sigfiles) > 1:
+        ksizes = map(int, args.ksizes.split(","))
+        moltypes = args.moltypes.split(",")
+
+        merged_sigobjs = []
+        for moltype, ksize in itertools.product(moltypes, ksizes):
+            merged_sigobj = merge(
+                args.sigfiles, moltype=moltype, ksize=ksize, name=args.name
+            )
+            merged_sigobjs.append(merged_sigobj)
+
+        with open(args.outsig, "wt") as f:
+            sourmash.save_signatures(merged_sigobjs, fp=f)
+    else:
+        # Otherwise, nothing to merge. Simply copy the file to the
+        # output signature location
+        shutil.copyfile(args.sigfiles[0], args.outsig)
diff --git a/nextflow.config b/nextflow.config
index 857fa62e..4ac52d55 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -28,8 +28,9 @@ params {
 
   // Creating sketches
   molecules ='dna,protein,dayhoff'
-  ksizes = '21,27,33,51'
-  track_abundance = false
+  ksizes = '21,30,51'
+  // Track abundance by default
+  track_abundance = true
   // Number of hashes from each sample
   sketch_num_hashes = false
   sketch_num_hashes_log2 = false

From 0aade17156262784b421703dcb0c81e51400d731 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 11:24:30 -0800
Subject: [PATCH 48/49] Update variables in merge_renamed_sigs.pyh

---
 bin/merge_rename_sigs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/merge_rename_sigs.py b/bin/merge_rename_sigs.py
index 30a0c809..f5a171bf 100755
--- a/bin/merge_rename_sigs.py
+++ b/bin/merge_rename_sigs.py
@@ -99,10 +99,10 @@ def merge(filenames, ksize, moltype, name=None, outsig=None):
 
         merged_sigobjs = []
         for moltype, ksize in itertools.product(moltypes, ksizes):
-            merged_sigobj = merge(
+            merged = merge(
                 args.sigfiles, moltype=moltype, ksize=ksize, name=args.name
             )
-            merged_sigobjs.append(merged_sigobj)
+            merged_sigobjs.append(merged)
 
         with open(args.outsig, "wt") as f:
             sourmash.save_signatures(merged_sigobjs, fp=f)

From ad362594eda7e18f0cb8a5101783766023e405e9 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 11:24:46 -0800
Subject: [PATCH 49/49] Get sourmash compare to happen on correct ksizes and
 moltypes

---
 main.nf | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index bb09aa4e..a6f54186 100644
--- a/main.nf
+++ b/main.nf
@@ -448,7 +448,7 @@ save_translate_json = params.save_translate_json
 // --- Parse the Sourmash parameters ----
 ksizes = params.ksizes?.toString().tokenize(',')
 Channel.from(params.ksizes?.toString().tokenize(','))
-  .into { ch_ksizes_for_compare_petide; ch_ksizes_for_compare_nucleotide }
+  .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide }
 
 molecules = params.molecules?.toString().tokenize(',')
 peptide_molecules = molecules.findAll { it != "dna" }
@@ -1489,7 +1489,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
 
     output:
     file(csv) into ch_sourmash_sig_describe_merged
-    set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view
+    set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view, ch_sourmash_sketches_merged_for_moltypes_ksizes
 
     script:
     // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style)
@@ -1567,31 +1567,47 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   ch_sourmash_sketches_merged_to_view
     .dump( tag: "ch_sourmash_sketches_to_view" )
 
+
+  ch_peptide_molecules_for_compare
+    .combine( ch_ksizes_for_compare_peptide )
+    .set { ch_sourmash_compare_params_peptide }
+
+  Channel.from("dna")  
+    .combine( ch_ksizes_for_compare_nucleotide )
+    .mix ( ch_sourmash_compare_params_peptide )
+    .set { ch_sourmash_compare_params_both }
+
   ch_sourmash_sketches_merged
-    // Drop first index which is the cell id
-    .map { [it[1], it[2].split(","), it[3].split(","), it[4]] }
+    // Drop first index (index 0) which is the cell id
+    // Drop the second index (index 1) which is the sketch id
+    // Keep only moltype
+    // Drop ksize
+    .map { [tuple(it[2].split(",")), it[4]] }
     .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
-    .groupTuple(by: [0, 1, 2])
-    .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple' )
     .transpose()
-    .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple__transpose' )
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
+    // Perform cartesian product on the molecules with compare params
+    .combine( ch_sourmash_compare_params_both, by: 0)
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
+    .groupTuple(by: [0, 2])
     .dump(tag: 'ch_sourmash_sketches_to_compare' )
     .set { ch_sourmash_sketches_to_compare }
 
 
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
-    tag "${sketch_id}"
+    tag "${compare_id}"
     publishDir "${params.outdir}/compare_sketches", mode: 'copy'
 
     input:
-    set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare
+    // Weird order but that's how it shakes out with the groupTuple
+    set val(molecule), file("*.sig"), val(ksize)  from ch_sourmash_sketches_to_compare
 
     output:
     file(csv)
 
     script:
-    compare_id = "molecule-${molecule}__ksize-${ksize}"
+    compare_id = "${molecule}__k=${ksize}"
     processes = "--processes ${task.cpus}"
     csv = "similarities__${compare_id}.csv"
     """
@@ -1599,8 +1615,10 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
           --ksize ${ksize} \\
           --${molecule} \\
           --csv ${csv} \\
-          $processes \\
+          ${processes} \\
           --traverse-directory .
+    # Use --traverse-directory instead of all the files explicitly to avoid
+    # "too many arguments" error for bash when there are lots of samples
     """
 
   }