From 7b147ba9a5d146718becdc1577f01f3883910c24 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 12:41:11 -0800
Subject: [PATCH 01/43] Remove SortMeRNA from requirements

---
 environment.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 403f1a6b..37c5b912 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,7 +12,7 @@ dependencies:
   - conda-forge::pymdown-extensions=6.0
   - conda-forge::pygments=2.5.2
   - conda-forge::tqdm=4.43.0
-  - conda-forge::gxx_linux-64=7.3.0
+    # - conda-forge::gxx_linux-64=7.3.0
   - conda-forge::s3fs=0.4.2
   - bioconda::sourmash=3.5.0
   - bioconda::samtools=1.10
@@ -33,8 +33,8 @@ dependencies:
   - ska=1.0
   - sphinx=2.3.1
   - jupyter=1.0.0
-  - sortmerna=2.1b # for metatranscriptomics
   - ripgrep=12.1.1
+  - conda-forge::rust=1.48.0
   - pip:
     - bam2fasta==1.0.8
     - sencha==1.0.3
\ No newline at end of file

From b3d415cebdeb8c815e5b5ff6bdbf3382acaf1106 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 12:41:40 -0800
Subject: [PATCH 02/43] Add Luiz's remove-many code

---
 Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 46764af7..e516001e 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,12 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH
 # Dump the details of the installed packages to a file for posterity
 RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml
 
+# Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures
+RUN git clone https://github.com/luizirber/2021-01-27-olga-remove-protein/ 
+RUN cd 2021-01-27-olga-remove-protein  && cargo build --release 
+# Add "subtract" command to path
+ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH
+
 # Instruct R processes to use these empty files instead of clashing with a local version
 RUN touch .Rprofile
 RUN touch .Renviron

From 1a4855594f13d4f7e6fcdece0242e1d3268b5a1b Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 12:41:59 -0800
Subject: [PATCH 03/43] Remove SortMeRNA

---
 main.nf | 251 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 131 insertions(+), 120 deletions(-)

diff --git a/main.nf b/main.nf
index 8b9bf3d6..c0d1900c 100644
--- a/main.nf
+++ b/main.nf
@@ -386,12 +386,12 @@ if (!protein_input) {
   if (params.subsample && params.skip_trimming ) {
     subsample_reads_ch_unchecked
       .ifEmpty{  exit 1, "No reads provided! Check read input files" }
-      .set { subsample_ch_reads_for_ribosomal_removal }
+      .set { subsample_ch_reads_to_translate }
   }
   if (params.skip_trimming && !(params.bam || params.tenx_tgz)) {
     reads_ch_unchecked
       .ifEmpty{ exit 1, "No reads provided! Check read input files" }
-      .set { ch_reads_for_ribosomal_removal }
+      .set { ch_reads_to_translate }
     ch_read_files_trimming_to_check_size = Channel.empty()
   } else if (params.bam || params.tenx_tgz) {
     ch_non_bam_reads_unchecked
@@ -407,11 +407,11 @@ if (!protein_input) {
   // Since there exists protein input, don't check if these are empty
   if (params.subsample) {
     subsample_reads_ch_unchecked
-      .set { subsample_ch_reads_for_ribosomal_removal }
+      .set { subsample_ch_reads_to_translate }
   }
   if (params.skip_trimming) {
     reads_ch_unchecked
-      .set { ch_reads_for_ribosomal_removal }
+      .set { ch_reads_to_translate }
     ch_read_files_trimming_to_check_size = Channel.empty()
   } else if (!have_nucleotide_fasta_input) {
     ch_read_files_trimming_unchecked
@@ -430,15 +430,6 @@ if (params.split_kmer){
     params.ksizes = '21,27,33,51'
 }
 
-// Get rRNA databases
-// Default is set to bundled DB list in `assets/rrna-db-defaults.txt`
-
-rRNA_database = file(params.rrna_database_manifest)
-if (rRNA_database.isEmpty()) {exit 1, "File ${rRNA_database.getName()} is empty!"}
-Channel
-    .from( rRNA_database.readLines() )
-    .map { row -> file(row) }
-    .set { sortmerna_fasta }
 
 // --- Parse Translate parameters ---
 save_translate_csv = params.save_translate_csv
@@ -524,6 +515,26 @@ else {
   barcode_metadata_folder = "barcode_metadata"
 }
 
+
+//////////////////////////////////////////////////////////
+/* --  Parse Housekeeping K-mer removal parameters  -- */
+/////////////////////////////////////////////////////////
+housekeeping_protein_fasta = params.housekeeping_protein_fasta
+housekeeping_rna_fasta = params.housekeeping_rna_fasta
+
+need_refseq_download = !housekeeping_protein_fasta && !housekeeping_rna_fasta
+
+ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta])
+ch_refseq_moltype_to_fasta
+    // Keep only the entries whose second item, the fasta, is false (i.e. not provided)
+    .filter{ !it[1] }
+    // Take only the first item, the molecule type
+    .map{ it[0] }
+    .set{ ch_refseq_moltypes_to_download }
+
+// Parse refseq taxonomy group to download
+refseq_taxonomy = params.refseq_taxonomy
+
 // Has the run name been specified by the user?
 //  this has the bonus effect of catching both -name and --name
 custom_runName = params.name
@@ -851,8 +862,8 @@ if (params.tenx_tgz || params.bam) {
   // Put fastqs from aligned and unaligned reads into a single channel
   tenx_reads_aligned_concatenation_ch
     .mix( tenx_reads_unaligned_ch )
-    .dump(tag: "tenx_ch_reads_for_ribosomal_removal")
-    .set{ tenx_ch_reads_for_ribosomal_removal }
+    .dump(tag: "tenx_ch_reads_to_translate")
+    .set{ tenx_ch_reads_to_translate }
 
   if ((params.tenx_min_umi_per_cell > 0) || !params.barcodes_file) {
     process count_umis_per_cell {
@@ -898,14 +909,14 @@ if (params.tenx_tgz || params.bam) {
     good_barcodes_ch = tenx_bam_barcodes_ch
   }
 
-  tenx_ch_reads_for_ribosomal_removal
+  tenx_ch_reads_to_translate
     .combine( good_barcodes_ch, by: 0 )
-    .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch' )
+    .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch' )
     .map{ it -> [it[0], it[1], it[2], it[3].splitText()] }
     .transpose()
-    .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose' )
+    .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose' )
     .map{ it -> [it[0], it[1], it[2], it[3].replaceAll("\\s+", "") ] }
-    .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose__no_newlines' )
+    .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose__no_newlines' )
     .set{ tenx_reads_with_good_barcodes_ch }
 
   process extract_per_cell_fastqs {
@@ -949,8 +960,8 @@ if (params.tenx_tgz || params.bam) {
   // // Filtering out fastq.gz files less than 200 bytes (arbitary number)
   // // ~200 bytes is about the size of a file with a single read or less
   // // We can't use .size() > 0 because it's fastq.gz is gzipped content
-  // per_channel_cell_ch_reads_for_ribosomal_removal
-  //   .dump(tag: 'per_channel_cell_ch_reads_for_ribosomal_removal')
+  // per_channel_cell_ch_reads_to_translate
+  //   .dump(tag: 'per_channel_cell_ch_reads_to_translate')
   //   .flatten()
   //   .filter{ it -> it.size() > 200 }   // each item is just a single file, no need to do it[1]
   //   .map{ it -> tuple(it.simpleName, file(it)) }
@@ -960,7 +971,7 @@ if (params.tenx_tgz || params.bam) {
   if (params.skip_trimming) {
     ch_non_bam_reads
       .concat(per_cell_fastqs_ch)
-      .set { ch_reads_for_ribosomal_removal }
+      .set { ch_reads_to_translate }
   } else {
     ch_non_bam_reads
       .mix ( per_cell_fastqs_ch )
@@ -1053,10 +1064,10 @@ if ( have_nucleotide_input ) {
     ch_reads_trimmed
       .concat( fastas_ch )
       .dump ( tag: 'trimmed_reads__concat_fastas' )
-      .set { subsample_ch_reads_for_ribosomal_removal }
+      .set { subsample_ch_reads_to_translate }
   } else {
     // Concatenate trimmed reads with fastas for signature generation
-    ch_reads_for_ribosomal_removal = ch_reads_trimmed.concat(fastas_ch)
+    ch_reads_to_translate = ch_reads_trimmed.concat(fastas_ch)
   }
 } else {
   ch_fastp_results = Channel.from(false)
@@ -1068,10 +1079,10 @@ if (params.subsample) {
     publishDir "${params.outdir}/seqtk/", mode: params.publish_dir_mode
 
     input:
-    set val(id), file(reads) from subsample_ch_reads_for_ribosomal_removal
+    set val(id), file(reads) from subsample_ch_reads_to_translate
 
     output:
-    set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_for_ribosomal_removal
+    set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_to_translate
 
     script:
     read1 = reads[0]
@@ -1086,99 +1097,6 @@ if (params.subsample) {
     }
   }
 
-/*
- * STEP 2+ - SortMeRNA - remove rRNA sequences on request
- */
-if (!params.remove_ribo_rna) {
-    ch_reads_for_ribosomal_removal
-        .set { ch_reads_to_translate }
-    sortmerna_logs = Channel.empty()
-} else {
-    process sortmerna_index {
-        label 'mid_memory_long'
-        label 'mid_cpu'
-        tag "${fasta.baseName}"
-
-        input:
-        file(fasta) from sortmerna_fasta
-
-        output:
-        val("${fasta.baseName}") into sortmerna_db_name
-        file("$fasta") into sortmerna_db_fasta
-        file("${fasta.baseName}*") into sortmerna_db
-
-        script:
-        """
-        indexdb_rna --ref $fasta,${fasta.baseName} -m 3072 -v
-        """
-    }
-
-    process sortmerna {
-        label 'mid_memory_long'
-        label 'mid_cpu'
-        tag "$name"
-        publishDir "${params.outdir}/SortMeRNA", mode: "${params.publish_dir_mode}",
-            saveAs: {filename ->
-                if (filename.indexOf("_rRNA_report.txt") > 0) "logs/$filename"
-                else if (params.save_non_rrna_reads) "reads/$filename"
-                else null
-            }
-
-        input:
-        set val(name), file(reads) from ch_reads_for_ribosomal_removal
-        val(db_name) from sortmerna_db_name.collect()
-        file(db_fasta) from sortmerna_db_fasta.collect()
-        file(db) from sortmerna_db.collect()
-
-        output:
-        set val(name), file("*.fq.gz") into ch_reads_to_translate
-        file "*_rRNA_report.txt" into sortmerna_logs
-
-
-        script:
-        //concatenate reference files: ${db_fasta},${db_name}:${db_fasta},${db_name}:...
-        def Refs = ''
-        for (i=0; i<db_fasta.size(); i++) { Refs+= ":${db_fasta[i]},${db_name[i]}" }
-        Refs = Refs.substring(1)
-
-        // One set of reads --> single end
-        if (reads[1] == null) {
-            """
-            gzip -d --force < ${reads} > all-reads.fastq
-            sortmerna --ref ${Refs} \
-                --reads all-reads.fastq \
-                --num_alignments 1 \
-                -a ${task.cpus} \
-                --fastx \
-                --aligned rRNA-reads \
-                --other non-rRNA-reads \
-                --log -v
-            gzip --force < non-rRNA-reads.fastq > ${name}.fq.gz
-            mv rRNA-reads.log ${name}_rRNA_report.txt
-            """
-        } else {
-            """
-            gzip -d --force < ${reads[0]} > reads-fw.fq
-            gzip -d --force < ${reads[1]} > reads-rv.fq
-            merge-paired-reads.sh reads-fw.fq reads-rv.fq all-reads.fastq
-            sortmerna --ref ${Refs} \
-                --reads all-reads.fastq \
-                --num_alignments 1 \
-                -a ${task.cpus} \
-                --fastx --paired_in \
-                --aligned rRNA-reads \
-                --other non-rRNA-reads \
-                --log -v
-            unmerge-paired-reads.sh non-rRNA-reads.fastq non-rRNA-reads-fw.fq non-rRNA-reads-rv.fq
-            gzip < non-rRNA-reads-fw.fq > ${name}-fw.fq.gz
-            gzip < non-rRNA-reads-rv.fq > ${name}-rv.fq.gz
-            mv rRNA-reads.log ${name}_rRNA_report.txt
-            """
-        }
-    }
-  }
-
-
 
   if (params.reference_proteome_fasta){
     process translate {
@@ -1399,6 +1317,100 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
   sourmash_sketches_peptide = Channel.empty()
 }
 
+
+if (!params.skip_remove_housekeeping_genes) {
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /* --                                                                     -- */
+  /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+  /* --                                                                     -- */
+  ///////////////////////////////////////////////////////////////////////////////
+  /////////////////////////////////////////////////////////////////////////////// 
+
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /* --                                                                     -- */
+  /* --         DOWNLOAD NUCLEOTIDE AND PROTEIN SEQS FROM REFSEQ            -- */
+  /* --                                                                     -- */
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /*
+  * STEP 6 - rsync to download refseq
+  */
+  if (need_refseq_download){
+    // No fastas provided for removing housekeeping genes
+    process download_refseq {
+      tag "${refseq_taxonomy}"
+      label "process_low"
+      publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'
+
+      input:
+      val refseq_moltype from ch_refseq_moltypes_to_download
+
+      output:
+      set val(refseq_moltype), file(output_fasta) into ch_refseq_fasta_to_filter
+
+      script:
+      output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
+      """
+      rsync \\
+            --prune-empty-dirs \\
+            --archive \\
+            --verbose \\
+            --recursive \\
+            --include '*${refseq_moltype}.faa.gz' \\
+            --exclude '/*' \\
+            rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ .
+      wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
+      DATE=\$(date +'%Y-%M-%d')
+      RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
+      zcat *.${refseq_moltype}.faa.gz | gzip -c - > ${output_fasta}
+      """
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /* --                                                                     -- */
+  /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+  /* --                                                                     -- */
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /*
+  * STEP 7 - Filter fastas from refseq
+  */
+  if (need_refseq_download){
+    // No fastas provided for removing housekeeping genes
+    process filter_fasta_housekeeping {
+      tag "${refseq_taxonomy}"
+      label "process_low"
+      publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'
+
+      input:
+      set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
+
+      output:
+      set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
+
+      script:
+      output_fasta = "${fasta.basename}__only_housekeeping_genes.fa"
+      output_fasta_gz = "${fasta.basename}__only_housekeeping_genes.fa.gz"
+      """
+      filter_fasta_regex.py \\
+          --input-fasta ${fasta} \\
+          --output-fasta ${output_fasta} \\
+          --regex-pattern '${params.housekeeping_gene_regex}'
+      gzip -c ${output_fasta} > ${output_fasta_gz}
+      """
+    }
+  }
+}
+
+
+
+
 if (params.split_kmer){
      process ska_compare_sketches {
     tag "${sketch_id}"
@@ -1481,7 +1493,6 @@ if (!params.skip_multiqc){
       input:
       file multiqc_config from ch_multiqc_config
       file ('fastp/*') from ch_fastp_results.collect().ifEmpty([])
-      file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([])
       file ('software_versions/*') from ch_software_versions_yaml.collect()
       file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")
 

From edd1dd111f75755b99ed242bdf22e8156cbe3a0f Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 9 Mar 2021 12:42:10 -0800
Subject: [PATCH 04/43] Add parameters for housekeeping gene removal

---
 .vscode/settings.json            |  6 +++++
 bin/filter_fasta_regex.py        | 44 ++++++++++++++++++++++++++++++++
 conf/test_download_refseq.config | 39 ++++++++++++++++++++++++++++
 nextflow.config                  | 13 ++++++----
 scratch.nf                       | 10 ++++++++
 5 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 .vscode/settings.json
 create mode 100644 bin/filter_fasta_regex.py
 create mode 100644 conf/test_download_refseq.config
 create mode 100644 scratch.nf

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..3022f6a1
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "python.linting.pylintEnabled": false,
+    "python.linting.flake8Enabled": true,
+    "python.linting.enabled": true,
+    "python.formatting.provider": "black"
+}
\ No newline at end of file
diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py
new file mode 100644
index 00000000..97cfb764
--- /dev/null
+++ b/bin/filter_fasta_regex.py
@@ -0,0 +1,44 @@
+import argparse
+import re
+
+
+import screed
+
+
+def write_records_to_fasta(records, fasta):
+    with open(fasta, "w") as f:
+        for record in records:
+            f.write(f'>{record["name"]}\n{record["sequence"]}\n')
+
+
+def filter_records(fasta, pattern):
+    filtered_records = []
+    with screed.open(fasta) as records:
+        for record in records:
+            name = record["name"]
+            if re.findall(pattern, name, flags=re.I):
+                filtered_records.append(record)
+    return filtered_records
+
+
+def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex):
+    record_subset = filter_records(fasta_to_filter, regex)
+    write_records_to_fasta(record_subset, out_fasta)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""Extract sequences whose names match a pattern"""
+    )
+    parser.add_argument("--input-fasta", type=str, help="Sequence file to filter")
+    parser.add_argument("--output-fasta", type=str, help="File to write")
+    parser.add_argument(
+        "--regex-pattern",
+        type=str,
+        help="Regular expression pattern to match for the names of seuqences in the file",
+    )
+    args = parser.parse_args()
+
+    filter_fasta_with_regex(
+        parser.input_fasta, parser.output_fasta, parser.regex_pattern
+    )
diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config
new file mode 100644
index 00000000..a7db4d8d
--- /dev/null
+++ b/conf/test_download_refseq.config
@@ -0,0 +1,39 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test_download_refseq
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+  // Input data
+  // samples = 'testing/samples.csv'
+  // fastas = 'testing/fastas/*.fasta'
+  ksizes = '3,9'
+  sketch_num_hashes_log2 = '2,4'
+  molecules = 'dna,protein,dayhoff'
+  // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
+  // sra = "SRP016501"
+  remove_ribo_rna = true
+  save_non_rrna_reads = true
+  input_paths = [
+    ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+    ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+    ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+  ]
+
+  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
+  // Protein fasta is 453 B
+  refseq_taxonomy = 'other'
+}
diff --git a/nextflow.config b/nextflow.config
index 46d3ca99..b35df235 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -42,6 +42,7 @@ params {
   // Computing sketches
   skip_compute = false
 
+  // Skip trimming of adapters and poly-X sequences
   skip_trimming = false
 
   // translate options
@@ -55,11 +56,12 @@ params {
   save_translate_csv = false
   save_translate_json = false
 
-
-  // Ribosomal RNA removal
-  remove_ribo_rna = false
-  save_non_rrna_reads = false
-  rrna_database_manifest = "${baseDir}/assets/rrna-db-defaults.txt"
+  // Housekeeping gene k-mer removal
+  skip_remove_housekeeping_genes = false
+  housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
+  refseq_taxonomy = 'vertebrate_mammalian'
+  housekeeping_protein_fasta = false
+  housekeeping_rna_fasta = false
 
   // ska options
   split_kmer = false
@@ -145,6 +147,7 @@ profiles {
     podman.enabled = true
   }
   test { includeConfig 'conf/test.config' }
+  test_download_refseq { includeConfig 'conf/test_download_refseq.config' }
   test_full { includeConfig 'conf/test_full.config' }
   test_ska { includeConfig 'conf/test_ska.config' }
   test_bam { includeConfig 'conf/test_bam.config' }
diff --git a/scratch.nf b/scratch.nf
new file mode 100644
index 00000000..617d336d
--- /dev/null
+++ b/scratch.nf
@@ -0,0 +1,10 @@
+housekeeping_protein_fasta = false
+housekeeping_rna_fasta = true
+
+ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta])
+ch_refseq_moltype_to_fasta
+    // filter if the second item, the fasta is false
+    .filter{ !it[1] }
+    // Take only the first item, the molecule type
+    .map{ it[0] }
+    .println()

From 5212de1395fa7c38dc379f6671f078e0272b8196 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:48:06 -0800
Subject: [PATCH 05/43] Add mini refseq download option for testing

---
 nextflow.config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index b35df235..95986d63 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -60,6 +60,8 @@ params {
   skip_remove_housekeeping_genes = false
   housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
   refseq_taxonomy = 'vertebrate_mammalian'
+  // For testing purposes --> use a small refseq dataset
+  test_mini_refseq_download = false
   housekeeping_protein_fasta = false
   housekeeping_rna_fasta = false
 

From f27b037f23690c38d594afaa78b0240860ecd444 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:48:41 -0800
Subject: [PATCH 06/43] Add raw quote strings around nf-core lint

---
 .github/workflows/linting.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 6f2be6b0..a3b25f18 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -53,7 +53,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install nf-core
-
+{% raw %}
       - name: Run nf-core lint
         env:
           GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
@@ -74,3 +74,4 @@ jobs:
             lint_log.txt
             lint_results.md
             PR_number.txt
+{% endraw %}
\ No newline at end of file

From c1e9c593c9305a8993f2c8ac7f41c7f6f6e66366 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:48:54 -0800
Subject: [PATCH 07/43] Get fasta filtering working

---
 bin/filter_fasta_regex.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 bin/filter_fasta_regex.py

diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py
old mode 100644
new mode 100755
index 97cfb764..ebded98a
--- a/bin/filter_fasta_regex.py
+++ b/bin/filter_fasta_regex.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import argparse
 import re
 
@@ -39,6 +41,4 @@ def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex):
     )
     args = parser.parse_args()
 
-    filter_fasta_with_regex(
-        parser.input_fasta, parser.output_fasta, parser.regex_pattern
-    )
+    filter_fasta_with_regex(args.input_fasta, args.output_fasta, args.regex_pattern)

From 19e9801ff2bd2f8c475d0f505554d6252641af34 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:49:06 -0800
Subject: [PATCH 08/43] Update test params for download refseq

---
 conf/test_download_refseq.config | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config
index a7db4d8d..cebb0acf 100644
--- a/conf/test_download_refseq.config
+++ b/conf/test_download_refseq.config
@@ -35,5 +35,6 @@ params {
 
   // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
   // Protein fasta is 453 B
-  refseq_taxonomy = 'other'
+  refseq_taxonomy = 'vertebrate_mammalian'
+  test_mini_refseq_download = true
 }

From a8616105553a6bed23794b4d1e0ecfd65afe6c52 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:50:01 -0800
Subject: [PATCH 09/43] Add Rsync, return gxx linux

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 37c5b912..685d964e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,7 +12,7 @@ dependencies:
   - conda-forge::pymdown-extensions=6.0
   - conda-forge::pygments=2.5.2
   - conda-forge::tqdm=4.43.0
-    # - conda-forge::gxx_linux-64=7.3.0
+  - conda-forge::gxx_linux-64=7.3.0
   - conda-forge::s3fs=0.4.2
   - bioconda::sourmash=3.5.0
   - bioconda::samtools=1.10
@@ -35,6 +35,7 @@ dependencies:
   - jupyter=1.0.0
   - ripgrep=12.1.1
   - conda-forge::rust=1.48.0
+  - rsync=3.2.3
   - pip:
     - bam2fasta==1.0.8
     - sencha==1.0.3
\ No newline at end of file

From caf44503e6c2cd124a430c3ce307bf10a7cfa7b4 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:50:12 -0800
Subject: [PATCH 10/43] Add osx environment yml

---
 environment_osx.yml | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 environment_osx.yml

diff --git a/environment_osx.yml b/environment_osx.yml
new file mode 100644
index 00000000..6de5700e
--- /dev/null
+++ b/environment_osx.yml
@@ -0,0 +1,41 @@
+# You can use this file to create a conda environment for this pipeline:
+#   conda env create -f environment_osx.yml
+name: nf-core-kmermaid-0.1.0dev
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+  - anaconda
+dependencies:
+  - conda-forge::python=3.7.3
+  - conda-forge::markdown=3.1.1
+  - conda-forge::pymdown-extensions=6.0
+  - conda-forge::pygments=2.5.2
+  - conda-forge::tqdm=4.43.0
+  - conda-forge::clangxx_osx-64=11.1.0
+  - conda-forge::s3fs=0.4.2
+  - bioconda::sourmash=3.5.0
+  - bioconda::samtools=1.10
+  - bioconda::screed=1.0.4
+  - bioconda::khmer=3.0.0a3
+  - bioconda::pysam=0.16.0
+  - anaconda::make=4.2.1
+  - alabaster=0.7.12
+  - fastp=0.20.0
+  - fastqc=0.11.9
+  - matplotlib=3.1.1 # don't upgrade, multiqc conflict
+  - multiqc=1.8
+  - numpy=1.17.5
+  - pathos=0.2.5
+  - pip=20.0.2
+  - pytest=5.3.4
+  - seqtk=1.3
+  - ska=1.0
+  - sphinx=2.3.1
+  - jupyter=1.0.0
+  - ripgrep=12.1.1
+  - conda-forge::rust=1.48.0
+  - rsync=3.2.3
+  - pip:
+    - bam2fasta==1.0.8
+    - sencha==1.0.3
\ No newline at end of file

From 5a56dde47d3c5d210a2fd340b44c1eaac2ef47a0 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 08:54:23 -0800
Subject: [PATCH 11/43] download refseq and filter fasta is working

---
 main.nf | 131 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 104 insertions(+), 27 deletions(-)

diff --git a/main.nf b/main.nf
index c0d1900c..165dc693 100644
--- a/main.nf
+++ b/main.nf
@@ -1342,31 +1342,32 @@ if (!params.skip_remove_housekeeping_genes) {
   if (need_refseq_download){
     // No fastas provided for removing housekeeping genes
     process download_refseq {
-      tag "${refseq_taxonomy}"
+      tag "${refseq_taxonomy}--${refseq_moltype}"
       label "process_low"
-      publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'
+      publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy'
 
       input:
       val refseq_moltype from ch_refseq_moltypes_to_download
 
       output:
-      set val(refseq_moltype), file(output_fasta) into ch_refseq_fasta_to_filter
+      set val(refseq_moltype), file("${refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
 
       script:
       output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
+      include_fasta = params.test_mini_refseq_download ? "${refseq_taxonomy}.1.${refseq_moltype}.f*a.gz"  : "*${refseq_moltype}.f*a.gz" 
       """
       rsync \\
             --prune-empty-dirs \\
             --archive \\
             --verbose \\
             --recursive \\
-            --include '*${refseq_moltype}.faa.gz' \\
+            --include '${include_fasta}' \\
             --exclude '/*' \\
             rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ .
       wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
       DATE=\$(date +'%Y-%M-%d')
       RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
-      zcat *.${refseq_moltype}.faa.gz | gzip -c - > ${output_fasta}
+      zcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
       """
     }
   }
@@ -1379,33 +1380,109 @@ if (!params.skip_remove_housekeeping_genes) {
   ///////////////////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////////////////
   /*
-  * STEP 7 - Filter fastas from refseq
+  * STEP 7 - Get only housekeeping genes from the reference fasta
   */
-  if (need_refseq_download){
-    // No fastas provided for removing housekeeping genes
-    process filter_fasta_housekeeping {
-      tag "${refseq_taxonomy}"
-      label "process_low"
-      publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'
+  // Keep genes whose names match housekeeping gene regular expression pattern
+  process extract_fasta_housekeeping {
+    tag "${refseq_moltype}"
+    label "process_low"
+    publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
-      input:
-      set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
+    input:
+    set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
 
-      output:
-      set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
+    output:
+    set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
 
-      script:
-      output_fasta = "${fasta.basename}__only_housekeeping_genes.fa"
-      output_fasta_gz = "${fasta.basename}__only_housekeeping_genes.fa.gz"
-      """
-      filter_fasta_regex.py \\
-          --input-fasta ${fasta} \\
-          --output-fasta ${output_fasta} \\
-          --regex-pattern '${params.housekeeping_gene_regex}'
-      gzip -c ${output_fasta} > ${output_fasta_gz}
-      """
-    }
+    script:
+    output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
+    output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
+    """
+    filter_fasta_regex.py \\
+        --input-fasta ${fasta} \\
+        --output-fasta ${output_fasta} \\
+        --regex-pattern '${params.housekeeping_gene_regex}'
+    gzip -c ${output_fasta} > ${output_fasta_gz}
+    """
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /* --                                                                     -- */
+  /* --          COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE                  -- */
+  /* --                                                                     -- */
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  /*
+  * STEP 8 - Compute Housekeeping Gene K-mer Signature
+  */
+  // No fastas provided for removing housekeeping genes
+  process compute_housekeeping_kmer_sig {
+    tag "${refseq_moltype}"
+    label "process_low"
+    publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
+
+    input:
+    set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta
+
+    output:
+    set val(sourmash_moltype), file(sig) into ch_houskeeping_sig
+
+    script:
+    sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna'
+    sketch_id = make_sketch_id(sourmash_moltype, params.ksizes, sketch_value, track_abundance, sketch_style)
+
+    moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna'
+    track_abundance_flag = track_abundance ? '--track-abundance' : ''
+    sig_id = "${ch_houskeeping_fasta.baseName}__${sketch_id}"
+    sig = "${sig_id}.sig"
+    csv = "${sig_id}.csv"
+    """
+    sourmash compute \\
+      ${sketch_value_flag} \\
+      --ksizes ${params.ksizes} \\
+      ${moltype_flags} \\
+      ${track_abundance_flag} \\
+      --output ${sig} \\
+      --name '${sample_id}' \\
+      ${fasta}
+    sourmash sig describe --csv ${csv} ${sig}
+    """
   }
+  
+
+  // ///////////////////////////////////////////////////////////////////////////////
+  // ///////////////////////////////////////////////////////////////////////////////
+  // /* --                                                                     -- */
+  // /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+  // /* --                                                                     -- */
+  // ///////////////////////////////////////////////////////////////////////////////
+  // ///////////////////////////////////////////////////////////////////////////////
+  // /*
+  // * STEP 9 - Remove housekeeping gene k-mers from single cells
+  // */
+  // process subtract_houskeeping_kmers {
+  //   tag "${refseq_taxonomy}"
+  //   label "process_low"
+  //   publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
+
+  //   input:
+  //   set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
+
+  //   output:
+  //   set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
+
+  //   script:
+  //   output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
+  //   output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
+  //   """
+  //   filter_fasta_regex.py \\
+  //       --input-fasta ${fasta} \\
+  //       --output-fasta ${output_fasta} \\
+  //       --regex-pattern '${params.housekeeping_gene_regex}'
+  //   gzip -c ${output_fasta} > ${output_fasta_gz}
+  //   """
+  // }
 }
 
 

From 4d49662495be7e746d929a8e58271b59dbcce1cd Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:07:04 -0800
Subject: [PATCH 12/43] Add missing quote

---
 bin/validate_sketch_value.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py
index d96cff7f..497d92bd 100755
--- a/bin/validate_sketch_value.py
+++ b/bin/validate_sketch_value.py
@@ -20,7 +20,7 @@ def get_sketch_value(value, value_log2):
             if "," in value:
                 logger.exception(
                     f"Can only provide a single number to --sketch_num_hashes or"
-                    f" --sketch_scaled. Provided '{value}"
+                    f" --sketch_scaled. Provided '{value}'"
                 )
             sketch_value = int(value)
         else:

From ed82230cf4d5aa303c5597578fe57a05b1f05923 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:07:22 -0800
Subject: [PATCH 13/43] Move merged sigs to view

---
 main.nf | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index a59b9162..dfdddaf1 100644
--- a/main.nf
+++ b/main.nf
@@ -1428,6 +1428,10 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
     """
   }
 
+  ch_sourmash_sketches_merged_to_view
+    .dump( tag: "ch_sourmash_sketches_to_view" )
+
+
 } else if (!params.skip_compute) {
   sourmash_sketches_nucleotide
     .mix ( sourmash_sketches_peptide )
@@ -1544,6 +1548,9 @@ if (!params.skip_remove_housekeeping_genes) {
     publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
     input:
+    val track_abundance
+    val sketch_value_parsed
+    val sketch_style_parsed
     set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta
 
     output:
@@ -1551,7 +1558,15 @@ if (!params.skip_remove_housekeeping_genes) {
 
     script:
     sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna'
-    sketch_id = make_sketch_id(sourmash_moltype, params.ksizes, sketch_value, track_abundance, sketch_style)
+    sketch_id = make_sketch_id(
+      peptide_molecules_comma_separated, 
+      params.ksizes, 
+      sketch_value_parsed[0], 
+      track_abundance, 
+      sketch_style_parsed[0]
+    )
+
+    sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
 
     moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna'
     track_abundance_flag = track_abundance ? '--track-abundance' : ''
@@ -1654,8 +1669,6 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   // sourmash_sketches_peptide_for_compare
   //   .mix ( sourmash_sketches_nucleotide_for_compare )
   //   .set { ch_sourmash_sketches_to_compare }
-  ch_sourmash_sketches_merged_to_view
-    .dump( tag: "ch_sourmash_sketches_to_view" )
 
 
   ch_peptide_molecules_for_compare
@@ -1667,6 +1680,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
     .mix ( ch_sourmash_compare_params_peptide )
     .set { ch_sourmash_compare_params_both }
 
+  ch_sourmash_sketches_merged = Channel.empty()
+
   ch_sourmash_sketches_merged
     // Drop first index (index 0) which is the cell id
     // Drop the second index (index 1) which is the sketch id

From c5a0aae7c3878f8ed0cb8db10d54d7aa1b970fad Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:20:14 -0800
Subject: [PATCH 14/43] Add test_download_refseq to ci.yml

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0a26f0ed..5e4b59a1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -72,6 +72,7 @@ jobs:
           - "test_bam --write_barcodes_meta_csv false"
           - "test_bam --barcodes_file false --rename_10x_barcodes false"
           - "test_bam --rename_10x_barcodes false"
+          - "test_download_refseq"
           - "test_fastas"
           - "test_protein_fastas"
           - "test_remove_ribo"

From 7606bd246946f2080caa5660751c05fcda764e89 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:20:22 -0800
Subject: [PATCH 15/43] Update scrape_software_versions

---
 bin/scrape_software_versions.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 551861e1..597a70f5 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -14,7 +14,8 @@
     "SKA": ["v_ska.txt", r"SKA Version: (\S+)"],
     "htslib": ["v_samtools.txt", r"htslib (\S+)"],
     "Sourmash": ["v_sourmash.txt", r"sourmash (\S+)"],
-    "SortMeRNA": ["v_sortmerna.txt", r"SortMeRNA version (\S+),"],
+    "Rsync": ["v_rsync.txt", r"rsync  version (\S+)"],
+    "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"],
     "orpheum": ["v_orpheum.txt", r"Version: (\S+)"],
 }
 results = OrderedDict()
@@ -25,10 +26,11 @@
 results["bam2fasta"] = '<span style="color:#999999;">N/A</span>'
 results["fastp"] = '<span style="color:#999999;">N/A</span>'
 results["htslib"] = '<span style="color:#999999;">N/A</span>'
+results["Rsync"] = '<span style="color:#999999;">N/A</span>'
+results["Rsync (Protocol)"] = '<span style="color:#999999;">N/A</span>'
 results["Samtools"] = '<span style="color:#999999;">N/A</span>'
 results["SKA"] = '<span style="color:#999999;">N/A</span>'
 results["Sourmash"] = '<span style="color:#999999;">N/A</span>'
-results["SortMeRNA"] = '<span style="color:#999999;">N/A</span>'
 results["orpheum"] = '<span style="color:#999999;">N/A</span>'
 
 # Search each file using its regex

From ba243e3f40b925bc764fd1bc2cc29f516dc58d12 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:20:34 -0800
Subject: [PATCH 16/43] Remove sortmerna from get_software_versions

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 24d44dc4..2c367a28 100644
--- a/main.nf
+++ b/main.nf
@@ -680,8 +680,8 @@ process get_software_versions {
     bam2fasta info &> v_bam2fasta.txt
     fastp --version &> v_fastp.txt
     samtools --version &> v_samtools.txt
+    rsync --version &> v_rsync.txt
     ska version &> v_ska.txt
-    sortmerna --version &> v_sortmerna.txt
     sourmash -v &> v_sourmash.txt
     pip show orpheum &> v_orpheum.txt
     scrape_software_versions.py &> software_versions_mqc.yaml

From dbf824f1a3cd7777c8edc6c517e9f283ea37d536 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 09:20:50 -0800
Subject: [PATCH 17/43] Fix sketch params in test_download_refseq

---
 conf/test_download_refseq.config | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config
index cebb0acf..2d7b5486 100644
--- a/conf/test_download_refseq.config
+++ b/conf/test_download_refseq.config
@@ -14,16 +14,9 @@ params {
   max_cpus = 2
   max_memory = 6.GB
   max_time = 48.h
+
+  sketch_scaled = 2
   // Input data
-  // samples = 'testing/samples.csv'
-  // fastas = 'testing/fastas/*.fasta'
-  ksizes = '3,9'
-  sketch_num_hashes_log2 = '2,4'
-  molecules = 'dna,protein,dayhoff'
-  // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
-  // sra = "SRP016501"
-  remove_ribo_rna = true
-  save_non_rrna_reads = true
   input_paths = [
     ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
                     'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],

From 7b5cc3d14ac30e774a50ab5dab3776935c991305 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Wed, 10 Mar 2021 14:57:14 -0800
Subject: [PATCH 18/43] Got subtract to work!!

---
 main.nf | 164 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 116 insertions(+), 48 deletions(-)

diff --git a/main.nf b/main.nf
index 2c367a28..a6c91209 100644
--- a/main.nf
+++ b/main.nf
@@ -441,9 +441,10 @@ save_translate_json = params.save_translate_json
 // --- Parse the Sourmash parameters ----
 ksizes = params.ksizes?.toString().tokenize(',')
 Channel.from(params.ksizes?.toString().tokenize(','))
-  .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide }
+  .into { ch_ksizes_for_nucleotide; ch_ksizes_for_peptide; ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide }
 
 molecules = params.molecules?.toString().tokenize(',')
+nucleotide_molecules = molecules.findAll { it == "dna" }
 peptide_molecules = molecules.findAll { it != "dna" }
 peptide_molecules_comma_separated = peptide_molecules.join(",")
 peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " )
@@ -451,8 +452,22 @@ peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( "
 Channel.from( molecules )
   .set { ch_molecules }
 
+Channel.from( nucleotide_molecules )
+  .into { ch_nucleotide_molecules; ch_nucleotide_molecules_for_subtract; ch_nucleotide_molecules_for_compare }
+
 Channel.from( peptide_molecules )
-  .into { ch_peptide_molecules; ch_peptide_molecules_for_compare }
+  .into { ch_peptide_molecules; ch_peptide_molecules_for_subtract; ch_peptide_molecules_for_compare }
+
+
+ch_peptide_molecules
+  .combine( ch_ksizes_for_peptide )
+  .set { ch_sourmash_params_peptide }
+
+ch_nucleotide_molecules 
+  .combine( ch_ksizes_for_nucleotide )
+  .mix ( ch_sourmash_params_peptide )
+  .dump ( tag: 'ch_sourmash_params' )
+  .into { ch_sourmash_params_for_compare ; ch_sourmash_params_for_subtract }
 
 // Parse sketch value and style parameters
 sketch_num_hashes = params.sketch_num_hashes
@@ -684,6 +699,7 @@ process get_software_versions {
     ska version &> v_ska.txt
     sourmash -v &> v_sourmash.txt
     pip show orpheum &> v_orpheum.txt
+    python --version &> v_python.txt
     scrape_software_versions.py &> software_versions_mqc.yaml
     """
 }
@@ -1510,9 +1526,9 @@ if (!params.skip_remove_housekeeping_genes) {
             --exclude '/*' \\
             rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ .
       wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
-      DATE=\$(date +'%Y-%M-%d')
+      DATE=\$(date +'%Y-%m-%d')
       RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
-      zcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
+      gzcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
       """
     }
   }
@@ -1529,7 +1545,7 @@ if (!params.skip_remove_housekeeping_genes) {
   */
   // Keep genes whose names match housekeeping gene regular expression pattern
   process extract_fasta_housekeeping {
-    tag "${refseq_moltype}"
+    tag "${fasta.baseName}"
     label "process_low"
     publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
@@ -1537,7 +1553,7 @@ if (!params.skip_remove_housekeeping_genes) {
     set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
 
     output:
-    set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
+    set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view
 
     script:
     output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
@@ -1550,6 +1566,9 @@ if (!params.skip_remove_housekeeping_genes) {
     gzip -c ${output_fasta} > ${output_fasta_gz}
     """
   }
+  
+  ch_housekeeping_fasta_to_view
+    .dump( tag: 'ch_housekeeping_fasta' )
 
   ///////////////////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////////////////
@@ -1563,7 +1582,7 @@ if (!params.skip_remove_housekeeping_genes) {
   */
   // No fastas provided for removing housekeeping genes
   process compute_housekeeping_kmer_sig {
-    tag "${refseq_moltype}"
+    tag "${fasta.baseName}"
     label "process_low"
     publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
@@ -1571,26 +1590,30 @@ if (!params.skip_remove_housekeeping_genes) {
     val track_abundance
     val sketch_value_parsed
     val sketch_style_parsed
-    set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta
+    set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta
 
     output:
-    set val(sourmash_moltype), file(sig) into ch_houskeeping_sig
+    set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig
 
     script:
-    sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna'
+    is_protein = refseq_moltype == "protein"
+    sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna'
+    sourmash_moltypes = tuple(sourmash_moltype.split(","))
     sketch_id = make_sketch_id(
-      peptide_molecules_comma_separated, 
+      sourmash_moltype, 
       params.ksizes, 
       sketch_value_parsed[0], 
       track_abundance, 
       sketch_style_parsed[0]
     )
 
-    sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
-
-    moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna'
+    sketch_value_flag = make_sketch_value_flag(
+      sketch_style_parsed[0], 
+      sketch_value_parsed[0]
+    )
+    moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna'
     track_abundance_flag = track_abundance ? '--track-abundance' : ''
-    sig_id = "${ch_houskeeping_fasta.baseName}__${sketch_id}"
+    sig_id = "${fasta.baseName}__${sketch_id}"
     sig = "${sig_id}.sig"
     csv = "${sig_id}.csv"
     """
@@ -1600,12 +1623,55 @@ if (!params.skip_remove_housekeeping_genes) {
       ${moltype_flags} \\
       ${track_abundance_flag} \\
       --output ${sig} \\
-      --name '${sample_id}' \\
+      --name '${fasta.baseName}' \\
       ${fasta}
     sourmash sig describe --csv ${csv} ${sig}
     """
   }
-  
+
+  ch_sourmash_sketches_merged
+    // index 2: moltypes
+    // index 4: signature
+    .map { tuple( tuple(it[2].split(",")), it[4] ) }
+    .transpose()
+    .dump( tag: 'ch_sourmash_sketches_moltype_to_sig' )
+    .groupTuple( by: 0 )
+    .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' )
+    .set { ch_sourmash_sketches_moltype_to_sigs }
+
+  ch_housekeeping_sig
+    .dump( tag: 'ch_housekeeping_sig' )
+    .transpose()
+    .dump( tag: 'ch_housekeeping_sig__transposed' )
+    .combine( ch_sourmash_params_for_subtract, by: 0)
+    .dump( tag: 'ch_housekeeping_sig__transposed__combined' )
+    .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 )
+    .dump( tag: 'ch_housekeeping_sig__transposed__combined_joined' )
+    .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist }
+
+  ch_subtract_params_to_sigs_for_siglist
+    .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist' )
+    .transpose()
+    .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose')
+    .collectFile() { it -> 
+      [ "${it[0]}__${it[2]}.txt", "${it[3].getFileName()}\n"] 
+    }
+      .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile' )
+      .map { [ tuple( it.baseName.split('__') ), it] }
+      .map { [ it[0][0], it[0][1], it[1] ] }
+      // .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile__map' )
+      // .transpose()
+      .dump ( tag: 'ch_subtract_params_with_siglist' )
+      .set { ch_subtract_params_with_siglist }
+
+  ch_subtract_params_with_sigs
+    // Reorder so molecule (it[0]) and ksize (it[2]) are first
+    .map{ [it[0], it[2], it[1], it[3]] }
+    .dump ( tag: 'ch_subtract_params_with_sigs__map' )
+    .combine( ch_subtract_params_with_siglist,  by: [0, 1] )
+    .dump( tag: 'ch_sigs_with_houskeeping_sig_to_subtract' )
+    .set { ch_sigs_with_houskeeping_sig_to_subtract }
+
 
   // ///////////////////////////////////////////////////////////////////////////////
   // ///////////////////////////////////////////////////////////////////////////////
@@ -1617,28 +1683,38 @@ if (!params.skip_remove_housekeeping_genes) {
   // /*
   // * STEP 9 - Remove housekeeping gene k-mers from single cells
   // */
-  // process subtract_houskeeping_kmers {
-  //   tag "${refseq_taxonomy}"
-  //   label "process_low"
-  //   publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
-
-  //   input:
-  //   set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
-
-  //   output:
-  //   set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta)
-
-  //   script:
-  //   output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
-  //   output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
-  //   """
-  //   filter_fasta_regex.py \\
-  //       --input-fasta ${fasta} \\
-  //       --output-fasta ${output_fasta} \\
-  //       --regex-pattern '${params.housekeeping_gene_regex}'
-  //   gzip -c ${output_fasta} > ${output_fasta_gz}
-  //   """
-  // }
+  process subtract_houskeeping_kmers {
+    tag "${subtract_id}"
+    label "process_medium"
+    publishDir "${params.outdir}/sketches_subtract_housekeeping_kmers/${subtract_id}", mode: 'copy'
+
+    input:
+    val sketch_value_parsed
+    val sketch_style_parsed
+    set val(molecule), val(ksize), file(housekeeping_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract
+
+    output:
+    set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed
+    
+    script:
+    subtract_id = "${molecule}__k-${ksize}"
+    sketch_value_flag = make_sketch_value_flag(
+        sketch_style_parsed[0], 
+        sketch_value_parsed[0]
+    )
+    track_abundance_flag = track_abundance ? '--track-abundance' : ''
+
+    """
+    subtract \\
+        ${track_abundance_flag} \\
+        ${sketch_value_flag} \\
+        --ksize ${ksize} \\
+        --encoding ${molecule} \\
+        --output subtracted/ \\
+        ${housekeeping_sig} \\
+        ${siglist}
+    """
+  }
 }
 
 
@@ -1692,14 +1768,6 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
 
   // ch_sourmash_sketches_to_compare = Channel.empty()
 
-  ch_peptide_molecules_for_compare
-    .combine( ch_ksizes_for_compare_peptide )
-    .set { ch_sourmash_compare_params_peptide }
-
-  Channel.from("dna")  
-    .combine( ch_ksizes_for_compare_nucleotide )
-    .mix ( ch_sourmash_compare_params_peptide )
-    .set { ch_sourmash_compare_params_both }
 
   ch_sourmash_sketches_merged = Channel.empty()
 
@@ -1713,7 +1781,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
     .transpose()
     .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
     // Perform cartesian product on the molecules with compare params
-    .combine( ch_sourmash_compare_params_both, by: 0)
+    .combine( ch_sourmash_params_for_compare, by: 0)
     .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
     .groupTuple(by: [0, 2])
     .dump(tag: 'ch_sourmash_sketches_to_compare' )

From 71695fc8f0aafc956a34f71cacf9a80df33f6691 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:03:00 -0800
Subject: [PATCH 19/43] Use mamba to install packages

---
 Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index e516001e..880a58aa 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,8 @@ LABEL authors="Olga Botvinnik" \
 
 # Install the conda environment
 COPY environment.yml /
-RUN conda env create --quiet -f /environment.yml && conda clean -a
+RUN conda install -c conda-forge mamba
+RUN mamba env create -f /environment.yml && mamba clean -a
 
 # Add conda installation dir to PATH (instead of doing 'conda activate')
 ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH

From ed266eb005e9716cbffe1c5a351d9fd240398969 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:03:21 -0800
Subject: [PATCH 20/43] Move Rust to conda-forge section

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 8650d4d2..d6956e37 100644
--- a/environment.yml
+++ b/environment.yml
@@ -14,6 +14,7 @@ dependencies:
   - conda-forge::tqdm=4.43.0
   - conda-forge::gxx_linux-64=7.3.0
   - conda-forge::s3fs=0.4.2
+  - conda-forge::rust=1.48.0
   - bioconda::sourmash=3.5.0
   - bioconda::samtools=1.10
   - bioconda::screed=1.0.4
@@ -34,7 +35,6 @@ dependencies:
   - sphinx=2.3.1
   - jupyter=1.0.0
   - ripgrep=12.1.1
-  - conda-forge::rust=1.48.0
   - rsync=3.2.3
   - pip:
     - bam2fasta==1.0.8

From 87082ac67bce66501c80247a9cce498de9363678 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:03:30 -0800
Subject: [PATCH 21/43] Set sketch_scaled to 10 by default

---
 bin/scrape_software_versions.py               |  4 ++-
 conf/base.config                              |  1 +
 conf/test_download_refseq.config              |  2 +-
 ...test_housekeeping_from_filter_fasta.config | 33 +++++++++++++++++++
 conf/test_housekeeping_from_make_sig.config   | 33 +++++++++++++++++++
 .../test_housekeeping_from_premade_sig.config | 33 +++++++++++++++++++
 nextflow.config                               |  4 ++-
 siglist.txt                                   |  4 +++
 8 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 conf/test_housekeeping_from_filter_fasta.config
 create mode 100644 conf/test_housekeeping_from_make_sig.config
 create mode 100644 conf/test_housekeeping_from_premade_sig.config
 create mode 100644 siglist.txt

diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 597a70f5..80f39c09 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -17,6 +17,7 @@
     "Rsync": ["v_rsync.txt", r"rsync  version (\S+)"],
     "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"],
     "orpheum": ["v_orpheum.txt", r"Version: (\S+)"],
+    "Python": ["v_python.txt", r"Python (\S+)"],
 }
 results = OrderedDict()
 results["nf-core/kmermaid"] = '<span style="color:#999999;">N/A</span>'
@@ -26,12 +27,13 @@
 results["bam2fasta"] = '<span style="color:#999999;">N/A</span>'
 results["fastp"] = '<span style="color:#999999;">N/A</span>'
 results["htslib"] = '<span style="color:#999999;">N/A</span>'
+results["orpheum"] = '<span style="color:#999999;">N/A</span>'
+results["Python"] = '<span style="color:#999999;">N/A</span>'
 results["Rsync"] = '<span style="color:#999999;">N/A</span>'
 results["Rsync (Protocol)"] = '<span style="color:#999999;">N/A</span>'
 results["Samtools"] = '<span style="color:#999999;">N/A</span>'
 results["SKA"] = '<span style="color:#999999;">N/A</span>'
 results["Sourmash"] = '<span style="color:#999999;">N/A</span>'
-results["orpheum"] = '<span style="color:#999999;">N/A</span>'
 
 # Search each file using its regex
 for k, v in regexes.items():
diff --git a/conf/base.config b/conf/base.config
index 07a2aa3b..01e0ffa3 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -54,6 +54,7 @@ process {
 
   withName: 'multiqc|get_software_versions' {
     memory = { check_max( 2.GB * task.attempt, 'memory' ) }
+    errorStrategy = "ignore"
     cache = false
   }
   withName: 'sourmash_compute_sketch_fastx_nucleotide|sourmash_compute_sketch_fastx_peptide' {
diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config
index 2d7b5486..3f624b84 100644
--- a/conf/test_download_refseq.config
+++ b/conf/test_download_refseq.config
@@ -15,7 +15,7 @@ params {
   max_memory = 6.GB
   max_time = 48.h
 
-  sketch_scaled = 2
+  // sketch_scaled = 2
   // Input data
   input_paths = [
     ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
diff --git a/conf/test_housekeeping_from_filter_fasta.config b/conf/test_housekeeping_from_filter_fasta.config
new file mode 100644
index 00000000..2d7b5486
--- /dev/null
+++ b/conf/test_housekeeping_from_filter_fasta.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+
+  sketch_scaled = 2
+  // Input data
+  input_paths = [
+    ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+    ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+    ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+  ]
+
+  // Download from the 'vertebrate_mammalian' refseq taxonomy subdirectory (refseq/release/vertebrate_mammalian/)
+  // test_mini_refseq_download limits the rsync to the first ('.1.') fasta file of that taxonomy
+  refseq_taxonomy = 'vertebrate_mammalian'
+  test_mini_refseq_download = true
+}
diff --git a/conf/test_housekeeping_from_make_sig.config b/conf/test_housekeeping_from_make_sig.config
new file mode 100644
index 00000000..2d7b5486
--- /dev/null
+++ b/conf/test_housekeeping_from_make_sig.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+
+  sketch_scaled = 2
+  // Input data
+  input_paths = [
+    ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+    ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+    ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+  ]
+
+  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
+  // Protein fasta is 453 B
+  refseq_taxonomy = 'vertebrate_mammalian'
+  test_mini_refseq_download = true
+}
diff --git a/conf/test_housekeeping_from_premade_sig.config b/conf/test_housekeeping_from_premade_sig.config
new file mode 100644
index 00000000..2d7b5486
--- /dev/null
+++ b/conf/test_housekeeping_from_premade_sig.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+
+  sketch_scaled = 2
+  // Input data
+  input_paths = [
+    ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+    ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+    ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+  ]
+
+  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
+  // Protein fasta is 453 B
+  refseq_taxonomy = 'vertebrate_mammalian'
+  test_mini_refseq_download = true
+}
diff --git a/nextflow.config b/nextflow.config
index 96496b34..41b72730 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -34,7 +34,7 @@ params {
   // Number of hashes from each sample
   sketch_num_hashes = false
   sketch_num_hashes_log2 = false
-  sketch_scaled = false
+  sketch_scaled = 10
   sketch_scaled_log2 = false
   skip_sig_merge = false
 
@@ -66,6 +66,8 @@ params {
   test_mini_refseq_download = false
   housekeeping_protein_fasta = false
   housekeeping_rna_fasta = false
+  housekeeping_protein_sig = false
+  housekeeping_rna_sig = false
 
   // ska options
   split_kmer = false
diff --git a/siglist.txt b/siglist.txt
new file mode 100644
index 00000000..43fc713a
--- /dev/null
+++ b/siglist.txt
@@ -0,0 +1,4 @@
+SRR4238351__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4238355__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4050380__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4050379__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig

From 76c32b10f0ae6c8a9e40afccc1b430219bad814b Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:14:05 -0800
Subject: [PATCH 22/43] reference_proteome_fasta --> translate_proteome_fasta

---
 main.nf         | 26 +++++++++++++-------------
 nextflow.config |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/main.nf b/main.nf
index a6c91209..147b5a3e 100644
--- a/main.nf
+++ b/main.nf
@@ -120,7 +120,7 @@ def helpMessage() {
                                     to new name, e.g. with channel or cell annotation label
 
     Translate RNA-seq reads into protein-coding sequences options:
-      --reference_proteome_fasta    Path to a well-curated fasta file of protein sequences. Used to filter for coding reads
+      --translate_proteome_fasta    Path to a well-curated fasta file of protein sequences. Used to filter for coding reads
       --translate_peptide_ksize     K-mer size to use for translating RNA into protein.
                                     Default: 9, which is good for 'protein'. If using dayhoff, suggest 15
       --translate_peptide_molecule  Which molecular encoding to use for translating. Default: "protein"
@@ -324,10 +324,10 @@ if (params.protein_fastas){
   ch_protein_fastas = Channel.empty()
 }
 
-if (params.reference_proteome_fasta) {
-Channel.fromPath(params.reference_proteome_fasta, checkIfExists: true)
-     .ifEmpty { exit 1, "Reference proteome file not found: ${params.reference_proteome_fasta}" }
-     .set{ ch_reference_proteome_fasta }
+if (params.translate_proteome_fasta) {
+Channel.fromPath(params.translate_proteome_fasta, checkIfExists: true)
+     .ifEmpty { exit 1, "Reference proteome file not found: ${params.translate_proteome_fasta}" }
+     .set{ ch_translate_proteome_fasta }
 }
 
 ////////////////////////////////////////////////////
@@ -620,10 +620,10 @@ if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_patte
 if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
 if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
 // Extract coding parameters
-if(params.reference_proteome_fasta) summary["Peptide fasta"] = params.reference_proteome_fasta
-if(params.reference_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize
-if(params.reference_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule
-if(params.reference_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize
+if(params.translate_proteome_fasta) summary["Peptide fasta"] = params.translate_proteome_fasta
+if(params.translate_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize
+if(params.translate_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule
+if(params.translate_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize
 // Resource information
 summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
 if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -770,7 +770,7 @@ if ( !params.split_kmer && have_sketch_value ) {
 
 
 
-if (params.reference_proteome_fasta){
+if (params.translate_proteome_fasta){
   process make_protein_index {
     tag "${peptides}__${bloom_id}"
     label "low_memory"
@@ -778,7 +778,7 @@ if (params.reference_proteome_fasta){
     publishDir "${params.outdir}/protein_index", mode: params.publish_dir_mode
 
     input:
-    file(peptides) from ch_reference_proteome_fasta
+    file(peptides) from ch_translate_proteome_fasta
     translate_peptide_ksize
     translate_peptide_molecule
 
@@ -1115,7 +1115,7 @@ if (params.subsample) {
   }
 
 
-  if (params.reference_proteome_fasta){
+  if (params.translate_proteome_fasta){
     process translate {
       tag "${sample_id}"
       label "low_memory_long"
@@ -1309,7 +1309,7 @@ if (!have_nucleotide_input) {
 }
 
 
-if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
+if (!params.skip_compute && (protein_input || params.translate_proteome_fasta)){
 
   process sourmash_compute_sketch_fastx_peptide {
     tag "${sig_id}"
diff --git a/nextflow.config b/nextflow.config
index 41b72730..bdf0200b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -51,7 +51,7 @@ params {
   translate_peptide_ksize = 8
   translate_peptide_molecule = 'protein'
   translate_jaccard_threshold = 0.05
-  reference_proteome_fasta = false
+  translate_proteome_fasta = false
   bloomfilter_tablesize = '1e8'
   // Saving the translate results for each dataset makes it take extra long
   // Recommended for debugging purposes only

From e4154cfe2d00d0b5c6ba26f03b5872f71eee5869 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:31:24 -0800
Subject: [PATCH 23/43] Use my branch of the rust sourmash remove code

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 880a58aa..2a407b08 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,7 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH
 RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml
 
 # Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures
-RUN git clone https://github.com/luizirber/2021-01-27-olga-remove-protein/ 
+RUN git clone https://github.com/olgabot/2021-01-27-olga-remove-protein.git@olgabot/mut-warning
 RUN cd 2021-01-27-olga-remove-protein  && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH

From 94d5f2c9183ed2824f9dbe99ac0026d50dc96ded Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 14:31:32 -0800
Subject: [PATCH 24/43] Add cmake to help with gcc building

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index d6956e37..093d9644 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,6 +7,7 @@ channels:
   - defaults
   - anaconda
 dependencies:
+  - conda-forge::cmake=3.19.6
   - conda-forge::python=3.7.3
   - conda-forge::markdown=3.1.1
   - conda-forge::pymdown-extensions=6.0

From c7d603da93f2359c3263964d0f799f40f5987195 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 15:00:12 -0800
Subject: [PATCH 25/43] Get housekeeping removal from sig, fasta to work

---
 main.nf         | 246 +++++++++++++++++++++++++++---------------------
 nextflow.config |   6 +-
 2 files changed, 145 insertions(+), 107 deletions(-)

diff --git a/main.nf b/main.nf
index 147b5a3e..10fae059 100644
--- a/main.nf
+++ b/main.nf
@@ -544,18 +544,45 @@ else {
 housekeeping_protein_fasta = params.housekeeping_protein_fasta
 housekeeping_rna_fasta = params.housekeeping_rna_fasta
 
-need_refseq_download = !housekeeping_protein_fasta && !housekeeping_rna_fasta
-
-ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta])
-ch_refseq_moltype_to_fasta
-    // filter if the second item, the fasta is false
-    .filter{ !it[1] }
+housekeeping_protein_sig = params.housekeeping_protein_sig
+housekeeping_rna_sig = params.housekeeping_rna_sig
+
+have_housekeeping_fastas = housekeeping_protein_fasta && housekeeping_rna_fasta
+have_housekeeping_sigs = housekeeping_protein_sig && housekeeping_rna_sig
+need_refseq_download = (!have_housekeeping_fastas) && (!have_housekeeping_sigs)
+
+if (have_housekeeping_fastas) {
+  Channel.from(
+    ["protein", file(housekeeping_protein_fasta)], 
+    ["rna", file(housekeeping_rna_fasta)])
+    .into { ch_housekeeping_fasta; ch_refseq_moltype_to_fasta }
+
+  ch_refseq_moltype_to_fasta
+    // Keep a moltype only if sketch molecules of that kind (peptide or nucleotide) were specified
+    .filter{ 
+      it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 
+    }
     // Take only the first item, the molecule type
     .map{ it[0] }
     .set{ ch_refseq_moltypes_to_download }
+}
+
+if (have_housekeeping_sigs) {
+  // Label the protein signature with the sourmash moltypes "protein,dayhoff" rather than the
+  // plain "protein" label used for the fastas, since that is what matches the sourmash outputs
+  ch_housekeeping_sig = Channel.from(
+    ["protein,dayhoff", file(housekeeping_protein_sig)], 
+    ["dna", file(housekeeping_rna_sig)]
+  )
+}
+
 
 // Parse refseq taxonomy group to download
-refseq_taxonomy = params.refseq_taxonomy
+housekeeping_refseq_taxonomy = params.housekeeping_refseq_taxonomy
+/////////////////////////////////////////////////////////////
+/* -- END: Parse Housekeeping K-mer removal parameters  -- */
+/////////////////////////////////////////////////////////////
+
 
 // Has the run name been specified by the user?
 //  this has the bonus effect of catching both -name and --name
@@ -619,11 +646,17 @@ if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags
 if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern
 if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
 if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
-// Extract coding parameters
-if(params.translate_proteome_fasta) summary["Peptide fasta"] = params.translate_proteome_fasta
-if(params.translate_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize
-if(params.translate_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule
-if(params.translate_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize
+// Orpheum Translate parameters
+if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta
+if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize
+if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule
+if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize
+// Housekeeping k-mer removal parameters
+if(params.housekeeping_protein_fasta) summary["Housekeping Peptide fasta"] = params.housekeeping_protein_fasta
+if(params.housekeeping_rna_fasta) summary["Housekeping RNA fasta"] = params.housekeeping_rna_fasta
+if(params.housekeeping_protein_sig) summary["Housekeping Peptide K-mer Signature"] = params.housekeeping_protein_sig
+if(params.housekeeping_rna_sig) summary["Housekeping RNA K-mer Signature"] = params.housekeeping_rna_sig
+if(need_refseq_download) summary["Housekeeping Refseq Taxonomy"] = params.housekeeping_refseq_taxonomy
 // Resource information
 summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
 if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -1503,7 +1536,7 @@ if (!params.skip_remove_housekeeping_genes) {
   if (need_refseq_download){
     // No fastas provided for removing housekeeping genes
     process download_refseq {
-      tag "${refseq_taxonomy}--${refseq_moltype}"
+      tag "${housekeeping_refseq_taxonomy}--${refseq_moltype}"
       label "process_low"
       publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy'
 
@@ -1511,11 +1544,11 @@ if (!params.skip_remove_housekeeping_genes) {
       val refseq_moltype from ch_refseq_moltypes_to_download
 
       output:
-      set val(refseq_moltype), file("${refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
+      set val(refseq_moltype), file("${housekeeping_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
 
       script:
-      output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
-      include_fasta = params.test_mini_refseq_download ? "${refseq_taxonomy}.1.${refseq_moltype}.f*a.gz"  : "*${refseq_moltype}.f*a.gz" 
+      output_fasta = "${housekeeping_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
+      include_fasta = params.test_mini_refseq_download ? "${housekeeping_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz"  : "*${refseq_moltype}.f*a.gz" 
       """
       rsync \\
             --prune-empty-dirs \\
@@ -1524,111 +1557,114 @@ if (!params.skip_remove_housekeeping_genes) {
             --recursive \\
             --include '${include_fasta}' \\
             --exclude '/*' \\
-            rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ .
+            rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${housekeeping_refseq_taxonomy}/ .
       wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
       DATE=\$(date +'%Y-%m-%d')
       RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
-      gzcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
+      gzcat ${housekeeping_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
       """
     }
-  }
 
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  /* --                                                                     -- */
-  /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
-  /* --                                                                     -- */
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  /*
-  * STEP 7 - Get only housekeeping genes from 
-  */
-  // Keep genes whose names match housekeeping gene regular expression pattern
-  process extract_fasta_housekeeping {
-    tag "${fasta.baseName}"
-    label "process_low"
-    publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
+    ///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    /* --                                                                     -- */
+    /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+    /* --                                                                     -- */
+    ///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    /*
+    * STEP 7 - Get only housekeeping genes from the downloaded RefSeq fasta
+    */
+    // Keep genes whose names match housekeeping gene regular expression pattern
+    process extract_fasta_housekeeping {
+      tag "${fasta.baseName}"
+      label "process_low"
+      publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
-    input:
-    set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
+      input:
+      set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
 
-    output:
-    set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view
+      output:
+      set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view
 
-    script:
-    output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
-    output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
-    """
-    filter_fasta_regex.py \\
-        --input-fasta ${fasta} \\
-        --output-fasta ${output_fasta} \\
-        --regex-pattern '${params.housekeeping_gene_regex}'
-    gzip -c ${output_fasta} > ${output_fasta_gz}
-    """
+      script:
+      output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
+      output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
+      """
+      filter_fasta_regex.py \\
+          --input-fasta ${fasta} \\
+          --output-fasta ${output_fasta} \\
+          --regex-pattern '${params.housekeeping_gene_regex}'
+      gzip -c ${output_fasta} > ${output_fasta_gz}
+      """
+    }
+    
+    ch_housekeeping_fasta_to_view
+      .dump( tag: 'ch_housekeeping_fasta' )
   }
-  
-  ch_housekeeping_fasta_to_view
-    .dump( tag: 'ch_housekeeping_fasta' )
-
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  /* --                                                                     -- */
-  /* --          COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE                  -- */
-  /* --                                                                     -- */
-  ///////////////////////////////////////////////////////////////////////////////
-  ///////////////////////////////////////////////////////////////////////////////
-  /*
-  * STEP 8 - Compute Housekeeping Gene K-mer Signature
-  */
-  // No fastas provided for removing housekeeping genes
-  process compute_housekeeping_kmer_sig {
-    tag "${fasta.baseName}"
-    label "process_low"
-    publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
-    input:
-    val track_abundance
-    val sketch_value_parsed
-    val sketch_style_parsed
-    set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta
+  if (!have_housekeeping_sigs) {
+      ///////////////////////////////////////////////////////////////////////////////
+      ///////////////////////////////////////////////////////////////////////////////
+      /* --                                                                     -- */
+      /* --          COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE                  -- */
+      /* --                                                                     -- */
+      ///////////////////////////////////////////////////////////////////////////////
+      ///////////////////////////////////////////////////////////////////////////////
+      /*
+      * STEP 8 - Compute Housekeeping Gene K-mer Signature
+      */
+      // No signatures provided for removing housekeeping genes, so compute them from the fastas
+      process compute_housekeeping_kmer_sig {
+        tag "${fasta.baseName}"
+        label "process_low"
+        publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
 
-    output:
-    set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig
+        input:
+        val track_abundance
+        val sketch_value_parsed
+        val sketch_style_parsed
+        set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta
 
-    script:
-    is_protein = refseq_moltype == "protein"
-    sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna'
-    sourmash_moltypes = tuple(sourmash_moltype.split(","))
-    sketch_id = make_sketch_id(
-      sourmash_moltype, 
-      params.ksizes, 
-      sketch_value_parsed[0], 
-      track_abundance, 
-      sketch_style_parsed[0]
-    )
+        output:
+        set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig
 
-    sketch_value_flag = make_sketch_value_flag(
-      sketch_style_parsed[0], 
-      sketch_value_parsed[0]
-    )
-    moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna'
-    track_abundance_flag = track_abundance ? '--track-abundance' : ''
-    sig_id = "${fasta.baseName}__${sketch_id}"
-    sig = "${sig_id}.sig"
-    csv = "${sig_id}.csv"
-    """
-    sourmash compute \\
-      ${sketch_value_flag} \\
-      --ksizes ${params.ksizes} \\
-      ${moltype_flags} \\
-      ${track_abundance_flag} \\
-      --output ${sig} \\
-      --name '${fasta.baseName}' \\
-      ${fasta}
-    sourmash sig describe --csv ${csv} ${sig}
-    """
+        script:
+        is_protein = refseq_moltype == "protein"
+        sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna'
+        sourmash_moltypes = tuple(sourmash_moltype.split(","))
+        sketch_id = make_sketch_id(
+          sourmash_moltype, 
+          params.ksizes, 
+          sketch_value_parsed[0], 
+          track_abundance, 
+          sketch_style_parsed[0]
+        )
+
+        sketch_value_flag = make_sketch_value_flag(
+          sketch_style_parsed[0], 
+          sketch_value_parsed[0]
+        )
+        moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna'
+        track_abundance_flag = track_abundance ? '--track-abundance' : ''
+        sig_id = "${fasta.baseName}__${sketch_id}"
+        sig = "${sig_id}.sig"
+        csv = "${sig_id}.csv"
+        """
+        sourmash compute \\
+          ${sketch_value_flag} \\
+          --ksizes ${params.ksizes} \\
+          ${moltype_flags} \\
+          ${track_abundance_flag} \\
+          --output ${sig} \\
+          --name '${fasta.baseName}' \\
+          ${fasta}
+        sourmash sig describe --csv ${csv} ${sig}
+        """
+      }
   }
 
+
   ch_sourmash_sketches_merged
     // index 2: moltypes
     // index 4: signature
diff --git a/nextflow.config b/nextflow.config
index bdf0200b..780e4b04 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -61,7 +61,7 @@ params {
   // Housekeeping gene k-mer removal
   skip_remove_housekeeping_genes = false
   housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
-  refseq_taxonomy = 'vertebrate_mammalian'
+  housekeeping_refseq_taxonomy = 'vertebrate_mammalian'
   // For testing purposes --> use a small refseq dataset
   test_mini_refseq_download = false
   housekeeping_protein_fasta = false
@@ -152,11 +152,13 @@ profiles {
     podman.enabled = true
   }
   test { includeConfig 'conf/test.config' }
-  test_download_refseq { includeConfig 'conf/test_download_refseq.config' }
   test_full { includeConfig 'conf/test_full.config' }
   test_ska { includeConfig 'conf/test_ska.config' }
   test_bam { includeConfig 'conf/test_bam.config' }
   test_fastas { includeConfig 'conf/test_fastas.config' }
+  test_housekeeping_from_download_refseq { includeConfig 'conf/test_housekeeping_from_download_refseq.config' }
+  test_housekeeping_from_fasta { includeConfig 'conf/test_housekeeping_from_fasta.config' }
+  test_housekeeping_from_sig { includeConfig 'conf/test_housekeeping_from_sig.config' }
   test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' }
   test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' }
   test_sig_merge { includeConfig 'conf/test_sig_merge.config' }

From b65ebcdce7c3328f736b4147776ad95485ed9682 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 15:03:20 -0800
Subject: [PATCH 26/43] Update vital gene tests

---
 .github/workflows/ci.yml                      |  4 ++-
 ..._housekeeping_from_download_refseq.config} |  1 -
 ...ig => test_housekeeping_from_fasta.config} |  9 +++--
 .../test_housekeeping_from_premade_sig.config | 33 -------------------
 ...nfig => test_housekeeping_from_sig.config} |  8 ++---
 5 files changed, 9 insertions(+), 46 deletions(-)
 rename conf/{test_housekeeping_from_filter_fasta.config => test_housekeeping_from_download_refseq.config} (98%)
 rename conf/{test_housekeeping_from_make_sig.config => test_housekeeping_from_fasta.config} (68%)
 delete mode 100644 conf/test_housekeeping_from_premade_sig.config
 rename conf/{test_download_refseq.config => test_housekeeping_from_sig.config} (70%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5e4b59a1..76b2cbeb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -72,8 +72,10 @@ jobs:
           - "test_bam --write_barcodes_meta_csv false"
           - "test_bam --barcodes_file false --rename_10x_barcodes false"
           - "test_bam --rename_10x_barcodes false"
-          - "test_download_refseq"
           - "test_fastas"
+          - "test_housekeeping_from_download_refseq"
+          - "test_housekeeping_from_fasta"
+          - "test_housekeeping_from_sig"
           - "test_protein_fastas"
           - "test_remove_ribo"
           - "test_sig_merge"
diff --git a/conf/test_housekeeping_from_filter_fasta.config b/conf/test_housekeeping_from_download_refseq.config
similarity index 98%
rename from conf/test_housekeeping_from_filter_fasta.config
rename to conf/test_housekeeping_from_download_refseq.config
index 2d7b5486..886a8424 100644
--- a/conf/test_housekeeping_from_filter_fasta.config
+++ b/conf/test_housekeeping_from_download_refseq.config
@@ -15,7 +15,6 @@ params {
   max_memory = 6.GB
   max_time = 48.h
 
-  sketch_scaled = 2
   // Input data
   input_paths = [
     ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
diff --git a/conf/test_housekeeping_from_make_sig.config b/conf/test_housekeeping_from_fasta.config
similarity index 68%
rename from conf/test_housekeeping_from_make_sig.config
rename to conf/test_housekeeping_from_fasta.config
index 2d7b5486..2bf6ba6d 100644
--- a/conf/test_housekeeping_from_make_sig.config
+++ b/conf/test_housekeeping_from_fasta.config
@@ -15,7 +15,6 @@ params {
   max_memory = 6.GB
   max_time = 48.h
 
-  sketch_scaled = 2
   // Input data
   input_paths = [
     ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
@@ -25,9 +24,9 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
+  housekeeping_protein_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa.gz"
+  housekeeping_rna_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa.gz"
 
-  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
-  // Protein fasta is 453 B
-  refseq_taxonomy = 'vertebrate_mammalian'
-  test_mini_refseq_download = true
+  reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
+  bloomfilter_tablesize = '1e6'
 }
diff --git a/conf/test_housekeeping_from_premade_sig.config b/conf/test_housekeeping_from_premade_sig.config
deleted file mode 100644
index 2d7b5486..00000000
--- a/conf/test_housekeeping_from_premade_sig.config
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * -------------------------------------------------
- *  Nextflow config file for running tests
- * -------------------------------------------------
- * Defines bundled input files and everything required
- * to run a fast and simple test. Use as follows:
- *   nextflow run nf-core/kmermaid -profile test
- */
-
-params {
-  config_profile_name = 'Test profile'
-  config_profile_description = 'Minimal test dataset to check pipeline function'
-  // Limit resources so that this can run on Travis
-  max_cpus = 2
-  max_memory = 6.GB
-  max_time = 48.h
-
-  sketch_scaled = 2
-  // Input data
-  input_paths = [
-    ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
-                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
-    ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
-                    'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
-    ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
-    ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
-  ]
-
-  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
-  // Protein fasta is 453 B
-  refseq_taxonomy = 'vertebrate_mammalian'
-  test_mini_refseq_download = true
-}
diff --git a/conf/test_download_refseq.config b/conf/test_housekeeping_from_sig.config
similarity index 70%
rename from conf/test_download_refseq.config
rename to conf/test_housekeeping_from_sig.config
index 3f624b84..21ed7073 100644
--- a/conf/test_download_refseq.config
+++ b/conf/test_housekeeping_from_sig.config
@@ -15,7 +15,6 @@ params {
   max_memory = 6.GB
   max_time = 48.h
 
-  // sketch_scaled = 2
   // Input data
   input_paths = [
     ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
@@ -25,9 +24,6 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-
-  // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
-  // Protein fasta is 453 B
-  refseq_taxonomy = 'vertebrate_mammalian'
-  test_mini_refseq_download = true
+  housekeeping_protein_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  housekeeping_rna_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 }

From c22d7ee31e7c9b6fde2c32929caef359b535211d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 15:26:36 -0800
Subject: [PATCH 27/43] Soft link conda C libraries

---
 Dockerfile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a407b08..4958bb1e 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,8 +14,11 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH
 RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml
 
 # Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures
-RUN git clone https://github.com/olgabot/2021-01-27-olga-remove-protein.git@olgabot/mut-warning
-RUN cd 2021-01-27-olga-remove-protein  && cargo build --release 
+RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
+# Soft link all conda C-related libraries to their non-prefixed name 
+# for rust to be able to build the C libraries
+RUN  for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
+RUN cd 2021-01-27-olga-remove-protein && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH
 

From 253a04d8c2803828d1c37fe59722c851cb1a59d8 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Thu, 11 Mar 2021 16:45:01 -0800
Subject: [PATCH 28/43] housekeeping --> constitutive

---
 Dockerfile                                    |   2 +-
 ..._constitutive_from_download_refseq.config} |   0
 ...ig => test_constitutive_from_fasta.config} |   0
 ...nfig => test_constitutive_from_sig.config} |   4 +-
 main.nf                                       | 124 +++++++++---------
 nextflow.config                               |  22 ++--
 6 files changed, 76 insertions(+), 76 deletions(-)
 rename conf/{test_housekeeping_from_download_refseq.config => test_constitutive_from_download_refseq.config} (100%)
 rename conf/{test_housekeeping_from_fasta.config => test_constitutive_from_fasta.config} (100%)
 rename conf/{test_housekeeping_from_sig.config => test_constitutive_from_sig.config} (70%)

diff --git a/Dockerfile b/Dockerfile
index 4958bb1e..911e2f04 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de
 RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
 # Soft link all conda C-related libraries to their non-prefixed name 
 # for rust to be able to build the C libraries
-RUN  for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
+RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
 RUN cd 2021-01-27-olga-remove-protein && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH
diff --git a/conf/test_housekeeping_from_download_refseq.config b/conf/test_constitutive_from_download_refseq.config
similarity index 100%
rename from conf/test_housekeeping_from_download_refseq.config
rename to conf/test_constitutive_from_download_refseq.config
diff --git a/conf/test_housekeeping_from_fasta.config b/conf/test_constitutive_from_fasta.config
similarity index 100%
rename from conf/test_housekeeping_from_fasta.config
rename to conf/test_constitutive_from_fasta.config
diff --git a/conf/test_housekeeping_from_sig.config b/conf/test_constitutive_from_sig.config
similarity index 70%
rename from conf/test_housekeeping_from_sig.config
rename to conf/test_constitutive_from_sig.config
index 21ed7073..12a88fdb 100644
--- a/conf/test_housekeeping_from_sig.config
+++ b/conf/test_constitutive_from_sig.config
@@ -24,6 +24,6 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  housekeeping_protein_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  housekeeping_rna_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 }
diff --git a/main.nf b/main.nf
index 10fae059..992f95d8 100644
--- a/main.nf
+++ b/main.nf
@@ -539,23 +539,23 @@ else {
 
 
 //////////////////////////////////////////////////////////
-/* --  Parse Housekeeping K-mer removal parameters  -- */
+/* --  Parse constitutive K-mer removal parameters  -- */
 /////////////////////////////////////////////////////////
-housekeeping_protein_fasta = params.housekeeping_protein_fasta
-housekeeping_rna_fasta = params.housekeeping_rna_fasta
+constitutive_protein_fasta = params.constitutive_protein_fasta
+constitutive_rna_fasta = params.constitutive_rna_fasta
 
-housekeeping_protein_sig = params.housekeeping_protein_sig
-housekeeping_rna_sig = params.housekeeping_rna_sig
+constitutive_protein_sig = params.constitutive_protein_sig
+constitutive_rna_sig = params.constitutive_rna_sig
 
-have_housekeeping_fastas = housekeeping_protein_fasta && housekeeping_rna_fasta
-have_housekeeping_sigs = housekeeping_protein_sig && housekeeping_rna_sig
-need_refseq_download = (!have_housekeeping_fastas) && (!have_housekeeping_sigs)
+have_constitutive_fastas = constitutive_protein_fasta && constitutive_rna_fasta
+have_constitutive_sigs = constitutive_protein_sig && constitutive_rna_sig
+need_refseq_download = (!have_constitutive_fastas) && (!have_constitutive_sigs)
 
-if (have_housekeeping_fastas) {
+if (have_constitutive_fastas) {
   Channel.from(
-    ["protein", file(housekeeping_protein_fasta)], 
-    ["rna", file(housekeeping_rna_fasta)])
-    .into { ch_housekeeping_fasta; ch_refseq_moltype_to_fasta }
+    ["protein", file(constitutive_protein_fasta)], 
+    ["rna", file(constitutive_rna_fasta)])
+    .into { ch_constitutive_fasta; ch_refseq_moltype_to_fasta }
 
   ch_refseq_moltype_to_fasta
     // Check if protein molecules were even specified 
@@ -567,20 +567,20 @@ if (have_housekeeping_fastas) {
     .set{ ch_refseq_moltypes_to_download }
 }
 
-if (have_housekeeping_sigs) {
+if (have_constitutive_sigs) {
   // Use sourmash moltypes of "protein,dayhoff" instead of the original protein
   // as used for the fastas as that's what matches the sourmash outputs
-  ch_housekeeping_sig = Channel.from(
-    ["protein,dayhoff", file(housekeeping_protein_sig)], 
-    ["dna", file(housekeeping_rna_sig)]
+  ch_constitutive_sig = Channel.from(
+    ["protein,dayhoff", file(constitutive_protein_sig)], 
+    ["dna", file(constitutive_rna_sig)]
   )
 }
 
 
 // Parse refseq taxonomy group to download
-housekeeping_refseq_taxonomy = params.housekeeping_refseq_taxonomy
+constitutive_refseq_taxonomy = params.constitutive_refseq_taxonomy
 /////////////////////////////////////////////////////////////
-/* -- END: Parse Housekeeping K-mer removal parameters  -- */
+/* -- END: Parse constitutive K-mer removal parameters  -- */
 /////////////////////////////////////////////////////////////
 
 
@@ -651,12 +651,12 @@ if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] =
 if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize
 if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule
 if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize
-// Housekeeping k-mer removal paramters
-if(params.housekeeping_protein_fasta) summary["Housekeping Peptide fasta"] = params.housekeeping_protein_fasta
-if(params.housekeeping_rna_fasta) summary["Housekeping RNA fasta"] = params.housekeeping_rna_fasta
-if(params.housekeeping_protein_sig) summary["Housekeping Peptide K-mer Signature"] = params.housekeeping_protein_sig
-if(params.housekeeping_rna_sig) summary["Housekeping RNA K-mer Signature"] = params.housekeeping_rna_sig
-if(need_refseq_download) summary["Housekeeping Refseq Taxonomy"] = params.housekeeping_refseq_taxonomy
+// Constitutive k-mer removal parameters
+if(params.constitutive_protein_fasta) summary["Constitutive Peptide fasta"] = params.constitutive_protein_fasta
+if(params.constitutive_rna_fasta) summary["Constitutive RNA fasta"] = params.constitutive_rna_fasta
+if(params.constitutive_protein_sig) summary["Constitutive Peptide K-mer Signature"] = params.constitutive_protein_sig
+if(params.constitutive_rna_sig) summary["Constitutive RNA K-mer Signature"] = params.constitutive_rna_sig
+if(need_refseq_download) summary["Constitutive GBenes' Refseq Taxonomy"] = params.constitutive_refseq_taxonomy
 // Resource information
 summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
 if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -1512,11 +1512,11 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
 }
 
 
-if (!params.skip_remove_housekeeping_genes) {
+if (!params.skip_remove_constitutive_genes) {
   ///////////////////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////////////////
   /* --                                                                     -- */
-  /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+  /* --              REMOVE K-MERS FROM CONSTITUTIVE GENES                  -- */
   /* --                                                                     -- */
   ///////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////// 
@@ -1534,9 +1534,9 @@ if (!params.skip_remove_housekeeping_genes) {
   * STEP 6 - rsync to download refeseq
   */
   if (need_refseq_download){
-    // No fastas provided for removing housekeeping genes
+    // No fastas provided for removing constitutive genes
     process download_refseq {
-      tag "${housekeeping_refseq_taxonomy}--${refseq_moltype}"
+      tag "${constitutive_refseq_taxonomy}--${refseq_moltype}"
       label "process_low"
       publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy'
 
@@ -1544,11 +1544,11 @@ if (!params.skip_remove_housekeeping_genes) {
       val refseq_moltype from ch_refseq_moltypes_to_download
 
       output:
-      set val(refseq_moltype), file("${housekeeping_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
+      set val(refseq_moltype), file("${constitutive_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
 
       script:
-      output_fasta = "${housekeeping_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
-      include_fasta = params.test_mini_refseq_download ? "${housekeeping_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz"  : "*${refseq_moltype}.f*a.gz" 
+      output_fasta = "${constitutive_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
+      include_fasta = params.test_mini_refseq_download ? "${constitutive_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz"  : "*${refseq_moltype}.f*a.gz" 
       """
       rsync \\
             --prune-empty-dirs \\
@@ -1557,77 +1557,77 @@ if (!params.skip_remove_housekeeping_genes) {
             --recursive \\
             --include '${include_fasta}' \\
             --exclude '/*' \\
-            rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${housekeeping_refseq_taxonomy}/ .
+            rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${constitutive_refseq_taxonomy}/ .
       wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
       DATE=\$(date +'%Y-%m-%d')
       RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
-      gzcat ${housekeeping_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
+      gzcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
       """
     }
 
     ///////////////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////////////
     /* --                                                                     -- */
-    /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+    /* --              REMOVE K-MERS FROM CONSTITUTIVE GENES                  -- */
     /* --                                                                     -- */
     ///////////////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////////////
     /*
-    * STEP 7 - Get only housekeeping genes from 
+    * STEP 7 - Get only constitutive genes from 
     */
-    // Keep genes whose names match housekeeping gene regular expression pattern
-    process extract_fasta_housekeeping {
+    // Keep genes whose names match constitutive gene regular expression pattern
+    process extract_fasta_constitutive {
       tag "${fasta.baseName}"
       label "process_low"
-      publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
+      publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy'
 
       input:
       set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
 
       output:
-      set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view
+      set val(refseq_moltype), file(output_fasta_gz) into ch_constitutive_fasta, ch_constitutive_fasta_to_view
 
       script:
-      output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa"
-      output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz"
+      output_fasta = "${fasta.baseName}__only_constitutive_genes.fa"
+      output_fasta_gz = "${fasta.baseName}__only_constitutive_genes.fa.gz"
       """
       filter_fasta_regex.py \\
           --input-fasta ${fasta} \\
           --output-fasta ${output_fasta} \\
-          --regex-pattern '${params.housekeeping_gene_regex}'
+          --regex-pattern '${params.constitutive_gene_regex}'
       gzip -c ${output_fasta} > ${output_fasta_gz}
       """
     }
     
-    ch_housekeeping_fasta_to_view
-      .dump( tag: 'ch_housekeeping_fasta' )
+    ch_constitutive_fasta_to_view
+      .dump( tag: 'ch_constitutive_fasta' )
   }
 
-  if (!have_housekeeping_sigs) {
+  if (!have_constitutive_sigs) {
       ///////////////////////////////////////////////////////////////////////////////
       ///////////////////////////////////////////////////////////////////////////////
       /* --                                                                     -- */
-      /* --          COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE                  -- */
+      /* --          COMPUTE CONSTITUTIVE GENE K-MER SIGNATURE                  -- */
       /* --                                                                     -- */
       ///////////////////////////////////////////////////////////////////////////////
       ///////////////////////////////////////////////////////////////////////////////
       /*
-      * STEP 8 - Compute Housekeeping Gene K-mer Signature
+      * STEP 8 - Compute constitutive Gene K-mer Signature
       */
-      // No fastas provided for removing housekeeping genes
-      process compute_housekeeping_kmer_sig {
+      // No fastas provided for removing constitutive genes
+      process compute_constitutive_kmer_sig {
         tag "${fasta.baseName}"
         label "process_low"
-        publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy'
+        publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy'
 
         input:
         val track_abundance
         val sketch_value_parsed
         val sketch_style_parsed
-        set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta
+        set val(refseq_moltype), file(fasta) from ch_constitutive_fasta
 
         output:
-        set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig
+        set val(sourmash_moltypes), file(sig) into ch_constitutive_sig
 
         script:
         is_protein = refseq_moltype == "protein"
@@ -1675,14 +1675,14 @@ if (!params.skip_remove_housekeeping_genes) {
     .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' )
     .set { ch_sourmash_sketches_moltype_to_sigs }
 
-  ch_housekeeping_sig
-    .dump( tag: 'ch_housekeeping_sig' )
+  ch_constitutive_sig
+    .dump( tag: 'ch_constitutive_sig' )
     .transpose()
-    .dump( tag: 'ch_housekeeping_sig__transposed' )
+    .dump( tag: 'ch_constitutive_sig__transposed' )
     .combine( ch_sourmash_params_for_subtract, by: 0)
-    .dump( tag: 'ch_housekeeping_sig__transposed__combined' )
+    .dump( tag: 'ch_constitutive_sig__transposed__combined' )
     .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 )
-    .dump( tag: 'ch_housekeeping_sig__transposed__combined_joined' )
+    .dump( tag: 'ch_constitutive_sig__transposed__combined_joined' )
     .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist }
 
   ch_subtract_params_to_sigs_for_siglist
@@ -1712,22 +1712,22 @@ if (!params.skip_remove_housekeeping_genes) {
   // ///////////////////////////////////////////////////////////////////////////////
   // ///////////////////////////////////////////////////////////////////////////////
   // /* --                                                                     -- */
-  // /* --              REMOVE K-MERS FROM HOUSEKEEPING GENES                  -- */
+  // /* --              REMOVE K-MERS FROM CONSTITUTIVE GENES                  -- */
   // /* --                                                                     -- */
   // ///////////////////////////////////////////////////////////////////////////////
   // ///////////////////////////////////////////////////////////////////////////////
   // /*
-  // * STEP 9 - Remove housekeeping gene k-mers from single cells
+  // * STEP 9 - Remove constitutive gene k-mers from single cells
   // */
   process subtract_houskeeping_kmers {
     tag "${subtract_id}"
     label "process_medium"
-    publishDir "${params.outdir}/sketches_subtract_housekeeping_kmers/${subtract_id}", mode: 'copy'
+    publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy'
 
     input:
     val sketch_value_parsed
     val sketch_style_parsed
-    set val(molecule), val(ksize), file(housekeeping_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract
+    set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract
 
     output:
     set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed
@@ -1747,7 +1747,7 @@ if (!params.skip_remove_housekeeping_genes) {
         --ksize ${ksize} \\
         --encoding ${molecule} \\
         --output subtracted/ \\
-        ${housekeeping_sig} \\
+        ${constitutive_sig} \\
         ${siglist}
     """
   }
diff --git a/nextflow.config b/nextflow.config
index 780e4b04..ee47e75f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -58,16 +58,16 @@ params {
   save_translate_csv = false
   save_translate_json = false
 
-  // Housekeeping gene k-mer removal
-  skip_remove_housekeeping_genes = false
-  housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
-  housekeeping_refseq_taxonomy = 'vertebrate_mammalian'
+  // constitutive gene k-mer removal
+  skip_remove_constitutive_genes = false
+  constitutive_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
+  constitutive_refseq_taxonomy = 'vertebrate_mammalian'
   // For testing purposes --> use a small refseq dataset
   test_mini_refseq_download = false
-  housekeeping_protein_fasta = false
-  housekeeping_rna_fasta = false
-  housekeeping_protein_sig = false
-  housekeeping_rna_sig = false
+  constitutive_protein_fasta = false
+  constitutive_rna_fasta = false
+  constitutive_protein_sig = false
+  constitutive_rna_sig = false
 
   // ska options
   split_kmer = false
@@ -156,9 +156,9 @@ profiles {
   test_ska { includeConfig 'conf/test_ska.config' }
   test_bam { includeConfig 'conf/test_bam.config' }
   test_fastas { includeConfig 'conf/test_fastas.config' }
-  test_housekeeping_from_download_refseq { includeConfig 'conf/test_housekeeping_from_download_refseq.config' }
-  test_housekeeping_from_fasta { includeConfig 'conf/test_housekeeping_from_fasta.config' }
-  test_housekeeping_from_sig { includeConfig 'conf/test_housekeeping_from_sig.config' }
+  test_constitutive_from_download_refseq { includeConfig 'conf/test_constitutive_from_download_refseq.config' }
+  test_constitutive_from_fasta { includeConfig 'conf/test_constitutive_from_fasta.config' }
+  test_constitutive_from_sig { includeConfig 'conf/test_constitutive_from_sig.config' }
   test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' }
   test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' }
   test_sig_merge { includeConfig 'conf/test_sig_merge.config' }

From d9cbf42ddd7f7c723328fdb468e643f8578435cc Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 12:58:53 -0700
Subject: [PATCH 29/43] Add explicit path for conda bin

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 911e2f04..263e0713 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de
 RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
 # Soft link all conda C-related libraries to their non-prefixed name 
 # for rust to be able to build the C libraries
-RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
+RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
 RUN cd 2021-01-27-olga-remove-protein && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH

From ea742f3efe01cc483c9813899c06cf378d6be7cd Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 13:33:16 -0700
Subject: [PATCH 30/43] Actually do soft links

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 263e0713..4e914ee7 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de
 RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
 # Soft link all conda C-related libraries to their non-prefixed name 
 # for rust to be able to build the C libraries
-RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done
+RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done
 RUN cd 2021-01-27-olga-remove-protein && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH

From 8d73e3078398f87cdf64d5b25e1c64460bcf7dfe Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 13:40:37 -0700
Subject: [PATCH 31/43] Update whitespace to make dockerfile more readable

---
 Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 4e914ee7..a0cb55b8 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,7 +17,8 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de
 RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
 # Soft link all conda C-related libraries to their non-prefixed name 
 # for rust to be able to build the C libraries
-RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done
+RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); \
+      do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done
 RUN cd 2021-01-27-olga-remove-protein && cargo build --release 
 # Add "subtract" command to path
 ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH

From b4e91b819837804a15314fd3999da79aef41d503 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 15:03:45 -0700
Subject: [PATCH 32/43] Add separate creation of ch_refseq_moltypes_to_download

---
 main.nf | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/main.nf b/main.nf
index 992f95d8..32ddf86b 100644
--- a/main.nf
+++ b/main.nf
@@ -565,6 +565,13 @@ if (have_constitutive_fastas) {
     // Take only the first item, the molecule type
     .map{ it[0] }
     .set{ ch_refseq_moltypes_to_download }
+} else {
+  // Don't look at the fastas, only check the parsed molecule types
+  Channel.from(['protein', 'rna'])
+    .filter{ 
+      it == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 
+    }
+    .set{ ch_refseq_moltypes_to_download }
 }
 
 if (have_constitutive_sigs) {

From 403ab49d4d4152060c508115b486b66b10056a2d Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 15:04:00 -0700
Subject: [PATCH 33/43] Update tests to all use mini refseq data

---
 conf/test.config                | 1 +
 conf/test_bam.config            | 2 ++
 conf/test_fastas.config         | 2 ++
 conf/test_full.config           | 1 +
 conf/test_protein_fastas.config | 2 +-
 conf/test_remove_ribo.config    | 1 +
 conf/test_sig_merge.config      | 1 +
 conf/test_tenx_tgz.config       | 2 +-
 conf/test_translate.config      | 1 +
 conf/test_translate_bam.config  | 1 +
 10 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 4d4dced7..58f777a6 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,4 +29,5 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_bam.config b/conf/test_bam.config
index 8bcdb775..7de684af 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -28,4 +28,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 2
+  test_mini_refseq_download = true
+
 }
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index a6509d4e..16cdcfb6 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -26,4 +26,6 @@ params {
     ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']],
 
   ]
+  test_mini_refseq_download = true
+
 }
diff --git a/conf/test_full.config b/conf/test_full.config
index 5dfaeafb..d6db2819 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -18,4 +18,5 @@ params {
     ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
     ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']]
   ]
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index ea22bcb6..0d91f942 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -29,5 +29,5 @@ params {
   sketch_scaled = 2
   molecules = 'protein,dayhoff,hp'
   read_pairs = false
-
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index 8aa689ac..884b654a 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -31,4 +31,5 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 21a27939..5c761cf0 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -29,4 +29,5 @@ params {
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e6'
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index 39b9b2f0..62f6f3fa 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -29,5 +29,5 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 10
-  shard_size = 350
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_translate.config b/conf/test_translate.config
index c6e488a5..5208d60d 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -25,4 +25,5 @@ params {
   bloomfilter_tablesize = '1e8'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
+  test_mini_refseq_download = true
 }
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index 15365382..27194329 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -31,4 +31,5 @@ params {
   bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
+  test_mini_refseq_download = true
 }

From 9a1af8ad14256c5fd6b8a64aa82b1025019ad6ee Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 15:14:32 -0700
Subject: [PATCH 34/43] Pipeline is running!

---
 conf/test.config                |  3 +++
 conf/test_bam.config            |  3 ++-
 conf/test_fastas.config         |  4 ++--
 conf/test_full.config           |  1 -
 conf/test_protein_fastas.config |  4 +++-
 conf/test_remove_ribo.config    |  4 +++-
 conf/test_sig_merge.config      |  4 +++-
 conf/test_tenx_tgz.config       |  3 ++-
 conf/test_translate.config      |  6 +++++-
 conf/test_translate_bam.config  |  4 +++-
 main.nf                         | 11 ++++++++++-
 11 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 58f777a6..92549b30 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,5 +29,8 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
+  // Remove constitutively expressed genes
   test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  // constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 }
diff --git a/conf/test_bam.config b/conf/test_bam.config
index 7de684af..1f6dff7a 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -28,6 +28,7 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 2
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 
 }
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index 16cdcfb6..34ea3dbc 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -26,6 +26,6 @@ params {
     ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']],
 
   ]
-  test_mini_refseq_download = true
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 }
diff --git a/conf/test_full.config b/conf/test_full.config
index d6db2819..5dfaeafb 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -18,5 +18,4 @@ params {
     ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
     ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']]
   ]
-  test_mini_refseq_download = true
 }
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index 0d91f942..bd3d28ea 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -29,5 +29,7 @@ params {
   sketch_scaled = 2
   molecules = 'protein,dayhoff,hp'
   read_pairs = false
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+
 }
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index 884b654a..40b320be 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -31,5 +31,7 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+
 }
diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 5c761cf0..42dbd539 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -29,5 +29,7 @@ params {
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e6'
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+
 }
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index 62f6f3fa..648c5f30 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -29,5 +29,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 10
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
 }
diff --git a/conf/test_translate.config b/conf/test_translate.config
index 5208d60d..ba3b3c09 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -25,5 +25,9 @@ params {
   bloomfilter_tablesize = '1e8'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
-  test_mini_refseq_download = true
+
+  // Remove constitutively expressed genes
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+
 }
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index 27194329..ae9d5c8b 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -31,5 +31,7 @@ params {
   bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
-  test_mini_refseq_download = true
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+
 }
diff --git a/main.nf b/main.nf
index 32ddf86b..ceed6dc9 100644
--- a/main.nf
+++ b/main.nf
@@ -581,6 +581,15 @@ if (have_constitutive_sigs) {
     ["protein,dayhoff", file(constitutive_protein_sig)], 
     ["dna", file(constitutive_rna_sig)]
   )
+
+  ch_refseq_moltype_to_fasta
+    // Check if protein molecules were even specified 
+    .filter{ 
+      it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 
+    }
+    // Take only the first item, the molecule type
+    .map{ it[0] }
+    .set{ ch_refseq_moltypes_to_download }
 }
 
 
@@ -1568,7 +1577,7 @@ if (!params.skip_remove_constitutive_genes) {
       wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
       DATE=\$(date +'%Y-%m-%d')
       RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
-      gzcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
+      zcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
       """
     }
 

From a565bd36462a90d095ccc01a9f6a77c0d92d6401 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 15:51:20 -0700
Subject: [PATCH 35/43] Get "sourmash compare" to run

---
 main.nf | 101 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 49 insertions(+), 52 deletions(-)

diff --git a/main.nf b/main.nf
index ceed6dc9..065020d3 100644
--- a/main.nf
+++ b/main.nf
@@ -446,6 +446,8 @@ Channel.from(params.ksizes?.toString().tokenize(','))
 molecules = params.molecules?.toString().tokenize(',')
 nucleotide_molecules = molecules.findAll { it == "dna" }
 peptide_molecules = molecules.findAll { it != "dna" }
+// have_protein_input = params.translate_proteome_fasta || params.protein_fastas || protein_input
+// peptide_molecules = 
 peptide_molecules_comma_separated = peptide_molecules.join(",")
 peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " )
 
@@ -1721,8 +1723,8 @@ if (!params.skip_remove_constitutive_genes) {
     .map{ [it[0], it[2], it[1], it[3]] }
     .dump ( tag: 'ch_subtract_params_with_sigs__map' )
     .combine( ch_subtract_params_with_siglist,  by: [0, 1] )
-    .dump( tag: 'ch_sigs_with_houskeeping_sig_to_subtract' )
-    .set { ch_sigs_with_houskeeping_sig_to_subtract }
+    .dump( tag: 'ch_sigs_with_constitutive_sig_to_subtract' )
+    .set { ch_sigs_with_constitutive_sig_to_subtract }
 
 
   // ///////////////////////////////////////////////////////////////////////////////
@@ -1735,7 +1737,7 @@ if (!params.skip_remove_constitutive_genes) {
   // /*
   // * STEP 9 - Remove constitutive gene k-mers from single cells
   // */
-  process subtract_houskeeping_kmers {
+  process subtract_constitutive_kmers {
     tag "${subtract_id}"
     label "process_medium"
     publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy'
@@ -1743,10 +1745,10 @@ if (!params.skip_remove_constitutive_genes) {
     input:
     val sketch_value_parsed
     val sketch_style_parsed
-    set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract
+    set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_constitutive_sig_to_subtract
 
     output:
-    set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed
+    set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_constitutive_removed
     
     script:
     subtract_id = "${molecule}__k-${ksize}"
@@ -1767,6 +1769,29 @@ if (!params.skip_remove_constitutive_genes) {
         ${siglist}
     """
   }
+
+  ch_sigs_constitutive_removed
+    // .groupTuple( by: [0, 1] )
+    .transpose( by: 2 )
+    .set{ ch_sourmash_sketches_to_compare }
+
+} else {
+  ch_sourmash_sketches_merged
+    .map { [tuple(it[2].split(",")), it[4]] }
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
+    .transpose()
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
+    // Perform cartesian product on the molecules with compare params
+    .combine( ch_sourmash_params_for_compare, by: 0)
+    .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
+    // .groupTuple(by: [0, 2])
+    .dump(tag: 'ch_sourmash_sketches_to_compare' )
+    // Reorder so signature files are last
+    // moltype, ksize, signature file
+    .map { [it[0], it[2], it[1]] }
+    .set { ch_sourmash_sketches_to_compare }
+
+    ch_sourmash_sig_describe_merged = Channel.empty()
 }
 
 
@@ -1793,51 +1818,23 @@ if (params.split_kmer){
 }
 // If skip_compute is true, skip compare must be specified as true as well
 if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
-  // // Combine peptide and nucleotide sketches
-  // sourmash_sketches_nucleotide
-  //   .collect()
-  //   // Set as a list so that combine does cartesian product of all signatures
-  //   .map { it -> [it] }
-  //   .combine( ch_ksizes_for_compare_nucleotide )
-  //   .dump( tag: 'sourmash_sketches_nucleotide__ksizes' )
-  //   .map { x -> [x[0], x[1], 'dna'] }
-  //   .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' )
-  //   .set { sourmash_sketches_nucleotide_for_compare }
-
-  // sourmash_sketches_peptide
-  //   .collect()
-  //   // Set as a list so that combine does cartesian product of all signatures
-  //   .map { it -> [it] }
-  //   .combine( ch_ksizes_for_compare_petide )
-  //   .dump( tag: 'sourmash_sketches_peptide__ksizes' )
-  //   .combine( ch_peptide_molecules )
-  //   .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' )
-  //   .set { sourmash_sketches_peptide_for_compare }
-
-  // sourmash_sketches_peptide_for_compare
-  //   .mix ( sourmash_sketches_nucleotide_for_compare )
-  //   .set { ch_sourmash_sketches_to_compare }
-
-  // ch_sourmash_sketches_to_compare = Channel.empty()
-
-
-  ch_sourmash_sketches_merged = Channel.empty()
+  ch_sourmash_compare_sketch_params_to_sketches = Channel.create()
 
-  ch_sourmash_sketches_merged
-    // Drop first index (index 0) which is the cell id
-    // Drop the second index (index 1) which is the sketch id
-    // Keep only moltype
-    // Drop ksize
-    .map { [tuple(it[2].split(",")), it[4]] }
-    .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
-    .transpose()
-    .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
-    // Perform cartesian product on the molecules with compare params
-    .combine( ch_sourmash_params_for_compare, by: 0)
-    .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
-    .groupTuple(by: [0, 2])
-    .dump(tag: 'ch_sourmash_sketches_to_compare' )
-    .set { ch_sourmash_sketches_to_compare }
+  ch_sourmash_sketches_to_compare
+    .tap ( ch_sourmash_compare_sketch_params_to_sketches )
+    .dump( tag: 'ch_compare_params_to_sigs_for_siglist__transpose' )
+    .collectFile() { it -> 
+      [ "${it[0]}__${it[1]}.txt", "${it[2].getFileName()}\n"] 
+    }
+    .dump ( tag: 'ch_compare_params_to_sigs_for_siglist__transpose__collectfile' )
+    .map { [ tuple( it.baseName.split('__') ), it] }
+    .map { [ it[0][0], it[0][1], it[1] ] }
+    .dump ( tag: 'ch_compare_params_with_siglist' )
+    .combine( ch_sourmash_compare_sketch_params_to_sketches,  by: [0, 1] )
+    .dump( tag: 'ch_compare_params_with_siglist__add_sketches' )
+    .groupTuple( by: [0, 1, 2] )
+    .dump ( tag: 'ch_compare_params_with_siglist__add_sketches__groupTuple' )
+    .set { ch_sourmash_params_to_siglist_sketches }
 
   process sourmash_compare_sketches {
     // Combine peptide and nucleotide sketches
@@ -1845,8 +1842,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
     publishDir "${params.outdir}/compare_sketches", mode: 'copy'
 
     input:
-    // Weird order but that's how it shakes out with the groupTuple
-    set val(molecule), file("*.sig"), val(ksize) from ch_sourmash_sketches_to_compare
+    // file(sigs) is necessary to stage all the signature files present in file(siglist)
+    set val(molecule), val(ksize), file(siglist), file(sigs) from ch_sourmash_params_to_siglist_sketches
 
     output:
     file(csv)
@@ -1861,7 +1858,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
           --${molecule} \\
           --csv ${csv} \\
           ${processes} \\
-          --traverse-directory .
+          --from-file ${siglist}
     # Use --traverse-directory instead of all the files explicitly to avoid
     # "too many arguments" error for bash when there are lots of samples
     """

From f0337b87601ffe48272ea7adf76ae4f09e281a23 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Mon, 15 Mar 2021 15:56:51 -0700
Subject: [PATCH 36/43] Update constitutive rna sig for all configs

---
 conf/test.config                | 5 ++---
 conf/test_bam.config            | 6 ++----
 conf/test_fastas.config         | 5 ++---
 conf/test_full.config           | 1 -
 conf/test_protein_fastas.config | 6 ++----
 conf/test_remove_ribo.config    | 6 ++----
 conf/test_sig_merge.config      | 6 ++----
 conf/test_tenx_tgz.config       | 5 ++---
 conf/test_translate.config      | 6 ++----
 conf/test_translate_bam.config  | 6 ++----
 10 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 92549b30..f6b25ac9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -17,7 +17,6 @@ params {
   // Input data
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
@@ -31,6 +30,6 @@ params {
   ]
   // Remove constitutively expressed genes
   test_mini_refseq_download = true
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  // constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_bam.config b/conf/test_bam.config
index 1f6dff7a..12106f56 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -19,7 +19,6 @@ params {
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
@@ -28,7 +27,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 2
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index 34ea3dbc..3e893067 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -17,7 +17,6 @@ params {
   // Input data
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
@@ -26,6 +25,6 @@ params {
     ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']],
 
   ]
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_full.config b/conf/test_full.config
index 5dfaeafb..7c0d46dc 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -12,7 +12,6 @@ params {
   config_profile_description = 'Full test dataset to check pipeline function'
 
   // Input data for full size test
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   input_paths = [
     ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index bd3d28ea..35d88c36 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -26,10 +26,8 @@ params {
       ['https://github.com/czbiohub/test-datasets/raw/predictorthologs/testdata/bonobo_liver_ptprc__molecule-dayhoff__coding_reads_peptides.fasta']]]
 
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'protein,dayhoff,hp'
   read_pairs = false
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index 40b320be..722c42a7 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -17,7 +17,6 @@ params {
   // Input data
   // samples = 'testing/samples.csv'
   // fastas = 'testing/fastas/*.fasta'
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
@@ -31,7 +30,6 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 42dbd539..54d28fe8 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -18,7 +18,6 @@ params {
   bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
@@ -29,7 +28,6 @@ params {
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e6'
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index 648c5f30..292bfbf9 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -20,7 +20,6 @@ params {
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid-unaligned-tgz-v3/testdata/mouse_brown_fat_ptprc_plus_unaligned.tgz'
   ]
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
@@ -29,6 +28,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 10
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_translate.config b/conf/test_translate.config
index ba3b3c09..fa69416c 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -17,7 +17,6 @@ params {
   // Input data
   fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa"
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
 
@@ -27,7 +26,6 @@ params {
   translate_peptide_molecule = 'dayhoff'
 
   // Remove constitutively expressed genes
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index ae9d5c8b..db9a7164 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -18,7 +18,6 @@ params {
   bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
     'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
   // Sketch Parameters
-  sketch_scaled = 2
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
   save_fastas = "fastas"
@@ -31,7 +30,6 @@ params {
   bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }

From dbf97f9bad463429598aa6d7797d6cb22a4c34c8 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 10:18:29 -0700
Subject: [PATCH 37/43] Add test_bam alone

---
 .github/workflows/ci.yml |  1 +
 main.nf                  | 16 +++++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 76b2cbeb..14376386 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -66,6 +66,7 @@ jobs:
           - "test --sketch_scaled false --sketch_scaled_log2 2"
           - "test --sketch_scaled false --sketch_num_hashes 20"
           - "test --sketch_scaled false --sketch_num_hashes_log2 20"
+          - "test_bam"
           - "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false"
           - "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false"
           - "test_bam --skip_sig_merge"
diff --git a/main.nf b/main.nf
index 065020d3..ca704c05 100644
--- a/main.nf
+++ b/main.nf
@@ -579,15 +579,21 @@ if (have_constitutive_fastas) {
 if (have_constitutive_sigs) {
   // Use sourmash moltypes of "protein,dayhoff" instead of the original protein
   // as used for the fastas as that's what matches the sourmash outputs
-  ch_constitutive_sig = Channel.from(
+  Channel.from(
     ["protein,dayhoff", file(constitutive_protein_sig)], 
-    ["dna", file(constitutive_rna_sig)]
-  )
+    ["dna", file(constitutive_rna_sig)])
+    .set { ch_constitutive_sig }
 
-  ch_refseq_moltype_to_fasta
+  // Refseq molecule types are "protein" and "rna"
+  Channel.from(
+    ["protein", file(constitutive_protein_sig)], 
+    ["rna", file(constitutive_rna_sig)])
+    .into { ch_refseq_moltype_to_sig }
+
+  ch_refseq_moltype_to_sig
     // Check if protein molecules were even specified 
     .filter{ 
-      it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 
+      it[0]== "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 
     }
     // Take only the first item, the molecule type
     .map{ it[0] }

From 2ed90f8fa608bf5de28a41287d92784a90e75618 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 10:18:42 -0700
Subject: [PATCH 38/43] Update constitutive signatures

---
 conf/test.config                         | 4 ++--
 conf/test_bam.config                     | 4 ++--
 conf/test_constitutive_from_fasta.config | 4 ++--
 conf/test_constitutive_from_sig.config   | 4 ++--
 conf/test_fastas.config                  | 4 ++--
 conf/test_full.config                    | 3 +++
 conf/test_protein_fastas.config          | 4 ++--
 conf/test_remove_ribo.config             | 4 ++--
 conf/test_sig_merge.config               | 4 ++--
 conf/test_tenx_tgz.config                | 4 ++--
 conf/test_translate.config               | 4 ++--
 conf/test_translate_bam.config           | 4 ++--
 12 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index f6b25ac9..cf19689a 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -30,6 +30,6 @@ params {
   ]
   // Remove constitutively expressed genes
   test_mini_refseq_download = true
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_bam.config b/conf/test_bam.config
index 12106f56..07579d50 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -27,6 +27,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 2
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config
index 2bf6ba6d..8be0ba00 100644
--- a/conf/test_constitutive_from_fasta.config
+++ b/conf/test_constitutive_from_fasta.config
@@ -24,8 +24,8 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  housekeeping_protein_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa.gz"
-  housekeeping_rna_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa.gz"
+  constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz"
+  constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz"
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
   bloomfilter_tablesize = '1e6'
diff --git a/conf/test_constitutive_from_sig.config b/conf/test_constitutive_from_sig.config
index 12a88fdb..0e2bad4d 100644
--- a/conf/test_constitutive_from_sig.config
+++ b/conf/test_constitutive_from_sig.config
@@ -24,6 +24,6 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index 3e893067..b439e03c 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -25,6 +25,6 @@ params {
     ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']],
 
   ]
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_full.config b/conf/test_full.config
index 7c0d46dc..ac6e6677 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -17,4 +17,7 @@ params {
     ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
     ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']]
   ]
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+
 }
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index 35d88c36..91a2325d 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -28,6 +28,6 @@ params {
   // Sketch Parameters
   molecules = 'protein,dayhoff,hp'
   read_pairs = false
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index 722c42a7..72c2710b 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -30,6 +30,6 @@ params {
     ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
     ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
   ]
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 54d28fe8..ad821450 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -28,6 +28,6 @@ params {
 
   reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e6'
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index 292bfbf9..eaf23f7a 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -28,6 +28,6 @@ params {
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 10
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_translate.config b/conf/test_translate.config
index fa69416c..be799ae9 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -26,6 +26,6 @@ params {
   translate_peptide_molecule = 'dayhoff'
 
   // Remove constitutively expressed genes
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index db9a7164..cc6f1cae 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -30,6 +30,6 @@ params {
   bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
-  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
-  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+  constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
 }

From 6a694f0264c2dd52d180d3890c8def19730eb36e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 10:34:35 -0700
Subject: [PATCH 39/43] housekeeping --> constitutive

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 14376386..7c95ece8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,9 +74,9 @@ jobs:
           - "test_bam --barcodes_file false --rename_10x_barcodes false"
           - "test_bam --rename_10x_barcodes false"
           - "test_fastas"
-          - "test_housekeeping_from_download_refseq"
-          - "test_housekeeping_from_fasta"
-          - "test_housekeeping_from_sig"
+          - "test_constitutive_from_download_refseq"
+          - "test_constitutive_from_fasta"
+          - "test_constitutive_from_sig"
           - "test_protein_fastas"
           - "test_remove_ribo"
           - "test_sig_merge"

From ddaed1cad89074f8b47b7affe87ac111eac2d3e6 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 14:50:13 -0700
Subject: [PATCH 40/43] Reference proteome fasta --> translate_proteome_fasta

---
 conf/test_translate.config     | 2 +-
 conf/test_translate_bam.config | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/test_translate.config b/conf/test_translate.config
index be799ae9..c4a7bccd 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -20,7 +20,7 @@ params {
   molecules = 'dna,protein,dayhoff'
   read_pairs = false
 
-  reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
+  translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
   bloomfilter_tablesize = '1e8'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index cc6f1cae..4f8a5487 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -24,9 +24,9 @@ params {
   write_barcode_meta_csv = "metadata.csv"
   // For bam, each fasta record represents each barcode and each should have a signature
   // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
-  tenx_min_umi_per_cell = 5
+  tenx_min_umi_per_cell = 2
 
-  reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
+  translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
   bloomfilter_tablesize = '1e6'
   translate_peptide_ksize = '11'
   translate_peptide_molecule = 'dayhoff'

From e7ff62f2a48d6e6ecb1569e85fa72f6b030090e2 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 14:50:25 -0700
Subject: [PATCH 41/43] Move bam to input section

---
 conf/test_tenx_tgz.config |  1 -
 main.nf                   | 12 ++++++------
 nextflow.config           |  4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index eaf23f7a..10ae33ab 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -26,7 +26,6 @@ params {
   save_intermediate_files = "/tmp/"
   write_barcode_meta_csv = "metadata.csv"
   // For bam, each fasta record represents each barcode and each should have a signature
-  // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
   tenx_min_umi_per_cell = 10
   constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
   constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
diff --git a/main.nf b/main.nf
index ca704c05..0d4890a9 100644
--- a/main.nf
+++ b/main.nf
@@ -588,7 +588,7 @@ if (have_constitutive_sigs) {
   Channel.from(
     ["protein", file(constitutive_protein_sig)], 
     ["rna", file(constitutive_rna_sig)])
-    .into { ch_refseq_moltype_to_sig }
+    .set { ch_refseq_moltype_to_sig }
 
   ch_refseq_moltype_to_sig
     // Check if protein molecules were even specified 
@@ -665,11 +665,11 @@ if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)']      = params.
 if (params.sketch_scaled) summary['Sketch scaled']               = params.sketch_scaled
 if (params.sketch_scaled_log2) summary['Sketch scaled (log2)']   = params.sketch_scaled_log2
 // 10x parameters
-if(params.tenx_tgz) summary["10x .tgz"] = params.tenx_tgz
-if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags
-if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern
-if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
-if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
+if(params.tenx_tgz) summary["10x .tgz"] = params.tenx_tgz  // list the tarball only when one was supplied; the shared 10x settings below also apply to --bam
+if(params.tenx_tgz || params.bam) summary["10x SAM tags"] = params.tenx_tags
+if(params.tenx_tgz || params.bam) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern
+if(params.tenx_tgz || params.bam) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
+if(params.tenx_tgz || params.bam) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
 // Orpheum Translate parameters
 if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta
 if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize
diff --git a/nextflow.config b/nextflow.config
index ee47e75f..7bdc2902 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -16,7 +16,7 @@ params {
   fastas = false
   protein_fastas = false
   sra = false
-
+  bam = false
   input = false
 
   // Parsing 10x bam files
@@ -77,7 +77,7 @@ params {
   save_fastas = "fastas"
   tenx_min_umi_per_cell = '0'
   write_barcode_meta_csv = false
-  bam = false
+
 
   // 10x optional input parameters set using the below pattern
   // https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc

From f188f77cf5ebc5c043813e53d4d4540c116e8f4e Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 14:50:52 -0700
Subject: [PATCH 42/43] Rename reference_proteome_fasta to
 translate_proteome_fasta in test_constitutive_from_fasta

---
 conf/test_constitutive_from_fasta.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config
index 8be0ba00..ea757073 100644
--- a/conf/test_constitutive_from_fasta.config
+++ b/conf/test_constitutive_from_fasta.config
@@ -27,6 +27,6 @@ params {
   constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz"
   constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz"
 
-  reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
+  translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
   bloomfilter_tablesize = '1e6'
 }

From 793e9f199f1de956c9e7c2b1e753c789a605c103 Mon Sep 17 00:00:00 2001
From: Olga Botvinnik <olga.botvinnik@gmail.com>
Date: Tue, 16 Mar 2021 14:53:19 -0700
Subject: [PATCH 43/43] Disable fail-fast for the CI test matrix to see which
 individual tests are failing

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c95ece8..cdf23457 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -61,6 +61,7 @@ jobs:
       NXF_VER: '20.07.1'
       NXF_ANSI_LOG: false
     strategy:
+      fail-fast: false
       matrix:
         profile_flags:
           - "test --sketch_scaled false --sketch_scaled_log2 2"