From df31610c2fddd399813495aa9bea6385f9f9c076 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 28 Oct 2020 09:00:56 -0700 Subject: [PATCH 01/49] Update extract_per_cell_fastqs to not say __aligned__aligned and retry if run out of memory --- main.nf | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index b0066f0d..4dc0ad10 100644 --- a/main.nf +++ b/main.nf @@ -894,9 +894,9 @@ if (params.tenx_tgz || params.bam) { .set{ tenx_reads_with_good_barcodes_ch } process extract_per_cell_fastqs { - tag "${is_aligned_channel_id}__${cell_barcode}" + tag "${fastq_id}" label "low_memory" - errorStrategy 'ignore' + errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'ignore' } publishDir "${params.outdir}/10x-fastqs/per-cell/${channel_id}/", mode: 'copy', pattern: '*.fastq.gz', saveAs: { filename -> "${filename.replace("|", "-")}"} input: @@ -909,10 +909,8 @@ if (params.tenx_tgz || params.bam) { set val(fastq_id), val(cell_id), val(is_aligned) into ch_fastq_id_to_cell_id_is_aligned script: - is_aligned_channel_id = "${channel_id}__${is_aligned}" - processes = "--processes ${task.cpus}" this_cell_barcode = tenx_cell_barcode_pattern.replace('([ACGT]+)', cell_barcode) - fastq_id = "${is_aligned_channel_id}__${is_aligned}__${cell_barcode}" + fastq_id = "${channel_id}__${is_aligned}__${cell_barcode}" cell_id = "${channel_id}__${cell_barcode}" this_cell_fastq_gz = "${fastq_id}.fastq.gz" """ From 700272de1ce66e884cc681f09b2012dba4830cc5 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 28 Oct 2020 09:06:21 -0700 Subject: [PATCH 02/49] Initial commit for adding sourmash sig merge on aligned/unaligned from single cells --- main.nf | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 4dc0ad10..e0646685 100644 --- a/main.nf +++ b/main.nf @@ -1274,7 +1274,7 @@ if (!params.remove_ribo_rna) { output: file(csv) into ch_sourmash_sig_describe_nucleotides - set val(sketch_id), val("dna"), val(ksize), val(sketch_value), file(sig) into sourmash_sketches_all_nucleotide + set val(sample_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide script: // Don't calculate DNA signature if this is protein, to minimize disk, @@ -1299,7 +1299,7 @@ if (!params.remove_ribo_rna) { sourmash sig describe --csv ${csv} ${sig} """ } - sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[4].size() > 0 } + sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[3].size() > 0 } } } else { sourmash_sketches_nucleotide = Channel.empty() @@ -1342,7 +1342,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ output: file(csv) into ch_sourmash_sig_describe_peptides - set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches_all_peptide + set val(sample_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide script: sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) @@ -1367,11 +1367,68 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash sig describe --csv ${csv} ${sig} """ } - sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[4].size() > 0 } + sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[3].size() > 0 } } else { sourmash_sketches_peptide = Channel.empty() } +if (params.bam || params.tenx_tgz) { + // Merge signatures from same sample id and sketch id + + sourmash_sketches_nucleotide + .mix ( sourmash_sketches_peptide ) + .set { ch_sourmash_sketches_mixed} + + ch_fastq_id_to_cell_id_is_aligned + .combine ( ch_sourmash_sketches_mixed ) + .dump( tag: 'fastq_id_to_cells__join__sketches' ) + .groupTuple( by: 1 ) + .dump( tag: 'fastq_id_to_cells__join__sketches__grouptuple' ) + .set { ch_sourmash_sketches_to_merge } + + process sourmash_sig_merge { + tag "${sig_id}" + label "low_memory" + publishDir "${params.outdir}/sketches_merged/${sketch_id}", mode: "${params.publish_dir_mode}", + saveAs: {filename -> + if (filename.indexOf(".csv") > 0) "description/$filename" + else if (filename.indexOf(".sig") > 0) "sigs/$filename" + else null + } + + input: + set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketches_to_merge + + output: + file(csv) into ch_sourmash_sig_describe_merged + set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches + + script: + // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) + sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value) + track_abundance_flag = track_abundance ? '--track-abundance' : '' + processes = "--processes ${task.cpus}" + sig_id = "${sample_id}__${sketch_id}" + sig = "${sig_id}.sig" + csv = "${sig_id}.csv" + """ + sourmash compute \\ + ${sketch_value_flag} \\ + --ksizes $ksize \\ + --input-is-protein \\ + --$molecule \\ + --name '${sample_id}' \\ + --no-dna \\ + $processes \\ + $track_abundance_flag \\ + --output ${sig} \\ + $reads + sourmash sig describe --csv ${csv} ${sig} + """ + } + +} + if (params.split_kmer){ process ska_compare_sketches { tag "${sketch_id}" @@ -1395,7 +1452,6 @@ if (params.split_kmer){ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { process sourmash_compare_sketches { // Combine peptide and nucleotide sketches - sourmash_sketches = sourmash_sketches_peptide.concat(sourmash_sketches_nucleotide) tag "${sketch_id}" publishDir "${params.outdir}/compare_sketches", mode: 'copy' From c4f9521908e1bdf444050879b390d4a1e19e628d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 28 Oct 2020 09:06:50 -0700 Subject: [PATCH 03/49] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a2f13f7..11ea7608 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Initial release of nf-core/kmermaid, created with the [nf-core](http://nf-co.re/ barcode fastq * Add version printing for sencha, bam2fasta, and sourmash in Dockerfile, update versions in environment.yml * For processes translate, sourmash compute add cpus=1 as they are only serial ([#107](https://github.com/nf-core/kmermaid/pull/107)) +* Add `sourmash sig merge` for aligned/unaligned signatures ### `Fixed` From 7d3e21508957c6a4a5227dd98d413104303453e1 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 28 Oct 2020 09:16:05 -0700 Subject: [PATCH 04/49] Try to get grouptuple to work --- main.nf | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index e0646685..8732e37b 100644 --- a/main.nf +++ b/main.nf @@ -1274,7 +1274,7 @@ if (!params.remove_ribo_rna) { output: file(csv) into ch_sourmash_sig_describe_nucleotides - set val(sample_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide + set val(sample_id), val(sketch_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide script: // Don't calculate DNA signature if this is protein, to minimize disk, @@ -1342,7 +1342,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ output: file(csv) into ch_sourmash_sig_describe_peptides - set val(sample_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide + set val(sample_id), val(sketch_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide script: sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) @@ -1380,10 +1380,18 @@ if (params.bam || params.tenx_tgz) { .set { ch_sourmash_sketches_mixed} ch_fastq_id_to_cell_id_is_aligned - .combine ( ch_sourmash_sketches_mixed ) - .dump( tag: 'fastq_id_to_cells__join__sketches' ) - .groupTuple( by: 1 ) - .dump( tag: 'fastq_id_to_cells__join__sketches__grouptuple' ) + .combine ( ch_sourmash_sketches_mixed, on: 0 ) + .dump( tag: 'fastq_id_to_cells__combine__sketches' ) + // [DUMP: fastq_id_to_cells__join__sketches] [ + // mouse_lung__aligned__AAATGCCCAAACTGCT, mouse_lung__AAATGCCCAAACTGCT, 'aligned', + // mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT, + // molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false, 'dayhoff', '3', /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/7c/7c4f8e3e3f2db074502e754a3fcc29/mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false.sig] + // it[0]: fastq_id, e.g. "mouse_lung__aligned__AAATGCCCAAACTGCT" (contains aligned/unaligned) + // it[1]: cell_id, e.g. "mouse_lung__AAATGCCCAAACTGCT" + // it[2]: is_aligned, e.g. "aligned" + // it[3]: + .groupTuple( by: [1, 4] ) + .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) .set { ch_sourmash_sketches_to_merge } process sourmash_sig_merge { @@ -1405,9 +1413,6 @@ if (params.bam || params.tenx_tgz) { script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) - sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value) - track_abundance_flag = track_abundance ? '--track-abundance' : '' - processes = "--processes ${task.cpus}" sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" From 533dc94e7cf04b8745399f22d02df19edd866a6d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:05:22 -0700 Subject: [PATCH 05/49] Set minimum UMI per cell to be a default of 1000 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 99603843..8e5713a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,7 +26,7 @@ params { tenx_tags = "CB,XC,UB,XM,XB,RG,GN,GX,TX" tenx_cell_barcode_pattern = '(CB|XC):Z:([ACGT]+)(\\-1)?' tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)' - tenx_min_umi_per_cell = 0 + tenx_min_umi_per_cell = 1000 // Creating sketches molecules ='dna,protein,dayhoff' From da84ba4399f3f5e145909a573c0262777e2a19db Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:05:33 -0700 Subject: [PATCH 06/49] Set test min UMI per cell as 5 --- conf/test_translate_bam.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index e9907698..c15c96a6 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -26,7 +26,7 @@ params { write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag - tenx_min_umi_per_cell = 10 + tenx_min_umi_per_cell = 5 shard_size = 350 reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' From 6d30732988a2f65fbb08aa23249f9597697e3f7e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:07:36 -0700 Subject: [PATCH 07/49] Remove unused --shard_size option --- conf/test_translate_bam.config | 1 - main.nf | 1 - nextflow.config | 1 - 3 files changed, 3 deletions(-) diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index c15c96a6..9063e1e5 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -27,7 +27,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 5 - shard_size = 350 reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e8' diff --git a/main.nf b/main.nf index 8732e37b..a06aa9ad 100644 --- a/main.nf +++ b/main.nf @@ -111,7 +111,6 @@ def helpMessage() { based on tenx_min_umi_per_cell --tenx_min_umi_per_cell A barcode is only considered a valid barcode read and its signature is written if number of umis are greater than tenx_min_umi_per_cell - --shard_size Number of alignment to contain in each sharded bam file --barcodes_file For bam files, Optional absolute path to a .tsv barcodes file if the input is unfiltered 10x bam file --rename_10x_barcodes For bam files, Optional absolute path to a .tsv Tab-separated file mapping 10x barcode name to new name, e.g. with channel or cell annotation label diff --git a/nextflow.config b/nextflow.config index 8e5713a6..167b4a22 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,7 +73,6 @@ params { // https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc barcodes_file = false rename_10x_barcodes = false - shard_size = 350 // Variables for testing read_paths = false From 8fc0c33f5166ee389620d0940ea20467ab69ff69 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:18:42 -0700 Subject: [PATCH 08/49] Add option for skipping sig merge --- main.nf | 1 + nextflow.config | 1 + 2 files changed, 2 insertions(+) diff --git a/main.nf b/main.nf index a06aa9ad..c11eac3c 100644 --- a/main.nf +++ b/main.nf @@ -86,6 +86,7 @@ def helpMessage() { --skip_trimming If provided, skip fastp trimming of reads --skip_compare If provided, skip comparison of hashes using sourmash compare --skip_compute If provided, skip computing of signatures using sourmash compute + --skip_sig_merge If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files Sketch size options: --sketch_num_hashes Number of hashes to use for making the sketches. diff --git a/nextflow.config b/nextflow.config index 167b4a22..37c1847c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,6 +37,7 @@ params { sketch_num_hashes_log2 = false sketch_scaled = false sketch_scaled_log2 = false + skip_sig_merge = false // Comparing sketches skip_compare = false From 63273e5accaf4b67f7d5f4406fcc32e2b2321d8b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:19:04 -0700 Subject: [PATCH 09/49] Update Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d667dd57..f92421ca 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base:1.10.2 +FROM nfcore/base:1.11 LABEL authors="Olga Botvinnik" \ description="Docker image containing all software requirements for the nf-core/kmermaid pipeline" From 054d8b3398eb49d32c307fe3d3294c8256fd4fee Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 29 Oct 2020 10:20:11 -0700 Subject: [PATCH 10/49] Add test for --skip_sig_merge --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cdfe0f83..d8f405b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,7 @@ jobs: - "test_tenx_tgz" - "test_translate" - "test_translate_bam" + - "test_translate_bam --skip_sig_merge" steps: - name: Check out pipeline code uses: actions/checkout@v2 From f1304caee9877edb264fa093fb5821125e087eb4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:26:15 -0800 Subject: [PATCH 11/49] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11ea7608..f4467176 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ Initial release of nf-core/kmermaid, created with the [nf-core](http://nf-co.re/ barcode fastq * Add version printing for sencha, bam2fasta, and sourmash in Dockerfile, update versions in environment.yml * For processes translate, sourmash compute add cpus=1 as they are only serial ([#107](https://github.com/nf-core/kmermaid/pull/107)) -* Add `sourmash sig merge` for aligned/unaligned signatures +* Add `sourmash sig merge` for aligned/unaligned signatures from bam files, and add `--skip_sig_merge` option to turn it off ### `Fixed` From a5791756871e821456e462801e8c23a4272e04f0 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:26:31 -0800 Subject: [PATCH 12/49] Use more realistic scales and ksizes --- conf/test_translate_bam.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index 9063e1e5..1e29b02d 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -18,8 +18,8 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + ksizes = '15,21' + sketch_scaled = '2,5' molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" From ef43907fc76de7aea3bf60e3cac2174c46cecb60 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 4 Jan 2021 16:35:55 -0800 Subject: [PATCH 13/49] regular test doesn't fail anymore --- main.nf | 58 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/main.nf b/main.nf index c11eac3c..4d7efbda 100644 --- a/main.nf +++ b/main.nf @@ -921,11 +921,11 @@ if (params.tenx_tgz || params.bam) { '${this_cell_barcode}' \\ ${reads} \\ | gzip -c - \\ - > ${this_cell_fastq_gz} + > ${this_cell_fastq_gz} || touch ${this_cell_fastq_gz} """ } per_cell_fastqs_ch_possibly_empty - // gzipped files are 20 bytes + // Empty gzipped files are 20 bytes .filter { it -> it[1].size() > 20 } .set { per_cell_fastqs_ch } // // Make per-cell fastqs into a flat channel that matches the read channels of yore @@ -1372,25 +1372,34 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash_sketches_peptide = Channel.empty() } -if (params.bam || params.tenx_tgz) { +if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) { // Merge signatures from same sample id and sketch id sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) - .set { ch_sourmash_sketches_mixed} + .set { ch_sourmash_sketches_mixed } ch_fastq_id_to_cell_id_is_aligned - .combine ( ch_sourmash_sketches_mixed, on: 0 ) + .combine ( ch_sourmash_sketches_mixed, by: 0 ) .dump( tag: 'fastq_id_to_cells__combine__sketches' ) - // [DUMP: fastq_id_to_cells__join__sketches] [ - // mouse_lung__aligned__AAATGCCCAAACTGCT, mouse_lung__AAATGCCCAAACTGCT, 'aligned', - // mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT, - // molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false, 'dayhoff', '3', /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/7c/7c4f8e3e3f2db074502e754a3fcc29/mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dayhoff__ksize-3__num_hashes-4__track_abundance-false.sig] - // it[0]: fastq_id, e.g. "mouse_lung__aligned__AAATGCCCAAACTGCT" (contains aligned/unaligned) - // it[1]: cell_id, e.g. "mouse_lung__AAATGCCCAAACTGCT" + // [DUMP: fastq_id_to_cells__combine__sketches] + // ['mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT', + // mouse_brown_fat_ptprc_plus_unaligned__CTGAAGTCAATGGTCT, + // 'aligned', + // molecule-dna__ksize-3__num_hashes-4__track_abundance-false, + // 'dna', + // '3', + // mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig] + // it[0]: fastq_id, e.g. "mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT" (contains aligned/unaligned) + // it[1]: cell_id, e.g. "mouse_brown_fat_ptprc_plus_unaligned__CTGAAGTCAATGGTCT" // it[2]: is_aligned, e.g. "aligned" - // it[3]: - .groupTuple( by: [1, 4] ) + // it[3]: sketch_id, e.g. molecule-dna__ksize-3__num_hashes-4__track_abundance-false + // it[4]: molecule, e.g. 'dna' + // it[5]: ksize, e.g. '3' + // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig + .groupTuple( by: [1, 3, 4, 5] ) + // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] + // [['mouse_lung__aligned__AAAGATGCAGATCTGT', 'mouse_lung__aligned__AAAGATGCAGATCTGT'], mouse_lung__AAAGATGCAGATCTGT, ['aligned', 'aligned'], molecule-dna__ksize-15__scaled-5__track_abundance-false, 'dna', '15', [/Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/36/b23ceef8424a2aeb8e7187f3b30d8b/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/8b/1a1aae2c00520dab66dadb63edbd77/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]] .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) .set { ch_sourmash_sketches_to_merge } @@ -1413,25 +1422,20 @@ if (params.bam || params.tenx_tgz) { script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) - sig_id = "${sample_id}__${sketch_id}" - sig = "${sig_id}.sig" + sig_id = "${cell_id}__${sketch_id}" csv = "${sig_id}.csv" + output_sig = "${sig_id}.sig" """ - sourmash compute \\ - ${sketch_value_flag} \\ - --ksizes $ksize \\ - --input-is-protein \\ - --$molecule \\ - --name '${sample_id}' \\ - --no-dna \\ - $processes \\ - $track_abundance_flag \\ - --output ${sig} \\ - $reads - sourmash sig describe --csv ${csv} ${sig} + sourmash sig merge -o merged.sig ${sigs} + sourmash sig rename -o ${output_sig} '${cell_id}' + sourmash sig describe --csv ${csv} ${output_sig} """ } +} else { + sourmash_sketches_nucleotide + .mix ( sourmash_sketches_peptide ) + .set { sourmash_sketches } } if (params.split_kmer){ From d0bec5c52a2d7de6de49360e4f063ce29cce7078 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 5 Jan 2021 17:39:44 -0800 Subject: [PATCH 14/49] Update bam config --- conf/test_bam.config | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/conf/test_bam.config b/conf/test_bam.config index e46f6568..ac4e8038 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -19,8 +19,8 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + ksizes = '15,21' + sketch_scaled = '2,5' molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -28,6 +28,5 @@ params { write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag - tenx_min_umi_per_cell = 5 - shard_size = 350 + tenx_min_umi_per_cell = 2 } From 000d6ca003f00fc1e8904b2b0441cd613fabd908 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 5 Jan 2021 17:39:52 -0800 Subject: [PATCH 15/49] Add dump ch_sourmash_sketches_mixed --- main.nf | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 96a30c02..67a1e6c1 100644 --- a/main.nf +++ b/main.nf @@ -1385,6 +1385,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) { sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) + .dump ( tag: 'ch_sourmash_sketches_mixed' ) .set { ch_sourmash_sketches_mixed } ch_fastq_id_to_cell_id_is_aligned @@ -1407,7 +1408,16 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) { // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig .groupTuple( by: [1, 3, 4, 5] ) // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] - // [['mouse_lung__aligned__AAAGATGCAGATCTGT', 'mouse_lung__aligned__AAAGATGCAGATCTGT'], mouse_lung__AAAGATGCAGATCTGT, ['aligned', 'aligned'], molecule-dna__ksize-15__scaled-5__track_abundance-false, 'dna', '15', [/Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/36/b23ceef8424a2aeb8e7187f3b30d8b/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, /Users/olgabot/code/nf-core/kmermaid--olgabot/sourmash-sig-merge/work/8b/1a1aae2c00520dab66dadb63edbd77/mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig]] + // [ + // ['mouse_lung__aligned__AAAGATGCAGATCTGT', + // 'mouse_lung__aligned__AAAGATGCAGATCTGT'], + // mouse_lung__AAAGATGCAGATCTGT, + // ['aligned', 'aligned'], + // molecule-dna__ksize-15__scaled-5__track_abundance-false, + // 'dna', '15', + // [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, + // mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig] + // ] .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) .set { ch_sourmash_sketches_to_merge } @@ -1422,7 +1432,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) { } input: - set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketches_to_merge + set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge output: file(csv) into ch_sourmash_sig_describe_merged From 6bce013f5cc6a57638817043032efb647f6f249a Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 6 Jan 2021 10:45:31 -0800 Subject: [PATCH 16/49] Update schema --- nextflow_schema.json | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index aa93d85c..711e29ca 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -224,11 +224,6 @@ "description": "A barcode is only considered a valid barcode read and its signature is written if number of umis are greater than tenx_min_umi_per_cell", "fa_icon": "fas fa-percentage" }, - "shard_size": { - "type": "integer", - "description": "Number of alignment to contain in each sharded bam file", - "fa_icon": "fas fa-percentage" - }, "barcodes_file": { "type": "string", "description": "For bam files, Optional absolute path to a .tsv barcodes file if the input is unfiltered 10x bam file", @@ -287,6 +282,10 @@ "description": "Skip sourmash compare.", "fa_icon": "fas fa-fast-forward" }, + "skip_sig_merge": { + "type": "boolean", + "description": "Skip merging aligned+unaligned reads per cell (keep aligned/unaligned separate)" + }, "skip_multiqc": { "type": "boolean", "description": "Skip MultiQC.", From ba765ffd56ddabe629492a49b27ffb897cf102cd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 08:58:31 -0800 Subject: [PATCH 17/49] Add params.ksizes to sketch output --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 53a74979..8cc7672e 100644 --- a/main.nf +++ b/main.nf @@ -1303,7 +1303,7 @@ if (!params.remove_ribo_rna) { output: file(csv) into ch_sourmash_sig_describe_nucleotides - set val(sample_id), val(sketch_id), val("dna"), val(ksize), file(sig) into sourmash_sketches_all_nucleotide + set val(sample_id), val(sketch_id), val("dna"), val(params.ksizes), file(sig) into sourmash_sketches_all_nucleotide script: // Don't calculate DNA signature if this is protein, to minimize disk, @@ -1369,7 +1369,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ output: file(csv) into ch_sourmash_sig_describe_peptides - set val(sample_id), val(sketch_id), val(molecule), val(ksize), file(sig) into sourmash_sketches_all_peptide + set val(sample_id), val(sketch_id), val(molecule), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide script: From ed5e72b69b654fe4a6f160ab8a386bde313b53a0 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 10:04:25 -0800 Subject: [PATCH 18/49] Add peptide_molecules --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 8cc7672e..da603034 100644 --- a/main.nf +++ b/main.nf @@ -1369,7 +1369,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ output: file(csv) into ch_sourmash_sig_describe_peptides - set val(sample_id), val(sketch_id), val(molecule), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide + set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide script: From 17185024332e8c3ef1c60b22ef099fb654f6c9f9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 10:28:32 -0800 Subject: [PATCH 19/49] add check for skip_compute in sig merge logic --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index da603034..98850b24 100644 --- a/main.nf +++ b/main.nf @@ -1398,7 +1398,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash_sketches_peptide = Channel.empty() } -if ((params.bam || params.tenx_tgz) && !params.skip_sig_merge) { +if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_merge) { // Merge signatures from same sample id and sketch id sourmash_sketches_nucleotide From 0029b51a8f924ca0cb8f6c2fc413fc7681d9cd32 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 10:28:57 -0800 Subject: [PATCH 20/49] Add header --- main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 98850b24..ec5eb7f8 100644 --- a/main.nf +++ b/main.nf @@ -1398,8 +1398,10 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash_sketches_peptide = Channel.empty() } +// ------------- +// Merge signatures from same sample id and sketch id +// ------------- if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_merge) { - // Merge signatures from same sample id and sketch id sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) From c04a5a1d63ee04b68f33604eecfc71e4786c7ccf Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 10:31:26 -0800 Subject: [PATCH 21/49] Only mix sketches if not skip_compute --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ec5eb7f8..1ef63b13 100644 --- a/main.nf +++ b/main.nf @@ -1470,7 +1470,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ """ } -} else { +} else if (!param.skip_compute) { sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) .set { sourmash_sketches } From 5893884a43bdf783b7c6e0fd91bbd8930b431811 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 7 Jan 2021 10:32:55 -0800 Subject: [PATCH 22/49] param --> params --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 1ef63b13..25dd1426 100644 --- a/main.nf +++ b/main.nf @@ -1470,7 +1470,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ """ } -} else if (!param.skip_compute) { +} else if (!params.skip_compute) { sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) .set { sourmash_sketches } From 4118c6ccd687dfbcfca7ea87bc71fc9069840e02 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 8 Jan 2021 17:54:24 -0800 Subject: [PATCH 23/49] Add some projectdir stuff --- main.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 25dd1426..07fb1be2 100644 --- a/main.nf +++ b/main.nf @@ -1610,6 +1610,8 @@ workflow.onComplete { if (!workflow.success) { subject = "[nf-core/kmermaid] FAILED: $workflow.runName" } + projectDir = workflow.projectDir + def email_fields = [:] email_fields['version'] = workflow.manifest.version email_fields['runName'] = custom_runName ?: workflow.runName @@ -1656,18 +1658,18 @@ workflow.onComplete { // Render the TXT template def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$projectDir/assets/email_template.txt") + def tf = new File("${workflow.projectDir}/assets/email_template.txt") def txt_template = engine.createTemplate(tf).make(email_fields) def email_txt = txt_template.toString() // Render the HTML template - def hf = new File("$projectDir/assets/email_template.html") + def hf = new File("${workflow.projectDir}/assets/email_template.html") def html_template = engine.createTemplate(hf).make(email_fields) def email_html = html_template.toString() // Render the sendmail template def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$projectDir/assets/sendmail_template.txt") + def sf = new File("$workflow.projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() From baa96f8f8455f8c93d66e77fcc206b2c0267340c Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 08:18:53 -0800 Subject: [PATCH 24/49] More projectDir fixes --- main.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 07fb1be2..ee236232 100644 --- a/main.nf +++ b/main.nf @@ -543,10 +543,11 @@ if (workflow.profile.contains('awsbatch')) { } // Stage config files -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +projectDir = workflow.projectDir +ch_multiqc_config = file("$workflow.projectDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) -ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) +ch_output_docs = file("$workflow.projectDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$workflow.projectDir/docs/images/", checkIfExists: true) // Header log info @@ -1668,7 +1669,7 @@ workflow.onComplete { def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$workflow.projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] def sf = new File("$workflow.projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() From 8ba5db1bb8e7f615eb60e6282bcb017ef9cb90b4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 08:30:07 -0800 Subject: [PATCH 25/49] Do per-ksize sourmash sig merge --- main.nf | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index ee236232..958460e4 100644 --- a/main.nf +++ b/main.nf @@ -1453,19 +1453,24 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } input: - set val(molecule), val(ksize), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge + set val(molecule), val(ksizes), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge output: file(csv) into ch_sourmash_sig_describe_merged - set val(sketch_id), val(molecule), val(ksize), val(sketch_value), file(sig) into sourmash_sketches + set val(sketch_id), val(molecule), val(ksizes), val(sketch_value), file(sig) into sourmash_sketches script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) sig_id = "${cell_id}__${sketch_id}" csv = "${sig_id}.csv" output_sig = "${sig_id}.sig" + KSIZES = ksizes.split(',') """ - sourmash sig merge -o merged.sig ${sigs} + for KSIZE in ${KSIZES}; do + # Can only merge one kize at a time + sourmash sig merge -k ${KSIZE} -o merged_\$KSIZE.sig ${sigs} + done + sourmash sig cat merged*.sig > ${output_sig} sourmash sig rename -o ${output_sig} '${cell_id}' sourmash sig describe --csv ${csv} ${output_sig} """ @@ -1559,6 +1564,8 @@ if (!params.skip_multiqc){ publishDir "${params.outdir}/MultiQC", mode: "${params.publish_dir_mode}" input: file multiqc_config from ch_multiqc_config + file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([]) + file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([]) file ('fastp/*') from ch_fastp_results.collect().ifEmpty([]) file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([]) file ('software_versions/*') from ch_software_versions_yaml.collect() From c27b2b41a21ce6ccae64f7bc3555e6271ebd0704 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 08:34:48 -0800 Subject: [PATCH 26/49] Add sourmash describe csvs to multiqc --- main.nf | 56 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/main.nf b/main.nf index 958460e4..3dadcef0 100644 --- a/main.nf +++ b/main.nf @@ -1397,6 +1397,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash_sketches_peptide = sourmash_sketches_all_peptide.filter{ it[3].size() > 0 } } else { sourmash_sketches_peptide = Channel.empty() + ch_sourmash_sig_describe_peptides = Channel.empty() } // ------------- @@ -1503,30 +1504,30 @@ if (params.split_kmer){ } // If skip_compute is true, skip compare must be specified as true as well if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { - // Combine peptide and nucleotide sketches - sourmash_sketches_nucleotide - .collect() - // Set as a list so that combine does cartesian product of all signatures - .map { it -> [it] } - .combine( ch_ksizes_for_compare_nucleotide ) - .dump( tag: 'sourmash_sketches_nucleotide__ksizes' ) - .map { x -> [x[0], x[1], 'dna'] } - .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' ) - .set { sourmash_sketches_nucleotide_for_compare } - - sourmash_sketches_peptide - .collect() - // Set as a list so that combine does cartesian product of all signatures - .map { it -> [it] } - .combine( ch_ksizes_for_compare_petide ) - .dump( tag: 'sourmash_sketches_peptide__ksizes' ) - .combine( ch_peptide_molecules ) - .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' ) - .set { sourmash_sketches_peptide_for_compare } - - sourmash_sketches_peptide_for_compare - .mix ( sourmash_sketches_nucleotide_for_compare ) - .set { ch_sourmash_sketches_to_compare } + // // Combine peptide and nucleotide sketches + // sourmash_sketches_nucleotide + // .collect() + // // Set as a list so that combine does cartesian product of all signatures + // .map { it -> [it] } + // .combine( ch_ksizes_for_compare_nucleotide ) + // .dump( tag: 'sourmash_sketches_nucleotide__ksizes' ) + // .map { x -> [x[0], x[1], 'dna'] } + // .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' ) + // .set { sourmash_sketches_nucleotide_for_compare } + + // sourmash_sketches_peptide + // .collect() + // // Set as a list so that combine does cartesian product of all signatures + // .map { it -> [it] } + // .combine( ch_ksizes_for_compare_petide ) + // .dump( tag: 'sourmash_sketches_peptide__ksizes' ) + // .combine( ch_peptide_molecules ) + // .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' ) + // .set { sourmash_sketches_peptide_for_compare } + + // sourmash_sketches_peptide_for_compare + // .mix ( sourmash_sketches_nucleotide_for_compare ) + // .set { ch_sourmash_sketches_to_compare } process sourmash_compare_sketches { // Combine peptide and nucleotide sketches @@ -1534,7 +1535,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { publishDir "${params.outdir}/compare_sketches", mode: 'copy' input: - set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare + set file ("*.sig"), val(ksize), val(molecule) from sourmash_sketches output: file(csv) @@ -1564,8 +1565,9 @@ if (!params.skip_multiqc){ publishDir "${params.outdir}/MultiQC", mode: "${params.publish_dir_mode}" input: file multiqc_config from ch_multiqc_config - file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([]) - file ("sourmash_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([]) + file ("sourmash_describe_sig_merge/") from ch_sourmash_sig_describe_merged.collect().ifEmpty([]) + file ("sourmash_describe_peptides/") from ch_sourmash_sig_describe_peptides.collect().ifEmpty([]) + file ("sourmash_describe_nucleotides/") from ch_sourmash_sig_describe_nucleotides.collect().ifEmpty([]) file ('fastp/*') from ch_fastp_results.collect().ifEmpty([]) file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([]) file ('software_versions/*') from ch_software_versions_yaml.collect() From 05f6702df07b2b5afe8669b335eb3b8c00005b8e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 10:09:05 -0800 Subject: [PATCH 27/49] Update ProjectDir --- main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 3dadcef0..709a175f 100644 --- a/main.nf +++ b/main.nf @@ -544,10 +544,10 @@ if (workflow.profile.contains('awsbatch')) { // Stage config files projectDir = workflow.projectDir -ch_multiqc_config = file("$workflow.projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_config = file("${workflow.projectDir}/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$workflow.projectDir/docs/output.md", checkIfExists: true) -ch_output_docs_images = file("$workflow.projectDir/docs/images/", checkIfExists: true) +ch_output_docs = file("${workflow.projectDir}/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("${workflow.projectDir}/docs/images/", checkIfExists: true) // Header log info @@ -1678,8 +1678,8 @@ workflow.onComplete { def email_html = html_template.toString() // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$workflow.projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$workflow.projectDir/assets/sendmail_template.txt") + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "${workflow.projectDir}", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("${workflow.projectDir}/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) def sendmail_html = sendmail_template.toString() From 2fe45236fbe8fd8be64cf2e313ed2aa87f88ca7d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 10:09:25 -0800 Subject: [PATCH 28/49] Properly save translate output --- main.nf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 709a175f..b09ab48f 100644 --- a/main.nf +++ b/main.nf @@ -1186,10 +1186,9 @@ if (!params.remove_ribo_rna) { publishDir "${params.outdir}/translate/", mode: params.publish_dir_mode, saveAs: { filename -> - if (save_translate_csv && filename.indexOf(".csv") > 0) "description/$filename" - if (save_translate_json && filename.indexOf(".json") > 0) "description/$filename" - else if (filename.indexOf(".sig") > 0) "sigs/$filename" - else null + if (save_translate_csv && filename.indexOf(".csv") > 0) "$filename" + if (save_translate_json && filename.indexOf(".json") > 0) "$filename" + else "$filename" } input: From 27d95f0381d321d5e234500b3193c4ab7f07301e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 10:09:44 -0800 Subject: [PATCH 29/49] Add dump of sourmash sketches --- main.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b09ab48f..a9cddb4e 100644 --- a/main.nf +++ b/main.nf @@ -1326,7 +1326,10 @@ if (!params.remove_ribo_rna) { sourmash sig describe --csv ${csv} ${sig} """ } - sourmash_sketches_nucleotide = sourmash_sketches_all_nucleotide.filter{ it[3].size() > 0 } + sourmash_sketches_all_nucleotide + .filter{ it[3].size() > 0 } + .dump ( tag: "sourmash_sketches_all_nucleotide" ) + .set { sourmash_sketches_nucleotide } } } else { sourmash_sketches_nucleotide = Channel.empty() From e68db006f15ea0b3f403c4d409b850256f8f6297 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 10:10:08 -0800 Subject: [PATCH 30/49] Fixing sourmash sig merge --- main.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index a9cddb4e..6a190bb1 100644 --- a/main.nf +++ b/main.nf @@ -1456,11 +1456,11 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } input: - set val(molecule), val(ksizes), val(sketch_style), val(sketch_value), val(cell_id), file(reads) from ch_sourmash_sketches_to_merge + set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(molecule), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge output: file(csv) into ch_sourmash_sig_describe_merged - set val(sketch_id), val(molecule), val(ksizes), val(sketch_value), file(sig) into sourmash_sketches + set val(cell_id), val(sketch_id), val(molecule), val(ksizes), file(output_sig) into sourmash_sketches script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) @@ -1471,7 +1471,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ """ for KSIZE in ${KSIZES}; do # Can only merge one kize at a time - sourmash sig merge -k ${KSIZE} -o merged_\$KSIZE.sig ${sigs} + sourmash sig merge -k \$KSIZE -o merged_\$KSIZE.sig ${sigs} done sourmash sig cat merged*.sig > ${output_sig} sourmash sig rename -o ${output_sig} '${cell_id}' @@ -1483,6 +1483,9 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) .set { sourmash_sketches } + ch_sourmash_sig_describe_merged = Channel.empty() +} else { + ch_sourmash_sig_describe_merged = Channel.empty() } if (params.split_kmer){ From 7195eae37965d9a02743f1d4039a001c6ca540b9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 12:09:25 -0800 Subject: [PATCH 31/49] Add ch_sourmash_sig_describe_nucleotides --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 6a190bb1..a3561797 100644 --- a/main.nf +++ b/main.nf @@ -1335,6 +1335,7 @@ if (!params.remove_ribo_rna) { sourmash_sketches_nucleotide = Channel.empty() ch_protein_fastas .set { ch_protein_seq_to_sketch } + ch_sourmash_sig_describe_nucleotides = Channel.empty() } From 9d090b4e4312e5248c6ae8c4af42d2c23ce1fae2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 11 Jan 2021 12:13:16 -0800 Subject: [PATCH 32/49] more if/else --- main.nf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.nf b/main.nf index a3561797..4db39184 100644 --- a/main.nf +++ b/main.nf @@ -1330,6 +1330,11 @@ if (!params.remove_ribo_rna) { .filter{ it[3].size() > 0 } .dump ( tag: "sourmash_sketches_all_nucleotide" ) .set { sourmash_sketches_nucleotide } + } else { + sourmash_sketches_nucleotide = Channel.empty() + ch_protein_fastas + .set { ch_protein_seq_to_sketch } + ch_sourmash_sig_describe_nucleotides = Channel.empty() } } else { sourmash_sketches_nucleotide = Channel.empty() From 5bea4c9673520b1724e6dd344c1e61a182a1eed3 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 15 Jan 2021 17:32:21 -0800 Subject: [PATCH 33/49] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0699d039..8ca6377a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ barcode fastq * Add `sourmash sig merge` for aligned/unaligned signatures from bam files, and add `--skip_sig_merge` option to turn it off * Add `--protein_fastas` option for creating sketches of already-translated protein sequences * Add `--skip_compare option` to skip `sourmash_compare_sketches` process +* Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117)) ### `Fixed` @@ -61,3 +62,5 @@ barcode fastq ### `Dependencies` ### `Deprecated` + +* Removed ability to specify multiple `--scaled` or `--num-hashes` values to enable merging of signatures \ No newline at end of file From 9682b03ee7f52abdf5b73494b4707518726df954 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 15 Jan 2021 18:11:22 -0800 Subject: [PATCH 34/49] Getting "sig merge" to finally run --- main.nf | 171 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 71 deletions(-) diff --git a/main.nf b/main.nf index 4db39184..9d51b32b 100644 --- a/main.nf +++ b/main.nf @@ -475,33 +475,37 @@ if (!have_sketch_value && !params.split_kmer) { -def make_sketch_id (molecule, ksizes, sketch_value, track_abundance, sketch_style) { - if (sketch_style == 'size') { - sketch_value = "num_hashes-${sketch_value}" +// added "_for_id" to all variables to avoid variable scoping errors +def make_sketch_id ( + molecule_for_id, ksizes_for_id, sketch_value_for_id, track_abundance_for_id, sketch_style_for_id + ) { + if (sketch_style_for_id == 'size') { + style_value = "num_hashes-${sketch_value_for_id}" } else { - sketch_value = "scaled-${sketch_value}" + style_value = "scaled-${sketch_value_for_id}" } - sketch_id = "molecule-${molecule}__ksize-${ksizes}__${sketch_value}__track_abundance-${track_abundance}" - return sketch_id + this_sketch_id = "molecule-${molecule_for_id}__ksize-${ksizes_for_id}__${style_value}__track_abundance-${track_abundance_for_id}" + return this_sketch_id } // Create the --num-hashes or --scaled flag for sourmash -def make_sketch_value_flag(sketch_style, sketch_value) { - if (sketch_style == "size") { - number_flag = "--num-hashes ${sketch_value}" - } else if (sketch_style == "scaled" ) { - number_flag = "--scaled ${sketch_value}" +// added "_for_flag" to all variables to avoid variable scoping errors +def make_sketch_value_flag(sketch_style_for_flag, sketch_value_for_flag) { + if (sketch_style_for_flag == "size") { + number_flag = "--num-hashes ${sketch_value_for_flag}" + } else if (sketch_style_for_flag == "scaled" ) { + number_flag = "--scaled ${sketch_value_for_flag}" } else { - exit 1, "${sketch_style} is not a valid sketch counting style! Only 'scaled' and 'size' are valid" + exit 1, "${sketch_style_for_flag} is not a valid sketch counting style! Only 'scaled' and 'size' are valid" } return number_flag } int bloomfilter_tablesize = Math.round(Float.valueOf(params.bloomfilter_tablesize)) -peptide_ksize = params.translate_peptide_ksize -peptide_molecule = params.translate_peptide_molecule +translate_peptide_ksize = params.translate_peptide_ksize +translate_peptide_molecule = params.translate_peptide_molecule jaccard_threshold = params.translate_jaccard_threshold track_abundance = params.track_abundance @@ -673,7 +677,7 @@ if ( !params.split_kmer && have_sketch_value ) { /* * Validate sketch sizes */ - process validate_sketch_values { + process validate_sketch_value { publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, saveAs: {filename -> if (filename.indexOf(".txt") > 0) filename @@ -686,60 +690,53 @@ if ( !params.split_kmer && have_sketch_value ) { val sketch_scaled_log2 output: - file sketch_values into ch_sketch_values_unparsed + file sketch_value into ch_sketch_value_unparsed file sketch_style into ch_sketch_style_unparsed script: sketch_style = "sketch_style.txt" - sketch_values = 'sketch_values.txt' + sketch_value = 'sketch_value.txt' """ - validate_sketch_values.py \\ + validate_sketch_value.py \\ --sketch_num_hashes ${sketch_num_hashes} \\ --sketch_num_hashes_log2 ${sketch_num_hashes_log2} \\ --sketch_scaled ${sketch_scaled} \\ --sketch_scaled_log2 ${sketch_scaled_log2} \\ - --output ${sketch_values} \\ + --output ${sketch_value} \\ --sketch_style ${sketch_style} """ } // Parse sketch style into value - ch_sketch_style_unparsed + sketch_style_parsed = ch_sketch_style_unparsed .splitText() .dump ( tag: 'ch_sketch_style' ) .map { it -> it.replaceAll('\\n', '' ) } - // .first() + .first() .dump ( tag: 'sketch_style_parsed' ) - .into { ch_sketch_style_for_nucleotides; ch_sketch_style_for_proteins } + .collect () + // get first item of returned array from .collect() + // sketch_style_parsed = sketch_style_parsed[0] + // .into { ch_sketch_style_for_nucleotides; ch_sketch_style_for_proteins } // sketch_style = sketch_styles[0] // println "sketch_style_parsed: ${sketch_style_parsed}" // println "sketch_style: ${sketch_style}" // Parse file into values - ch_sketch_values_unparsed + sketch_value_parsed = ch_sketch_value_unparsed .splitText() .map { it -> it.replaceAll('\\n', '')} - .dump ( tag : 'sketch_values_parsed' ) - .into { ch_sketch_values_for_proteins; ch_sketch_values_for_dna } + .first() + .dump ( tag : 'sketch_value_parsed' ) + .collect() + // get first item of returned array from .collect() + // sketch_value_parsed = sketch_value_parsed[0] + // .into { ch_sketch_value_for_proteins; ch_sketch_value_for_dna } } // Combine sketch values with ksize and molecule types -// ch_peptide_molecules - // .combine ( ch_ksizes_for_proteins ) -ch_sketch_style_for_proteins - // .combine ( ch_sketch_style_for_proteins ) - .combine ( ch_sketch_values_for_proteins ) - .set { ch_sourmash_protein_sketch_params } - - -// ch_ksizes_for_dna - // .combine ( ch_sketch_style_for_nucleotides ) -ch_sketch_style_for_nucleotides - .combine ( ch_sketch_values_for_dna ) - .set { ch_sourmash_dna_sketch_params } - if (params.reference_proteome_fasta){ @@ -751,19 +748,19 @@ if (params.reference_proteome_fasta){ input: file(peptides) from ch_reference_proteome_fasta - peptide_ksize - peptide_molecule + translate_peptide_ksize + translate_peptide_molecule output: set val(bloom_id), val(peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter script: - bloom_id = "molecule-${peptide_molecule}_ksize-${peptide_ksize}" + bloom_id = "molecule-${peptide_molecule}_ksize-${translate_peptide_ksize}" """ sencha index \\ --tablesize ${bloomfilter_tablesize} \\ - --molecule ${peptide_molecule} \\ - --peptide-ksize ${peptide_ksize} \\ + --molecule ${translate_peptide_molecule} \\ + --peptide-ksize ${translate_peptide_ksize} \\ --save-as ${peptides.simpleName}__${bloom_id}.bloomfilter \\ ${peptides} """ @@ -1282,10 +1279,6 @@ if (!params.remove_ribo_rna) { } } else if (!params.skip_compute) { - ch_sourmash_dna_sketch_params - .combine ( ch_reads_to_sketch ) - .dump ( tag: 'ch_sourmash_dna_params_with_reads' ) - .set { ch_sourmash_sketch_params_with_reads } process sourmash_compute_sketch_fastx_nucleotide { tag "${sig_id}" @@ -1298,8 +1291,10 @@ if (!params.remove_ribo_rna) { } input: - set val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_sketch_params_with_reads val track_abundance + val sketch_value_parsed + val sketch_style_parsed + set val(sample_id), file(reads) from ch_reads_to_sketch output: file(csv) into ch_sourmash_sig_describe_nucleotides @@ -1308,8 +1303,14 @@ if (!params.remove_ribo_rna) { script: // Don't calculate DNA signature if this is protein, to minimize disk, // memory and IO requirements in the future - sketch_id = make_sketch_id("dna", params.ksizes, sketch_value, track_abundance, sketch_style) - sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value) + sketch_id = make_sketch_id( + "dna", + params.ksizes, + sketch_value_parsed[0], + track_abundance, + sketch_style_parsed[0] + ) + sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" @@ -1357,10 +1358,6 @@ if (!have_nucleotide_input) { if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ - ch_sourmash_protein_sketch_params - .combine ( ch_protein_seq_to_sketch ) - .dump ( tag: 'ch_sourmash_protein_params_with_reads' ) - .set { ch_sourmash_protein_sketch_params_with_reads } process sourmash_compute_sketch_fastx_peptide { tag "${sig_id}" @@ -1374,16 +1371,24 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ input: val track_abundance - set val(sketch_style), val(sketch_value), val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads + val sketch_value_parsed + val sketch_style_parsed + set val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads output: file(csv) into ch_sourmash_sig_describe_peptides set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide - script: - sketch_id = make_sketch_id(peptide_molecules_comma_separated, params.ksizes, sketch_value, track_abundance, sketch_style) - sketch_value_flag = make_sketch_value_flag(sketch_style, sketch_value) + sketch_id = make_sketch_id( + peptide_molecules_comma_separated, + params.ksizes, + sketch_value_parsed[0], + track_abundance, + sketch_style_parsed[0] + ) + + sketch_value_flag = make_sketch_value_flag(sketch_style[0], sketch_value[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" @@ -1419,7 +1424,9 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ .set { ch_sourmash_sketches_mixed } ch_fastq_id_to_cell_id_is_aligned + .dump( tag: 'ch_fastq_id_to_cell_id_is_aligned' ) .combine ( ch_sourmash_sketches_mixed, by: 0 ) + .unique() .dump( tag: 'fastq_id_to_cells__combine__sketches' ) // [DUMP: fastq_id_to_cells__combine__sketches] // ['mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT', @@ -1439,16 +1446,19 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ .groupTuple( by: [1, 3, 4, 5] ) // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] // [ - // ['mouse_lung__aligned__AAAGATGCAGATCTGT', - // 'mouse_lung__aligned__AAAGATGCAGATCTGT'], - // mouse_lung__AAAGATGCAGATCTGT, - // ['aligned', 'aligned'], - // molecule-dna__ksize-15__scaled-5__track_abundance-false, - // 'dna', '15', - // [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, + // it[0]: ['mouse_lung__aligned__AAAGATGCAGATCTGT', + // 'mouse_lung__aligned__AAAGATGCAGATCTGT'], + // it[1]: mouse_lung__AAAGATGCAGATCTGT, + // it[2]: ['aligned', 'aligned'], + // it[3]: molecule-dna__ksize-15__scaled-5__track_abundance-false, + // it[4]: 'dna', + // it[5]: '15', + // it[6]: [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, // mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig] // ] .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) + .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6].unique() ] } + .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' ) .set { ch_sourmash_sketches_to_merge } process sourmash_sig_merge { @@ -1472,15 +1482,34 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) sig_id = "${cell_id}__${sketch_id}" csv = "${sig_id}.csv" + concatenated_sig = 'concatenated.sig' output_sig = "${sig_id}.sig" - KSIZES = ksizes.split(',') """ - for KSIZE in ${KSIZES}; do + # Parse ksizes in bash + KSIZES=\$(echo ${ksizes} | tr ',' ' ') + + # Iterate over ksizes + for KSIZE in \$KSIZES; do # Can only merge one kize at a time - sourmash sig merge -k \$KSIZE -o merged_\$KSIZE.sig ${sigs} + sourmash sig merge \ + --${molecule} \ + -k \$KSIZE \ + -o merged_\$KSIZE.sig \ + ${sigs} + + # "sig merge" gives default automatic name of md5sum which isn't helpful + # --> rename to cell id + sourmash sig rename \ + --${molecule} \ + -k \$KSIZE \ + -o merged_renamed_\$KSIZE.sig \ + merged_\$KSIZE.sig '${cell_id}' done - sourmash sig cat merged*.sig > ${output_sig} - sourmash sig rename -o ${output_sig} '${cell_id}' + + # Concatenate all ksizes together + sourmash sig cat merged_renamed*.sig > ${output_sig} + + # Add csv showing number of hashes at each ksize sourmash sig describe --csv ${csv} ${output_sig} """ } From 76b2ed7c2e1df0cbcf06b30c0652b8fe4af84a82 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 15 Jan 2021 18:11:33 -0800 Subject: [PATCH 35/49] Add option to skip sig merge --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e5d219fc..4c5634ee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,6 +67,7 @@ jobs: - "test --sketch_scaled 2,4 --sketch_num_hashes_log2 false" - "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false" - "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false" + - "test_bam --skip_sig_merge" - "test_bam --write_barcodes_meta_csv false" - "test_bam --barcodes_file false --rename_10x_barcodes false" - "test_bam --rename_10x_barcodes false" From 3c95f82b53dfddd5fb8167de83dbce4ac7d3874e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 15 Jan 2021 18:11:52 -0800 Subject: [PATCH 36/49] Update validate_sketch_value to only allow a single value --- bin/validate_sketch_value.py | 144 ++++++++++++++++++++++++++++++++++ bin/validate_sketch_values.py | 93 ---------------------- 2 files changed, 144 insertions(+), 93 deletions(-) create mode 100755 bin/validate_sketch_value.py delete mode 100755 bin/validate_sketch_values.py diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py new file mode 100755 index 00000000..c9e3fe06 --- /dev/null +++ b/bin/validate_sketch_value.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +from __future__ import print_function +from collections import OrderedDict, defaultdict, Counter +import logging +import argparse +import glob +import os +import sys + + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +def get_sketch_value(value, value_log2): + try: + if value: + sketch_value = int(value) + else: + sketch_value = 2 ** int(value_log2) + except ValueError: + logger.exception( + "Can only supply a single value to --sketch_num_hashes, " + "--sketch_num_hashes_log2, --sketch_scaled, " + "--sketch_scaled_log2 " + ) + return sketch_value + + +def value_or_bool(value): + if value == "false": + return False + if value == "true": + logger.exception( + "Must set a value for the --sketch_num_hashes, " + "--sketch_num_hashes_log2, --sketch_scaled, " + "--sketch_scaled_log2 options! Cannot simply set the " + "flag. E.g. '--sketch_num_hashes 5' is valid but " + "'--sketch_num_hashes' on its own is not" + ) + sys.exit(1) + else: + return value + + +def main( + sketch_num_hashes, + sketch_num_hashes_log2, + sketch_scaled, + sketch_scaled_log2, + out, + sketch_style, +): + sketch_num_hashes = value_or_bool(sketch_num_hashes) + sketch_num_hashes_log2 = value_or_bool(sketch_num_hashes_log2) + sketch_scaled = value_or_bool(sketch_scaled) + sketch_scaled_log2 = value_or_bool(sketch_scaled_log2) + + using_size = sketch_num_hashes or sketch_num_hashes_log2 + using_scaled = sketch_scaled or sketch_scaled_log2 + + if using_size and using_scaled: + logger.exception( + "Cannot specify both sketch scales and sizes! Can only" + " use one of --sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, " + "--sketch_scaled_log2. Exiting." + ) + sys.exit(1) + + if using_size: + if sketch_num_hashes and sketch_num_hashes_log2: + logger.exception( + "Cannot specify both --sketch_num_hashes and --sketch_num_hashes_log2! Exiting." + ) + sys.exit(1) + sketch_value = get_sketch_value(sketch_num_hashes, sketch_num_hashes_log2) + with open(sketch_style, "w") as f: + f.write("size") + elif using_scaled: + if sketch_scaled and sketch_scaled_log2: + logger.exception( + "Cannot specify both --sketch_scaled and --sketch_scaled_log2! Exiting." + ) + sys.exit(1) + sketch_value = get_sketch_value(sketch_scaled, sketch_scaled_log2) + with open(sketch_style, "w") as f: + f.write("scaled") + + else: + logger.info( + "Did not specify a sketch size or scale with any of " + "--sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, --sketch_scaled_log2! " + "Falling back on sourmash's default of --sketch_scaled 500" + ) + sketch_value = 500 + + with open(out, "w") as f: + f.write(f"{sketch_value}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Ensure that sketch sizes/scaleds provided are valid""" + ) + parser.add_argument( + "--sketch_num_hashes", type=str, help="Flat size of the sketches" + ) + parser.add_argument( + "--sketch_num_hashes_log2", type=str, help="Flat size of the sketches, log2" + ) + parser.add_argument( + "--sketch_scaled", type=str, help="Fraction of total observed hashes to observe" + ) + parser.add_argument( + "--sketch_scaled_log2", + type=str, + help="Fraction of total observed hashes to observe, log2", + ) + parser.add_argument( + "-o", + "--output", + dest="output", + default="sketch_value.txt", + type=str, + help="file with output", + ) + parser.add_argument( + "--sketch_style", + default="sketch_style.txt", + type=str, + help="file indicating 'size' or 'scaled'", + ) + + args = parser.parse_args() + main( + args.sketch_num_hashes, + args.sketch_num_hashes_log2, + args.sketch_scaled, + args.sketch_scaled_log2, + args.output, + args.sketch_style, + ) diff --git a/bin/validate_sketch_values.py b/bin/validate_sketch_values.py deleted file mode 100755 index 1310c94f..00000000 --- a/bin/validate_sketch_values.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -from collections import OrderedDict, defaultdict, Counter -import logging -import argparse -import glob -import os -import sys - - -# Create a logger -logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s') -logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) - - -def get_sketch_values(values, values_log2): - if values: - sketch_values = [int(x) for x in values.split(',')] - else: - sketch_values = [2 ** int(x) for x in values_log2.split(',')] - return sketch_values - - -def value_or_bool(value): - if value == 'false': - return False - if value == 'true': - logger.exception("Must set a value for the --sketch_num_hashes, " - "--sketch_num_hashes_log2, --sketch_scaled, " - "--sketch_scaled_log2 options! Cannot simply set the " - "flag. E.g. '--sketch_num_hashes 5' is valid but " - "'--sketch_num_hashes' on its own is not") - sys.exit(1) - else: - return value - - -def main(sketch_num_hashes, sketch_num_hashes_log2, sketch_scaled, sketch_scaled_log2, out, - sketch_style): - sketch_num_hashes = value_or_bool(sketch_num_hashes) - sketch_num_hashes_log2 = value_or_bool(sketch_num_hashes_log2) - sketch_scaled = value_or_bool(sketch_scaled) - sketch_scaled_log2 = value_or_bool(sketch_scaled_log2) - - using_size = sketch_num_hashes or sketch_num_hashes_log2 - using_scaled = sketch_scaled or sketch_scaled_log2 - - if using_size and using_scaled: - logger.exception("Cannot specify both sketch scales and sizes! Can only" - " use one of --sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, " - "--sketch_scaled_log2. Exiting.") - sys.exit(1) - - - if using_size: - if sketch_num_hashes and sketch_num_hashes_log2: - logger.exception("Cannot specify both --sketch_num_hashes and --sketch_num_hashes_log2! Exiting.") - sys.exit(1) - sketch_values = get_sketch_values(sketch_num_hashes, sketch_num_hashes_log2) - with open(sketch_style, 'w') as f: - f.write('size') - elif using_scaled: - if sketch_scaled and sketch_scaled_log2: - logger.exception("Cannot specify both --sketch_scaled and --sketch_scaled_log2! Exiting.") - sys.exit(1) - sketch_values = get_sketch_values(sketch_scaled, sketch_scaled_log2) - with open(sketch_style, 'w') as f: - f.write('scaled') - - else: - logger.info("Did not specify a sketch size or scale with any of " - "--sketch_num_hashes, --sketch_num_hashes_log2, --sketch_scaled, --sketch_scaled_log2! " - "Falling back on sourmash's default of --sketch_scaled 500") - sketch_values = [500] - - with open(out, 'w') as f: - for number in sketch_values: - f.write(f"{number}\n") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Ensure that sketch sizes/scaleds provided are valid""") - parser.add_argument("--sketch_num_hashes", type=str, help="Flat size of the sketches") - parser.add_argument("--sketch_num_hashes_log2", type=str, help="Flat size of the sketches, log2") - parser.add_argument("--sketch_scaled", type=str, help="Fraction of total observed hashes to observe") - parser.add_argument("--sketch_scaled_log2", type=str, help="Fraction of total observed hashes to observe, log2") - parser.add_argument("-o", "--output", dest='output', default='sketch_values.txt', - type=str, help="file with output") - parser.add_argument("--sketch_style", default='sketch_style.txt', type=str, help="file indicating 'size' or 'scaled'") - - args = parser.parse_args() - main(args.sketch_num_hashes, args.sketch_num_hashes_log2, args.sketch_scaled, - args.sketch_scaled_log2, args.output, args.sketch_style) From 1321968a22433573b9667fb639634a88d50be7f0 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Fri, 15 Jan 2021 18:12:06 -0800 Subject: [PATCH 37/49] Change sketch values to single value --- conf/test.config | 2 +- conf/test_bam.config | 2 +- conf/test_fastas.config | 2 +- conf/test_full.config | 2 +- conf/test_protein_fastas.config | 2 +- conf/test_remove_ribo.config | 2 +- conf/test_tenx_tgz.config | 2 +- conf/test_translate.config | 2 +- conf/test_translate_bam.config | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conf/test.config b/conf/test.config index f4b841c7..a4c9e73a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,7 +18,7 @@ params { // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" diff --git a/conf/test_bam.config b/conf/test_bam.config index ac4e8038..02f53393 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -20,7 +20,7 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters ksizes = '15,21' - sketch_scaled = '2,5' + sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" diff --git a/conf/test_fastas.config b/conf/test_fastas.config index 7aede988..83e31e90 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -18,7 +18,7 @@ params { // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_num_hashes_log2 = 4 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" diff --git a/conf/test_full.config b/conf/test_full.config index 7d612b9d..69bb60f6 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -13,7 +13,7 @@ params { // Input data for full size test ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_num_hashes_log2 = 4 molecules = 'dna,protein,dayhoff' input_paths = [ ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index c3f19571..fa11209e 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -27,7 +27,7 @@ params { // Sketch Parameters ksizes = '9,15' - sketch_num_hashes = '2,4' + sketch_num_hashes = 4 molecules = 'protein,dayhoff,hp' read_pairs = false diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index dad49362..246a8e40 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -18,7 +18,7 @@ params { // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_num_hashes_log2 = 4 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index cfa823e3..d1ee07e0 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -21,7 +21,7 @@ params { ] // Sketch Parameters ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_num_hashes_log2 = 4 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" diff --git a/conf/test_translate.config b/conf/test_translate.config index 5f241d1c..20e04f3b 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -18,7 +18,7 @@ params { fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa" // Sketch Parameters ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' + sketch_num_hashes_log2 = 4 molecules = 'dna,protein,dayhoff' read_pairs = false diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index 1e29b02d..d8a1d82c 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -19,7 +19,7 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters ksizes = '15,21' - sketch_scaled = '2,5' + sketch_scaled = 5 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" From dc0ecdd1b4507275d255e385c55d731add332d91 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 16 Jan 2021 23:41:29 -0800 Subject: [PATCH 38/49] peptide_molecule --> translate_peptide_molecule --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 9d51b32b..7ac47075 100644 --- a/main.nf +++ b/main.nf @@ -752,10 +752,10 @@ if (params.reference_proteome_fasta){ translate_peptide_molecule output: - set val(bloom_id), val(peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter + set val(bloom_id), val(translate_peptide_molecule), file("${peptides.simpleName}__${bloom_id}.bloomfilter") into ch_sencha_bloom_filter script: - bloom_id = "molecule-${peptide_molecule}_ksize-${translate_peptide_ksize}" + bloom_id = "molecule-${translate_peptide_molecule}_ksize-${translate_peptide_ksize}" """ sencha index \\ --tablesize ${bloomfilter_tablesize} \\ From 1fe32b1dceeae558dd98c6c7480033a43e1d94dd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 18 Jan 2021 11:36:28 -0800 Subject: [PATCH 39/49] add "translate_" to peptide ksize and jaccard threshold --- .vscode/settings.json | 3 +++ main.nf | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..de288e1e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.formatting.provider": "black" +} \ No newline at end of file diff --git a/main.nf b/main.nf index 7ac47075..9675e7a6 100644 --- a/main.nf +++ b/main.nf @@ -506,7 +506,7 @@ int bloomfilter_tablesize = Math.round(Float.valueOf(params.bloomfilter_tablesiz translate_peptide_ksize = params.translate_peptide_ksize translate_peptide_molecule = params.translate_peptide_molecule -jaccard_threshold = params.translate_jaccard_threshold +translate_jaccard_threshold = params.translate_jaccard_threshold track_abundance = params.track_abundance // Tenx parameters @@ -1213,8 +1213,8 @@ if (!params.remove_ribo_rna) { --noncoding-nucleotide-fasta ${sample_id}__noncoding_reads_nucleotides.fasta \\ ${csv_flag} \\ ${json_flag} \\ - --jaccard-threshold ${jaccard_threshold} \\ - --peptide-ksize ${peptide_ksize} \\ + --jaccard-threshold ${translate_jaccard_threshold} \\ + --peptide-ksize ${translate_peptide_ksize} \\ --peptides-are-bloom-filter \\ ${bloom_filter} \\ ${reads} > ${sample_id}__coding_reads_peptides.fasta From 1297886c640bd2d25bd5cf43b44269246f0bb0e4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 8 Mar 2021 17:05:02 -0800 Subject: [PATCH 40/49] Do sig merge on individual moltypes --- main.nf | 114 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 36 deletions(-) diff --git a/main.nf b/main.nf index 9675e7a6..bf489b03 100644 --- a/main.nf +++ b/main.nf @@ -1228,18 +1228,22 @@ if (!params.remove_ribo_rna) { // it[1] = sequence fasta file ch_translated_protein_seqs .mix ( ch_protein_fastas ) + .dump ( tag: 'ch_protein_seq_to_sketch_from_translate' ) .set { ch_protein_seq_to_sketch } // Remove empty files // it[0] = sample bloom id // it[1] = sequence fasta file ch_noncoding_nucleotides_nonempty = ch_noncoding_nucleotides_potentially_empty.filter{ it[1].size() > 0 } - ch_translatable_nucleotide_seqs.filter{ it[1].size() > 0 } - .mix( ch_noncoding_nucleotides_nonempty) + ch_translatable_nucleotide_seqs + .dump( tag: 'ch_translatable_nucleotide_seqs' ) + .filter{ it[1].size() > 0 } + .dump ( tag: 'ch_reads_to_sketch__from_translate' ) .set { ch_reads_to_sketch } } else { // Send reads directly into coding/noncoding ch_reads_to_translate + .dump ( tag: 'ch_reads_to_sketch__no_translation' ) .set{ ch_reads_to_sketch } } @@ -1373,11 +1377,11 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ val track_abundance val sketch_value_parsed val sketch_style_parsed - set val(sample_id), file(reads) from ch_sourmash_protein_sketch_params_with_reads + set val(sample_id), file(reads) from ch_protein_seq_to_sketch output: file(csv) into ch_sourmash_sig_describe_peptides - set val(sample_id), val(sketch_id), val(peptide_molecules), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide + set val(sample_id), val(sketch_id), val(peptide_molecules_comma_separated), val(params.ksizes), file(sig) into sourmash_sketches_all_peptide script: sketch_id = make_sketch_id( @@ -1388,7 +1392,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sketch_style_parsed[0] ) - sketch_value_flag = make_sketch_value_flag(sketch_style[0], sketch_value[0]) + sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" @@ -1445,22 +1449,37 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ // it[6]: signature file, e.g. mouse_brown_fat_ptprc_plus_unaligned__aligned__CTGAAGTCAATGGTCT__molecule-dna__ksize-3__num_hashes-4__track_abundance-false.sig .groupTuple( by: [1, 3, 4, 5] ) // [DUMP: fastq_id_to_cells__combine__sketches__grouptuple] - // [ - // it[0]: ['mouse_lung__aligned__AAAGATGCAGATCTGT', - // 'mouse_lung__aligned__AAAGATGCAGATCTGT'], - // it[1]: mouse_lung__AAAGATGCAGATCTGT, - // it[2]: ['aligned', 'aligned'], - // it[3]: molecule-dna__ksize-15__scaled-5__track_abundance-false, - // it[4]: 'dna', - // it[5]: '15', - // it[6]: [mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig, - // mouse_lung__aligned__AAAGATGCAGATCTGT__molecule-dna__ksize-15__scaled-5__track_abundance-false.sig] + // [ + // ['mouse_brown_fat_ptprc_plus_unaligned__aligned__GCGCAGTCATGCCTTC'], + // mouse_brown_fat_ptprc_plus_unaligned__GCGCAGTCATGCCTTC, + // ['aligned'], + // molecule-dna__ksize-3,9__scaled-2__track_abundance-false, + // 'dna', + // '3,9', + // [mouse_brown_fat_ptprc_plus_unaligned__aligned__GCGCAGTCATGCCTTC__molecule-dna__ksize-3,9__scaled-2__track_abundance-false.sig] // ] .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) - .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6].unique() ] } + .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6]] } .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' ) + .branch { + to_merge: it[1].length() > 1 + // only one of aligned or unaligned reads here + // No merging necessary + single: it[1].length() == 1 + } + .set { ch_sourmash_sketches_branched } + + ch_sourmash_sketches_branched + .to_merge + .dump ( tag: 'ch_sourmash_sketches_to_merge' ) .set { ch_sourmash_sketches_to_merge } + ch_sourmash_sketches_branched + .single + .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] } + .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' ) + .set { ch_sourmash_sketches_to_mix_with_merged } + process sourmash_sig_merge { tag "${sig_id}" label "low_memory" @@ -1472,38 +1491,47 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } input: - set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(molecule), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge + set val(fasta_ids), val(cell_id), val(is_aligned), val(sketch_id), val(moltypes), val(ksizes), file(sigs) from ch_sourmash_sketches_to_merge output: file(csv) into ch_sourmash_sig_describe_merged - set val(cell_id), val(sketch_id), val(molecule), val(ksizes), file(output_sig) into sourmash_sketches + set val(cell_id), val(sketch_id), val(molecules), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) sig_id = "${cell_id}__${sketch_id}" csv = "${sig_id}.csv" - concatenated_sig = 'concatenated.sig' output_sig = "${sig_id}.sig" + println(moltypes) """ + ## --- Doing this big crazy loop in bash because the sigfiles are so small, it's not --- ## + ## --- worth spinning up docker images for each one, so doing within bash instead --- ## # Parse ksizes in bash KSIZES=\$(echo ${ksizes} | tr ',' ' ') + # Parse molecule types in bash + MOLTYPES=\$(echo ${moltypes} | tr ',' ' ') + + # Iterate over ksizes - for KSIZE in \$KSIZES; do - # Can only merge one kize at a time - sourmash sig merge \ - --${molecule} \ - -k \$KSIZE \ - -o merged_\$KSIZE.sig \ - ${sigs} - - # "sig merge" gives default automatic name of md5sum which isn't helpful - # --> rename to cell id - sourmash sig rename \ - --${molecule} \ - -k \$KSIZE \ - -o merged_renamed_\$KSIZE.sig \ - merged_\$KSIZE.sig '${cell_id}' + for MOLTYPE in \$MOLTYPES; do + MOLTYPE_FLAG=\$(echo -n "--\$MOLTYPE") + for KSIZE in \$KSIZES; do + # Can only merge one kize at a time + sourmash sig merge \\ + \$MOLTYPE_FLAG \\ + -k \$KSIZE \\ + -o merged_\$MOLTYPE\_\$KSIZE.sig \\ + ${sigs} + + # "sig merge" gives default automatic name of md5sum which isn't helpful + # --> rename to cell id + sourmash sig rename \\ + \$MOLTYPE_FLAG \\ + -k \$KSIZE \\ + -o merged_renamed_\$MOLTYPE\_\$KSIZE.sig \\ + merged_\$MOLTYPE\_\$KSIZE.sig '${cell_id}' + done done # Concatenate all ksizes together @@ -1517,7 +1545,8 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } else if (!params.skip_compute) { sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) - .set { sourmash_sketches } + .dump ( tag: 'skip_merge__ch_sourmash_sketches_to_comapre' ) + .set { ch_sourmash_sketches_to_comapre } ch_sourmash_sig_describe_merged = Channel.empty() } else { ch_sourmash_sig_describe_merged = Channel.empty() @@ -1569,13 +1598,26 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { // .mix ( sourmash_sketches_nucleotide_for_compare ) // .set { ch_sourmash_sketches_to_compare } + ch_sourmash_sketches_merged + // Drop first index which is the cell id + .map { [it[1], it[2].split(","), it[3].split(","), it[4]] } + .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) + .groupTuple(by: [0, 1, 2]) + .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple' ) + .transpose() + .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple__transpose' ) + .dump(tag: 'ch_sourmash_sketches_to_compare' ) + .set { ch_sourmash_sketches_to_compare } + + ch_sourmash_sketches_merged_to_view + .dump( tag: "ch_sourmash_sketches_to_view" ) process sourmash_compare_sketches { // Combine peptide and nucleotide sketches tag "${sketch_id}" publishDir "${params.outdir}/compare_sketches", mode: 'copy' input: - set file ("*.sig"), val(ksize), val(molecule) from sourmash_sketches + set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare output: file(csv) From d8e764b8ae68335d7c990a3708ae17341ab47a0d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 8 Mar 2021 17:05:19 -0800 Subject: [PATCH 41/49] Add test_sig_merge --- conf/test_sig_merge.config | 33 +++++++++++++++++++++++++++++++++ nextflow.config | 1 + 2 files changed, 34 insertions(+) create mode 100644 conf/test_sig_merge.config diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config new file mode 100644 index 00000000..3bd3f29d --- /dev/null +++ b/conf/test_sig_merge.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------------------------- + * Nextflow config file for running tests with bam, translate + * ----------------------------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test_translate_bam.config + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + // Input data + bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', + 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] + // Sketch Parameters + ksizes = '3,9' + sketch_scaled = 2 + molecules = 'dna,protein,dayhoff' + read_pairs = false + save_fastas = "fastas" + write_barcode_meta_csv = "metadata.csv" + // For bam, each fasta record represents each barcode and each should have a signature + // they should not be merged, For computation on bam file using sourmash, please set true for the below flag + tenx_min_umi_per_cell = 2 + + reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' + bloomfilter_tablesize = '1e6' +} diff --git a/nextflow.config b/nextflow.config index 500b4095..857fa62e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -151,6 +151,7 @@ profiles { test_fastas { includeConfig 'conf/test_fastas.config' } test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' } test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' } + test_sig_merge { includeConfig 'conf/test_sig_merge.config' } test_tenx_tgz { includeConfig 'conf/test_tenx_tgz.config' } test_translate { includeConfig 'conf/test_translate.config' } test_translate_bam { includeConfig 'conf/test_translate_bam.config' } From 232ac0d2e1c4a1dc9284f1354be1d32a2fe8d4b3 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 8 Mar 2021 17:05:47 -0800 Subject: [PATCH 42/49] Add test_sig_merge to CI --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c5634ee..c41b9629 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,7 @@ jobs: - "test_fastas" - "test_protein_fastas" - "test_remove_ribo" + - "test_sig_merge" - "test_tenx_tgz" - "test_translate" - "test_translate_bam" From 253fa83cab1e9c6042e52c5ba7b844edb877b1f4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 8 Mar 2021 17:05:59 -0800 Subject: [PATCH 43/49] Don't allow multiple sketch values --- bin/validate_sketch_value.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py index c9e3fe06..d96cff7f 100755 --- a/bin/validate_sketch_value.py +++ b/bin/validate_sketch_value.py @@ -17,8 +17,18 @@ def get_sketch_value(value, value_log2): try: if value: + if "," in value: + logger.exception( + f"Can only provide a single number to --sketch_num_hashes or" + f" --sketch_scaled. Provided '{value}" + ) sketch_value = int(value) else: + if "," in value_log2: + logger.exception( + f"Can only provide a single number to --sketch_num_hashes_log2 or" + f" --sketch_scaled_log2. Provided '{value_log2}" + ) sketch_value = 2 ** int(value_log2) except ValueError: logger.exception( From 67aec5eac11e8381a1082a868f9168025b48ca06 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 8 Mar 2021 17:06:17 -0800 Subject: [PATCH 44/49] Reduce bloom filter table size --- conf/test_translate_bam.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index d8a1d82c..33254ef3 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -29,7 +29,7 @@ params { tenx_min_umi_per_cell = 5 reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' - bloomfilter_tablesize = '1e8' + bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' } From 8c99f9f635d48db5452d1eae472bed21c0fe0846 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 10:39:11 -0800 Subject: [PATCH 45/49] Sig merge is working! --- main.nf | 74 ++++++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 53 deletions(-) diff --git a/main.nf b/main.nf index bf489b03..bb09aa4e 100644 --- a/main.nf +++ b/main.nf @@ -1461,24 +1461,18 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple' ) .map { it -> [it[0].unique(), it[1], it[2].unique(), it[3], it[4], it[5], it[6]] } .dump( tag: 'fastq_id_to_cells__combine__sketches__grouptuple__unique' ) - .branch { - to_merge: it[1].length() > 1 - // only one of aligned or unaligned reads here - // No merging necessary - single: it[1].length() == 1 - } - .set { ch_sourmash_sketches_branched } - - ch_sourmash_sketches_branched - .to_merge - .dump ( tag: 'ch_sourmash_sketches_to_merge' ) .set { ch_sourmash_sketches_to_merge } - ch_sourmash_sketches_branched - .single - .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] } - .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' ) - .set { ch_sourmash_sketches_to_mix_with_merged } + // ch_sourmash_sketches_branched + // .to_merge + // .dump ( tag: 'ch_sourmash_sketches_to_merge' ) + // .set { ch_sourmash_sketches_to_merge } + + // ch_sourmash_sketches_branched + // .single + // .map { [it[0][0], it[1], it[2][0], it[3], it[4], it[5], file(it[6][0])] } + // .dump ( tag: 'ch_sourmash_sketches_to_mix_with_merged' ) + // .set { ch_sourmash_sketches_to_mix_with_merged } process sourmash_sig_merge { tag "${sig_id}" @@ -1495,47 +1489,20 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ output: file(csv) into ch_sourmash_sig_describe_merged - set val(cell_id), val(sketch_id), val(molecules), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view + set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) - sig_id = "${cell_id}__${sketch_id}" + sig_id = "${cell_id}---${sketch_id}" csv = "${sig_id}.csv" output_sig = "${sig_id}.sig" - println(moltypes) """ - ## --- Doing this big crazy loop in bash because the sigfiles are so small, it's not --- ## - ## --- worth spinning up docker images for each one, so doing within bash instead --- ## - # Parse ksizes in bash - KSIZES=\$(echo ${ksizes} | tr ',' ' ') - - # Parse molecule types in bash - MOLTYPES=\$(echo ${moltypes} | tr ',' ' ') - - - # Iterate over ksizes - for MOLTYPE in \$MOLTYPES; do - MOLTYPE_FLAG=\$(echo -n "--\$MOLTYPE") - for KSIZE in \$KSIZES; do - # Can only merge one kize at a time - sourmash sig merge \\ - \$MOLTYPE_FLAG \\ - -k \$KSIZE \\ - -o merged_\$MOLTYPE\_\$KSIZE.sig \\ - ${sigs} - - # "sig merge" gives default automatic name of md5sum which isn't helpful - # --> rename to cell id - sourmash sig rename \\ - \$MOLTYPE_FLAG \\ - -k \$KSIZE \\ - -o merged_renamed_\$MOLTYPE\_\$KSIZE.sig \\ - merged_\$MOLTYPE\_\$KSIZE.sig '${cell_id}' - done - done - - # Concatenate all ksizes together - sourmash sig cat merged_renamed*.sig > ${output_sig} + merge_rename_sigs.py \\ + --ksizes ${ksizes} \\ + --moltypes ${moltypes} \\ + --name '${cell_id}' \\ + --outsig ${output_sig} \\ + ${sigs} # Add csv showing number of hashes at each ksize sourmash sig describe --csv ${csv} ${output_sig} @@ -1597,6 +1564,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { // sourmash_sketches_peptide_for_compare // .mix ( sourmash_sketches_nucleotide_for_compare ) // .set { ch_sourmash_sketches_to_compare } + ch_sourmash_sketches_merged_to_view + .dump( tag: "ch_sourmash_sketches_to_view" ) ch_sourmash_sketches_merged // Drop first index which is the cell id @@ -1609,8 +1578,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { .dump(tag: 'ch_sourmash_sketches_to_compare' ) .set { ch_sourmash_sketches_to_compare } - ch_sourmash_sketches_merged_to_view - .dump( tag: "ch_sourmash_sketches_to_view" ) + process sourmash_compare_sketches { // Combine peptide and nucleotide sketches tag "${sketch_id}" From fccec83ca190d7e1ea33be9e26222468084c3419 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 10:39:20 -0800 Subject: [PATCH 46/49] Make test params more realistic --- conf/test_sig_merge.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 3bd3f29d..52b46efc 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -18,7 +18,7 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - ksizes = '3,9' + ksizes = '21,30,51' sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false From 8de30e30b8a33859d22366ec2c03fe866889d3a6 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 10:39:41 -0800 Subject: [PATCH 47/49] Update default ksizes, add track abundance true --- bin/merge_rename_sigs.py | 112 +++++++++++++++++++++++++++++++++++++++ nextflow.config | 5 +- 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100755 bin/merge_rename_sigs.py diff --git a/bin/merge_rename_sigs.py b/bin/merge_rename_sigs.py new file mode 100755 index 00000000..30a0c809 --- /dev/null +++ b/bin/merge_rename_sigs.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +import argparse +import itertools +import shutil + +import sourmash + + +def merge(filenames, ksize, moltype, name=None, outsig=None): + """ + merge one or more signatures. + + Adapted from 'sourmash sig merge' command + """ + + first_sig = None + mh = None + total_loaded = 0 + + for sigfile in filenames: + this_n = 0 + + for sigobj in sourmash.load_file_as_signatures( + sigfile, ksize=ksize, select_moltype=moltype + ): + + if sigobj is None: + error( + "No signature in file {}", + sigfile, + ) + + # first signature? initialize a bunch of stuff + if first_sig is None: + first_sig = sigobj + mh = first_sig.minhash.copy_and_clear() + + try: + sigobj_mh = sigobj.minhash + + mh.merge(sigobj_mh) + except: + error( + "ERROR when merging signature '{}' ({}) from file {}", + sigobj.name(), + sigobj.md5sum()[:8], + sigfile, + ) + raise + + this_n += 1 + total_loaded += 1 + + merged_sigobj = sourmash.SourmashSignature(mh) + if name is not None: + merged_sigobj._name = name + + if outsig is not None: + with open(outsig, "wt") as f: + sourmash.save_signatures([merged_sigobj], fp=f) + + return merged_sigobj + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Merge signatures with same ksize and moltype""" + ) + # Signature files + parser.add_argument("sigfiles", nargs="+") + parser.add_argument( + "--moltypes", + type=str, + help="Molecule types, comma-separated. e,g. 'protein,dayhoff'", + required=True, + ) + parser.add_argument( + "--ksizes", type=str, help="K-mer sizes to combine", required=True + ) + + parser.add_argument( + "-o", "--outsig", type=str, help="Signature file to output to", required=True + ) + parser.add_argument( + "-n", + "--name", + type=str, + help="Name of the signature to use", + ) + + args = parser.parse_args() + + # Only iterate and read over the sigfiles if there is really something to merge + # "something to merge" = there is more than one sigfile. otherwise there's no point + # in reading in the files only to make the same file again + if len(args.sigfiles) > 1: + ksizes = map(int, args.ksizes.split(",")) + moltypes = args.moltypes.split(",") + + merged_sigobjs = [] + for moltype, ksize in itertools.product(moltypes, ksizes): + merged_sigobj = merge( + args.sigfiles, moltype=moltype, ksize=ksize, name=args.name + ) + merged_sigobjs.append(merged_sigobj) + + with open(args.outsig, "wt") as f: + sourmash.save_signatures(merged_sigobjs, fp=f) + else: + # Otherwise, nothing to merge. Simply copy the file to the + # output signature location + shutil.copyfile(args.sigfiles[0], args.outsig) diff --git a/nextflow.config b/nextflow.config index 857fa62e..4ac52d55 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,8 +28,9 @@ params { // Creating sketches molecules ='dna,protein,dayhoff' - ksizes = '21,27,33,51' - track_abundance = false + ksizes = '21,30,51' + // Track abundance by default + track_abundance = true // Number of hashes from each sample sketch_num_hashes = false sketch_num_hashes_log2 = false From 0aade17156262784b421703dcb0c81e51400d731 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 11:24:30 -0800 Subject: [PATCH 48/49] Update variables in merge_renamed_sigs.pyh --- bin/merge_rename_sigs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/merge_rename_sigs.py b/bin/merge_rename_sigs.py index 30a0c809..f5a171bf 100755 --- a/bin/merge_rename_sigs.py +++ b/bin/merge_rename_sigs.py @@ -99,10 +99,10 @@ def merge(filenames, ksize, moltype, name=None, outsig=None): merged_sigobjs = [] for moltype, ksize in itertools.product(moltypes, ksizes): - merged_sigobj = merge( + merged = merge( args.sigfiles, moltype=moltype, ksize=ksize, name=args.name ) - merged_sigobjs.append(merged_sigobj) + merged_sigobjs.append(merged) with open(args.outsig, "wt") as f: sourmash.save_signatures(merged_sigobjs, fp=f) From ad362594eda7e18f0cb8a5101783766023e405e9 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 11:24:46 -0800 Subject: [PATCH 49/49] Get sourmash compare to happen on correct ksizes and moltypes --- main.nf | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index bb09aa4e..a6f54186 100644 --- a/main.nf +++ b/main.nf @@ -448,7 +448,7 @@ save_translate_json = params.save_translate_json // --- Parse the Sourmash parameters ---- ksizes = params.ksizes?.toString().tokenize(',') Channel.from(params.ksizes?.toString().tokenize(',')) - .into { ch_ksizes_for_compare_petide; ch_ksizes_for_compare_nucleotide } + .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide } molecules = params.molecules?.toString().tokenize(',') peptide_molecules = molecules.findAll { it != "dna" } @@ -1489,7 +1489,7 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ output: file(csv) into ch_sourmash_sig_describe_merged - set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view + set val(cell_id), val(sketch_id), val(moltypes), val(ksizes), file(output_sig) into ch_sourmash_sketches_merged, ch_sourmash_sketches_merged_to_view, ch_sourmash_sketches_merged_for_moltypes_ksizes script: // sketch_id = make_sketch_id(molecule, ksize, sketch_value, track_abundance, sketch_style) @@ -1567,31 +1567,47 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { ch_sourmash_sketches_merged_to_view .dump( tag: "ch_sourmash_sketches_to_view" ) + + ch_peptide_molecules_for_compare + .combine( ch_ksizes_for_compare_peptide ) + .set { ch_sourmash_compare_params_peptide } + + Channel.from("dna") + .combine( ch_ksizes_for_compare_nucleotide ) + .mix ( ch_sourmash_compare_params_peptide ) + .set { ch_sourmash_compare_params_both } + ch_sourmash_sketches_merged - // Drop first index which is the cell id - .map { [it[1], it[2].split(","), it[3].split(","), it[4]] } + // Drop first index (index 0) which is the cell id + // Drop the second index (index 1) which is the sketch id + // Keep only moltype + // Drop ksize + .map { [tuple(it[2].split(",")), it[4]] } .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) - .groupTuple(by: [0, 1, 2]) - .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple' ) .transpose() - .dump(tag: 'ch_sourmash_sketches_merged__map_split__grouptuple__transpose' ) + .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) + // Perform cartesian product on the molecules with compare params + .combine( ch_sourmash_compare_params_both, by: 0) + .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) + .groupTuple(by: [0, 2]) .dump(tag: 'ch_sourmash_sketches_to_compare' ) .set { ch_sourmash_sketches_to_compare } process sourmash_compare_sketches { // Combine peptide and nucleotide sketches - tag "${sketch_id}" + tag "${compare_id}" publishDir "${params.outdir}/compare_sketches", mode: 'copy' input: - set file ("*.sig"), val(ksize), val(molecule) from ch_sourmash_sketches_to_compare + // Weird order but that's how it shakes out with the groupTuple + set val(molecule), file("*.sig"), val(ksize) from ch_sourmash_sketches_to_compare output: file(csv) script: - compare_id = "molecule-${molecule}__ksize-${ksize}" + compare_id = "${molecule}__k=${ksize}" processes = "--processes ${task.cpus}" csv = "similarities__${compare_id}.csv" """ @@ -1599,8 +1615,10 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { --ksize ${ksize} \\ --${molecule} \\ --csv ${csv} \\ - $processes \\ + ${processes} \\ --traverse-directory . + # Use --traverse-directory instead of all the files explicitly to avoid + # "too many arguments" error for bash when there are lots of samples """ }