From 0df94e59f183470dd9660f8abdbc469f9a5b757f Mon Sep 17 00:00:00 2001
From: Olga Botvinnik
Date: Tue, 16 Mar 2021 11:26:08 -0700
Subject: [PATCH 1/2] Start working on sourmash search for https://github.com/nf-core/kmermaid/issues/118

---
 main.nf | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/main.nf b/main.nf
index 94756edf..b922fb22 100644
--- a/main.nf
+++ b/main.nf
@@ -531,6 +531,42 @@ else {
   barcode_metadata_folder = "barcode_metadata"
 }
 
+
+////////////////////////////////////////////////////
+/* --      Parse Sourmash Search Parameters      -- */
+////////////////////////////////////////////////////
+
+if (params.celltype_sbt_db_dna) {
+  Channel.fromPath(params.celltype_sbt_db_dna, checkIfExists: true)
+    .ifEmpty { exit 1,
+      "Reference cell label DNA k-mer signatures file not found: ${params.celltype_sbt_db_dna}" }
+    .map { tuple("dna", params.celltype_sbt_db_dna_ksize, it) }
+    .set{ ch_celltype_db_sbt_dna }
+} else {
+  ch_celltype_db_sbt_dna = Channel.empty()
+}
+
+if (params.celltype_sbt_db_protein) {
+  Channel.fromPath(params.celltype_sbt_db_protein, checkIfExists: true)
+    .ifEmpty { exit 1,
+      "Reference cell label Protein k-mer signatures file not found: ${params.celltype_sbt_db_protein}" }
+    .map { tuple("protein", params.celltype_sbt_db_protein_ksize, it) }
+    .set{ ch_celltype_db_sbt_protein }
+} else {
+  ch_celltype_db_sbt_protein = Channel.empty()
+}
+
+if (params.celltype_sbt_db_dayhoff) {
+  Channel.fromPath(params.celltype_sbt_db_dayhoff, checkIfExists: true)
+    .ifEmpty { exit 1,
+      "Reference cell label Dayhoff k-mer signatures file not found: ${params.celltype_sbt_db_dayhoff}" }
+    .map { tuple("dayhoff", params.celltype_sbt_db_dayhoff_ksize, it) }
+    .set{ ch_celltype_db_sbt_dayhoff }
+} else {
+  ch_celltype_db_sbt_dayhoff = Channel.empty()
+}
+
+
 // Has the run name been specified by the user?
 // this has the bonus effect of catching both -name and --name
 custom_runName = params.name
@@ -1648,6 +1684,52 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
   }
 }
 
+// If any cell type database is present, search each sketch against the
+// database that matches its molecule type and ksize
+if (params.celltype_sbt_db_dna || params.celltype_sbt_db_protein || params.celltype_sbt_db_dayhoff) {
+  // Merge the per-molecule databases into one channel of (molecule, ksize, sbt)
+  ch_celltype_db_sbt_dna
+    .mix(ch_celltype_db_sbt_protein, ch_celltype_db_sbt_dayhoff)
+    .set { ch_celltype_db_sbt }
+
+  // NOTE(review): ch_sourmash_sketches_to_search is not created by this patch.
+  // It is assumed to emit (molecule, ksize, signature) with the same molecule
+  // labels and ksize type as the database tuples; otherwise nothing is paired.
+  ch_sourmash_sketches_to_search
+    .combine(ch_celltype_db_sbt, by: [0, 1])
+    .set { ch_sketches_with_celltype_db }
+
+  process sourmash_search {
+    // Query one sketch against the cell type database and write the hits as CSV
+    tag "${sigs.simpleName}"
+    publishDir "${params.outdir}/index", mode: 'copy'
+
+    input:
+    set val(molecule), val(ksize), file(sigs), file(celltype_db_sbt_zip) from ch_sketches_with_celltype_db
+
+    output:
+    file(csv)
+
+    script:
+    // Name the result after both query and database so published files do not collide
+    csv = "${sigs.simpleName}__${celltype_db_sbt_zip.simpleName}.csv"
+    // Parse sourmash search parameters
+    containment_flag = params.containment ? "--containment" : ""
+    threshold_flag = "--threshold ${params.search_threshold}"
+    // sourmash search takes the query signature first, then the database(s)
+    """
+    sourmash search \\
+      ${threshold_flag} \\
+      ${containment_flag} \\
+      --ksize ${ksize} \\
+      --${molecule} \\
+      --output ${csv} \\
+      ${sigs} \\
+      ${celltype_db_sbt_zip}
+    """
+  }
+}
+
 
 /*
  * STEP 16 - MultiQC

From e39634ec636b152defd36698fe9a9e6f86a2db2b Mon Sep 17 00:00:00 2001
From: Olga Botvinnik
Date: Tue, 16 Mar 2021 11:26:21 -0700
Subject: [PATCH 2/2] Add search parameters

---
 nextflow.config | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 589d186f..8d12554e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -41,6 +41,19 @@ params {
   // Comparing sketches
   skip_compare = false
 
+  // Query per-cell k-mer signatures for labels in a cell type database
+  containment = false
+  search_threshold = 1e-10
+  // DNA, k_nuc size = 21
+  celltype_sbt_db_dna = false
+  celltype_sbt_db_dna_ksize = 21
+  // Protein, k_nuc size = 30, k_aa size = 10
+  celltype_sbt_db_protein = false
+  celltype_sbt_db_protein_ksize = 30
+  // Dayhoff, k_nuc size = 51, k_aa size = 17
+  celltype_sbt_db_dayhoff = false
+  celltype_sbt_db_dayhoff_ksize = 51
+
   // Computing sketches
   skip_compute = false
 