diff --git a/CHANGELOG.md b/CHANGELOG.md index a977cf1b..bbfedb75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ barcode fastq * Add `--skip_compare option` to skip `sourmash_compare_sketches` process * Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117)) * Add renamed package dependency orpheum (used to be known as sencha) +* Add `--sketch_singleton` option to compute one sourmash signature per FASTA/FASTQ entry ### `Fixed` diff --git a/main.nf b/main.nf index 94756edf..05b9349a 100644 --- a/main.nf +++ b/main.nf @@ -90,6 +90,7 @@ def helpMessage() { --skip_compare If provided, skip comparison of hashes using sourmash compare --skip_compute If provided, skip computing of signatures using sourmash compute --skip_sig_merge If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files + --sketch_singleton If provided, compute one k-mer sketch per fasta entry, not for the whole file Sketch size options: --sketch_num_hashes Number of hashes to use for making the sketches. 
@@ -468,6 +469,7 @@ sketch_num_hashes = params.sketch_num_hashes sketch_num_hashes_log2 = params.sketch_num_hashes_log2 sketch_scaled = params.sketch_scaled sketch_scaled_log2 = params.sketch_scaled_log2 +sketch_singleton = params.sketch_singleton have_sketch_value = params.sketch_num_hashes || params.sketch_num_hashes_log2 || params.sketch_scaled || params.sketch_scaled_log2 if (!have_sketch_value && !params.split_kmer) { @@ -582,6 +584,7 @@ summary['Skip multiqc?'] = params.skip_multiqc summary['K-mer sizes'] = params.ksizes summary['Molecule'] = params.molecules summary['Track Abundance'] = params.track_abundance +summary['Singleton sketches?'] = params.sketch_singleton // -- Sketch size parameters -- if (params.sketch_num_hashes) summary['Sketch Sizes'] = params.sketch_num_hashes if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params.sketch_num_hashes_log2 @@ -1324,17 +1327,18 @@ if (!params.remove_ribo_rna) { ) sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' + singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'" sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" """ sourmash compute \\ ${sketch_value_flag} \\ + ${singleton_flag} \\ --ksizes ${params.ksizes} \\ --dna \\ $track_abundance_flag \\ --output ${sig} \\ - --name '${sample_id}' \\ $reads sourmash sig describe --csv ${csv} ${sig} """ @@ -1408,16 +1412,17 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' + singleton_flag = sketch_singleton ? 
"--singleton" : "--name '${sample_id}'" sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" """ sourmash compute \\ ${sketch_value_flag} \\ + ${singleton_flag} \\ --ksizes ${params.ksizes} \\ --input-is-protein \\ ${peptide_molecule_flags} \\ - --name '${sample_id}' \\ --no-dna \\ $track_abundance_flag \\ --output ${sig} \\ diff --git a/nextflow.config b/nextflow.config index 589d186f..6e775a21 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,9 @@ params { tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)' tenx_min_umi_per_cell = 1000 + // DNA sequence parsing + skip_trimming = false + // Creating sketches molecules ='dna,protein,dayhoff' ksizes = '21,30,51' @@ -36,6 +39,7 @@ params { sketch_num_hashes_log2 = false sketch_scaled = false sketch_scaled_log2 = false + sketch_singleton = false skip_sig_merge = false // Comparing sketches @@ -44,8 +48,6 @@ params { // Computing sketches skip_compute = false - skip_trimming = false - // translate options translate_peptide_ksize = 8 translate_peptide_molecule = 'protein' diff --git a/nextflow_schema.json b/nextflow_schema.json index a279fa64..b7f8ec88 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -150,6 +150,11 @@ "type": "integer", "fa_icon": "fas fa-barcode", "description": "Integer value to subsample reads from input fastq files" + }, + "sketch_singleton": { + "type": "boolean", + "description": "Compute one signature per entry in the FASTA file, which is useful when the file contains e.g. a transcript or genome per entry. 
This is not recommended for FASTQ files as it would compute one signature per read, and presumably one would want one signature per sequencing dataset", + "fa_icon": "fas fa-dice-one" } }, "fa_icon": "fas fa-cogs" @@ -187,6 +192,16 @@ "type": "integer", "description": "Maximum table size for bloom filter creation", "fa_icon": "fas fa-code-branch" + }, + "save_translate_csv": { + "type": "string", + "description": "Path to save the coding scores as a csv", + "default": "False" + }, + "save_translate_json": { + "type": "string", + "description": "Path to save summarization of coding/noncoding/other categorizations, the min/max/mean/median/stddev of Jaccard scores, and other as a json", + "default": "False" } } }, @@ -484,17 +499,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "save_translate_csv": { - "type": "string", - "description": "Path to save the coding scores as a csv", - "default": "False" - }, - "save_translate_json": { - "type": "string", - "description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json", - "default": "False" - } - } + ] } \ No newline at end of file