diff --git a/Cargo.toml b/Cargo.toml index a84a884..8222d5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.10" +version = "0.3.11" authors = [ "John Lees ", "Simon Harris ", diff --git a/README.md b/README.md index 77dd9de..978941d 100644 --- a/README.md +++ b/README.md @@ -26,11 +26,18 @@ Split k-mer analysis (version 2) uses exact matching of split k-mer sequences to SKA can only align SNPs further than the k-mer length apart, and does not use a gap penalty approach or give alignment scores. But the advantages are speed and flexibility, particularly the ability to run on a reference-free manner (i.e. including accessory genome variation) on both assemblies and reads. +### Citation + +Romain Derelle, Johanna von Wachsmann, Tommi Mäklin, Joel Hellewell, Timothy Russell, Ajit Lalvani, Leonid Chindelevitch, Nicholas J. Croucher, Simon R. Harris, John A. Lees (2024). +**Seamless, rapid and accurate analyses of outbreak genomic data using Split _k_-mer Analysis** +*Genome Research* + ## Documentation Can be found at https://docs.rs/ska. We also have some tutorials available: - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/). +- [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/). ## Installation diff --git a/src/cli.rs b/src/cli.rs index dcdf0c8..fea883a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17; pub const DEFAULT_PROPORTION_READS: Option = None; /// Default single strand (which is equivalent to !rc) pub const DEFAULT_STRAND: bool = false; +/// Default minimum frequency filter threshold +pub const DEFAULT_MINFREQ: f64 = 0.9; /// Default behaviour when min-freq counting ambig sites pub const DEFAULT_AMBIGMISSING: bool = false; /// Default repeat masking behaviour @@ -191,7 +193,7 @@ pub enum Commands { output: Option, /// Minimum fraction of samples a k-mer has to appear in - #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)] + #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)] min_freq: f64, /// With min_freq, only count non-ambiguous sites @@ -255,9 +257,9 @@ pub enum Commands { #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)] min_freq: f64, - /// Filter for ambiguous bases + /// Filter out ambiguous bases ('N' still a mismatch) #[arg(long, default_value_t = false)] - filter_ambiguous: bool, + allow_ambiguous: bool, /// Number of CPU threads #[arg(long, value_parser = valid_cpus, default_value_t = 1)] @@ -312,7 +314,7 @@ pub enum Commands { reverse: bool, /// Minimum fraction of samples a k-mer has to appear in - #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)] + #[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)] min_freq: f64, /// With min_freq, only count non-ambiguous sites diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 9b53be8..b6ddb44 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -152,6 +152,8 @@ pub fn distance UInt<'a>>( ); if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) { if filt_ambig { + let filter_ambig_as_missing = true; + let mask_ambig = true; apply_filters( ska_array, min_freq, diff --git a/src/lib.rs b/src/lib.rs index 6601751..988725f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,7 @@ //! //! Tutorials: //! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/). +//! - [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/). //! //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage). //! @@ -208,9 +209,12 @@ //! ska distance -o distances.txt seqs.skf //! ``` //! -//! Ignore ambiguous bases by adding `--filter-ambiguous` flag, and `--min-freq` to -//! ignore k-mers only found in some samples. Multiple threads -//! can be used, but this will only be effective with large numbers of samples. +//! Consider ambiguous bases by adding `--allow-ambiguous` flag, and change `--min-freq` to +//! ignore more/less k-mers only found in some samples (default = 0.9). Note that ambiguous bases may overestimate +//! distances due to repeat k-mers. For finer control over filtering, first run `ska weed` +//! on the input .skf. +//! +//! Multiple threads can be used, but this will only be effective with large numbers of samples. //! //! The companion script in `scripts/cluster_dists.py` (requires `networkx`) can //! be used to make single linkage clusters from these distances at given thresholds, @@ -619,10 +623,11 @@ pub fn main() { skf_file, output, min_freq, - filter_ambiguous, + allow_ambiguous, threads, } => { check_threads(*threads); + let filter_ambiguous = !*allow_ambiguous; if let Ok(mut ska_array) = MergeSkaArray::::load(skf_file) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); @@ -630,7 +635,7 @@ pub fn main() { &mut ska_array, output, *min_freq, - *filter_ambiguous, + filter_ambiguous, *threads, ); } else if let Ok(mut ska_array) = MergeSkaArray::::load(skf_file) { @@ -640,7 +645,7 @@ pub fn main() { &mut ska_array, output, *min_freq, - *filter_ambiguous, + filter_ambiguous, *threads, ); } else { diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index ff472a8..ce413f8 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -342,6 +342,8 @@ where } else { removed += 1; } + } else { + removed += 1; } } self.variants = filtered_variants; diff --git a/src/ska_dict/bit_encoding.rs b/src/ska_dict/bit_encoding.rs index 790ab3e..7cb9da6 100644 --- a/src/ska_dict/bit_encoding.rs +++ b/src/ska_dict/bit_encoding.rs @@ -76,7 +76,7 @@ pub fn base_to_prob(base: u8) -> [f64; 4] { b'D' => [1.0 / 3.0, 0.0, 1.0 / 3.0, 1.0 / 3.0], b'H' => [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0], b'V' => [1.0 / 3.0, 1.0 / 3.0, 0.0, 1.0 / 3.0], - b'N' => [0.25, 0.25, 0.25, 0.25], + b'N' => [0.0, 0.0, 0.0, 0.0], // This gives more expected behaviour than 0.25 in each entry _ => [0.0, 0.0, 0.0, 0.0], } } @@ -494,7 +494,7 @@ mod tests { assert_eq!(overlap(&k, &b), 1.0 / 3.0); assert_eq!(overlap(&d, &h), 2.0 / 9.0); - assert_eq!(overlap(&v, &n), 0.25); + assert_eq!(overlap(&v, &n), 0.0); assert_eq!(overlap(&n, &empty), 0.0); } diff --git a/tests/distance.rs b/tests/distance.rs index 1fb6265..a4ed150 100644 --- a/tests/distance.rs +++ b/tests/distance.rs @@ -50,6 +50,7 @@ fn dist_filter() { .current_dir(sandbox.get_wd()) .arg("distance") .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--allow-ambiguous") .arg("-v") .args(&["--threads", "2"]) .assert() @@ -60,7 +61,6 @@ fn dist_filter() { .current_dir(sandbox.get_wd()) .arg("distance") .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) - .arg("--filter-ambiguous") .arg("-v") .assert() .stdout_eq_path(sandbox.file_string("merge_k9_no_ambig.dist.stdout", TestDir::Correct)); @@ -97,21 +97,24 @@ fn multisample_dists() { .assert() .success(); + // Test with filters off Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("distance") .arg("multidist.skf") .arg("-v") + .args(&["--min-freq", "0"]) + .arg("--allow-ambiguous") .args(&["--threads", "2"]) .assert() .stdout_eq_path(sandbox.file_string("multidist.stdout", TestDir::Correct)); + // Test with default filters Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("distance") .arg("multidist.skf") .arg("-v") - .args(&["--min-freq", "0.5"]) .assert() .stdout_eq_path(sandbox.file_string("multidist.filter.stdout", TestDir::Correct)); } diff --git a/tests/test_results_correct/merge_k9_min_freq.dist.stdout b/tests/test_results_correct/merge_k9_min_freq.dist.stdout index 6bef4ce..5556e85 100644 --- a/tests/test_results_correct/merge_k9_min_freq.dist.stdout +++ b/tests/test_results_correct/merge_k9_min_freq.dist.stdout @@ -1,2 +1,2 @@ Sample1 Sample2 Distance Mismatches -test_1 test_2 2.50 0.00000 +test_1 test_2 2.00 0.00000 diff --git a/tests/test_results_correct/multidist.filter.stdout b/tests/test_results_correct/multidist.filter.stdout index ff50dc8..2f24536 100644 --- a/tests/test_results_correct/multidist.filter.stdout +++ b/tests/test_results_correct/multidist.filter.stdout @@ -1,16 +1,16 @@ Sample1 Sample2 Distance Mismatches -N_test_1 N_test_2 2.00 0.22222 +N_test_1 N_test_2 2.00 0.51724 N_test_1 ambig_test_1 0.00 1.00000 N_test_1 ambig_test_2 0.00 1.00000 -N_test_1 test_1 0.50 0.08333 -N_test_1 test_2 1.00 0.05556 +N_test_1 test_1 1.00 0.25455 +N_test_1 test_2 1.00 0.42623 N_test_2 ambig_test_1 0.00 1.00000 N_test_2 ambig_test_2 0.00 1.00000 -N_test_2 test_1 1.50 0.19444 -N_test_2 test_2 1.00 0.16667 -ambig_test_1 ambig_test_2 0.00 0.00000 +N_test_2 test_1 2.00 0.56716 +N_test_2 test_2 1.00 0.28571 +ambig_test_1 ambig_test_2 1.00 0.44444 ambig_test_1 test_1 0.00 1.00000 ambig_test_1 test_2 0.00 1.00000 ambig_test_2 test_1 0.00 1.00000 ambig_test_2 test_2 0.00 1.00000 -test_1 test_2 1.50 0.02778 +test_1 test_2 3.00 0.44118