From 4ae14f050d4dac659f6323a5ec5bfd9b8e473afe Mon Sep 17 00:00:00 2001 From: John Lees Date: Sat, 14 Oct 2023 10:21:06 +0100 Subject: [PATCH 1/7] Author list update --- Cargo.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 24c93d8..a3803e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,10 +4,12 @@ version = "0.3.3" authors = [ "John Lees ", "Simon Harris ", - "Johanna von Wachsmann ", + "Johanna von Wachsmann ", "Tommi Maklin ", "Joel Hellewell ", - "Timothy Russell " + "Timothy Russell ", + "Romain Derelle ", + "Nicholas Croucher " ] edition = "2021" description = "Split k-mer analysis" From 9c9fa72f971708f62b2ac5e04ac0cddbf9ecc332 Mon Sep 17 00:00:00 2001 From: John Lees Date: Sat, 14 Oct 2023 10:23:11 +0100 Subject: [PATCH 2/7] Change middle base filter to strict with reads --- src/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli.rs b/src/cli.rs index e004b7a..ce0c353 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -151,7 +151,7 @@ pub enum Commands { min_qual: u8, /// Quality filtering criteria (with reads) - #[arg(long, value_enum, default_value_t = QualFilter::Middle)] + #[arg(long, value_enum, default_value_t = QualFilter::Strict)] qual_filter: QualFilter, /// Number of CPU threads From bc0d3e90f89fb44a9e2a97072b51147cc08358a4 Mon Sep 17 00:00:00 2001 From: John Lees Date: Sat, 14 Oct 2023 10:26:39 +0100 Subject: [PATCH 3/7] Bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a3803e0..02e3e65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.3" +version = "0.3.4" authors = [ "John Lees ", "Simon Harris ", From aef549dc35ca90bfce1735a73b175d25f7a77db6 Mon Sep 17 00:00:00 2001 From: John Lees Date: Sun, 15 Oct 2023 11:23:09 +0100 Subject: [PATCH 4/7] Add an iterator to MergeSkaArray --- src/merge_ska_array.rs | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index aae700f..f5c432b 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -511,6 +511,15 @@ where }; (distance, mismatches) } + + /// Iterator over split k-mers and middle bases + pub fn iter(&self) -> KmerIter { + KmerIter { + kmers: &self.split_kmers, + vars: &self.variants, + index: 0, + } + } } /// Writes basic information. @@ -569,6 +578,38 @@ where } } +/// Iterator type over split k-mers and middle bases +/// +/// Each return is a tuple of the encoded split-kmer and a vector +/// of the encoded middle bases +pub struct KmerIter<'a, IntT> { + kmers: &'a Vec, + vars: &'a Array2, + index: usize, +} + +impl<'a, IntT> Iterator for KmerIter<'a, IntT> +where + IntT: for<'b> UInt<'b>, +{ + // Note this returns a Vec of the middle bases rather than an array + // because this is more likely to be useful in user code + type Item = (IntT, Vec); + + fn next(&mut self) -> Option { + if self.index < self.kmers.len() { + let row = Some(( + self.kmers[self.index], + self.vars.index_axis(Axis(0), self.index).to_vec(), + )); + self.index += 1; + row + } else { + None + } + } +} + #[cfg(test)] mod tests { use super::*; // Import functions and types from the parent module @@ -592,6 +633,25 @@ mod tests { example_array } + #[test] + fn test_kmer_iterator() { + let ska = setup_struct(); + let mut iter = ska.iter(); + + // First iteration + let (kmer, vars) = iter.next().unwrap(); + assert_eq!(kmer, 0); + assert_eq!(vars, vec![1, 2, 3]); + + // Second iteration + let (kmer, vars) = iter.next().unwrap(); + assert_eq!(kmer, 1); + assert_eq!(vars, vec![4, 5, 6]); + + // No more items + assert!(iter.next().is_none()); + } + #[test] fn test_delete_samples_normal() { let mut my_struct = setup_struct(); From c791de4516e9cd3e844da4048cf1da864a4d5611 Mon Sep 17 00:00:00 2001 From: John Lees Date: Sun, 15 Oct 2023 22:14:55 +0100 Subject: [PATCH 5/7] Add option to filter out gap only sites --no-gap-only-sites useful for lower coverage/low quality samples --- src/cli.rs | 10 ++++++ src/generic_modes.rs | 51 ++++++++++++++++++++++----- src/lib.rs | 25 +++++++++++-- src/merge_ska_array.rs | 71 +++++++++++++++++++++---------------- tests/align.rs | 79 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 40 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index ce0c353..8ed36a0 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -13,6 +13,8 @@ pub const DEFAULT_STRAND: bool = false; pub const DEFAULT_REPEATMASK: bool = false; /// Default ambiguous masking behaviour pub const DEFAULT_AMBIGMASK: bool = false; +/// Default gap ignoring behaviour (at constant sites) +pub const DEFAULT_CONSTGAPS: bool = false; /// Default minimum k-mer count for FASTQ files pub const DEFAULT_MINCOUNT: u16 = 10; /// Default minimum base quality (PHRED score) for FASTQ files @@ -180,6 +182,10 @@ pub enum Commands { #[arg(long, default_value_t = DEFAULT_AMBIGMASK)] ambig_mask: bool, + /// Ignore gaps '-' in constant sites (for low coverage samples) + #[arg(long, default_value_t = DEFAULT_CONSTGAPS)] + no_gap_only_sites: bool, + /// Number of CPU threads #[arg(long, value_parser = valid_cpus, default_value_t = 1)] threads: usize, @@ -287,6 +293,10 @@ pub enum Commands { /// Mask any ambiguous bases in the alignment with 'N' #[arg(long, default_value_t = DEFAULT_AMBIGMASK)] ambig_mask: bool, + + /// Ignore gaps '-' in constant sites + #[arg(long, default_value_t = DEFAULT_CONSTGAPS)] + no_gap_only_sites: bool, }, /// Get the number of k-mers in a split k-mer file, and other information Nk { diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 556be0a..db21ec3 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -20,13 +20,14 @@ pub fn align UInt<'a>>( output: &Option, filter: &FilterType, mask_ambig: bool, + ignore_const_gaps: bool, min_freq: f64, ) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); // Apply filters - apply_filters(ska_array, min_freq, filter, mask_ambig); + apply_filters(ska_array, min_freq, filter, mask_ambig, ignore_const_gaps); // Write out to file/stdout log::info!("Writing alignment"); @@ -104,11 +105,18 @@ pub fn apply_filters UInt<'a>>( min_freq: f64, filter: &FilterType, ambig_mask: bool, + ignore_const_gaps: bool, ) -> i32 { let update_kmers = false; let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize; - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}"); - ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers) + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); + ska_array.filter( + filter_threshold, + filter, + ambig_mask, + ignore_const_gaps, + update_kmers, + ) } /// Calculate distances between samples @@ -125,12 +133,31 @@ pub fn distance UInt<'a>>( log::debug!("{ska_array}"); let mask_ambig = false; - let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst, mask_ambig); + let ignore_const_gaps = false; + let constant = apply_filters( + ska_array, + min_freq, + &FilterType::NoConst, + mask_ambig, + ignore_const_gaps, + ); if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) { if filt_ambig { - apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst, mask_ambig); + apply_filters( + ska_array, + min_freq, + &FilterType::NoAmbigOrConst, + mask_ambig, + ignore_const_gaps, + ); } else { - apply_filters(ska_array, min_freq, &FilterType::NoFilter, mask_ambig); + apply_filters( + ska_array, + min_freq, + &FilterType::NoFilter, + mask_ambig, + ignore_const_gaps, + ); } } @@ -178,6 +205,7 @@ pub fn delete UInt<'a>>( } /// Remove k-mers, and optionally apply filters to an array +#[allow(clippy::too_many_arguments)] pub fn weed UInt<'a>>( ska_array: &mut MergeSkaArray, weed_file: &Option, @@ -185,6 +213,7 @@ pub fn weed UInt<'a>>( min_freq: f64, filter: &FilterType, ambig_mask: bool, + ignore_const_gaps: bool, out_file: &str, ) { if let Some(weed_fasta) = weed_file { @@ -213,9 +242,15 @@ pub fn weed UInt<'a>>( let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize; if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask { - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}"); + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); let update_kmers = true; - ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers); + ska_array.filter( + filter_threshold, + filter, + ambig_mask, + ignore_const_gaps, + update_kmers, + ); } log::info!("Saving modified skf file"); diff --git a/src/lib.rs b/src/lib.rs index a07bee6..33a97ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -367,8 +367,9 @@ //! let min_count = 2; //! let update_kmers = false; //! let filter = FilterType::NoConst; +//! let ignore_const_gaps = false; //! let ambig_mask = false; -//! ska_array.filter(min_count, &filter, update_kmers, ambig_mask); +//! ska_array.filter(min_count, &filter, ambig_mask, ignore_const_gaps, update_kmers); //! //! // Write out to stdout //! let mut out_stream = set_ostream(&None); @@ -512,17 +513,32 @@ pub fn main() { min_freq, filter, ambig_mask, + no_gap_only_sites, threads, } => { check_threads(*threads); if let Ok(mut ska_array) = load_array::(input, *threads) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); - align(&mut ska_array, output, filter, *ambig_mask, *min_freq); + align( + &mut ska_array, + output, + filter, + *ambig_mask, + *no_gap_only_sites, + *min_freq, + ); } else if let Ok(mut ska_array) = load_array::(input, *threads) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); - align(&mut ska_array, output, filter, *ambig_mask, *min_freq); + align( + &mut ska_array, + output, + filter, + *ambig_mask, + *no_gap_only_sites, + *min_freq, + ); } else { panic!("Could not read input file(s): {input:?}"); } @@ -640,6 +656,7 @@ pub fn main() { reverse, min_freq, ambig_mask, + no_gap_only_sites, filter, } => { log::info!("Loading skf file"); @@ -651,6 +668,7 @@ pub fn main() { *min_freq, filter, *ambig_mask, + *no_gap_only_sites, if output.is_none() { skf_file } else { @@ -665,6 +683,7 @@ pub fn main() { *min_freq, filter, *ambig_mask, + *no_gap_only_sites, if output.is_none() { skf_file } else { diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index f5c432b..e6e697e 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -56,10 +56,15 @@ use crate::cli::FilterType; /// let min_count = 1; // no filtering by minor allele frequency /// let filter = FilterType::NoAmbigOrConst; // remove sites with no minor allele /// let mask_ambiguous = false; // leave ambiguous sites as IUPAC codes +/// let ignore_const_gaps = false; // keep sites with only '-' as variants /// let update_counts = true; // keep counts updated, as saving -/// ska_array.filter(min_count, &filter, mask_ambiguous, update_counts); +/// ska_array.filter(min_count, &filter, mask_ambiguous, ignore_const_gaps, update_counts); /// ska_array.save(&"no_const_sites.skf"); /// +/// // Create an iterators +/// let mut kmer_iter = ska_array.iter(); +/// let (kmer, middle_base_vec) = kmer_iter.next().unwrap(); +/// /// // Delete a sample /// ska_array.delete_samples(&[&"test_1"]); /// @@ -231,8 +236,9 @@ where /// # Arguments /// /// - `min_count` -- minimum number of samples split k-mer found in. - /// - `const_sites` -- include sites where all middle bases are the same. + /// - `filter` -- either none, remove constant, remove ambiguous, or both. See [`FilterType`] /// - `mask_ambig` -- replace any non-ACGTUN- with N + /// - `ignore_const_gaps` -- filter any sites where the only variants are gaps /// - `update_kmers` -- update counts and split k-mers after removing variants. /// /// The default for `update_kmers` should be `true`, but it can be `false` @@ -242,6 +248,7 @@ where min_count: usize, filter: &FilterType, mask_ambig: bool, + ignore_const_gaps: bool, update_kmers: bool, ) -> i32 { let total = self.names.len(); @@ -260,15 +267,16 @@ where let keep_var = match *filter { FilterType::NoFilter => true, FilterType::NoConst => { - let first_var = row[0]; - let mut keep = false; + let mut var_types = HashSet::new(); for var in row { - if *var != first_var { - keep = true; - break; + if !ignore_const_gaps || *var != b'-' { + var_types.insert(*var); + if var_types.len() > 1 { + break; + } } } - keep + var_types.len() > 1 } FilterType::NoAmbig => { let mut keep = true; @@ -281,21 +289,26 @@ where keep } FilterType::NoAmbigOrConst => { - let mut keep = false; - let mut first_var = None; + let mut var_types = HashSet::new(); for var in row { - if !is_ambiguous(*var) { - if first_var.is_none() { - first_var = Some(*var); - } else if *var != first_var.unwrap() { - keep = true; + var_types.insert(*var); + } + let mut count = 0; + for base in var_types { + let lower_base = base | 0x20; + count += match lower_base { + b'a' | b'c' | b'g' | b't' | b'u' => 1, + b'-' => { + if ignore_const_gaps { + 0 + } else { + 1 + } } - } else { - keep = false; - break; + _ => 0, } } - keep + count > 1 } }; if keep_var { @@ -635,8 +648,8 @@ mod tests { #[test] fn test_kmer_iterator() { - let ska = setup_struct(); - let mut iter = ska.iter(); + let ska_array = setup_struct(); + let mut iter = ska_array.iter(); // First iteration let (kmer, vars) = iter.next().unwrap(); @@ -654,30 +667,30 @@ mod tests { #[test] fn test_delete_samples_normal() { - let mut my_struct = setup_struct(); + let mut ska_array = setup_struct(); - my_struct.delete_samples(&["Sample1", "Sample2"]); + ska_array.delete_samples(&["Sample1", "Sample2"]); // Check that the samples were deleted - assert_eq!(my_struct.names, vec!["Sample3"]); - assert_eq!(my_struct.variants, array![[3], [6]]); + assert_eq!(ska_array.names, vec!["Sample3"]); + assert_eq!(ska_array.variants, array![[3], [6]]); } #[test] #[should_panic(expected = "Invalid number of samples to remove")] fn test_delete_samples_empty_or_all() { - let mut my_struct = setup_struct(); + let mut ska_array = setup_struct(); // This should panic - my_struct.delete_samples(&[]); + ska_array.delete_samples(&[]); } #[test] #[should_panic(expected = "Could not find sample(s): ")] fn test_delete_samples_non_existent() { - let mut my_struct = setup_struct(); + let mut ska_array = setup_struct(); // This should panic because "Sample4" does not exist - my_struct.delete_samples(&["Sample4"]); + ska_array.delete_samples(&["Sample4"]); } } diff --git a/tests/align.rs b/tests/align.rs index 6228c3b..bd22fe6 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -234,6 +234,85 @@ fn filters() { let correct_aln = HashSet::from([vec!['T', 'A'], vec!['C', 'T'], vec!['N', 'G']]); assert_eq!(var_hash(&ambig_filter_align_out), correct_aln); + + // Output everything by using min-freq 0, check for behaviour on gap only sites + let unfilt_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-const") + .arg("--min-freq") + .arg("0") + .arg("-v") + .output() + .unwrap() + .stdout; + + let full_length = 33; + let lengths = aln_length(&unfilt_align_out); + for length in lengths { + assert_eq!(full_length, length); + } + + let filt_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-const") + .arg("--min-freq") + .arg("0") + .arg("--no-gap-only-sites") + .arg("-v") + .output() + .unwrap() + .stdout; + + let full_length = 3; + let lengths = aln_length(&filt_align_out); + for length in lengths { + assert_eq!(full_length, length); + } + + let unfilt_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-ambig-or-const") + .arg("--min-freq") + .arg("0") + .arg("-v") + .output() + .unwrap() + .stdout; + + let full_length = 32; + let lengths = aln_length(&unfilt_align_out); + for length in lengths { + assert_eq!(full_length, length); + } + + let filt_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-ambig-or-const") + .arg("--min-freq") + .arg("0") + .arg("-v") + .arg("--no-gap-only-sites") + .output() + .unwrap() + .stdout; + + let full_length = 2; + let lengths = aln_length(&filt_align_out); + for length in lengths { + assert_eq!(full_length, length); + } } #[test] From 8840bcc53c56a88e5dbee8c7ae4c8091c8b5bdd9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 16 Oct 2023 10:30:54 +0100 Subject: [PATCH 6/7] Update the README --- README.md | 40 ++++++++++++++++++++++++++++------------ src/lib.rs | 3 +++ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c0f4a69..77dd9de 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,29 @@ [![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/bacpop/ska.rust)](https://github.com/bacpop/ska.rust/releases) +## Description + +This is a reimplementation of the [SKA package](https://github.com/simonrharris/SKA) +in the rust language, by Johanna von Wachsmann, Simon Harris and John Lees. We are also grateful to have +received user contributions from: + +- Romain Derelle +- Tommi Maklin +- Joel Hellewell +- Timothy Russell +- Nicholas Croucher +- Dan Lu + +Split k-mer analysis (version 2) uses exact matching of split k-mer sequences to align closely related sequences, typically small haploid genomes such as bacteria and viruses. + +SKA can only align SNPs further than the k-mer length apart, and does not use a gap penalty approach or give alignment scores. But the advantages are speed and flexibility, particularly the ability to run on a reference-free manner (i.e. including accessory genome variation) on both assemblies and reads. + +## Documentation + +Can be found at https://docs.rs/ska. We also have some tutorials available: + +- [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/). + ## Installation Choose from: @@ -35,16 +58,7 @@ xattr -d "com.apple.quarantine" ./ska 1. Clone the repository with `git clone`. 2. Run `cargo install --path .` or `RUSTFLAGS="-C target-cpu=native" cargo install --path .` to optimise for your machine. -## Documentation - -Can be found at https://docs.rs/ska. - -## Description - -This is a reimplementation of Simon Harris' [SKA package](https://github.com/simonrharris/SKA) -in the rust language, by Johanna von Wachsmann, Simon Harris and John Lees. - -> SKA (Split Kmer Analysis) is a toolkit for prokaryotic (and any other small, haploid) DNA sequence analysis using split kmers. A split kmer is a pair of kmers in a DNA sequence that are separated by a single base. Split kmers allow rapid comparison and alignment of small genomes, and is particulalry suited for surveillance or outbreak investigation. SKA can produce split kmer files from fasta format assemblies or directly from fastq format read sequences, cluster them, align them with or without a reference sequence and provide various comparison and summary statistics. Currently all testing has been carried out on high-quality Illumina read data, so results for other platforms may vary. +## Differences from SKA1 Optimisations include: @@ -52,7 +66,7 @@ Optimisations include: - Faster dictionaries. - Full parallelisation of build phase. - Smaller, standardised input/output files. Faster to save/load. -- Reduced memory footprint with read filtering. +- Reduced memory footprint and increased speed with read filtering. And other improvements: @@ -73,12 +87,14 @@ footprint than the original. ## Planned features -None at present +- Sparse data structure which will reduce space and make parallelisation more efficient. [Issue #47](https://github.com/bacpop/ska.rust/issues/47). +- 'fastcall' mode. [Issue #52](https://github.com/bacpop/ska.rust/issues/52). ## Feature ideas (not definitely planned) - Add support for ambiguity in VCF output (`ska map`). [Issue #5](https://github.com/bacpop/ska.rust/issues/5). - Non-serial loading of .skf files (for when they are very large). [Issue #22](https://github.com/bacpop/ska.rust/issues/22). +- Alternative mixture models for read error correction. [Issue #50](https://github.com/bacpop/ska.rust/issues/50). ## Things you can no longer do diff --git a/src/lib.rs b/src/lib.rs index 33a97ec..a8756ae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,6 +33,9 @@ //! possibly introducing a SNP across samples. This version uses an ambiguous middle //! base (W for A/T; S for C/G) to represent this case.* //! +//! Tutorials: +//! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/). +//! //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage). //! //! # Usage From deb514ecbe4c1cc131d2407a6bddfbd769737c90 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 16 Oct 2023 17:40:50 +0100 Subject: [PATCH 7/7] Clean up some of the filter logging messages --- src/generic_modes.rs | 2 +- src/merge_ska_array.rs | 5 ++++- tests/align.rs | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/generic_modes.rs b/src/generic_modes.rs index db21ec3..7893e59 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -241,7 +241,7 @@ pub fn weed UInt<'a>>( } let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize; - if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask { + if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask || ignore_const_gaps { log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); let update_kmers = true; ska_array.filter( diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index e6e697e..9bca332 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -154,7 +154,6 @@ where /// Load a split k-mer array from a `.skf` file. pub fn load(filename: &str) -> Result> { - log::info!("Loading skf file"); let ska_file = BufReader::new(File::open(filename)?); let decompress_reader = snap::read::FrameDecoder::new(ska_file); let ska_obj: Self = ciborium::de::from_reader(decompress_reader)?; @@ -251,6 +250,10 @@ where ignore_const_gaps: bool, update_kmers: bool, ) -> i32 { + if ignore_const_gaps && matches!(filter, FilterType::NoAmbig | FilterType::NoFilter) { + log::warn!("--no-gap-only-sites can only be applied when filtering constant bases") + } + let total = self.names.len(); let mut filtered_variants = Array2::zeros((0, total)); let mut filtered_counts = Vec::new(); diff --git a/tests/align.rs b/tests/align.rs index bd22fe6..a6a93be 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -165,6 +165,7 @@ fn filters() { .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) .arg("--filter") .arg("no-filter") + .arg("--no-gap-only-sites") // adding with no filter produces a warning .arg("-v") .output() .unwrap()