Merge pull request #61 from bacpop/0.3.4-candidate

0.3.4 candidate -- final feature requests
bacpop · Oct 17, 2023 · 798c290 · 798c290
2 parents f240fc5 + deb514e
commit 798c290
Show file tree

Hide file tree

Showing 7 changed files with 297 additions and 56 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,13 +1,15 @@
 [package]
 name = "ska"
-version = "0.3.3"
+version = "0.3.4"
 authors = [
     "John Lees <[email protected]>",
     "Simon Harris <[email protected]>",
-    "Johanna von Wachsmann <[email protected]>",
+    "Johanna von Wachsmann <[email protected]>",
     "Tommi Maklin <[email protected]>",
     "Joel Hellewell <[email protected]>",
-    "Timothy Russell <[email protected]>"
+    "Timothy Russell <[email protected]>",
+    "Romain Derelle <[email protected]>",
+    "Nicholas Croucher <[email protected]>"
 ]
 edition = "2021"
 description = "Split k-mer analysis"

diff --git a/README.md b/README.md
@@ -9,6 +9,29 @@
 [![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/bacpop/ska.rust)](https://github.com/bacpop/ska.rust/releases)
 <!-- badges: end -->
 
+## Description
+
+This is a reimplementation of the [SKA package](https://github.com/simonrharris/SKA)
+in the rust language, by Johanna von Wachsmann, Simon Harris and John Lees. We are also grateful to have
+received user contributions from:
+
+- Romain Derelle
+- Tommi Maklin
+- Joel Hellewell
+- Timothy Russell
+- Nicholas Croucher
+- Dan Lu
+
+Split k-mer analysis (version 2) uses exact matching of split k-mer sequences to align closely related sequences, typically small haploid genomes such as bacteria and viruses.
+
+SKA can only align SNPs further than the k-mer length apart, and does not use a gap penalty approach or give alignment scores. But the advantages are speed and flexibility, particularly the ability to run on a reference-free manner (i.e. including accessory genome variation) on both assemblies and reads.
+
+## Documentation
+
+Can be found at https://docs.rs/ska. We also have some tutorials available:
+
+- [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
+
 ## Installation
 
 Choose from:
@@ -35,24 +58,15 @@ xattr -d "com.apple.quarantine" ./ska
 1. Clone the repository with `git clone`.
 2. Run `cargo install --path .` or `RUSTFLAGS="-C target-cpu=native" cargo install --path .` to optimise for your machine.
 
-## Documentation
-
-Can be found at https://docs.rs/ska.
-
-## Description
-
-This is a reimplementation of Simon Harris' [SKA package](https://github.com/simonrharris/SKA)
-in the rust language, by Johanna von Wachsmann, Simon Harris and John Lees.
-
-> SKA (Split Kmer Analysis) is a toolkit for prokaryotic (and any other small, haploid) DNA sequence analysis using split kmers. A split kmer is a pair of kmers in a DNA sequence that are separated by a single base. Split kmers allow rapid comparison and alignment of small genomes, and is particulalry suited for surveillance or outbreak investigation. SKA can produce split kmer files from fasta format assemblies or directly from fastq format read sequences, cluster them, align them with or without a reference sequence and provide various comparison and summary statistics. Currently all testing has been carried out on high-quality Illumina read data, so results for other platforms may vary.
+## Differences from SKA1
 
 Optimisations include:
 
 - Integer DNA encoding, optimised parsing from FASTA/FASTQ.
 - Faster dictionaries.
 - Full parallelisation of build phase.
 - Smaller, standardised input/output files. Faster to save/load.
-- Reduced memory footprint with read filtering.
+- Reduced memory footprint and increased speed with read filtering.
 
 And other improvements:
 
@@ -73,12 +87,14 @@ footprint than the original.
 
 ## Planned features
 
-None at present
+- Sparse data structure which will reduce space and make parallelisation more efficient. [Issue #47](https://github.com/bacpop/ska.rust/issues/47).
+- 'fastcall' mode. [Issue #52](https://github.com/bacpop/ska.rust/issues/52).
 
 ## Feature ideas (not definitely planned)
 
 - Add support for ambiguity in VCF output (`ska map`). [Issue #5](https://github.com/bacpop/ska.rust/issues/5).
 - Non-serial loading of .skf files (for when they are very large). [Issue #22](https://github.com/bacpop/ska.rust/issues/22).
+- Alternative mixture models for read error correction. [Issue #50](https://github.com/bacpop/ska.rust/issues/50).
 
 ## Things you can no longer do
 

diff --git a/src/cli.rs b/src/cli.rs
@@ -13,6 +13,8 @@ pub const DEFAULT_STRAND: bool = false;
 pub const DEFAULT_REPEATMASK: bool = false;
 /// Default ambiguous masking behaviour
 pub const DEFAULT_AMBIGMASK: bool = false;
+/// Default gap ignoring behaviour (at constant sites)
+pub const DEFAULT_CONSTGAPS: bool = false;
 /// Default minimum k-mer count for FASTQ files
 pub const DEFAULT_MINCOUNT: u16 = 10;
 /// Default minimum base quality (PHRED score) for FASTQ files
@@ -151,7 +153,7 @@ pub enum Commands {
         min_qual: u8,
 
         /// Quality filtering criteria (with reads)
-        #[arg(long, value_enum, default_value_t = QualFilter::Middle)]
+        #[arg(long, value_enum, default_value_t = QualFilter::Strict)]
         qual_filter: QualFilter,
 
         /// Number of CPU threads
@@ -180,6 +182,10 @@ pub enum Commands {
         #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
         ambig_mask: bool,
 
+        /// Ignore gaps '-' in constant sites (for low coverage samples)
+        #[arg(long, default_value_t = DEFAULT_CONSTGAPS)]
+        no_gap_only_sites: bool,
+
         /// Number of CPU threads
         #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
         threads: usize,
@@ -287,6 +293,10 @@ pub enum Commands {
         /// Mask any ambiguous bases in the alignment with 'N'
         #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
         ambig_mask: bool,
+
+        /// Ignore gaps '-' in constant sites
+        #[arg(long, default_value_t = DEFAULT_CONSTGAPS)]
+        no_gap_only_sites: bool,
     },
     /// Get the number of k-mers in a split k-mer file, and other information
     Nk {

diff --git a/src/generic_modes.rs b/src/generic_modes.rs
@@ -20,13 +20,14 @@ pub fn align<IntT: for<'a> UInt<'a>>(
     output: &Option<String>,
     filter: &FilterType,
     mask_ambig: bool,
+    ignore_const_gaps: bool,
     min_freq: f64,
 ) {
     // In debug mode (cannot be set from CLI, give details)
     log::debug!("{ska_array}");
 
     // Apply filters
-    apply_filters(ska_array, min_freq, filter, mask_ambig);
+    apply_filters(ska_array, min_freq, filter, mask_ambig, ignore_const_gaps);
 
     // Write out to file/stdout
     log::info!("Writing alignment");
@@ -104,11 +105,18 @@ pub fn apply_filters<IntT: for<'a> UInt<'a>>(
     min_freq: f64,
     filter: &FilterType,
     ambig_mask: bool,
+    ignore_const_gaps: bool,
 ) -> i32 {
     let update_kmers = false;
     let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize;
-    log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
-    ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers)
+    log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
+    ska_array.filter(
+        filter_threshold,
+        filter,
+        ambig_mask,
+        ignore_const_gaps,
+        update_kmers,
+    )
 }
 
 /// Calculate distances between samples
@@ -125,12 +133,31 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
     log::debug!("{ska_array}");
 
     let mask_ambig = false;
-    let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst, mask_ambig);
+    let ignore_const_gaps = false;
+    let constant = apply_filters(
+        ska_array,
+        min_freq,
+        &FilterType::NoConst,
+        mask_ambig,
+        ignore_const_gaps,
+    );
     if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
         if filt_ambig {
-            apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst, mask_ambig);
+            apply_filters(
+                ska_array,
+                min_freq,
+                &FilterType::NoAmbigOrConst,
+                mask_ambig,
+                ignore_const_gaps,
+            );
         } else {
-            apply_filters(ska_array, min_freq, &FilterType::NoFilter, mask_ambig);
+            apply_filters(
+                ska_array,
+                min_freq,
+                &FilterType::NoFilter,
+                mask_ambig,
+                ignore_const_gaps,
+            );
         }
     }
 
@@ -178,13 +205,15 @@ pub fn delete<IntT: for<'a> UInt<'a>>(
 }
 
 /// Remove k-mers, and optionally apply filters to an array
+#[allow(clippy::too_many_arguments)]
 pub fn weed<IntT: for<'a> UInt<'a>>(
     ska_array: &mut MergeSkaArray<IntT>,
     weed_file: &Option<String>,
     reverse: bool,
     min_freq: f64,
     filter: &FilterType,
     ambig_mask: bool,
+    ignore_const_gaps: bool,
     out_file: &str,
 ) {
     if let Some(weed_fasta) = weed_file {
@@ -212,10 +241,16 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
     }
 
     let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
-    if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask {
-        log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
+    if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask || ignore_const_gaps {
+        log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
         let update_kmers = true;
-        ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers);
+        ska_array.filter(
+            filter_threshold,
+            filter,
+            ambig_mask,
+            ignore_const_gaps,
+            update_kmers,
+        );
     }
 
     log::info!("Saving modified skf file");

diff --git a/src/lib.rs b/src/lib.rs
@@ -33,6 +33,9 @@
 //! possibly introducing a SNP across samples. This version uses an ambiguous middle
 //! base (W for A/T; S for C/G) to represent this case.*
 //!
+//! Tutorials:
+//! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
+//!
 //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
 //!
 //! # Usage
@@ -367,8 +370,9 @@
 //! let min_count = 2;
 //! let update_kmers = false;
 //! let filter = FilterType::NoConst;
+//! let ignore_const_gaps = false;
 //! let ambig_mask = false;
-//! ska_array.filter(min_count, &filter, update_kmers, ambig_mask);
+//! ska_array.filter(min_count, &filter, ambig_mask, ignore_const_gaps, update_kmers);
 //!
 //! // Write out to stdout
 //! let mut out_stream = set_ostream(&None);
@@ -512,17 +516,32 @@ pub fn main() {
             min_freq,
             filter,
             ambig_mask,
+            no_gap_only_sites,
             threads,
         } => {
             check_threads(*threads);
             if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
                 // In debug mode (cannot be set from CLI, give details)
                 log::debug!("{ska_array}");
-                align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
+                align(
+                    &mut ska_array,
+                    output,
+                    filter,
+                    *ambig_mask,
+                    *no_gap_only_sites,
+                    *min_freq,
+                );
             } else if let Ok(mut ska_array) = load_array::<u128>(input, *threads) {
                 // In debug mode (cannot be set from CLI, give details)
                 log::debug!("{ska_array}");
-                align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
+                align(
+                    &mut ska_array,
+                    output,
+                    filter,
+                    *ambig_mask,
+                    *no_gap_only_sites,
+                    *min_freq,
+                );
             } else {
                 panic!("Could not read input file(s): {input:?}");
             }
@@ -640,6 +659,7 @@ pub fn main() {
             reverse,
             min_freq,
             ambig_mask,
+            no_gap_only_sites,
             filter,
         } => {
             log::info!("Loading skf file");
@@ -651,6 +671,7 @@ pub fn main() {
                     *min_freq,
                     filter,
                     *ambig_mask,
+                    *no_gap_only_sites,
                     if output.is_none() {
                         skf_file
                     } else {
@@ -665,6 +686,7 @@ pub fn main() {
                     *min_freq,
                     filter,
                     *ambig_mask,
+                    *no_gap_only_sites,
                     if output.is_none() {
                         skf_file
                     } else {