Skip to content

Commit

Permalink
Better default filtering options for ska distance
Browse files Browse the repository at this point in the history
Remove ambig sites by default
Link to filtering tutorial
Slightly expand description of ska distance in the docs
  • Loading branch information
johnlees committed Sep 25, 2024
1 parent b9c8909 commit 4fde581
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.3.10"
version = "0.3.11"
authors = [
"John Lees <[email protected]>",
"Simon Harris <[email protected]>",
Expand Down
2 changes: 1 addition & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ pub enum Commands {

/// Allow ambiguous bases (sites with ambiguous bases are filtered out by default)
#[arg(long, default_value_t = false)]
filter_ambiguous: bool,
allow_ambiguous: bool,

/// Number of CPU threads
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
Expand Down
2 changes: 2 additions & 0 deletions src/generic_modes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
);
if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
if filt_ambig {
let filter_ambig_as_missing = true;
let mask_ambig = true;
apply_filters(
ska_array,
min_freq,
Expand Down
17 changes: 11 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
//!
//! Tutorials:
//! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
//! - [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/).
//!
//! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
//!
Expand Down Expand Up @@ -208,9 +209,12 @@
//! ska distance -o distances.txt seqs.skf
//! ```
//!
//! Ignore ambiguous bases by adding `--filter-ambiguous` flag, and `--min-freq` to
//! ignore k-mers only found in some samples. Multiple threads
//! can be used, but this will only be effective with large numbers of samples.
//! By default, sites with ambiguous bases are ignored. Add the `--allow-ambiguous` flag to
//! include them, and use `--min-freq` to ignore k-mers only found in some samples. Note that
//! including ambiguous bases may lead to overestimated distances due to repeat k-mers.
//! For finer control over filtering, first run `ska weed` on the input .skf.
//!
//! Multiple threads can be used, but this will only be effective with large numbers of samples.
//!
//! The companion script in `scripts/cluster_dists.py` (requires `networkx`) can
//! be used to make single linkage clusters from these distances at given thresholds,
Expand Down Expand Up @@ -619,18 +623,19 @@ pub fn main() {
skf_file,
output,
min_freq,
filter_ambiguous,
allow_ambiguous,
threads,
} => {
check_threads(*threads);
let filter_ambiguous = !*allow_ambiguous;
if let Ok(mut ska_array) = MergeSkaArray::<u64>::load(skf_file) {
// In debug mode (cannot be set from CLI), give details
log::debug!("{ska_array}");
distance(
&mut ska_array,
output,
*min_freq,
*filter_ambiguous,
filter_ambiguous,
*threads,
);
} else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
Expand All @@ -640,7 +645,7 @@ pub fn main() {
&mut ska_array,
output,
*min_freq,
*filter_ambiguous,
filter_ambiguous,
*threads,
);
} else {
Expand Down

0 comments on commit 4fde581

Please sign in to comment.