Skip to content

Commit

Permalink
Merge pull request #24 from bacpop/missed-assert
Browse files Browse the repository at this point in the history
Add 'reverse' weed to keep only selected split k-mers
  • Loading branch information
johnlees authored Apr 6, 2023
2 parents 893a674 + 935ebfb commit d9beb84
Show file tree
Hide file tree
Showing 12 changed files with 86 additions and 14 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.2.3"
version = "0.2.4"
authors = [
"John Lees <[email protected]>",
"Simon Harris <[email protected]>",
Expand Down
4 changes: 4 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ pub enum Commands {
/// A FASTA file containing sequences to remove
weed_file: Option<String>,

/// Remove k-mers not in the weed_file
#[arg(long, default_value_t = false)]
reverse: bool,

/// Minimum fraction of samples a k-mer has to appear in
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
min_freq: f64,
Expand Down
9 changes: 7 additions & 2 deletions src/generic_modes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ pub fn delete<IntT: for<'a> UInt<'a>>(
pub fn weed<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
weed_file: &Option<String>,
reverse: bool,
min_freq: f64,
filter: &FilterType,
out_file: &str,
Expand All @@ -126,8 +127,12 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
);
let ska_weed = RefSka::new(ska_array.kmer_len(), weed_fasta, ska_array.rc());

log::info!("Removing weed k-mers");
ska_array.weed(&ska_weed);
if !reverse {
log::info!("Removing weed k-mers");
} else {
log::info!("Keeping only weed k-mers");
}
ska_array.weed(&ska_weed, reverse);
}

let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
Expand Down
19 changes: 17 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -498,14 +498,29 @@ pub fn main() {
Commands::Weed {
skf_file,
weed_file,
reverse,
min_freq,
filter,
} => {
log::info!("Loading skf file");
if let Ok(mut ska_array) = MergeSkaArray::<u64>::load(skf_file) {
weed(&mut ska_array, weed_file, *min_freq, filter, skf_file);
weed(
&mut ska_array,
weed_file,
*reverse,
*min_freq,
filter,
skf_file,
);
} else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
weed(&mut ska_array, weed_file, *min_freq, filter, skf_file);
weed(
&mut ska_array,
weed_file,
*reverse,
*min_freq,
filter,
skf_file,
);
} else {
panic!("Could not read input file: {skf_file}");
}
Expand Down
24 changes: 20 additions & 4 deletions src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ use crate::cli::FilterType;
///
/// // Delete k-mers
/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc());
/// ska_array.weed(&ska_weed);
/// let reverse = false;
/// ska_array.weed(&ska_weed, reverse);
/// ```
#[derive(Serialize, Deserialize)]
pub struct MergeSkaArray<IntT> {
Expand Down Expand Up @@ -298,7 +299,13 @@ where
/// may be a multi-FASTA) generated with [`RefSka::new()`]
///
/// Used with `ska weed`.
pub fn weed(&mut self, weed_ref: &RefSka<IntT>) {
///
/// # Arguments
///
/// - `weed_ref` -- a processed reference whose split k-mers are removed (or, when `reverse` is set, are the only ones kept).
/// - `reverse` -- if true, keep only the k-mers found in `weed_ref` and remove all others.
///
pub fn weed(&mut self, weed_ref: &RefSka<IntT>, reverse: bool) {
let weed_kmers: HashSet<IntT> = HashSet::from_iter(weed_ref.kmer_iter());

let mut removed = 0;
Expand All @@ -312,7 +319,8 @@ where
.zip(self.variant_count.iter())
{
let ((kmer, var_row), count) = kmer_it;
if !weed_kmers.contains(kmer) {
let kmer_found = weed_kmers.contains(kmer);
if (!reverse && !kmer_found) || (reverse && kmer_found) {
new_sk.push(*kmer);
new_variants.push_row(var_row).unwrap();
new_counts.push(*count);
Expand All @@ -323,7 +331,15 @@ where
self.split_kmers = new_sk;
self.variants = new_variants;
self.variant_count = new_counts;
log::info!("Removed {} of {} weed k-mers", removed, weed_ref.ksize());
if !reverse {
log::info!("Removed {} of {} weed k-mers", removed, weed_ref.ksize());
} else {
log::info!(
"Kept {} k-mers using {} reverse weed k-mers",
self.split_kmers.len(),
weed_ref.ksize()
);
}
}

/// Write the middle bases as an alignment (FASTA).
Expand Down
2 changes: 1 addition & 1 deletion tests/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::common::*;
use hashbrown::HashSet;

#[cfg(test)]
use pretty_assertions::{assert_eq};
use pretty_assertions::assert_eq;

// NB: to view output, uncomment the current_dir lines

Expand Down
2 changes: 1 addition & 1 deletion tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use predicates::prelude::*;
use hashbrown::HashSet;

#[cfg(test)]
use pretty_assertions::{assert_eq};
use pretty_assertions::assert_eq;

// Creates correct path for input/output files
static FILE_IN: &'static str = "tests/test_files_in";
Expand Down
2 changes: 1 addition & 1 deletion tests/fasta_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub mod common;
use crate::common::{var_hash, TestDir, TestSetup};

#[cfg(test)]
use pretty_assertions::{assert_eq};
use pretty_assertions::assert_eq;

#[test]
fn align_n() {
Expand Down
2 changes: 1 addition & 1 deletion tests/fastq_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use snapbox::cmd::{cargo_bin, Command};
use hashbrown::HashSet;

#[cfg(test)]
use pretty_assertions::{assert_eq};
use pretty_assertions::assert_eq;

pub mod common;
use crate::common::*;
Expand Down
3 changes: 3 additions & 0 deletions tests/map.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use snapbox::cmd::{cargo_bin, Command};

#[cfg(test)]
use pretty_assertions::assert_eq;

pub mod common;
use crate::common::*;

Expand Down
27 changes: 26 additions & 1 deletion tests/skf_ops.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use snapbox::cmd::{cargo_bin, Command};

#[cfg(test)]
use pretty_assertions::{assert_eq};
use pretty_assertions::assert_eq;

pub mod common;
use crate::common::{TestDir, TestSetup};
Expand Down Expand Up @@ -202,6 +202,31 @@ fn weed() {
.assert()
.stdout_matches_path(sandbox.file_string("weed_nk.stdout", TestDir::Correct));

// Keep rather than weed
Command::new("cp")
.current_dir(sandbox.get_wd())
.arg(sandbox.file_string("merge.skf", TestDir::Input))
.arg("merge.skf")
.assert()
.success();

Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("weed")
.arg("merge.skf")
.arg(sandbox.file_string("weed.fa", TestDir::Input))
.arg("--reverse")
.arg("-v")
.assert()
.success();

Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("align")
.arg("merge.skf")
.assert()
.stdout_eq_path(sandbox.file_string("weed_align_reverse.stdout", TestDir::Correct));

// With longer k-mers
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
Expand Down
4 changes: 4 additions & 0 deletions tests/test_results_correct/weed_align_reverse.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>test_1
A
>test_2
T

0 comments on commit d9beb84

Please sign in to comment.