Skip to content

Commit

Permalink
Moved rest of skalo code into ska, using Indel command runs the code …
Browse files Browse the repository at this point in the history
…that was in main.rs of skalo
  • Loading branch information
jhellewell14 committed Jan 14, 2025
1 parent 01f74c0 commit 5ad7482
Show file tree
Hide file tree
Showing 4 changed files with 968 additions and 4 deletions.
132 changes: 131 additions & 1 deletion src/io_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ use regex::Regex;
use super::QualOpts;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::{build_and_merge, InputFastx};
use crate::ska_dict::bit_encoding::UInt;
use crate::ska_dict::bit_encoding::{decode_kmer, UInt};
use crate::skalo_utils::{encode_kmer, rev_compl};
use hashbrown::HashMap;

use crate::cli::{
DEFAULT_KMER, DEFAULT_MINCOUNT, DEFAULT_MINQUAL, DEFAULT_PROPORTION_READS, DEFAULT_QUALFILTER,
Expand Down Expand Up @@ -149,3 +151,131 @@ pub fn any_fastq(files: &[InputFastx]) -> bool {
}
fastq
}

/// Reads .skf files for skalo
pub fn read_skalo_input(
input_file: &str,
) -> (
usize,
Vec<String>,
HashMap<u128, HashMap<u128, u32>>,
HashMap<u32, String>,
) {

Check warning on line 163 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

very complex type used. Consider factoring parts into `type` definitions

warning: very complex type used. Consider factoring parts into `type` definitions --> src/io_utils.rs:158:6 | 158 | ) -> ( | ______^ 159 | | usize, 160 | | Vec<String>, 161 | | HashMap<u128, HashMap<u128, u32>>, 162 | | HashMap<u32, String>, 163 | | ) { | |_^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#type_complexity = note: `#[warn(clippy::type_complexity)]` on by default
println!(" # read file '{}'", input_file);

// read the skf file and load split-kmers (ska_array), kmer length and sample names
let ska_array = load_array::<u128>(&[input_file.to_string()], 1)
.expect("\nerror: coud not read the skf file\n\n");
let sample_names = ska_array.names().to_vec();
let len_kmer = ska_array.kmer_len();
let mask = (1 << (len_kmer * 2)) - 1;

println!(" . {}-mers", len_kmer);
println!(" . {} samples", sample_names.len());

println!(" # build colored de Bruijn graph");

// build De Bruijn graph
let degenerate_code: HashMap<u8, Vec<char>> = [
(b'A', vec!['A']),
(b'T', vec!['T']),
(b'G', vec!['G']),
(b'C', vec!['C']),
(b'M', vec!['A', 'C']),
(b'S', vec!['C', 'G']),
(b'W', vec!['A', 'T']),
(b'R', vec!['A', 'G']),
(b'Y', vec!['C', 'T']),
(b'K', vec!['G', 'T']),
(b'B', vec!['C', 'G', 'T']),
(b'D', vec!['A', 'G', 'T']),
(b'H', vec!['A', 'C', 'T']),
(b'V', vec!['A', 'C', 'G']),
(b'N', vec!['A', 'C', 'G', 'T']),
]
.iter()
.cloned()
.collect();

let mut all_kmers: HashMap<u128, HashMap<u128, u32>> = HashMap::new();
let mut all_indexes: HashMap<String, u32> = HashMap::new();
let mut index_map: HashMap<u32, String> = HashMap::new();

let kmer_iter = ska_array.iter();

for (int_kmer, int_middle_base_vec) in kmer_iter {
// select non-ubiquitous split k-mers (ie, absent in at least one sample)
if int_middle_base_vec.contains(&45) {
let (kmer_left, kmer_right) = decode_kmer(len_kmer, int_kmer, mask, mask);

// combine samples by middle-base by using degenerate code
let mut middle_2_samples: HashMap<char, Vec<String>> = HashMap::new();
for (i, nucl) in int_middle_base_vec.iter().enumerate() {
// if middle-base not absent
if *nucl != 45 {
for &new_nucl in &degenerate_code[nucl] {
middle_2_samples
.entry(new_nucl)
.or_default()
.push(i.to_string());
}
}
}

for (nucl, l_indexes) in middle_2_samples.iter() {
let str_sample_id = l_indexes.join("|");
let value_index: u32;

// get index to save species list (as String)
if let Some(index) = all_indexes.get(&str_sample_id) {
// Key exists, use the existing index
value_index = *index;
} else {
// Key does not exist, insert a new entry with the next index
value_index = all_indexes.len() as u32;
all_indexes.insert(str_sample_id.to_string(), value_index.clone());

Check warning on line 236 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

using `clone` on type `u32` which implements the `Copy` trait

warning: using `clone` on type `u32` which implements the `Copy` trait --> src/io_utils.rs:236:67 | 236 | all_indexes.insert(str_sample_id.to_string(), value_index.clone()); | ^^^^^^^^^^^^^^^^^^^ help: try removing the `clone` call: `value_index` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#clone_on_copy = note: `#[warn(clippy::clone_on_copy)]` on by default
index_map.insert(value_index.clone(), str_sample_id.to_string());

Check warning on line 237 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

using `clone` on type `u32` which implements the `Copy` trait

warning: using `clone` on type `u32` which implements the `Copy` trait --> src/io_utils.rs:237:38 | 237 | index_map.insert(value_index.clone(), str_sample_id.to_string()); | ^^^^^^^^^^^^^^^^^^^ help: try removing the `clone` call: `value_index` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#clone_on_copy
}

// save kmers in coloured de Bruijn graph
let full_kmer = format!("{}{}{}", kmer_left, nucl, kmer_right);

let encoded_kmer_1 = encode_kmer(&full_kmer[..full_kmer.len() - 1].to_string());

Check warning on line 243 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary use of `to_string`

warning: unnecessary use of `to_string` --> src/io_utils.rs:243:50 | 243 | let encoded_kmer_1 = encode_kmer(&full_kmer[..full_kmer.len() - 1].to_string()); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: use: `&full_kmer[..full_kmer.len() - 1]` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_to_owned = note: `#[warn(clippy::unnecessary_to_owned)]` on by default
let encoded_kmer_2 = encode_kmer(&full_kmer[1..].to_string());

Check warning on line 244 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary use of `to_string`

warning: unnecessary use of `to_string` --> src/io_utils.rs:244:50 | 244 | let encoded_kmer_2 = encode_kmer(&full_kmer[1..].to_string()); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: use: `&full_kmer[1..]` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_to_owned

// uncomment to print network
// println!("{} {} ", &full_kmer[..full_kmer.len() - 1].to_string(), &full_kmer[1..].to_string());

all_kmers
.entry(encoded_kmer_1)
.or_default()
.insert(encoded_kmer_2, value_index.clone());

Check warning on line 252 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

using `clone` on type `u32` which implements the `Copy` trait

warning: using `clone` on type `u32` which implements the `Copy` trait --> src/io_utils.rs:252:45 | 252 | .insert(encoded_kmer_2, value_index.clone()); | ^^^^^^^^^^^^^^^^^^^ help: try removing the `clone` call: `value_index` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#clone_on_copy

let rc_kmer = rev_compl(&full_kmer);
let rc_encoded_kmer_1 = encode_kmer(&rc_kmer[..full_kmer.len() - 1].to_string());

Check warning on line 255 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary use of `to_string`

warning: unnecessary use of `to_string` --> src/io_utils.rs:255:53 | 255 | let rc_encoded_kmer_1 = encode_kmer(&rc_kmer[..full_kmer.len() - 1].to_string()); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: use: `&rc_kmer[..full_kmer.len() - 1]` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_to_owned
let rc_encoded_kmer_2 = encode_kmer(&rc_kmer[1..].to_string());

Check warning on line 256 in src/io_utils.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary use of `to_string`

warning: unnecessary use of `to_string` --> src/io_utils.rs:256:53 | 256 | let rc_encoded_kmer_2 = encode_kmer(&rc_kmer[1..].to_string()); | ^^^^^^^^^^^^^^^^^^^^^^^^^ help: use: `&rc_kmer[1..]` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_to_owned

// uncomment to print network
// println!("{} {} ", &rc_kmer[..full_kmer.len() - 1].to_string(), &rc_kmer[1..].to_string());

all_kmers
.entry(rc_encoded_kmer_1)
.or_default()
.insert(rc_encoded_kmer_2, value_index);
}
}
}

// sanity check: test if the number of sample combinations is too high to be stored as u64 integers
let max_u32_value: usize = u32::MAX as usize;
if all_indexes.len() > max_u32_value {
eprintln!(
"\nError: the number of sample combinations is too high to be stored as u64 integers\n"
);
std::process::exit(1);
}

println!(" . {} nodes (includes rev-compl)", all_kmers.len());
println!(" . {} sample combinations", all_indexes.len());
(len_kmer, sample_names, all_kmers, index_map)
}
27 changes: 24 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,11 @@ use crate::io_utils::*;
pub mod coverage;
use crate::coverage::CoverageHistogram;

// pub mod skalo;
use crate::skalo::read_input_file;
pub mod skalo_utils;

Check warning on line 434 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

missing documentation for a module

warning: missing documentation for a module --> src/lib.rs:434:1 | 434 | pub mod skalo_utils; | ^^^^^^^^^^^^^^^^^^^ | note: the lint level is defined here --> src/lib.rs:406:9 | 406 | #![warn(missing_docs)] | ^^^^^^^^^^^^
use crate::skalo_utils::{filter_output_sequences, identify_good_kmers};

pub mod skalo_read_graph;

Check warning on line 437 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

missing documentation for a module

warning: missing documentation for a module --> src/lib.rs:437:1 | 437 | pub mod skalo_read_graph; | ^^^^^^^^^^^^^^^^^^^^^^^^
use crate::skalo_read_graph::build_sequences;

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
Expand Down Expand Up @@ -790,7 +793,25 @@ pub fn main() {
output_name,
} => {
// read input file
// let (len_kmer, l_sample_names, all_kmers, index_map) = read_input_file(skf_file);
let (len_kmer, l_sample_names, all_kmers, index_map) = read_skalo_input(skf_file);

// identify 'good' kmers in De Bruijn graph
let (start_kmers, end_kmers) = identify_good_kmers(len_kmer, &all_kmers, &index_map);

// build sequences
let sequences =
build_sequences(len_kmer, &all_kmers, &start_kmers, &end_kmers, &index_map);

// filter and output sequences
filter_output_sequences(
sequences,
len_kmer,
l_sample_names.clone(),
*n_poly,
*max_missing,
output_name,
skf_file,
);
}
}
let end = Instant::now();
Expand Down
Loading

0 comments on commit 5ad7482

Please sign in to comment.