Commit 087b2e7

Made skalo command commensurable with the file input logic that runs build if you provide a list of fasta files

jhellewell14 committed Jan 15, 2025
1 parent 5ad7482 commit 087b2e7
Showing 6 changed files with 716 additions and 705 deletions.
9 changes: 6 additions & 3 deletions src/cli.rs
@@ -360,9 +360,8 @@ pub enum Commands {
},
/// Run the skalo graph-based algorithm to infer indels from a provided .skf file
Indel {
/// input SKA2 file [required]
#[arg(short = 'i', long)]
skf_file: String,
/// A .skf file, or list of .fasta files
input: Vec<String>,

/// ignore indels occurring after homopolymers of size n+
#[arg(short = 'n', long)]
@@ -375,6 +374,10 @@
/// name of output files
#[arg(short = 'o', long, default_value_t = ("skalo").to_string())]
output_name: String,

/// Number of CPU threads
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,
},
}

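For orientation, here is a minimal, self-contained sketch of the new argument shape: a positional `Vec<String>` that accepts either a single `.skf` file or a list of `.fasta` files, plus a `--threads` option. It assumes clap 4 with the `derive` feature; the `Cli`/`Cmd` names and the `println!` are illustrative only, and the real subcommand also carries the `-n`, `--max-missing` and `-o` options shown above.

```rust
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Cmd,
}

#[derive(Subcommand)]
enum Cmd {
    /// Infer indels from a .skf file or a list of .fasta files
    Indel {
        /// A .skf file, or list of .fasta files
        input: Vec<String>,

        /// Number of CPU threads
        #[arg(long, default_value_t = 1)]
        threads: usize,
    },
}

fn main() {
    // e.g. `prog indel sample1.fasta sample2.fasta --threads 4`
    //  or  `prog indel all_samples.skf`
    let cli = Cli::parse();
    match cli.command {
        Cmd::Indel { input, threads } => {
            println!("{} input path(s), {} thread(s)", input.len(), threads);
        }
    }
}
```

Because the positional argument is a `Vec`, both invocation styles in the comment parse without any extra flags, which is what lets the same subcommand drive either the `.skf` or the fasta-list path.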
129 changes: 0 additions & 129 deletions src/io_utils.rs
@@ -14,7 +14,6 @@ use super::QualOpts;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::{build_and_merge, InputFastx};
use crate::ska_dict::bit_encoding::{decode_kmer, UInt};

use crate::skalo_utils::{encode_kmer, rev_compl};
use hashbrown::HashMap;


use crate::cli::{
@@ -151,131 +150,3 @@ pub fn any_fastq(files: &[InputFastx]) -> bool {
}
fastq
}

/// Reads .skf files for skalo
pub fn read_skalo_input(
input_file: &str,
) -> (
usize,
Vec<String>,
HashMap<u128, HashMap<u128, u32>>,
HashMap<u32, String>,
) {
println!(" # read file '{}'", input_file);

// read the skf file and load split-kmers (ska_array), kmer length and sample names
let ska_array = load_array::<u128>(&[input_file.to_string()], 1)
.expect("\nerror: coud not read the skf file\n\n");
let sample_names = ska_array.names().to_vec();
let len_kmer = ska_array.kmer_len();
let mask = (1 << (len_kmer * 2)) - 1;

println!(" . {}-mers", len_kmer);
println!(" . {} samples", sample_names.len());

println!(" # build colored de Bruijn graph");

// build De Bruijn graph
let degenerate_code: HashMap<u8, Vec<char>> = [
(b'A', vec!['A']),
(b'T', vec!['T']),
(b'G', vec!['G']),
(b'C', vec!['C']),
(b'M', vec!['A', 'C']),
(b'S', vec!['C', 'G']),
(b'W', vec!['A', 'T']),
(b'R', vec!['A', 'G']),
(b'Y', vec!['C', 'T']),
(b'K', vec!['G', 'T']),
(b'B', vec!['C', 'G', 'T']),
(b'D', vec!['A', 'G', 'T']),
(b'H', vec!['A', 'C', 'T']),
(b'V', vec!['A', 'C', 'G']),
(b'N', vec!['A', 'C', 'G', 'T']),
]
.iter()
.cloned()
.collect();

let mut all_kmers: HashMap<u128, HashMap<u128, u32>> = HashMap::new();
let mut all_indexes: HashMap<String, u32> = HashMap::new();
let mut index_map: HashMap<u32, String> = HashMap::new();

let kmer_iter = ska_array.iter();

for (int_kmer, int_middle_base_vec) in kmer_iter {
// select non-ubiquitous split k-mers (ie, absent in at least one sample)
if int_middle_base_vec.contains(&45) {
let (kmer_left, kmer_right) = decode_kmer(len_kmer, int_kmer, mask, mask);

// combine samples by middle-base by using degenerate code
let mut middle_2_samples: HashMap<char, Vec<String>> = HashMap::new();
for (i, nucl) in int_middle_base_vec.iter().enumerate() {
// if middle-base not absent
if *nucl != 45 {
for &new_nucl in &degenerate_code[nucl] {
middle_2_samples
.entry(new_nucl)
.or_default()
.push(i.to_string());
}
}
}

for (nucl, l_indexes) in middle_2_samples.iter() {
let str_sample_id = l_indexes.join("|");
let value_index: u32;

// get index to save species list (as String)
if let Some(index) = all_indexes.get(&str_sample_id) {
// Key exists, use the existing index
value_index = *index;
} else {
// Key does not exist, insert a new entry with the next index
value_index = all_indexes.len() as u32;
all_indexes.insert(str_sample_id.to_string(), value_index.clone());
index_map.insert(value_index.clone(), str_sample_id.to_string());
}

// save kmers in coloured de Bruijn graph
let full_kmer = format!("{}{}{}", kmer_left, nucl, kmer_right);

let encoded_kmer_1 = encode_kmer(&full_kmer[..full_kmer.len() - 1].to_string());
let encoded_kmer_2 = encode_kmer(&full_kmer[1..].to_string());

// uncomment to print network
// println!("{} {} ", &full_kmer[..full_kmer.len() - 1].to_string(), &full_kmer[1..].to_string());

all_kmers
.entry(encoded_kmer_1)
.or_default()
.insert(encoded_kmer_2, value_index.clone());

let rc_kmer = rev_compl(&full_kmer);
let rc_encoded_kmer_1 = encode_kmer(&rc_kmer[..full_kmer.len() - 1].to_string());
let rc_encoded_kmer_2 = encode_kmer(&rc_kmer[1..].to_string());

// uncomment to print network
// println!("{} {} ", &rc_kmer[..full_kmer.len() - 1].to_string(), &rc_kmer[1..].to_string());

all_kmers
.entry(rc_encoded_kmer_1)
.or_default()
.insert(rc_encoded_kmer_2, value_index);
}
}
}

// sanity check: test if the number of sample combinations is too high to be stored as u32 integers
let max_u32_value: usize = u32::MAX as usize;
if all_indexes.len() > max_u32_value {
eprintln!(
"\nError: the number of sample combinations is too high to be stored as u32 integers\n"
);
std::process::exit(1);
}

println!(" . {} nodes (includes rev-compl)", all_kmers.len());
println!(" . {} sample combinations", all_indexes.len());
(len_kmer, sample_names, all_kmers, index_map)
}
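The graph construction removed here appears to move into `ska_array_to_dbgraph` in `src/skalo.rs` (imported in the `src/lib.rs` diff below; that file's diff is not shown). Its most domain-specific step is the degenerate-base expansion: an ambiguous IUPAC middle base fans a sample out to every nucleotide the code covers, while byte 45 (`b'-'`) marks the split k-mer as absent in that sample. Below is a minimal, self-contained sketch of that step only — std `HashMap` instead of hashbrown, an abbreviated IUPAC table, and an illustrative function name rather than the crate's API.

```rust
use std::collections::HashMap;

/// Sketch only: expand one column of middle bases (one IUPAC byte per
/// sample, b'-' meaning "split k-mer absent") into per-nucleotide lists of
/// sample indexes, mirroring the degenerate-code loop above.
fn expand_middle_bases(column: &[u8]) -> HashMap<char, Vec<usize>> {
    // subset of the IUPAC table used in the removed function
    let degenerate: HashMap<u8, &str> = [
        (b'A', "A"),
        (b'C', "C"),
        (b'G', "G"),
        (b'T', "T"),
        (b'R', "AG"),
        (b'Y', "CT"),
        (b'N', "ACGT"),
    ]
    .into_iter()
    .collect();

    let mut by_base: HashMap<char, Vec<usize>> = HashMap::new();
    for (sample, base) in column.iter().enumerate() {
        if *base == b'-' {
            continue; // k-mer absent in this sample: contributes nothing
        }
        for nucl in degenerate[base].chars() {
            by_base.entry(nucl).or_default().push(sample);
        }
    }
    by_base
}

fn main() {
    // samples 0..=3 have middle bases A, R (= A or G), absent, G
    let by_base = expand_middle_bases(b"AR-G");
    assert_eq!(by_base[&'A'], vec![0, 1]);
    assert_eq!(by_base[&'G'], vec![1, 3]);
}
```

Each resulting sample list is then joined into a string such as `0|1` and interned as a `u32` index, which is what the colored de Bruijn graph edges store in the code above.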
52 changes: 30 additions & 22 deletions src/lib.rs
@@ -431,11 +431,11 @@ use crate::io_utils::*;
pub mod coverage;
use crate::coverage::CoverageHistogram;

pub mod skalo;

pub mod skalo_utils;

use crate::skalo_utils::{filter_output_sequences, identify_good_kmers};

pub mod skalo_read_graph;
use crate::skalo_read_graph::build_sequences;
use crate::skalo::{
build_sequences, filter_output_sequences, identify_good_kmers, ska_array_to_dbgraph,
};

/// Possible quality score filters when building with reads
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
@@ -787,31 +787,39 @@ pub fn main() {
eprintln!("Estimated cutoff\t{cutoff}");
}
Commands::Indel {
skf_file,
input,
n_poly,
max_missing,
output_name,
threads,
} => {
// read input file
let (len_kmer, l_sample_names, all_kmers, index_map) = read_skalo_input(skf_file);
check_threads(*threads);
if let Ok(ska_array) = load_array::<u128>(input, *threads) {
// Process ska array
let (len_kmer, l_sample_names, all_kmers, index_map) =
ska_array_to_dbgraph(ska_array);

// identify 'good' kmers in De Bruijn graph
let (start_kmers, end_kmers) =
identify_good_kmers(len_kmer, &all_kmers, &index_map);

// identify 'good' kmers in De Bruijn graph
let (start_kmers, end_kmers) = identify_good_kmers(len_kmer, &all_kmers, &index_map);
// build sequences
let sequences =
build_sequences(len_kmer, &all_kmers, &start_kmers, &end_kmers, &index_map);

// build sequences
let sequences =
build_sequences(len_kmer, &all_kmers, &start_kmers, &end_kmers, &index_map);
let input_name: String = input.join(" ").to_string();

// filter and output sequences
filter_output_sequences(
sequences,
len_kmer,
l_sample_names.clone(),
*n_poly,
*max_missing,
output_name,
skf_file,
);
// filter and output sequences
filter_output_sequences(
sequences,
len_kmer,
l_sample_names.clone(),
*n_poly,
*max_missing,
output_name,
&input_name,
);
}
}
}
let end = Instant::now();
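A note on the guard in the new `Indel` branch: the body only runs when `load_array` returns `Ok`, and a load failure falls through silently with no output. A tiny, self-contained illustration of that control-flow shape follows; `load_array_stub` is a made-up stand-in for the crate's `load_array`, not its real signature.

```rust
// `load_array_stub` is a hypothetical stand-in for ska's `load_array`;
// only the Result shape matters for this illustration.
fn load_array_stub(paths: &[String], _threads: usize) -> Result<usize, String> {
    if paths.is_empty() {
        Err("no input files given".to_string())
    } else {
        Ok(paths.len())
    }
}

fn main() {
    let input = vec!["sample1.fasta".to_string(), "sample2.fasta".to_string()];
    // As in the Indel branch: the body runs only on success; on Err the
    // program simply continues past the block without reporting anything.
    if let Ok(n_loaded) = load_array_stub(&input, 1) {
        println!("processing {n_loaded} input path(s)");
    }
}
```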

