diff --git a/Cargo.lock b/Cargo.lock
index 0284f76..276cc8a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -471,6 +471,7 @@ dependencies = [
  "criterion",
  "ndarray",
  "rand",
+ "regex",
  "rstest",
 ]
 
diff --git a/phylo2vec/Cargo.toml b/phylo2vec/Cargo.toml
index 99f3a7b..dc694b6 100644
--- a/phylo2vec/Cargo.toml
+++ b/phylo2vec/Cargo.toml
@@ -8,6 +8,7 @@ license = "LGPL-3.0"
 
 [dependencies]
 rand = "*"
+regex = "1.11.1"
 
 [dev-dependencies]
 rstest = "0.23.0"
diff --git a/phylo2vec/src/tree_vec/ops/newick.rs b/phylo2vec/src/tree_vec/ops/newick/mod.rs
similarity index 61%
rename from phylo2vec/src/tree_vec/ops/newick.rs
rename to phylo2vec/src/tree_vec/ops/newick/mod.rs
index d7d8888..b776f42 100644
--- a/phylo2vec/src/tree_vec/ops/newick.rs
+++ b/phylo2vec/src/tree_vec/ops/newick/mod.rs
@@ -1,5 +1,9 @@
 use crate::tree_vec::types::Ancestry;
 
+mod newick_patterns;
+
+pub use newick_patterns::NewickPatterns;
+
 fn _get_cherries_recursive_inner(ancestry: &mut Ancestry, newick: &str, has_parents: bool) {
     let mut open_idx: usize = 0;
 
@@ -84,6 +88,39 @@ fn _build_newick_recursive_inner(p: usize, ancestry: &Ancestry) -> String {
     format!("({},{}){}", left, right, p)
 }
 
+/// Returns the shared, lazily compiled [`NewickPatterns`].
+///
+/// Compiling the regexes costs far more than matching with them, so they are
+/// built once on first use instead of on every call.
+fn patterns() -> &'static NewickPatterns {
+    static PATTERNS: std::sync::OnceLock<NewickPatterns> = std::sync::OnceLock::new();
+    PATTERNS.get_or_init(NewickPatterns::new)
+}
+
+/// Remove parent (internal node) labels from a newick string.
+///
+/// Every integer that directly follows a `)` is dropped, e.g.
+/// `"((0,2)5,(1,3)4)6;"` becomes `"((0,2),(1,3));"`.
+pub fn remove_parent_labels(newick: &str) -> String {
+    patterns().parents.replace_all(newick, ")").into_owned()
+}
+
+/// Check whether a newick string contains parent labels, i.e. at least one
+/// `)` immediately followed by a digit.
+pub fn has_parents(newick: &str) -> bool {
+    patterns().parents.is_match(newick)
+}
+
+/// Count the leaves of a newick string.
+///
+/// A leaf is an integer label that directly follows `(` or `,`; parent labels
+/// (which follow `)`) are not counted.
+pub fn find_num_leaves(newick: &str) -> usize {
+    // Only the number of matches matters, so the labels are neither parsed
+    // nor collected into an intermediate vector.
+    patterns().pairs.find_iter(newick).count()
+}
+
 /// Build newick string from the ancestry matrix
 pub fn build_newick(ancestry: &Ancestry) -> String {
     // Get the root node, which is the parent value of the last ancestry element
@@ -92,3 +129,44 @@ pub fn build_newick(ancestry: &Ancestry) -> String {
     // Build the Newick string starting from the root, and append a semicolon
     format!("{};", _build_newick_recursive_inner(root, ancestry))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tree_vec::ops::to_newick;
+    use crate::utils::sample;
+    use rstest::*;
+
+    #[rstest]
+    #[case("(((0,(3,5)6)8,2)9,(1,4)7)10;", "(((0,(3,5)),2),(1,4));")]
+    #[case("(0,(1,(2,(3,(4,5)6)7)8)9)10;", "(0,(1,(2,(3,(4,5)))));")]
+    #[case("((0,2)5,(1,3)4)6;", "((0,2),(1,3));")]
+    fn test_remove_parent_labels(#[case] newick: &str, #[case] expected: &str) {
+        assert_eq!(remove_parent_labels(newick), expected);
+    }
+
+    #[rstest]
+    #[case(10)]
+    #[case(100)]
+    #[case(1000)]
+    fn test_has_parents(#[case] n_leaves: usize) {
+        let v = sample(n_leaves, false);
+        let newick = to_newick(&v);
+        // The generated newick string is expected to contain parent labels
+        assert!(has_parents(&newick));
+
+        // Once the labels are stripped, none should be detected
+        assert!(!has_parents(&remove_parent_labels(&newick)));
+    }
+
+    #[rstest]
+    #[case(10)]
+    #[case(100)]
+    #[case(1000)]
+    fn test_find_num_leaves(#[case] n_leaves: usize) {
+        let v = sample(n_leaves, false);
+        let newick = to_newick(&v);
+        // The leaf count must match the size of the sampled tree
+        assert_eq!(find_num_leaves(&newick), n_leaves);
+    }
+}
diff --git a/phylo2vec/src/tree_vec/ops/newick/newick_patterns.rs b/phylo2vec/src/tree_vec/ops/newick/newick_patterns.rs
new file mode 100644
index 0000000..2922604
--- /dev/null
+++ b/phylo2vec/src/tree_vec/ops/newick/newick_patterns.rs
@@ -0,0 +1,46 @@
+/// Compiled regular expressions for parsing newick strings with integer
+/// node labels.
+///
+/// Compiling a regex is expensive compared to matching with it, so build one
+/// instance and reuse it rather than calling [`NewickPatterns::new`] repeatedly.
+#[derive(Debug)]
+pub struct NewickPatterns {
+    /// An integer label on the left of a pair, i.e. directly after `(`
+    pub left_node: regex::Regex,
+    /// An integer label on the right of a pair, i.e. directly after `,`
+    pub right_node: regex::Regex,
+    /// An integer label on either side of a pair
+    pub pairs: regex::Regex,
+    /// A branch length annotation, e.g. `:1` or `:0.25`
+    pub branch_lengths: regex::Regex,
+    /// A parent label, i.e. an integer directly after `)`
+    pub parents: regex::Regex,
+}
+
+impl NewickPatterns {
+    /// Compile all patterns.
+    pub fn new() -> Self {
+        let left_node = r"\(\b(\d+)\b";
+        let right_node = r",\b(\d+)\b";
+        let branch_lengths = r":\d+(\.\d+)?";
+        let parents = r"\)(\d+)";
+        let pairs = format!(r"({})|({})", left_node, right_node);
+
+        // The patterns are fixed literals, so a compile failure is a bug here
+        let compile = |pattern: &str| regex::Regex::new(pattern).expect("valid regex literal");
+
+        NewickPatterns {
+            left_node: compile(left_node),
+            right_node: compile(right_node),
+            pairs: compile(&pairs),
+            branch_lengths: compile(branch_lengths),
+            parents: compile(parents),
+        }
+    }
+}
+
+impl Default for NewickPatterns {
+    fn default() -> Self {
+        Self::new()
+    }
+}