Skip to content

Commit

Permalink
feat: Add newick string operation functions
Browse files Browse the repository at this point in the history
  • Loading branch information
lsetiawan committed Dec 13, 2024
1 parent 091e282 commit 5b85482
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions phylo2vec/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ license = "LGPL-3.0"

[dependencies]
rand = "*"
regex = "1.11.1"

[dev-dependencies]
rstest = "0.23.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
use crate::tree_vec::types::Ancestry;

mod newick_patterns;

pub use newick_patterns::NewickPatterns;

fn _get_cherries_recursive_inner(ancestry: &mut Ancestry, newick: &str, has_parents: bool) {
let mut open_idx: usize = 0;

Expand Down Expand Up @@ -84,6 +88,30 @@ fn _build_newick_recursive_inner(p: usize, ancestry: &Ancestry) -> String {
format!("({},{}){}", left, right, p)
}

/// Strip parent labels from a Newick string.
///
/// Every match of the `parents` pattern (a closing parenthesis immediately
/// followed by digits) is replaced with a bare `)`. Text that does not match
/// the pattern is returned unchanged.
pub fn remove_parent_labels(newick: &str) -> String {
    let patterns = NewickPatterns::new();
    let stripped = patterns.parents.replace_all(newick, ")");
    stripped.into_owned()
}

/// Report whether a Newick string contains at least one parent label,
/// i.e. a closing parenthesis immediately followed by digits.
pub fn has_parents(newick: &str) -> bool {
    NewickPatterns::new().parents.is_match(newick)
}

/// Count the leaf labels in a Newick string.
///
/// A leaf label is any match of the `pairs` pattern: an integer label that
/// directly follows either an opening parenthesis or a comma. Parent labels
/// (which follow a closing parenthesis) are not counted.
///
/// Returns the number of such matches; `0` when there are none.
pub fn find_num_leaves(newick: &str) -> usize {
    let newick_patterns = NewickPatterns::new();
    // Only the number of matches matters, so count them directly instead of
    // parsing each label into a temporary vector. Parsing was also a latent
    // panic: a digit run too large for `usize` would have failed `unwrap()`.
    newick_patterns.pairs.find_iter(newick).count()
}

/// Build newick string from the ancestry matrix
pub fn build_newick(ancestry: &Ancestry) -> String {
// Get the root node, which is the parent value of the last ancestry element
Expand All @@ -92,3 +120,48 @@ pub fn build_newick(ancestry: &Ancestry) -> String {
// Build the Newick string starting from the root, and append a semicolon
format!("{};", _build_newick_recursive_inner(root, ancestry))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tree_vec::ops::to_newick;
    use crate::utils::sample;
    use rstest::*;

    /// Parent labels are removed while leaf labels and nesting are preserved.
    #[rstest]
    #[case("(((0,(3,5)6)8,2)9,(1,4)7)10;", "(((0,(3,5)),2),(1,4));")]
    #[case("(0,(1,(2,(3,(4,5)6)7)8)9)10;", "(0,(1,(2,(3,(4,5)))));")]
    #[case("((0,2)5,(1,3)4)6;", "((0,2),(1,3));")]
    fn test_remove_parent_labels(#[case] newick: &str, #[case] expected: &str) {
        let result = remove_parent_labels(newick);
        assert_eq!(result, expected);
    }

    /// A freshly generated Newick string has parent labels, and stripping
    /// them makes `has_parents` report `false`.
    #[rstest]
    #[case(10)]
    #[case(100)]
    #[case(1000)]
    fn test_has_parents(#[case] n_leaves: usize) {
        let v = sample(n_leaves, false);
        let newick = to_newick(&v);

        // The newick string produced from a sampled vector has parent labels
        assert!(has_parents(&newick));

        // Once the labels are removed, no parent label should be detected
        assert!(!has_parents(&remove_parent_labels(&newick)));
    }

    /// The number of leaves found in the Newick string matches the number of
    /// leaves the tree was sampled with.
    #[rstest]
    #[case(10)]
    #[case(100)]
    #[case(1000)]
    fn test_find_num_leaves(#[case] n_leaves: usize) {
        let v = sample(n_leaves, false);
        let newick = to_newick(&v);

        // Check that the leaf count recovered from the string is n_leaves
        let result = find_num_leaves(&newick);
        assert_eq!(result, n_leaves);
    }
}
30 changes: 30 additions & 0 deletions phylo2vec/src/tree_vec/ops/newick/newick_patterns.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/// Compiled regular expressions for recognising parts of a Newick string
/// whose node labels are integers.
#[derive(Debug)]
pub struct NewickPatterns {
/// Matches an integer label directly after an opening parenthesis
/// (the left member of a pair); the digits are captured.
pub left_node: regex::Regex,
/// Matches an integer label directly after a comma
/// (the right member of a pair); the digits are captured.
pub right_node: regex::Regex,
/// Alternation of the left-node and right-node patterns, so it matches
/// either kind of label.
pub pairs: regex::Regex,
/// Matches a branch length annotation: a colon followed by digits with an
/// optional decimal fraction.
pub branch_lengths: regex::Regex,
/// Matches a parent label: a closing parenthesis directly followed by
/// digits; the digits are captured.
pub parents: regex::Regex,
}

impl NewickPatterns {
    /// Compile all Newick patterns.
    ///
    /// Every call compiles five regular expressions from scratch, so code
    /// that needs the patterns repeatedly should build one instance and
    /// reuse it.
    ///
    /// # Panics
    ///
    /// Panics only if one of the hard-coded patterns below is not a valid
    /// regular expression, which would be a bug in this function.
    pub fn new() -> Self {
        let left_node = r"\(\b(\d+)\b";
        let right_node = r",\b(\d+)\b";
        let branch_lengths = r":\d+(\.\d+)?";
        let parents = r"\)(\d+)";
        // A label belongs to a pair if it matches either side.
        let pairs = format!(r"({})|({})", left_node, right_node);

        NewickPatterns {
            // Pattern of an integer label on the left of a pair
            left_node: regex::Regex::new(left_node)
                .expect("hard-coded left-node pattern is a valid regex"),
            // Pattern of an integer label on the right of a pair
            right_node: regex::Regex::new(right_node)
                .expect("hard-coded right-node pattern is a valid regex"),
            // Pattern of a pair of integer labels
            pairs: regex::Regex::new(&pairs)
                .expect("pairs pattern built from valid sub-patterns is a valid regex"),
            // Pattern of a branch length annotation
            branch_lengths: regex::Regex::new(branch_lengths)
                .expect("hard-coded branch-length pattern is a valid regex"),
            // Pattern of a parent label
            parents: regex::Regex::new(parents)
                .expect("hard-coded parent pattern is a valid regex"),
        }
    }
}

impl Default for NewickPatterns {
    /// Equivalent to [`NewickPatterns::new`].
    fn default() -> Self {
        Self::new()
    }
}

0 comments on commit 5b85482

Please sign in to comment.