Skip to content

Commit

Permalink
feat: Integrate binding to python rust core and clean up (#69)
Browse files Browse the repository at this point in the history
* feat: Integrate binding to python rust core and clean up

* refactor: Pass numpy array input directly

* fix: Fix input variable name error

* refactor: Remove unused import
  • Loading branch information
lsetiawan authored Dec 19, 2024
1 parent ec18e9f commit 50811ab
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 151 deletions.
111 changes: 8 additions & 103 deletions py-phylo2vec/phylo2vec/base/to_newick.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,11 @@
"""
Methods to convert a Phylo2Vec vector to a Newick-format string.
"""
from phylo2vec import _phylo2vec_core

import numba as nb
import numpy as np

# tuple type
value_type = nb.types.UniTuple(nb.types.int64, 2)


@nb.njit(cache=True)
def _get_pairs(v):
pairs = []

for i in range(len(v) - 1, -1, -1):
next_leaf = i + 1
if v[i] <= i:
# If v[i] <= i, it's an easy BD
# We now that the next pair to add now is (v[i], next_leaf)
# (as the branch leading to v[i] gives birth to the next_leaf)
# Why pairs.insert(0)? Let's take an example with [0, 0]
# We initially have (0, 1), but 0 gives birth to 2 afterwards
# So the "shallowest" pair is (0, 2)
pairs.append((v[i], next_leaf))

for j in range(1, len(v)):
next_leaf = j + 1
if v[j] == 2 * j:
# 2*j = extra root ==> pairing = (0, next_leaf)
pairs.append((0, next_leaf))
elif v[j] > j:
# If v[i] > i, it's not the branch leading v[i] that gives birth but an internal branch
# Remark 1: it will not be the "shallowest" pair, so we do not insert it at position 0
# len(pairs) = number of pairings we did so far
# So what v[i] - len(pairs) gives us is the depth of the next pairing
# And pairs[v[i] - len(pairs) - 1][0] is a node that we processed beforehand
# which is deeper than the branch v[i]
index = v[j] - 2 * j
pairs.insert(index, (pairs[index - 1][0], next_leaf))

return pairs


@nb.njit(cache=True)
def _get_ancestry(v):
from phylo2vec import _phylo2vec_core

def _get_ancestry(v: np.ndarray) -> np.ndarray:
"""
Get the "ancestry" of v (see "Returns" paragraph)
Expand Down Expand Up @@ -77,37 +38,11 @@ def _get_ancestry(v):
2nd column: child 2
3rd column: parent node
"""
pairs = _get_pairs(v)

# We have our pairs, we can now build our ancestry
# Matrix with 3 columns: child1, child2, parent
ancestry = np.zeros((len(pairs), 3), dtype=np.int16)

# Dictionary to keep track of the following relationship: child->highest parent
parents = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.int64)

# Leaves are number 0, 1, ..., n_leaves - 1, so the next parent is n_leaves
next_parent = len(v) + 1

for i, pair in enumerate(pairs):
child1, child2 = pair

parent_child1 = parents.get(child1, child1)
parent_child2 = parents.get(child2, child2)

ancestry[i, :] = [parent_child1, parent_child2, next_parent]

# Change the parents of the current children
parents[child1] = next_parent
parents[child2] = next_parent

next_parent += 1
ancestry_list = _phylo2vec_core.get_ancestry(v)
return np.asarray(ancestry_list)

return ancestry


@nb.njit
def _build_newick(ancestry):
def _build_newick(ancestry: np.ndarray) -> str:
"""Build a Newick string from an "ancestry" array
The input should always be 3-dimensional with the following format:
Expand All @@ -128,35 +63,9 @@ def _build_newick(ancestry):
newick : str
Newick string
"""
root = ancestry[-1][-1]

newick = _build_newick_inner(root, ancestry) + ";"

return newick


@nb.njit
def _build_newick_inner(node, ancestry):
leaf_max = ancestry.shape[0]

c1, c2, _ = ancestry[node - leaf_max - 1]
return _phylo2vec_core.build_newick(ancestry)

if c1 > leaf_max:
left = _build_newick_inner(c1, ancestry)
else:
left = f"{c1}"

if c2 > leaf_max:
right = _build_newick_inner(c2, ancestry)
else:
right = f"{c2}"

newick = f"({left},{right}){node}"

return newick


@nb.njit(cache=True)
def to_newick(v):
"""Recover a rooted tree (in Newick format) from a Phylo2Vec v
Expand All @@ -170,8 +79,4 @@ def to_newick(v):
newick : str
Newick tree
"""
ancestry = _get_ancestry(v)

newick = _build_newick(ancestry)

return newick
return _phylo2vec_core.to_newick(v)
2 changes: 0 additions & 2 deletions py-phylo2vec/phylo2vec/metrics/pairwise.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import numba as nb
import numpy as np

from phylo2vec.base.to_newick import _get_ancestry
from phylo2vec.utils.validation import check_v


@nb.njit(cache=True)
def cophenetic_distances(v, unrooted=False):
# Should be very similar to dist_nodes in ape

Expand Down
12 changes: 4 additions & 8 deletions py-phylo2vec/phylo2vec/utils/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
import os
import random

import numba as nb
import numpy as np

from phylo2vec import _phylo2vec_core

@nb.njit
def sample(n_leaves, ordered=False):
def sample(n_leaves: int, ordered: bool = False) -> np.ndarray:
"""Sample a random tree via Phylo2Vec
Parameters
Expand All @@ -30,11 +29,8 @@ def sample(n_leaves, ordered=False):
Phylo2Vec vector
"""

if ordered:
v_list = [np.random.randint(0, i + 1) for i in range(n_leaves - 1)]
else:
v_list = [np.random.randint(0, 2 * i + 1) for i in range(n_leaves - 1)]
return np.array(v_list, dtype=np.uint16)
v_list = _phylo2vec_core.sample(n_leaves, ordered)
return np.asarray(v_list)


def seed_everything(seed):
Expand Down
10 changes: 4 additions & 6 deletions py-phylo2vec/phylo2vec/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import numpy as np

from phylo2vec import _phylo2vec_core

def check_v(v):

def check_v(v: np.ndarray) -> None:
"""Input validation of a Phylo2Vec vector
The input is checked to satisfy the Phylo2Vec constraints
Expand All @@ -13,8 +15,4 @@ def check_v(v):
v : numpy.ndarray
Phylo2Vec vector
"""
k = len(v)

v_max = 2 * np.arange(k)

assert np.all((0 <= v) & (v <= v_max)), print(v, v >= 0, v <= v_max)
_phylo2vec_core.check_v(v.tolist())
8 changes: 4 additions & 4 deletions py-phylo2vec/phylo2vec/utils/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def _find_indices_of_first_leaf(ancestry, leaf):
return r, c


@nb.njit(cache=True)

def remove_leaf(v, leaf):
"""Remove a leaf from a Phylo2Vec v
Expand Down Expand Up @@ -302,7 +302,7 @@ def remove_leaf(v, leaf):
return v_sub, sister


@nb.njit(cache=True)

def add_leaf(v, leaf, pos):
"""Add a leaf to a Phylo2Vec vector v
Expand Down Expand Up @@ -355,7 +355,7 @@ def add_leaf(v, leaf, pos):
return v_add


@nb.njit(cache=True)

def get_ancestry_paths(v):
"""
Get the ancestry paths for each node in the Phylo2Vec vector.
Expand Down Expand Up @@ -386,7 +386,7 @@ def get_ancestry_paths(v):
return ancestry_paths


@nb.njit(cache=True)

def get_common_ancestor(v, node1, node2):
"""Get the first recent common ancestor between two nodes in a Phylo2Vec tree
Expand Down
28 changes: 0 additions & 28 deletions py-phylo2vec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use phylo2vec::utils;
/// This function takes a Python list and converts it to a Rust vector.
#[pyfunction]
fn to_newick(input_vector: Vec<usize>) -> PyResult<String> {
// TODO: Take in numpy array instead of just simple list?
let newick = ops::to_newick(&input_vector);
Ok(newick)
}
Expand All @@ -18,26 +17,13 @@ fn get_ancestry(input_vector: Vec<usize>) -> Vec<[usize; 3]> {
ancestry
}

#[pyfunction]
fn get_pairs(input_vector: Vec<usize>) -> Vec<(usize, usize)> {
let pairs: Vec<(usize, usize)> = ops::get_pairs(&input_vector);
pairs
}

#[pyfunction]
fn get_pairs_avl(input_vector: Vec<usize>) -> Vec<(usize, usize)> {
let pairs: Vec<(usize, usize)> = ops::get_pairs_avl(&input_vector);
pairs
}

#[pyfunction]
fn build_newick(input_ancestry: Vec<[usize; 3]>) -> String {
let newick_string: String = ops::newick::build_newick(&input_ancestry);
newick_string
}

#[pyfunction]
#[pyo3(signature = (n_leaves, ordered=false, /), text_signature = "(n_leaves, ordered=False, /)")]
fn sample(n_leaves: usize, ordered: bool) -> Vec<usize> {
let v = utils::sample(n_leaves, ordered);
v
Expand All @@ -54,21 +40,7 @@ fn _phylo2vec_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(to_newick, m)?)?;
m.add_function(wrap_pyfunction!(build_newick, m)?)?;
m.add_function(wrap_pyfunction!(get_ancestry, m)?)?;
m.add_function(wrap_pyfunction!(get_pairs, m)?)?;
m.add_function(wrap_pyfunction!(get_pairs_avl, m)?)?;
m.add_function(wrap_pyfunction!(sample, m)?)?;
m.add_function(wrap_pyfunction!(check_v, m)?)?;
Ok(())
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_to_newick() {
let v = vec![0, 0, 1];
let newick = ops::to_newick(&v);
assert_eq!(newick, "((0,2)5,(1,3)4)6;");
}
}

0 comments on commit 50811ab

Please sign in to comment.