Skip to content

Commit

Permalink
Fix epsilon check when there is a single cluster and it is the root c…
Browse files Browse the repository at this point in the history
…luster (#33)

* Prevent epsilon check when there is a single root cluster

* Add test

* simplify logic

* another test

* reformat
  • Loading branch information
tom-whitehead authored Oct 15, 2024
1 parent ec8ae9a commit 23e27b3
Showing 1 changed file with 77 additions and 3 deletions.
80 changes: 77 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,7 @@ impl<'a, T: Float> Hdbscan<'a, T> {
.filter(|(_id, should_keep)| *should_keep)
.map(|(id, _should_keep)| id)
.collect();

if self.hp.epsilon != 0.0 && n_clusters > 0 {
selected_cluster_ids =
self.check_cluster_epsilons(selected_cluster_ids, condensed_tree);
Expand Down Expand Up @@ -811,7 +812,9 @@ impl<'a, T: Float> Hdbscan<'a, T> {
.iter()
.find(|node| node.node_id == current_id)
.map(|node| node.parent_node_id)
.expect("Couldn't find node");
// If the node isn't in the tree there must be only a single root cluster as
// this isn't stored explicitly in the tree. Its id is always max node id + 1
.unwrap_or(self.n_samples);
if self.is_top_cluster(&parent_id) {
if self.hp.allow_single_cluster {
winning_cluster_id = parent_id;
Expand All @@ -828,7 +831,6 @@ impl<'a, T: Float> Hdbscan<'a, T> {
}
current_id = parent_id;
}

winning_cluster_id
}

Expand Down Expand Up @@ -923,7 +925,7 @@ impl<'a, T: Float> Hdbscan<'a, T> {
max_lambda
}),
})
.expect("Could not find child nodes")
.unwrap_or(T::zero())
} else {
T::from(1.0 / self.hp.epsilon).unwrap()
}
Expand Down Expand Up @@ -1002,6 +1004,78 @@ mod tests {
assert_eq!(1, noise_points.len());
}

#[test]
fn single_cluster_epsilon_search() {
    // Two groups of three nearby points each, plus one point far from both.
    let points = vec![
        vec![1.1, 1.1],
        vec![1.2, 1.1],
        vec![1.3, 1.2],
        vec![2.1, 1.3],
        vec![2.2, 1.2],
        vec![2.0, 1.2],
        vec![3.0, 3.0],
    ];

    // Baseline run: only min_cluster_size is set (no epsilon, no allow_single_cluster).
    let baseline_params = HdbscanHyperParams::builder().min_cluster_size(3).build();
    let baseline = Hdbscan::new(&points, baseline_params);
    let baseline_labels = baseline.cluster().unwrap();

    // The baseline yields two distinct non-noise labels...
    let baseline_clusters: HashSet<_> = baseline_labels
        .iter()
        .filter(|label| **label != -1)
        .collect();
    assert_eq!(2, baseline_clusters.len());
    // ...and exactly one point labelled as noise (-1).
    let baseline_noise = baseline_labels
        .iter()
        .filter(|label| **label == -1)
        .count();
    assert_eq!(1, baseline_noise);

    // Second run: same data, now with allow_single_cluster and an epsilon of 1.2.
    let merged_params = HdbscanHyperParams::builder()
        .allow_single_cluster(true)
        .min_cluster_size(3)
        .epsilon(1.2)
        .build();
    let merged = Hdbscan::new(&points, merged_params);
    let merged_labels = merged.cluster().unwrap();

    // All six non-noise points now share a single cluster label...
    let merged_clusters: HashSet<_> = merged_labels
        .iter()
        .filter(|label| **label != -1)
        .collect();
    assert_eq!(1, merged_clusters.len());
    // ...while one point is still reported as noise.
    let merged_noise = merged_labels
        .iter()
        .filter(|label| **label == -1)
        .count();
    assert_eq!(1, merged_noise);
}

#[test]
fn single_root_cluster_only_epsilon_search() {
    // Regression test: clustering this input with these settings used to panic.
    // The data is three nearby points plus one point far away from them.
    let points = vec![
        vec![1.1, 1.1],
        vec![1.2, 1.1],
        vec![1.3, 1.2],
        vec![3.0, 3.0],
    ];

    let params = HdbscanHyperParams::builder()
        .allow_single_cluster(true)
        .min_cluster_size(3)
        .epsilon(1.2)
        .build();
    let model = Hdbscan::new(&points, params);
    let labels = model.cluster().unwrap();

    // Exactly one distinct non-noise label is expected...
    let cluster_labels: HashSet<_> = labels
        .iter()
        .filter(|label| **label != -1)
        .collect();
    assert_eq!(1, cluster_labels.len());
    // ...and exactly one point labelled as noise (-1).
    let noise_count = labels.iter().filter(|label| **label == -1).count();
    assert_eq!(1, noise_count);
}

#[test]
fn empty_data() {
let data: Vec<Vec<f32>> = Vec::new();
Expand Down

0 comments on commit 23e27b3

Please sign in to comment.