diff --git a/CHANGELOG.md b/CHANGELOG.md index 07d8756..392d440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# Version 0.9.0 2024-11-13 +## Changes +- Added `Center::Medoid` as a means of calculating the center of clusters. The medoid is the point in a cluster with + the minimum distance to all other points. Computationally more expensive than centroids as requires calculation of + pairwise distances (using the selected distance metric). The output will be an observed data point in the cluster. + # Version 0.8.3 2024-10-15 ## Changes - Fix for a bug that occurred when `allow_single_cluster` was set to true and the root cluster is the only one diff --git a/Cargo.toml b/Cargo.toml index 4da0c13..49a6164 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hdbscan" -version = "0.8.3" +version = "0.9.0" edition = "2021" authors = [ "Tom Whitehead ", ] description = "HDBSCAN clustering in pure Rust. A huge improvement on DBSCAN, capable of identifying clusters of varying densities." diff --git a/src/centers.rs b/src/centers.rs index 662039e..021490d 100644 --- a/src/centers.rs +++ b/src/centers.rs @@ -1,4 +1,5 @@ use num_traits::Float; +use std::cmp::Ordering; use std::collections::HashSet; /// Possible methodologies for calculating the center of clusters @@ -7,22 +8,32 @@ pub enum Center { /// The elementwise mean of all data points in a cluster. /// The output is not guaranteed to be an observed data point. Centroid, - /// Calculates the geographical centeroid for lat/lon coordinates. + /// Calculates the geographical centroid for lat/lon coordinates. /// Assumes input coordinates are in degrees (latitude, longitude). /// Output coordinates are also in degrees. GeoCentroid, + /// The point in a cluster with the minimum distance to all other points. Computationally more + /// expensive than centroids as requires calculation of pairwise distances (using the selected + /// distance metric). The output will be an observed data point in the cluster. + Medoid, } impl Center { - pub(crate) fn calc_centers(&self, data: &[Vec], labels: &[i32]) -> Vec> { + pub(crate) fn calc_centers T>( + &self, + data: &[Vec], + labels: &[i32], + dist_func: F, + ) -> Vec> { match self { Center::Centroid => self.calc_centroids(data, labels), Center::GeoCentroid => self.calc_geo_centroids(data, labels), + Center::Medoid => self.calc_medoids(data, labels, dist_func), } } fn calc_centroids(&self, data: &[Vec], labels: &[i32]) -> Vec> { - // All points weighted equally + // All points weighted equally for now let weights = vec![T::one(); data.len()]; Center::calc_weighted_centroids(data, labels, &weights) } @@ -121,4 +132,45 @@ impl Center { centers } + + fn calc_medoids T>( + &self, + data: &[Vec], + labels: &[i32], + dist_func: F, + ) -> Vec> { + let n_clusters = labels + .iter() + .filter(|&&label| label != -1) + .collect::>() + .len(); + let mut medoids = Vec::with_capacity(n_clusters); + + for cluster_id in 0..n_clusters as i32 { + let cluster_data = data + .iter() + .zip(labels.iter()) + .filter(|(_datapoint, &label)| label == cluster_id) + .map(|(datapoint, _label)| datapoint) + .collect::>>(); + + let n_samples = cluster_data.len(); + let medoid_idx = (0..n_samples) + .map(|i| { + (0..n_samples) + .map(|j| dist_func(cluster_data[i], cluster_data[j])) + .fold(T::zero(), std::ops::Add::add) + }) + .enumerate() + .min_by(|(_idx_a, sum_a), (_idx_b, sum_b)| { + sum_a.partial_cmp(sum_b).unwrap_or(Ordering::Equal) + }) + .map(|(idx, _sum)| idx) + .unwrap_or(0); + + medoids.push(cluster_data[medoid_idx].clone()) + } + + medoids + } } diff --git a/src/lib.rs b/src/lib.rs index 998d206..f25cf5a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -218,8 +218,7 @@ impl<'a, T: Float> Hdbscan<'a, T> { /// Calculates the centers of the clusters just calculate. /// /// # Parameters - /// * `center` - the type of center to calculate. Currently only centroid (the element wise mean - /// of all the data points in a cluster) is supported. + /// * `center` - the type of center to calculate. /// * `labels` - a reference to the labels calculated by a call to `Hdbscan::cluster`. /// /// # Returns @@ -227,9 +226,6 @@ impl<'a, T: Float> Hdbscan<'a, T> { /// index of the centroid is the cluster label. For example, the centroid cluster of label 0 /// will be the first centroid in the vector of centroids. /// - /// # Panics - /// * If the labels are of different length to the data passed to the `Hdbscan` constructor - /// /// # Examples /// ``` ///use hdbscan::{Center, Hdbscan}; @@ -258,14 +254,22 @@ impl<'a, T: Float> Hdbscan<'a, T> { center: Center, labels: &[i32], ) -> Result>, HdbscanError> { - assert_eq!(labels.len(), self.data.len()); + if labels.len() != self.data.len() { + return Err(HdbscanError::WrongDimension(String::from( + "The length of the labels must equal the length of the original clustering data.", + ))); + } if self.hp.dist_metric != DistanceMetric::Haversine && center == Center::GeoCentroid { // TODO: Implement a more appropriate error variant when doing a major version bump return Err(HdbscanError::WrongDimension(String::from( "Geographical centroids can only be used with geographical coordinates.", ))); } - Ok(center.calc_centers(self.data, labels)) + Ok(center.calc_centers( + self.data, + labels, + distance::get_dist_func(&self.hp.dist_metric), + )) } fn validate_input_data(&self) -> Result<(), HdbscanError> { @@ -1101,7 +1105,7 @@ mod tests { } #[test] - fn calc_centers() { + fn calc_centroids() { let data = cluster_test_data(); let clusterer = Hdbscan::default_hyper_params(&data); let labels = clusterer.cluster().unwrap(); @@ -1110,6 +1114,37 @@ mod tests { assert!(centroids.contains(&vec![3.8, 4.0]) && centroids.contains(&vec![1.12, 1.34])); } + #[test] + fn calc_medoids() { + let data: Vec> = vec![ + vec![1.3, 1.2], + vec![1.2, 1.3], + vec![1.5, 1.5], + vec![1.6, 1.7], + vec![1.7, 1.6], + vec![6.3, 6.2], + vec![6.2, 6.3], + vec![6.5, 6.5], + vec![6.6, 6.7], + vec![6.7, 6.6], + ]; + let clusterer = Hdbscan::default_hyper_params(&data); + let result = clusterer.cluster().unwrap(); + let centers = clusterer.calc_centers(Center::Medoid, &result).unwrap(); + + let unique_clusters = result + .iter() + .filter(|&&label| label != -1) + .collect::>(); + assert_eq!(centers.len(), unique_clusters.len()); + + centers + .iter() + .for_each(|center| assert!(data.contains(center))); + assert_eq!(vec![1.5, 1.5], centers[0]); + assert_eq!(vec![6.5, 6.5], centers[1]); + } + fn cluster_test_data() -> Vec> { vec![ vec![1.5, 2.2],