# river/cluster/hcluster.py
#
# Module body recovered from the deletion hunk of the "remove hcluster" patch
# (river/cluster/hcluster.py, 402 removed lines). The same patch also dropped
# `from .hcluster import HierarchicalClustering` and the "HierarchicalClustering"
# `__all__` entry from river/cluster/__init__.py.
from __future__ import annotations

import functools
import itertools

from river import base, utils
from river.neighbors.base import DistanceFunc, FunctionWrapper


class BinaryTreeNode:
    """Node of the binary tree built by `HierarchicalClustering`.

    Parameters
    ----------
    key
        Identifier of the node inside the tree.
    data
        Features of the data point when the node is a leaf. Internal (merge) nodes
        keep `None`, which is how the rest of the module tells both kinds apart.

    """

    def __init__(self, key: int, data: dict | None = None):
        self.data = data
        self.key = key
        # Children and parent
        self.left: BinaryTreeNode | None = None
        self.right: BinaryTreeNode | None = None
        self.parent: BinaryTreeNode | None = None


class HierarchicalClustering(base.Clusterer):
    """Hierarchical Clustering.

    HierarchicalClustering is a stream hierarchical clustering algorithm. This algorithm [^1] inserts
    new nodes near the nodes they are similar to, without breaking clusters of very similar nodes.

    Beginning with the whole tree `T`, it compares the new node to the current tree:
    * If `T` is just a leaf: merge
    * Else, if the nodes of `T` are more similar between them than with the new node: merge
    * Else, if the new node is more similar to the left subtree than to the right subtree:
        redo from the first point with `T` equal to the left subtree
    * Else, if the new node is more similar to the right subtree than to the left subtree:
        redo from the first point with `T` equal to the right subtree

    A window size can also be chosen to use only the most recent points, so that the tree does not
    grow without bound. When the window is full, the oldest leaf is removed and its key is reused
    for the new point.

    Parameters
    ----------
    window_size
        Maximum number of data points (leaves) kept in the tree. Must be at least 1.
    dist_func
        A distance function to use to compare the nodes. The Minkowski distance with `p=2` is used
        as default.

    Attributes
    ----------
    n
        Number of nodes.
    x_clusters
        Data points used by the algorithm with the key of the node representing them.

    References
    ----------
    [^1]: Anand Rajagopalan, Aditya Krishna Menon, Qin Cao, Gui Citovsky, Baris Sumengen and Sanjiv
    Kumar (2019). Online Hierarchical Clustering Approximations. arXiv:1909.09667.
    Available at: https://doi.org/10.48550/arXiv.1909.09667

    Examples
    --------

    The first example leaves the window size at 100. The second one sets it to 2 to show how the
    window works.

    >>> from river import cluster
    >>> from river import stream

    >>> X = [[1, 2, 1], [2, 1, 0], [3, 2, 1], [2, 2, 1], [5, 2, 3]]

    >>> hierarchical_clustering = cluster.HierarchicalClustering()

    >>> for x, _ in stream.iter_array(X):
    ...     hierarchical_clustering = hierarchical_clustering.learn_one(x)

    >>> hierarchical_clustering.x_clusters
    {'[1, 2, 1]': 1,
     '[2, 1, 0]': 2,
     '[3, 2, 1]': 4,
     '[2, 2, 1]': 6,
     '[5, 2, 3]': 8}

    >>> hierarchical_clustering.n
    9

    >>> print(hierarchical_clustering)
        -> 8
    -> 9
                -> 6
            -> 7
                -> 4
        -> 5
                -> 2
            -> 3
                -> 1
    Printed Hierarchical Clustering Tree.

    >>> hierarchical_clustering.get_all_clusters()
    [(1, ['[1, 2, 1]']),
     (2, ['[2, 1, 0]']),
     (4, ['[3, 2, 1]']),
     (6, ['[2, 2, 1]']),
     (8, ['[5, 2, 3]']),
     (3, [1, 2]),
     (5, [3, 7]),
     (7, [4, 6]),
     (9, [5, 8])]

    >>> hierarchical_clustering.get_clusters_by_point()
    {'[1, 2, 1]': [1, 3, 5, 9],
     '[2, 1, 0]': [2, 3, 5, 9],
     '[3, 2, 1]': [4, 7, 5, 9],
     '[2, 2, 1]': [6, 7, 5, 9],
     '[5, 2, 3]': [8, 9]}

    >>> hierarchical_clustering.predict_one({0: 4, 1: 3, 2: 1})
    ([10, 11, 9], 8)

    >>> hierarchical_clustering = hierarchical_clustering.learn_one({0: 4, 1: 3, 2: 1})

    >>> print(hierarchical_clustering)
            -> 10
        -> 11
            -> 8
    -> 9
                -> 6
            -> 7
                -> 4
        -> 5
                -> 2
            -> 3
                -> 1
    Printed Hierarchical Clustering Tree.

    >>> hierarchical_clustering = cluster.HierarchicalClustering(window_size=2)

    >>> for x, _ in stream.iter_array(X):
    ...     hierarchical_clustering = hierarchical_clustering.learn_one(x)

    >>> hierarchical_clustering.x_clusters
    {'[2, 2, 1]': 2, '[5, 2, 3]': 1}

    >>> hierarchical_clustering.n
    3

    >>> print(hierarchical_clustering)
        -> 2
    -> 3
        -> 1
    Printed Hierarchical Clustering Tree.
    """

    def __init__(
        self,
        window_size: int = 100,
        dist_func: DistanceFunc | FunctionWrapper | None = None,
    ):
        # An empty window cannot hold the point being learned: fail here rather
        # than on the first `learn_one` call.
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")
        # Number of nodes
        self.n = 0
        # Max number of leaves
        self.window_size = window_size
        # Dict : x data (str(array of size m)) -> key of the node
        self.x_clusters: dict[str, int] = {}
        # Dict : key -> node
        self.nodes: dict[int, BinaryTreeNode] = {}
        # First node of the tree
        self.root: BinaryTreeNode | None = None
        # Distance function
        if dist_func is None:
            dist_func = functools.partial(utils.math.minkowski_distance, p=2)
        self.dist_func = dist_func

    def otd_clustering(self, tree, x):
        """Insert the already registered leaf of `x` into the tree, starting the search at `tree`.

        This is the online top down (OTD) insertion described in the class docstring. The leaf for
        `x` must already be present in `self.nodes` / `self.x_clusters` (`learn_one` does this).

        Parameters
        ----------
        tree
            Node at which the descent starts (`learn_one` passes the root).
        x
            A dictionary of features; only used to look up the leaf created for it.

        """
        if self.n == 1:
            # First node in the tree
            self.root = self.nodes[1]
            return

        new_leaf = self.nodes[self.x_clusters[str(list(x.values()))]]
        # `self.n` only changes inside `merge_nodes`, which ends the insertion, so
        # the descent can be a plain loop instead of a recursion.
        while True:
            if tree.data is not None:
                # If T is a leaf, we merge the two nodes together
                self.merge_nodes(tree, new_leaf)
                return
            if tree.left is None:
                # Slot freed by a window eviction: reuse it instead of growing the tree
                tree.left = new_leaf
                new_leaf.parent = tree
                return
            if tree.right is None:
                tree.right = new_leaf
                new_leaf.parent = tree
                return
            if self.intra_subtree_similarity(tree) < self.inter_subtree_similarity(tree, new_leaf):
                # The nodes in T are closer between them than with the new node: merge T and it
                self.merge_nodes(tree, new_leaf)
                return
            # Continue the search in the child whose leaves are closer to the new node
            # (ties go to the left child).
            if self.inter_subtree_similarity(tree.left, new_leaf) > self.inter_subtree_similarity(
                tree.right, new_leaf
            ):
                tree = tree.right
            else:
                tree = tree.left

    def merge_nodes(self, tree, added_node):
        """Create a new internal node whose children are `tree` (left) and `added_node` (right).

        The new node takes the place of `tree` under its former parent, receives the key
        `self.n + 1`, and becomes the root when `tree` was the root.
        """
        self.n += 1
        merged = BinaryTreeNode(self.n)
        merged.left = tree
        merged.right = added_node
        # The new node sits where the tree used to be
        former_parent = tree.parent
        merged.parent = former_parent
        if former_parent is not None:
            # Identity test: the sibling slot may be None after an eviction, so its
            # key cannot be read safely.
            if former_parent.left is tree:
                former_parent.left = merged
            else:
                former_parent.right = merged
        tree.parent = merged
        added_node.parent = merged
        self.nodes[self.n] = merged
        # If the tree was the root, the new node becomes the root
        if self.root.key == tree.key:
            self.root = merged

    def learn_one(self, x):
        """Add the data point `x` to the tree, evicting the oldest point when the window is full.

        Parameters
        ----------
        x
            A dictionary of features.

        Returns
        -------
        HierarchicalClustering
            The estimator itself.

        """
        # NOTE: points are identified by the string of their values, so learning a
        # point equal to one still in the window overwrites that earlier mapping.
        x_key = str(list(x.values()))
        if len(self.x_clusters) >= self.window_size:
            # Dicts keep insertion order, so the first entry is the oldest point
            oldest_x = next(iter(self.x_clusters))
            node_key = self.x_clusters.pop(oldest_x)
            oldest = self.nodes.pop(node_key)
            parent = oldest.parent
            # A root leaf (window of one point) has no parent to detach from;
            # `otd_clustering` then simply replaces the root.
            if parent is not None:
                if parent.left is oldest:
                    parent.left = None
                else:
                    parent.right = None
        else:
            self.n += 1
            node_key = self.n
        # The new leaf reuses the evicted key, or takes a fresh one
        self.x_clusters[x_key] = node_key
        self.nodes[node_key] = BinaryTreeNode(node_key, x)
        # We add it to the tree
        self.otd_clustering(self.root, x)
        return self

    def predict_otd(self, x, node, clusters):
        """Follow the path `x` would take from `node`, without modifying the tree.

        Keys are computed as if `x` received the fresh key `self.n + 1`, which is what
        `learn_one` does while the window is not full.

        Parameters
        ----------
        x
            A dictionary of features.
        node
            Node at which the descent starts.
        clusters
            List extended in place with the keys met on the way (root side first).

        Returns
        -------
        (list, int)
            The cluster keys (root side first) and the key of the node `x` would be merged with,
            or `([1], -1)` when `node` is `None` (empty tree).

        """
        if node is None:
            # If there is still no node in the tree
            return [1], -1

        probe = BinaryTreeNode(self.n + 1, x)
        while True:
            if node.data is not None:
                # Key of the node that would merge x and node (n+2), then x itself (n+1)
                clusters.extend([self.n + 2, self.n + 1])
                return clusters, node.key
            if node.left is None or node.right is None:
                # Mirrors `otd_clustering`: x would fill the vacant slot under this node,
                # so no merge node is created.
                clusters.extend([node.key, self.n + 1])
                return clusters, node.key
            if self.intra_subtree_similarity(node) < self.inter_subtree_similarity(node, probe):
                clusters.extend([self.n + 2, self.n + 1])
                return clusters, node.key
            # x would be inserted below node: record it and keep descending
            clusters.append(node.key)
            if self.inter_subtree_similarity(node.left, probe) > self.inter_subtree_similarity(
                node.right, probe
            ):
                node = node.right
            else:
                node = node.left

    def predict_one(self, x):
        """Predicts the clusters for a set of features `x`.

        Parameters
        ----------
        x
            A dictionary of features.

        Returns
        -------
        (list, int)
            A list of clusters (from node `x` to root) and the node to which it would have been
            merged.

        """
        # We predict in which clusters x would be if we added it to the tree
        path, merged = self.predict_otd(x, self.root, [])
        path.reverse()
        return path, merged

    @staticmethod
    def find_path(root, path, k):
        """Search the subtree of `root` for the node of key `k`, recording the way in `path`.

        Adapted from https://www.geeksforgeeks.org/lowest-common-ancestor-binary-tree-set-1/

        Returns
        -------
        bool
            `True` if the key was found, in which case `path` has been extended with the nodes
            from `root` to that node; `False` otherwise, with `path` left as it was on entry.

        """
        if root is None:
            return False

        path.append(root)
        if root.key == k:
            return True

        for child in (root.left, root.right):
            if child is not None and HierarchicalClustering.find_path(child, path, k):
                return True

        path.pop()
        return False

    def leaves(self, v):
        """Return the leaves found under node `v`, left to right.

        A missing node (`None`, e.g. a slot freed by a window eviction) has no leaves, so an empty
        list is returned for it.
        """
        if v is None:
            return []
        if v.data is not None:
            return [v]
        return self.leaves(v.left) + self.leaves(v.right)

    def inter_subtree_similarity(self, tree_a, tree_b):
        """Return the mean distance between the leaves of `tree_a` and the leaves of `tree_b`.

        When either side has no leaf there is no pair to average; `float("inf")` is returned so
        that such a subtree is never considered the closer one.
        """
        leaves_a = self.leaves(tree_a)
        leaves_b = self.leaves(tree_b)
        if not leaves_a or not leaves_b:
            return float("inf")
        total = 0
        for leaf_a in leaves_a:
            for leaf_b in leaves_b:
                total += self.dist_func(leaf_a.data, leaf_b.data)
        return total / (len(leaves_a) * len(leaves_b))

    def intra_subtree_similarity(self, tree):
        """Return the mean pairwise distance between the leaves of `tree`.

        With fewer than two leaves there is no pair, and 0 is returned.
        """
        leaves = self.leaves(tree)
        if len(leaves) < 2:
            return 0
        total = 0
        nb_pairs = 0
        # combinations() yields the (i, j) pairs with i < j, each exactly once
        for leaf_a, leaf_b in itertools.combinations(leaves, 2):
            nb_pairs += 1
            total += self.dist_func(leaf_a.data, leaf_b.data)
        return total / nb_pairs

    def __str__(self):
        # NOTE: the tree itself is written to stdout by `print_tree`; only the
        # closing line is returned.
        self.print_tree(self.root)
        return "Printed Hierarchical Clustering Tree."

    @staticmethod
    def print_tree(node, level=0):
        """Print `node` and its descendants sideways: right subtree, the node, then left subtree.

        Each level is indented by four more spaces.
        Adapted from https://stackoverflow.com/questions/34012886/print-binary-tree-level-by-level-in-python
        """
        if node is not None:
            HierarchicalClustering.print_tree(node.right, level + 1)
            print(" " * 4 * level + "-> " + str(node.key))
            HierarchicalClustering.print_tree(node.left, level + 1)

    def get_parents(self, node):
        """Return the keys of `node` and of all its ancestors, from the node up to the root."""
        lineage = []
        while node is not None:
            lineage.append(node.key)
            node = node.parent
        return lineage

    def get_clusters_by_point(self):
        """Returns the list of clusters (from the data point node to the root) for all data points.

        Returns
        -------
        {x : list}
            A dict of all the data points with their clusters.

        """
        return {
            x_key: self.get_parents(self.nodes[node_key])
            for x_key, node_key in self.x_clusters.items()
        }

    def get_all_clusters(self):
        """Returns all the clusters of the tree.

        Returns
        -------
        list of (int, list)
            One `(key, content)` pair per node, sorted by the length of `content`. For a leaf,
            `content` holds the string of its data point; for an internal node, it holds the keys
            of the children it currently has.

        """
        clusters = {}
        for key in range(1, self.n + 1):
            node = self.nodes[key]
            if node.data is not None:
                clusters[key] = [str(list(node.data.values()))]
            else:
                # A child may be missing after a window eviction
                clusters[key] = [
                    child.key for child in (node.left, node.right) if child is not None
                ]
        return sorted(clusters.items(), key=lambda item: len(item[1]))