From 45e12b5dc2c48ff8cd825b8a3bb14ac826ad9d3b Mon Sep 17 00:00:00 2001 From: tomlincr Date: Fri, 11 Aug 2023 12:44:26 +0100 Subject: [PATCH 1/9] add VSCode + Mac to gitignore --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index b6e47617..3a228c23 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,9 @@ dmypy.json # Pyre type checker .pyre/ + +# VSCode project settings +.vscode/ + +# Mac OS X clutter +**/.DS_Store \ No newline at end of file From b327689e2e94a8de3dfaa21579031669ea1a12e6 Mon Sep 17 00:00:00 2001 From: tomlincr Date: Fri, 11 Aug 2023 12:45:12 +0100 Subject: [PATCH 2/9] add set_params to estimator --- karateclub/estimator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/karateclub/estimator.py b/karateclub/estimator.py index 7e260cd8..9ca57ec4 100644 --- a/karateclub/estimator.py +++ b/karateclub/estimator.py @@ -33,7 +33,7 @@ def get_memberships(self): def get_cluster_centers(self): """Getting the cluster centers.""" pass - + def get_params(self): """Get parameter dictionary for this estimator..""" rx = re.compile(r'^\_') @@ -41,6 +41,12 @@ def get_params(self): params = {key: params[key] for key in params if not rx.search(key)} return params + def set_params(self, **parameters): + """Set the parameters of this estimator.""" + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self + def _set_seed(self): """Creating the initial random seed.""" random.seed(self.seed) From 45e833ea04b345a9a567ac53d83ca38902e6c2dd Mon Sep 17 00:00:00 2001 From: tomlincr Date: Fri, 11 Aug 2023 12:46:10 +0100 Subject: [PATCH 3/9] test set_params --- test/test_base.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_base.py b/test/test_base.py index 5c2d4b47..c8a72877 100644 --- a/test/test_base.py +++ b/test/test_base.py @@ -5,4 +5,15 @@ def test_get_params(): params = model.get_params() assert len(params) != 0 assert 
type(params) is dict - assert '_embedding' not in params \ No newline at end of file + assert '_embedding' not in params + +def test_set_params(): + model = DeepWalk() + default_params = model.get_params() + params = {'dimensions': 1, + 'seed': 123} + model.set_params(**params) + new_params = model.get_params() + assert new_params != default_params + assert new_params['dimensions'] == 1 + assert new_params['seed'] == 123 From eed3e0bfa19a6f5696c7c819f75cd5c02cadb93b Mon Sep 17 00:00:00 2001 From: nicolasdugue Date: Thu, 29 Feb 2024 17:46:10 +0100 Subject: [PATCH 4/9] SINr implementation to compute node embeddings from communities detected by Louvain --- karateclub/node_embedding/structural/sinr.py | 74 ++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 karateclub/node_embedding/structural/sinr.py diff --git a/karateclub/node_embedding/structural/sinr.py b/karateclub/node_embedding/structural/sinr.py new file mode 100644 index 00000000..31c27eb0 --- /dev/null +++ b/karateclub/node_embedding/structural/sinr.py @@ -0,0 +1,74 @@ +from typing import Dict, List, Set +import networkx as nx +from karateclub.estimator import Estimator +from scipy.sparse import coo_matrix, csr_matrix +from sklearn.preprocessing import normalize +import numpy as np + + +class SINr(Estimator): + r"""An implementation of `"SINr" `_ + from the IDA '21 best paper "SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!". + The procedure computes community detection using Louvain algorithm, and calculates the distribution of edges of each node + across communities. + + Args: + gamma (int): modularity multi-resolution parameter. Default is 1. The higher it is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. + seed (int): Random seed value. Default is 42. 
+ """ + + def __init__( + self, + gamma: int = 1, + seed: int = 42, + ): + + self.gamma = gamma + self.workers = workers + self.seed = seed + self.erase_base_features = erase_base_features + + + def fit(self, graph: nx.classes.graph.Graph): + """ + Fitting a SINr model model. + + Arg types: + * **graph** *(NetworkX graph)* - The graph to be embedded. + """ + self._set_seed() + graph = self._check_graph(graph) + # Get the adjacency matrix of the graph + adjacency = nx.adjacency_matrix(graph) + norm_adjacency = normalize(adjacency, "l1") # Make rows of matrix sum at 1 + # Detect communities use louvain algorithm with the gamma resolution parameter + communities = nx.community.louvain_communities(graph, resolution = self.gamma, seed = self.seed) + # Get the community membership of the graph + membership_matrix = self._get_matrix_membership(communities) + + self._embedding = norm_adjacency.dot(membership_matrix) + + def _get_matrix_membership(self, list_of_communities:List[Set[int]]): + r"""Getting the membership matrix describing for each node (rows), to which community (columns) it belongs. + + Return types: + * **Membership matrix** *(scipy sparse matrix csr)* - Size nodes, communities + """ + row = list() + col = list() + data = list() + for idx_c, community in enumerate(list_of_communities): + for node in community: + row.append(node) + col.append(idx_c) + data.append(1) + return coo_matrix((data, (row, col)), shape=(len(row), len(list_of_communities))).tocsr() + + + def get_embedding(self) -> np.array: + r"""Getting the node embedding. + + Return types: + * **embedding** *(Numpy array)* - The embedding of nodes. 
+ """ + return self._embedding.toarray() From a9b02b13a8c0d04588e07c7df36faa1abd01a6b5 Mon Sep 17 00:00:00 2001 From: nicolasdugue Date: Sat, 2 Mar 2024 15:53:34 +0100 Subject: [PATCH 5/9] Adding some tests and comments --- .../node_embedding/structural/__init__.py | 1 + karateclub/node_embedding/structural/sinr.py | 13 +++---- test/structral_node_embedding_test.py | 35 ++++++++++++++++++- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/karateclub/node_embedding/structural/__init__.py b/karateclub/node_embedding/structural/__init__.py index 3d24e674..5d4cca47 100644 --- a/karateclub/node_embedding/structural/__init__.py +++ b/karateclub/node_embedding/structural/__init__.py @@ -1,2 +1,3 @@ from .graphwave import GraphWave from .role2vec import Role2Vec +from .sinr import SINr diff --git a/karateclub/node_embedding/structural/sinr.py b/karateclub/node_embedding/structural/sinr.py index 31c27eb0..e0c70d9f 100644 --- a/karateclub/node_embedding/structural/sinr.py +++ b/karateclub/node_embedding/structural/sinr.py @@ -9,11 +9,13 @@ class SINr(Estimator): r"""An implementation of `"SINr" `_ from the IDA '21 best paper "SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!". - The procedure computes community detection using Louvain algorithm, and calculates the distribution of edges of each node - across communities. + The procedure computes community detection using Louvain algorithm, and calculates the distribution of edges of each node across communities. + The algorithm is one of the fastest, because it relies mostly on Louvain community detection. It thus runs in + quasi-linear time. Regarding space complexity, it requires to be able to store the adjacency matrix and the community membership matrix, it is also quasi-linear. Args: - gamma (int): modularity multi-resolution parameter. Default is 1. The higher it is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. 
+ gamma (int): modularity multi-resolution parameter. Default is 1. + The dimensions parameter does not exist for SINr, gamma should be use instead: the numbers of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. For small graphs, setting gamma to 1 is usually a good fit. For bigger graphs, it is recommended to increase gamma (5 or 10 for instance). For word co-occurrence graphs, to deal with word embedding, gamma is isually set to 50 to get a lot of small communities. seed (int): Random seed value. Default is 42. """ @@ -24,9 +26,7 @@ def __init__( ): self.gamma = gamma - self.workers = workers self.seed = seed - self.erase_base_features = erase_base_features def fit(self, graph: nx.classes.graph.Graph): @@ -43,9 +43,10 @@ def fit(self, graph: nx.classes.graph.Graph): norm_adjacency = normalize(adjacency, "l1") # Make rows of matrix sum at 1 # Detect communities use louvain algorithm with the gamma resolution parameter communities = nx.community.louvain_communities(graph, resolution = self.gamma, seed = self.seed) + self.dimensions = len(communities) # Get the community membership of the graph membership_matrix = self._get_matrix_membership(communities) - + #Computes the node-recall: for each node, the distribution of links across communities self._embedding = norm_adjacency.dot(membership_matrix) def _get_matrix_membership(self, list_of_communities:List[Set[int]]): diff --git a/test/structral_node_embedding_test.py b/test/structral_node_embedding_test.py index 45442f9e..d2b7dbb6 100644 --- a/test/structral_node_embedding_test.py +++ b/test/structral_node_embedding_test.py @@ -1,6 +1,6 @@ import numpy as np import networkx as nx -from karateclub import Role2Vec, GraphWave +from karateclub import Role2Vec, GraphWave, SINr def test_role2vec(): @@ -73,3 +73,36 @@ def test_graphwave(): assert embedding.shape[0] == 
graph.number_of_nodes() assert embedding.shape[1] == 2 * model.sample_number assert type(embedding) == np.ndarray + + + +def test_sinr(): + """ + Testing the SINr class. + """ + model = SINr() + + graph = nx.watts_strogatz_graph(100, 10, 0.5) + + model.fit(graph) + + embedding = model.get_embedding() + + assert embedding.shape[0] == graph.number_of_nodes() + assert embedding.shape[1] == model.dimensions + assert type(embedding) == np.ndarray + + model = SINr(gamma=5) + + graph = nx.watts_strogatz_graph(200, 10, 0.5) + + model.fit(graph) + + embedding = model.get_embedding() + + assert embedding.shape[0] == graph.number_of_nodes() + assert embedding.shape[1] == model.dimensions + model2 = SINr(gamma=10) + model2.fit(graph) + assert model2.dimensions > model.dimensions + assert type(embedding) == np.ndarray From 6c1cf02539acda35a99bca60ae9f56d2a7ff6017 Mon Sep 17 00:00:00 2001 From: nicolasdugue Date: Sat, 2 Mar 2024 16:34:51 +0100 Subject: [PATCH 6/9] Adding an example --- .../structral_node_embedding/sinr_example.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/structral_node_embedding/sinr_example.py diff --git a/examples/structral_node_embedding/sinr_example.py b/examples/structral_node_embedding/sinr_example.py new file mode 100644 index 00000000..121964e3 --- /dev/null +++ b/examples/structral_node_embedding/sinr_example.py @@ -0,0 +1,34 @@ +"""SINr illustrative example. +Nodes in both cliques will get the same embedding vectors, except for the one connected to the path. +Nodes in the paths are in distinct communities with sufficient gamma, and get thus distinct vectors. 
+""" + +import networkx as nx +from karateclub.node_embedding.structural import SINr +import matplotlib.pyplot as plt + +def embed_and_plot(g, gamma, ax): + model = SINr(gamma=gamma) + model.fit(g) + X = model.get_embedding() + + + from sklearn.decomposition import PCA + pca = PCA(n_components=2) + X_2 = pca.fit_transform(X) + + ax.scatter(X_2[:,0], X_2[:,1]) + for idx, x in enumerate(X_2): + ax.annotate(idx, (x[0], x[1])) + + + +g = nx.barbell_graph(4,8) +fig, axs = plt.subplots(3) + +nx.draw_kamada_kawai(g, with_labels=True, ax=axs[0]) + +embed_and_plot(g,0.5, axs[1]) +embed_and_plot(g,10, axs[2]) + +plt.show() \ No newline at end of file From f19a1a4d1501d33033cd4898d5aed227c1db8db2 Mon Sep 17 00:00:00 2001 From: nicolasdugue Date: Sun, 3 Mar 2024 00:26:29 +0100 Subject: [PATCH 7/9] correcting some typos --- examples/structral_node_embedding/sinr_example.py | 2 +- karateclub/node_embedding/structural/sinr.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/structral_node_embedding/sinr_example.py b/examples/structral_node_embedding/sinr_example.py index 121964e3..459c5e53 100644 --- a/examples/structral_node_embedding/sinr_example.py +++ b/examples/structral_node_embedding/sinr_example.py @@ -1,5 +1,5 @@ """SINr illustrative example. -Nodes in both cliques will get the same embedding vectors, except for the one connected to the path. +Nodes in both cliques (barbell graph) will get the same embedding vectors, except for the ones connected to the path. Nodes in the paths are in distinct communities with sufficient gamma, and get thus distinct vectors. """ diff --git a/karateclub/node_embedding/structural/sinr.py b/karateclub/node_embedding/structural/sinr.py index e0c70d9f..a334fbd9 100644 --- a/karateclub/node_embedding/structural/sinr.py +++ b/karateclub/node_embedding/structural/sinr.py @@ -15,7 +15,7 @@ class SINr(Estimator): Args: gamma (int): modularity multi-resolution parameter. Default is 1. 
- The dimensions parameter does not exist for SINr, gamma should be use instead: the numbers of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. For small graphs, setting gamma to 1 is usually a good fit. For bigger graphs, it is recommended to increase gamma (5 or 10 for instance). For word co-occurrence graphs, to deal with word embedding, gamma is isually set to 50 to get a lot of small communities. + The dimension parameter does not exist for SINr, gamma should be use instead: the number of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. For small graphs, setting gamma to 1 is usually a good fit. For bigger graphs, it is recommended to increase gamma (5 or 10 for instance). For word co-occurrence graphs, to deal with word embedding, gamma is isually set to 50 to get a lot of small communities. seed (int): Random seed value. Default is 42. """ @@ -31,7 +31,7 @@ def __init__( def fit(self, graph: nx.classes.graph.Graph): """ - Fitting a SINr model model. + Fitting a SINr model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. 
From e6064c186d050257c56a55b26e8ba8df9974c4a5 Mon Sep 17 00:00:00 2001 From: nicolasdugue Date: Mon, 4 Mar 2024 15:50:54 +0100 Subject: [PATCH 8/9] typos fixes --- examples/structral_node_embedding/sinr_example.py | 4 ++-- karateclub/node_embedding/structural/sinr.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/structral_node_embedding/sinr_example.py b/examples/structral_node_embedding/sinr_example.py index 459c5e53..2598c458 100644 --- a/examples/structral_node_embedding/sinr_example.py +++ b/examples/structral_node_embedding/sinr_example.py @@ -1,6 +1,6 @@ """SINr illustrative example. -Nodes in both cliques (barbell graph) will get the same embedding vectors, except for the ones connected to the path. -Nodes in the paths are in distinct communities with sufficient gamma, and get thus distinct vectors. +Nodes in both cliques (barbell graph) will get the same embedding vectors, except for those connected to the path. +Nodes in the path are in distinct communities with a high-enough gamma, and will thus get distinct vectors. """ import networkx as nx diff --git a/karateclub/node_embedding/structural/sinr.py b/karateclub/node_embedding/structural/sinr.py index a334fbd9..323d075a 100644 --- a/karateclub/node_embedding/structural/sinr.py +++ b/karateclub/node_embedding/structural/sinr.py @@ -9,13 +9,12 @@ class SINr(Estimator): r"""An implementation of `"SINr" `_ from the IDA '21 best paper "SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!". - The procedure computes community detection using Louvain algorithm, and calculates the distribution of edges of each node across communities. - The algorithm is one of the fastest, because it relies mostly on Louvain community detection. It thus runs in - quasi-linear time. Regarding space complexity, it requires to be able to store the adjacency matrix and the community membership matrix, it is also quasi-linear. 
+ The procedure performs community detection using the Louvain algorithm, and computes the distribution of edges of each node across all communities. + The algorithm is one of the fastest, because it mostly relies on Louvain community detection. It thus runs in quasi-linear time. Regarding space complexity, the adjacency matrix and the community membership matrix need to be stored, it is also quasi-linear. Args: gamma (int): modularity multi-resolution parameter. Default is 1. - The dimension parameter does not exist for SINr, gamma should be use instead: the number of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space uncovered. For small graphs, setting gamma to 1 is usually a good fit. For bigger graphs, it is recommended to increase gamma (5 or 10 for instance). For word co-occurrence graphs, to deal with word embedding, gamma is isually set to 50 to get a lot of small communities. + The dimension parameter does not exist for SINr, gamma should be used instead: the number of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space are uncovered. For small graphs, setting gamma to 1 is usually sufficient. For bigger graphs, it is recommended to increase gamma (5 or 10 for example). For word co-occurrence graphs, to deal with word embedding, gamma is usually set to 50 in order to get many small communities. seed (int): Random seed value. Default is 42. """ @@ -50,7 +49,7 @@ def fit(self, graph: nx.classes.graph.Graph): self._embedding = norm_adjacency.dot(membership_matrix) def _get_matrix_membership(self, list_of_communities:List[Set[int]]): - r"""Getting the membership matrix describing for each node (rows), to which community (columns) it belongs. 
+ r"""Getting the membership matrix describing for each node (rows), in which community (column) it belongs. Return types: * **Membership matrix** *(scipy sparse matrix csr)* - Size nodes, communities From f317b2f847fa611553b7b5851bb24175c64829f1 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 5 Mar 2024 12:44:17 +0100 Subject: [PATCH 9/9] Added some more comments and resolved code smells --- .../structral_node_embedding/sinr_example.py | 50 ++++++---- karateclub/estimator.py | 64 ++++++------ .../node_embedding/structural/__init__.py | 1 + karateclub/node_embedding/structural/sinr.py | 99 ++++++++++++++----- test/structral_node_embedding_test.py | 10 +- 5 files changed, 137 insertions(+), 87 deletions(-) diff --git a/examples/structral_node_embedding/sinr_example.py b/examples/structral_node_embedding/sinr_example.py index 2598c458..96c61226 100644 --- a/examples/structral_node_embedding/sinr_example.py +++ b/examples/structral_node_embedding/sinr_example.py @@ -1,34 +1,44 @@ """SINr illustrative example. -Nodes in both cliques (barbell graph) will get the same embedding vectors, except for those connected to the path. -Nodes in the path are in distinct communities with a high-enough gamma, and will thus get distinct vectors. +Nodes in both cliques (barbell graph) will get the same embedding vectors, +except for those connected to the path. +Nodes in the path are in distinct communities with a high-enough gamma, +and will thus get distinct vectors. """ import networkx as nx -from karateclub.node_embedding.structural import SINr import matplotlib.pyplot as plt +from matplotlib.axes import Axes +from sklearn.decomposition import PCA +from karateclub.node_embedding.structural import SINr -def embed_and_plot(g, gamma, ax): - model = SINr(gamma=gamma) - model.fit(g) - X = model.get_embedding() +def embed_and_plot(graph: nx.Graph, gamma: int, ax: Axes): + """Embed the graph using SINr and plot the 2D PCA projection. 
- from sklearn.decomposition import PCA - pca = PCA(n_components=2) - X_2 = pca.fit_transform(X) + Args: + graph (nx.Graph): The graph to embed. + gamma (int): The modularity multi-resolution parameter. + ax (Axes): The matplotlib axis to plot the graph on. + """ + model = SINr(gamma=gamma) + model.fit(graph) + embedding = model.get_embedding() - ax.scatter(X_2[:,0], X_2[:,1]) - for idx, x in enumerate(X_2): + pca_embedding = PCA(n_components=2).fit_transform(embedding) + + ax.scatter(pca_embedding[:, 0], pca_embedding[:, 1]) + for idx, x in enumerate(pca_embedding): ax.annotate(idx, (x[0], x[1])) - - -g = nx.barbell_graph(4,8) -fig, axs = plt.subplots(3) -nx.draw_kamada_kawai(g, with_labels=True, ax=axs[0]) +if __name__ == "__main__": + + barbell = nx.barbell_graph(4, 8) + fig, axs = plt.subplots(3) + + nx.draw_kamada_kawai(barbell, with_labels=True, ax=axs[0]) -embed_and_plot(g,0.5, axs[1]) -embed_and_plot(g,10, axs[2]) + embed_and_plot(barbell, 0.5, axs[1]) + embed_and_plot(barbell, 10, axs[2]) -plt.show() \ No newline at end of file + plt.show() diff --git a/karateclub/estimator.py b/karateclub/estimator.py index 9ca57ec4..dfb5b547 100644 --- a/karateclub/estimator.py +++ b/karateclub/estimator.py @@ -1,12 +1,12 @@ +"""General Estimator base class.""" + +import warnings +from typing import List +import re import random import numpy as np import networkx as nx -import warnings -from typing import List from tqdm.auto import trange -import re - -"""General Estimator base class.""" class Estimator(object): @@ -16,27 +16,22 @@ class Estimator(object): def __init__(self): """Creating an estimator.""" - pass def fit(self): """Fitting a model.""" - pass def get_embedding(self): """Getting the embeddings (graph or node level).""" - pass def get_memberships(self): """Getting the membership dictionary.""" - pass def get_cluster_centers(self): """Getting the cluster centers.""" - pass def get_params(self): """Get parameter dictionary for this estimator..""" - rx = 
re.compile(r'^\_') + rx = re.compile(r"^\_") params = self.__dict__ params = {key: params[key] for key in params if not rx.search(key)} return params @@ -53,7 +48,9 @@ def _set_seed(self): np.random.seed(self.seed) @staticmethod - def _ensure_walk_traversal_conditions(graph: nx.classes.graph.Graph) -> nx.classes.graph.Graph: + def _ensure_walk_traversal_conditions( + graph: nx.classes.graph.Graph, + ) -> nx.classes.graph.Graph: """Ensure walk traversal conditions.""" for node_index in trange( graph.number_of_nodes(), @@ -63,37 +60,34 @@ def _ensure_walk_traversal_conditions(graph: nx.classes.graph.Graph) -> nx.class # for this process to take a bit of time. disable=graph.number_of_nodes() < 10_000, desc="Checking main diagonal existance", - dynamic_ncols=True + dynamic_ncols=True, ): if not graph.has_edge(node_index, node_index): warnings.warn( - ( - "Please do be advised that " - "the graph you have provided does not " - "contain (some) edges in the main " - "diagonal, for instance the self-loop " - "constitued of ({}, {}). These selfloops " - "are necessary to ensure that the graph " - "is traversable, and for this reason we " - "create a copy of the graph and add therein " - "the missing edges. Since we are creating " - "a copy, this will immediately duplicate " - "the memory requirements. To avoid this double " - "allocation, you can provide the graph with the selfloops." - ).format( - node_index, - node_index - ) + "Please do be advised that " + "the graph you have provided does not " + "contain (some) edges in the main " + "diagonal, for instance the self-loop " + f"constitued of ({node_index}, {node_index}). These selfloops " + "are necessary to ensure that the graph " + "is traversable, and for this reason we " + "create a copy of the graph and add therein " + "the missing edges. Since we are creating " + "a copy, this will immediately duplicate " + "the memory requirements. 
To avoid this double " + "allocation, you can provide the graph with the selfloops." ) # We create a copy of the graph graph = graph.copy() # And we add the missing edges # for filling the main diagonal - graph.add_edges_from(( - (index, index) - for index in range(graph.number_of_nodes()) - if not graph.has_edge(index, index) - )) + graph.add_edges_from( + ( + (index, index) + for index in range(graph.number_of_nodes()) + if not graph.has_edge(index, index) + ) + ) break return graph diff --git a/karateclub/node_embedding/structural/__init__.py b/karateclub/node_embedding/structural/__init__.py index 5d4cca47..8377c96f 100644 --- a/karateclub/node_embedding/structural/__init__.py +++ b/karateclub/node_embedding/structural/__init__.py @@ -1,3 +1,4 @@ +"""Submodule for the structural node embedding methods.""" from .graphwave import GraphWave from .role2vec import Role2Vec from .sinr import SINr diff --git a/karateclub/node_embedding/structural/sinr.py b/karateclub/node_embedding/structural/sinr.py index 323d075a..b411883b 100644 --- a/karateclub/node_embedding/structural/sinr.py +++ b/karateclub/node_embedding/structural/sinr.py @@ -1,20 +1,30 @@ -from typing import Dict, List, Set +"""Implementation of SINr: Fast Computing of Sparse Interpretable Node Representations.""" + +from typing import List, Set, Optional import networkx as nx -from karateclub.estimator import Estimator -from scipy.sparse import coo_matrix, csr_matrix +from scipy.sparse import csr_matrix from sklearn.preprocessing import normalize import numpy as np +from karateclub.estimator import Estimator class SINr(Estimator): r"""An implementation of `"SINr" `_ from the IDA '21 best paper "SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!". - The procedure performs community detection using the Louvain algorithm, and computes the distribution of edges of each node across all communities. 
- The algorithm is one of the fastest, because it mostly relies on Louvain community detection. It thus runs in quasi-linear time. Regarding space complexity, the adjacency matrix and the community membership matrix need to be stored, it is also quasi-linear. + The procedure performs community detection using the Louvain algorithm, and computes the distribution of edges + of each node across all communities. + The algorithm is one of the fastest, because it mostly relies on Louvain community detection. + It thus runs in quasi-linear time. Regarding space complexity, the adjacency matrix and the community + membership matrix need to be stored, so it is also quasi-linear. Args: - gamma (int): modularity multi-resolution parameter. Default is 1. - The dimension parameter does not exist for SINr, gamma should be used instead: the number of dimensions of the embedding space is based on the number of communities uncovered. The higher gamma is, the more communities are detected, the higher the number of dimensions of the latent space are uncovered. For small graphs, setting gamma to 1 is usually sufficient. For bigger graphs, it is recommended to increase gamma (5 or 10 for example). For word co-occurrence graphs, to deal with word embedding, gamma is usually set to 50 in order to get many small communities. + gamma (int): modularity multi-resolution parameter. Default is 1. + The dimension parameter does not exist for SINr; gamma should be used instead: + the number of dimensions of the embedding space is based on the number of communities uncovered. + The higher gamma is, the more communities are detected and the higher the number of dimensions of + the uncovered latent space. For small graphs, setting gamma to 1 is usually sufficient. + For bigger graphs, it is recommended to increase gamma (5 or 10 for example). + For word co-occurrence graphs, to deal with word embedding, gamma is usually set to 50 in order to get many small communities. seed (int): Random seed value. 
Default is 42. """ @@ -23,10 +33,11 @@ def __init__( gamma: int = 1, seed: int = 42, ): - - self.gamma = gamma - self.seed = seed - + self.gamma: int = gamma + self.seed: int = seed + self.number_of_nodes: Optional[int] = None + self.number_of_communities: Optional[int] = None + self._embedding: Optional[np.ndarray] = None def fit(self, graph: nx.classes.graph.Graph): """ @@ -39,36 +50,70 @@ def fit(self, graph: nx.classes.graph.Graph): graph = self._check_graph(graph) # Get the adjacency matrix of the graph adjacency = nx.adjacency_matrix(graph) - norm_adjacency = normalize(adjacency, "l1") # Make rows of matrix sum at 1 + norm_adjacency = normalize(adjacency, "l1") # Make rows of matrix sum at 1 # Detect communities use louvain algorithm with the gamma resolution parameter - communities = nx.community.louvain_communities(graph, resolution = self.gamma, seed = self.seed) - self.dimensions = len(communities) + communities = nx.community.louvain_communities( + graph, resolution=self.gamma, seed=self.seed + ) + self.number_of_nodes = graph.number_of_nodes() + self.number_of_communities = len(communities) # Get the community membership of the graph membership_matrix = self._get_matrix_membership(communities) - #Computes the node-recall: for each node, the distribution of links across communities + # Computes the node-recall: for each node, the distribution of links across communities self._embedding = norm_adjacency.dot(membership_matrix) - - def _get_matrix_membership(self, list_of_communities:List[Set[int]]): + + def _get_matrix_membership(self, list_of_communities: List[Set[int]]): r"""Getting the membership matrix describing for each node (rows), in which community (column) it belongs. Return types: * **Membership matrix** *(scipy sparse matrix csr)* - Size nodes, communities """ - row = list() - col = list() - data = list() - for idx_c, community in enumerate(list_of_communities): + # Since we will have a lot of zeros, we use a sparse matrix. 
+ # We build a CSR matrix. + + # A CSR matrix is composed of three arrays: the data array, the indices array and the indptr array. + # The indices array is a 1D array that contains the column index (here, the node) of each non-zero value of the matrix. + nodes_per_community = np.empty(self.number_of_nodes, dtype=np.uint32) + # The indptr array is a 1D array that contains the offsets of the start of each row of the matrix. + communities_comulative_degrees = np.empty(self.number_of_communities + 1, dtype=np.uint32) + offset: int = 0 + + # For each community, we store the nodes that belong to it. + for column_index, community in enumerate(list_of_communities): + # We store the offset of the start of each row of the matrix. + communities_comulative_degrees[column_index] = offset + # We store the nodes that belong to the community. for node in community: - row.append(node) - col.append(idx_c) - data.append(1) - return coo_matrix((data, (row, col)), shape=(len(row), len(list_of_communities))).tocsr() - - + nodes_per_community[offset] = node + offset += 1 + + assert offset == self.number_of_nodes + + # We set the offset of the end of the last row of the matrix + # to the number of nodes, which the assertion above checks to be + # identical to the final value of the running offset. + communities_comulative_degrees[-1] = self.number_of_nodes + + # And finally we can build the matrix. + return csr_matrix( + ( + np.ones(self.number_of_nodes, dtype=np.float32), + nodes_per_community, + communities_comulative_degrees, + ), + shape=(self.number_of_communities, self.number_of_nodes), + ).T + def get_embedding(self) -> np.array: r"""Getting the node embedding. Return types: * **embedding** *(Numpy array)* - The embedding of nodes. """ + if self._embedding is None: + raise ValueError( + "No embedding has been computed. " + "Please call the fit method first." 
+ ) + return self._embedding.toarray() diff --git a/test/structral_node_embedding_test.py b/test/structral_node_embedding_test.py index d2b7dbb6..a2033701 100644 --- a/test/structral_node_embedding_test.py +++ b/test/structral_node_embedding_test.py @@ -89,8 +89,8 @@ def test_sinr(): embedding = model.get_embedding() assert embedding.shape[0] == graph.number_of_nodes() - assert embedding.shape[1] == model.dimensions - assert type(embedding) == np.ndarray + assert embedding.shape[1] == model.number_of_communities + assert isinstance(embedding, np.ndarray) model = SINr(gamma=5) @@ -101,8 +101,8 @@ def test_sinr(): embedding = model.get_embedding() assert embedding.shape[0] == graph.number_of_nodes() - assert embedding.shape[1] == model.dimensions + assert embedding.shape[1] == model.number_of_communities model2 = SINr(gamma=10) model2.fit(graph) - assert model2.dimensions > model.dimensions - assert type(embedding) == np.ndarray + assert model2.number_of_communities > model.number_of_communities + assert isinstance(embedding, np.ndarray)