From f91b050868dcc61fcbe39911486a2d567711e57f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bojan=20Karla=C5=A1?=
Date: Sat, 28 Sep 2024 15:39:25 -0400
Subject: [PATCH] Add the faiss-based KNN model and custom metrics.

---
Review note: the lines added for FaissKNN were edited after this patch was
exported, so the post-image blob id on the "index" line below is stale.

 .../datascope/experiments/pipelines/models.py | 121 +++++++++++++++++-
 1 file changed, 114 insertions(+), 7 deletions(-)

diff --git a/experiments/datascope/experiments/pipelines/models.py b/experiments/datascope/experiments/pipelines/models.py
index 2fa0569..447749c 100644
--- a/experiments/datascope/experiments/pipelines/models.py
+++ b/experiments/datascope/experiments/pipelines/models.py
@@ -1,3 +1,4 @@
+import faiss
 import numpy as np
 import tempfile
 import torch
@@ -91,6 +92,51 @@ def predict_proba(self, X: Union[NDArray, DataFrame]) -> NDArray:
         return self.model.predict_proba(X)
 
 
+class FaissKNN(BaseEstimator, ClassifierMixin):
+    """K-nearest-neighbors classifier backed by an exact (brute-force) faiss L2 index.
+
+    Labels are encoded as consecutive integers internally; ``predict`` maps them back to
+    their original values and ``predict_proba`` orders its columns like ``classes_``.
+    """
+
+    def __init__(self, n_neighbors=5):
+        self.n_neighbors = n_neighbors
+        self.index = None
+        self.y = None
+        self.label_encoder = LabelEncoder()
+
+    def fit(self, X, y):
+        """Index the training points ``X`` and store their encoded labels ``y``."""
+        # faiss expects C-contiguous float32 arrays; this also converts data frames.
+        self.X = np.ascontiguousarray(X, dtype=np.float32)
+        self.y = self.label_encoder.fit_transform(y)
+        self.classes_ = self.label_encoder.classes_
+        self.index = faiss.IndexFlatL2(self.X.shape[1])  # L2 distance index
+        self.index.add(self.X)
+        return self
+
+    def _vote_counts(self, X):
+        """Return a (n_queries, n_classes) array counting the neighbor labels of each query."""
+        X = np.ascontiguousarray(X, dtype=np.float32)
+        # faiss pads the result with -1 when fewer than k points are indexed, so clamp k.
+        k = min(self.n_neighbors, self.index.ntotal)
+        _, indices = self.index.search(X, k)
+        num_classes = len(self.label_encoder.classes_)
+        counts = [np.bincount(votes, minlength=num_classes) for votes in self.y[indices]]
+        # The reshape keeps the result two-dimensional even when X has no rows.
+        return np.array(counts, dtype=np.int64).reshape(-1, num_classes)
+
+    def predict(self, X):
+        """Return the majority label among the nearest neighbors of each row of ``X``."""
+        # Votes are counted over encoded labels, so map the winners back to the originals.
+        return self.label_encoder.inverse_transform(np.argmax(self._vote_counts(X), axis=1))
+
+    def predict_proba(self, X):
+        """Return the fraction of neighbor votes per class, with columns ordered like ``classes_``."""
+        counts = self._vote_counts(X)
+        return counts / counts.sum(axis=1, keepdims=True)
+
+
 class EvalLoggerCallback(TrainerCallback):
     def __init__(
         self,
@@ -457,44 +503,105 @@ def construct(self: "RandomForestModel", dataset:
Dataset) -> BaseEstimator:
 
 
 class KNearestNeighborsModel(BaseModel, id="knn", longname="K-Nearest Neighbors"):
-    def __init__(self, num_neighbors: int = 1, **kwargs) -> None:
+    def __init__(self, num_neighbors: int = 1, metric: str = "minkowski", **kwargs) -> None:
         self._num_neighbors = num_neighbors
+        self._metric = metric
 
     @attribute
     def num_neighbors(self) -> int:
         """Number of neighbors to use."""
         return self._num_neighbors
 
+    @attribute
+    def metric(self) -> str:
+        """The distance metric to use."""
+        return self._metric
+
     def construct(self: "KNearestNeighborsModel", dataset: Dataset) -> BaseEstimator:
-        return KNeighborsClassifier(n_neighbors=self.num_neighbors)
+        return KNeighborsClassifier(n_neighbors=self.num_neighbors, metric=self.metric)
 
 
 class KNearestNeighborsModelK1(KNearestNeighborsModel, id="knn-1", longname="K-Nearest Neighbors (K=1)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=1, metric=metric)
+
+
+class KNearestNeighborsModelK3(KNearestNeighborsModel, id="knn-3", longname="K-Nearest Neighbors (K=3)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=3, metric=metric)
+
+
+class KNearestNeighborsModelK5(KNearestNeighborsModel, id="knn-5", longname="K-Nearest Neighbors (K=5)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=5, metric=metric)
+
+
+class KNearestNeighborsModelK10(KNearestNeighborsModel, id="knn-10", longname="K-Nearest Neighbors (K=10)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=10, metric=metric)
+
+
+class KNearestNeighborsModelK50(KNearestNeighborsModel, id="knn-50", longname="K-Nearest Neighbors (K=50)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=50, metric=metric)
+
+
+class KNearestNeighborsModelK100(KNearestNeighborsModel, id="knn-100", longname="K-Nearest Neighbors (K=100)"):
+    def __init__(self, metric: str = "minkowski", **kwargs) -> None:
+        super().__init__(num_neighbors=100, metric=metric)
+
+
+class FastKNearestNeighborsModel(BaseModel, id="fast-knn", longname="Fast K-Nearest Neighbors"):
+    def __init__(self, num_neighbors: int = 1, **kwargs) -> None:
+        self._num_neighbors = num_neighbors
+
+    @attribute
+    def num_neighbors(self) -> int:
+        """Number of neighbors to use."""
+        return self._num_neighbors
+
+    def construct(self: "FastKNearestNeighborsModel", dataset: Dataset) -> BaseEstimator:
+        return FaissKNN(n_neighbors=self.num_neighbors)
+
+
+class FastKNearestNeighborsModelK1(
+    FastKNearestNeighborsModel, id="fast-knn-1", longname="Fast K-Nearest Neighbors (K=1)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=1)
 
 
-class KNearestNeighborsModelK3(KNearestNeighborsModel, id="knn-3", longname="K-Nearest Neighbors (K=3)"):
+class FastKNearestNeighborsModelK3(
+    FastKNearestNeighborsModel, id="fast-knn-3", longname="Fast K-Nearest Neighbors (K=3)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=3)
 
 
-class KNearestNeighborsModelK5(KNearestNeighborsModel, id="knn-5", longname="K-Nearest Neighbors (K=5)"):
+class FastKNearestNeighborsModelK5(
+    FastKNearestNeighborsModel, id="fast-knn-5", longname="Fast K-Nearest Neighbors (K=5)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=5)
 
 
-class KNearestNeighborsModelK10(KNearestNeighborsModel, id="knn-10", longname="K-Nearest Neighbors (K=10)"):
+class FastKNearestNeighborsModelK10(
+    FastKNearestNeighborsModel, id="fast-knn-10", longname="Fast K-Nearest Neighbors (K=10)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=10)
 
 
-class KNearestNeighborsModelK50(KNearestNeighborsModel, id="knn-50", longname="K-Nearest Neighbors (K=50)"):
+class FastKNearestNeighborsModelK50(
+    FastKNearestNeighborsModel, id="fast-knn-50", longname="Fast K-Nearest Neighbors (K=50)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=50)
 
 
-class KNearestNeighborsModelK100(KNearestNeighborsModel, id="knn-100", longname="K-Nearest Neighbors (K=100)"):
+class FastKNearestNeighborsModelK100(
+    FastKNearestNeighborsModel, id="fast-knn-100", longname="Fast K-Nearest Neighbors (K=100)"
+):
     def __init__(self, **kwargs) -> None:
         super().__init__(num_neighbors=100)
 