From dea6d4507080a7ed135bdbe5054b1ec13b6c9b58 Mon Sep 17 00:00:00 2001
From: W0lfgunbl00d
Date: Wed, 20 Nov 2024 11:44:20 +0100
Subject: [PATCH 1/3] Create AUC_SGD.py

---
 river/linear_model/AUC_SGD.py | 53 +++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 river/linear_model/AUC_SGD.py

diff --git a/river/linear_model/AUC_SGD.py b/river/linear_model/AUC_SGD.py
new file mode 100644
index 0000000000..ba350de660
--- /dev/null
+++ b/river/linear_model/AUC_SGD.py
@@ -0,0 +1,53 @@
+import numpy as np
+
+
+def sigma_eps(z, eps):
+    return 1 / (1 + np.exp(-z / eps))
+
+
+class AUCMetric:
+    def __init__(self):
+        super().__init__()
+        self.positive_scores = []
+        self.negative_scores = []
+
+    def update(self, y_true, y_pred):
+        """Updates the metric with the new prediction and true label."""
+        if y_true == 1:
+            self.positive_scores.append(y_pred)
+        else:
+            self.negative_scores.append(y_pred)
+        return self
+
+    def get(self, X_train, y_train, X_test, y_test, epochs=900, lr=0.5, n_mc=500, gamma=1e-4, eps=0.01):
+        """
+        Implements the stochastic gradient ascent method to optimize theta and computes the AUC
+        based on the accumulated scores.
+
+        Parameters:
+        - X_train: Training feature matrix.
+        - y_train: Training labels.
+        - X_test: Test feature matrix.
+        - y_test: Test labels.
+        - epochs: Number of training epochs.
+        - lr: Initial learning rate.
+        - n_mc: Number of Monte Carlo samples for gradient estimation.
+        - gamma: Learning rate discount factor.
+        - eps: Smoothing parameter for the sigmoid function.
+
+        Returns:
+        - auc: Final AUC score based on the accumulated scores.
+        """
+
+        # Define the stochastic gradient function
+        def stochastic_gradient(theta, X1, X0, N, eps, random_state=1):
+            np.random.seed(random_state)
+            indices_1 = np.random.choice(np.arange(X1.shape[0]), size=N)
+            indices_0 = np.random.choice(np.arange(X0.shape[0]), size=N)
+            X1_, X0_ = X1[indices_1], X0[indices_0]
+            avg = np.zeros_like(theta)
+            for xi, xj in zip(X1_, X0_):
+                dx = xj - xi
+                sig = sigma_eps(np.dot(theta, dx))
+                avg += sig * (1 - sig) * dx
+            return avg / (N * eps)
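
What patch 1 is computing: the AUC of a linear scorer theta @ x counts the positive/negative pairs it ranks correctly, and sigma_eps replaces that 0/1 indicator with a steep sigmoid so the pairwise objective becomes differentiable in theta; stochastic_gradient then estimates the gradient from N sampled pairs. (Note the sigma_eps call here omits its eps argument; patch 2 below fixes that.) A minimal sketch of the surrogate under that reading, where smoothed_auc is an illustrative helper and not part of the patch:

import numpy as np

def sigma_eps(z, eps):
    # Steep sigmoid: tends to the 0/1 ranking indicator as eps -> 0.
    return 1 / (1 + np.exp(-z / eps))

def smoothed_auc(theta, X1, X0, eps=0.01):
    # Hypothetical helper: mean of sigma_eps over every positive/negative pair,
    # a differentiable stand-in for the exact AUC of the scorer x -> theta @ x.
    margins = (X1 @ theta)[:, None] - (X0 @ theta)[None, :]
    return sigma_eps(margins, eps).mean()

rng = np.random.default_rng(0)
X1 = rng.normal(1.0, 1.0, size=(50, 3))  # positives score higher on average
X0 = rng.normal(0.0, 1.0, size=(50, 3))
print(smoothed_auc(np.ones(3), X1, X0))  # close to the exact AUC of this scorer
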
From ac717883b557964fa4babd645d9e7092e818e57d Mon Sep 17 00:00:00 2001
From: W0lfgunbl00d
Date: Sun, 1 Dec 2024 14:48:26 +0100
Subject: [PATCH 2/3] AUC_SGD

---
 river/linear_model/AUC_SGD.py  | 166 +++++++++++++++++++++++++++------
 river/linear_model/__init__.py |   2 +
 2 files changed, 138 insertions(+), 30 deletions(-)

diff --git a/river/linear_model/AUC_SGD.py b/river/linear_model/AUC_SGD.py
index ba350de660..c9a20fc47f 100644
--- a/river/linear_model/AUC_SGD.py
+++ b/river/linear_model/AUC_SGD.py
@@ -1,53 +1,159 @@
+from __future__ import annotations
+
 import numpy as np

-def sigma_eps(z, eps):
-    return 1 / (1 + np.exp(-z / eps))

+class AUC_SGD:
+    """
+    AUC Stochastic Gradient Descent (SGD)
+
+    This class implements an SGD-based optimization method for maximizing the AUC (Area Under the Curve)
+    of a binary classifier, assuming a linear scoring model.
+
+    Attributes
+    ----------
+    epochs : int
+        Number of training epochs.
+    lr : float
+        Initial learning rate for gradient descent updates.
+    n_mc : int
+        Number of Monte Carlo samples used for estimating gradients.
+    gamma : float
+        Learning rate decay parameter.
+    eps : float
+        Smoothing parameter for numerical stability.
+
+    Methods
+    -------
+    getTrain(X_train, y_train):
+        Returns the predictions that maximize the training AUC score.
+    getTest(X_train, X_test, y_train):
+        Returns the predictions that maximize the testing AUC score.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from river import linear_model
+    >>> from sklearn.metrics import roc_auc_score
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> X, y = make_classification(n_samples=2000, n_informative=9, n_redundant=0, n_repeated=0, random_state=2)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
+    >>> base = LogisticRegression().fit(X_train, y_train)
+    >>> X1 = X_train[y_train == 1]
+    >>> X0 = X_train[y_train == 0]
+    >>> model = linear_model.AUC_SGD()
+    >>> np.random.seed(123)
+    >>> theta = np.random.randn(X_train[0].shape[0])
+    >>> test_auc = model.getTest(X_train, X_test, y_train)
+    >>> train_auc = model.getTrain(X_train, y_train)
+    >>> print(roc_auc_score(y_train, train_auc))
+    0.8899135830932864
+    >>> print(roc_auc_score(y_test, test_auc))
+    0.8849634963496349
+    """
+
+    def __init__(self, epochs=900, lr=0.5, n_mc=500, gamma=1e-4, eps=0.01):
         super().__init__()
-        self.positive_scores = []
-        self.negative_scores = []
+        self.epochs = epochs
+        self.lr = lr
+        self.n_mc = n_mc
+        self.gamma = gamma
+        self.eps = eps

-    def update(self, y_true, y_pred):
-        """Updates the metric with the new prediction and true label."""
-        if y_true == 1:
-            self.positive_scores.append(y_pred)
+    def sigma_eps(self, z, eps):
+        q = -z / eps
+        if abs(q) < 35:
+            return 1 / (1 + np.exp(q))
+        elif q > 0:
+            return 0
         else:
-            self.negative_scores.append(y_pred)
-        return self
+            return 1
+

-    def get(self, X_train, y_train, X_test, y_test, epochs=900, lr=0.5, n_mc=500, gamma=1e-4, eps=0.01):
+    def stochastic_gradient(self, theta, X1, X0, N=1000, eps=0.01, random_state=1):
         """
-        Implements the stochastic gradient ascent method to optimize theta and computes the AUC
-        based on the accumulated scores.
+        Computes the stochastic gradient of the AUC objective.
+
+        This function calculates the gradient of the AUC metric with respect to the parameter `theta` using
+        Monte Carlo sampling. Positive and negative samples are drawn to estimate the difference in predictions.

-        Parameters:
-        - X_train: Training feature matrix.
-        - y_train: Training labels.
-        - X_test: Test feature matrix.
-        - y_test: Test labels.
-        - epochs: Number of training epochs.
-        - lr: Initial learning rate.
-        - n_mc: Number of Monte Carlo samples for gradient estimation.
-        - gamma: Learning rate discount factor.
-        - eps: Smoothing parameter for the sigmoid function.
+        Parameters
+        ----------
+        theta : numpy.ndarray
+            The parameter vector (weights) of the model.
+        X1 : numpy.ndarray
+            Feature matrix for the positive class (label = 1).
+        X0 : numpy.ndarray
+            Feature matrix for the negative class (label = 0).
+        N : int, optional, default=1000
+            Number of Monte Carlo samples to use for gradient estimation.
+        eps : float, optional, default=0.01
+            Smoothing parameter for the sigmoid function.
+        random_state : int, optional, default=1
+            Random seed for reproducibility.

-        Returns:
-        - auc: Final AUC score based on the accumulated scores.
+        Returns
+        -------
+        numpy.ndarray
+            The estimated gradient vector.
         """

-        # Define the stochastic gradient function
-        def stochastic_gradient(theta, X1, X0, N, eps, random_state=1):
         np.random.seed(random_state)
         indices_1 = np.random.choice(np.arange(X1.shape[0]), size=N)
         indices_0 = np.random.choice(np.arange(X0.shape[0]), size=N)
         X1_, X0_ = X1[indices_1], X0[indices_0]
         avg = np.zeros_like(theta)
+
         for xi, xj in zip(X1_, X0_):
             dx = xj - xi
-            sig = sigma_eps(np.dot(theta, dx))
+            sig = self.sigma_eps(np.dot(theta, dx), eps)
             avg += sig * (1 - sig) * dx
         return avg / (N * eps)
+
+    def compute(self, X_train, X_test, y_train):
+        X1 = X_train[y_train == 1]
+        X0 = X_train[y_train == 0]
+        np.random.seed(123)
+        theta = np.random.randn(X_train[0].shape[0])
+        epochs_list = list(range(self.epochs))
+
+        for seed, epoch in enumerate(epochs_list):
+            # learning rate scheduler
+            self.lr = self.lr / (1 + self.gamma)
+
+            theta = theta - self.lr * self.stochastic_gradient(theta, X1, X0, N=self.n_mc, random_state=seed)
+
+        if X_test is not None:
+            return theta @ X_test.T
+        else:
+            return theta @ X_train.T
+
+    def getTrain(self, X_train, y_train):
+        """
+        Implements the stochastic gradient ascent method to optimize theta for a maximised AUC training score.
+
+        Parameters:
+        - X_train: Training feature matrix.
+        - y_train: Training labels.
+
+        Returns:
+        - Predictions that maximize the training AUC score.
+        """
+
+        return self.compute(X_train, None, y_train)
+
+    def getTest(self, X_train, X_test, y_train):
+        """
+        Implements the stochastic gradient ascent method to optimize theta for a maximised AUC testing score.
+
+        Parameters:
+        - X_train: Training feature matrix.
+        - X_test: Testing feature matrix.
+        - y_train: Training labels.
+
+        Returns:
+        - Predictions that maximize the testing AUC score.
+        """
+        return self.compute(X_train, X_test, y_train)
diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py
index 756720490a..2c3847a33e 100644
--- a/river/linear_model/__init__.py
+++ b/river/linear_model/__init__.py
@@ -10,6 +10,7 @@
 from .pa import PAClassifier, PARegressor
 from .perceptron import Perceptron
 from .softmax import SoftmaxRegression
+from .AUC_SGD import AUC_SGD

 __all__ = [
     "base",
@@ -21,4 +22,5 @@
     "PARegressor",
     "Perceptron",
     "SoftmaxRegression",
+    "AUC_SGD",
 ]
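
The piecewise sigma_eps that patch 2 introduces guards the exponential: once |z / eps| reaches 35 the sigmoid is already saturated to 0 or 1 at double precision, so the method returns the limit directly instead of evaluating np.exp on an extreme argument. A standalone sketch of the same logic, illustrative and not part of the diff:

import numpy as np

def sigma_eps_stable(z, eps):
    # Mirrors the patched method: evaluate the sigmoid only while the result is
    # distinguishable from 0 or 1 in float64; beyond |q| = 35 return the
    # saturated limit and avoid overflow in np.exp for extreme q.
    q = -z / eps
    if abs(q) < 35:
        return 1 / (1 + np.exp(q))
    return 0.0 if q > 0 else 1.0

print(sigma_eps_stable(0.0, 0.01))    # 0.5 at the decision boundary
print(sigma_eps_stable(-10.0, 0.01))  # 0.0; the naive form hits np.exp(1000) and warns of overflow
print(sigma_eps_stable(10.0, 0.01))   # 1.0

One behavioral note on compute(): it decays self.lr in place (lr / (1 + gamma) once per epoch), so a second call to getTrain or getTest resumes from the already-decayed rate rather than from the constructor's value.
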
""" - # Define the stochastic gradient function - def stochastic_gradient(theta, X1, X0, N, eps, random_state=1): np.random.seed(random_state) indices_1 = np.random.choice(np.arange(X1.shape[0]), size=N) indices_0 = np.random.choice(np.arange(X0.shape[0]), size=N) X1_, X0_ = X1[indices_1], X0[indices_0] avg = np.zeros_like(theta) + for xi, xj in zip(X1_, X0_): dx = xj - xi - sig = sigma_eps(np.dot(theta, dx)) + sig = self.sigma_eps(np.dot(theta, dx), eps) avg += sig * (1 - sig) * dx return avg / (N * eps) + def compute(self, X_train, X_test, y_train): + X1 = X_train[y_train == 1] + X0 = X_train[y_train == 0] + np.random.seed(123) + theta = np.random.randn(X_train[0].shape[0]) + epochs_list = list(range(self.epochs)) + + for seed, epoch in enumerate(epochs_list): + # learning rate scheduler + self.lr = self.lr / (1 + self.gamma) + + theta = theta - self.lr * self.stochastic_gradient(theta, X1, X0, N=self.n_mc, random_state=seed) + + if X_test is not None: + return theta @ X_test.T + else: + return theta @ X_train.T + + def getTrain(self, X_train, y_train): + """ + Implements the stochastic gradient ascent method to optimize theta for a maximised AUC training score. + + Parameters: + - X_train: Training feature matrix. + - y_train: Training labels. + + Returns: + - Prediction to maximize training AUC score. + """ + + return self.compute(X_train, None, y_train) + + def getTest(self, X_train, X_test, y_train): + """ + Implements the stochastic gradient ascent method to optimize theta for a maximised AUC testing score. + + Parameters: + - X_train: Training feature matrix. + - X_test: Testing feature matrix. + - y_train: Training labels. + + Returns: + - Prediction to maximize testing AUC score. + """ + return self.compute(X_train, X_test, y_train) diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 756720490a..2c3847a33e 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -10,6 +10,7 @@ from .pa import PAClassifier, PARegressor from .perceptron import Perceptron from .softmax import SoftmaxRegression +from .AUC_SGD import AUC_SGD __all__ = [ "base", @@ -21,4 +22,5 @@ "PARegressor", "Perceptron", "SoftmaxRegression", + "AUC_SGD", ] From c1d08937611d73bb95d831802b267f066bd0d7db Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Sun, 1 Dec 2024 14:49:14 +0100 Subject: [PATCH 3/3] Update AUC_SGD.py --- river/linear_model/AUC_SGD.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/linear_model/AUC_SGD.py b/river/linear_model/AUC_SGD.py index c9a20fc47f..3fd2f51a75 100644 --- a/river/linear_model/AUC_SGD.py +++ b/river/linear_model/AUC_SGD.py @@ -132,7 +132,7 @@ def compute(self, X_train, X_test, y_train): def getTrain(self, X_train, y_train): """ - Implements the stochastic gradient ascent method to optimize theta for a maximised AUC training score. + Implements the stochastic gradient ascent method to optimize theta for a maximized AUC training score. Parameters: - X_train: Training feature matrix. @@ -146,7 +146,7 @@ def getTrain(self, X_train, y_train): def getTest(self, X_train, X_test, y_train): """ - Implements the stochastic gradient ascent method to optimize theta for a maximised AUC testing score. + Implements the stochastic gradient ascent method to optimize theta for a maximized AUC testing score. Parameters: - X_train: Training feature matrix.