From ea6066f85a7faa15813cd2d24996a21913389a16 Mon Sep 17 00:00:00 2001
From: Yzy <2154597198@qq.com>
Date: Mon, 4 Dec 2023 18:44:54 +0800
Subject: [PATCH] [Modify] Fix error in predict_weight()

---
 requirements.txt      |  5 ++++
 src/gnnwr/datasets.py | 41 ++++++++++++++++++-----------
 src/gnnwr/models.py   | 41 ++++++++++++++++++-----------
 src/gnnwr/utils.py    | 60 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 115 insertions(+), 32 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3c0d659..398e9b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,8 @@ scikit_learn>=1.0.2
 statsmodels>=0.13.5
 torch>=1.8.1
 tqdm>=4.63.0
+
+folium~=0.14.0
+branca~=0.6.0
+scipy~=1.10.1
+scikit-learn~=1.2.2
\ No newline at end of file

diff --git a/src/gnnwr/datasets.py b/src/gnnwr/datasets.py
index d2b62e6..3b6e548 100644
--- a/src/gnnwr/datasets.py
+++ b/src/gnnwr/datasets.py
@@ -117,19 +117,19 @@ def scale(self, scale_fn=None, scale_params=None):
         if scale_fn == "minmax_scale":
             self.scale_fn = "minmax_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             self.x_scale_info = {"min": x_scale_params.data_min_, "max": x_scale_params.data_max_}
             self.x_data = x_scale_params.transform(pd.DataFrame(self.x_data, columns=self.x))
-            # self.y_scale_info = {"min": y_scale_params.data_min_, "max": y_scale_params.data_max_}
-            # self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
+            self.y_scale_info = {"min": y_scale_params.data_min_, "max": y_scale_params.data_max_}
+            self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
         elif scale_fn == "standard_scale":
             self.scale_fn = "standard_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             self.x_scale_info = {"mean": x_scale_params.mean_, "var": x_scale_params.var_}
             self.x_data = x_scale_params.transform(pd.DataFrame(self.x_data, columns=self.x))
-            # self.y_scale_info = {"mean": y_scale_params.mean_, "var": y_scale_params.var_}
-            # self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
+            self.y_scale_info = {"mean": y_scale_params.mean_, "var": y_scale_params.var_}
+            self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
 
         self.getScaledDataframe()
 
@@ -154,17 +154,17 @@ def scale2(self, scale_fn, scale_params):
         if scale_fn == "minmax_scale":
             self.scale_fn = "minmax_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             # self.x_data = self.x_data * (x_scale_params["max"] - x_scale_params["min"]) + x_scale_params["min"]
             self.x_data = (self.x_data - x_scale_params["min"]) / (x_scale_params["max"] - x_scale_params["min"])
-            # self.y_data = self.y_data * (y_scale_params["max"] - y_scale_params["min"]) + y_scale_params["min"]
+            self.y_data = (self.y_data - y_scale_params["min"]) / (y_scale_params["max"] - y_scale_params["min"])
         elif scale_fn == "standard_scale":
             self.scale_fn = "standard_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             # self.x_data = self.x_data * np.sqrt(x_scale_params["var"]) + x_scale_params["mean"]
             self.x_data = (self.x_data - x_scale_params['mean']) / np.sqrt(x_scale_params["var"])
-            # self.y_data = self.y_data * np.sqrt(y_scale_params["var"]) + y_scale_params["mean"]
+            self.y_data = (self.y_data - y_scale_params['mean']) / np.sqrt(y_scale_params["var"])
 
         self.getScaledDataframe()
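Note: the `scale` and `scale2` hunks above route the dependent variable through the same saved scaling parameters as the covariates, replacing commented-out lines that applied the *inverse* transform. A minimal sketch of the distinction, with made-up numbers (the variable names below are illustrative, not from the library):

```python
import numpy as np

# Min-max parameters saved from a training split (placeholder values).
y_min, y_max = np.float32(10.0), np.float32(40.0)

new_y = np.array([25.0, 35.0], dtype=np.float32)

# Old, commented-out form: the inverse transform, which maps scaled values
# back to raw units -- wrong for incoming raw data.
wrong = new_y * (y_max - y_min) + y_min      # -> [760., 1060.]

# Patched form: the forward transform, mapping raw data into [0, 1]
# using the training split's parameters.
right = (new_y - y_min) / (y_max - y_min)    # -> [0.5, 0.8333]
```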
@@ -287,9 +287,11 @@ class predictDataset(Dataset):
     :param is_need_STNN: whether need STNN
     """
 
-    def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=[], is_need_STNN=False):
+    def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=None, is_need_STNN=False):
         # data = data.astype(np.float32)
+        if scale_info is None:
+            scale_info = []
         self.dataframe = data
         self.x = x_column
         if data is None:
@@ -368,7 +370,7 @@ def rescale(self, x):
 
         return x
 
-    def minmax_scaler(self, x, min=[], max=[]):
+    def minmax_scaler(self, x, min=None, max=None):
         """
         function of minmax scaler
 
@@ -377,13 +379,17 @@ def minmax_scaler(self, x, min=[], max=[]):
         :param max: maximum value of each attribute
         :return: Output attribute data
         """
+        if max is None:
+            max = []
+        if min is None:
+            min = []
         if len(min) == 0:
             x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
         else:
             x = (x - min) / (max - min)
         return x
 
-    def standard_scaler(self, x, mean=[], std=[]):
+    def standard_scaler(self, x, mean=None, std=None):
         """
         function of standard scaler
 
@@ -392,6 +398,10 @@ def standard_scaler(self, x, mean=[], std=[]):
         :param std: standard deviation of each attribute
         :return: Output attribute data
         """
+        if std is None:
+            std = []
+        if mean is None:
+            mean = []
         if len(mean) == 0:
             x = (x - x.mean(axis=0)) / x.std(axis=0)
         else:
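Note: the `scale_info=[]`, `min=[]`, `max=[]`, `mean=[]`, and `std=[]` defaults replaced above are instances of Python's mutable-default-argument pitfall: a default list is created once at function definition and shared across every call. A self-contained sketch of the pitfall and of the `None` sentinel pattern the patch adopts (names below are illustrative):

```python
def append_item(item, bucket=[]):          # anti-pattern: one list shared across calls
    bucket.append(item)
    return bucket

def append_item_safe(item, bucket=None):   # sentinel pattern used in the patch
    if bucket is None:
        bucket = []                        # fresh list on every call
    bucket.append(item)
    return bucket

append_item(1); print(append_item(2))            # [1, 2] -- state leaked between calls
append_item_safe(1); print(append_item_safe(2))  # [2]
```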
@@ -410,7 +420,7 @@ def BasicDistance(x, y):
     x = np.float32(x)
     y = np.float32(y)
     dist = distance.cdist(x, y, 'euclidean')
-    return dist  # np.float32(np.sqrt(np.sum((x[:, np.newaxis, :] - y) ** 2, axis=2)))
+    return dist
 
 
 def Manhattan_distance(x, y):
@@ -425,7 +435,7 @@
 
 
 def init_dataset(data, test_ratio, valid_ratio, x_column, y_column, spatial_column=None, temp_column=None,
-                 id_column=None, sample_seed=100, process_fn="minmax_scale", batch_size=32, shuffle=True,
+                 id_column=None, sample_seed=42, process_fn="minmax_scale", batch_size=32, shuffle=True,
                  use_class=baseDataset, spatial_fun=BasicDistance, temporal_fun=Manhattan_distance,
                  max_val_size=-1, max_test_size=-1, from_for_cv=0, is_need_STNN=False, Reference=None,
                  simple_distance=True):
@@ -439,6 +449,7 @@ def init_dataset(data, test_ratio, valid_ratio, x_column, y_column, spatial_colu
     :param y_column: output attribute column name
     :param spatial_column: spatial attribute column name
     :param temp_column: temporal attribute column name
+    :param id_column: id column name
     :param sample_seed: random seed
     :param process_fn: data pre-process function
     :param batch_size: batch size
diff --git a/src/gnnwr/models.py b/src/gnnwr/models.py
index 7b32410..33dc265 100644
--- a/src/gnnwr/models.py
+++ b/src/gnnwr/models.py
@@ -247,7 +247,7 @@ def init_optimizer(self, optimizer, optimizer_params=None):
             upepoch = optimizer_params.get("upepoch", 10000)
             uprate = (maxlr - minlr) / upepoch * (upepoch // 20)
             decayepoch = optimizer_params.get("decayepoch", 20000)
-            decayrate = optimizer_params.get("decayrate", 0.1)
+            decayrate = optimizer_params.get("decayrate", 0.95)
             stop_change_epoch = optimizer_params.get("stop_change_epoch", 30000)
             stop_lr = optimizer_params.get("stop_lr", 0.001)
             lamda_lr = lambda epoch: (epoch // (upepoch // 20)) * uprate + minlr if epoch < upepoch else (
@@ -422,6 +422,9 @@ def run(self, max_epoch=1, early_stop=-1, print_frequency=50, show_detailed_info
             if ``early_stop`` is ``-1``, the training will not stop until the max epoch
         print_frequency : int
             the frequency of printing the information (default: ``50``)
+
+        show_detailed_info : bool
+            if ``True``, the detailed information will be shown (default: ``True``)
         """
         self.__istrained = True
         if self._use_gpu:
@@ -560,7 +563,7 @@ def load_model(self, path, use_dict=False, map_location=None):
         path : str
             the path of the model
         use_dict : bool
-            whether use dict to load the model (default: ``False``)
+            whether the function uses a dict to load the model (default: ``False``)
         map_location : str
             the location of the model (default: ``None``)
             the location can be ``"cpu"`` or ``"cuda"``
@@ -635,7 +638,7 @@ def result(self, path=None, use_dict=False, map_location=None):
             the path of the model(default: ``None``)
             | if ``path`` is ``None``, the model will be loaded from ``self._modelSavePath + "/" + self._modelName + ".pkl"``
         use_dict : bool
-            whether use dict to load the model (default: ``False``)
+            whether the function uses a dict to load the model (default: ``False``)
             | if ``use_dict`` is ``True``, the model will be loaded from ``path`` as dict
         map_location : str
             the location of the model (default: ``None``)
@@ -664,24 +667,32 @@ def result(self, path=None, use_dict=False, map_location=None):
             logging.info("Test Loss: " + str(self.__testLoss) + "; Test R2: " + str(self.__testr2))
         # print result
         # basic information
-        print("--------------------Result Table--------------------\n")
+        print("--------------------Model Information-----------------")
         print("Model Name: |", self._modelName)
         print("Model Structure: |\n", self._model)
         print("Optimizer: |\n", self._optimizer)
         print("independent variable: |", self._train_dataset.x)
         print("dependent variable: |", self._train_dataset.y)
-        print("\n----------------------------------------------------\n")
-        print("Test Loss: ", self.__testLoss, " Test R2: ", self.__testr2)
-        if self._valid_r2 is not None and self._valid_r2 != float('-inf'):
-            print("Train R2: {:5f}".format(self._besttrainr2), " Valid R2: ", self._bestr2)
         # OLS
-        print("\nOLS: |", self._weight)
-        # Diagnostics
-        print("R2: |", self.__testr2)
-        print("RMSE: | {:5f}".format(self._test_diagnosis.RMSE().data))
-        print("AIC: | {:5f}".format(self._test_diagnosis.AIC()))
-        print("AICc: | {:5f}".format(self._test_diagnosis.AICc()))
-        print("F1: | {:5f}".format(self._test_diagnosis.F1_GNN().data))
+        print("\nOLS weight:|", end=" ")
+        for i in range(len(self._weight)):
+            print(" {:.5f}".format(self._weight[i]), end=" ")
+        print("\n")
+        print("\n--------------------Result Information----------------")
+        print("Test Loss: | {:>25.5f}".format(self.__testLoss))
+        print("Test R2 : | {:>25.5f}".format(self.__testr2))
+        if self._valid_r2 is not None and self._valid_r2 != float('-inf'):
+            print("Train R2 : | {:>25.5f}".format(self._besttrainr2))
+            print("Valid R2 : | {:>25.5f}".format(self._valid_r2))
+        print("RMSE: | {:>30.5f}".format(self._test_diagnosis.RMSE().data))
+        print("AIC: | {:>30.5f}".format(self._test_diagnosis.AIC()))
+        print("AICc: | {:>30.5f}".format(self._test_diagnosis.AICc()))
+        print("F1: | {:>30.5f}".format(self._test_diagnosis.F1_Global().data))
+        print("F2: | {:>30.5f}".format(self._test_diagnosis.F2_Global().flatten()[0].data))
+        F3_Local_dict = self._test_diagnosis.F3_Local()[0]
+        for key in F3_Local_dict:
+            width = 30 - (len(key) - 4)
+            print("{}: | {:>{width}.5f}".format(key, F3_Local_dict[key].data, width=width))
 
     def reg_result(self, filename=None, model_path=None, use_dict=False, only_return=False, map_location=None):
         """
diff --git a/src/gnnwr/utils.py b/src/gnnwr/utils.py
index b7a3450..5ab53e7 100644
--- a/src/gnnwr/utils.py
+++ b/src/gnnwr/utils.py
@@ -3,7 +3,8 @@
 import pandas as pd
 import torch
 import warnings
-import copy
+from scipy.stats import f
+from scipy.stats import t
 import folium
 from folium.plugins import HeatMap, MarkerCluster
 import branca
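Note: in the rewritten `result()` table above, the `F3_Local` rows keep a fixed overall width by shrinking the numeric field as the label grows, so label + `": | "` + field always spans the same number of columns. A small sketch with placeholder values (not output from the patch):

```python
# Each row spans len(key) + 4 + (30 - (len(key) - 4)) = 38 columns,
# so the right-aligned numbers line up regardless of label length.
rows = {"f3_param_0": 1.23456789, "f3_param_12": 0.98765432}  # placeholder diagnostics
for key, value in rows.items():
    width = 30 - (len(key) - 4)
    print("{}: | {:>{width}.5f}".format(key, value, width=width))
```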
@@ -42,6 +43,7 @@ class DIAGNOSIS:
     """
 
     def __init__(self, weight, x_data, y_data, y_pred):
+
         self.__weight = weight
         self.__x_data = x_data
         self.__y_data = y_data
@@ -61,10 +63,13 @@ def __init__(self, weight, x_data, y_data, y_pred):
         gtweight_3d = torch.diag_embed(self.__weight)
         hatS_temp = torch.matmul(gtweight_3d,
                                  torch.matmul(torch.inverse(torch.matmul(x_data_tile_t, x_data_tile)), x_data_tile_t))
+        self.__hat_temp = hatS_temp
         hatS = torch.matmul(x_data.view(-1, 1, x_data.size(1)), hatS_temp)
         hatS = hatS.view(-1, self.__n)
         self.__hat = hatS
         self.__S = torch.trace(self.__hat)
+        self.f3_dict = None
+        self.f3_dict_2 = None
 
     def hat(self):
         """
@@ -72,17 +77,68 @@ def __init__(self, weight, x_data, y_data, y_pred):
         """
         return self.__hat
 
-    def F1_GNN(self):
+    def F1_Global(self):
         """
         :return: F1-test
         """
         k1 = self.__n - 2 * torch.trace(self.__hat) + \
              torch.trace(torch.mm(self.__hat.transpose(-2, -1), self.__hat))
+
         k2 = self.__n - self.__k - 1
         rss_olr = torch.sum(
             (torch.mean(self.__y_data) - torch.mm(self.__ols_hat, self.__y_data)) ** 2)
+        F_value = self.__ssr / k1 / (rss_olr / k2)
+        # p_value = f.sf(F_value, k1, k2)
         return self.__ssr / k1 / (rss_olr / k2)
 
+    def F2_Global(self):
+        """
+        :return: F2-test
+        """
+        # A = (I - H) - (I - S)^T*(I - S)
+        A = (torch.eye(self.__n) - self.__ols_hat) - torch.mm(
+            (torch.eye(self.__n) - self.__hat).transpose(-2, -1),
+            (torch.eye(self.__n) - self.__hat))
+        v1 = torch.trace(A)
+        # DSS = y^T*A*y
+        DSS = torch.mm(self.__y_data.transpose(-2, -1), torch.mm(A, self.__y_data))
+        k2 = self.__n - self.__k - 1
+        rss_olr = torch.sum(
+            (torch.mean(self.__y_data) - torch.mm(self.__ols_hat, self.__y_data)) ** 2)
+
+        return DSS / v1 / (rss_olr / k2)
+
+    def F3_Local(self):
+        """
+        :return: F1-test of each variable
+        """
+
+        ek_dict = {}
+        self.f3_dict = {}
+        self.f3_dict_2 = {}
+        for i in range(self.__x_data.size(1)):
+            ek_zeros = torch.zeros([self.__x_data.size(1)])
+            ek_zeros[i] = 1
+            ek_dict['ek' + str(i)] = torch.reshape(torch.reshape(torch.tile(ek_zeros.clone().detach(), [self.__n]),
+                                                                 [self.__n, -1]),
+                                                   [-1, 1, self.__x_data.size(1)])
+            hatB = torch.matmul(ek_dict['ek' + str(i)], self.__hat_temp)
+            hatB = torch.reshape(hatB, [-1, self.__n])
+
+            J_n = torch.ones([self.__n, self.__n]) / self.__n
+            L = torch.matmul(hatB.transpose(-2, -1), torch.matmul(torch.eye(self.__n) - J_n, hatB))
+
+            vk2 = 1 / self.__n * torch.matmul(self.__y_data.transpose(-2, -1), torch.matmul(L, self.__y_data))
+            trace_L = torch.trace(1 / self.__n * L)
+            f3 = torch.squeeze(vk2 / trace_L / (self.__ssr / self.__n))
+            self.f3_dict['f3_param_' + str(i)] = f3
+
+            bk = torch.matmul(hatB, self.__y_data)
+            vk2_2 = 1 / self.__n * torch.sum((bk - torch.mean(bk)) ** 2)
+            f3_2 = torch.squeeze(vk2_2 / trace_L / (self.__ssr / self.__n))
+            self.f3_dict_2['f3_param_' + str(i)] = f3_2
+        return self.f3_dict, self.f3_dict_2
+
     def AIC(self):
         """
         :return: AIC
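Note: `F1_Global` above computes the F statistic but leaves the p-value line commented out; the new `from scipy.stats import f` import supports exactly that step. A hedged sketch of how the commented line would be used, with placeholder numbers rather than values from the patch:

```python
from scipy.stats import f

F_value, k1, k2 = 2.5, 12.0, 40.0   # placeholder statistic and degrees of freedom
p_value = f.sf(F_value, k1, k2)     # survival function: P(F >= F_value) under H0
print(round(p_value, 4))
```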