From ea6066f85a7faa15813cd2d24996a21913389a16 Mon Sep 17 00:00:00 2001
From: Yzy <2154597198@qq.com>
Date: Mon, 4 Dec 2023 18:44:54 +0800
Subject: [PATCH] [Modify] Fix error in predict_weight()

---
 requirements.txt      |  5 ++++
 src/gnnwr/datasets.py | 41 ++++++++++++++++++-----------
 src/gnnwr/models.py   | 41 ++++++++++++++++++-----------
 src/gnnwr/utils.py    | 60 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 115 insertions(+), 32 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3c0d659..398e9b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,8 @@ scikit_learn>=1.0.2
 statsmodels>=0.13.5
 torch>=1.8.1
 tqdm>=4.63.0
+
+folium~=0.14.0
+branca~=0.6.0
+scipy~=1.10.1
+scikit-learn~=1.2.2
\ No newline at end of file

diff --git a/src/gnnwr/datasets.py b/src/gnnwr/datasets.py
index d2b62e6..3b6e548 100644
--- a/src/gnnwr/datasets.py
+++ b/src/gnnwr/datasets.py
@@ -117,19 +117,19 @@ def scale(self, scale_fn=None, scale_params=None):
         if scale_fn == "minmax_scale":
             self.scale_fn = "minmax_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             self.x_scale_info = {"min": x_scale_params.data_min_, "max": x_scale_params.data_max_}
             self.x_data = x_scale_params.transform(pd.DataFrame(self.x_data, columns=self.x))
-            # self.y_scale_info = {"min": y_scale_params.data_min_, "max": y_scale_params.data_max_}
-            # self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
+            self.y_scale_info = {"min": y_scale_params.data_min_, "max": y_scale_params.data_max_}
+            self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
         elif scale_fn == "standard_scale":
             self.scale_fn = "standard_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             self.x_scale_info = {"mean": x_scale_params.mean_, "var": x_scale_params.var_}
             self.x_data = x_scale_params.transform(pd.DataFrame(self.x_data, columns=self.x))
-            # self.y_scale_info = {"mean": y_scale_params.mean_, "var": y_scale_params.var_}
-            # self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
+            self.y_scale_info = {"mean": y_scale_params.mean_, "var": y_scale_params.var_}
+            self.y_data = y_scale_params.transform(pd.DataFrame(self.y_data, columns=self.y))
 
         self.getScaledDataframe()
 
@@ -154,17 +154,17 @@ def scale2(self, scale_fn, scale_params):
         if scale_fn == "minmax_scale":
             self.scale_fn = "minmax_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             # self.x_data = self.x_data * (x_scale_params["max"] - x_scale_params["min"]) + x_scale_params["min"]
             self.x_data = (self.x_data - x_scale_params["min"]) / (x_scale_params["max"] - x_scale_params["min"])
-            # self.y_data = self.y_data * (y_scale_params["max"] - y_scale_params["min"]) + y_scale_params["min"]
+            self.y_data = (self.y_data - y_scale_params["min"]) / (y_scale_params["max"] - y_scale_params["min"])
         elif scale_fn == "standard_scale":
             self.scale_fn = "standard_scale"
             x_scale_params = scale_params[0]
-            # y_scale_params = scale_params[1]
+            y_scale_params = scale_params[1]
             # self.x_data = self.x_data * np.sqrt(x_scale_params["var"]) + x_scale_params["mean"]
             self.x_data = (self.x_data - x_scale_params['mean']) / np.sqrt(x_scale_params["var"])
-            # self.y_data = self.y_data * np.sqrt(y_scale_params["var"]) + y_scale_params["mean"]
+            self.y_data = (self.y_data - y_scale_params['mean']) / np.sqrt(y_scale_params["var"])
 
         self.getScaledDataframe()
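Note: the `scale` and `scale2` hunks above route the dependent variable through the same saved scaling parameters as the covariates, replacing commented-out lines that applied the *inverse* transform. A minimal sketch of the distinction, with made-up numbers (the variable names below are illustrative, not from the library):

```python
import numpy as np

# Min-max parameters saved from a training split (placeholder values).
y_min, y_max = np.float32(10.0), np.float32(40.0)

new_y = np.array([25.0, 35.0], dtype=np.float32)

# Old, commented-out form: the inverse transform, which maps scaled values
# back to raw units -- wrong for incoming raw data.
wrong = new_y * (y_max - y_min) + y_min      # -> [760., 1060.]

# Patched form: the forward transform, mapping raw data into [0, 1]
# using the training split's parameters.
right = (new_y - y_min) / (y_max - y_min)    # -> [0.5, 0.8333]
```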
@@ -287,9 +287,11 @@ class predictDataset(Dataset):
     :param is_need_STNN: whether need STNN
     """
 
-    def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=[], is_need_STNN=False):
+    def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=None, is_need_STNN=False):
         # data = data.astype(np.float32)
+        if scale_info is None:
+            scale_info = []
         self.dataframe = data
         self.x = x_column
         if data is None:
@@ -368,7 +370,7 @@ def rescale(self, x):
 
         return x
 
-    def minmax_scaler(self, x, min=[], max=[]):
+    def minmax_scaler(self, x, min=None, max=None):
         """
         function of minmax scaler
 
@@ -377,13 +379,17 @@ def minmax_scaler(self, x, min=[], max=[]):
         :param max: maximum value of each attribute
         :return: Output attribute data
         """
+        if max is None:
+            max = []
+        if min is None:
+            min = []
         if len(min) == 0:
             x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
         else:
             x = (x - min) / (max - min)
         return x
 
-    def standard_scaler(self, x, mean=[], std=[]):
+    def standard_scaler(self, x, mean=None, std=None):
         """
         function of standard scaler
 
@@ -392,6 +398,10 @@ def standard_scaler(self, x, mean=[], std=[]):
         :param std: standard deviation of each attribute
         :return: Output attribute data
         """
+        if std is None:
+            std = []
+        if mean is None:
+            mean = []
         if len(mean) == 0:
             x = (x - x.mean(axis=0)) / x.std(axis=0)
         else:
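Note: the `scale_info=[]`, `min=[]`, `max=[]`, `mean=[]`, and `std=[]` defaults replaced above are instances of Python's mutable-default-argument pitfall: a default list is created once at function definition and shared across every call. A self-contained sketch of the pitfall and of the `None` sentinel pattern the patch adopts (names below are illustrative):

```python
def append_item(item, bucket=[]):          # anti-pattern: one list shared across calls
    bucket.append(item)
    return bucket

def append_item_safe(item, bucket=None):   # sentinel pattern used in the patch
    if bucket is None:
        bucket = []                        # fresh list on every call
    bucket.append(item)
    return bucket

append_item(1); print(append_item(2))            # [1, 2] -- state leaked between calls
append_item_safe(1); print(append_item_safe(2))  # [2]
```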
@@ -410,7 +420,7 @@ def BasicDistance(x, y):
     x = np.float32(x)
     y = np.float32(y)
     dist = distance.cdist(x, y, 'euclidean')
-    return dist  # np.float32(np.sqrt(np.sum((x[:, np.newaxis, :] - y) ** 2, axis=2)))
+    return dist
 
 
 def Manhattan_distance(x, y):
@@ -425,7 +435,7 @@
 
 
 def init_dataset(data, test_ratio, valid_ratio, x_column, y_column, spatial_column=None, temp_column=None,
-                 id_column=None, sample_seed=100, process_fn="minmax_scale", batch_size=32, shuffle=True,
+                 id_column=None, sample_seed=42, process_fn="minmax_scale", batch_size=32, shuffle=True,
                  use_class=baseDataset, spatial_fun=BasicDistance, temporal_fun=Manhattan_distance,
                  max_val_size=-1, max_test_size=-1, from_for_cv=0, is_need_STNN=False, Reference=None,
                  simple_distance=True):
@@ -439,6 +449,7 @@ def init_dataset(data, test_ratio, valid_ratio, x_column, y_column, spatial_colu
     :param y_column: output attribute column name
     :param spatial_column: spatial attribute column name
     :param temp_column: temporal attribute column name
+    :param id_column: id column name
     :param sample_seed: random seed
     :param process_fn: data pre-process function
     :param batch_size: batch size
diff --git a/src/gnnwr/models.py b/src/gnnwr/models.py
index 7b32410..33dc265 100644
--- a/src/gnnwr/models.py
+++ b/src/gnnwr/models.py
@@ -247,7 +247,7 @@ def init_optimizer(self, optimizer, optimizer_params=None):
             upepoch = optimizer_params.get("upepoch", 10000)
             uprate = (maxlr - minlr) / upepoch * (upepoch // 20)
             decayepoch = optimizer_params.get("decayepoch", 20000)
-            decayrate = optimizer_params.get("decayrate", 0.1)
+            decayrate = optimizer_params.get("decayrate", 0.95)
             stop_change_epoch = optimizer_params.get("stop_change_epoch", 30000)
             stop_lr = optimizer_params.get("stop_lr", 0.001)
             lamda_lr = lambda epoch: (epoch // (upepoch // 20)) * uprate + minlr if epoch < upepoch else (
@@ -422,6 +422,9 @@ def run(self, max_epoch=1, early_stop=-1, print_frequency=50, show_detailed_info
             if ``early_stop`` is ``-1``, the training will not stop until the max epoch
         print_frequency : int
             the frequency of printing the information (default: ``50``)
+
+        show_detailed_info : bool
+            if ``True``, the detailed information will be shown (default: ``True``)
         """
         self.__istrained = True
         if self._use_gpu:
@@ -560,7 +563,7 @@ def load_model(self, path, use_dict=False, map_location=None):
         path : str
             the path of the model
         use_dict : bool
-            whether use dict to load the model (default: ``False``)
+            whether the function uses a dict to load the model (default: ``False``)
         map_location : str
             the location of the model (default: ``None``)
             the location can be ``"cpu"`` or ``"cuda"``
@@ -635,7 +638,7 @@ def result(self, path=None, use_dict=False, map_location=None):
             the path of the model(default: ``None``)
             | if ``path`` is ``None``, the model will be loaded from ``self._modelSavePath + "/" + self._modelName + ".pkl"``
         use_dict : bool
-            whether use dict to load the model (default: ``False``)
+            whether the function uses a dict to load the model (default: ``False``)
             | if ``use_dict`` is ``True``, the model will be loaded from ``path`` as dict
         map_location : str
             the location of the model (default: ``None``)
@@ -664,24 +667,32 @@ def result(self, path=None, use_dict=False, map_location=None):
             logging.info("Test Loss: " + str(self.__testLoss) + "; Test R2: " + str(self.__testr2))
         # print result
         # basic information
-        print("--------------------Result Table--------------------\n")
+        print("--------------------Model Information-----------------")
         print("Model Name: |", self._modelName)
         print("Model Structure: |\n", self._model)
         print("Optimizer: |\n", self._optimizer)
         print("independent variable: |", self._train_dataset.x)
         print("dependent variable: |", self._train_dataset.y)
-        print("\n----------------------------------------------------\n")
-        print("Test Loss: ", self.__testLoss, " Test R2: ", self.__testr2)
-        if self._valid_r2 is not None and self._valid_r2 != float('-inf'):
-            print("Train R2: {:5f}".format(self._besttrainr2), " Valid R2: ", self._bestr2)
         # OLS
-        print("\nOLS: |", self._weight)
-        # Diagnostics
-        print("R2: |", self.__testr2)
-        print("RMSE: | {:5f}".format(self._test_diagnosis.RMSE().data))
-        print("AIC: | {:5f}".format(self._test_diagnosis.AIC()))
-        print("AICc: | {:5f}".format(self._test_diagnosis.AICc()))
-        print("F1: | {:5f}".format(self._test_diagnosis.F1_GNN().data))
+        print("\nOLS weight:|", end=" ")
+        for i in range(len(self._weight)):
+            print(" {:.5f}".format(self._weight[i]), end=" ")
+        print("\n")
+        print("\n--------------------Result Information----------------")
+        print("Test Loss: | {:>25.5f}".format(self.__testLoss))
+        print("Test R2 : | {:>25.5f}".format(self.__testr2))
+        if self._valid_r2 is not None and self._valid_r2 != float('-inf'):
+            print("Train R2 : | {:>25.5f}".format(self._besttrainr2))
+            print("Valid R2 : | {:>25.5f}".format(self._valid_r2))
+        print("RMSE: | {:>30.5f}".format(self._test_diagnosis.RMSE().data))
+        print("AIC: | {:>30.5f}".format(self._test_diagnosis.AIC()))
+        print("AICc: | {:>30.5f}".format(self._test_diagnosis.AICc()))
+        print("F1: | {:>30.5f}".format(self._test_diagnosis.F1_Global().data))
+        print("F2: | {:>30.5f}".format(self._test_diagnosis.F2_Global().flatten()[0].data))
+        F3_Local_dict = self._test_diagnosis.F3_Local()[0]
+        for key in F3_Local_dict:
+            width = 30 - (len(key) - 4)
+            print("{}: | {:>{width}.5f}".format(key, F3_Local_dict[key].data, width=width))
 
     def reg_result(self, filename=None, model_path=None, use_dict=False, only_return=False, map_location=None):
         """
diff --git a/src/gnnwr/utils.py b/src/gnnwr/utils.py
index b7a3450..5ab53e7 100644
--- a/src/gnnwr/utils.py
+++ b/src/gnnwr/utils.py
@@ -3,7 +3,8 @@
 import pandas as pd
 import torch
 import warnings
-import copy
+from scipy.stats import f
+from scipy.stats import t
 import folium
 from folium.plugins import HeatMap, MarkerCluster
 import branca
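Note: in the rewritten `result()` table above, the `F3_Local` rows keep a fixed overall width by shrinking the numeric field as the label grows, so label + `": | "` + field always spans the same number of columns. A small sketch with placeholder values (not output from the patch):

```python
# Each row spans len(key) + 4 + (30 - (len(key) - 4)) = 38 columns,
# so the right-aligned numbers line up regardless of label length.
rows = {"f3_param_0": 1.23456789, "f3_param_12": 0.98765432}  # placeholder diagnostics
for key, value in rows.items():
    width = 30 - (len(key) - 4)
    print("{}: | {:>{width}.5f}".format(key, value, width=width))
```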
@@ -42,6 +43,7 @@ class DIAGNOSIS:
     """
 
     def __init__(self, weight, x_data, y_data, y_pred):
+
         self.__weight = weight
         self.__x_data = x_data
         self.__y_data = y_data
@@ -61,10 +63,13 @@ def __init__(self, weight, x_data, y_data, y_pred):
         gtweight_3d = torch.diag_embed(self.__weight)
         hatS_temp = torch.matmul(gtweight_3d,
                                  torch.matmul(torch.inverse(torch.matmul(x_data_tile_t, x_data_tile)), x_data_tile_t))
+        self.__hat_temp = hatS_temp
         hatS = torch.matmul(x_data.view(-1, 1, x_data.size(1)), hatS_temp)
         hatS = hatS.view(-1, self.__n)
         self.__hat = hatS
         self.__S = torch.trace(self.__hat)
+        self.f3_dict = None
+        self.f3_dict_2 = None
 
     def hat(self):
         """
@@ -72,17 +77,68 @@ def __init__(self, weight, x_data, y_data, y_pred):
         """
         return self.__hat
 
-    def F1_GNN(self):
+    def F1_Global(self):
         """
         :return: F1-test
         """
         k1 = self.__n - 2 * torch.trace(self.__hat) + \
              torch.trace(torch.mm(self.__hat.transpose(-2, -1), self.__hat))
+
         k2 = self.__n - self.__k - 1
         rss_olr = torch.sum(
             (torch.mean(self.__y_data) - torch.mm(self.__ols_hat, self.__y_data)) ** 2)
+        F_value = self.__ssr / k1 / (rss_olr / k2)
+        # p_value = f.sf(F_value, k1, k2)
         return self.__ssr / k1 / (rss_olr / k2)
 
+    def F2_Global(self):
+        """
+        :return: F2-test
+        """
+        # A = (I - H) - (I - S)^T*(I - S)
+        A = (torch.eye(self.__n) - self.__ols_hat) - torch.mm(
+            (torch.eye(self.__n) - self.__hat).transpose(-2, -1),
+            (torch.eye(self.__n) - self.__hat))
+        v1 = torch.trace(A)
+        # DSS = y^T*A*y
+        DSS = torch.mm(self.__y_data.transpose(-2, -1), torch.mm(A, self.__y_data))
+        k2 = self.__n - self.__k - 1
+        rss_olr = torch.sum(
+            (torch.mean(self.__y_data) - torch.mm(self.__ols_hat, self.__y_data)) ** 2)
+
+        return DSS / v1 / (rss_olr / k2)
+
+    def F3_Local(self):
+        """
+        :return: F1-test of each variable
+        """
+
+        ek_dict = {}
+        self.f3_dict = {}
+        self.f3_dict_2 = {}
+        for i in range(self.__x_data.size(1)):
+            ek_zeros = torch.zeros([self.__x_data.size(1)])
+            ek_zeros[i] = 1
+            ek_dict['ek' + str(i)] = torch.reshape(torch.reshape(torch.tile(ek_zeros.clone().detach(), [self.__n]),
+                                                                 [self.__n, -1]),
+                                                   [-1, 1, self.__x_data.size(1)])
+            hatB = torch.matmul(ek_dict['ek' + str(i)], self.__hat_temp)
+            hatB = torch.reshape(hatB, [-1, self.__n])
+
+            J_n = torch.ones([self.__n, self.__n]) / self.__n
+            L = torch.matmul(hatB.transpose(-2, -1), torch.matmul(torch.eye(self.__n) - J_n, hatB))
+
+            vk2 = 1 / self.__n * torch.matmul(self.__y_data.transpose(-2, -1), torch.matmul(L, self.__y_data))
+            trace_L = torch.trace(1 / self.__n * L)
+            f3 = torch.squeeze(vk2 / trace_L / (self.__ssr / self.__n))
+            self.f3_dict['f3_param_' + str(i)] = f3
+
+            bk = torch.matmul(hatB, self.__y_data)
+            vk2_2 = 1 / self.__n * torch.sum((bk - torch.mean(bk)) ** 2)
+            f3_2 = torch.squeeze(vk2_2 / trace_L / (self.__ssr / self.__n))
+            self.f3_dict_2['f3_param_' + str(i)] = f3_2
+        return self.f3_dict, self.f3_dict_2
+
     def AIC(self):
         """
         :return: AIC
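Note: `F1_Global` above computes the F statistic but leaves the p-value line commented out; the new `from scipy.stats import f` import supports exactly that step. A hedged sketch of how the commented line would be used, with placeholder numbers rather than values from the patch:

```python
from scipy.stats import f

F_value, k1, k2 = 2.5, 12.0, 40.0   # placeholder statistic and degrees of freedom
p_value = f.sf(F_value, k1, k2)     # survival function: P(F >= F_value) under H0
print(round(p_value, 4))
```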