From 4b16b0c674e3e4106197db6b7a8c9d9d7cf2f410 Mon Sep 17 00:00:00 2001
From: Yzy <2154597198@qq.com>
Date: Tue, 23 Jul 2024 13:39:06 +0800
Subject: [PATCH] [Modify] fixed some warnings

---
 .gitignore            |  1 +
 src/gnnwr/__init__.py |  5 ++-
 src/gnnwr/datasets.py | 87 ++++++++++++++++++++++++++++---------------
 src/gnnwr/models.py   | 27 +++++++++++---
 src/gnnwr/utils.py    |  2 +-
 5 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5c490f1..ea5e76e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 *_models/
 dist/
 *.toml
+*.log
\ No newline at end of file
diff --git a/src/gnnwr/__init__.py b/src/gnnwr/__init__.py
index 1f356cc..8307eb7 100644
--- a/src/gnnwr/__init__.py
+++ b/src/gnnwr/__init__.py
@@ -1 +1,4 @@
-__version__ = '1.0.0'
+from .datasets import *
+from .models import *
+from .networks import *
+from .utils import *
\ No newline at end of file
diff --git a/src/gnnwr/datasets.py b/src/gnnwr/datasets.py
index 013d2d5..96f65f6 100644
--- a/src/gnnwr/datasets.py
+++ b/src/gnnwr/datasets.py
@@ -160,12 +160,18 @@ def scale2(self, scale_fn, scale_params):
             self.scale_fn = "minmax_scale"
             x_scale_params = scale_params[0]
             y_scale_params = scale_params[1]
-            self.x_data = (self.x_data - x_scale_params["min"]) / (x_scale_params["max"] - x_scale_params["min"])
+            if x_scale_params is not None:
+                self.x_data = (self.x_data - x_scale_params["min"]) / (x_scale_params["max"] - x_scale_params["min"])
+            if y_scale_params is not None:
+                self.y_data = (self.y_data - y_scale_params["min"]) / (y_scale_params["max"] - y_scale_params["min"])
         elif scale_fn == "standard_scale":
             self.scale_fn = "standard_scale"
             x_scale_params = scale_params[0]
             y_scale_params = scale_params[1]
-            self.x_data = (self.x_data - x_scale_params['mean']) / np.sqrt(x_scale_params["var"])
+            if x_scale_params is not None:
+                self.x_data = (self.x_data - x_scale_params['mean']) / np.sqrt(x_scale_params["var"])
+            if y_scale_params is not None:
+                self.y_data = (self.y_data - y_scale_params['mean']) / np.sqrt(y_scale_params["var"])
         self.getScaledDataframe()
 
@@ -180,7 +186,7 @@ def getScaledDataframe(self):
         scaledData = np.concatenate((self.x_data, self.y_data), axis=1)
         self.scaledDataframe = pd.DataFrame(scaledData, columns=columns)
 
-    def rescale(self, x):
+    def rescale(self, x, y):
         """
         rescale the data with the scale function and scale parameters
 
@@ -199,24 +205,31 @@ def rescale(self, x, y):
             rescaled dependent variable data
         """
         if self.scale_fn == "minmax_scale":
-            x = np.multiply(x, self.x_scale_info["max"] - self.x_scale_info["min"]) + self.x_scale_info["min"]
+            if x is not None:
+                x = np.multiply(x, self.x_scale_info["max"] - self.x_scale_info["min"]) + self.x_scale_info["min"]
+            if y is not None:
+                y = np.multiply(y, self.y_scale_info["max"] - self.y_scale_info["min"]) + self.y_scale_info["min"]
         elif self.scale_fn == "standard_scale":
-            x = np.multiply(x, np.sqrt(self.x_scale_info["var"])) + self.x_scale_info["mean"]
+            if x is not None:
+                x = np.multiply(x, np.sqrt(self.x_scale_info["var"])) + self.x_scale_info["mean"]
+            if y is not None:
+                y = np.multiply(y, np.sqrt(self.y_scale_info["var"])) + self.y_scale_info["mean"]
         else:
             raise ValueError("invalid process_fn")
-        return x
+        return x, y
 
-    def save(self, dirname):
+    def save(self, dirname, exist_ok=False):
         """
         save the dataset
 
         :param dirname: save directory
         """
-        if os.path.exists(dirname):
+        if os.path.exists(dirname) and not exist_ok:
             raise ValueError("dir is already exists")
         if self.dataframe is None:
             raise ValueError("dataframe is None")
-        os.makedirs(dirname)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
         x_scale_info = {}
         y_scale_info = {}
         for key, value in self.x_scale_info.items():
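
Note on the two hunks above: rescale() now takes and returns the pair (x, y); callers pass None for the component they do not need and unpack both return values, and save() only raises when the target directory exists and exist_ok is left False. A minimal usage sketch (hypothetical names: ds is a dataset object from this module, y_scaled is a scaled prediction array):

    # denormalize only the response; x passes through as None
    _, y_orig = ds.rescale(None, y_scaled)

    # reuse an existing output directory instead of raising ValueError
    ds.save("saved_dataset", exist_ok=True)
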
@@ -302,11 +315,12 @@ class predictDataset(Dataset):
 
     def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=None, is_need_STNN=False):
-        # data = data.astype(np.float32)
         if scale_info is None:
             scale_info = []
+
         self.dataframe = data
         self.x = x_column
+
         if data is None:
             self.x_data = None
             self.datasize = -1
@@ -348,6 +362,7 @@ def __init__(self, data, x_column, process_fn="minmax_scale", scale_info=None, i
 
         self.distances = None
         self.temporal = None
+        self.scale_info_y = None
 
     def __len__(self):
         """
@@ -367,7 +382,7 @@ def __getitem__(self, index):
             return torch.tensor(self.distances[index], dtype=torch.float), torch.tensor(self.x_data[index],
                                                                                         dtype=torch.float)
 
-    def rescale(self, x):
+    def rescale(self, x, y):
         """
         rescale the attribute data
 
@@ -375,13 +390,19 @@ def rescale(self, x, y):
         :return: rescaled attribute data
         """
         if self.scale_fn == "minmax_scale":
-            x = x * (self.scale_info_x[1] - self.scale_info_x[0]) + self.scale_info_x[0]
+            if x is not None:
+                x = x * (self.scale_info_x[1] - self.scale_info_x[0]) + self.scale_info_x[0]
+            if y is not None and self.scale_info_y is not None:
+                y = y * (self.scale_info_y["max"] - self.scale_info_y["min"]) + self.scale_info_y["min"]
         elif self.scale_fn == "standard_scale":
-            x = x * np.sqrt(self.scale_info_x[1]) + self.scale_info_x[0]
+            if x is not None:
+                x = x * np.sqrt(self.scale_info_x[1]) + self.scale_info_x[0]
+            if y is not None and self.scale_info_y is not None:
+                y = y * np.sqrt(self.scale_info_y["var"]) + self.scale_info_y["mean"]
         else:
             raise ValueError("invalid process_fn")
-        return x
+        return x, y
 
     def minmax_scaler(self, x, min=None, max=None):
         """
@@ -577,9 +598,11 @@ def init_dataset(data, test_ratio,
     elif use_model == "gtnnwr":
         assert temp_column is not None, "temp_column must be not None in gtnnwr"
         train_dataset.distances, val_dataset.distances, test_dataset.distances = _init_gtnnwr_distance(
-            reference_data[spatial_column + temp_column].values, train_data[spatial_column + temp_column].values,
-            val_data[spatial_column + temp_column].values, test_data[spatial_column + temp_column].values,
-            spatial_fun,temporal_fun
+            [reference_data[spatial_column].values, reference_data[temp_column].values],
+            [train_data[spatial_column].values, train_data[temp_column].values],
+            [val_data[spatial_column].values, val_data[temp_column].values],
+            [test_data[spatial_column].values, test_data[temp_column].values],
+            spatial_fun, temporal_fun
         )
     elif use_model == "gnnwr spnn":
         train_dataset.distances, val_dataset.distances, test_dataset.distances = _init_gnnwr_spnn_distance(
@@ -611,28 +634,31 @@ def init_dataset(data, test_ratio,
         distance_scale = StandardScaler()
         temporal_scale = StandardScaler()
     # scale distance matrix
-    train_distance_len = len(train_dataset.distances)
-    val_distance_len = len(val_dataset.distances)
-    distances = np.concatenate((train_dataset.distances, val_dataset.distances, test_dataset.distances), axis=0)
+    distances = train_dataset.distances
     distances = distance_scale.fit_transform(distances.reshape(-1, distances.shape[-1])).reshape(distances.shape)
+
+    train_dataset.distances = distance_scale.transform(train_dataset.distances.reshape(-1, train_dataset.distances.shape[-1])).reshape(train_dataset.distances.shape)
+    val_dataset.distances = distance_scale.transform(val_dataset.distances.reshape(-1, val_dataset.distances.shape[-1])).reshape(val_dataset.distances.shape)
+    test_dataset.distances = distance_scale.transform(test_dataset.distances.reshape(-1, test_dataset.distances.shape[-1])).reshape(test_dataset.distances.shape)
+
     if process_fn == "minmax_scale":
         distance_scale_param = {"min": distance_scale.data_min_, "max": distance_scale.data_max_}
     else:
-        distance_scale_param = {"mean": distance_scale.mean_, "var": distance_scale.var_}
-    train_dataset.distances = distances[:train_distance_len]
-    val_dataset.distances = distances[train_distance_len:train_distance_len + val_distance_len]
-    test_dataset.distances = distances[train_distance_len + val_distance_len:]
+        distance_scale_param = {"mean": distance_scale.mean_, "var": distance_scale.var_}
     train_dataset.distances_scale_param = val_dataset.distances_scale_param = test_dataset.distances_scale_param = distance_scale_param
-    if temp_column is not None:
-        temporal = np.concatenate((train_dataset.temporal, val_dataset.temporal, test_dataset.temporal), axis=0)
+
+    if train_dataset.temporal is not None and val_dataset.temporal is not None and test_dataset.temporal is not None:
+        temporal = train_dataset.temporal
         temporal = temporal_scale.fit_transform(temporal.reshape(-1, temporal.shape[-1])).reshape(temporal.shape)
+
+        train_dataset.temporal = temporal_scale.transform(train_dataset.temporal.reshape(-1, train_dataset.temporal.shape[-1])).reshape(train_dataset.temporal.shape)
+        val_dataset.temporal = temporal_scale.transform(val_dataset.temporal.reshape(-1, val_dataset.temporal.shape[-1])).reshape(val_dataset.temporal.shape)
+        test_dataset.temporal = temporal_scale.transform(test_dataset.temporal.reshape(-1, test_dataset.temporal.shape[-1])).reshape(test_dataset.temporal.shape)
+
        if process_fn == "minmax_scale":
            temporal_scale_param = {"min": temporal_scale.data_min_, "max": temporal_scale.data_max_}
        else:
            temporal_scale_param = {"mean": temporal_scale.mean_, "var": temporal_scale.var_}
-        train_dataset.temporal = temporal[:train_distance_len]
-        val_dataset.temporal = temporal[train_distance_len:train_distance_len + val_distance_len]
-        test_dataset.temporal = temporal[train_distance_len + val_distance_len:]
        train_dataset.temporal_scale_param = val_dataset.temporal_scale_param = test_dataset.temporal_scale_param = temporal_scale_param
    # initialize dataloader for train/val/test dataset
    # set batch_size for train_dataset as batch_size
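
The scaling hunk above stops concatenating the train/validation/test distance (and temporal) matrices before fitting: the scaler is now fitted on the training split only and then applied to each split, so validation and test statistics no longer leak into the scaling parameters. The pattern in isolation (a sketch with made-up array shapes, using scikit-learn directly):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    train = np.random.rand(100, 10, 2)  # (samples, reference points, dims)
    val = np.random.rand(20, 10, 2)

    scaler = MinMaxScaler()
    scaler.fit(train.reshape(-1, train.shape[-1]))  # statistics come from train only
    train_s = scaler.transform(train.reshape(-1, train.shape[-1])).reshape(train.shape)
    val_s = scaler.transform(val.reshape(-1, val.shape[-1])).reshape(val.shape)
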
@@ -735,13 +761,14 @@ def init_predict_dataset(data, train_dataset, x_column, spatial_column=None, tem
         process_params = [[train_dataset.x_scale_info['mean'], train_dataset.x_scale_info['std']]]
     else:
         raise ValueError("scale_fn must be minmax_scale or standard_scale")
-    # print("ProcessParams:",process_params)
     if scale_sync:
         predict_dataset = use_class(data=data, x_column=x_column, process_fn=process_fn, scale_info=process_params,
                                     is_need_STNN=is_need_STNN)
     else:
         predict_dataset = use_class(data=data, x_column=x_column, process_fn=process_fn, is_need_STNN=is_need_STNN)
+    # get the y scale information
+    predict_dataset.scale_info_y = train_dataset.y_scale_info
 
     # train_data = train_dataset.dataframe
     reference_data = train_dataset.reference
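
With scale_info_y copied from the training dataset, a prediction dataset can now map scaled model output back to the original units of the dependent variable. A sketch of the intended call pattern (assuming init_predict_dataset returns the constructed prediction dataset; new_data, x_cols, and pred_scaled are hypothetical placeholders):

    pred_ds = init_predict_dataset(data=new_data, train_dataset=train_dataset, x_column=x_cols)
    _, pred_orig = pred_ds.rescale(None, pred_scaled)  # uses the copied y scale info
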
self._coefficient = OLS(
             train_dataset.scaledDataframe, train_dataset.x, train_dataset.y).params  # coefficients of OLS
@@ -316,10 +316,11 @@ def __train(self):
                 x_true = torch.cat((x_true, coef), 0)
                 y_true = torch.cat((y_true, label), 0)
                 weight = self._model(data)
+                weight_all = torch.cat((weight_all, weight.to(torch.float32)), 0)
                 output = self._out(weight.mul(coef.to(torch.float32)))
                 y_pred = torch.cat((y_pred, output), 0)
 
-                loss = self._criterion(output, label) # calculate the loss
+                loss = self._criterion(output, label)  # calculate the loss
                 loss.backward()  # back propagation
                 self._optimizer.step()  # update the parameters
                 if isinstance(data, list):
@@ -419,8 +420,9 @@ def __test(self):
             self.__testLoss = test_loss
             self.__testr2 = r2_score(label_list, out_list)
             self._test_diagnosis = DIAGNOSIS(weight_all, x_data, y_data, y_pred)
+            return self._test_diagnosis.R2().data
 
-    def run(self, max_epoch=1, early_stop=-1):
+    def run(self, max_epoch=1, early_stop=-1, **kwargs):
         """
         train the model and validate the model
 
@@ -438,6 +440,12 @@ def run(self, max_epoch=1, early_stop=-1, **kwargs):
         show_detailed_info : bool
             if ``True``, the detailed information will be shown (default: ``True``)
         """
+        if kwargs.get("print_frequency") is not None:
+            warnings.warn("The parameter print_frequency is deprecated; the information will be shown in tqdm")
+        if kwargs.get("show_detailed_info") is not None:
+            warnings.warn("The parameter show_detailed_info is deprecated; the information will be shown in tqdm")
+        # model selection method
+        model_selection = kwargs.get("model_selection", "val")
         self.__istrained = True
         if self._use_gpu:
             self._model = nn.DataParallel(module=self._model)  # parallel computing
@@ -490,7 +498,11 @@ def run(self, max_epoch=1, early_stop=-1, **kwargs):
             if 0 < early_stop < self._noUpdateEpoch:  # stop when the model has not been updated for long time
                 print("Training stop! Model has not been improved for over {} epochs.".format(early_stop))
                 break
-        self.load_model(self._modelSavePath + '/' + self._modelName + ".pkl")
+        torch.save(self._model, self._modelSavePath + '/' + self._modelName + "_last.pkl")
+        if model_selection == "val":
+            self.load_model(self._modelSavePath + '/' + self._modelName + ".pkl")
+        elif model_selection == "last":
+            self.load_model(self._modelSavePath + '/' + self._modelName + "_last.pkl")
         self.result_data = self.getCoefs()
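
run() now accepts keyword arguments: print_frequency and show_detailed_info are deprecated in favor of the tqdm progress output, and model_selection decides which checkpoint is restored after training ("val", the default, reloads the best-on-validation weights; "last" reloads the final-epoch snapshot saved with a "_last" suffix). A usage sketch (m is a hypothetical model instance from this module, e.g. a GNNWR):

    m.run(max_epoch=500, early_stop=50)           # restore the best validation checkpoint
    m.run(max_epoch=500, model_selection="last")  # restore the last-epoch weights instead
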
@@ -507,6 +519,7 @@ def predict(self, dataset):
         dataframe
             the Pandas dataframe of the dataset with the predicted result
         """
+
         data = dataset.distances
         coef = dataset.x_data
         if not self.__istrained:
@@ -520,7 +533,7 @@ def predict(self, dataset):
             weight = self._model(data)
             result = self._out(weight.mul(coef)).cpu().detach().numpy()
         dataset.dataframe['pred_result'] = result
-        dataset.dataframe['denormalized_pred_result'] = dataset.rescale(result)
+        _, dataset.dataframe['denormalized_pred_result'] = dataset.rescale(None, result)
         dataset.pred_result = result
         return dataset.dataframe
 
@@ -581,7 +594,8 @@ def load_model(self, path, use_dict=False, map_location=None):
             self._model = self._model.cpu()
             self._out = self._out.cpu()
         self._modelSavePath = os.path.dirname(path)
-        self._modelName = os.path.basename(path).split('/')[-1].split('.')[0]
+        if self._modelName is None:
+            self._modelName = os.path.basename(path).split('/')[-1].split('.')[0]
         self.__istrained = True
         self.result_data = self.getCoefs()
 
@@ -729,6 +743,7 @@ def reg_result(self, filename=None, model_path=None, use_dict=False, only_return
         """
         if model_path is None:
             model_path = self._modelSavePath + "/" + self._modelName + ".pkl"
+
         if use_dict:
             data = torch.load(model_path, map_location=map_location)
             self._model.load_state_dict(data)
diff --git a/src/gnnwr/utils.py b/src/gnnwr/utils.py
index a8cb190..65313e4 100644
--- a/src/gnnwr/utils.py
+++ b/src/gnnwr/utils.py
@@ -24,7 +24,7 @@ def __init__(self, dataset, xName: list, yName: list):
         self.__formula = yName[0] + '~' + '+'.join(xName)
         self.__fit = sm.formula.ols(self.__formula, dataset).fit()
         self.params = list(self.__fit.params.to_dict().values())
-        intercept = self.__fit.params[0]
+        intercept = self.__fit.params.iloc[0]
         self.params = self.params[1:]
         self.params.append(intercept)
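
The utils.py change replaces positional indexing on a pandas Series (params[0]) with params.iloc[0]; integer keys passed to Series.__getitem__ are treated as labels, and their positional use is deprecated in recent pandas, which is what triggered the FutureWarning this patch silences. Equivalent behaviour in isolation (a standalone sketch, not project code):

    import pandas as pd

    params = pd.Series([1.5, 0.3], index=["Intercept", "x1"])
    intercept = params.iloc[0]  # positional access, no FutureWarning
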