From 821701460660f3aa7d9cd4ab0a209179ee0af600 Mon Sep 17 00:00:00 2001 From: Wes Warriner Date: Sat, 21 Dec 2024 22:29:02 -0700 Subject: [PATCH 1/2] refactor: replace pandas df eval() string-based expressions with explicit pandas operations --- src/onemod/utils/residual.py | 24 ++++++++++++------------ src/onemod/utils/uncertainty.py | 24 +++++++++++++++--------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/onemod/utils/residual.py b/src/onemod/utils/residual.py index 5201cb4b..ef5a170f 100644 --- a/src/onemod/utils/residual.py +++ b/src/onemod/utils/residual.py @@ -1,4 +1,4 @@ -# mypy: ignore-errors +import numpy as np import pandas as pd @@ -15,11 +15,11 @@ def get_residual_binomial( data: pd.DataFrame, pred: str, obs: str, weights: str ) -> pd.DataFrame: result = pd.DataFrame(index=data.index) - result["residual"] = data.eval( - f"({obs} - {pred}) / ({pred} * (1 - {pred}))" + result["residual"] = (data[obs] - data[pred]) / ( + data[pred] * (1 - data[pred]) ) - result["residual_se"] = data.eval( - f"1 / sqrt({pred} * (1 - {pred}) * {weights})" + result["residual_se"] = 1 / np.sqrt( + data[pred] * (1 - data[pred]) * data[weights] ) return result @@ -27,37 +27,37 @@ def get_residual_binomial( def predict_binomial( data: pd.DataFrame, pred: str, residual: str = "residual" ) -> pd.Series: - return data.eval(f"{pred} + {residual} * {pred} * (1 - {pred})") + return data[pred] + data[residual] * data[pred] * (1 - data[pred]) @staticmethod def get_residual_poisson( data: pd.DataFrame, pred: str, obs: str, weights: str ) -> pd.DataFrame: result = pd.DataFrame(index=data.index) - result["residual"] = data.eval(f"{obs} / {pred} - 1") - result["residual_se"] = data.eval(f"1 / sqrt({pred} * {weights})") + result["residual"] = data[obs] / data[pred] - 1 + result["residual_se"] = 1 / np.sqrt(data[pred] * data[weights]) return result @staticmethod def predict_poisson( data: pd.DataFrame, pred: str, residual: str = "residual" ) -> pd.Series: - return data.eval(f"({residual} + 1) * {pred}") + return (data[residual] + 1) * data[pred] @staticmethod def get_residual_gaussian( data: pd.DataFrame, pred: str, obs: str, weights: str ) -> pd.DataFrame: result = pd.DataFrame(index=data.index) - result["residual"] = data.eval(f"{obs} - {pred}") - result["residual_se"] = data.eval(f"1 / sqrt({weights})") + result["residual"] = data[obs] - data[pred] + result["residual_se"] = 1 / np.sqrt(data[weights]) return result @staticmethod def predict_gaussian( data: pd.DataFrame, pred: str, residual: str = "residual" ) -> pd.Series: - return data.eval(f"{pred} + {residual}") + return data[pred] + data[residual] def __call__(self, *args, **kwargs) -> pd.DataFrame: return self.get_residual(*args, **kwargs) diff --git a/src/onemod/utils/uncertainty.py b/src/onemod/utils/uncertainty.py index 1308a275..3aac962a 100644 --- a/src/onemod/utils/uncertainty.py +++ b/src/onemod/utils/uncertainty.py @@ -1,4 +1,3 @@ -# mypy: ignore-errors import numpy as np import pandas as pd from msca.c2fun import c2fun_dict @@ -48,8 +47,10 @@ def get_ci_coverage( data["lwr"] = norm.ppf(lwr, loc=data[pred], scale=data[pred_sd]) data["upr"] = norm.ppf(upr, loc=data[pred], scale=data[pred_sd]) - coverage = data.eval(f"{truth} >= lwr and {truth} <= upr").mean() - return coverage + data["coverage_bool"] = (data[truth] >= data["lwr"]) & ( + data[truth] <= data["upr"] + ) + return data["coverage_bool"].mean() def get_pi_coverage( @@ -105,8 +106,10 @@ def get_pi_coverage( upr = 1.0 - lwr residual["lwr"] = norm.ppf(lwr, loc=0.0, scale=residual["total_sd"]) residual["upr"] = norm.ppf(upr, loc=0.0, scale=residual["total_sd"]) - coverage = residual.eval("residual >= lwr and residual <= upr").mean() - return coverage + residual["coverage_bool"] = (residual["residual"] >= residual["lwr"]) & ( + residual["residual"] <= residual["upr"] + ) + return residual["coverage_bool"].mean() def calibrate_pred_sd( @@ -166,13 +169,16 @@ def equation(alpha: float) -> float: # deviation is bounded by the range of the random variable divided by 2. # So, we want to find an alpha such that the maximum absolute value of # the Person residual is less than 1. + residual_squared = residual["residual"] ** 2 + residual_se_squared = residual["residual_se"] ** 2 + adjusted_residual = residual_squared - residual_se_squared + alpha_upr = 1.1 * np.sqrt( - np.max( - residual.eval("residual ** 2 - residual_se ** 2") - / data[pred_sd] ** 2 - ) + np.max(adjusted_residual / (data[pred_sd] ** 2)) ) + alpha = brentq(equation, 0.0, alpha_upr) + else: alpha = 0.0 From 05d57353ba34c24a451c38816a2bbb4a66fbeb2b Mon Sep 17 00:00:00 2001 From: Wes Warriner Date: Sat, 21 Dec 2024 22:53:16 -0700 Subject: [PATCH 2/2] refactor: replace pandas df eval() string-based expressions with explicit pandas operations in Rover stage as well --- src/onemod/stage/model_stages/rover_stage.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/onemod/stage/model_stages/rover_stage.py b/src/onemod/stage/model_stages/rover_stage.py index 47bc0e9b..673e704e 100644 --- a/src/onemod/stage/model_stages/rover_stage.py +++ b/src/onemod/stage/model_stages/rover_stage.py @@ -1,4 +1,3 @@ -# mypy: ignore-errors """ModRover covariate selection stage. Notes @@ -145,7 +144,9 @@ def _get_rover_summaries(self) -> pd.DataFrame: # Merge with subsets and add t-statistic summaries_df = summaries_df.merge(subsets, on="subset_id", how="left") - summaries_df["abs_t_stat"] = summaries_df.eval("abs(coef / coef_sd)") + summaries_df["abs_t_stat"] = ( + summaries_df["coef"].abs() / summaries_df["coef_sd"] + ) return summaries_df def _get_selected_covs(self, summaries: pd.DataFrame) -> pd.DataFrame: @@ -186,8 +187,8 @@ def _get_subset_selected_covs( .mean() .sort_values(ascending=False) .reset_index() - .eval(f"selected = abs_t_stat >= {self.config.t_threshold}") ) + t_stats["selected"] = t_stats["abs_t_stat"] >= self.config.t_threshold # Add/remove covariates based on min_covs/max_covs if (