From b49543bfe63eb8d48169035e8c4ad59cb6a90a15 Mon Sep 17 00:00:00 2001
From: jitingxu1
Date: Tue, 17 Sep 2024 14:31:11 -0700
Subject: [PATCH] handle near constant column

---
NOTE(review): constant / near-constant columns no longer raise ValueError in
fit_table; transform_table now divides by 1.0 whenever the fitted range
(ScaleMinMax) or standard deviation (ScaleStandard) is below _APPROX_EPS in
absolute value, so such columns are mapped to 0 (see test_constant_columns).
NOTE(review): `_APPROX_EPS = 10e-7` is numerically 1e-6 - confirm this is the
intended threshold.
NOTE(review): abs(...) raises TypeError if a fitted statistic is None -
confirm how all-null / empty inputs are reported by the backend.

 ibis_ml/steps/_standardize.py | 29 +++++++++++++----------------
 tests/test_standardize.py     | 25 +++++++++----------------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
index d4d92f0..7df080f 100644
--- a/ibis_ml/steps/_standardize.py
+++ b/ibis_ml/steps/_standardize.py
@@ -11,6 +11,8 @@
     from collections.abc import Iterable
 
 _DOCS_PAGE_NAME = "standardization"
+# a small epsilon value to handle near-constant columns during normalization
+_APPROX_EPS = 10e-7
 
 
 class ScaleMinMax(Step):
@@ -61,21 +63,18 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self._fit_expr = [expr]
         results = expr.execute().to_dict("records")[0]
         for name in columns:
-            col_max = results[f"{name}_max"]
-            col_min = results[f"{name}_min"]
-            if col_max == col_min:
-                raise ValueError(
-                    f"Cannot standardize {name!r} - "
-                    "the maximum and minimum values are equal"
-                )
-            stats[name] = (col_max, col_min)
+            stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
 
         self.stats_ = stats
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - min) / (max - min)).name(c)  # type: ignore
+                # for near-constant column, set the scale to 1.0
+                (
+                    (table[c] - min)
+                    / (1.0 if abs(max - min) < _APPROX_EPS else max - min)
+                ).name(c)
                 for c, (max, min) in self.stats_.items()
             ]
         )
@@ -128,19 +127,17 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self._fit_expr = [table.aggregate(aggs)]
         results = self._fit_expr[-1].execute().to_dict("records")[0]
         for name in columns:
-            col_std = results[f"{name}_std"]
-            if col_std == 0:
-                raise ValueError(
-                    f"Cannot standardize {name!r} - the standard deviation is zero"
-                )
-            stats[name] = (results[f"{name}_mean"], col_std)
+            stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
 
         self.stats_ = stats
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - center) / scale).name(c)  # type: ignore
+                # for near-constant column, set the scale to 1.0
+                (
+                    (table[c] - center) / (1.0 if abs(scale) < _APPROX_EPS else scale)
+                ).name(c)
                 for c, (center, scale) in self.stats_.items()
             ]
         )
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
index 241646c..ad35217 100644
--- a/tests/test_standardize.py
+++ b/tests/test_standardize.py
@@ -31,19 +31,12 @@ def test_scaleminmax():
     tm.assert_frame_equal(result.execute(), expected, check_exact=False)
 
 
-@pytest.mark.parametrize(
-    ("model", "msg"),
-    [
-        ("ScaleStandard", "Cannot standardize 'col' - the standard deviation is zero"),
-        (
-            "ScaleMinMax",
-            "Cannot standardize 'col' - the maximum and minimum values are equal",
-        ),
-    ],
-)
-def test_scale_unique_col(model, msg):
-    table = ibis.memtable({"col": [1]})
-    scale_class = getattr(ml, model)
-    step = scale_class("col")
-    with pytest.raises(ValueError, match=msg):
-        step.fit_table(table, ml.core.Metadata())
+@pytest.mark.parametrize("scaler", ["ScaleStandard", "ScaleMinMax"])
+def test_constant_columns(scaler):
+    table = ibis.memtable({"int_col": [100], "float_col": [100.0]})
+    scaler_class = getattr(ml, scaler)
+    scale_step = scaler_class(ml.numeric())
+    scale_step.fit_table(table, ml.core.Metadata())
+    result = scale_step.transform_table(table)
+    expected = pd.DataFrame({"int_col": [0.0], "float_col": [0.0]})
+    tm.assert_frame_equal(result.execute(), expected)