From b49543bfe63eb8d48169035e8c4ad59cb6a90a15 Mon Sep 17 00:00:00 2001
From: jitingxu1 <jack9527xu@gmail.com>
Date: Tue, 17 Sep 2024 14:31:11 -0700
Subject: [PATCH] handle near constant column

---
 ibis_ml/steps/_standardize.py | 29 +++++++++++++----------------
 tests/test_standardize.py     | 25 +++++++++----------------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/ibis_ml/steps/_standardize.py b/ibis_ml/steps/_standardize.py
index d4d92f0..7df080f 100644
--- a/ibis_ml/steps/_standardize.py
+++ b/ibis_ml/steps/_standardize.py
@@ -11,6 +11,8 @@
     from collections.abc import Iterable
 
 _DOCS_PAGE_NAME = "standardization"
+# spread (max - min, or std) below this counts as constant; note 10e-7 == 1e-6
+_APPROX_EPS = 10e-7
 
 
 class ScaleMinMax(Step):
@@ -61,21 +63,18 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [expr]
             results = expr.execute().to_dict("records")[0]
             for name in columns:
-                col_max = results[f"{name}_max"]
-                col_min = results[f"{name}_min"]
-                if col_max == col_min:
-                    raise ValueError(
-                        f"Cannot standardize {name!r} - "
-                        "the maximum and minimum values are equal"
-                    )
-                stats[name] = (col_max, col_min)
+                stats[name] = (results[f"{name}_max"], results[f"{name}_min"])
 
         self.stats_ = stats
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - min) / (max - min)).name(c)  # type: ignore
+                # near-constant column (range < _APPROX_EPS): divide by 1.0, not by ~0
+                (
+                    (table[c] - min)
+                    / (1.0 if abs(max - min) < _APPROX_EPS else max - min)
+                ).name(c)
                 for c, (max, min) in self.stats_.items()
             ]
         )
@@ -128,19 +127,17 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
             self._fit_expr = [table.aggregate(aggs)]
             results = self._fit_expr[-1].execute().to_dict("records")[0]
             for name in columns:
-                col_std = results[f"{name}_std"]
-                if col_std == 0:
-                    raise ValueError(
-                        f"Cannot standardize {name!r} - the standard deviation is zero"
-                    )
-                stats[name] = (results[f"{name}_mean"], col_std)
+                stats[name] = (results[f"{name}_mean"], results[f"{name}_std"])
 
         self.stats_ = stats
 
     def transform_table(self, table: ir.Table) -> ir.Table:
         return table.mutate(
             [
-                ((table[c] - center) / scale).name(c)  # type: ignore
+                # near-constant column (std < _APPROX_EPS): divide by 1.0, not by ~0
+                (
+                    (table[c] - center) / (1.0 if abs(scale) < _APPROX_EPS else scale)
+                ).name(c)
                 for c, (center, scale) in self.stats_.items()
             ]
         )
diff --git a/tests/test_standardize.py b/tests/test_standardize.py
index 241646c..ad35217 100644
--- a/tests/test_standardize.py
+++ b/tests/test_standardize.py
@@ -31,19 +31,12 @@ def test_scaleminmax():
     tm.assert_frame_equal(result.execute(), expected, check_exact=False)
 
 
-@pytest.mark.parametrize(
-    ("model", "msg"),
-    [
-        ("ScaleStandard", "Cannot standardize 'col' - the standard deviation is zero"),
-        (
-            "ScaleMinMax",
-            "Cannot standardize 'col' - the maximum and minimum values are equal",
-        ),
-    ],
-)
-def test_scale_unique_col(model, msg):
-    table = ibis.memtable({"col": [1]})
-    scale_class = getattr(ml, model)
-    step = scale_class("col")
-    with pytest.raises(ValueError, match=msg):
-        step.fit_table(table, ml.core.Metadata())
+@pytest.mark.parametrize("scaler", ["ScaleStandard", "ScaleMinMax"])
+def test_constant_columns(scaler):
+    table = ibis.memtable({"int_col": [100], "float_col": [100.0]})  # max == min
+    scaler_class = getattr(ml, scaler)
+    scale_step = scaler_class(ml.numeric())
+    scale_step.fit_table(table, ml.core.Metadata())  # must no longer raise
+    result = scale_step.transform_table(table)
+    expected = pd.DataFrame({"int_col": [0.0], "float_col": [0.0]})  # 0 / 1.0
+    tm.assert_frame_equal(result.execute(), expected)