ibis-project · deepyaman · Mar 21, 2024 · Mar 16, 2024 · Mar 16, 2024
diff --git a/ibisml/select.py b/ibisml/select.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from collections.abc import Collection
+from collections.abc import Iterable
 from typing import TYPE_CHECKING, Callable, ClassVar, Union
 
 import ibis.expr.datatypes as dt
@@ -73,7 +73,7 @@ def select_columns(self, table: ir.Table, metadata: Metadata) -> list[str]:
         ]
 
 
-SelectionType = Union[str, Collection[str], Callable[[ir.Column], bool], Selector]
+SelectionType = Union[str, Iterable[str], Callable[[ir.Column], bool], Selector]
 
 
 def selector(obj: SelectionType) -> Selector:
@@ -82,7 +82,7 @@ def selector(obj: SelectionType) -> Selector:
         return obj
     elif isinstance(obj, str):
         return cols(obj)
-    elif isinstance(obj, Collection):
+    elif isinstance(obj, Iterable):
         return cols(*obj)
     elif callable(obj):
         return where(obj)

diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py
@@ -1,12 +1,13 @@
 from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
-from ibisml.steps.encode import CategoricalEncode, OneHotEncode
+from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
 from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
 from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
 from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime
 
 __all__ = (
     "Cast",
     "CategoricalEncode",
+    "CountEncode",
     "Drop",
     "ExpandDate",
     "ExpandDateTime",

diff --git a/ibisml/steps/encode.py b/ibisml/steps/encode.py
@@ -9,6 +9,7 @@
 
 from ibisml.core import Metadata, Step
 from ibisml.select import SelectionType, selector
+from ibisml.steps.impute import FillNA
 
 
 def _compute_categories(
@@ -93,7 +94,7 @@ class OneHotEncode(Step):
     --------
     >>> import ibisml as ml
 
-    One-hot encode all string columns
+    One-hot encode all string columns.
 
     >>> step = ml.OneHotEncode(ml.string())
 
@@ -148,6 +149,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
     def transform_table(self, table: ir.Table) -> ir.Table:
         if not self.categories_:
             return table
+
         return table.mutate(
             [
                 (table[col] == cat).cast("int8").name(f"{col}_{cat}")
@@ -180,7 +182,7 @@ class CategoricalEncode(Step):
     --------
     >>> import ibisml as ml
 
-    Categorical encode all string columns
+    Categorical encode all string columns.
 
     >>> step = ml.CategoricalEncode(ml.string())
 
@@ -237,12 +239,53 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
         self.category_tables_ = tables
 
     def transform_table(self, table: ir.Table) -> ir.Table:
+        # Join every fitted column against its lookup table in turn; with no
+        # fitted columns the loop does nothing and ``table`` is returned as is.
-        if not self.category_tables_:
-            return table
-
         for col, lookup in self.category_tables_.items():
+            # ``lname`` renames clashing columns of ``table`` to ``<name>_left``;
+            # ``rname=""`` leaves the lookup's column names untouched.
             joined = table.left_join(
                 lookup, table[col] == lookup[0], lname="{name}_left", rname=""
             )
+            # Drop the lookup's key column and the renamed original column.
+            # NOTE(review): presumably the lookup's remaining column is named
+            # ``col`` and holds the encoded value -- confirm against fit_table.
             table = joined.drop(lookup.columns[0], f"{col}_left")
+
         return table
+
+
+class CountEncode(Step):
+    """A step for count encoding select columns.
+
+    Each selected column is replaced, under the same name, by the number of
+    times its value occurred in the table passed to ``fit_table``. Values that
+    find no match in the fitted counts (e.g. unseen values and nulls) are
+    encoded as 0.
+
+    Parameters
+    ----------
+    inputs
+        A selection of columns to count encode.
+
+    Examples
+    --------
+    >>> import ibisml as ml
+
+    Count encode all string columns.
+
+    >>> step = ml.CountEncode(ml.string())
+    """
+
+    def __init__(self, inputs: SelectionType) -> None:
+        self.inputs = selector(inputs)
+
+    def _repr(self) -> Iterable[tuple[str, Any]]:
+        yield ("", self.inputs)
+
+    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
+        """Compute and store one value-count lookup table per selected column."""
+        columns = self.inputs.select_columns(table, metadata)
+        # Materialize the counts so the lookups no longer depend on ``table``.
+        self.value_counts_ = {
+            c: ibis.memtable(table[c].value_counts().to_pyarrow()) for c in columns
+        }
+
+    def transform_table(self, table: ir.Table) -> ir.Table:
+        """Replace every fitted column of ``table`` with its fitted counts."""
+        for c, value_counts in self.value_counts_.items():
+            # Read both column names from the lookup itself instead of
+            # assuming how ``value_counts()`` names its count column.
+            key_name, count_name = value_counts.columns
+            joined = table.left_join(
+                value_counts,
+                table[c] == value_counts[key_name],
+                lname="left_{name}",
+                rname="",
+            )
+            # ``rename`` maps new name -> old name: the count column takes
+            # over the name of the column it replaces.
+            table = joined.drop(key_name, f"left_{c}").rename({c: count_name})
+
+        # Rows without a match in the lookup come out of the left join as
+        # null; encode those as a count of 0.
+        fillna = FillNA(list(self.value_counts_), 0)
+        fillna.fit_table(table, Metadata())
+        return fillna.transform_table(table)
diff --git a/tests/test_encode.py b/tests/test_encode.py
@@ -0,0 +1,40 @@
+import ibis
+import pandas as pd
+
+import ibisml as ml
+
+
+def test_count_encode():
+    """Fit ``CountEncode`` on one table and check the counts applied to another."""
+    train_times = [
+        pd.Timestamp("2016-05-25 13:30:00.023"),
+        pd.Timestamp("2016-05-25 13:30:00.023"),
+        pd.Timestamp("2016-05-25 13:30:00.030"),
+        pd.Timestamp("2016-05-25 13:30:00.041"),
+        pd.Timestamp("2016-05-25 13:30:00.048"),
+        pd.Timestamp("2016-05-25 13:30:00.049"),
+        pd.Timestamp("2016-05-25 13:30:00.072"),
+        pd.Timestamp("2016-05-25 13:30:00.075"),
+    ]
+    train_tickers = ["GOOG", "MSFT", "MSFT", "MSFT", None, "AAPL", "GOOG", "MSFT"]
+    t_train = ibis.memtable({"time": train_times, "ticker": train_tickers})
+
+    test_times = [
+        pd.Timestamp("2016-05-25 13:30:00.023"),
+        pd.Timestamp("2016-05-25 13:30:00.038"),
+        pd.Timestamp("2016-05-25 13:30:00.048"),
+        pd.Timestamp("2016-05-25 13:30:00.049"),
+        pd.Timestamp("2016-05-25 13:30:00.050"),
+        pd.Timestamp("2016-05-25 13:30:00.051"),
+    ]
+    test_tickers = ["MSFT", "MSFT", "GOOG", "GOOG", "AMZN", None]
+    t_test = ibis.memtable({"time": test_times, "ticker": test_tickers})
+
+    encoder = ml.CountEncode("ticker")
+    encoder.fit_table(t_train, ml.core.Metadata())
+    encoded = encoder.transform_table(t_test).to_pandas()
+
+    # MSFT occurs 4 times and GOOG twice in the training data; AMZN (unseen)
+    # and the null ticker are expected to come back as 0.
+    counts_by_time = encoded.sort_values(by="time").ticker.to_list()
+    assert counts_by_time == [4, 4, 2, 2, 0, 0]