Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(steps): implement a step to count encode cols #31

Merged
merged 2 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ibisml/select.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import re
from collections.abc import Collection
from collections.abc import Iterable
from typing import TYPE_CHECKING, Callable, ClassVar, Union

import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -73,7 +73,7 @@ def select_columns(self, table: ir.Table, metadata: Metadata) -> list[str]:
]


SelectionType = Union[str, Collection[str], Callable[[ir.Column], bool], Selector]
SelectionType = Union[str, Iterable[str], Callable[[ir.Column], bool], Selector]


def selector(obj: SelectionType) -> Selector:
Expand All @@ -82,7 +82,7 @@ def selector(obj: SelectionType) -> Selector:
return obj
elif isinstance(obj, str):
return cols(obj)
elif isinstance(obj, Collection):
elif isinstance(obj, Iterable):
return cols(*obj)
elif callable(obj):
return where(obj)
Expand Down
3 changes: 2 additions & 1 deletion ibisml/steps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from ibisml.steps.common import Cast, Drop, Mutate, MutateAt
from ibisml.steps.encode import CategoricalEncode, OneHotEncode
from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode
from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode
from ibisml.steps.standardize import ScaleMinMax, ScaleStandard
from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime

__all__ = (
"Cast",
"CategoricalEncode",
"CountEncode",
"Drop",
"ExpandDate",
"ExpandDateTime",
Expand Down
53 changes: 48 additions & 5 deletions ibisml/steps/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from ibisml.core import Metadata, Step
from ibisml.select import SelectionType, selector
from ibisml.steps.impute import FillNA


def _compute_categories(
Expand Down Expand Up @@ -93,7 +94,7 @@ class OneHotEncode(Step):
--------
>>> import ibisml as ml

One-hot encode all string columns
One-hot encode all string columns.

>>> step = ml.OneHotEncode(ml.string())

Expand Down Expand Up @@ -148,6 +149,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
def transform_table(self, table: ir.Table) -> ir.Table:
if not self.categories_:
return table

return table.mutate(
[
(table[col] == cat).cast("int8").name(f"{col}_{cat}")
Expand Down Expand Up @@ -180,7 +182,7 @@ class CategoricalEncode(Step):
--------
>>> import ibisml as ml

Categorical encode all string columns
Categorical encode all string columns.

>>> step = ml.CategoricalEncode(ml.string())

Expand Down Expand Up @@ -237,12 +239,53 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
self.category_tables_ = tables

def transform_table(self, table: ir.Table) -> ir.Table:
    """Join every stored category lookup table onto ``table``.

    For each ``(column, lookup)`` pair in ``self.category_tables_`` the
    lookup is left-joined on ``table[column] == lookup[0]``; afterwards both
    the lookup's first column and the left-hand ``{column}_left`` column are
    dropped. Whatever else the lookup carries stays in the result
    (presumably the encoded values -- confirm against ``fit_table``).

    Parameters
    ----------
    table
        The table to transform.

    Returns
    -------
    ir.Table
        ``table`` unchanged when no lookups are stored, otherwise the result
        of the successive joins.
    """
    lookups = self.category_tables_
    if not lookups:
        # No lookup tables are stored, so hand the input back untouched.
        return table

    result = table
    for column, categories in lookups.items():
        # ``lname`` renames clashing left-hand columns to "<name>_left";
        # the empty ``rname`` is passed through to ibis unchanged.
        merged = result.left_join(
            categories,
            result[column] == categories[0],
            lname="{name}_left",
            rname="",
        )
        result = merged.drop(categories.columns[0], f"{column}_left")

    return result


class CountEncode(Step):
    """A step that count encodes the selected columns.

    Fitting tallies how often each distinct value occurs in every selected
    column. Transforming joins those tallies back onto the table and then
    runs ``FillNA`` with a fill value of 0 over the selected columns.

    Parameters
    ----------
    inputs
        A selection of columns to count encode.

    Examples
    --------
    >>> import ibisml as ml

    Count encode all string columns.

    >>> step = ml.CountEncode(ml.string())
    """

    def __init__(self, inputs: SelectionType) -> None:
        # Normalize whatever selection form was given into a Selector.
        self.inputs = selector(inputs)

    def _repr(self) -> Iterable[tuple[str, Any]]:
        """Yield a single unnamed entry holding this step's input selector."""
        yield "", self.inputs

    def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
        """Compute and store the value counts of every selected column.

        The counts are materialized through PyArrow and wrapped in an
        in-memory table, keyed by column name in ``self.value_counts_``.
        """
        counts_by_column = {}
        for name in self.inputs.select_columns(table, metadata):
            tallies = table[name].value_counts().to_pyarrow()
            counts_by_column[name] = ibis.memtable(tallies)
        self.value_counts_ = counts_by_column

    def transform_table(self, table: ir.Table) -> ir.Table:
        """Join the stored value counts onto ``table`` and fill gaps with 0.

        Returns the table produced by ``FillNA`` after all joins.
        """
        for name, counts in self.value_counts_.items():
            # Clashing left-hand columns are renamed to "left_<name>" so the
            # original column can be dropped together with the join key.
            merged = table.left_join(
                counts,
                table[name] == counts[0],
                lname="left_{name}",
                rname="",
            )
            trimmed = merged.drop(counts.columns[0], f"left_{name}")
            # NOTE(review): presumably ibis maps new name -> old name here,
            # so the "<name>_count" column takes over the original column
            # name -- confirm against the ibis ``Table.rename`` docs.
            table = trimmed.rename({name: f"{name}_count"})

        # Iterating the dict yields its keys, i.e. the encoded column names.
        imputer = FillNA(self.value_counts_, 0)
        imputer.fit_table(table, Metadata())
        return imputer.transform_table(table)
40 changes: 40 additions & 0 deletions tests/test_encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import ibis
import pandas as pd

import ibisml as ml


def test_count_encode():
    """Fit CountEncode on a training table and check the encoded test table."""
    train_times = [
        pd.Timestamp("2016-05-25 13:30:00.023"),
        pd.Timestamp("2016-05-25 13:30:00.023"),
        pd.Timestamp("2016-05-25 13:30:00.030"),
        pd.Timestamp("2016-05-25 13:30:00.041"),
        pd.Timestamp("2016-05-25 13:30:00.048"),
        pd.Timestamp("2016-05-25 13:30:00.049"),
        pd.Timestamp("2016-05-25 13:30:00.072"),
        pd.Timestamp("2016-05-25 13:30:00.075"),
    ]
    # Training frequencies: MSFT x4, GOOG x2, AAPL x1, plus one null.
    train_tickers = ["GOOG", "MSFT", "MSFT", "MSFT", None, "AAPL", "GOOG", "MSFT"]
    t_train = ibis.memtable({"time": train_times, "ticker": train_tickers})

    test_times = [
        pd.Timestamp("2016-05-25 13:30:00.023"),
        pd.Timestamp("2016-05-25 13:30:00.038"),
        pd.Timestamp("2016-05-25 13:30:00.048"),
        pd.Timestamp("2016-05-25 13:30:00.049"),
        pd.Timestamp("2016-05-25 13:30:00.050"),
        pd.Timestamp("2016-05-25 13:30:00.051"),
    ]
    # AMZN never appears in the training data; the last ticker is null.
    test_tickers = ["MSFT", "MSFT", "GOOG", "GOOG", "AMZN", None]
    t_test = ibis.memtable({"time": test_times, "ticker": test_tickers})

    step = ml.CountEncode("ticker")
    step.fit_table(t_train, ml.core.Metadata())
    res = step.transform_table(t_test)

    # Sort by time so the comparison does not depend on backend row order.
    ordered = res.to_pandas().sort_values(by="time")
    assert ordered.ticker.to_list() == [4, 4, 2, 2, 0, 0]
Loading