From c2a4b70400a3ab55ab8a704239bd498cc345d438 Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:45:27 -0700 Subject: [PATCH 1/7] polynomial features --- ibisml/steps/__init__.py | 2 + ibisml/steps/feature_engineering.py | 81 +++++++++++++++++++++++++++++ tests/test_feature_engineering.py | 19 +++++++ 3 files changed, 102 insertions(+) create mode 100644 ibisml/steps/feature_engineering.py create mode 100644 tests/test_feature_engineering.py diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py index 8b6d7c7..78ed7b6 100644 --- a/ibisml/steps/__init__.py +++ b/ibisml/steps/__init__.py @@ -1,5 +1,6 @@ from ibisml.steps.common import Cast, Drop, Mutate, MutateAt from ibisml.steps.encode import CategoricalEncode, CountEncode, OneHotEncode +from ibisml.steps.feature_engineering import PolynomialFeatures from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode from ibisml.steps.standardize import ScaleMinMax, ScaleStandard from ibisml.steps.temporal import ExpandDate, ExpandDateTime, ExpandTime @@ -19,6 +20,7 @@ "Mutate", "MutateAt", "OneHotEncode", + "PolynomialFeatures", "ScaleMinMax", "ScaleStandard", ) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py new file mode 100644 index 0000000..0a48e78 --- /dev/null +++ b/ibisml/steps/feature_engineering.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +from typing import Any, Iterable +from itertools import combinations_with_replacement + +import ibis.expr.types as ir + +from ibisml.core import Metadata, Step +from ibisml.select import SelectionType, selector + + +class PolynomialFeatures(Step): + """A step for generating polynomial features. + + Parameters + ---------- + inputs + A selection of columns to generate polynomial features. + All columns must be numeric. + degree : int, default `2` + The maximum degree of polynomial features to generate. + + Examples + -------- + >>> import ibisml as ml + + Generate polynomial features for all numeric columns with a degree is 2. + + >>> step = ml.PolynomialFeatures(ml.numeric(), 2) + + Generate polynomial features a specific set of columns. + + >>> step = ml.PolynomialFeatures(["x", "y"], 2) + """ + + def __init__( + self, + inputs: SelectionType, + *, + degree: int = 2, + ): + if degree < 2: + raise ValueError("Degree must be greater than 1") + + self.inputs = selector(inputs) + self.degree = degree + + def _repr(self) -> Iterable[tuple[str, Any]]: + yield ("", self.inputs) + yield ("degree", self.degree) + + def fit_table(self, table: ir.Table, metadata: Metadata) -> None: + columns = self.inputs.select_columns(table, metadata) + + non_numeric_cols = [ + col for col in columns if not isinstance(table[col], ir.NumericColumn) + ] + if non_numeric_cols: + raise ValueError( + "Cannot fit polynomial features step: " + f"{[c for c in non_numeric_cols]} is not numeric" + ) + combinations = [] + for d in range(2, self.degree + 1): + combinations.extend(combinations_with_replacement(columns, d)) + self.combinations_ = combinations + + def transform_table(self, table: ir.Table) -> ir.Table: + + expressions = [] + for combination in self.combinations_: + expression = 1 + for column in combination: + expression *= table[column] + expressions.append(expression.name(f"poly_{'_'.join(column for column in combination)}")) + + return table.mutate(**{exp.get_name(): exp for exp in expressions}) + + + + diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py new file mode 100644 index 0000000..393761b --- /dev/null +++ b/tests/test_feature_engineering.py @@ -0,0 +1,19 @@ +import ibis +from ibis import _ + +import ibisml as ml + + +def test_polynomial_features(): + t = ibis.table({"x": "int", "y": "float", "z": "string"}) + step = ml.PolynomialFeatures(ml.numeric(), degree=2) + step.fit_table(t, ml.core.Metadata()) + res = step.transform_table(t) + sol = t.mutate( + poly_x_x=_.x * 1 * _.x, + poly_x_y=_.x * 1 * _.y, + poly_y_y=_.y * 1 * _.y + ) + assert step.is_fitted() + assert set(res.columns) == set(sol.columns) + assert res.equals(sol) \ No newline at end of file From 9ac6d793907008bec6f12ad06f12063a544edde3 Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:49:12 -0700 Subject: [PATCH 2/7] lint --- ibisml/steps/feature_engineering.py | 24 ++++++++---------------- tests/test_feature_engineering.py | 6 ++---- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py index 0a48e78..a9c1995 100644 --- a/ibisml/steps/feature_engineering.py +++ b/ibisml/steps/feature_engineering.py @@ -1,7 +1,7 @@ from __future__ import annotations -from typing import Any, Iterable from itertools import combinations_with_replacement +from typing import Any, Iterable import ibis.expr.types as ir @@ -15,7 +15,7 @@ class PolynomialFeatures(Step): Parameters ---------- inputs - A selection of columns to generate polynomial features. + A selection of columns to generate polynomial features. All columns must be numeric. degree : int, default `2` The maximum degree of polynomial features to generate. @@ -33,15 +33,10 @@ class PolynomialFeatures(Step): >>> step = ml.PolynomialFeatures(["x", "y"], 2) """ - def __init__( - self, - inputs: SelectionType, - *, - degree: int = 2, - ): + def __init__(self, inputs: SelectionType, *, degree: int = 2): if degree < 2: raise ValueError("Degree must be greater than 1") - + self.inputs = selector(inputs) self.degree = degree @@ -66,16 +61,13 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None: self.combinations_ = combinations def transform_table(self, table: ir.Table) -> ir.Table: - expressions = [] for combination in self.combinations_: expression = 1 for column in combination: expression *= table[column] - expressions.append(expression.name(f"poly_{'_'.join(column for column in combination)}")) - - return table.mutate(**{exp.get_name(): exp for exp in expressions}) - - - + expressions.append( + expression.name(f"poly_{'_'.join(column for column in combination)}") + ) + return table.mutate(**{exp.get_name(): exp for exp in expressions}) diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 393761b..ba827e5 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -10,10 +10,8 @@ def test_polynomial_features(): step.fit_table(t, ml.core.Metadata()) res = step.transform_table(t) sol = t.mutate( - poly_x_x=_.x * 1 * _.x, - poly_x_y=_.x * 1 * _.y, - poly_y_y=_.y * 1 * _.y + poly_x_x=_.x * 1 * _.x, poly_x_y=_.x * 1 * _.y, poly_y_y=_.y * 1 * _.y ) assert step.is_fitted() assert set(res.columns) == set(sol.columns) - assert res.equals(sol) \ No newline at end of file + assert res.equals(sol) From 896088db3f201f2f44ceaa26a2891ec4eba3c20f Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:57:41 -0700 Subject: [PATCH 3/7] lint --- ibisml/steps/feature_engineering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py index a9c1995..592d5ca 100644 --- a/ibisml/steps/feature_engineering.py +++ b/ibisml/steps/feature_engineering.py @@ -53,7 +53,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None: if non_numeric_cols: raise ValueError( "Cannot fit polynomial features step: " - f"{[c for c in non_numeric_cols]} is not numeric" + f"{non_numeric_cols} is not numeric" ) combinations = [] for d in range(2, self.degree + 1): From 391b577979fad435d9a5f62ef0cfff9e9bb2cd05 Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Tue, 9 Apr 2024 20:02:02 -0700 Subject: [PATCH 4/7] change to better format of poly --- ibisml/steps/feature_engineering.py | 43 +++++++++++++++++------------ tests/test_feature_engineering.py | 30 ++++++++++++++------ 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py index 592d5ca..e8b5b9e 100644 --- a/ibisml/steps/feature_engineering.py +++ b/ibisml/steps/feature_engineering.py @@ -1,5 +1,8 @@ from __future__ import annotations +import functools +import operator +from collections import Counter from itertools import combinations_with_replacement from typing import Any, Iterable @@ -47,27 +50,31 @@ def _repr(self) -> Iterable[tuple[str, Any]]: def fit_table(self, table: ir.Table, metadata: Metadata) -> None: columns = self.inputs.select_columns(table, metadata) - non_numeric_cols = [ - col for col in columns if not isinstance(table[col], ir.NumericColumn) - ] - if non_numeric_cols: - raise ValueError( - "Cannot fit polynomial features step: " - f"{non_numeric_cols} is not numeric" - ) + for col_name in columns: + if not isinstance(table[col_name], ir.NumericColumn): + raise ValueError( + f"Cannot calculate polynomial features of {col_name!r} - " + "this column is not numeric" + ) combinations = [] for d in range(2, self.degree + 1): - combinations.extend(combinations_with_replacement(columns, d)) + combinations.extend( + [ + dict(Counter(comb)) + for comb in combinations_with_replacement(columns, d) + ] + ) self.combinations_ = combinations def transform_table(self, table: ir.Table) -> ir.Table: - expressions = [] - for combination in self.combinations_: - expression = 1 - for column in combination: - expression *= table[column] - expressions.append( - expression.name(f"poly_{'_'.join(column for column in combination)}") + expressions = [ + functools.reduce( + operator.mul, + [ + operator.pow(table[col], p) if p > 1 else table[col] + for col, p in combination.items() + ], ) - - return table.mutate(**{exp.get_name(): exp for exp in expressions}) + for combination in self.combinations_ + ] + return table.mutate(*expressions) diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index ba827e5..b4eab17 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,17 +1,29 @@ +import operator + import ibis +import pytest from ibis import _ import ibisml as ml -def test_polynomial_features(): - t = ibis.table({"x": "int", "y": "float", "z": "string"}) +@pytest.fixture() +def train_table(): + N = 100 + return ibis.memtable({"x": list(range(N)), "y": [10] * N, "z": ["s"] * N}) + + +def test_PolynomialFeatures(train_table): step = ml.PolynomialFeatures(ml.numeric(), degree=2) - step.fit_table(t, ml.core.Metadata()) - res = step.transform_table(t) - sol = t.mutate( - poly_x_x=_.x * 1 * _.x, poly_x_y=_.x * 1 * _.y, poly_y_y=_.y * 1 * _.y + step.fit_table(train_table, ml.core.Metadata()) + result_table = step.transform_table(train_table) + sol = train_table.mutate( + operator.pow(_.x, 2), operator.mul(_.x, _.y), operator.pow(_.y, 2) ) - assert step.is_fitted() - assert set(res.columns) == set(sol.columns) - assert res.equals(sol) + assert sol.equals(result_table) + # Check if the transformed table has the expected data + for col_name in sol.columns: + assert ( + sol[col_name].execute().tolist() + == result_table[col_name].execute().tolist() + ) From 221233a4da978b3e8cd6b6cf6f15ba82e7fb1419 Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Tue, 9 Apr 2024 23:10:35 -0700 Subject: [PATCH 5/7] add name of new col --- ibisml/steps/feature_engineering.py | 16 ++++++++++------ tests/test_feature_engineering.py | 7 +++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py index e8b5b9e..06db4a1 100644 --- a/ibisml/steps/feature_engineering.py +++ b/ibisml/steps/feature_engineering.py @@ -66,15 +66,19 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None: ) self.combinations_ = combinations - def transform_table(self, table: ir.Table) -> ir.Table: - expressions = [ - functools.reduce( + def transform_table(self, table): + expressions = {} + for combination in self.combinations_: + exp = functools.reduce( operator.mul, [ operator.pow(table[col], p) if p > 1 else table[col] for col, p in combination.items() ], ) - for combination in self.combinations_ - ] - return table.mutate(*expressions) + name = "poly_" + "_".join( + f"{col}^{p}" if p > 1 else f"{col}" for col, p in combination.items() + ) + expressions[name] = exp + + return table.mutate(**expressions) diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index b4eab17..ce48d24 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -2,7 +2,6 @@ import ibis import pytest -from ibis import _ import ibisml as ml @@ -18,7 +17,11 @@ def test_PolynomialFeatures(train_table): step.fit_table(train_table, ml.core.Metadata()) result_table = step.transform_table(train_table) sol = train_table.mutate( - operator.pow(_.x, 2), operator.mul(_.x, _.y), operator.pow(_.y, 2) + **{ + "poly_x^2": operator.pow(train_table.x, 2), + "poly_x_y": operator.mul(train_table.x, train_table.y), + "poly_y^2": operator.pow(train_table.y, 2), + } ) assert sol.equals(result_table) # Check if the transformed table has the expected data From dd6411f86bf0877f5391eea3acfcc82735174d6f Mon Sep 17 00:00:00 2001 From: Jiting Xu <126802425+jitingxu1@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:17:35 -0700 Subject: [PATCH 6/7] resolve comments 2 --- ibisml/steps/feature_engineering.py | 9 +++------ tests/test_feature_engineering.py | 21 ++++++++------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/ibisml/steps/feature_engineering.py b/ibisml/steps/feature_engineering.py index 06db4a1..fc95f56 100644 --- a/ibisml/steps/feature_engineering.py +++ b/ibisml/steps/feature_engineering.py @@ -59,10 +59,7 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None: combinations = [] for d in range(2, self.degree + 1): combinations.extend( - [ - dict(Counter(comb)) - for comb in combinations_with_replacement(columns, d) - ] + [Counter(comb) for comb in combinations_with_replacement(columns, d)] ) self.combinations_ = combinations @@ -72,12 +69,12 @@ def transform_table(self, table): exp = functools.reduce( operator.mul, [ - operator.pow(table[col], p) if p > 1 else table[col] + table[col] ** p if p > 1 else table[col] for col, p in combination.items() ], ) name = "poly_" + "_".join( - f"{col}^{p}" if p > 1 else f"{col}" for col, p in combination.items() + f"{col}^{p}" if p > 1 else col for col, p in combination.items() ) expressions[name] = exp diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index ce48d24..5040fc7 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -1,6 +1,5 @@ -import operator - import ibis +import pandas.testing as tm import pytest import ibisml as ml @@ -15,18 +14,14 @@ def train_table(): def test_PolynomialFeatures(train_table): step = ml.PolynomialFeatures(ml.numeric(), degree=2) step.fit_table(train_table, ml.core.Metadata()) - result_table = step.transform_table(train_table) - sol = train_table.mutate( + result = step.transform_table(train_table) + expected = train_table.mutate( **{ - "poly_x^2": operator.pow(train_table.x, 2), - "poly_x_y": operator.mul(train_table.x, train_table.y), - "poly_y^2": operator.pow(train_table.y, 2), + "poly_x^2": train_table.x**2, + "poly_x_y": train_table.x * train_table.y, + "poly_y^2": train_table.y**2, } ) - assert sol.equals(result_table) + assert expected.equals(result) # Check if the transformed table has the expected data - for col_name in sol.columns: - assert ( - sol[col_name].execute().tolist() - == result_table[col_name].execute().tolist() - ) + tm.assert_frame_equal(result.execute(), expected.execute()) From fcd6fb9229403d949c047c52b719074eebb14998 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 16 Apr 2024 13:03:12 -0600 Subject: [PATCH 7/7] Remove extraneous structural check in Ibis --- tests/test_feature_engineering.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_feature_engineering.py b/tests/test_feature_engineering.py index 5040fc7..5d007f3 100644 --- a/tests/test_feature_engineering.py +++ b/tests/test_feature_engineering.py @@ -22,6 +22,5 @@ def test_PolynomialFeatures(train_table): "poly_y^2": train_table.y**2, } ) - assert expected.equals(result) # Check if the transformed table has the expected data tm.assert_frame_equal(result.execute(), expected.execute())