Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Making DataFrameMapper compatible with GridSearchCV #170

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import sys
import contextlib
from itertools import chain
from collections import defaultdict

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -37,6 +39,12 @@ def _build_feature(columns, transformers, options={}):
return (columns, _build_transformer(transformers), options)


def _build_feature_name(values):
if isinstance(values, list):
values = '-'.join([str(value) for value in values])
return values


def _get_feature_names(estimator):
"""
Attempt to extract feature names based on a given estimator
Expand All @@ -48,6 +56,10 @@ def _get_feature_names(estimator):
return None


def _get_lowercased_class_name(inst):
return type(inst).__name__.lower()


@contextlib.contextmanager
def add_column_names_to_exception(column_names):
# Stolen from https://stackoverflow.com/a/17677938/356729
Expand Down Expand Up @@ -386,3 +398,57 @@ def fit_transform(self, X, y=None):
y the target vector relative to X, optional
"""
return self._transform(X, y, True)

def get_params(self, deep=True):
    """Return the mapper's parameters, and, when ``deep`` is true, the
    parameters of every nested transformer.

    Nested parameters are exposed under keys of the form
    ``<feature>__<key>`` for a single transformer, or
    ``<feature>__<transformername>__<key>`` for a list of transformers —
    the same addressing scheme ``set_params`` understands.
    """
    out = super(DataFrameMapper, self).get_params(deep=False)
    if not deep:
        return out
    for columns, transformers in out['features']:
        # Normalize list-typed column selectors (e.g. ['colA']) exactly as
        # set_params does, so get_params/set_params keys round-trip.
        feature_name = _build_feature_name(columns)
        if isinstance(transformers, list):
            for transformer in transformers:
                if transformer is None:
                    continue
                # NOTE(review): two transformers of the same class in one
                # list produce colliding keys; the last one wins.
                transformer_name = _get_lowercased_class_name(transformer)
                parameters = transformer.get_params(deep=True)
                for key, value in parameters.items():
                    param_path = '{column}__{transformer}__{key}'.format(
                        column=feature_name,
                        transformer=transformer_name,
                        key=key
                    )
                    out[param_path] = value
        else:
            transformer = transformers
            if transformer is None:
                continue
            for key, value in transformer.get_params(deep=True).items():
                out['%s__%s' % (feature_name, key)] = value
    return out

def set_params(self, **params):
    """Set parameters on the nested transformers.

    Keys are addressed as ``<feature>__<key>`` for a single transformer or
    ``<feature>__<transformername>__<key>`` for a list of transformers,
    mirroring the paths produced by ``get_params``.

    Returns
    -------
    self : DataFrameMapper
        Per sklearn convention, so calls can be chained.
    """
    features = {}
    for column_names, transformers in self.features:
        key = _build_feature_name(column_names)
        features[key] = transformers

    # Parameters grouped per transformer instance, keyed by id() because
    # estimator instances are not reliably hashable by value.
    assignment = defaultdict(dict)

    for key, value in params.items():
        feature_name, _, parameter = key.partition('__')
        if '__' in parameter:
            transformer_name, _, parameter = parameter.partition('__')
            transformers = features[feature_name]
            for transformer in transformers:
                # A feature may map a column to None (no transformation).
                if transformer is None:
                    continue
                class_name = _get_lowercased_class_name(transformer)
                if class_name == transformer_name:
                    assignment[id(transformer)][parameter] = value
        else:
            transformer = features[feature_name]
            assignment[id(transformer)][parameter] = value

    transformers_instances = chain(*[
        x if isinstance(x, list) else [x]
        for name, x in self.features])

    for instance in transformers_instances:
        # Skip None placeholders: calling None.set_params would raise.
        if instance is None:
            continue
        # defaultdict yields {} for untouched instances: a harmless no-op.
        instance.set_params(**assignment[id(instance)])
    return self
144 changes: 144 additions & 0 deletions tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf8 -*-

import pytest
from itertools import product
from pkg_resources import parse_version

# In py3, mock is included with the unittest standard library
Expand All @@ -15,6 +16,7 @@
from scipy import sparse
from sklearn import __version__ as sklearn_version
from sklearn.cross_validation import cross_val_score as sklearn_cv_score
from sklearn.grid_search import GridSearchCV as sklearn_grid_search
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
Expand Down Expand Up @@ -96,6 +98,44 @@ def transform(self, X):
return X - self.min


class NoOpTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer carrying three tunable dummy parameters."""

    def __init__(self, string='', number=0, flag=False):
        # sklearn convention: store constructor arguments verbatim.
        self.string, self.number, self.flag = string, number, flag

    def fit(self, X, y=None):
        # Stateless: nothing to learn.
        return self

    def transform(self, X):
        # Identity: hand the input straight back.
        return X


class Adder(BaseEstimator, TransformerMixin):
    """Transformer that shifts every value by a constant offset."""

    def __init__(self, num_to_add=0):
        self.num_to_add = num_to_add

    def fit(self, X, y=None):
        # No state to learn.
        return self

    def transform(self, X):
        shifted = X + self.num_to_add
        return shifted


class Divider(BaseEstimator, TransformerMixin):
    """Transformer that scales every value down by a constant denominator."""

    def __init__(self, denominator=1):
        self.denominator = denominator

    def fit(self, X, y=None):
        # No state to learn.
        return self

    def transform(self, X):
        scaled = X / self.denominator
        return scaled


@pytest.fixture
def simple_dataframe():
    """Single-column frame 'a' holding the values 1..3."""
    return pd.DataFrame(dict(a=[1, 2, 3]))
Expand Down Expand Up @@ -950,3 +990,107 @@ def test_heterogeneous_output_types_input_df():
dft = M.fit_transform(df)
assert dft['feat1'].dtype == np.dtype('int64')
assert dft['feat2'].dtype == np.dtype('float64')


def test_getting_single_transformer_parameters():
    """
    A mapper wrapping one transformer exposes that transformer's parameters
    through get_params(), namespaced by pipeline step and feature name.
    """
    noop = NoOpTransformer()
    prefix = 'data_frame_mapper__nested_transformer__'
    expected_keys = [prefix + nested_key for nested_key in noop.get_params()]

    mapper = DataFrameMapper([('nested_transformer', noop)], df_out=False)
    pipeline = Pipeline([('data_frame_mapper', mapper)])
    params = pipeline.get_params()

    for key in expected_keys:
        assert key in params


def test_setting_single_transformer_parameters():
    """
    set_params() on a pipeline containing a mapper reaches the nested
    transformer and updates its attributes.
    """
    noop = NoOpTransformer()
    before = noop.get_params()
    pipeline = Pipeline([
        ('mapper', DataFrameMapper([('noop', noop)], df_out=False)),
    ])

    pipeline.set_params(
        mapper__noop__string='string',
        mapper__noop__number=1,
        mapper__noop__flag=True)

    assert noop.get_params() != before
    assert (noop.string, noop.number, noop.flag) == ('string', 1, True)


def test_getting_parameters_from_a_list_of_transformers():
    """
    get_params() exposes every transformer in a per-column transformer list,
    addressed by its lowercased class name.
    """
    pipeline = Pipeline([
        ('mapper', DataFrameMapper([
            ('colA', [Adder(1), Divider(2)]),
            ('colB', [Divider(1), Adder(2)])
        ]))
    ])

    params = pipeline.get_params()

    for column in ('colA', 'colB'):
        for name, attr in (('adder', 'num_to_add'),
                           ('divider', 'denominator')):
            assert 'mapper__%s__%s__%s' % (column, name, attr) in params


def test_setting_parameters_to_a_list_of_transformers():
    """
    set_params() addresses each transformer in a per-column list by its
    lowercased class name and updates only that transformer.
    """
    adder = Adder(1)
    divider = Divider(2)
    pipeline = Pipeline([
        ('mapper',
         DataFrameMapper([('colA', [adder, divider])], df_out=False)),
    ])

    pipeline.set_params(
        mapper__colA__adder__num_to_add=0,
        mapper__colA__divider__denominator=1,
    )

    assert adder.num_to_add == 0
    assert divider.denominator == 1


def test_compliant_with_grid_search(iris_dataframe):
    """
    A DataFrameMapper inside a Pipeline must survive a full GridSearchCV run,
    with grid parameters addressed through the mapper's column names.
    """
    columns = [
        'petal length (cm)',
        'petal width (cm)',
        'sepal length (cm)',
        'sepal width (cm)',
    ]
    pipeline = Pipeline([
        ('mapper', DataFrameMapper(
            [([column], StandardScaler()) for column in columns])),
        ('classifier', SVC(kernel='linear'))
    ])
    param_grid = {
        'mapper__%s__with_mean' % column: [True, False]
        for column in columns
    }
    data = iris_dataframe.drop("species", axis=1)
    labels = iris_dataframe["species"]

    grid_search = sklearn_grid_search(pipeline, param_grid=param_grid)
    grid_search.fit(data, labels)

    # Four binary parameters -> 2**4 = 16 points on the grid.
    assert len(grid_search.grid_scores_) == 2 ** len(param_grid)