diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py index f530521..5c3bd75 100644 --- a/sklearn_pandas/dataframe_mapper.py +++ b/sklearn_pandas/dataframe_mapper.py @@ -1,5 +1,7 @@ import sys import contextlib +from itertools import chain +from collections import defaultdict import pandas as pd import numpy as np @@ -37,6 +39,12 @@ def _build_feature(columns, transformers, options={}): return (columns, _build_transformer(transformers), options) +def _build_feature_name(values): + if isinstance(values, list): + values = '-'.join([str(value) for value in values]) + return values + + def _get_feature_names(estimator): """ Attempt to extract feature names based on a given estimator @@ -48,6 +56,10 @@ def _get_feature_names(estimator): return None +def _get_lowercased_class_name(inst): + return type(inst).__name__.lower() + + @contextlib.contextmanager def add_column_names_to_exception(column_names): # Stolen from https://stackoverflow.com/a/17677938/356729 @@ -386,3 +398,57 @@ def fit_transform(self, X, y=None): y the target vector relative to X, optional """ return self._transform(X, y, True) + + def get_params(self, deep=True): + out = super(DataFrameMapper, self).get_params(deep=False) + if not deep: + return out + for feature_name, transformers in out['features']: + if isinstance(transformers, list): + for transformer in transformers: + if transformer is None: + continue + transformer_name = _get_lowercased_class_name(transformer) + parameters = transformer.get_params(deep=True) + for key, value in parameters.items(): + param_path = '{column}__{transformer}__{key}'.format( + column=_build_feature_name(feature_name), + transformer=transformer_name, + key=key + ) + out[param_path] = value + else: + transformer = transformers + if transformer is None: + continue + for key, value in transformer.get_params(deep=True).items(): + out['%s__%s' % (_build_feature_name(feature_name), key)] = value + return out + + def set_params(self, **params): + features = {} + for column_names,
transformers in self.features: + key = _build_feature_name(column_names) + features[key] = transformers + + assignment = defaultdict(dict) + + for key, value in params.items(): + feature_name, _, parameter = key.partition('__') + if '__' in parameter: + transformer_name, _, parameter = parameter.partition('__') + transformers = features[feature_name] + for transformer in transformers: + class_name = _get_lowercased_class_name(transformer) + if class_name == transformer_name: + assignment[id(transformer)][parameter] = value + else: + transformer = features[feature_name] + assignment[id(transformer)][parameter] = value + + transformers_instances = chain(*[ + x if isinstance(x, list) else [x] + for name, x in self.features]) + + for instance in transformers_instances: + instance.set_params(**assignment[id(instance)]) diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 95adcfb..d170a72 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -1,6 +1,7 @@ # -*- coding: utf8 -*- import pytest +from itertools import product from pkg_resources import parse_version # In py3, mock is included with the unittest standard library @@ -15,6 +16,7 @@ from scipy import sparse from sklearn import __version__ as sklearn_version from sklearn.cross_validation import cross_val_score as sklearn_cv_score +from sklearn.grid_search import GridSearchCV as sklearn_grid_search from sklearn.datasets import load_iris from sklearn.pipeline import Pipeline from sklearn.svm import SVC @@ -96,6 +98,44 @@ def transform(self, X): return X - self.min +class NoOpTransformer(BaseEstimator, TransformerMixin): + + def __init__(self, string='', number=0, flag=False): + self.string = string + self.number = number + self.flag = flag + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X + + +class Adder(BaseEstimator, TransformerMixin): + + def __init__(self, num_to_add=0): + self.num_to_add = num_to_add + + def fit(self, X, 
y=None): + return self + + def transform(self, X): + return X + self.num_to_add + + +class Divider(BaseEstimator, TransformerMixin): + + def __init__(self, denominator=1): + self.denominator = denominator + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X / self.denominator + + @pytest.fixture def simple_dataframe(): return pd.DataFrame({'a': [1, 2, 3]}) @@ -950,3 +990,107 @@ def test_heterogeneous_output_types_input_df(): dft = M.fit_transform(df) assert dft['feat1'].dtype == np.dtype('int64') assert dft['feat2'].dtype == np.dtype('float64') + + +def test_getting_single_transformer_parameters(): + """ + Tests that a data frame mapper with a single transformer exposes its + parameters via get_params() method. + """ + noop = NoOpTransformer() + nested_keys = list(noop.get_params().keys()) + step_name = 'data_frame_mapper' + transformer_name = 'nested_transformer' + expected_keys = [ + '{step_name}__{transformer_name}__{key}'.format( + step_name=step_name, + transformer_name=transformer_name, + key=nested_key) + for nested_key in nested_keys] + + mapper = DataFrameMapper([(transformer_name, noop)], df_out=False) + pipeline = Pipeline([(step_name, mapper)]) + params = pipeline.get_params() + + assert all([key in params for key in expected_keys]) + + +def test_setting_single_transformer_parameters(): + """ + Tests that a data frame mapper with a single transformer correctly assigns + parameters to the transformer when the set_params() method is called. 
+ """ + noop = NoOpTransformer() + old_parameters = noop.get_params() + mapper = DataFrameMapper([('noop', noop)], df_out=False) + pipeline = Pipeline([('mapper', mapper)]) + + pipeline.set_params( + mapper__noop__string='string', + mapper__noop__number=1, + mapper__noop__flag=True) + + assert old_parameters != noop.get_params() + assert noop.string == 'string' + assert noop.number == 1 + assert noop.flag + + +def test_getting_parameters_from_a_list_of_transformers(): + expected_keys = [ + 'mapper__{column}__{name}__{value}'.format( + column=column, name=name, value=value) + for column, (name, value) in product( + ('colA', 'colB'), + (('adder', 'num_to_add'), ('divider', 'denominator')) + ) + ] + mapper = Pipeline([ + ('mapper', DataFrameMapper([ + ('colA', [Adder(1), Divider(2)]), + ('colB', [Divider(1), Adder(2)]) + ])) + ]) + + params = mapper.get_params() + + assert all([key in params for key in expected_keys]) + + +def test_setting_parameters_to_a_list_of_transformers(): + transformers = adder, divider = Adder(1), Divider(2) + mapper = DataFrameMapper([('colA', list(transformers))], df_out=False) + pipeline = Pipeline([('mapper', mapper)]) + + pipeline.set_params( + mapper__colA__adder__num_to_add=0, + mapper__colA__divider__denominator=1 + ) + + assert adder.num_to_add == 0 + assert divider.denominator == 1 + + +def test_compliant_with_grid_search(iris_dataframe): + pipeline = Pipeline([ + ('mapper', DataFrameMapper([ + (['petal length (cm)'], StandardScaler()), + (['petal width (cm)'], StandardScaler()), + (['sepal length (cm)'], StandardScaler()), + (['sepal width (cm)'], StandardScaler()), + ])), + ('classifier', SVC(kernel='linear')) + ]) + param_grid = { + 'mapper__petal length (cm)__with_mean': [True, False], + 'mapper__petal width (cm)__with_mean': [True, False], + 'mapper__sepal length (cm)__with_mean': [True, False], + 'mapper__sepal width (cm)__with_mean': [True, False] + } + data = iris_dataframe.drop("species", axis=1) + labels = 
iris_dataframe["species"] + + grid_search = sklearn_grid_search(pipeline, param_grid=param_grid) + grid_search.fit(data, labels) + + assert len(grid_search.grid_scores_) == 2**len(param_grid)