diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index d57af5a6..163c40c4 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -28,6 +28,7 @@ ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer, + LogScaler, ) from rdt.transformers.pii.anonymizer import ( AnonymizedFaker, @@ -46,6 +47,7 @@ 'FrequencyEncoder', 'GaussianNormalizer', 'LabelEncoder', + 'LogScaler', 'NullTransformer', 'OneHotEncoder', 'OptimizedTimestampEncoder', diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 12bc345e..6cb3eadb 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -8,7 +8,7 @@ import pandas as pd import scipy -from rdt.errors import TransformerInputError +from rdt.errors import InvalidDataError, TransformerInputError from rdt.transformers.base import BaseTransformer from rdt.transformers.null import NullTransformer from rdt.transformers.utils import learn_rounding_digits @@ -626,3 +626,122 @@ def _reverse_transform(self, data): recovered_data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013 return super()._reverse_transform(recovered_data) + + +class LogScaler(FloatFormatter): + """Transformer for numerical data using log. + + This transformer scales numerical values using log and an optional constant. + + Null values are replaced using a ``NullTransformer``. + + Args: + missing_value_replacement (object): + Indicate what to replace the null values with. If an integer or float is given, + replace them with the given value. If the strings ``'mean'`` or ``'mode'`` + are given, replace them with the corresponding aggregation and if ``'random'`` + replace each null value with a random value in the data range. Defaults to ``mean``. + missing_value_generation (str or None): + The way missing values are being handled. There are three strategies: + + * ``random``: Randomly generates missing values based on the percentage of + missing values. + * ``from_column``: Creates a binary column that describes whether the original + value was missing. Then use it to recreate missing values. + * ``None``: Do nothing with the missing values on the reverse transform. Simply + pass whatever data we get through. + constant (float): + The constant to set as the 0-value for the log-based transform. Defaults to 0 + (do not modify the 0-value of the data). + invert (bool): + Whether to invert the data with respect to the constant value. If False, do not + invert the data (all values will be greater than the constant value). If True, + invert the data (all the values will be less than the constant value). + Defaults to False. + learn_rounding_scheme (bool): + Whether or not to learn what place to round to based on the data seen during ``fit``. + If ``True``, the data returned by ``reverse_transform`` will be rounded to that place. + Defaults to ``False``. 
+ """ + + def __init__( + self, + missing_value_replacement='mean', + missing_value_generation='random', + constant: float = 0.0, + invert: bool = False, + learn_rounding_scheme: bool = False, + ): + if isinstance(constant, (int, float)): + self.constant = constant + else: + raise ValueError('The constant parameter must be a float or int.') + if isinstance(invert, bool): + self.invert = invert + else: + raise ValueError('The invert parameter must be a bool.') + + super().__init__( + missing_value_replacement=missing_value_replacement, + missing_value_generation=missing_value_generation, + learn_rounding_scheme=learn_rounding_scheme, + ) + + def _validate_data(self, data: pd.Series): + column_name = self.get_input_column() + if self.invert: + if not all(data < self.constant): + raise InvalidDataError( + f"Unable to apply a log transform to column '{column_name}' due to constant" + ' being too small.' + ) + else: + if not all(data > self.constant): + raise InvalidDataError( + f"Unable to apply a log transform to column '{column_name}' due to constant" + ' being too large.' + ) + + def _fit(self, data): + super()._fit(data) + data = super()._transform(data) + + if data.ndim > 1: + self._validate_data(data[:, 0]) + else: + self._validate_data(data) + + def _log_transform(self, data): + self._validate_data(data) + + if self.invert: + return np.log(self.constant - data) + else: + return np.log(data - self.constant) + + def _transform(self, data): + data = super()._transform(data) + + if data.ndim > 1: + data[:, 0] = self._log_transform(data[:, 0]) + else: + data = self._log_transform(data) + + return data + + def _reverse_log(self, data): + if self.invert: + return self.constant - np.exp(data) + else: + return np.exp(data) + self.constant + + def _reverse_transform(self, data): + if not isinstance(data, np.ndarray): + data = data.to_numpy() + + if data.ndim > 1: + data[:, 0] = self._reverse_log(data[:, 0]) + else: + data = self._reverse_log(data) + + return super()._reverse_transform(data) diff --git a/tests/integration/test_transformers.py b/tests/integration/test_transformers.py index e58d4f08..25f12b5f 100644 --- a/tests/integration/test_transformers.py +++ b/tests/integration/test_transformers.py @@ -1,5 +1,6 @@ from collections import defaultdict +import numpy as np import pandas as pd import pytest @@ -12,6 +13,8 @@ PRIMARY_SDTYPES = ['boolean', 'categorical', 'datetime', 'numerical'] +INT64_MIN = np.iinfo(np.int64).min + # Additional arguments for transformers TRANSFORMER_ARGS = { 'BinaryEncoder': { @@ -23,6 +26,7 @@ 'FloatFormatter': {'missing_value_generation': 'from_column'}, 'GaussianNormalizer': {'missing_value_generation': 'from_column'}, 'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'}, + 'LogScaler': {'constant': INT64_MIN, 'missing_value_generation': 'from_column'}, } # Mapping of rdt sdtype to dtype diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index e2790f67..6b8c06d9 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -7,6 +7,7 @@ ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer, + LogScaler, ) @@ -560,3 +561,61 @@ def test_out_of_bounds_reverse_transform(self): # Assert assert isinstance(reverse, pd.DataFrame) + + +class TestLogScaler: + def test_learn_rounding(self): + """Test that transformer learns rounding scheme from data.""" + # Setup + data = pd.DataFrame({'test': [1.0, np.nan, 1.5]}) + transformer = 
LogScaler(
+            missing_value_generation=None,
+            missing_value_replacement='mean',
+            learn_rounding_scheme=True,
+        )
+        expected = pd.DataFrame({'test': [1.0, 1.2, 1.5]})
+
+        # Run
+        transformer.fit(data, 'test')
+        transformed = transformer.transform(data)
+        reversed_values = transformer.reverse_transform(transformed)
+
+        # Assert
+        np.testing.assert_array_equal(reversed_values, expected)
+
+    def test_missing_value_generation_from_column(self):
+        """Test from_column missing value generation with nans present."""
+        # Setup
+        data = pd.DataFrame({'test': [1.0, np.nan, 1.5]})
+        transformer = LogScaler(
+            missing_value_generation='from_column',
+            missing_value_replacement='mean',
+        )
+
+        # Run
+        transformer.fit(data, 'test')
+        transformed = transformer.transform(data)
+        reversed_values = transformer.reverse_transform(transformed)
+
+        # Assert
+        np.testing.assert_array_equal(reversed_values, data)
+
+    def test_missing_value_generation_random(self):
+        """Test random missing_value_generation with nans present."""
+        # Setup
+        data = pd.DataFrame({'test': [1.0, np.nan, 1.5, 1.5]})
+        transformer = LogScaler(
+            missing_value_generation='random',
+            missing_value_replacement='mode',
+            invert=True,
+            constant=3.0,
+        )
+        expected = pd.DataFrame({'test': [np.nan, 1.5, 1.5, 1.5]})
+
+        # Run
+        transformer.fit(data, 'test')
+        transformed = transformer.transform(data)
+        reversed_values = transformer.reverse_transform(transformed)
+
+        # Assert
+        np.testing.assert_array_equal(reversed_values, expected)
diff --git a/tests/unit/transformers/test_numerical.py b/tests/unit/transformers/test_numerical.py
index bfcddc8c..004b88c6 100644
--- a/tests/unit/transformers/test_numerical.py
+++ b/tests/unit/transformers/test_numerical.py
@@ -9,12 +9,13 @@
 from copulas import univariate
 from pandas.api.types import is_float_dtype
 
-from rdt.errors import TransformerInputError
+from rdt.errors import InvalidDataError, TransformerInputError
 from rdt.transformers.null import NullTransformer
 from rdt.transformers.numerical import (
     ClusterBasedNormalizer,
     FloatFormatter,
     GaussianNormalizer,
+    LogScaler,
 )
 
 
@@ -1863,3 +1864,336 @@ def test__reverse_transform_missing_value_replacement_missing_value_replacement_
         call_data,
         rtol=1e-1,
     )
+
+
+class TestLogScaler:
+    def test___init__super_attrs(self):
+        """Test super() arguments are properly passed and set as attributes."""
+        ls = LogScaler(
+            missing_value_generation='random',
+            learn_rounding_scheme=False,
+        )
+
+        assert ls.missing_value_replacement == 'mean'
+        assert ls.missing_value_generation == 'random'
+        assert ls.learn_rounding_scheme is False
+
+    def test___init__constant(self):
+        """Test constant parameter is set as an attribute."""
+        # Setup
+        ls_set = LogScaler(constant=2.5)
+        ls_default = LogScaler()
+
+        # Test
+        assert ls_set.constant == 2.5
+        assert ls_default.constant == 0.0
+
+    def test__init__validates_constant(self):
+        """Test __init__ validates the constant parameter."""
+        # Setup
+        message = 'The constant parameter must be a float or int.'
+        # Run and Assert
+        with pytest.raises(ValueError, match=message):
+            LogScaler(constant='2')
+
+        LogScaler(constant=2)
+
+    def test___init__invert(self):
+        """Test invert parameter is set as an attribute."""
+        # Setup
+        ls_set = LogScaler(invert=True)
+        ls_default = LogScaler()
+
+        # Test
+        assert ls_set.invert
+        assert not ls_default.invert
+
+    def test__init__validates_invert(self):
+        """Test __init__ validates the invert parameter."""
+        # Setup
+        message = 'The invert parameter must be a bool.'
+ # Run and Assert + with pytest.raises(ValueError, match=message): + LogScaler(invert=2) + + def test__validate_data(self): + """Test the ``_validate_data`` method""" + # Setup + ls = LogScaler() + ls.columns = ['test_col'] + valid_data = pd.Series([1, 2, 3]) + invalid_data = pd.Series([-1, 2, 4]) + message = ( + "Unable to apply a log transform to column 'test_col' due to constant being too large." + ) + # Run and Assert + ls._validate_data(valid_data) + + with pytest.raises(InvalidDataError, match=message): + ls._validate_data(invalid_data) + + def test__validate_data_invert(self): + """Test the ``_validate_data`` method""" + # Setup + ls = LogScaler(invert=True) + ls.columns = ['test'] + valid_data = pd.Series([-1, -2, -3]) + invalid_data = pd.Series([-1, 2, 4]) + message = ( + "Unable to apply a log transform to column 'test' due to constant being too small." + ) + + # Run and Assert + ls._validate_data(valid_data) + + with pytest.raises(InvalidDataError, match=message): + ls._validate_data(invalid_data) + + @patch('rdt.transformers.LogScaler._validate_data') + def test__fit(self, mock_validate): + """Test the ``_fit`` method.""" + # Setup + data = pd.Series([0.5, np.nan, 1.0]) + ls = LogScaler() + + # Run + ls._fit(data) + + # Assert + mock_validate.assert_called_once() + call_value = mock_validate.call_args_list[0] + np.testing.assert_array_equal(call_value[0][0], np.array([0.5, 0.75, 1.0])) + assert isinstance(ls.null_transformer, NullTransformer) + + @patch('rdt.transformers.LogScaler._validate_data') + def test__fit_from_column(self, mock_validate): + """Test the ``_fit`` method.""" + # Setup + data = pd.Series([0.5, np.nan, 1.0]) + ls = LogScaler(missing_value_generation='from_column') + + # Run + ls._fit(data) + + # Assert + mock_validate.assert_called_once() + call_value = mock_validate.call_args_list[0] + np.testing.assert_array_equal(call_value[0][0], np.array([0.5, 0.75, 1.0])) + assert isinstance(ls.null_transformer, NullTransformer) + + def test__transform(self): + """Test the ``_transform`` method.""" + # Setup + ls = LogScaler() + ls._validate_data = Mock() + ls.null_transformer = NullTransformer( + missing_value_replacement='mean', missing_value_generation='from_column' + ) + data = pd.Series([0.1, 1.0, 2.0], name='test') + ls.null_transformer.fit(data) + expected = np.array([-2.30259, 0, 0.69314]) + + # Run + transformed_data = ls._transform(data) + + # Assert + ls._validate_data.assert_called_once() + call_value = ls._validate_data.call_args_list[0] + np.testing.assert_array_equal(call_value[0][0], np.array([0.1, 1.0, 2.0])) + np.testing.assert_allclose(transformed_data, expected, rtol=1e-3) + + def test__transform_invert(self): + """Test the ``_transform`` method with ``invert=True``""" + # Setup + ls = LogScaler(constant=3.0, invert=True, missing_value_replacement='from_column') + ls._validate_data = Mock() + ls.null_transformer = NullTransformer( + missing_value_replacement='mean', missing_value_generation='from_column' + ) + ls.null_transformer.fit(pd.Series([0.25, 0.5, 0.75], name='test')) + data = pd.Series([0.1, 1.0, 2.0], name='test') + expected = np.array([1.06471, 0.69315, 0]) + + # Run + transformed_data = ls._transform(data) + + # Assert + ls._validate_data.assert_called_once() + call_value = ls._validate_data.call_args_list[0] + np.testing.assert_array_equal(call_value[0][0], np.array([0.1, 1.0, 2.0])) + np.testing.assert_allclose(transformed_data, expected, rtol=1e-3) + + def test__transform_null_values(self): + """Test the ``_transform`` method with 
null values."""
+        # Setup
+        ls = LogScaler()
+        ls._validate_data = Mock()
+        ls.null_transformer = NullTransformer(
+            missing_value_replacement='mean', missing_value_generation='from_column'
+        )
+        data = pd.Series([0.1, 1.0, np.nan], name='test')
+        ls.null_transformer.fit(data)
+        expected = np.array([[-2.30259, 0], [0, 0], [-0.597837, 1]])
+
+        # Run
+        transformed_data = ls._transform(data)
+
+        # Assert
+        ls._validate_data.assert_called_once()
+        np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)
+
+    def test__transform_null_values_invert(self):
+        """Test the ``_transform`` method with null values and ``invert=True``."""
+        # Setup
+        ls = LogScaler(constant=3.0, invert=True, missing_value_replacement='from_column')
+        ls._validate_data = Mock()
+        ls.null_transformer = NullTransformer(
+            missing_value_replacement='mean', missing_value_generation='from_column'
+        )
+        ls.null_transformer.fit(pd.Series([0.25, 0.5, np.nan], name='test'))
+        data = pd.Series([0.1, 1.0, np.nan], name='test')
+        expected = np.array([[1.06471, 0], [0.69315, 0], [0.96508, 1]])
+
+        # Run
+        transformed_data = ls._transform(data)
+
+        # Assert
+        ls._validate_data.assert_called_once()
+        np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)
+
+    def test__transform_invalid_data(self):
+        # Setup
+        ls = LogScaler(missing_value_replacement='from_column')
+        data = pd.Series([-0.1, 1.0, 2.0], name='test')
+        ls.columns = ['test']
+        ls.null_transformer = NullTransformer(
+            missing_value_replacement='mean', missing_value_generation='from_column'
+        )
+        ls.null_transformer.fit(pd.Series([0.25, 0.5, 0.75], name='test'))
+        message = (
+            "Unable to apply a log transform to column 'test' due to constant being too large."
+        )
+
+        # Run and Assert
+        with pytest.raises(InvalidDataError, match=message):
+            ls._transform(data)
+
+    def test__transform_missing_value_generation_is_random(self):
+        """Test the ``_transform`` method.
+
+        Validate that ``_transform`` produces the correct values when ``missing_value_generation``
+        is ``random``.
+        """
+        # Setup
+        data = pd.Series([1.0, 2.0, 1.0])
+        ls = LogScaler()
+        ls.columns = ['test']
+        ls.null_transformer = NullTransformer('mean', missing_value_generation='random')
+
+        # Run
+        ls.null_transformer.fit(data)
+        transformed_data = ls._transform(data)
+
+        # Assert
+        expected = np.array([0, 0.69315, 0])
+        np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)
+
+    def test__reverse_transform(self):
+        """Test the ``_reverse_transform`` method.
+
+        Validate that ``_reverse_transform`` produces the correct values when
+        ``missing_value_generation`` is 'from_column'.
+        """
+        # Setup
+        data = np.array([
+            [0, 0.6931471805599453, 0],
+            [0, 0, 1.0],
+        ]).T
+        expected = pd.Series([1.0, 2.0, np.nan])
+        ls = LogScaler()
+        ls.null_transformer = NullTransformer(
+            missing_value_replacement='mean',
+            missing_value_generation='from_column',
+        )
+
+        # Run
+        ls.null_transformer.fit(expected)
+        transformed_data = ls._reverse_transform(data)
+
+        # Assert
+        np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)
+
+    def test__reverse_transform_invert(self):
+        """Test the ``_reverse_transform`` method.
+
+        Validate that ``_reverse_transform`` produces the correct values when
+        ``missing_value_generation`` is 'from_column'.
+ """ + # Setup + data = pd.DataFrame([ + [1.06471, 0.69315, 0], + [0, 0, 1.0], + ]).T + expected = pd.Series([0.1, 1.0, np.nan]) + ls = LogScaler(constant=3.0, invert=True) + ls.null_transformer = NullTransformer( + missing_value_replacement='mean', + missing_value_generation='from_column', + ) + + # Run + ls.null_transformer.fit(expected) + transformed_data = ls._reverse_transform(data) + + # Assert + np.testing.assert_allclose(transformed_data, expected, rtol=1e-3) + + def test__reverse_transform_missing_value_generation(self): + """Test the ``_reverse_transform`` method. + + Validate that ``_reverse_transform`` produces the correct values when + ``missing_value_generation`` is 'random'. + """ + # Setup + data = np.array([0, 0.6931471805599453, 0]) + expected = pd.Series([1.0, 2.0, 1.0]) + ls = LogScaler() + ls.null_transformer = NullTransformer(None, missing_value_generation='random') + + # Run + ls.null_transformer.fit(expected) + transformed_data = ls._reverse_transform(data) + + # Assert + np.testing.assert_allclose(transformed_data, expected, rtol=1e-3) + + def test__reverse_transform_invert_missing_value_generation(self): + """Test the ``_reverse_transform`` method. + + Validate that ``_reverse_transform`` produces the correct values when + ``missing_value_generation`` is 'random'. + """ + # Setup + data = np.array([1.06471, 0.69315, 0]) + expected = pd.Series([0.1, 1.0, 2.0]) + ls = LogScaler(constant=3.0, invert=True) + ls.null_transformer = NullTransformer(None, missing_value_generation='random') + + # Run + ls.null_transformer.fit(expected) + transformed_data = ls._reverse_transform(data) + + # Assert + np.testing.assert_allclose(transformed_data, expected, rtol=1e-3) + + def test_print(self, capsys): + """Test the class can be printed. GH#883""" + # Setup + transformer = LogScaler() + + # Run + print(transformer) # noqa: T201 `print` found + + # Assert + captured = capsys.readouterr() + assert captured.out == 'LogScaler()\n'
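
As a usage reference, a minimal sketch of the transformer added above (the column name and values here are illustrative, not taken from the changeset); it follows the same ``fit`` / ``transform`` / ``reverse_transform`` flow exercised by the integration tests:

```python
import numpy as np
import pandas as pd

from rdt.transformers import LogScaler

# Illustrative data: every value must be strictly greater than ``constant``
# (or strictly less than it when ``invert=True``), otherwise the transformer
# raises InvalidDataError during fit/transform.
data = pd.DataFrame({'amount': [1.5, 20.0, np.nan, 300.0]})

transformer = LogScaler(
    constant=0.0,
    missing_value_replacement='mean',
    missing_value_generation='from_column',
)
transformer.fit(data, 'amount')

# log(x - constant), plus a null-flag column because of 'from_column'
transformed = transformer.transform(data)

# exp(x) + constant, with null values re-inserted from the flag column
recovered = transformer.reverse_transform(transformed)
```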