diff --git a/HISTORY.md b/HISTORY.md index aa237b594..bd8cdceee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,15 @@ # History +## 0.5.2 - 2021-08-16 + +This release fixes a couple of bugs introduced by the previous release regarding the +`OneHotEncoder` and the `BooleanTransformer`. + +### Issues closed + +* BooleanTransformer.reverse_transform sometimes crashes with TypeError - Issue [#210](https://github.com/sdv-dev/RDT/issues/210) by @katxiao +* OneHotEncoder causing shape misalignment in CopulaGAN, CTGAN, and TVAE - Issue [#208](https://github.com/sdv-dev/RDT/issues/208) by @sarahmish + ## 0.5.1 - 2021-08-11 This release improves the overall performance of the library, both in terms of memory and time consumption. diff --git a/conda/meta.yaml b/conda/meta.yaml index 72f48ad9b..686184910 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = 'rdt' %} -{% set version = '0.5.1' %} +{% set version = '0.5.2.dev1' %} package: name: "{{ name|lower }}" diff --git a/rdt/__init__.py b/rdt/__init__.py index a53b74925..ebea6308a 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.5.1' +__version__ = '0.5.2.dev1' import numpy as np import pandas as pd diff --git a/rdt/transformers/boolean.py b/rdt/transformers/boolean.py index 559154075..57a0cb094 100644 --- a/rdt/transformers/boolean.py +++ b/rdt/transformers/boolean.py @@ -43,7 +43,7 @@ def fit(self, data): if isinstance(data, np.ndarray): data = pd.Series(data) - self.null_transformer = NullTransformer(self.nan, self.null_column) + self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True) self.null_transformer.fit(data) def transform(self, data): @@ -85,4 +85,4 @@ def reverse_transform(self, data): data = pd.Series(data) - return np.round(data).astype('boolean').astype('object') + return np.round(data).clip(0, 1).astype('boolean').astype('object') diff --git
a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index c8ff67a7d..d72cdd83d 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -260,11 +260,11 @@ class OneHotEncodingTransformer(BaseTransformer): """ dummies = None - dummy_na = None - num_dummies = None - dummy_encoded = False - indexer = None - decoder = None + _dummy_na = None + _num_dummies = None + _dummy_encoded = False + _indexer = None + _uniques = None def __init__(self, error_on_unknown=True): self.error_on_unknown = error_on_unknown @@ -297,19 +297,19 @@ def _prepare_data(data): return data def _transform(self, data): - if self.dummy_encoded: - coder = self.indexer - codes = pd.Categorical(data, categories=self.dummies).codes + if self._dummy_encoded: + coder = self._indexer + codes = pd.Categorical(data, categories=self._uniques).codes else: - coder = self.dummies + coder = self._uniques codes = data rows = len(data) - dummies = np.broadcast_to(coder, (rows, self.num_dummies)) - coded = np.broadcast_to(codes, (self.num_dummies, rows)).T + dummies = np.broadcast_to(coder, (rows, self._num_dummies)) + coded = np.broadcast_to(codes, (self._num_dummies, rows)).T array = (coded == dummies).astype(int) - if self.dummy_na: + if self._dummy_na: null = np.zeros((rows, 1), dtype=int) null[pd.isnull(data)] = 1 array = np.append(array, null, axis=1) @@ -328,17 +328,17 @@ def fit(self, data): data = self._prepare_data(data) null = pd.isnull(data) - self.dummy_na = null.any() - self.dummies = list(pd.unique(data[~null])) - self.num_dummies = len(self.dummies) - self.indexer = list(range(self.num_dummies)) - self.decoder = self.dummies.copy() + self._uniques = list(pd.unique(data[~null])) + self._dummy_na = null.any() + self._num_dummies = len(self._uniques) + self._indexer = list(range(self._num_dummies)) + self.dummies = self._uniques.copy() if not np.issubdtype(data.dtype, np.number): - self.dummy_encoded = True + self._dummy_encoded = True - if self.dummy_na: - 
self.decoder.append(np.nan) + if self._dummy_na: + self.dummies.append(np.nan) def transform(self, data): """Replace each category with the OneHot vectors. @@ -375,7 +375,7 @@ def reverse_transform(self, data): data = data.reshape(-1, 1) indices = np.argmax(data, axis=1) - return pd.Series(indices).map(self.decoder.__getitem__) + return pd.Series(indices).map(self.dummies.__getitem__) class LabelEncodingTransformer(BaseTransformer): diff --git a/setup.cfg b/setup.cfg index abe42e8e4..802dc377c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1 +current_version = 0.5.2.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index e1189e60f..3a9f97d9f 100644 --- a/setup.py +++ b/setup.py @@ -101,6 +101,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.5.1', + version='0.5.2.dev1', zip_safe=False, ) diff --git a/tests/integration/transformers/test_boolean.py b/tests/integration/transformers/test_boolean.py new file mode 100644 index 000000000..c57002e54 --- /dev/null +++ b/tests/integration/transformers/test_boolean.py @@ -0,0 +1,83 @@ +import numpy as np +import pandas as pd + +from rdt.transformers import BooleanTransformer + + +class TestBooleanTransformer: + + def test_boolean_some_nans(self): + """Test BooleanTransformer on input with some nan values. + + Ensure that the BooleanTransformer can fit, transform, and reverse + transform on boolean data with Nones. Expect that the reverse + transformed data is the same as the input. 
+ + Input: + - boolean data with None values + Output: + - The reversed transformed data + """ + # Setup + data = pd.Series([True, False, None, False]) + transformer = BooleanTransformer() + + # Run + transformer.fit(data) + transformed = transformer.transform(data) + reverse = transformer.reverse_transform(transformed) + + # Assert + pd.testing.assert_series_equal(reverse, data) + + def test_boolean_all_nans(self): + """Test BooleanTransformer on input with all nan values. + + Ensure that the BooleanTransformer can fit, transform, and reverse + transform on boolean data with all Nones. Expect that the reverse + transformed data is the same as the input. + + Input: + - 4 rows of all None values + Output: + - The reversed transformed data + """ + # Setup + data = pd.Series([None, None, None, None]) + transformer = BooleanTransformer() + + # Run + transformer.fit(data) + transformed = transformer.transform(data) + reverse = transformer.reverse_transform(transformed) + + # Assert + pd.testing.assert_series_equal(reverse, data) + + def test_boolean_input_unchanged(self): + """Test BooleanTransformer on input with some nan values. + + Ensure that the BooleanTransformer can fit, transform, and reverse + transform on boolean data with some Nones. Expect that the intermediate + transformed data is unchanged. + + Input: + - boolean data with None values + Output: + - The reversed transformed data + Side effects: + - The intermediate transformed data is unchanged.
+ """ + # Setup + data = pd.Series([True, False, None, False]) + transformer = BooleanTransformer() + + # Run + transformer.fit(data) + transformed = transformer.transform(data) + unchanged_transformed = transformed.copy() + reverse = transformer.reverse_transform(transformed) + + # Assert + pd.testing.assert_series_equal(reverse, data) + np.testing.assert_array_equal(unchanged_transformed, transformed) diff --git a/tests/performance/test_cases/boolean/BooleanTransformer/default_RandomSkewedBooleanNaNsGenerator_1000_1000.json b/tests/performance/test_cases/boolean/BooleanTransformer/default_RandomSkewedBooleanNaNsGenerator_1000_1000.json index ecb605038..77d35a6cd 100644 --- a/tests/performance/test_cases/boolean/BooleanTransformer/default_RandomSkewedBooleanNaNsGenerator_1000_1000.json +++ b/tests/performance/test_cases/boolean/BooleanTransformer/default_RandomSkewedBooleanNaNsGenerator_1000_1000.json @@ -14,8 +14,8 @@ "memory": 1000000.0 }, "reverse_transform": { - "time": 0.003, + "time": 0.01, "memory": 1000000.0 } } -} \ No newline at end of file +} diff --git a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_ConstantBooleanNaNsGenerator_1000_1000.json b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_ConstantBooleanNaNsGenerator_1000_1000.json index e58dff120..e36120b13 100644 --- a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_ConstantBooleanNaNsGenerator_1000_1000.json +++ b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_ConstantBooleanNaNsGenerator_1000_1000.json @@ -17,8 +17,8 @@ "memory": 1000000.0 }, "reverse_transform": { - "time": 0.002, + "time": 0.01, "memory": 500000.0 } } -} \ No newline at end of file +} diff --git a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomBooleanNaNsGenerator_1000_1000.json b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomBooleanNaNsGenerator_1000_1000.json 
index 2319f1051..3eba2bfd9 100644 --- a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomBooleanNaNsGenerator_1000_1000.json +++ b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomBooleanNaNsGenerator_1000_1000.json @@ -17,8 +17,8 @@ "memory": 1000000.0 }, "reverse_transform": { - "time": 0.002, + "time": 0.01, "memory": 500000.0 } } -} \ No newline at end of file +} diff --git a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomSkewedBooleanNaNsGenerator_1000_1000.json b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomSkewedBooleanNaNsGenerator_1000_1000.json index 93646b508..ac106e390 100644 --- a/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomSkewedBooleanNaNsGenerator_1000_1000.json +++ b/tests/performance/test_cases/boolean/BooleanTransformer/nan_null_default_RandomSkewedBooleanNaNsGenerator_1000_1000.json @@ -17,8 +17,8 @@ "memory": 1000000.0 }, "reverse_transform": { - "time": 0.002, + "time": 0.01, "memory": 500000.0 } } -} \ No newline at end of file +} diff --git a/tests/unit/transformers/test_boolean.py b/tests/unit/transformers/test_boolean.py index ca89ea4f6..d58fce918 100644 --- a/tests/unit/transformers/test_boolean.py +++ b/tests/unit/transformers/test_boolean.py @@ -199,3 +199,54 @@ def test_reverse_transform_2d_ndarray(self): assert isinstance(result, pd.Series) np.testing.assert_equal(result.values, expected) + + def test_reverse_transform_float_values(self): + """Test the ``reverse_transform`` method with decimals. + + Expect that the ``reverse_transform`` method handles decimal inputs + correctly by rounding them. + + Input: + - Transformed data with decimal values. + Output: + - Reversed transformed data. 
+ """ + # Setup + data = np.array([1.2, 0.32, 1.01]) + transformer = Mock() + transformer.nan = None + + # Run + result = BooleanTransformer.reverse_transform(transformer, data) + + # Asserts + expected = np.array([True, False, True]) + + assert isinstance(result, pd.Series) + np.testing.assert_equal(result.values, expected) + + def test_reverse_transform_float_values_out_of_range(self): + """Test the ``reverse_transform`` method with decimals that are out of range. + + Expect that the ``reverse_transform`` method handles decimal inputs + correctly by rounding them. If the rounded decimal inputs are < 0 or > 1, expect + them to be clipped. + + Input: + - Transformed data with decimal values, some of which round to < 0 or > 1. + Output: + - Reversed transformed data. + """ + # Setup + data = np.array([1.9, -0.7, 1.01]) + transformer = Mock() + transformer.nan = None + + # Run + result = BooleanTransformer.reverse_transform(transformer, data) + + # Asserts + expected = np.array([True, False, True]) + + assert isinstance(result, pd.Series) + np.testing.assert_equal(result.values, expected) diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index 8a74517a3..3add852ec 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -515,6 +515,45 @@ def test__prepare_data_pandas_series(self): expected = pd.Series(['a', 'b', 'c']) np.testing.assert_array_equal(out, expected) + def test_fit_dummies_no_nans(self): + """Test the ``fit`` method without nans. + + Check that ``self.dummies`` does not + contain nans. + + Input: + - Series with values + """ + + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = pd.Series(['a', 2, 'c']) + ohet.fit(data) + + # Assert + np.testing.assert_array_equal(ohet.dummies, ['a', 2, 'c']) + + def test_fit_dummies_nans(self): + """Test the ``fit`` method with nans. + + Check that ``self.dummies`` contains ``np.nan``.
+ + Input: + - Series with values + """ + + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = pd.Series(['a', 2, 'c', None]) + ohet.fit(data) + + # Assert + np.testing.assert_array_equal(ohet.dummies, ['a', 2, 'c', np.nan]) + def test_fit_no_nans(self): """Test the ``fit`` method without nans. @@ -535,9 +574,9 @@ def test_fit_no_nans(self): # Assert np.testing.assert_array_equal(ohet.dummies, ['a', 'b', 'c']) - np.testing.assert_array_equal(ohet.decoder, ['a', 'b', 'c']) - assert ohet.dummy_encoded - assert not ohet.dummy_na + np.testing.assert_array_equal(ohet._uniques, ['a', 'b', 'c']) + assert ohet._dummy_encoded + assert not ohet._dummy_na def test_fit_no_nans_numeric(self): """Test the ``fit`` method without nans. @@ -559,9 +598,9 @@ def test_fit_no_nans_numeric(self): # Assert np.testing.assert_array_equal(ohet.dummies, [1, 2, 3]) - np.testing.assert_array_equal(ohet.decoder, [1, 2, 3]) - assert not ohet.dummy_encoded - assert not ohet.dummy_na + np.testing.assert_array_equal(ohet._uniques, [1, 2, 3]) + assert not ohet._dummy_encoded + assert not ohet._dummy_na def test_fit_nans(self): """Test the ``fit`` method with nans. @@ -582,10 +621,10 @@ def test_fit_nans(self): ohet.fit(data) # Assert - np.testing.assert_array_equal(ohet.dummies, ['a', 'b']) - np.testing.assert_array_equal(ohet.decoder, ['a', 'b', np.nan]) - assert ohet.dummy_encoded - assert ohet.dummy_na + np.testing.assert_array_equal(ohet.dummies, ['a', 'b', np.nan]) + np.testing.assert_array_equal(ohet._uniques, ['a', 'b']) + assert ohet._dummy_encoded + assert ohet._dummy_na def test_fit_nans_numeric(self): """Test the ``fit`` method with nans. 
@@ -606,10 +645,10 @@ def test_fit_nans_numeric(self): ohet.fit(data) # Assert - np.testing.assert_array_equal(ohet.dummies, [1, 2]) - np.testing.assert_array_equal(ohet.decoder, [1, 2, np.nan]) - assert not ohet.dummy_encoded - assert ohet.dummy_na + np.testing.assert_array_equal(ohet.dummies, [1, 2, np.nan]) + np.testing.assert_array_equal(ohet._uniques, [1, 2]) + assert not ohet._dummy_encoded + assert ohet._dummy_na def test_fit_single(self): # Setup @@ -636,8 +675,8 @@ def test__transform_no_nan(self): # Setup ohet = OneHotEncodingTransformer() data = pd.Series(['a', 'b', 'c']) - ohet.dummies = ['a', 'b', 'c'] - ohet.num_dummies = 3 + ohet._uniques = ['a', 'b', 'c'] + ohet._num_dummies = 3 # Run out = ohet._transform(data) @@ -665,10 +704,10 @@ def test__transform_no_nan_categorical(self): # Setup ohet = OneHotEncodingTransformer() data = pd.Series(['a', 'b', 'c']) - ohet.dummies = ['a', 'b', 'c'] - ohet.indexer = [0, 1, 2] - ohet.num_dummies = 3 - ohet.dummy_encoded = True + ohet._uniques = ['a', 'b', 'c'] + ohet._indexer = [0, 1, 2] + ohet._num_dummies = 3 + ohet._dummy_encoded = True # Run out = ohet._transform(data) @@ -696,9 +735,9 @@ def test__transform_nans(self): # Setup ohet = OneHotEncodingTransformer() data = pd.Series([np.nan, None, 'a', 'b']) - ohet.dummies = ['a', 'b'] - ohet.dummy_na = True - ohet.num_dummies = 2 + ohet._uniques = ['a', 'b'] + ohet._dummy_na = True + ohet._num_dummies = 2 # Run out = ohet._transform(data) @@ -728,11 +767,11 @@ def test__transform_nans_categorical(self): # Setup ohet = OneHotEncodingTransformer() data = pd.Series([np.nan, None, 'a', 'b']) - ohet.dummies = ['a', 'b'] - ohet.indexer = [0, 1] - ohet.dummy_na = True - ohet.num_dummies = 2 - ohet.dummy_encoded = True + ohet._uniques = ['a', 'b'] + ohet._indexer = [0, 1] + ohet._dummy_na = True + ohet._num_dummies = 2 + ohet._dummy_encoded = True # Run out = ohet._transform(data) @@ -761,8 +800,8 @@ def test__transform_single(self): # Setup ohet = 
OneHotEncodingTransformer() data = pd.Series(['a', 'a', 'a']) - ohet.dummies = ['a'] - ohet.num_dummies = 1 + ohet._uniques = ['a'] + ohet._num_dummies = 1 # Run out = ohet._transform(data) @@ -791,10 +830,10 @@ def test__transform_single_categorical(self): # Setup ohet = OneHotEncodingTransformer() data = pd.Series(['a', 'a', 'a']) - ohet.dummies = ['a'] - ohet.indexer = [0] - ohet.num_dummies = 1 - ohet.dummy_encoded = True + ohet._uniques = ['a'] + ohet._indexer = [0] + ohet._num_dummies = 1 + ohet._dummy_encoded = True # Run out = ohet._transform(data) @@ -822,8 +861,8 @@ def test__transform_zeros(self): # Setup ohet = OneHotEncodingTransformer() pd.Series(['a']) - ohet.dummies = ['a'] - ohet.num_dummies = 1 + ohet._uniques = ['a'] + ohet._num_dummies = 1 # Run out = ohet._transform(pd.Series(['b', 'b', 'b'])) @@ -852,9 +891,9 @@ def test__transform_zeros_categorical(self): # Setup ohet = OneHotEncodingTransformer() pd.Series(['a']) - ohet.dummies = ['a'] - ohet.indexer = [0] - ohet.num_dummies = 1 + ohet._uniques = ['a'] + ohet._indexer = [0] + ohet._num_dummies = 1 ohet.dummy_encoded = True # Run @@ -883,9 +922,9 @@ def test__transform_unknown_nan(self): # Setup ohet = OneHotEncodingTransformer() pd.Series(['a']) - ohet.dummies = ['a'] - ohet.dummy_na = True - ohet.num_dummies = 1 + ohet._uniques = ['a'] + ohet._dummy_na = True + ohet._num_dummies = 1 # Run out = ohet._transform(pd.Series(['b', 'b', np.nan])) @@ -1023,7 +1062,7 @@ def test_transform_numeric(self): out = ohet.transform(data) # Assert - assert not ohet.dummy_encoded + assert not ohet._dummy_encoded np.testing.assert_array_equal(out, expected) def test_reverse_transform_no_nans(self):