From abe86ac7bc4e99aa4ebdf1c403d1b6498bbf2738 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 5 Oct 2020 18:12:35 +0200 Subject: [PATCH 01/28] =?UTF-8?q?Bump=20version:=200.2.6=20=E2=86=92=200.2?= =?UTF-8?q?.7.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index ed43d66f6..63b1cd408 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.6' +__version__ = '0.2.7.dev0' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index a21d7c635..2ab123819 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.6 +current_version = 0.2.7.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 09c644fbd..a08cdfb9a 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.6', + version='0.2.7.dev0', zip_safe=False, ) From 91d4685f3cb64dd63aa6fb7ebf51386a9efd7c5d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 15 Oct 2020 19:59:12 +0200 Subject: [PATCH 02/28] Add option to drop constant parts of the datetimes (#131) * Add option to strip constant parts of the datetimes * Drop py35 support --- .github/workflows/tests.yml | 2 +- .travis.yml | 1 - rdt/transformers/datetime.py | 35 +++++- rdt/transformers/null.py | 3 + setup.py | 3 +- tests/transformers/test_datetime.py | 171 ++++------------------------ tox.ini | 4 +- 7 files changed, 60 insertions(+), 159 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a5c5f7283..607a8e1e2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest, macos-latest] steps: diff --git a/.travis.yml b/.travis.yml index 388b7d3a1..ecfa96045 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,6 @@ python: - 3.8 - 3.7 - 3.6 - - 3.5 # Command to install dependencies install: pip install -U tox-travis codecov diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 8988e07b8..162b0067c 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -26,23 +26,45 @@ class DatetimeTransformer(BaseTransformer): If ``True``, always create the new column whether there are null values or not. If ``False``, do not create the new column. Defaults to ``None``. + strip_constant (bool): + Whether to optimize the output values by finding the smallest time unit that + is not zero on the training datetimes and dividing the generated numerical + values by the value of the next smallest time unit. This, a part from reducing the + orders of magnitued of the transformed values, ensures that reverted values always + are zero on the lower time units. """ null_transformer = None + divider = None - def __init__(self, nan='mean', null_column=None): + def __init__(self, nan='mean', null_column=None, strip_constant=False): self.nan = nan self.null_column = null_column + self.strip_constant = strip_constant - @staticmethod - def _transform(datetimes): + def _find_divider(self, transformed): + self.divider = 1 + multipliers = [10] * 9 + [60, 60, 24] + for multiplier in multipliers: + candidate = self.divider * multiplier + if np.mod(transformed, candidate).any(): + break + + self.divider = candidate + + def _transform(self, datetimes): """Transform datetime values to integer.""" nulls = datetimes.isnull() integers = np.zeros(len(datetimes)) integers[~nulls] = datetimes[~nulls].astype(int).astype(float).values integers[nulls] = np.nan - return pd.Series(integers) + transformed = pd.Series(integers) + if self.strip_constant: + self._find_divider(transformed) + transformed = transformed.floordiv(self.divider) + + return transformed def fit(self, data): """Fit the transformer to the data. @@ -55,7 +77,7 @@ def fit(self, data): data = pd.Series(data) transformed = self._transform(data) - self.null_transformer = NullTransformer(self.nan, self.null_column) + self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True) self.null_transformer.fit(transformed) def transform(self, data): @@ -89,4 +111,7 @@ def reverse_transform(self, data): data = self.null_transformer.reverse_transform(data) data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(int) + if self.strip_constant: + data = data * self.divider + return pd.to_datetime(data) diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index 887e0e861..f2455a71f 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -111,6 +111,9 @@ def reverse_transform(self, data): data = pd.Series(data) if isnull.any(): + if self.copy: + data = data.copy() + data.iloc[isnull] = np.nan return data diff --git a/setup.py b/setup.py index a08cdfb9a..036078b9b 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', @@ -90,7 +89,7 @@ long_description_content_type='text/markdown', name='rdt', packages=find_packages(include=['rdt', 'rdt.*']), - python_requires='>=3.5,<3.9', + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tests/transformers/test_datetime.py b/tests/transformers/test_datetime.py index 1bce84a37..0befe0faa 100644 --- a/tests/transformers/test_datetime.py +++ b/tests/transformers/test_datetime.py @@ -1,164 +1,41 @@ -from unittest import TestCase -from unittest.mock import Mock - import numpy as np import pandas as pd from rdt.transformers import DatetimeTransformer -class TestDatetimeTransformer(TestCase): - - def test___init__(self): - """Test default instance""" - # Run - transformer = DatetimeTransformer() - - # Asserts - self.assertEqual(transformer.nan, 'mean', "Unexpected nan") - self.assertIsNone(transformer.null_column, "null_column is None by default") - self.assertIsNone(transformer.null_transformer, "null_transformer is None by default") - - def test__transform(self): - """Test transform datetimes series to integer""" - # Setup - data = pd.Series([None, '1996-10-17', '1965-05-23']) - data = pd.to_datetime(data) - - # Run - result = DatetimeTransformer._transform(data) - - # Asserts - expect = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - pd.testing.assert_series_equal(result, expect) - - def test_fit(self): - """Test fit nan custom value with numpy.array""" - # Setup - data = pd.to_datetime([None, '1996-10-17', '1965-05-23']).values - - # Run - transformer = DatetimeTransformer(nan='nan') - transformer.fit(data) - - # Asserts - expect_nan = 'nan' - expect_fill_value = 'nan' - - self.assertEqual( - transformer.nan, - expect_nan, - 'Unexpected nan' - ) - self.assertEqual( - transformer.null_transformer.fill_value, - expect_fill_value, - "Data mean is wrong" - ) - - def test_transform_array(self): - """Test tranform datetime arary""" - # Setup - data = pd.to_datetime([None, '1996-10-17', '1965-05-23']).values +class TestDatetimeTransformer: - data_transform = pd.Series([np.nan, 845510400000000000, -145497600000000000]) + def test_no_strip(self): + dtt = DatetimeTransformer(strip_constant=False) + data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23'])) # Run - transformer = Mock() - transformer._transform.return_value = data_transform - - DatetimeTransformer.transform(transformer, data) + transformed = dtt.fit_transform(data.copy().to_numpy()) + reverted = dtt.reverse_transform(transformed) # Asserts - exp_call_data = pd.Series([None, '1996-10-17', '1965-05-23']) - expect_call_args = pd.to_datetime(exp_call_data) - expect_call_count = 1 - - pd.testing.assert_series_equal( - transformer._transform.call_args[0][0], - expect_call_args - ) - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "NullTransformer.transform must be called only once." - ) - - def test_transform_series(self): - """Test transform datetime series""" - # Setup - data = pd.Series([None, '1996-10-17', '1965-05-23']) - data = pd.to_datetime(data) - - data_transform = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - # Run - transformer = Mock() - transformer._transform.return_value = data_transform - - DatetimeTransformer.transform(transformer, data) - - # Asserts - exp_call_data = pd.Series([None, '1996-10-17', '1965-05-23']) - expect_call_args = pd.to_datetime(exp_call_data) - expect_call_count = 1 - - pd.testing.assert_series_equal( - transformer._transform.call_args[0][0], - expect_call_args - ) - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "NullTransformer.transform must be called only once." - ) - - def test_reverse_transform_nan_not_ignore(self): - """Test reverse_transform with nan not equal to ignore""" - # Setup - data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - reversed_data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - # Run - transformer = Mock() - transformer.nan = 'mean' - transformer.null_transformer.reverse_transform.return_value = reversed_data - - DatetimeTransformer.reverse_transform(transformer, data) - - # Asserts - expect_reverse_call_count = 1 - - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expect_reverse_call_count, - "NullTransformer.reverse_transform must be called when nan is not ignore" - ) + expect_trans = np.array([ + [350006400000000000, 1.0], + [845510400000000000, 0.0], + [-145497600000000000, 0.0] + ]) + np.testing.assert_almost_equal(expect_trans, transformed) + pd.testing.assert_series_equal(reverted, data) - def test_reverse_transform_nan_ignore(self): - """Test reverse_transform with nan equal to ignore""" - # Setup - data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) + def test_strip(self): + dtt = DatetimeTransformer(strip_constant=True) + data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23'])) # Run - transformer = Mock() - transformer.nan = None - - result = DatetimeTransformer.reverse_transform(transformer, data) + transformed = dtt.fit_transform(data.copy().to_numpy()) + reverted = dtt.reverse_transform(transformed) # Asserts - expect = pd.Series([ - np.nan, - pd.to_datetime(845510400000000000), - pd.to_datetime(-145497600000000000) + expect_trans = np.array([ + [4051.0, 1.0], + [9786.0, 0.0], + [-1684.0, 0.0] ]) - expect_reverse_call_count = 0 - - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expect_reverse_call_count, - "NullTransformer.reverse_transform won't be called when nan is ignore" - ) + np.testing.assert_almost_equal(expect_trans, transformed) + pd.testing.assert_series_equal(reverted, data) diff --git a/tox.ini b/tox.ini index c8ace7f65..19779305d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,19 +1,17 @@ [tox] -envlist = py3{5,6,7,8}, test-devel +envlist = py3{6,7,8}, test-devel [travis] python = 3.8: py38, test-devel 3.7: py37 3.6: py36 - 3.5: py35 [gh-actions] python = 3.8: py38, test-devel 3.7: py37, 3.6: py36 - 3.5: py35 [testenv] passenv = CI TRAVIS TRAVIS_* From 66894b45a04295a9bd0f0122b413ddc1980e3e61 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 16 Oct 2020 19:01:45 +0200 Subject: [PATCH 03/28] Add release notes for v0.2.7 --- HISTORY.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 267a8bf17..37c9e8e98 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,15 @@ # History +## 0.2.7 - 2020-10-16 + +In this release we drop the support for the now officially dead Python 3.5 +and introduce a new feature in the DatetimeTransformer which reduces the dimensionality +of the generated numerical values while also ensuring that the reverted datetimes +maintain the same level as time unit precision as the original ones. + +* Drop Py35 support - Issue [#129](https://github.com/sdv-dev/RDT/issues/129) by @csala +* Add option to drop constant parts of the datetimes - Issue [#130](https://github.com/sdv-dev/RDT/issues/130) by @csala + ## 0.2.6 - 2020-10-05 * Add GaussianCopulaTransformer - Issue [#125](https://github.com/sdv-dev/RDT/issues/125) by @csala From a088f4cb23c62a73ed64199bae53ca5267adad4c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 16 Oct 2020 19:02:01 +0200 Subject: [PATCH 04/28] =?UTF-8?q?Bump=20version:=200.2.7.dev0=20=E2=86=92?= =?UTF-8?q?=200.2.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 63b1cd408..85eb49bee 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.7.dev0' +__version__ = '0.2.7' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 2ab123819..79bf00375 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.7.dev0 +current_version = 0.2.7 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 036078b9b..2e9901046 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.7.dev0', + version='0.2.7', zip_safe=False, ) From 0e5cecb54d2280f4860d952ca40f1a2e59a6e079 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 16 Oct 2020 19:02:15 +0200 Subject: [PATCH 05/28] =?UTF-8?q?Bump=20version:=200.2.7=20=E2=86=92=200.2?= =?UTF-8?q?.8.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 85eb49bee..22f1884f2 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.7' +__version__ = '0.2.8.dev0' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 79bf00375..672c5f2ed 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.7 +current_version = 0.2.8.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 2e9901046..315f43a44 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.7', + version='0.2.8.dev0', zip_safe=False, ) From c09a4ca5fe650d8c3ed778b0def17a3c055c13a8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 9 Nov 2020 17:39:27 +0100 Subject: [PATCH 06/28] Update list of Python versions listed in the README (#133) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c97eebdbb..4d86bae9e 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ the transformations in order to revert them as needed. ## Requirements -**RDT** has been developed and tested on [Python 3.5, 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +**RDT** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid From 3ee57fd274abb0f92fa7af79213271f73eaae32e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 10 Nov 2020 13:13:56 +0100 Subject: [PATCH 07/28] Update travis-ci badge after migration to .com (#134) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d86bae9e..a059ce132 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi Shield](https://img.shields.io/pypi/v/RDT.svg)](https://pypi.python.org/pypi/RDT) -[![Travis CI Shield](https://travis-ci.org/sdv-dev/RDT.svg?branch=master)](https://travis-ci.org/sdv-dev/RDT) +[![Travis CI Shield](https://travis-ci.com/sdv-dev/RDT.svg?branch=master)](https://travis-ci.com/sdv-dev/RDT) [![Coverage Status](https://codecov.io/gh/sdv-dev/RDT/branch/master/graph/badge.svg)](https://codecov.io/gh/sdv-dev/RDT) [![Downloads](https://pepy.tech/badge/rdt)](https://pepy.tech/project/rdt) From a69c68f4ace79dcf83a78e8721f9d6ebe3a660d6 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 13 Nov 2020 17:05:05 +0100 Subject: [PATCH 08/28] Update testing setup and readme (#138) --- README.md | 1 + setup.py | 2 +- tox.ini | 10 ++++------ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a059ce132..6beeb8192 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ the transformations in order to revert them as needed. ## Requirements **RDT** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +on GNU/Linux and macOS systems. Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid diff --git a/setup.py b/setup.py index 315f43a44..0c268516d 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - description='Reversible Data Transformsi', + description='Reversible Data Transforms', extras_require={ 'test': tests_require, 'dev': development_requires + tests_require, diff --git a/tox.ini b/tox.ini index 19779305d..abca4a14f 100644 --- a/tox.ini +++ b/tox.ini @@ -15,15 +15,13 @@ python = [testenv] passenv = CI TRAVIS TRAVIS_* -skipsdist = true -skip_install = true -commands_pre = - /usr/bin/env pip install .[test] +skipsdist = false +skip_install = false +extras = test commands = /usr/bin/env make test [testenv:test-devel] -commands_pre = - /usr/bin/env pip install .[dev] +extras = dev commands = /usr/bin/env make test-devel From c2842b6169b9f79709bca7e39312f6173ab5600e Mon Sep 17 00:00:00 2001 From: fealho Date: Tue, 17 Nov 2020 23:45:59 -0300 Subject: [PATCH 09/28] OneHotEncodingTransformer support for lists and lists of lists (#137) * Created _prepare_data * Tests _prepare_data * Improved code. * Fix lint. * Fix documentation. --- rdt/transformers/categorical.py | 33 ++++++++++++++++++-- tests/transformers/test_categorical.py | 42 ++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 8d2f64e93..a2206f4dd 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -223,15 +223,43 @@ class OneHotEncodingTransformer(BaseTransformer): dummy_na = None dummies = None + @staticmethod + def _prepare_data(data): + """Transform data to appropriate format. + + If data is a valid list or a list of lists, transforms it into an np.array, + otherwise returns it. + + Args: + data (pandas.Series, numpy.ndarray, list or list of lists): + Data to prepare. + + Returns: + pandas.Series or numpy.ndarray + """ + if isinstance(data, list): + data = np.array(data) + + if len(data.shape) > 2: + raise ValueError("Unexpected format.") + if len(data.shape) == 2: + if data.shape[1] != 1: + raise ValueError("Unexpected format.") + + data = data[:, 0] + + return data + def fit(self, data): """Fit the transformer to the data. Get the pandas `dummies` which will be used later on for OneHotEncoding. Args: - data (pandas.Series or numpy.ndarray): + data (pandas.Series, numpy.ndarray, list or list of lists): Data to fit the transformer to. """ + data = self._prepare_data(data) self.dummy_na = pd.isnull(data).any() self.dummies = list(pd.get_dummies(data, dummy_na=self.dummy_na).columns) @@ -239,12 +267,13 @@ def transform(self, data): """Replace each category with the OneHot vectors. Args: - data (pandas.Series or numpy.ndarray): + data (pandas.Series, numpy.ndarray, list or list of lists): Data to transform. Returns: numpy.ndarray: """ + data = self._prepare_data(data) dummies = pd.get_dummies(data, dummy_na=self.dummy_na) return dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) diff --git a/tests/transformers/test_categorical.py b/tests/transformers/test_categorical.py index 971c9111f..f4dd144a6 100644 --- a/tests/transformers/test_categorical.py +++ b/tests/transformers/test_categorical.py @@ -283,6 +283,48 @@ def test_reversible_mixed(self): class TestOneHotEncodingTransformer: + def test__prepare_data_empty_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + data = [[], [], []] + + # Assert + with pytest.raises(ValueError): + ohet._prepare_data(data) + + def test__prepare_data_nested_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + data = [[[]]] + + # Assert + with pytest.raises(ValueError): + ohet._prepare_data(data) + + def test__prepare_data_list_of_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = [['a'], ['b'], ['c']] + out = ohet._prepare_data(data) + + # Assert + expected = np.array(['a', 'b', 'c']) + np.testing.assert_array_equal(out, expected) + + def test__prepare_data_pandas_series(self): + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = pd.Series(['a', 'b', 'c']) + out = ohet._prepare_data(data) + + # Assert + expected = pd.Series(['a', 'b', 'c']) + np.testing.assert_array_equal(out, expected) + def test_fit_no_nans(self): # Setup ohet = OneHotEncodingTransformer() From 1c1e7cc4b9d00b41e0ba55d658b9a8d7403f682d Mon Sep 17 00:00:00 2001 From: fealho Date: Thu, 19 Nov 2020 13:10:55 -0300 Subject: [PATCH 10/28] Issue 135 all zeros one hot vectors (#136) * Raise error for non-existing category. * Tests if error properly raised. * Fixed error checking * Fix lint * Added flag for unknown value error * Fix lint * Fix lint * Fix lint --- rdt/transformers/categorical.py | 15 ++++++++++++++- tests/transformers/test_boolean.py | 2 +- tests/transformers/test_categorical.py | 10 ++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index a2206f4dd..e1d634d42 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -218,11 +218,19 @@ class OneHotEncodingTransformer(BaseTransformer): is found and 0s on the rest. Null values are considered just another category. + + Args: + error_on_unknown (bool): + If a value that was not seen during the fit stage is passed to + transform, then an error will be raised if this is True. """ dummy_na = None dummies = None + def __init__(self, error_on_unknown=True): + self.error_on_unknown = error_on_unknown + @staticmethod def _prepare_data(data): """Transform data to appropriate format. @@ -275,7 +283,12 @@ def transform(self, data): """ data = self._prepare_data(data) dummies = pd.get_dummies(data, dummy_na=self.dummy_na) - return dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) + array = dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) + for i, row in enumerate(array): + if np.all(row == 0) and self.error_on_unknown: + raise ValueError(f"The value {data[i]} was not seen during the fit stage.") + + return array def reverse_transform(self, data): """Convert float values back to the original categorical values. diff --git a/tests/transformers/test_boolean.py b/tests/transformers/test_boolean.py index ef49d1f1a..201f3ff9b 100644 --- a/tests/transformers/test_boolean.py +++ b/tests/transformers/test_boolean.py @@ -180,5 +180,5 @@ def test_reverse_transform_not_null_values(self): # Asserts expected = np.array([True, False, True]) - assert type(result) == pd.Series + assert isinstance(result, pd.Series) np.testing.assert_equal(result.values, expected) diff --git a/tests/transformers/test_categorical.py b/tests/transformers/test_categorical.py index f4dd144a6..873033399 100644 --- a/tests/transformers/test_categorical.py +++ b/tests/transformers/test_categorical.py @@ -409,6 +409,16 @@ def test_transform_single(self): ]) np.testing.assert_array_equal(out, expected) + def test_transform_all_zeros(self): + # Setup + ohet = OneHotEncodingTransformer() + data = pd.Series(['a']) + ohet.fit(data) + + # Assert + with np.testing.assert_raises(ValueError): + ohet.transform(['b']) + def test_reverse_transform_no_nans(self): # Setup ohet = OneHotEncodingTransformer() From 01ac5645d0a50ac81a79582132396e76c6c6966a Mon Sep 17 00:00:00 2001 From: fealho Date: Thu, 19 Nov 2020 17:07:13 -0300 Subject: [PATCH 11/28] Switched double quotes to single quotes. (#139) --- rdt/transformers/categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index e1d634d42..9b4328aea 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -249,10 +249,10 @@ def _prepare_data(data): data = np.array(data) if len(data.shape) > 2: - raise ValueError("Unexpected format.") + raise ValueError('Unexpected format.') if len(data.shape) == 2: if data.shape[1] != 1: - raise ValueError("Unexpected format.") + raise ValueError('Unexpected format.') data = data[:, 0] @@ -286,7 +286,7 @@ def transform(self, data): array = dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) for i, row in enumerate(array): if np.all(row == 0) and self.error_on_unknown: - raise ValueError(f"The value {data[i]} was not seen during the fit stage.") + raise ValueError(f'The value {data[i]} was not seen during the fit stage.') return array From 1ab4b08fe0fb60a5f8b86cd2363bd52cf9a8fa2c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 10:32:58 +0100 Subject: [PATCH 12/28] Improve testing setup and test on Windows (#140) * Change test config to use invoke * Add windows testing * Adapt tests * Fix test failures on windows --- .github/workflows/docs.yml | 28 ---- .github/workflows/tests.yml | 60 +++++++- .travis.yml | 14 +- Makefile | 14 +- README.md | 2 +- rdt/hyper_transformer.py | 6 +- rdt/transformers/datetime.py | 4 +- rdt/transformers/numerical.py | 2 +- setup.py | 11 +- tasks.py | 80 ++++++++++ tests/integration/test_hyper_transformer.py | 4 +- .../transformers/test_numerical.py | 140 ++++++++++++------ tests/transformers/test_numerical.py | 137 +---------------- tox.ini | 35 +++-- 14 files changed, 290 insertions(+), 247 deletions(-) delete mode 100644 .github/workflows/docs.yml create mode 100644 tasks.py diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index b15bf4adc..000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Generate Docs - -on: - push: - branches: [ stable ] - -jobs: - - docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - name: Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Build - run: | - python -m pip install --upgrade pip - pip install -e .[dev] - make docs - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{secrets.GITHUB_TOKEN}} - publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 607a8e1e2..579c1b37d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,24 +5,78 @@ on: - pull_request jobs: - build: + lint: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e lint + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] # skip windows bc rundoc fails steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e readme + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install tox tox-gh-actions + - name: Test with tox + run: tox -e pytest + minimum: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions - name: Test with tox - run: tox + run: tox -e minimum diff --git a/.travis.yml b/.travis.yml index ecfa96045..bd6bd1740 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,21 @@ # Config file for automatic testing at travis-ci.org +os: linux dist: bionic language: python python: - - 3.8 - - 3.7 - 3.6 + - 3.7 + - 3.8 +env: + - TOXENV=lint + - TOXENV=readme + - TOXENV=pytest + - TOXENV=minimum # Command to install dependencies -install: pip install -U tox-travis codecov +install: + - pip install -U tox-travis codecov after_success: codecov -# Command to run tests script: tox diff --git a/Makefile b/Makefile index f2cb5c603..779a40589 100644 --- a/Makefile +++ b/Makefile @@ -86,10 +86,7 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint lint: ## check style with flake8 and isort - flake8 rdt - flake8 tests --ignore=D - isort -c --recursive rdt tests - pylint rdt --rcfile=setup.cfg + invoke lint .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort @@ -102,20 +99,15 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test-unit test-unit: ## run tests quickly with the default Python - python -m pytest --cov=rdt + invoke pytest .PHONY: test-readme test-readme: ## run the readme snippets - rm -rf tests/readme_test && mkdir tests/readme_test - cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md - rm -rf tests/readme_test + invoke readme .PHONY: test test: test-unit test-readme ## test everything that needs test dependencies -.PHONY: test-devel -test-devel: lint docs ## test everything that needs development dependencies - .PHONY: test-all test-all: ## test using tox tox -r diff --git a/README.md b/README.md index 6beeb8192..4ff6bf84e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ the transformations in order to revert them as needed. ## Requirements **RDT** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) -on GNU/Linux and macOS systems. +on GNU/Linux, macOS and Windows systems. Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index fd576152d..3348f50f1 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -102,10 +102,8 @@ def _analyze(self, data): # probably category kind = dtype - if kind == 'i': - transformer = NumericalTransformer(dtype=int) - elif kind == 'f': - transformer = NumericalTransformer(dtype=float) + if kind in ('i', 'f'): + transformer = NumericalTransformer(dtype=np.dtype(dtype)) elif kind in ('O', 'category'): anonymize = self.anonymize.get(name) transformer = CategoricalTransformer(anonymize=anonymize) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 162b0067c..a05c3f82a 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -56,7 +56,7 @@ def _transform(self, datetimes): """Transform datetime values to integer.""" nulls = datetimes.isnull() integers = np.zeros(len(datetimes)) - integers[~nulls] = datetimes[~nulls].astype(int).astype(float).values + integers[~nulls] = datetimes[~nulls].astype(np.int64).astype(np.float64).values integers[nulls] = np.nan transformed = pd.Series(integers) @@ -110,7 +110,7 @@ def reverse_transform(self, data): if self.nan is not None: data = self.null_transformer.reverse_transform(data) - data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(int) + data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(np.int64) if self.strip_constant: data = data * self.divider diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index b8a514110..5f9bbc6b0 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -85,7 +85,7 @@ def reverse_transform(self, data): Data to transform. Returns: - pandas.Series + numpy.ndarray """ if self.nan is not None: data = self.null_transformer.reverse_transform(data) diff --git a/setup.py b/setup.py index 0c268516d..3470891a4 100644 --- a/setup.py +++ b/setup.py @@ -12,11 +12,11 @@ history = history_file.read() install_requires = [ - 'numpy>=1.15.4,<2', - 'pandas>=0.21,<2', - 'scipy>=1.1.0,<2', + 'numpy>=1.17.4,<2', + 'pandas>=1.1,<2', + 'scipy>=1.4,<2', 'Faker>=1.0.1,<2', - 'copulas>=0.3.0,<0.4', + 'copulas>=0.3.3,<0.4', ] setup_requires = [ @@ -61,6 +61,9 @@ # Advanced testing 'coverage>=4.5.1,<6', 'tox>=2.9.1,<4', + + # Invoking test commands + 'invoke' ] setup( diff --git a/tasks.py b/tasks.py new file mode 100644 index 000000000..89e73c1e8 --- /dev/null +++ b/tasks.py @@ -0,0 +1,80 @@ +import os +import re +import shutil +import stat +from pathlib import Path + +from invoke import task + + +@task +def pytest(c): + c.run('python -m pytest --cov=rdt') + + +@task +def install_minimum(c): + with open('setup.py', 'r') as setup_py: + lines = setup_py.read().splitlines() + + versions = [] + started = False + for line in lines: + if started: + if line == ']': + break + + line = line.strip() + line = re.sub(r',?<=?[\d.]*,?', '', line) + line = re.sub(r'>=?', '==', line) + line = re.sub(r"""['",]""", '', line) + versions.append(line) + + elif line.startswith('install_requires = ['): + started = True + + c.run(f'python -m pip install {" ".join(versions)}') + + +@task +def minimum(c): + install_minimum(c) + c.run('python -m pip check') + c.run('python -m pytest') + + +@task +def readme(c): + test_path = Path('tests/readme_test') + if test_path.exists() and test_path.is_dir(): + shutil.rmtree(test_path) + + cwd = os.getcwd() + os.makedirs(test_path, exist_ok=True) + shutil.copy('README.md', test_path / 'README.md') + os.chdir(test_path) + c.run('rundoc run --single-session python3 -t python3 README.md') + os.chdir(cwd) + shutil.rmtree(test_path) + + +@task +def lint(c): + c.run('flake8 rdt') + c.run('flake8 tests --ignore=D') + c.run('isort -c --recursive rdt tests') + c.run('pylint rdt --rcfile=setup.cfg') + + +def remove_readonly(func, path, _): + "Clear the readonly bit and reattempt the removal" + os.chmod(path, stat.S_IWRITE) + func(path) + + +@task +def rmdir(c, path): + try: + shutil.rmtree(path, onerror=remove_readonly) + except PermissionError: + pass diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index 3ce56adcf..bd83b447b 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -45,13 +45,13 @@ def get_transformers(): 'integer': { 'class': 'NumericalTransformer', 'kwargs': { - 'dtype': int, + 'dtype': np.int64, } }, 'float': { 'class': 'NumericalTransformer', 'kwargs': { - 'dtype': float, + 'dtype': np.float64, } }, 'categorical': { diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index 07f1caf33..880c1958c 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -1,75 +1,127 @@ import numpy as np -from rdt.transformers.numerical import GaussianCopulaTransformer +from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer -def test_copula_transformer_stats(): - data = np.random.normal(loc=4, scale=4, size=1000) +class TestNumericalTransformer: - ct = GaussianCopulaTransformer() - transformed = ct.fit_transform(data) + def test_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (1000, ) + nt = NumericalTransformer() + transformed = nt.fit_transform(data) - np.testing.assert_almost_equal(transformed.mean(), 0, decimal=1) - np.testing.assert_almost_equal(transformed.std(), 1, decimal=1) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] - reverse = ct.reverse_transform(transformed) + reverse = nt.reverse_transform(transformed) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_not_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) -def test_copula_transformer_null_column(): - data = np.array([1, 2, 1, 2, np.nan, 1]) + nt = NumericalTransformer(null_column=False) + transformed = nt.fit_transform(data) - ct = GaussianCopulaTransformer() - transformed = ct.fit_transform(data) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, ) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, 2) - assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] + reverse = nt.reverse_transform(transformed) - reverse = ct.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_int(self): + data = np.array([1, 2, 1, 2, 1]) + nt = NumericalTransformer(dtype=int) + transformed = nt.fit_transform(data) -def test_copula_transformer_not_null_column(): - data = np.array([1, 2, 1, 2, np.nan, 1]) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (5, ) - ct = GaussianCopulaTransformer(null_column=False) - transformed = ct.fit_transform(data) + reverse = nt.reverse_transform(transformed) + assert list(reverse) == [1, 2, 1, 2, 1] - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, ) + def test_int_nan(self): + data = np.array([1, 2, 1, 2, 1, np.nan]) - reverse = ct.reverse_transform(transformed) + nt = NumericalTransformer(dtype=int) + transformed = nt.fit_transform(data) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + reverse = nt.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) -def test_copula_transformer_int(): - data = np.array([1, 2, 1, 2, 1]) - ct = GaussianCopulaTransformer(dtype=int) - transformed = ct.fit_transform(data) +class TestGaussianCopulaTransformer: - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (5, ) + def test_stats(self): + data = np.random.normal(loc=4, scale=4, size=1000) - reverse = ct.reverse_transform(transformed) - assert list(reverse) == [1, 2, 1, 2, 1] + ct = GaussianCopulaTransformer() + transformed = ct.fit_transform(data) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (1000, ) -def test_copula_transformer_int_nan(): - data = np.array([1, 2, 1, 2, 1, np.nan]) + np.testing.assert_almost_equal(transformed.mean(), 0, decimal=1) + np.testing.assert_almost_equal(transformed.std(), 1, decimal=1) - ct = GaussianCopulaTransformer(dtype=int) - transformed = ct.fit_transform(data) + reverse = ct.reverse_transform(transformed) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, 2) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) - reverse = ct.reverse_transform(transformed) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) + + ct = GaussianCopulaTransformer() + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] + + reverse = ct.reverse_transform(transformed) + + np.testing.assert_array_almost_equal(reverse, data, decimal=2) + + def test_not_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) + + ct = GaussianCopulaTransformer(null_column=False) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, ) + + reverse = ct.reverse_transform(transformed) + + np.testing.assert_array_almost_equal(reverse, data, decimal=2) + + def test_int(self): + data = np.array([1, 2, 1, 2, 1]) + + ct = GaussianCopulaTransformer(dtype=int) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (5, ) + + reverse = ct.reverse_transform(transformed) + assert list(reverse) == [1, 2, 1, 2, 1] + + def test_int_nan(self): + data = np.array([1, 2, 1, 2, 1, np.nan]) + + ct = GaussianCopulaTransformer(dtype=int) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + + reverse = ct.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) diff --git a/tests/transformers/test_numerical.py b/tests/transformers/test_numerical.py index 89161052b..b799be321 100644 --- a/tests/transformers/test_numerical.py +++ b/tests/transformers/test_numerical.py @@ -1,9 +1,7 @@ from unittest import TestCase -from unittest.mock import Mock, patch import copulas import numpy as np -import pandas as pd import pytest from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer @@ -11,15 +9,13 @@ class TestNumericalTransformer(TestCase): - def test___init__(self): - """Test default instance""" - # Run - transformer = NumericalTransformer() + def test___init__super_attrs(self): + """super() arguments are properly passed and set as attributes.""" + nt = NumericalTransformer(dtype='int', nan='mode', null_column=False) - # Asserts - self.assertEqual(transformer.nan, 'mean', "Unexpected nan") - self.assertIsNone(transformer.null_column, "null_column is None by default") - self.assertIsNone(transformer.dtype, "dtype is None by default") + assert nt.dtype == 'int' + assert nt.nan == 'mode' + assert nt.null_column is False def test_fit(self): """Test fit nan mean with numpy.array""" @@ -34,125 +30,8 @@ def test_fit(self): expect_fill_value = 'nan' expect_dtype = np.float - self.assertEqual( - transformer.null_transformer.fill_value, - expect_fill_value, - "Data mean is wrong" - ) - - self.assertEqual( - transformer._dtype, - expect_dtype, - "Expected dtype: float" - ) - - def test_transform_array(self): - """Test transform numpy.array""" - # Setup - data = np.array([1.5, None, 2.5]) - - # Run - transformer = Mock() - NumericalTransformer.transform(transformer, data) - - # Asserts - expect_call_count = 1 - - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "Transform must be called only once" - ) - - def test_transform_series(self): - """Test transform pandas.Series""" - # Setup - data = pd.Series([1.5, None, 2.5]) - - # Run - transformer = Mock() - NumericalTransformer.transform(transformer, data) - - # Asserts - expect_call_count = 1 - - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "Transform must be called only once" - ) - - def test_reverse_transform_nan_ignore(self): - """Test reverse_transform with nan equal to ignore""" - # Setup - data = pd.Series([1.5, None, 2.5]) - - # Run - transformer = Mock() - transformer.nan = None - transformer._dtype = np.float - - result = NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expect = pd.Series([1.5, None, 2.5]) - expected_reverse_transform_call_count = 0 - - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform can't be called when nan is ignore" - ) - - def test_reverse_transform_nan_not_ignore(self): - """Test reverse_transform with nan not equal to ignore""" - # Setup - data = pd.Series([1.5, 2.0, 2.5]) - reversed_data = pd.Series([1.5, 2.0, 2.5]) - - # Run - transformer = Mock() - transformer.nan = 'mean' - transformer._dtype = np.float - transformer.null_transformer.nulls = False - transformer.null_transformer.reverse_transform.return_value = reversed_data - - NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expected_reverse_transform_call_count = 1 - - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform must be called at least once" - ) - - @patch('numpy.round') - def test_reverse_transform_dtype_int(self, numpy_mock): - """Test reverse_transform with dtype equal to int""" - # Setup - numpy_mock.return_value = pd.Series([3, 2, 3]) - data = pd.Series([3.0, 2.0, 3.0]) - - # Run - transformer = Mock() - transformer.nan = None - transformer._dtype = np.int - - result = NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expect = pd.Series([3, 2, 3]) - expected_reverse_transform_call_count = 0 - - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform must be called at least once" - ) + assert transformer.null_transformer.fill_value == expect_fill_value + assert transformer._dtype == expect_dtype class TestGaussianCopulaTransformer: diff --git a/tox.ini b/tox.ini index abca4a14f..83282478a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,27 +1,34 @@ [tox] -envlist = py3{6,7,8}, test-devel +envlist = py3{6,7,8}-{lint,readme,pytest,minimum} [travis] python = - 3.8: py38, test-devel - 3.7: py37 - 3.6: py36 + 3.8: py38-lint, py38-readme, py38-pytest, py38-minimum + 3.7: py37-lint, py37-readme, py37-pytest, py37-minimum + 3.6: py36-lint, py36-readme, py36-pytest, py36-minimum [gh-actions] python = - 3.8: py38, test-devel - 3.7: py37, - 3.6: py36 + 3.8: py38-lint, py38-readme, py38-pytest, py38-minimum + 3.7: py37-lint, py37-readme, py37-pytest, py37-minimum + 3.6: py36-lint, py36-readme, py36-pytest, py36-minimum [testenv] passenv = CI TRAVIS TRAVIS_* skipsdist = false skip_install = false -extras = test +deps = + invoke + readme: rundoc + tutorials: jupyter +extras = + lint: dev + pytest: test + minimum: test + tutorials: ctgan commands = - /usr/bin/env make test - -[testenv:test-devel] -extras = dev -commands = - /usr/bin/env make test-devel + lint: invoke lint + readme: invoke readme + pytest: invoke pytest + minimum: invoke minimum + invoke rmdir {envdir} From fd8620252f54c5c984540fd995b86a33feaebd00 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 12:31:10 +0100 Subject: [PATCH 13/28] Ensure all None input is returned as NaT (#141) * Ensure all None input is returned as NaT * Revert default argument change --- rdt/transformers/datetime.py | 2 +- tests/transformers/test_datetime.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index a05c3f82a..a2f6fa296 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -112,6 +112,6 @@ def reverse_transform(self, data): data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(np.int64) if self.strip_constant: - data = data * self.divider + data = data.astype(float) * self.divider return pd.to_datetime(data) diff --git a/tests/transformers/test_datetime.py b/tests/transformers/test_datetime.py index 0befe0faa..e35a0eb14 100644 --- a/tests/transformers/test_datetime.py +++ b/tests/transformers/test_datetime.py @@ -39,3 +39,13 @@ def test_strip(self): ]) np.testing.assert_almost_equal(expect_trans, transformed) pd.testing.assert_series_equal(reverted, data) + + def test_reverse_transform_all_none(self): + dt = pd.to_datetime(['2020-01-01']) + dtt = DatetimeTransformer(strip_constant=True) + dtt.fit(dt) + + output = dtt.reverse_transform(np.array([None])) + + expected = pd.to_datetime(['NaT']) + pd.testing.assert_series_equal(output.to_series(), expected.to_series()) From 7e79da0fb9f9d3596b53d5ee436e49b3fcfdc31e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 12:32:44 +0100 Subject: [PATCH 14/28] Add release notes for v0.2.8 --- HISTORY.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 37c9e8e98..9921c86ed 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,21 @@ # History +## 0.2.8 - 2020-11-20 + +This release fixes a few minor bugs, including some which prevented RDT from fully working +on Windows systems. + +Thanks to this fixes, as well as a new testing infrastructure that has been set up, from now +on RDT is officially supported on Windows systems, as well as on the Linux and macOS systems +which were previously supported. + +### Issues closed + +* TypeError: unsupported operand type(s) for: 'NoneType' and 'int' - Issue [#132](https://github.com/sdv-dev/RDT/issues/132) by @csala +* Example does not work on Windows - Issue [#114](https://github.com/sdv-dev/RDT/issues/114) by @csala +* OneHotEncodingTransformer producing all zeros - Issue [#135](https://github.com/sdv-dev/RDT/issues/135) by @fealho +* OneHotEncodingTransformer support for lists and lists of lists - Issue [#137](https://github.com/sdv-dev/RDT/issues/137) by @fealho + ## 0.2.7 - 2020-10-16 In this release we drop the support for the now officially dead Python 3.5 From 2168c4cf2c1e34e14d93bbba293197ed5dd682db Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 12:32:55 +0100 Subject: [PATCH 15/28] =?UTF-8?q?Bump=20version:=200.2.8.dev0=20=E2=86=92?= =?UTF-8?q?=200.2.8.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 22f1884f2..c6350d309 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.8.dev0' +__version__ = '0.2.8.dev1' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 672c5f2ed..58d84aec2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.8.dev0 +current_version = 0.2.8.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 3470891a4..5034f8cb5 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.8.dev0', + version='0.2.8.dev1', zip_safe=False, ) From c1152fe496d7e4f8e5e3d0b52448786b47fbbd12 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 21:44:20 +0100 Subject: [PATCH 16/28] =?UTF-8?q?Bump=20version:=200.2.8.dev1=20=E2=86=92?= =?UTF-8?q?=200.2.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index c6350d309..448efa86b 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.8.dev1' +__version__ = '0.2.8' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 58d84aec2..aa81e304a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.8.dev1 +current_version = 0.2.8 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 5034f8cb5..0c9c586b2 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.8.dev1', + version='0.2.8', zip_safe=False, ) From 6fad4d6092d016516aa5faa416499e97f1c137ec Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 20 Nov 2020 21:44:33 +0100 Subject: [PATCH 17/28] =?UTF-8?q?Bump=20version:=200.2.8=20=E2=86=92=200.2?= =?UTF-8?q?.9.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 448efa86b..0bb2b68dc 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.8' +__version__ = '0.2.9.dev0' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index aa81e304a..abcb28b60 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.8 +current_version = 0.2.9.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 0c9c586b2..bf0ea81d5 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.8', + version='0.2.9.dev0', zip_safe=False, ) From a965534fb7777615d11a0102293ba94b8651d63a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 26 Nov 2020 21:47:56 +0100 Subject: [PATCH 18/28] Ensure CategoricalTransformer works on numerical+nans data (#143) --- rdt/transformers/categorical.py | 7 +++++++ .../transformers/test_categorical.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/integration/transformers/test_categorical.py diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 9b4328aea..146ee3747 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -115,6 +115,9 @@ def _get_intervals(data): end = start + prob mean = (start + end) / 2 std = prob / 6 + if pd.isnull(value): + value = np.nan + intervals[value] = (start, end, mean, std) start = end @@ -144,7 +147,11 @@ def fit(self, data): def _get_value(self, category): """Get the value that represents this category.""" + if pd.isnull(category): + category = np.nan + mean, std = self.intervals[category][2:] + if self.fuzzy: return norm.rvs(mean, std) diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py new file mode 100644 index 000000000..461f94d0c --- /dev/null +++ b/tests/integration/transformers/test_categorical.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd + +from rdt.transformers import CategoricalTransformer + + +def test_categorical_numerical_nans(): + """Ensure CategoricalTransformers work on numerical + nan only columns.""" + + data = pd.Series([1, 2, float('nan'), np.nan]) + + ct = CategoricalTransformer() + ct.fit(data) + transformed = ct.transform(data) + reverse = ct.reverse_transform(transformed) + + pd.testing.assert_series_equal(reverse, data) From a6eaa41de3286dcbf2745320ed2a10f77040c006 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Nov 2020 14:03:43 +0100 Subject: [PATCH 19/28] =?UTF-8?q?Bump=20version:=200.2.9.dev0=20=E2=86=92?= =?UTF-8?q?=200.2.9.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 0bb2b68dc..af2029e81 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.9.dev0' +__version__ = '0.2.9.dev1' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index abcb28b60..e3812dffb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.9.dev0 +current_version = 0.2.9.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index bf0ea81d5..c6a8e0511 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.9.dev0', + version='0.2.9.dev1', zip_safe=False, ) From 425997da38abeeab8c3a46738f2bff37aa3aa435 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Nov 2020 20:53:07 +0100 Subject: [PATCH 20/28] Add release notes for v0.2.9 --- HISTORY.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 9921c86ed..9b90d402c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,15 @@ # History +## 0.2.9 - 2020-11-27 + +This release fixes a bug that prevented the `CategoricalTransformer` from working properly +when being passed data that contained numerical data only, without any strings, but also +contained `None` or `NaN` values. + +### Issues closed + +* KeyError: nan - CategoricalTransformer fails on numerical + nan data only - Issue [#142](https://github.com/sdv-dev/RDT/issues/142) by @csala + ## 0.2.8 - 2020-11-20 This release fixes a few minor bugs, including some which prevented RDT from fully working From 50f5145c0bbf5e1c35e520559584438e70828903 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Nov 2020 20:53:16 +0100 Subject: [PATCH 21/28] =?UTF-8?q?Bump=20version:=200.2.9.dev1=20=E2=86=92?= =?UTF-8?q?=200.2.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index af2029e81..dc2801f83 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.9.dev1' +__version__ = '0.2.9' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index e3812dffb..4b0be3aed 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.9.dev1 +current_version = 0.2.9 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index c6a8e0511..75d08d6b3 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.9.dev1', + version='0.2.9', zip_safe=False, ) From aa863a80b08c3b05e6b190fb7a87e5f2cd4aff93 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 27 Nov 2020 20:53:27 +0100 Subject: [PATCH 22/28] =?UTF-8?q?Bump=20version:=200.2.9=20=E2=86=92=200.2?= =?UTF-8?q?.10.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index dc2801f83..f2eae6188 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.9' +__version__ = '0.2.10.dev0' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 4b0be3aed..e6b4e01ad 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.9 +current_version = 0.2.10.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 75d08d6b3..107dd5c2e 100644 --- a/setup.py +++ b/setup.py @@ -97,6 +97,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.9', + version='0.2.10.dev0', zip_safe=False, ) From e2282950c0939f82ef59eeea513285b13cdeca32 Mon Sep 17 00:00:00 2001 From: fealho Date: Wed, 2 Dec 2020 15:01:51 -0300 Subject: [PATCH 23/28] Makes Copulas an optional dependency (#144) * Makes copulas optional. * Move copulas in setup.py * Fix lint * Addresses feedback * Fixed bugs. * Added copulas to test/dev tests --- rdt/transformers/numerical.py | 105 ++++++++++-------- setup.py | 12 +- tasks.py | 6 +- .../transformers/test_numerical.py | 2 +- 4 files changed, 73 insertions(+), 52 deletions(-) diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 5f9bbc6b0..a6ea1c79c 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -1,15 +1,15 @@ """Transformers for numerical data.""" - import copy import numpy as np import pandas as pd import scipy -from copulas import EPSILON, univariate from rdt.transformers.base import BaseTransformer from rdt.transformers.null import NullTransformer +EPSILON = np.finfo(np.float32).eps + class NumericalTransformer(BaseTransformer): """Transformer for numerical data. @@ -161,64 +161,77 @@ class GaussianCopulaTransformer(NumericalTransformer): * ``truncated_gaussian``: Use a Truncated Gaussian distribution. """ - _DISTRIBUTIONS = { - 'univariate': univariate.Univariate, - 'parametric': ( - univariate.Univariate, { - 'parametric': univariate.ParametricType.PARAMETRIC, - }, - ), - 'bounded': ( - univariate.Univariate, - { - 'bounded': univariate.BoundedType.BOUNDED, - }, - ), - 'semi_bounded': ( - univariate.Univariate, - { - 'bounded': univariate.BoundedType.SEMI_BOUNDED, - }, - ), - 'parametric_bounded': ( - univariate.Univariate, - { - 'parametric': univariate.ParametricType.PARAMETRIC, - 'bounded': univariate.BoundedType.BOUNDED, - }, - ), - 'parametric_semi_bounded': ( - univariate.Univariate, - { - 'parametric': univariate.ParametricType.PARAMETRIC, - 'bounded': univariate.BoundedType.SEMI_BOUNDED, - }, - ), - 'gaussian': univariate.GaussianUnivariate, - 'gamma': univariate.GammaUnivariate, - 'beta': univariate.BetaUnivariate, - 'student_t': univariate.StudentTUnivariate, - 'gaussian_kde': univariate.GaussianKDE, - 'truncated_gaussian': univariate.TruncatedGaussian, - } - _univariate = None def __init__(self, dtype=None, nan='mean', null_column=None, distribution='parametric'): super().__init__(dtype=dtype, nan=nan, null_column=null_column) + self._distributions = self._get_distributions() if isinstance(distribution, str): - distribution = self._DISTRIBUTIONS[distribution] + distribution = self._distributions[distribution] self._distribution = distribution + @staticmethod + def _get_distributions(): + try: + from copulas import univariate # pylint: disable=import-outside-toplevel + except ImportError as error: + error.msg += ( + '\n\nIt seems like `copulas` is not installed.\n' + 'Please install it using:\n\n pip install rdt[copulas]' + ) + raise + + return { + 'univariate': univariate.Univariate, + 'parametric': ( + univariate.Univariate, { + 'parametric': univariate.ParametricType.PARAMETRIC, + }, + ), + 'bounded': ( + univariate.Univariate, + { + 'bounded': univariate.BoundedType.BOUNDED, + }, + ), + 'semi_bounded': ( + univariate.Univariate, + { + 'bounded': univariate.BoundedType.SEMI_BOUNDED, + }, + ), + 'parametric_bounded': ( + univariate.Univariate, + { + 'parametric': univariate.ParametricType.PARAMETRIC, + 'bounded': univariate.BoundedType.BOUNDED, + }, + ), + 'parametric_semi_bounded': ( + univariate.Univariate, + { + 'parametric': univariate.ParametricType.PARAMETRIC, + 'bounded': univariate.BoundedType.SEMI_BOUNDED, + }, + ), + 'gaussian': univariate.GaussianUnivariate, + 'gamma': univariate.GammaUnivariate, + 'beta': univariate.BetaUnivariate, + 'student_t': univariate.StudentTUnivariate, + 'gaussian_kde': univariate.GaussianKDE, + 'truncated_gaussian': univariate.TruncatedGaussian, + } + def _get_univariate(self): distribution = self._distribution - if isinstance(distribution, univariate.Univariate): + if isinstance(distribution, self._distributions['univariate']): return copy.deepcopy(distribution) if isinstance(distribution, tuple): return distribution[0](**distribution[1]) - if isinstance(distribution, type) and issubclass(distribution, univariate.Univariate): + if isinstance(distribution, type) and \ + issubclass(distribution, self._distributions['univariate']): return distribution() raise TypeError('Invalid distribution: {}'.format(distribution)) diff --git a/setup.py b/setup.py index 107dd5c2e..123f1b03f 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,11 @@ 'pandas>=1.1,<2', 'scipy>=1.4,<2', 'Faker>=1.0.1,<2', - 'copulas>=0.3.3,<0.4', ] +copulas_requires = [ + 'copulas>=0.3.3,<0.4', +] setup_requires = [ 'pytest-runner>=2.11.1', ] @@ -28,8 +30,11 @@ 'pytest-cov>=2.6.0', 'jupyter>=1.0.0,<2', 'rundoc>=0.4.3,<0.5', + 'copulas>=0.3.3,<0.4', ] + + development_requires = [ # general 'bumpversion>=0.5.3,<0.6', @@ -81,8 +86,9 @@ ], description='Reversible Data Transforms', extras_require={ - 'test': tests_require, - 'dev': development_requires + tests_require, + 'copulas': copulas_requires, + 'test': tests_require + copulas_requires, + 'dev': development_requires + tests_require + copulas_requires, }, include_package_data=True, install_requires=install_requires, diff --git a/tasks.py b/tasks.py index 89e73c1e8..13af582a2 100644 --- a/tasks.py +++ b/tasks.py @@ -22,7 +22,8 @@ def install_minimum(c): for line in lines: if started: if line == ']': - break + started = False + continue line = line.strip() line = re.sub(r',?<=?[\d.]*,?', '', line) @@ -30,7 +31,8 @@ def install_minimum(c): line = re.sub(r"""['",]""", '', line) versions.append(line) - elif line.startswith('install_requires = ['): + elif line.startswith('install_requires = [') or \ + line.startswith('copulas_requires = ['): started = True c.run(f'python -m pip install {" ".join(versions)}') diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index 880c1958c..535a6c317 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -73,7 +73,7 @@ def test_stats(self): reverse = ct.reverse_transform(transformed) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + np.testing.assert_array_almost_equal(reverse, data, decimal=1) def test_null_column(self): data = np.array([1, 2, 1, 2, np.nan, 1]) From b004933d81602fc7a982caad521da250dfe04115 Mon Sep 17 00:00:00 2001 From: fealho Date: Thu, 3 Dec 2020 14:47:24 -0300 Subject: [PATCH 24/28] Added conda support (#146) * Adds conda support. * Fixes readme issue. * Remove copulas dependency --- README.md | 10 ++++++++++ conda/README.md | 29 +++++++++++++++++++++++++++++ conda/meta.yaml | 43 +++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 4 ++++ setup.py | 2 +- 5 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 conda/README.md create mode 100644 conda/meta.yaml diff --git a/README.md b/README.md index 4ff6bf84e..86d13f80e 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,16 @@ This will pull and install the latest stable release from [PyPi](https://pypi.or If you want to install from source or contribute to the project please read the [Contributing Guide](CONTRIBUTING.rst). +## Install with conda + +**RDT** can also be installed using [conda](https://docs.conda.io/en/latest/): + +```bash +conda install -c sdv-dev -c conda-forge rdt +``` + +This will pull and install the latest stable release from [Anaconda](https://anaconda.org/). + # Quickstart diff --git a/conda/README.md b/conda/README.md new file mode 100644 index 000000000..ccb42950f --- /dev/null +++ b/conda/README.md @@ -0,0 +1,29 @@ +## Instructions + +These are instructions to deploy the latest version of **RDT** to [conda](https://docs.conda.io/en/latest/). +It should be done after every new release. + +## Update the recipe +Prior to making the release on PyPI, you should update the meta.yaml to reflect any changes in the dependencies. +Note that you do not need to edit the version number as that is managed by bumpversion. + +## Make the PyPI release +Follow the standard release instructions to make a PyPI release. Then, return here to make the conda release. + +## Build a package +As part of the PyPI release, you will have updated the stable branch. You should now check out the stable +branch and build the conda package. + +```bash +git checkout stable +cd conda +conda build . +``` + +## Upload to Anaconda +Finally, you can upload the resulting package to Anaconda. + +```bash +anaconda login +anaconda upload -u sdv-dev +``` \ No newline at end of file diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 000000000..18053c3e8 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,43 @@ +{% set name = 'rdt' %} +{% set version = '0.2.10.dev0' %} + +package: + name: "{{ name|lower }}" + version: "{{ version }}" + +source: + url: "https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz" + +build: + number: 0 + noarch: python + script: "{{ PYTHON }} -m pip install . --no-dependencies" + +requirements: + host: + - faker >=1.0.1,<4.15.0 + - numpy >=1.17.4,<2 + - pandas >=1.1,<2 + - pip + - python + - scipy >=1.4,<2 + - pytest-runner + run: + - faker >=1.0.1,<4.15.0 + - numpy >=1.17.4,<2 + - pandas >=1.1,<2 + - python + - scipy >=1.4,<2 + +about: + home: "https://github.com/sdv-dev/RDT" + license: MIT + license_family: MIT + license_file: + summary: "Reversible Data Transforms" + doc_url: + dev_url: + +extra: + recipe-maintainers: + - sdv-dev diff --git a/setup.cfg b/setup.cfg index e6b4e01ad..6a2db9883 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,10 @@ replace = version='{new_version}' search = __version__ = '{current_version}' replace = __version__ = '{new_version}' +[bumpversion:file:conda/meta.yaml] +search = version = '{current_version}' +replace = version = '{new_version}' + [bdist_wheel] universal = 1 diff --git a/setup.py b/setup.py index 123f1b03f..db94e861e 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ 'numpy>=1.17.4,<2', 'pandas>=1.1,<2', 'scipy>=1.4,<2', - 'Faker>=1.0.1,<2', + 'Faker>=1.0.1,<4.15', ] copulas_requires = [ From 99b1cb3ab25ad146735f79927d6686157def2b65 Mon Sep 17 00:00:00 2001 From: fealho Date: Mon, 7 Dec 2020 16:28:46 -0300 Subject: [PATCH 25/28] Update conda recipe/instructions (#147) * Updates recipe and installation files * Deletes pytorch/constraints pandas version * Fixes version --- conda/README.md | 2 +- conda/meta.yaml | 6 +++--- setup.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/README.md b/conda/README.md index ccb42950f..60cd558ee 100644 --- a/conda/README.md +++ b/conda/README.md @@ -17,7 +17,7 @@ branch and build the conda package. ```bash git checkout stable cd conda -conda build . +conda build -c sdv-dev -c conda-forge . ``` ## Upload to Anaconda diff --git a/conda/meta.yaml b/conda/meta.yaml index 18053c3e8..a3111242e 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -11,13 +11,13 @@ source: build: number: 0 noarch: python - script: "{{ PYTHON }} -m pip install . --no-dependencies" + script: "{{ PYTHON }} -m pip install ." requirements: host: - faker >=1.0.1,<4.15.0 - numpy >=1.17.4,<2 - - pandas >=1.1,<2 + - pandas >=1.1,<1.1.5 - pip - python - scipy >=1.4,<2 @@ -25,7 +25,7 @@ requirements: run: - faker >=1.0.1,<4.15.0 - numpy >=1.17.4,<2 - - pandas >=1.1,<2 + - pandas >=1.1,<1.1.5 - python - scipy >=1.4,<2 diff --git a/setup.py b/setup.py index db94e861e..62cd1440b 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ install_requires = [ 'numpy>=1.17.4,<2', - 'pandas>=1.1,<2', + 'pandas>=1.1,<1.1.5', 'scipy>=1.4,<2', 'Faker>=1.0.1,<4.15', ] From 73f079bec179bb5d1c3fa9c54a0b59137d9f3406 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 9 Dec 2020 14:39:34 +0100 Subject: [PATCH 26/28] Add dtype_transformers argument to HyperTransformer (#148) * Add dtype_transformers argument to HyperTransformer * Capture original dtype by default * Make the warning a DeprecationWarning --- rdt/hyper_transformer.py | 65 ++++++++++++---- setup.cfg | 1 + tests/test_hyper_transformer.py | 134 ++++++-------------------------- 3 files changed, 73 insertions(+), 127 deletions(-) diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index 3348f50f1..cdec5ab7d 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -1,12 +1,14 @@ """Hyper transformer module.""" import re +import warnings +from copy import deepcopy import numpy as np from rdt.transformers import ( - BooleanTransformer, CategoricalTransformer, DatetimeTransformer, NumericalTransformer, - load_transformers) + BooleanTransformer, CategoricalTransformer, DatetimeTransformer, LabelEncodingTransformer, + NumericalTransformer, OneHotEncodingTransformer, load_transformers) class HyperTransformer: @@ -28,6 +30,10 @@ class HyperTransformer: dtypes (list or None): List of column data types to use when building the ``transformers`` dict automatically. If not passed, the ``DataFrame.dtypes`` are used. + dtype_transformers (dict or None): + Transformer templates to use for each dtype. Passed as a dictionary of + dtype kinds ('i', 'f', 'O', 'b', 'M') and transformer names, classes + or instances. Example: Create a simple ``HyperTransformer`` instance that will decide which transformers @@ -53,12 +59,35 @@ class HyperTransformer: >>> ht = HyperTransformer(transformers) """ - def __init__(self, transformers=None, copy=True, anonymize=None, dtypes=None): + _TRANSFORMER_TEMPLATES = { + 'numerical': NumericalTransformer, + 'integer': NumericalTransformer(dtype=int), + 'float': NumericalTransformer(dtype=float), + 'categorical': CategoricalTransformer, + 'categorical_fuzzy': CategoricalTransformer(fuzzy=True), + 'one_hot_encoding': OneHotEncodingTransformer(error_on_unknown=False), + 'label_encoding': LabelEncodingTransformer, + 'boolean': BooleanTransformer, + 'datetime': DatetimeTransformer, + } + _DTYPE_TRANSFORMERS = { + 'i': 'numerical', + 'f': 'numerical', + 'O': 'categorical', + 'b': 'boolean', + 'M': 'datetime', + } + + def __init__(self, transformers=None, copy=True, anonymize=None, + dtypes=None, dtype_transformers=None): self.transformers = transformers self._transformers = dict() self.copy = copy self.anonymize = anonymize or dict() self.dtypes = dtypes + self.dtype_transformers = self._DTYPE_TRANSFORMERS.copy() + if dtype_transformers: + self.dtype_transformers.update(dtype_transformers) def _analyze(self, data): """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``. @@ -100,20 +129,26 @@ def _analyze(self, data): kind = np.dtype(dtype).kind except TypeError: # probably category - kind = dtype - - if kind in ('i', 'f'): - transformer = NumericalTransformer(dtype=np.dtype(dtype)) - elif kind in ('O', 'category'): - anonymize = self.anonymize.get(name) - transformer = CategoricalTransformer(anonymize=anonymize) - elif kind == 'b': - transformer = BooleanTransformer() - elif kind == 'M': - transformer = DatetimeTransformer() - else: + kind = 'O' + + transformer_template = self.dtype_transformers[kind] + if not transformer_template: raise ValueError('Unsupported dtype: {}'.format(dtype)) + if isinstance(transformer_template, str): + transformer_template = self._TRANSFORMER_TEMPLATES[transformer_template] + + if not isinstance(transformer_template, type): + transformer = deepcopy(transformer_template) + elif self.anonymize and transformer_template == CategoricalTransformer: + warnings.warn( + 'Categorical anonymization is deprecated and will be removed from RDT soon.', + DeprecationWarning + ) + transformer = CategoricalTransformer(anonymize=self.anonymize) + else: + transformer = transformer_template() + transformers[name] = transformer return transformers diff --git a/setup.cfg b/setup.cfg index 6a2db9883..01ed81abf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,6 +56,7 @@ ignore = D107, D407 [pylint] extension-pkg-whitelist = numpy min-similarity-lines = 5 +max-args = 8 ignore-comments = yes ignore-docstrings = yes ignore-imports = yes diff --git a/tests/test_hyper_transformer.py b/tests/test_hyper_transformer.py index cdd9a7c24..5cfd884b7 100644 --- a/tests/test_hyper_transformer.py +++ b/tests/test_hyper_transformer.py @@ -1,5 +1,5 @@ from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import Mock import numpy as np import pandas as pd @@ -7,7 +7,7 @@ from rdt import HyperTransformer from rdt.transformers import ( - BooleanTransformer, CategoricalTransformer, DatetimeTransformer, NumericalTransformer) + BooleanTransformer, DatetimeTransformer, NumericalTransformer, OneHotEncodingTransformer) class TestHyperTransformerTransformer(TestCase): @@ -22,123 +22,33 @@ def test___init__(self): self.assertEqual(ht.anonymize, dict()) self.assertEqual(ht.dtypes, None) - def test__analyze_int(self): - """Test _analyze int dtype""" + def test__analyze(self): + """Test _analyze""" # Setup - data = pd.DataFrame({ - 'integers': [1, 2, 3, 4, 5, None, 6, 7, 8, 9, 0] - }) - - dtypes = [int] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = NumericalTransformer - - self.assertIsInstance(result['integers'], expect_class) - - def test__analyze_float(self): - """Test _analyze float dtype""" - # Setup - data = pd.DataFrame({ - 'floats': [1.1, 2.2, 3.3, 4.4, 5.5, None, 6.6, 7.7, 8.8, 9.9, 0.0] - }) - - dtypes = [float] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = NumericalTransformer - - self.assertIsInstance(result['floats'], expect_class) - - def test__analyze_object(self): - """Test _analyze object dtype""" - # Setup - data = pd.DataFrame({ - 'objects': ['foo', 'bar', None, 'tar'] - }) - - dtypes = [np.object] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = CategoricalTransformer - - self.assertIsInstance(result['objects'], expect_class) - - def test__analyze_bool(self): - """Test _analyze bool dtype""" - # Setup - data = pd.DataFrame({ - 'booleans': [True, False, None, False, True] - }) - - dtypes = [bool] + hp = HyperTransformer(dtype_transformers={'O': 'one_hot_encoding'}) # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = BooleanTransformer - - self.assertIsInstance(result['booleans'], expect_class) - - def test__analyze_datetime64(self): - """Test _analyze datetime64 dtype""" - # Setup data = pd.DataFrame({ - 'datetimes': ['1965-05-23', None, '1997-10-17'] + 'int': [1, 2, None], + 'float': [1.0, 2.0, None], + 'object': ['foo', 'bar', None], + 'category': [1, 2, None], + 'bool': [True, False, None], + 'datetime': pd.to_datetime(['1965-05-23', None, '1997-10-17']), }) - - data['datetimes'] = pd.to_datetime(data['datetimes'], format='%Y-%m-%d', errors='coerce') - - dtypes = [np.datetime64] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) + data['category'] = data['category'].astype('category') + result = hp._analyze(data) # Asserts - expect_class = DatetimeTransformer - - self.assertIsInstance(result['datetimes'], expect_class) - - @patch('rdt.hyper_transformer.np.dtype', new=Mock()) - def test__analyze_raise_error(self): - """Test _analyze raise error""" - # Setup - data = Mock() - data.columns = ['foo'] - - dtypes = [Mock()] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - with self.assertRaises(ValueError): - HyperTransformer._analyze(transformer, data) + assert isinstance(result, dict) + assert set(result.keys()) == {'int', 'float', 'object', 'category', 'bool', 'datetime'} + + assert isinstance(result['int'], NumericalTransformer) + assert isinstance(result['float'], NumericalTransformer) + assert isinstance(result['object'], OneHotEncodingTransformer) + assert isinstance(result['category'], OneHotEncodingTransformer) + assert isinstance(result['bool'], BooleanTransformer) + assert isinstance(result['datetime'], DatetimeTransformer) def test_fit_with_analyze(self): """Test fit and analyze the transformers""" From 8b456f0d0e6f31aa34dbbab09ba12d53cc802b8e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 9 Dec 2020 14:40:00 +0100 Subject: [PATCH 27/28] =?UTF-8?q?Bump=20version:=200.2.10.dev0=20=E2=86=92?= =?UTF-8?q?=200.2.10.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conda/meta.yaml | 2 +- rdt/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index a3111242e..ca3cdfdaf 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = 'rdt' %} -{% set version = '0.2.10.dev0' %} +{% set version = '0.2.10.dev1' %} package: name: "{{ name|lower }}" diff --git a/rdt/__init__.py b/rdt/__init__.py index f2eae6188..9aa0a6073 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.10.dev0' +__version__ = '0.2.10.dev1' import numpy as np import pandas as pd diff --git a/setup.cfg b/setup.cfg index 01ed81abf..f7a5ce8fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.10.dev0 +current_version = 0.2.10.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 62cd1440b..0d2c17173 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.10.dev0', + version='0.2.10.dev1', zip_safe=False, ) From 0ea98501a48285e93dd6dbfa13886d3bc38c3375 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 18 Dec 2020 18:17:39 +0100 Subject: [PATCH 28/28] Add release notes for v0.2.10 --- HISTORY.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 9b90d402c..6745f5d82 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,18 @@ # History +## 0.2.10 - 2020-12-18 + +This release adds a new argument to the `HyperTransformer` which gives control over +which transformers to use by default for each `dtype` if no specific transformer +has been specified for the field. + +This is also the first version to be officially released on conda. + +### Issues closed + +* Add `dtype_transformers` argument to HyperTransformer - Issue [#148](https://github.com/sdv-dev/RDT/issues/148) by @csala +* Makes Copulas an optional dependency - Issue [#144](https://github.com/sdv-dev/RDT/issues/144) by @fealho + ## 0.2.9 - 2020-11-27 This release fixes a bug that prevented the `CategoricalTransformer` from working properly