diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a7ad18f34..81afd6a82 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml index 1d0d9ecc1..e125c8034 100644 --- a/.github/workflows/minimum.yml +++ b/.github/workflows/minimum.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml index b031ebe64..1b9a911bc 100644 --- a/.github/workflows/unit.yml +++ b/.github/workflows/unit.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/HISTORY.md b/HISTORY.md index 19c8109b8..e6253ff6a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,22 @@ # History +## 1.12.1 - 2024-05-09 + +This release handles a pandas warning that was showing up in the `UniformEncoder`. 
+ +### Bugs Fixed + +* Fix pandas FutureWarning in UniformEncoder - Issue [#819](https://github.com/sdv-dev/RDT/issues/819) by @R-Palazzo + +### Maintenance + +* Switch to using ruff for Python linting and code formatting - Issue [#765](https://github.com/sdv-dev/RDT/issues/765) by @gsheni +* Only run unit and integration tests on oldest and latest Python versions for macOS - Issue [#812](https://github.com/sdv-dev/RDT/issues/812) by @R-Palazzo + +### Internal + +* Refactoring code for Enterprise issue #529 - PR [#815](https://github.com/sdv-dev/RDT/pull/815) by @amontanez24 + ## 1.12.0 - 2024-04-19 This release adds a new parameter to the `RegexGenerator` called `generation_order`. This parameter lets users change if they want the generated values for the regex to come out in alphanumeric or scrambled order. Additionally, warnings that were disrupting the progress bar are handled. diff --git a/Makefile b/Makefile index af7ad97ba..e5bd6b7e9 100644 --- a/Makefile +++ b/Makefile @@ -80,14 +80,13 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a # LINT TARGETS .PHONY: lint -lint: ## check style with flake8 and isort +lint: ## Run all code style checks invoke lint .PHONY: fix-lint -fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find rdt tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive rdt tests - isort --apply --atomic rdt tests +fix-lint: ## fix lint issues using ruff + ruff check --fix . + ruff format . 
# TEST TARGETS diff --git a/latest_requirements.txt b/latest_requirements.txt index 6ab9f1631..3995bb04d 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,4 +1,4 @@ -Faker==24.11.0 +Faker==25.0.1 copulas==0.11.0 numpy==1.26.4 pandas==2.2.2 diff --git a/pyproject.toml b/pyproject.toml index 9ddbc16b9..f35009680 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,17 +21,16 @@ license = { text = 'BSL-1.1' } requires-python = '>=3.8,<3.13' readme = 'README.md' dependencies = [ - "numpy>=1.20.0;python_version<'3.10'", + "numpy>=1.21.0;python_version<'3.10'", "numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'", "numpy>=1.26.0;python_version>='3.12'", - "pandas>=1.1.3;python_version<'3.10'", - "pandas>=1.3.4;python_version>='3.10' and python_version<'3.11'", + "pandas>=1.4.0;python_version<'3.11'", "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'", "pandas>=2.1.1;python_version>='3.12'", - "scipy>=1.5.4;python_version<'3.10'", + "scipy>=1.7.3;python_version<'3.10'", "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'", "scipy>=1.12.0;python_version>='3.12'", - "scikit-learn>=0.24;python_version<'3.10'", + "scikit-learn>=1.0.2;python_version<'3.10'", "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'", "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'", "scikit-learn>=1.3.1;python_version>='3.12'", @@ -71,35 +70,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'pycodestyle>=2.7.0,<2.12', - 'pyflakes>=2.3.0,<3.3', - 'flake8>=3.7.7,<8', - 'flake8-absolute-import>=1.0,<2', - 'flake8-builtins>=1.5.3,<3', - 'flake8-comprehensions>=3.6.1,<4', - 'flake8-debugger>=4.0.0,<5', - 'flake8-docstrings>=1.5.0,<2', - 'flake8-eradicate>=1.1.0,<2', - 'flake8-fixme>=1.1.1,<1.2', - 'flake8-mock>=0.3,<1', - 'flake8-multiline-containers>=0.0.18,<0.1', - 'flake8-mutable>=1.2.0,<1.3', - 'flake8-expression-complexity>=0.0.9,<0.1', - 'flake8-print>=4.0.0,<4.1', - 
'flake8-pytest-style>=2.0.0,<3', - 'flake8-quotes>=3.3.0,<4', - 'flake8-sfs>=0.0.3,<2', - 'flake8-variables-names>=0.0.4,<0.1', - 'dlint>=0.11.0,<1', - 'isort>=5.13.2,<6', - 'pandas-vet>=0.2.3,<2024', - 'pep8-naming>=0.12.1,<1', - 'pydocstyle>=6.1.1,<7', - 'pylint>=2.5.3,<4', - - # fix style issues - 'autoflake>=1.1,<3', - 'autopep8>=1.4.3,<3', + 'ruff>=0.3.2,<1', # distribute on PyPI 'twine>=1.10.0,<6', @@ -166,7 +137,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.0" +current_version = "1.12.1.dev2" parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', @@ -201,3 +172,52 @@ replace = "__version__ = '{new_version}'" [build-system] requires = ['setuptools', 'wheel'] build-backend = 'setuptools.build_meta' + +[tool.ruff] +preview = true +line-length = 100 +indent-width = 4 +src = ["rdt"] +target-version = "py38" +exclude = [ + "docs", + ".tox", + ".git", + "__pycache__", + ".ipynb_checkpoints" +] + +[tool.ruff.lint] +select = [ + # Pyflakes + "F", + # Pycodestyle + "E", + "W", + # isort + "I001" +] +ignore = [ + "E501", + "D107", # Missing docstring in __init__ + "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 +] + +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +preview = true +docstring-code-format = true +docstring-code-line-length = "dynamic" + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] + +[tool.ruff.lint.isort] +known-first-party = ["rdt"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] + +[tool.ruff.lint.pydocstyle] +convention = "google" diff --git a/rdt/__init__.py b/rdt/__init__.py index cd03c1c9d..caeb32c94 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -2,10 +2,9 @@ 
"""Top-level package for RDT.""" - __author__ = 'DataCebo, Inc.' __email__ = 'info@sdv.dev' -__version__ = '1.12.0' +__version__ = '1.12.1.dev2' import sys @@ -20,10 +19,7 @@ from rdt import transformers from rdt.hyper_transformer import HyperTransformer -__all__ = [ - 'HyperTransformer', - 'transformers' -] +__all__ = ['HyperTransformer', 'transformers'] RANDOM_SEED = 42 @@ -42,9 +38,10 @@ def get_demo(num_rows=5): pd.DataFrame """ # Hard code first five rows - login_dates = pd.Series([ - '2021-06-26', '2021-02-10', 'NAT', '2020-09-26', '2020-12-22' - ], dtype='datetime64[ns]') + login_dates = pd.Series( + ['2021-06-26', '2021-02-10', 'NAT', '2020-09-26', '2020-12-22'], + dtype='datetime64[ns]', + ) email_optin = pd.Series([False, False, False, True, np.nan], dtype='object') credit_card = ['VISA', 'VISA', 'AMEX', np.nan, 'DISCOVER'] age = [29, 18, 21, 45, 32] @@ -55,7 +52,7 @@ def get_demo(num_rows=5): 'email_optin': email_optin, 'credit_card': credit_card, 'age': age, - 'dollars_spent': dollars_spent + 'dollars_spent': dollars_spent, }) if num_rows <= 5: @@ -67,14 +64,18 @@ def get_demo(num_rows=5): try: num_rows -= 5 - login_dates = np.array([ - np.datetime64('2000-01-01') + np.timedelta64(np.random.randint(0, 10000), 'D') - for _ in range(num_rows) - ], dtype='datetime64[ns]') + login_dates = np.array( + [ + np.datetime64('2000-01-01') + np.timedelta64(np.random.randint(0, 10000), 'D') + for _ in range(num_rows) + ], + dtype='datetime64[ns]', + ) login_dates[np.random.random(size=num_rows) > 0.8] = np.datetime64('NaT') email_optin = pd.Series([True, False, np.nan], dtype='object').sample( - num_rows, replace=True) + num_rows, replace=True + ) credit_card = np.random.choice(['VISA', 'AMEX', np.nan, 'DISCOVER'], size=num_rows) age = np.random.randint(18, 100, size=num_rows) @@ -84,16 +85,19 @@ def get_demo(num_rows=5): finally: np.random.set_state(random_state) - return pd.concat([ - data, - pd.DataFrame({ - 'last_login': login_dates, - 'email_optin': 
email_optin, - 'credit_card': credit_card, - 'age': age, - 'dollars_spent': dollars_spent - }) - ], ignore_index=True) + return pd.concat( + [ + data, + pd.DataFrame({ + 'last_login': login_dates, + 'email_optin': email_optin, + 'credit_card': credit_card, + 'age': age, + 'dollars_spent': dollars_spent, + }), + ], + ignore_index=True, + ) def _get_addon_target(addon_path_name): @@ -159,7 +163,7 @@ def _find_addons(): try: addon = entry_point.load() except Exception: # pylint: disable=broad-exception-caught - msg = f'Failed to load "{entry_point.name}" from "{entry_point.version}".' + msg = f'Failed to load "{entry_point.name}" from "{entry_point.value}".' warnings.warn(msg) continue diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index e048b26bd..652597c53 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -9,11 +9,20 @@ import pandas as pd from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError, - TransformerProcessingError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, + TransformerProcessingError, +) from rdt.transformers import ( - BaseMultiColumnTransformer, BaseTransformer, get_class_by_transformer_name, - get_default_transformer, get_transformers_by_type) + BaseMultiColumnTransformer, + BaseTransformer, + get_class_by_transformer_name, + get_default_transformer, + get_transformers_by_type, +) from rdt.transformers.utils import flatten_column_list LOGGER = logging.getLogger(__name__) @@ -31,7 +40,7 @@ def __repr__(self): config = { 'sdtypes': self['sdtypes'], - 'transformers': {str(k): repr(v) for k, v in self['transformers'].items()} + 'transformers': {str(k): repr(v) for k, v in self['transformers'].items()}, } printed = json.dumps(config, indent=4) @@ -59,11 +68,7 @@ class HyperTransformer: 'b': 'boolean', 'M': 'datetime', } - _DEFAULT_OUTPUT_SDTYPES = [ - 'numerical', - 'float', - 'integer' - ] 
+ _DEFAULT_OUTPUT_SDTYPES = ['numerical', 'float', 'integer'] _REFIT_MESSAGE = ( "For this change to take effect, please refit your data using 'fit' or 'fit_transform'." ) @@ -92,11 +97,7 @@ def _field_in_set(field, field_set): @staticmethod def _subset(input_list, other_list, not_in=False): - return [ - element - for element in input_list - if (element in other_list) ^ not_in - ] + return [element for element in input_list if (element in other_list) ^ not_in] def _create_multi_column_fields(self): multi_column_fields = {} @@ -109,9 +110,11 @@ def _create_multi_column_fields(self): def _validate_field_transformers(self): for field in self.field_transformers: if self._field_in_set(field, self._specified_fields): - raise ValueError(f'Multiple transformers specified for the field {field}. ' - 'Each field can have at most one transformer defined in ' - 'field_transformers.') + raise ValueError( + f'Multiple transformers specified for the field {field}. ' + 'Each field can have at most one transformer defined in ' + 'field_transformers.' + ) self._add_field_to_set(field, self._specified_fields) @@ -150,7 +153,7 @@ def get_config(self): """ return Config({ 'sdtypes': self.field_sdtypes, - 'transformers': self.field_transformers + 'transformers': self.field_transformers, }) @staticmethod @@ -250,7 +253,9 @@ def _validate_config(config): def _validate_update_columns(self, update_columns): unknown_columns = self._subset( - flatten_column_list(update_columns), self.field_sdtypes.keys(), not_in=True + flatten_column_list(update_columns), + self.field_sdtypes.keys(), + not_in=True, ) if unknown_columns: raise InvalidConfigError( @@ -279,7 +284,8 @@ def set_config(self, config): warnings.warn(self._REFIT_MESSAGE) def _validate_update_transformers_by_sdtype( - self, sdtype, transformer, transformer_name, transformer_parameters): + self, sdtype, transformer, transformer_name, transformer_parameters + ): if not self.field_sdtypes: raise ConfigNotSetError( 'Nothing to update. 
Use the `detect_initial_config` method to ' @@ -301,10 +307,14 @@ def _validate_update_transformers_by_sdtype( ) else: - if transformer_name not in get_class_by_transformer_name() or sdtype not in \ - get_class_by_transformer_name()[transformer_name].get_supported_sdtypes(): + if ( + transformer_name not in get_class_by_transformer_name() + or sdtype + not in get_class_by_transformer_name()[transformer_name].get_supported_sdtypes() + ): raise InvalidConfigError( - f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype.") + f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype." + ) if transformer_parameters is not None: transformer = get_class_by_transformer_name()[transformer_name] @@ -325,14 +335,15 @@ def _warn_update_transformers_by_sdtype(self, transformer, transformer_name): warnings.warn( "The 'transformer' parameter will no longer be supported in future versions " "of the RDT. Using the 'transformer_name' parameter instead.", - FutureWarning + FutureWarning, ) else: warnings.warn( "The 'transformer' parameter will no longer be supported in future versions " "of the RDT. 
Please use the 'transformer_name' and 'transformer_parameters' " - 'parameters instead.', FutureWarning + 'parameters instead.', + FutureWarning, ) def _remove_column_in_multi_column_fields(self, column): @@ -349,7 +360,7 @@ def _remove_column_in_multi_column_fields(self, column): new_tuple = tuple(item for item in old_tuple if item != column) if len(new_tuple) == 1: - new_tuple, = new_tuple + (new_tuple,) = new_tuple self._multi_column_fields.pop(new_tuple, None) else: for col in new_tuple: @@ -369,7 +380,7 @@ def _update_multi_column_transformer(self): columns_to_sdtypes = self._get_columns_to_sdtypes(field) try: - transformer._validate_sdtypes( # pylint: disable=protected-access + transformer._validate_sdtypes( # pylint: disable=protected-access columns_to_sdtypes ) except TransformerInputError: @@ -384,7 +395,12 @@ def _update_multi_column_transformer(self): self._multi_column_fields = self._create_multi_column_fields() def update_transformers_by_sdtype( - self, sdtype, transformer=None, transformer_name=None, transformer_parameters=None): + self, + sdtype, + transformer=None, + transformer_name=None, + transformer_parameters=None, + ): """Update the transformers for the specified ``sdtype``. Given an ``sdtype`` and a ``transformer``, change all the fields of the ``sdtype`` @@ -403,15 +419,17 @@ def update_transformers_by_sdtype( A dict of the kwargs of the transformer. 
""" self._validate_update_transformers_by_sdtype( - sdtype, transformer, transformer_name, transformer_parameters) + sdtype, transformer, transformer_name, transformer_parameters + ) self._warn_update_transformers_by_sdtype(transformer, transformer_name) transformer_instance = transformer if transformer_name is not None: if transformer_parameters is not None: - transformer_instance = \ - get_class_by_transformer_name()[transformer_name](**transformer_parameters) + transformer_instance = get_class_by_transformer_name()[transformer_name]( + **transformer_parameters + ) else: transformer_instance = get_class_by_transformer_name()[transformer_name]() @@ -620,7 +638,7 @@ def detect_initial_config(self, data): config = Config({ 'sdtypes': self.field_sdtypes, - 'transformers': self.field_transformers + 'transformers': self.field_transformers, }) LOGGER.info('Config:') @@ -675,7 +693,6 @@ def _fit_field_transformer(self, data, field, transformer): next_transformers = transformer.get_next_transformers() for column_name, next_transformer in next_transformers.items(): - # If the column is part of a multi-column field, and at least one column # isn't present in the data, then it should not fit the next transformer if self._field_in_data(column_name, data): diff --git a/rdt/performance/datasets/__init__.py b/rdt/performance/datasets/__init__.py index ca3470b99..eaae213d9 100644 --- a/rdt/performance/datasets/__init__.py +++ b/rdt/performance/datasets/__init__.py @@ -2,7 +2,14 @@ from collections import defaultdict -from rdt.performance.datasets import boolean, categorical, datetime, numerical, pii, text +from rdt.performance.datasets import ( + boolean, + categorical, + datetime, + numerical, + pii, + text, +) from rdt.performance.datasets.base import BaseDatasetGenerator __all__ = [ diff --git a/rdt/performance/datasets/boolean.py b/rdt/performance/datasets/boolean.py index 27c6eeb4e..40d98cb90 100644 --- a/rdt/performance/datasets/boolean.py +++ 
b/rdt/performance/datasets/boolean.py @@ -36,18 +36,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 2e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -71,18 +65,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } @@ -104,18 +92,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -139,18 +121,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } @@ -167,18 +143,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -201,16 
+171,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/categorical.py b/rdt/performance/datasets/categorical.py index e78c70653..c2ce6928e 100644 --- a/rdt/performance/datasets/categorical.py +++ b/rdt/performance/datasets/categorical.py @@ -28,18 +28,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -55,18 +49,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 5e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -83,18 +71,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 2e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -110,18 +92,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 
'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -136,12 +112,13 @@ def generate(num_rows): """Generate a ``num_rows`` number of rows.""" cat_size = 5 categories = np.hstack([ - cat.astype('O') for cat in [ + cat.astype('O') + for cat in [ RandomGapDatetimeGenerator.generate(cat_size), np.random.randint(0, 100, cat_size), np.random.uniform(0, 100, cat_size), np.arange(cat_size).astype(str), - np.array([True, False]) + np.array([True, False]), ] ]) @@ -151,18 +128,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 2000.0, - } + }, } @@ -189,18 +160,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 2000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 2000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 2000.0, - } + }, } @@ -217,18 +182,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -244,18 +203,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 
'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 500.0, - } + }, } @@ -272,18 +225,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -299,18 +246,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 500.0, - } + }, } @@ -326,18 +267,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.0004, - 'memory': 2000.0 - }, - 'transform': { - 'time': 0.0004, - 'memory': 500000.0 - }, + 'fit': {'time': 0.0004, 'memory': 2000.0}, + 'transform': {'time': 0.0004, 'memory': 500000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -353,18 +288,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.0004, - 'memory': 1000.0 - }, - 'transform': { - 'time': 0.0004, - 'memory': 1000000.0 - }, + 'fit': {'time': 0.0004, 'memory': 1000.0}, + 'transform': {'time': 0.0004, 'memory': 1000000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -380,18 +309,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.002, - 'memory': 2000.0 - }, - 'transform': { - 'time': 
0.0004, - 'memory': 500000.0 - }, + 'fit': {'time': 0.002, 'memory': 2000.0}, + 'transform': {'time': 0.0004, 'memory': 500000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -407,16 +330,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.001, - 'memory': 1000.0 - }, - 'transform': { - 'time': 0.0005, - 'memory': 1000000.0 - }, + 'fit': {'time': 0.001, 'memory': 1000.0}, + 'transform': {'time': 0.0005, 'memory': 1000000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } diff --git a/rdt/performance/datasets/datetime.py b/rdt/performance/datasets/datetime.py index 295ae1184..a6bd738ac 100644 --- a/rdt/performance/datasets/datetime.py +++ b/rdt/performance/datasets/datetime.py @@ -31,18 +31,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -61,18 +55,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -89,18 +77,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + 
}, } @@ -119,18 +101,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -152,18 +128,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -185,16 +155,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/numerical.py b/rdt/performance/datasets/numerical.py index d092b660e..2d5a973ec 100644 --- a/rdt/performance/datasets/numerical.py +++ b/rdt/performance/datasets/numerical.py @@ -27,18 +27,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -54,18 +48,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 
'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } @@ -83,18 +71,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -110,18 +92,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 600.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 600.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } @@ -142,18 +118,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 2000.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 1e-05, 'memory': 2000.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 2000.0, - } + }, } @@ -174,18 +144,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 3e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } @@ -201,18 +165,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 1e-05, - 
'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 1e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -228,18 +186,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 350.0, - } + }, } @@ -255,18 +207,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -282,16 +228,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } diff --git a/rdt/performance/datasets/pii.py b/rdt/performance/datasets/pii.py index 523a06739..95c9cbf95 100644 --- a/rdt/performance/datasets/pii.py +++ b/rdt/performance/datasets/pii.py @@ -27,18 +27,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 1e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 3e-05, 'memory': 1000.0, - } + }, } @@ -54,16 +48,10 @@ def generate(num_rows): def 
get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 3e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/text.py b/rdt/performance/datasets/text.py index 1039ae711..5473d9826 100644 --- a/rdt/performance/datasets/text.py +++ b/rdt/performance/datasets/text.py @@ -27,18 +27,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 1e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } @@ -54,16 +48,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/performance.py b/rdt/performance/performance.py index 989f015fa..d4bdbaf2e 100644 --- a/rdt/performance/performance.py +++ b/rdt/performance/performance.py @@ -11,23 +11,13 @@ TRANSFORMER_ARGS = { 'BinaryEncoder': { 'missing_value_replacement': -1, - 'missing_value_generation': 'from_column' - }, - 'UnixTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'OptimizedTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'FloatFormatter': { - 'missing_value_generation': 'from_column' - }, - 'GaussianNormalizer': { - 'missing_value_generation': 'from_column' - }, - 'ClusterBasedNormalizer': { - 
'missing_value_generation': 'from_column' + 'missing_value_generation': 'from_column', }, + 'UnixTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'OptimizedTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'FloatFormatter': {'missing_value_generation': 'from_column'}, + 'GaussianNormalizer': {'missing_value_generation': 'from_column'}, + 'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'}, } @@ -91,7 +81,7 @@ def evaluate_transformer_performance(transformer, dataset_generator, verbose=Fal performance['Number of fit rows'] = fit_size performance['Number of transform rows'] = transform_size performance['Dataset'] = dataset_generator.__name__ - performance['Transformer'] = f'{transformer.__module__ }.{transformer.get_name()}' + performance['Transformer'] = f'{transformer.__module__}.{transformer.get_name()}' out.append(performance) diff --git a/rdt/performance/profiling.py b/rdt/performance/profiling.py index d3ba9c0c3..9b2598b8e 100644 --- a/rdt/performance/profiling.py +++ b/rdt/performance/profiling.py @@ -47,7 +47,7 @@ def _profile_memory(method, dataset, column=None): peak_memory = ctx.Value('i', 0) profiling_process = ctx.Process( target=_set_memory_for_method, - args=(method, dataset, column, peak_memory) + args=(method, dataset, column, peak_memory), ) profiling_process.start() profiling_process.join() @@ -99,5 +99,5 @@ def profile_transformer(transformer, dataset_generator, transform_size, fit_size 'Transform Time': transform_time, 'Transform Memory': transform_memory, 'Reverse Transform Time': reverse_time, - 'Reverse Transform Memory': reverse_memory + 'Reverse Transform Memory': reverse_memory, }) diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index ac214b1d4..4f8f79b69 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -9,12 +9,28 @@ from rdt.transformers.base import BaseMultiColumnTransformer, BaseTransformer from rdt.transformers.boolean import 
BinaryEncoder from rdt.transformers.categorical import ( - CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, - OrderedUniformEncoder, UniformEncoder) -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder + CustomLabelEncoder, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, + UniformEncoder, +) +from rdt.transformers.datetime import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) from rdt.transformers.null import NullTransformer -from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker +from rdt.transformers.numerical import ( + ClusterBasedNormalizer, + FloatFormatter, + GaussianNormalizer, +) +from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) from rdt.transformers.text import IDGenerator, RegexGenerator __all__ = [ @@ -144,7 +160,7 @@ def get_default_transformers(): """ transformers_by_type = get_transformers_by_type() defaults = deepcopy(DEFAULT_TRANSFORMERS) - for (sdtype, transformers) in transformers_by_type.items(): + for sdtype, transformers in transformers_by_type.items(): if sdtype not in defaults: defaults[sdtype] = transformers[0]() diff --git a/rdt/transformers/_validators.py b/rdt/transformers/_validators.py index 5c5f7fd84..2cc22a4c6 100644 --- a/rdt/transformers/_validators.py +++ b/rdt/transformers/_validators.py @@ -1,4 +1,5 @@ """Validations for multi-column transformers.""" + import importlib from rdt.errors import TransformerInputError @@ -60,8 +61,14 @@ class AddressValidator(BaseValidator): """Validation class for Address data.""" SUPPORTED_SDTYPES = [ - 'country_code', 'administrative_unit', 'city', 'postcode', - 'street_address', 'secondary_address', 'state', 'state_abbr' + 'country_code', + 'administrative_unit', + 'city', + 'postcode', + 
'street_address', + 'secondary_address', + 'state', + 'state_abbr', ] VALIDATION_TYPE = 'Address' @@ -156,16 +163,18 @@ def validate_sdtypes(cls, columns_to_sdtypes): @classmethod def validate_imports(cls): """Check that the GPS transformers can be imported.""" - error_message = ( - 'You must have SDV Enterprise with the gps add-on to use the GPS features.' - ) + error_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features.' try: gps_module = importlib.import_module('rdt.transformers.gps') except ModuleNotFoundError: raise ImportError(error_message) from None - required_classes = ['RandomLocationGenerator', 'MetroAreaAnonymizer', 'GPSNoiser'] + required_classes = [ + 'RandomLocationGenerator', + 'MetroAreaAnonymizer', + 'GPSNoiser', + ] for class_name in required_classes: if not hasattr(gps_module, class_name): raise ImportError(error_message) diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py index 91734ec52..d1520b21a 100644 --- a/rdt/transformers/base.py +++ b/rdt/transformers/base.py @@ -1,4 +1,5 @@ """BaseTransformer module.""" + import abc import contextlib import hashlib @@ -45,6 +46,7 @@ def random_state(function): function (Callable): The function to wrap around. """ + @wraps(function) def wrapper(self, *args, **kwargs): if self.random_states is None: @@ -82,7 +84,7 @@ def __init__(self): self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': None, - 'reverse_transform': None + 'reverse_transform': None, } def set_random_state(self, state, method_name): @@ -106,7 +108,7 @@ def reset_randomization(self): self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': np.random.RandomState(self.random_seed), - 'reverse_transform': np.random.RandomState(self.random_seed + 1) + 'reverse_transform': np.random.RandomState(self.random_seed + 1), } @property @@ -115,7 +117,7 @@ def model_missing_values(self): warnings.warn( "Future versions of RDT will not support the 'model_missing_values' parameter. 
" "Please switch to using the 'missing_value_generation' parameter instead.", - FutureWarning + FutureWarning, ) return self.missing_value_generation == 'from_column' @@ -132,7 +134,8 @@ def _set_model_missing_values(self, model_missing_values): warnings.warn( "Future versions of RDT will not support the 'model_missing_values' parameter. " "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.', FutureWarning + 'strategy.', + FutureWarning, ) if model_missing_values is True: self._set_missing_value_generation('from_column') @@ -143,7 +146,8 @@ def _set_missing_value_replacement(self, default, missing_value_replacement): if missing_value_replacement is None: warnings.warn( "Setting 'missing_value_replacement' to 'None' is no longer supported. " - f"Imputing with the '{default}' instead.", FutureWarning + f"Imputing with the '{default}' instead.", + FutureWarning, ) self.missing_value_replacement = default else: @@ -186,7 +190,7 @@ def get_input_sdtype(cls): """ warnings.warn( '`get_input_sdtype` is deprecated. 
Please use `get_supported_sdtypes` instead.', - FutureWarning + FutureWarning, ) return cls.get_supported_sdtypes()[0] @@ -367,11 +371,11 @@ def _set_seed(self, data): hash_value += str(value) hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) - self.random_seed = hash_value % ((2 ** 32) - 1) # maximum value for a seed + self.random_seed = hash_value % ((2**32) - 1) # maximum value for a seed self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': np.random.RandomState(self.random_seed), - 'reverse_transform': np.random.RandomState(self.random_seed + 1) + 'reverse_transform': np.random.RandomState(self.random_seed + 1), } @random_state diff --git a/rdt/transformers/boolean.py b/rdt/transformers/boolean.py index de2cd5fec..5ae13c5ae 100644 --- a/rdt/transformers/boolean.py +++ b/rdt/transformers/boolean.py @@ -39,8 +39,12 @@ class BinaryEncoder(BaseTransformer): INPUT_SDTYPE = 'boolean' null_transformer = None - def __init__(self, missing_value_replacement='mode', model_missing_values=None, - missing_value_generation='random'): + def __init__( + self, + missing_value_replacement='mode', + model_missing_values=None, + missing_value_generation='random', + ): super().__init__() self._set_missing_value_generation(missing_value_generation) self._set_missing_value_replacement('random', missing_value_replacement) @@ -55,12 +59,14 @@ def _fit(self, data): Data to fit to. """ self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(data) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] = { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform boolean to float. 
diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 2f07f6635..e4c2993d2 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -9,7 +9,11 @@ from rdt.errors import TransformerInputError from rdt.transformers.base import BaseTransformer -from rdt.transformers.utils import check_nan_in_transform, fill_nan_with_none, try_convert_to_dtype +from rdt.transformers.utils import ( + check_nan_in_transform, + fill_nan_with_none, + try_convert_to_dtype, +) LOGGER = logging.getLogger(__name__) @@ -155,7 +159,7 @@ def _transform(self, data): f"that did not appear during 'fit' ({categories_to_print}). Assigning " 'them random values. If you want to model new categories, ' "please fit the data again using 'fit'.", - category=UserWarning + category=UserWarning, ) choices = list(self.frequencies.keys()) @@ -193,7 +197,9 @@ def _reverse_transform(self, data): labels.append(key) result = pd.cut(data, bins=bins, labels=labels, include_lowest=True) - result = result.replace(nan_name, np.nan) + if nan_name in result.cat.categories: + result = result.cat.remove_categories(nan_name) + result = try_convert_to_dtype(result, self.dtype) return result @@ -257,8 +263,8 @@ def _fit(self, data): data = fill_nan_with_none(data) self._check_unknown_categories(data) - category_not_seen = (set(self.order.dropna()) != set(data.dropna())) - nans_not_seen = (pd.isna(self.order).any() and not pd.isna(data).any()) + category_not_seen = set(self.order.dropna()) != set(data.dropna()) + nans_not_seen = pd.isna(self.order).any() and not pd.isna(data).any() if category_not_seen or nans_not_seen: unseen_categories = [x for x in self.order if x not in data.array] categories_to_print = self._get_message_unseen_categories(unseen_categories) @@ -266,7 +272,7 @@ def _fit(self, data): "For column '%s', some of the provided category values were not present in the" ' data during fit: (%s).', self.get_input_column(), - categories_to_print + 
categories_to_print, ) freq = data.value_counts(normalize=True, dropna=False) @@ -333,7 +339,7 @@ def __init__(self, add_noise=False): warnings.warn( "The 'FrequencyEncoder' transformer will no longer be supported in future versions " "of the RDT library. Please use the 'UniformEncoder' transformer instead.", - FutureWarning + FutureWarning, ) super().__init__() self.add_noise = add_noise @@ -367,8 +373,7 @@ def tie_breaker(element): augmented_frequencies[sortable_column_name] = frequencies.index.map(tie_breaker) augmented_frequencies = augmented_frequencies.sort_values( - [column_name, sortable_column_name], - ascending=[False, True] + [column_name, sortable_column_name], ascending=[False, True] ) sorted_frequencies = augmented_frequencies[column_name] @@ -423,7 +428,7 @@ def _clip_noised_transform(result, start, end): def _transform_by_category(self, data): """Transform the data by iterating over the different categories.""" - result = np.empty(shape=(len(data), ), dtype=float) + result = np.empty(shape=(len(data),), dtype=float) # loop over categories for category, values in self.intervals.items(): @@ -435,9 +440,10 @@ def _transform_by_category(self, data): if self.add_noise: result[mask] = norm.rvs( - mean, std, + mean, + std, size=mask.sum(), - random_state=self.random_states['transform'] + random_state=self.random_states['transform'], ) result[mask] = self._clip_noised_transform(result[mask], start, end) else: @@ -495,7 +501,7 @@ def _transform(self, data): def _reverse_transform_by_category(self, data): """Reverse transform the data by iterating over all the categories.""" - result = np.empty(shape=(len(data), ), dtype=self.dtype) + result = np.empty(shape=(len(data),), dtype=self.dtype) # loop over categories for category, values in self.intervals.items(): @@ -762,8 +768,7 @@ def _fit(self, data): unique_data = self._order_categories(unique_data) self.values_to_categories = dict(enumerate(unique_data)) self.categories_to_values = { - category: value - for 
value, category in self.values_to_categories.items() + category: value for value, category in self.values_to_categories.items() } def _transform(self, data): @@ -793,10 +798,7 @@ def _transform(self, data): 'please fit the transformer again with the new data.' ) - mapped[is_null] = np.random.randint( - len(self.categories_to_values), - size=is_null.sum() - ) + mapped[is_null] = np.random.randint(len(self.categories_to_values), size=is_null.sum()) if self.add_noise: mapped = mapped.astype(float) @@ -892,8 +894,7 @@ def _fit(self, data): self.values_to_categories = dict(enumerate(self.order)) self.categories_to_values = { - category: value - for value, category in self.values_to_categories.items() + category: value for value, category in self.values_to_categories.items() } @@ -906,6 +907,7 @@ class CustomLabelEncoder(OrderedLabelEncoder): def __init__(self, order, add_noise=False): warnings.warn( "The 'CustomLabelEncoder' is renamed to 'OrderedLabelEncoder'. Please update the" - 'name to ensure compatibility with future versions of RDT.', FutureWarning + 'name to ensure compatibility with future versions of RDT.', + FutureWarning, ) super().__init__(order, add_noise) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 28ad451f8..d66c06104 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -1,4 +1,5 @@ """Transformer for datetime data.""" + import numpy as np import pandas as pd from pandas.api.types import is_datetime64_dtype, is_numeric_dtype @@ -49,9 +50,14 @@ class UnixTimestampEncoder(BaseTransformer): _min_value = None _max_value = None - def __init__(self, missing_value_replacement='mean', model_missing_values=None, - datetime_format=None, missing_value_generation='random', - enforce_min_max_values=False): + def __init__( + self, + missing_value_replacement='mean', + model_missing_values=None, + datetime_format=None, + missing_value_generation='random', + enforce_min_max_values=False, + ): 
super().__init__() self._set_missing_value_replacement('mean', missing_value_replacement) self._set_missing_value_generation(missing_value_generation) @@ -136,12 +142,14 @@ def _fit(self, data): self._max_value = transformed.max() self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(transformed) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] = { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform datetime values to float values. @@ -229,14 +237,21 @@ class OptimizedTimestampEncoder(UnixTimestampEncoder): divider = None - def __init__(self, missing_value_replacement=None, model_missing_values=None, - datetime_format=None, missing_value_generation='random', - enforce_min_max_values=False): - super().__init__(missing_value_replacement=missing_value_replacement, - missing_value_generation=missing_value_generation, - enforce_min_max_values=enforce_min_max_values, - model_missing_values=model_missing_values, - datetime_format=datetime_format) + def __init__( + self, + missing_value_replacement=None, + model_missing_values=None, + datetime_format=None, + missing_value_generation='random', + enforce_min_max_values=False, + ): + super().__init__( + missing_value_replacement=missing_value_replacement, + missing_value_generation=missing_value_generation, + enforce_min_max_values=enforce_min_max_values, + model_missing_values=model_missing_values, + datetime_format=datetime_format, + ) def _find_divider(self, transformed): self.divider = 1 diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index c10b5295c..ed583a489 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -10,7 +10,7 @@ LOGGER = logging.getLogger(__name__) -class 
NullTransformer(): +class NullTransformer: """Transformer for data that contains Null values. Args: @@ -135,11 +135,9 @@ def transform(self, data): """ isna = data.isna() if self._missing_value_replacement == 'random': - data_mask = list(np.random.uniform( - low=self._min_value, - high=self._max_value, - size=len(data) - )) + data_mask = list( + np.random.uniform(low=self._min_value, high=self._max_value, size=len(data)) + ) data = data.mask(data.isna(), data_mask) elif isna.any() and self._missing_value_replacement is not None: @@ -172,7 +170,7 @@ def reverse_transform(self, data): data = data[:, 0] elif self.nulls: - isna = np.random.random((len(data), )) < self._null_percentage + isna = np.random.random((len(data),)) < self._null_percentage data = pd.Series(data) diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 94be5b7d1..1c6d1ddf1 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -1,4 +1,5 @@ """Transformers for numerical data.""" + import copy import warnings @@ -13,10 +14,10 @@ EPSILON = np.finfo(np.float32).eps INTEGER_BOUNDS = { - 'Int8': (-2**7, 2**7 - 1), - 'Int16': (-2**15, 2**15 - 1), - 'Int32': (-2**31, 2**31 - 1), - 'Int64': (-2**63, 2**63 - 1), + 'Int8': (-(2**7), 2**7 - 1), + 'Int16': (-(2**15), 2**15 - 1), + 'Int32': (-(2**31), 2**31 - 1), + 'Int64': (-(2**63), 2**63 - 1), 'UInt8': (0, 2**8 - 1), 'UInt16': (0, 2**16 - 1), 'UInt32': (0, 2**32 - 1), @@ -73,9 +74,15 @@ class FloatFormatter(BaseTransformer): _min_value = None _max_value = None - def __init__(self, missing_value_replacement='mean', model_missing_values=None, - learn_rounding_scheme=False, enforce_min_max_values=False, - computer_representation='Float', missing_value_generation='random'): + def __init__( + self, + missing_value_replacement='mean', + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + computer_representation='Float', + missing_value_generation='random', + ): 
super().__init__() self.missing_value_replacement = missing_value_replacement self._set_missing_value_generation(missing_value_generation) @@ -108,11 +115,13 @@ def _validate_values_within_bounds(self, data): min_bound, max_bound = INTEGER_BOUNDS[self.computer_representation] if min_value < min_bound: self._raise_out_of_bounds_error( - min_value, data.name, 'minimum', min_bound, max_bound) + min_value, data.name, 'minimum', min_bound, max_bound + ) if max_value > max_bound: self._raise_out_of_bounds_error( - max_value, data.name, 'maximum', min_bound, max_bound) + max_value, data.name, 'maximum', min_bound, max_bound + ) def _fit(self, data): """Fit the transformer to the data. @@ -132,12 +141,14 @@ def _fit(self, data): self._rounding_digits = learn_rounding_digits(data) self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(data) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] = { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform numerical data. 
@@ -246,13 +257,15 @@ class GaussianNormalizer(FloatFormatter): _DEPRECATED_DISTRIBUTIONS_MAPPING = { 'gaussian': 'norm', 'student_t': 't', - 'truncated_gaussian': 'truncnorm' + 'truncated_gaussian': 'truncnorm', } @staticmethod def _get_distributions(): try: - from copulas import univariate # pylint: disable=import-outside-toplevel + from copulas import ( + univariate, # pylint: disable=import-outside-toplevel + ) except ImportError as error: error.msg += ( '\n\nIt seems like `copulas` is not installed.\n' @@ -270,10 +283,14 @@ def _get_distributions(): 'uniform': univariate.UniformUnivariate, } - def __init__(self, model_missing_values=None, learn_rounding_scheme=False, - enforce_min_max_values=False, distribution='truncated_gaussian', - missing_value_generation='random'): - + def __init__( + self, + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + distribution='truncated_gaussian', + missing_value_generation='random', + ): # Using missing_value_replacement='mean' as the default instead of random # as this may lead to different outcomes in certain synthesizers # affecting the synthesizers directly and this is out of scope for now. @@ -282,7 +299,7 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, missing_value_generation=missing_value_generation, missing_value_replacement='mean', learn_rounding_scheme=learn_rounding_scheme, - enforce_min_max_values=enforce_min_max_values + enforce_min_max_values=enforce_min_max_values, ) self._distributions = self._get_distributions() @@ -292,7 +309,7 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, f"Future versions of RDT will not support '{distribution}' as an option. 
" f"Please use '{self._DEPRECATED_DISTRIBUTIONS_MAPPING[distribution]}' " 'instead.', - FutureWarning + FutureWarning, ) distribution = self._DEPRECATED_DISTRIBUTIONS_MAPPING[distribution] @@ -423,10 +440,15 @@ class ClusterBasedNormalizer(FloatFormatter): _bgm_transformer = None valid_component_indicator = None - def __init__(self, model_missing_values=None, learn_rounding_scheme=False, - enforce_min_max_values=False, max_clusters=10, weight_threshold=0.005, - missing_value_generation='random'): - + def __init__( + self, + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + max_clusters=10, + weight_threshold=0.005, + missing_value_generation='random', + ): # Using missing_value_replacement='mean' as the default instead of random # as this may lead to different outcomes in certain synthesizers # affecting the synthesizers directly and this is out of scope for now. @@ -435,7 +457,7 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, missing_value_generation=missing_value_generation, missing_value_replacement='mean', learn_rounding_scheme=learn_rounding_scheme, - enforce_min_max_values=enforce_min_max_values + enforce_min_max_values=enforce_min_max_values, ) self.max_clusters = max_clusters self.weight_threshold = weight_threshold @@ -461,7 +483,7 @@ def _fit(self, data): n_components=self.max_clusters, weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.001, - random_state=self._get_current_random_seed() + random_state=self._get_current_random_seed(), ) super()._fit(data) @@ -492,7 +514,10 @@ def _transform(self, data): data = data.reshape((len(data), 1)) means = self._bgm_transformer.means_.reshape((1, self.max_clusters)) means = means[:, self.valid_component_indicator] - stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self.max_clusters)) + stds = np.sqrt(self._bgm_transformer.covariances_).reshape(( + 1, + self.max_clusters, + )) stds = stds[:, 
self.valid_component_indicator] # Multiply stds by 4 so that a value will be in the range [-1,1] with 99.99% probability @@ -506,12 +531,15 @@ def _transform(self, data): component_prob_t = component_prob_t / component_prob_t.sum() selected_component[i] = np.random.choice( np.arange(self.valid_component_indicator.sum()), - p=component_prob_t + p=component_prob_t, ) aranged = np.arange(len(data)) - normalized = normalized_values[aranged, selected_component].reshape([-1, 1]) - normalized = np.clip(normalized, -.99, .99) + normalized = normalized_values[aranged, selected_component].reshape([ + -1, + 1, + ]) + normalized = np.clip(normalized, -0.99, 0.99) normalized = normalized[:, 0] rows = [normalized, selected_component] if self.null_transformer and self.null_transformer.models_missing_values(): diff --git a/rdt/transformers/pii/__init__.py b/rdt/transformers/pii/__init__.py index c52ada4bc..f2bd3549f 100644 --- a/rdt/transformers/pii/__init__.py +++ b/rdt/transformers/pii/__init__.py @@ -1,6 +1,9 @@ """Personal Identifiable Information Transformers module.""" -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker +from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) __all__ = [ 'AnonymizedFaker', diff --git a/rdt/transformers/pii/anonymization.py b/rdt/transformers/pii/anonymization.py index eb4443d37..f3fb66101 100644 --- a/rdt/transformers/pii/anonymization.py +++ b/rdt/transformers/pii/anonymization.py @@ -10,41 +10,23 @@ from rdt.transformers import AnonymizedFaker SDTYPE_ANONYMIZERS = { - 'address': { - 'provider_name': 'address', - 'function_name': 'address' - }, - 'email': { - 'provider_name': 'internet', - 'function_name': 'email' - }, - 'ipv4_address': { - 'provider_name': 'internet', - 'function_name': 'ipv4' - }, - 'ipv6_address': { - 'provider_name': 'internet', - 'function_name': 'ipv6' - }, + 'address': {'provider_name': 'address', 'function_name': 'address'}, + 'email': 
{'provider_name': 'internet', 'function_name': 'email'}, + 'ipv4_address': {'provider_name': 'internet', 'function_name': 'ipv4'}, + 'ipv6_address': {'provider_name': 'internet', 'function_name': 'ipv6'}, 'mac_address': { 'provider_name': 'internet', - 'function_name': 'mac_address' - }, - 'name': { - 'provider_name': 'person', - 'function_name': 'name' + 'function_name': 'mac_address', }, + 'name': {'provider_name': 'person', 'function_name': 'name'}, 'phone_number': { 'provider_name': 'phone_number', - 'function_name': 'phone_number' - }, - 'ssn': { - 'provider_name': 'ssn', - 'function_name': 'ssn' + 'function_name': 'phone_number', }, + 'ssn': {'provider_name': 'ssn', 'function_name': 'ssn'}, 'user_agent_string': { 'provider_name': 'user_agent', - 'function_name': 'user_agent' + 'function_name': 'user_agent', }, } @@ -102,7 +84,7 @@ def get_anonymized_transformer(function_name, transformer_kwargs=None): provider_name = _detect_provider_name(function_name, locales=locales) transformer_kwargs.update({ 'function_name': function_name, - 'provider_name': provider_name + 'provider_name': provider_name, }) return AnonymizedFaker(**transformer_kwargs) diff --git a/rdt/transformers/pii/anonymizer.py b/rdt/transformers/pii/anonymizer.py index 177bd7182..c4dab9a57 100644 --- a/rdt/transformers/pii/anonymizer.py +++ b/rdt/transformers/pii/anonymizer.py @@ -106,9 +106,16 @@ def _check_locales(self): 'information: https://faker.readthedocs.io/en/master/locales.html' ) - def __init__(self, provider_name=None, function_name=None, function_kwargs=None, - locales=None, cardinality_rule=None, enforce_uniqueness=False, - missing_value_generation='random'): + def __init__( + self, + provider_name=None, + function_name=None, + function_kwargs=None, + locales=None, + cardinality_rule=None, + enforce_uniqueness=False, + missing_value_generation='random', + ): super().__init__() self._data_cardinality = None self.data_length = None @@ -118,7 +125,7 @@ def __init__(self, 
provider_name=None, function_name=None, function_kwargs=None, warnings.warn( "The 'enforce_uniqueness' parameter is no longer supported. " "Please use the 'cardinality_rule' parameter instead.", - FutureWarning + FutureWarning, ) if not self.cardinality_rule: self.cardinality_rule = 'unique' @@ -159,7 +166,11 @@ def get_supported_sdtypes(cls): Accepted input sdtypes of the transformer. """ unsupported_sdtypes = { - 'numerical', 'datetime', 'categorical', 'boolean', None + 'numerical', + 'datetime', + 'categorical', + 'boolean', + None, } all_sdtypes = {cls.INPUT_SDTYPE} for transformer in BaseTransformer.get_subclasses(): @@ -198,7 +209,7 @@ def _set_faker_seed(self, data): hash_value += str(value) hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) - self._faker_random_seed = hash_value % ((2 ** 32) - 1) # maximum value for a seed + self._faker_random_seed = hash_value % ((2**32) - 1) # maximum value for a seed self.faker.seed_instance(self._faker_random_seed) def _fit(self, data): @@ -229,7 +240,10 @@ def _reverse_transform_cardinality_rule_match(self, sample_size): reverse_transformed = np.array([], dtype=object) if self.missing_value_generation == 'random': num_nans = int(self._nan_frequency * sample_size) - reverse_transformed = np.concatenate([reverse_transformed, np.full(num_nans, np.nan)]) + reverse_transformed = np.concatenate([ + reverse_transformed, + np.full(num_nans, np.nan), + ]) else: num_nans = 0 @@ -238,12 +252,19 @@ def _reverse_transform_cardinality_rule_match(self, sample_size): if sample_size < num_nans + self._data_cardinality: unique_categories = self._get_unique_categories(sample_size - num_nans) - reverse_transformed = np.concatenate([reverse_transformed, unique_categories]) + reverse_transformed = np.concatenate([ + reverse_transformed, + unique_categories, + ]) else: unique_categories = self._get_unique_categories(self._data_cardinality) num_copies = sample_size - self._data_cardinality - num_nans copies = 
np.random.choice(unique_categories, num_copies) - reverse_transformed = np.concatenate([reverse_transformed, unique_categories, copies]) + reverse_transformed = np.concatenate([ + reverse_transformed, + unique_categories, + copies, + ]) np.random.shuffle(reverse_transformed) @@ -268,10 +289,10 @@ def _reverse_transform(self, data): if hasattr(self, 'cardinality_rule') and self.cardinality_rule == 'match': reverse_transformed = self._reverse_transform_cardinality_rule_match(sample_size) else: - reverse_transformed = np.array([ - self._function() - for _ in range(sample_size) - ], dtype=object) + reverse_transformed = np.array( + [self._function() for _ in range(sample_size)], + dtype=object, + ) except faker.exceptions.UniquenessException as exception: raise TransformerProcessingError( @@ -334,26 +355,37 @@ class PseudoAnonymizedFaker(AnonymizedFaker): def __getstate__(self): """Return a dictionary representation of the instance and warn the user when pickling.""" - warnings.warn(( - 'You are saving the mapping information, which includes the original data. ' - 'Sharing this object with others will also give them access to the original data ' - 'used with this transformer.' - )) + warnings.warn( + ( + 'You are saving the mapping information, which includes the original data. ' + 'Sharing this object with others will also give them access to the original data ' + 'used with this transformer.' 
+ ) + ) return self.__dict__ - def __init__(self, provider_name=None, function_name=None, function_kwargs=None, locales=None): + def __init__( + self, + provider_name=None, + function_name=None, + function_kwargs=None, + locales=None, + ): super().__init__( provider_name=provider_name, function_name=function_name, function_kwargs=function_kwargs, locales=locales, - cardinality_rule='unique' + cardinality_rule='unique', ) self._mapping_dict = {} self._reverse_mapping_dict = {} self.output_properties = { - None: {'sdtype': 'categorical', 'next_transformer': LabelEncoder(add_noise=True)} + None: { + 'sdtype': 'categorical', + 'next_transformer': LabelEncoder(add_noise=True), + } } def get_mapping(self): diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index c6ef8c9a6..2d02650d8 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -1,4 +1,5 @@ """Transformers for text data.""" + import logging import warnings @@ -116,8 +117,12 @@ def __setstate__(self, state): state['generator'] = generator self.__dict__ = state - def __init__(self, regex_format='[A-Za-z]{5}', enforce_uniqueness=False, - generation_order='alphanumeric'): + def __init__( + self, + regex_format='[A-Za-z]{5}', + enforce_uniqueness=False, + generation_order='alphanumeric', + ): super().__init__() self.output_properties = {None: {'next_transformer': None}} self.enforce_uniqueness = enforce_uniqueness @@ -171,8 +176,10 @@ def _warn_not_enough_unique_values(self, sample_size): LOGGER.info( "The data has %s rows but the regex for '%s' can only create %s unique values." 
" Some values in '%s' may be repeated.", - sample_size, self.get_input_column(), self.generator_size, - self.get_input_column() + sample_size, + self.get_input_column(), + self.generator_size, + self.get_input_column(), ) remaining = self.generator_size - self.generated @@ -204,27 +211,33 @@ def _reverse_transform(self, data): self.reset_randomization() remaining = self.generator_size - if remaining >= sample_size: - reverse_transformed = [next(self.generator) for _ in range(sample_size)] - self.generated += sample_size + generated_values = [] + while len(generated_values) < sample_size: + try: + generated_values.append(next(self.generator)) + self.generated += 1 + except (RuntimeError, StopIteration): + # Can't generate more rows without collision so breaking out of loop + break - else: - generated_values = list(self.generator) - reverse_transformed = generated_values[:] - self.generated = self.generator_size + reverse_transformed = generated_values[:] + + if len(reverse_transformed) < sample_size: if self.enforce_uniqueness: try: remaining_samples = sample_size - len(reverse_transformed) start = int(generated_values[-1]) + 1 - reverse_transformed.extend( - [str(i) for i in range(start, start + remaining_samples)]) + reverse_transformed.extend([ + str(i) for i in range(start, start + remaining_samples) + ]) except ValueError: counter = 0 while len(reverse_transformed) < sample_size: remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend( - [f'{i}({counter})' for i in generated_values[:remaining_samples]]) + reverse_transformed.extend([ + f'{i}({counter})' for i in generated_values[:remaining_samples] + ]) counter += 1 else: diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index 75c21960e..81989fa6c 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -57,18 +57,15 @@ def _max_repeat(options, max_repeat): sizes = [] for repeat in range(min_, max_ + 1): if repeat: - sizes.append(pow(int(size), 
repeat, 2 ** 63 - 1)) + sizes.append(pow(int(size), repeat, 2**63 - 1)) repeat_generators = [ - (_GENERATORS[option](args, max_repeat)[0], option, args) - for _ in range(repeat) + (_GENERATORS[option](args, max_repeat)[0], option, args) for _ in range(repeat) ] generators.append(_from_generators(repeat_generators, max_repeat)) - return ( - value - for generator in generators - for value in generator - ), np.sum(sizes) + int(min_ == 0) + return (value for generator in generators for value in generator), np.sum(sizes) + int( + min_ == 0 + ) def _category_chars(regex): @@ -113,7 +110,7 @@ def _from_generators(generators, max_repeat): value = next(generator) generated.append(value) previous[index] = value - generated.extend(previous[index + 1:]) + generated.extend(previous[index + 1 :]) break except StopIteration: generator = _GENERATORS[option](args, max_repeat)[0] @@ -273,5 +270,8 @@ def learn_rounding_digits(data): return decimal # Can't round, not equal after MAX_DECIMALS digits of precision - LOGGER.info("No rounding scheme detected for column '%s'. Data will not be rounded.", name) + LOGGER.info( + "No rounding scheme detected for column '%s'. 
Data will not be rounded.", + name, + ) return None diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 01ce2bb6c..000000000 --- a/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -[flake8] -max-line-length = 99 -inline-quotes = single -extend-ignore = - D107, - SFS3, - PD005, - # TokenError: unterminated string literal (detected at line 1) - E902 -exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints -per-file-ignores = - tests/contributing.py:T001 - tests/performance/profiling.py:T001 - tests/performance/test_performance.py:T001 - rdt/performance/datasets/datetime.py:A005 - rdt/transformers/datetime.py:A005 - -[aliases] -test = pytest - -[pylint] -extension-pkg-whitelist = numpy -min-similarity-lines = 5 -max-args = 8 -max-attributes = 11 -ignore-comments = yes -ignore-docstrings = yes -ignore-imports = yes -disable = R0801, R0903, R0913, R0914, R1708, C0209, W0223, W0221, W0237, C0411, - W0231 # __init__ method from base class is not called on a child class -ignored-classes = sre_parse - diff --git a/static_code_analysis.txt b/static_code_analysis.txt index 0013959c6..7259f63c1 100644 --- a/static_code_analysis.txt +++ b/static_code_analysis.txt @@ -1,10 +1,10 @@ -Run started:2024-04-11 03:56:15.289402 +Run started:2024-04-16 22:10:36.007657 Test results: No issues identified. 
Code scanned: - Total lines of code: 5503 + Total lines of code: 5515 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 diff --git a/tasks.py b/tasks.py index b0b66ac2b..ae69f8927 100644 --- a/tasks.py +++ b/tasks.py @@ -11,12 +11,11 @@ from packaging.requirements import Requirement from packaging.version import Version - COMPARISONS = { '>=': operator.ge, '>': operator.gt, '<': operator.lt, - '<=': operator.le + '<=': operator.le, } @@ -62,15 +61,23 @@ def _get_minimum_versions(dependencies, python_version): continue # Skip this dependency if the marker does not apply to the current Python version if req.name not in min_versions: - min_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), None) + min_version = next( + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), + None, + ) if min_version: min_versions[req.name] = f'{req.name}=={min_version}' elif '@' not in min_versions[req.name]: existing_version = Version(min_versions[req.name].split('==')[1]) - new_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), existing_version) + new_version = next( + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), + existing_version, + ) if new_version > existing_version: - min_versions[req.name] = f'{req.name}=={new_version}' # Change when a valid newer version is found + min_versions[req.name] = ( + f'{req.name}=={new_version}' # Change when a valid newer version is found + ) return list(min_versions.values()) @@ -85,7 +92,8 @@ def install_minimum(c): minimum_versions = _get_minimum_versions(dependencies, python_version) if minimum_versions: - c.run(f'python -m pip install {" ".join(minimum_versions)}') + install_deps = ' '.join(minimum_versions) + c.run(f'python -m pip install {install_deps}') @task @@ -114,12 +122,8 @@ def readme(c): @task def lint(c): check_dependencies(c) - 
c.run('flake8 rdt') - c.run('pydocstyle rdt') - c.run('flake8 tests --ignore=D') - c.run('pydocstyle tests') - c.run('isort -c rdt tests') - c.run('pylint rdt tests/performance --rcfile=setup.cfg') + c.run('ruff check .') + c.run('ruff format . --check') c.run('pytest tests/code_style.py -v --disable-warnings --no-header') diff --git a/tests/code_style.py b/tests/code_style.py index bb459a314..7924dee88 100644 --- a/tests/code_style.py +++ b/tests/code_style.py @@ -92,11 +92,7 @@ def validate_test_names(transformer): assert test_class is not None, 'The expected test class was not found.' test_functions = inspect.getmembers(test_class, predicate=inspect.isfunction) - test_functions = [ - test - for test, _ in test_functions - if test.startswith('test') - ] + test_functions = [test for test, _ in test_functions if test.startswith('test')] assert test_functions, 'No test functions found within the test module.' @@ -110,8 +106,8 @@ def validate_test_names(transformer): for test in test_functions: count = len(valid_test_functions) for transformer_function in transformer_functions: - simple_test = fr'test_{transformer_function}' - described_test = fr'test_{transformer_function}_' + simple_test = rf'test_{transformer_function}' + described_test = rf'test_{transformer_function}_' if test.startswith(described_test): valid_test_functions.append(test) elif test.startswith(simple_test): diff --git a/tests/contributing.py b/tests/contributing.py index 5a59cb312..c4e7754e5 100644 --- a/tests/contributing.py +++ b/tests/contributing.py @@ -15,9 +15,14 @@ from rdt.performance.datasets import get_dataset_generators_by_type from rdt.transformers import get_transformer_class, get_transformers_by_type from tests.code_style import ( - get_test_location, validate_test_location, validate_test_names, validate_transformer_addon, - validate_transformer_importable_from_parent_module, validate_transformer_module, - validate_transformer_subclass) + get_test_location, + validate_test_location, 
+ validate_test_names, + validate_transformer_addon, + validate_transformer_importable_from_parent_module, + validate_transformer_module, + validate_transformer_subclass, +) from tests.integration.test_transformers import validate_transformer from tests.performance import validate_performance @@ -56,7 +61,7 @@ 'rdt/transformers/', 'tests/unit/transformers/', 'tests/integration/transformers/', - 'tests/datasets/' + 'tests/datasets/', ] @@ -122,8 +127,9 @@ def validate_transformer_integration(transformer): return validation_error is None and error_trace is None -def _validate_third_party_code_style(command, tag, success_message, - error_message, transformer_path): +def _validate_third_party_code_style( + command, tag, success_message, error_message, transformer_path +): run_command = command.split(' ') run_command.append(transformer_path) output_capture = subprocess.run(run_command, capture_output=True).stdout.decode() @@ -156,7 +162,7 @@ def _custom_validation(function, tag, success_message, error_message, transforme 'Check': tag, 'Correct': 'No', 'Details': error_message, - 'output_capture': error + 'output_capture': error, } @@ -167,29 +173,29 @@ def _validate_third_party_checks(transformer_path): 'flake8', 'Code follows PEP8 standards.', 'Code must follow PEP8 standards.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'isort -c', 'isort', 'Imports are properly sorted.', 'Imports are not properly sorted.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'pylint --rcfile=setup.cfg ', 'pylint', 'Code is properly formatted and structured.', 'Code is not properly formatted and structured.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'pydocstyle', 'pydocstyle', 'The docstrings are properly written.', 'The docstrings are not properly written.', - transformer_path - ) + transformer_path, + ), ] return results @@ -202,43 +208,43 @@ def _validate_custom_checks(transformer): 
'Transformer is subclass', 'The transformer is subclass of ``BaseTransformer``.', 'The transformer must be a subclass of ``BaseTransformer``.', - transformer + transformer, ), _custom_validation( validate_transformer_module, 'Valid module', 'The transformer is placed inside a valid module.', 'The transformer is not placed inside a valid module.', - transformer + transformer, ), _custom_validation( validate_test_location, 'Valid test module', 'The transformer tests are placed inside the valid module.', 'The transformer tests are not placed inside the valid module.', - transformer + transformer, ), _custom_validation( validate_test_names, 'Valid test function names', 'The transformer tests are named correctly.', 'The transformer tests are not named properly.', - transformer + transformer, ), _custom_validation( validate_transformer_addon, 'Valid transformer addon', 'The addon is configured properly.', 'The addon is not configured properly.', - transformer + transformer, ), _custom_validation( validate_transformer_importable_from_parent_module, 'Importable from module', 'The transformer can be imported from the parent module.', 'The transformer can not be imported from the parent module.', - transformer - ) + transformer, + ), ] return results @@ -265,7 +271,7 @@ def validate_transformer_code_style(transformer): transformer_path = inspect.getfile(transformer) print(f'Validating source file {transformer_path}') - results = (_validate_third_party_checks(transformer_path)) + results = _validate_third_party_checks(transformer_path) results.extend(_validate_custom_checks(transformer)) errors = [ @@ -393,7 +399,7 @@ def validate_transformer_performance(transformer): total_results = total_results[total_results.transformer == transformer.get_name()] final_results = total_results.groupby('Evaluation Metric').agg({ 'Value': 'mean', - 'Valid': 'any' + 'Valid': 'any', }) final_results = final_results.rename(columns={'Valid': 'Acceptable'}) final_results['Units'] = np.where( @@ 
-402,8 +408,9 @@ def validate_transformer_performance(transformer): 'B / row', ) final_results['Acceptable'] = np.where(final_results['Acceptable'], 'Yes', 'No') - final_results['Compared to Average'] = final_results['Value'].div(average).replace( - np.inf, np.nan) + final_results['Compared to Average'] = ( + final_results['Value'].div(average).replace(np.inf, np.nan) + ) return final_results.reset_index() @@ -432,7 +439,7 @@ def check_clean_repository(): if any([ file_path.match(valid_path), file_path.parent.match(valid_path), - file_path.parent.parent.match(valid_path) + file_path.parent.parent.match(valid_path), ]): validated_paths.append(True) @@ -483,13 +490,13 @@ def validate_pull_request(transformer): 'Code Style', code_style, 'Code Style is acceptable.', - 'Code Style is unacceptable!' + 'Code Style is unacceptable!', ), _build_validation_dict( 'Unit Tests', unit_bool, 'The unit tests are correct and run successfully.', - 'The unit tests did not run successfully or the coverage is not a 100%.' + 'The unit tests did not run successfully or the coverage is not a 100%.', ), _build_validation_dict( 'Integration tests', @@ -501,15 +508,14 @@ def validate_pull_request(transformer): 'Performance Tests', performance_bool, 'The performance of the transformer is acceptable.', - 'The performance of the transformer is unacceptable!' + 'The performance of the transformer is unacceptable!', ), _build_validation_dict( 'Clean Repository', clean_repository, 'There are no unexpected changes in the repository.', - 'There are unexpected changes in the repository!' 
+ 'There are unexpected changes in the repository!', ), - ] results = pd.DataFrame(results) @@ -519,7 +525,7 @@ def validate_pull_request(transformer): unit_bool, integration_tests, performance_bool, - clean_repository + clean_repository, ]) print('\n') diff --git a/tests/datasets/tests/test_boolean.py b/tests/datasets/tests/test_boolean.py index 6362f2f22..ef8af0232 100644 --- a/tests/datasets/tests/test_boolean.py +++ b/tests/datasets/tests/test_boolean.py @@ -6,7 +6,6 @@ class TestRandomBooleanGenerator: - def test_generate(self): """Test the `RandomBooleanGenerator.generate` method. @@ -26,7 +25,6 @@ def test_generate(self): class TestRandomBooleanNaNsGenerator: - def test_generate(self): """Test the `RandomBooleanNaNsGenerator.generate` method. @@ -46,7 +44,6 @@ def test_generate(self): class TestRandomSkewedBooleanGenerator: - def test_generate(self): """Test the `RandomSkewedBooleanGenerator.generate` method. @@ -66,7 +63,6 @@ def test_generate(self): class TestRandomSkewedBooleanNaNsGenerator: - def test_generate(self): """Test the `RandomSkewedBooleanNaNsGenerator.generate` method. @@ -87,7 +83,6 @@ def test_generate(self): class TestConstantBooleanGenerator: - def test_generate(self): """Test the `ConstantBooleanGenerator.generate` method. 
@@ -108,7 +103,6 @@ def test_generate(self): class TestConstantBooleanNaNsGenerator: - def test(self): output = boolean.ConstantBooleanNaNsGenerator.generate(NUM_ROWS) assert len(output) == NUM_ROWS diff --git a/tests/datasets/tests/test_categorical.py b/tests/datasets/tests/test_categorical.py index 649c1f9fd..35e81b7bd 100644 --- a/tests/datasets/tests/test_categorical.py +++ b/tests/datasets/tests/test_categorical.py @@ -5,7 +5,6 @@ class TestRandomIntegerGenerator: - def test(self): output = categorical.RandomIntegerGenerator.generate(10) assert len(output) == 10 @@ -15,7 +14,6 @@ def test(self): class TestRandomIntegerNaNsGenerator: - def test(self): output = categorical.RandomIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -25,7 +23,6 @@ def test(self): class TestRandomStringGenerator: - def test(self): output = categorical.RandomStringGenerator.generate(10) assert len(output) == 10 @@ -35,7 +32,6 @@ def test(self): class TestRandomStringNaNsGenerator: - def test(self): output = categorical.RandomStringNaNsGenerator.generate(10) assert len(output) == 10 @@ -45,7 +41,6 @@ def test(self): class TestRandomMixedGenerator: - def test(self): output = categorical.RandomMixedGenerator.generate(10) assert len(output) == 10 @@ -54,7 +49,6 @@ def test(self): class TestRandomMixedNaNsGenerator: - def test(self): output = categorical.RandomMixedNaNsGenerator.generate(10) assert len(output) == 10 @@ -63,7 +57,6 @@ def test(self): class TestSingleIntegerGenerator: - def test(self): output = categorical.SingleIntegerGenerator.generate(10) assert len(output) == 10 @@ -73,7 +66,6 @@ def test(self): class TestSingleIntegerNaNsGenerator: - def test(self): output = categorical.SingleIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -83,7 +75,6 @@ def test(self): class TestSingleStringGenerator: - def test(self): output = categorical.SingleStringGenerator.generate(10) assert len(output) == 10 @@ -93,7 +84,6 @@ def test(self): class 
TestSingleStringNaNsGenerator: - def test(self): output = categorical.SingleStringNaNsGenerator.generate(10) assert len(output) == 10 @@ -103,7 +93,6 @@ def test(self): class TestUniqueIntegerGenerator: - def test(self): output = categorical.UniqueIntegerGenerator.generate(10) assert len(output) == 10 @@ -113,7 +102,6 @@ def test(self): class TestUniqueIntegerNaNsGenerator: - def test(self): output = categorical.UniqueIntegerNaNsGenerator.generate(10) nulls = np.isnan(output).sum() @@ -125,7 +113,6 @@ def test(self): class TestUniqueStringGenerator: - def test(self): output = categorical.UniqueStringGenerator.generate(10) assert len(output) == 10 @@ -135,7 +122,6 @@ def test(self): class TestUniqueStringNaNsGenerator: - def test(self): output = categorical.UniqueStringNaNsGenerator.generate(10) nulls = sum(pd.isna(output)) diff --git a/tests/datasets/tests/test_datetime.py b/tests/datasets/tests/test_datetime.py index 465511986..9a3d59419 100644 --- a/tests/datasets/tests/test_datetime.py +++ b/tests/datasets/tests/test_datetime.py @@ -7,7 +7,6 @@ class TestRandomGapDatetimeGenerator: - def test(self): output = datetime.RandomGapDatetimeGenerator.generate(10) assert len(output) == 10 @@ -17,7 +16,6 @@ def test(self): class TestRandomGapSecondsDatetimeGenerator: - def test(self): output = datetime.RandomGapSecondsDatetimeGenerator.generate(10) assert len(output) == 10 @@ -27,7 +25,6 @@ def test(self): class TestRandomGapDatetimeNaNsGenerator: - def test(self): output = datetime.RandomGapDatetimeNaNsGenerator.generate(10) assert len(output) == 10 @@ -37,7 +34,6 @@ def test(self): class TestEqualGapHoursDatetimeGenerator: - def test(self): output = datetime.EqualGapHoursDatetimeGenerator.generate(10) assert len(output) == 10 @@ -47,7 +43,6 @@ def test(self): class TestEqualGapDaysDatetimeGenerator: - def test(self): output = datetime.EqualGapDaysDatetimeGenerator.generate(10) assert len(output) == 10 @@ -57,7 +52,6 @@ def test(self): class 
TestEqualGapWeeksDatetimeGenerator: - def test(self): output = datetime.EqualGapWeeksDatetimeGenerator.generate(10) assert len(output) == 10 diff --git a/tests/datasets/tests/test_numerical.py b/tests/datasets/tests/test_numerical.py index 6b687d8f0..7948317e4 100644 --- a/tests/datasets/tests/test_numerical.py +++ b/tests/datasets/tests/test_numerical.py @@ -5,7 +5,6 @@ class TestRandomIntegerGenerator: - def test(self): output = numerical.RandomIntegerGenerator.generate(10) assert len(output) == 10 @@ -15,7 +14,6 @@ def test(self): class TestRandomIntegerNaNsGenerator: - def test(self): output = numerical.RandomIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -25,7 +23,6 @@ def test(self): class TestConstantIntegerGenerator: - def test(self): output = numerical.ConstantIntegerGenerator.generate(10) assert len(output) == 10 @@ -35,7 +32,6 @@ def test(self): class TestConstantIntegerNaNsGenerator: - def test(self): output = numerical.ConstantIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -45,7 +41,6 @@ def test(self): class TestAlmostConstantIntegerGenerator: - def test(self): output = numerical.AlmostConstantIntegerGenerator.generate(10) assert len(output) == 10 @@ -55,7 +50,6 @@ def test(self): class TestAlmostConstantIntegerNaNsGenerator: - def test(self): output = numerical.AlmostConstantIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -65,7 +59,6 @@ def test(self): class TestNormalGenerator: - def test(self): output = numerical.NormalGenerator.generate(10) assert len(output) == 10 @@ -75,7 +68,6 @@ def test(self): class TestNormalNaNsGenerator: - def test(self): output = numerical.NormalNaNsGenerator.generate(10) assert len(output) == 10 @@ -85,7 +77,6 @@ def test(self): class TestBigNormalGenerator: - def test(self): output = numerical.BigNormalGenerator.generate(10) assert len(output) == 10 @@ -95,7 +86,6 @@ def test(self): class TestBigNormalNaNsGenerator: - def test(self): output = 
numerical.BigNormalNaNsGenerator.generate(10) assert len(output) == 10 diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py index e34473071..b0d37eed8 100644 --- a/tests/integration/__init__.py +++ b/tests/integration/__init__.py @@ -1,6 +1,5 @@ """RDT integration testing package.""" - from tests.integration.test_transformers import validate_transformer __all__ = [ diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index 7fee69c4a..a87b48fec 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -8,20 +8,35 @@ from rdt import get_demo from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, +) from rdt.hyper_transformer import Config, HyperTransformer from rdt.transformers import ( - AnonymizedFaker, BaseMultiColumnTransformer, BaseTransformer, BinaryEncoder, - ClusterBasedNormalizer, FloatFormatter, FrequencyEncoder, LabelEncoder, OneHotEncoder, - RegexGenerator, UniformEncoder, UnixTimestampEncoder, get_default_transformer, - get_default_transformers) + AnonymizedFaker, + BaseMultiColumnTransformer, + BaseTransformer, + BinaryEncoder, + ClusterBasedNormalizer, + FloatFormatter, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, + get_default_transformer, + get_default_transformers, +) from rdt.transformers.datetime import OptimizedTimestampEncoder from rdt.transformers.numerical import GaussianNormalizer from rdt.transformers.pii.anonymizer import PseudoAnonymizedFaker class DummyTransformerNumerical(BaseTransformer): - INPUT_SDTYPE = 'categorical' def _fit(self, data): @@ -35,13 +50,15 @@ def _reverse_transform(self, data): class DummyTransformerNotMLReady(BaseTransformer): - INPUT_SDTYPE = 'datetime' def 
__init__(self): super().__init__() self.output_properties = { - None: {'sdtype': 'datetime', 'next_transformer': FrequencyEncoder()} + None: { + 'sdtype': 'datetime', + 'next_transformer': FrequencyEncoder(), + } } def _fit(self, data): @@ -65,7 +82,8 @@ def _fit(self, data): column: { 'sdtype': 'numerical', 'next_transformer': None, - } for column in self.columns + } + for column in self.columns } @classmethod @@ -96,64 +114,79 @@ def get_input_data(): '2010-01-01', '2010-01-01', ]) - data = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], - 'categorical': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, False, False, True, False, False, True, False], - 'datetime': datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + data = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], + 'categorical': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'a'], + 'bool': [False, False, False, True, False, False, True, False], + 'datetime': datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) return data def get_transformed_data(): datetimes = [ - 1.264982e+18, - 1.264982e+18, - 1.262304e+18, - 1.262304e+18, - 1.262304e+18, - 1.264982e+18, - 1.262304e+18, - 1.262304e+18, + 1.264982e18, + 1.264982e18, + 1.262304e18, + 1.262304e18, + 1.262304e18, + 1.264982e18, + 1.262304e18, + 1.262304e18, ] - return pd.DataFrame({ - 'integer': [1., 2., 1., 3., 1., 4., 2., 3.], - 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], - 'categorical': [ - 0.239836, - 0.233842, - 0.654524, - 0.994903, - 0.371298, - 0.659559, - 0.270355, - 0.120638, - ], - 'bool': [ - 0.667087, - 0.238123, - 0.345841, - 0.842023, - 0.478896, - 0.495079, - 0.775272, - 0.675913, - ], - 'datetime': datetimes, - 'names': [ - 0.159704, - 0.684242, - 0.719619, - 0.458355, 
- 0.536445, - 0.991478, - 0.078868, - 0.575187, - ] - }, index=TEST_DATA_INDEX) + return pd.DataFrame( + { + 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], + 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], + 'categorical': [ + 0.239836, + 0.233842, + 0.654524, + 0.994903, + 0.371298, + 0.659559, + 0.270355, + 0.120638, + ], + 'bool': [ + 0.667087, + 0.238123, + 0.345841, + 0.842023, + 0.478896, + 0.495079, + 0.775272, + 0.675913, + ], + 'datetime': datetimes, + 'names': [ + 0.159704, + 0.684242, + 0.719619, + 0.458355, + 0.536445, + 0.991478, + 0.078868, + 0.575187, + ], + }, + index=TEST_DATA_INDEX, + ) def get_reversed_data(): @@ -189,14 +222,35 @@ def test_default_inputs(self): '2010-01-01', '2010-01-01', ]) - data = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, np.nan, 0.3], - 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, np.nan, False, True, False, np.nan, True, False], - 'datetime': datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + data = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, np.nan, 0.3], + 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], + 'bool': [ + False, + np.nan, + False, + True, + False, + np.nan, + True, + False, + ], + 'datetime': datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) # Run ht = HyperTransformer() @@ -206,50 +260,62 @@ def test_default_inputs(self): reverse_transformed = ht.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], - 'float': [0.1, 0.2, 0.1, 0.20000000000000004, 0.1, 0.4, 0.20000000000000004, 0.3], - 'categorical': [ - 0.239836, - 0.233842, - 0.634841, - 0.996602, - 0.371298, - 0.773039, - 0.270355, - 0.120638, - ], - 'bool': [ - 
0.444725, - 0.579374, - 0.230561, - 0.842023, - 0.319264, - 0.665026, - 0.775272, - 0.450609, - ], - 'datetime': [ - 1.2630692571428572e+18, - 1.2649824e+18, - 1.262304e+18, - 1.262304e+18, - 1.262304e+18, - 1.2649824e+18, - 1.262304e+18, - 1.262304e+18 - ], - 'names': [ - 0.159704, - 0.684242, - 0.719619, - 0.458355, - 0.536445, - 0.991478, - 0.078868, - 0.575187, - ] - }, index=TEST_DATA_INDEX) + expected_transformed = pd.DataFrame( + { + 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], + 'float': [ + 0.1, + 0.2, + 0.1, + 0.20000000000000004, + 0.1, + 0.4, + 0.20000000000000004, + 0.3, + ], + 'categorical': [ + 0.239836, + 0.233842, + 0.634841, + 0.996602, + 0.371298, + 0.773039, + 0.270355, + 0.120638, + ], + 'bool': [ + 0.444725, + 0.579374, + 0.230561, + 0.842023, + 0.319264, + 0.665026, + 0.775272, + 0.450609, + ], + 'datetime': [ + 1.2630692571428572e18, + 1.2649824e18, + 1.262304e18, + 1.262304e18, + 1.262304e18, + 1.2649824e18, + 1.262304e18, + 1.262304e18, + ], + 'names': [ + 0.159704, + 0.684242, + 0.719619, + 0.458355, + 0.536445, + 0.991478, + 0.078868, + 0.575187, + ], + }, + index=TEST_DATA_INDEX, + ) pd.testing.assert_frame_equal(transformed, expected_transformed) reversed_datetimes = pd.to_datetime([ @@ -262,23 +328,35 @@ def test_default_inputs(self): '2010-01-01', '2010-01-01', ]) - expected_reversed = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [ - 0.100000, - np.nan, - np.nan, - 0.20000000000000004, - 0.100000, - 0.400000, - np.nan, - 0.300000, - ], - 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, False, False, True, False, False, True, False], - 'datetime': reversed_datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + expected_reversed = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [ + 0.100000, + np.nan, + np.nan, + 0.20000000000000004, + 0.100000, + 0.400000, + np.nan, + 0.300000, + ], + 'categorical': 
['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], + 'bool': [False, False, False, True, False, False, True, False], + 'datetime': reversed_datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) for row in range(reverse_transformed.shape[0]): for column in range(reverse_transformed.shape[1]): expected = expected_reversed.iloc[row, column] @@ -323,7 +401,7 @@ def test_field_transformers(self): 'categorical': 'categorical', 'bool': 'boolean', 'datetime': 'datetime', - 'names': 'categorical' + 'names': 'categorical', }, 'transformers': { 'integer': FloatFormatter(missing_value_replacement='mean'), @@ -331,8 +409,8 @@ def test_field_transformers(self): 'categorical': UniformEncoder(), 'bool': UniformEncoder(), 'datetime': DummyTransformerNotMLReady(), - 'names': UniformEncoder() - } + 'names': UniformEncoder(), + }, } data = get_input_data() @@ -347,7 +425,16 @@ def test_field_transformers(self): # Assert expected_transformed = get_transformed_data() - transformed_datetimes = [0.8125, 0.8125, 0.3125, 0.3125, 0.3125, 0.8125, 0.3125, 0.3125] + transformed_datetimes = [ + 0.8125, + 0.8125, + 0.3125, + 0.3125, + 0.3125, + 0.8125, + 0.3125, + 0.3125, + ] expected_transformed['datetime'] = transformed_datetimes pd.testing.assert_frame_equal(transformed, expected_transformed) @@ -358,15 +445,11 @@ def test_single_category(self): """Test that categorical variables with a single value are supported.""" # Setup ht = HyperTransformer() - data = pd.DataFrame({ - 'a': ['a', 'a', 'a'] - }) + data = pd.DataFrame({'a': ['a', 'a', 'a']}) # Run ht.detect_initial_config(data) - ht.update_transformers(column_name_to_transformer={ - 'a': OneHotEncoder() - }) + ht.update_transformers(column_name_to_transformer={'a': OneHotEncoder()}) ht.fit(data) transformed = ht.transform(data) reverse = ht.reverse_transform(transformed) @@ -387,7 +470,7 @@ def test_categorical_encoders_with_booleans(self): 'email_confirmed': 
FrequencyEncoder(), 'subscribed': OneHotEncoder(), 'paid': LabelEncoder(), - } + }, } ht = HyperTransformer() @@ -456,8 +539,10 @@ def test_multiple_fits_different_data(self): reverse2 = ht.reverse_transform(transformed2) # Assert - expected_transformed = pd.DataFrame( - {'col2': [1., 2., 3.], 'col1': [1.0, 0.0, 0.0]}) + expected_transformed = pd.DataFrame({ + 'col2': [1.0, 2.0, 3.0], + 'col1': [1.0, 0.0, 0.0], + }) pd.testing.assert_frame_equal(transformed1, expected_transformed) pd.testing.assert_frame_equal(transformed2, expected_transformed) pd.testing.assert_frame_equal(reverse1, new_data) @@ -484,8 +569,10 @@ def test_multiple_fits_different_columns(self): reverse2 = ht.reverse_transform(transformed2) # Assert - expected_transformed = pd.DataFrame( - {'col3': [1., 2., 3.], 'col4': [1.0, 0.0, 0.0]}) + expected_transformed = pd.DataFrame({ + 'col3': [1.0, 2.0, 3.0], + 'col4': [1.0, 0.0, 0.0], + }) pd.testing.assert_frame_equal(transformed1, expected_transformed) pd.testing.assert_frame_equal(transformed2, expected_transformed) pd.testing.assert_frame_equal(reverse1, new_data) @@ -502,10 +589,12 @@ def test_multiple_fits_with_set_config(self): # Run ht.detect_initial_config(data) - ht.set_config(config={ - 'sdtypes': {'integer': 'categorical'}, - 'transformers': {'integer': FrequencyEncoder()} - }) + ht.set_config( + config={ + 'sdtypes': {'integer': 'categorical'}, + 'transformers': {'integer': FrequencyEncoder()}, + } + ) ht.fit(data) transformed1 = ht.transform(data) reverse1 = ht.reverse_transform(transformed1) @@ -533,10 +622,12 @@ def test_multiple_detect_configs_with_set_config(self): transformed1 = ht.transform(data) reverse1 = ht.reverse_transform(transformed1) - ht.set_config(config={ - 'sdtypes': {'integers': 'categorical'}, - 'transformers': {'integers': FrequencyEncoder()} - }) + ht.set_config( + config={ + 'sdtypes': {'integers': 'categorical'}, + 'transformers': {'integers': FrequencyEncoder()}, + } + ) ht.detect_initial_config(data) ht.fit(data) 
@@ -743,7 +834,7 @@ def test_transform_subset(self): transformed = ht.transform_subset(subset) # Assert - expected = pd.DataFrame({'col1': [1., 2.]}) + expected = pd.DataFrame({'col1': [1.0, 2.0]}) pd.testing.assert_frame_equal(transformed, expected) def test_reverse_transform_subset(self): @@ -797,14 +888,13 @@ def test_with_multiple_supported_sdtypes(self): data = pd.DataFrame({ 'user': ['John', 'Doe', 'John Doe', 'Doe John'], 'id': list(range(4)), - 'subscribed': [True, False, True, False] + 'subscribed': [True, False, True, False], }) ht = HyperTransformer() ht.detect_initial_config(data) ht.update_transformers_by_sdtype( - sdtype='boolean', - transformer=FrequencyEncoder(add_noise=True) + sdtype='boolean', transformer=FrequencyEncoder(add_noise=True) ) # Run @@ -842,14 +932,11 @@ def test_reverse_transform_subset_and_generators(self): ht.detect_initial_config(customers) # credit_card and id are pii and text columns - ht.update_sdtypes({ - 'credit_card': 'pii', - 'id': 'text' - }) + ht.update_sdtypes({'credit_card': 'pii', 'id': 'text'}) ht.update_transformers({ 'credit_card': AnonymizedFaker(), - 'id': RegexGenerator(regex_format='id_[a-z]') + 'id': RegexGenerator(regex_format='id_[a-z]'), }) # Run @@ -862,7 +949,7 @@ def test_reverse_transform_subset_and_generators(self): 'last_login', 'email_optin', 'age', - 'dollars_spent' + 'dollars_spent', ] assert all(expected_transformed_columns == transformed.columns) assert reverse_transformed.columns == ['last_login'] @@ -874,9 +961,7 @@ def test_set_config_with_supported_sdtypes(self): 'transformers': { 'boolean_col': FrequencyEncoder(add_noise=True), }, - 'sdtypes': { - 'boolean_col': 'boolean' - } + 'sdtypes': {'boolean_col': 'boolean'}, } ht = HyperTransformer() @@ -889,12 +974,12 @@ def test_chained_transformers(self): When the specified transformer indicates a next transformer, they should each be applied in order during the transform step, and then reversed during the reverse_transform. 
""" + # Setup class DoublingTransformer(BaseTransformer): INPUT_SDTYPE = 'numerical' - def _fit(self, data): - ... + def _fit(self, data): ... def _transform(self, data): return data * 2 @@ -909,17 +994,17 @@ def _reverse_transform(self, data): transformer1.output_properties[None]['next_transformer'] = transformer2 ht = HyperTransformer() - data = pd.DataFrame({'col': [1., 2, -1, 3, 1]}) + data = pd.DataFrame({'col': [1.0, 2, -1, 3, 1]}) # Run and Assert ht.set_config({ 'sdtypes': {'col': 'numerical'}, - 'transformers': {'col': transformer1} + 'transformers': {'col': transformer1}, }) ht.fit(data) transformed = ht.transform(data) - expected_transform = pd.DataFrame({'col': [8., 16, -8, 24, 8]}) + expected_transform = pd.DataFrame({'col': [8.0, 16, -8, 24, 8]}) pd.testing.assert_frame_equal(transformed, expected_transform) reverse_transformed = ht.reverse_transform(transformed) @@ -931,6 +1016,7 @@ def test_chained_transformers_various_transformers(self): When the specified transformer indicates a next transformer, they should each be applied in order during the transform step, and then reversed during the reverse_transform. 
""" + # Setup class AB(BaseTransformer): INPUT_SDTYPE = 'categorical' @@ -959,7 +1045,7 @@ class CD(BaseTransformer): def _fit(self, data): self.output_properties = { 'c': {'sdtype': 'categorical', 'next_transformer': None}, - 'd': {'sdtype': 'categorical', 'next_transformer': E()} + 'd': {'sdtype': 'categorical', 'next_transformer': E()}, } def _transform(self, data): @@ -979,7 +1065,7 @@ class E(BaseTransformer): def _fit(self, data): self.output_properties = { None: {'sdtype': 'categorical', 'next_transformer': None}, - 'e': {'sdtype': 'categorical', 'next_transformer': None} + 'e': {'sdtype': 'categorical', 'next_transformer': None}, } def _transform(self, data): @@ -996,13 +1082,17 @@ def _reverse_transform(self, data): data = pd.DataFrame({ 'col': ['a', 'b', 'c'], 'col.a': ['1', '2', '3'], - 'col#': ['_', '_', '_'] + 'col#': ['_', '_', '_'], }) # Run and Assert ht.set_config({ - 'sdtypes': {'col': 'categorical', 'col.a': 'categorical', 'col#': 'categorical'}, - 'transformers': {'col': AB(), 'col.a': AB(), 'col#': E()} + 'sdtypes': { + 'col': 'categorical', + 'col.a': 'categorical', + 'col#': 'categorical', + }, + 'transformers': {'col': AB(), 'col.a': AB(), 'col#': E()}, }) ht.fit(data) transformed = ht.transform(data) @@ -1045,7 +1135,10 @@ def test_field_transformers_correctly_set(self): # if a transformer was set, it should use the provided instance fe = FrequencyEncoder() - ht.set_config({'sdtypes': {'col': 'categorical'}, 'transformers': {'col': fe}}) + ht.set_config({ + 'sdtypes': {'col': 'categorical'}, + 'transformers': {'col': fe}, + }) ht.fit(data) transformer = ht.get_config()['transformers']['col'] assert transformer is fe @@ -1076,17 +1169,17 @@ def _get_hyper_transformer_with_random_transformers(self, data): ht.update_sdtypes({ 'credit_card': 'pii', 'name': 'text', - 'signup_day': 'datetime' + 'signup_day': 'datetime', }) ht.update_transformers({ 'credit_card': AnonymizedFaker('credit_card', 'credit_card_number'), 'balance': 
ClusterBasedNormalizer(max_clusters=3), - 'name': RegexGenerator() + 'name': RegexGenerator(), }) ht.update_transformers_by_sdtype( 'categorical', transformer_name='FrequencyEncoder', - transformer_parameters={'add_noise': True} + transformer_parameters={'add_noise': True}, ) return ht @@ -1103,12 +1196,24 @@ def test_reset_randomization(self): """ # Setup data = pd.DataFrame({ - 'credit_card': ['123456789', '987654321', '192837645', '918273465', '123789456'], + 'credit_card': [ + '123456789', + '987654321', + '192837645', + '918273465', + '123789456', + ], 'age': [18, 25, 54, 60, 31], 'name': ['Bob', 'Jane', 'Jack', 'Jill', 'Joe'], - 'signup_day': ['1/1/2020', np.nan, '4/1/2019', '12/1/2008', '5/16/2016'], + 'signup_day': [ + '1/1/2020', + np.nan, + '4/1/2019', + '12/1/2008', + '5/16/2016', + ], 'balance': [250, 5400, 150000, np.nan, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) ht1 = self._get_hyper_transformer_with_random_transformers(data) ht2 = self._get_hyper_transformer_with_random_transformers(data) @@ -1117,14 +1222,18 @@ def test_reset_randomization(self): expected_first_transformed = pd.DataFrame({ 'age': [18.0, 25.0, 54.0, 60.0, 31.0], 'signup_day': [ - 1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18 + 1.5778368e18, + 1.45584e18, + 1.5540768e18, + 1.2280896e18, + 1.4633568e18, ], 'balance.normalized': [ -2.693016e-01, -2.467182e-01, 3.873711e-01, 9.571797e-17, - 1.286486e-01 + 1.286486e-01, ], 'balance.component': [0.0, 0, 0, 0, 0], 'card_type': [ @@ -1133,19 +1242,23 @@ def test_reset_randomization(self): 0.639794, 0.862760, 0.263703, - ] + ], }) expected_second_transformed = pd.DataFrame({ 'age': [18.0, 25.0, 54.0, 60.0, 31.0], 'signup_day': [ - 1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18 + 1.5778368e18, + 1.45584e18, + 1.5540768e18, + 1.2280896e18, + 1.4633568e18, ], 'balance.normalized': [ -2.693016e-01, 
-2.467182e-01, 3.873711e-01, 9.571797e-17, - 1.286486e-01 + 1.286486e-01, ], 'balance.component': [0.0, 0, 0, 0, 0], 'card_type': [ @@ -1154,7 +1267,7 @@ def test_reset_randomization(self): 0.714735, 0.939781, 0.251442, - ] + ], }) ht1.fit(data) @@ -1178,9 +1291,15 @@ def test_reset_randomization(self): ], 'age': [18, 25, 54, 60, 31], 'name': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'], - 'signup_day': ['01/01/2020', '02/19/2016', '04/01/2019', np.nan, np.nan], + 'signup_day': [ + '01/01/2020', + '02/19/2016', + '04/01/2019', + np.nan, + np.nan, + ], 'balance': [250, 5400, 150000, 61662.5, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) expected_second_reverse = pd.DataFrame({ 'credit_card': [ @@ -1192,9 +1311,15 @@ def test_reset_randomization(self): ], 'age': [18, 25, 54, 60, 31], 'name': ['AAAAF', 'AAAAG', 'AAAAH', 'AAAAI', 'AAAAJ'], - 'signup_day': ['01/01/2020', np.nan, '04/01/2019', '12/01/2008', np.nan], + 'signup_day': [ + '01/01/2020', + np.nan, + '04/01/2019', + '12/01/2008', + np.nan, + ], 'balance': [np.nan, 5400, np.nan, 61662.5, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) first_reverse1 = ht1.reverse_transform(first_transformed1) first_reverse2 = ht2.reverse_transform(first_transformed1) @@ -1220,9 +1345,7 @@ def test_cluster_based_normalizer_randomization(self): data = get_demo(100) ht = HyperTransformer() ht.detect_initial_config(data) - ht.update_transformers({ - 'age': ClusterBasedNormalizer() - }) + ht.update_transformers({'age': ClusterBasedNormalizer()}) ht.fit(data) transformed1 = ht.transform(data) transformed2 = ht.transform(data) @@ -1231,9 +1354,7 @@ def test_cluster_based_normalizer_randomization(self): ht2 = HyperTransformer() ht2.detect_initial_config(data) - ht2.update_transformers({ - 'age': ClusterBasedNormalizer() - }) + 
ht2.update_transformers({'age': ClusterBasedNormalizer()}) ht2.fit(data) pd.testing.assert_frame_equal(transformed1, ht2.transform(data)) @@ -1252,13 +1373,10 @@ def test_anonymized_faker(self): # Run - simple run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'id2': AnonymizedFaker() + 'id2': AnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) @@ -1295,13 +1413,10 @@ def test_anonymized_faker_text(self): # Run - simple run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'info': 'text' - }) + ht.update_sdtypes({'id1': 'pii', 'info': 'text'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'info': AnonymizedFaker() + 'info': AnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) @@ -1321,13 +1436,10 @@ def test_pseudo_anonymized_faker(self): # Run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': PseudoAnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) @@ -1339,13 +1451,10 @@ def test_pseudo_anonymized_faker(self): # Run - run it again on the exact same data ht = HyperTransformer() ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': PseudoAnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) @@ -1369,13 +1478,10 @@ def test_anonymized_faker_different_tables(self): # Run on data1 ht.detect_initial_config(data1) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 
'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data1) transformed = ht.transform(data1) @@ -1383,13 +1489,10 @@ def test_anonymized_faker_different_tables(self): # Run on data2 ht.detect_initial_config(data2) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data2) transformed = ht.transform(data2) @@ -1408,10 +1511,30 @@ def test_random_seed(self): 'num4': [1, np.nan, 2] * 10, 'num5': [1, np.nan, 2] * 10, 'num6': [1, np.nan, 2] * 10, - 'date1': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date2': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date3': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date4': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, + 'date1': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date2': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date3': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date4': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, }) ht = HyperTransformer() @@ -1455,18 +1578,18 @@ def test_hypertransformer_with_mutli_column_transformer_end_to_end(self): data_test = pd.DataFrame({ 'A': ['1.0', '2.0', '3.0'], 'B': ['4.0', '5.0', '6.0'], - 'C': [True, False, True] + 'C': [True, False, True], }) dict_config = { 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { ('A', 'B'): DummyMultiColumnTransformerNumerical(), - 'C': UniformEncoder() - } + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1480,31 +1603,37 @@ def 
test_hypertransformer_with_mutli_column_transformer_end_to_end(self): expected_transformed_data = pd.DataFrame({ 'A': [1.0, 2.0, 3.0], 'B': [4.0, 5.0, 6.0], - 'C': [0.10333535312718026, 0.6697388922326716, 0.18775548909503287] + 'C': [ + 0.10333535312718026, + 0.6697388922326716, + 0.18775548909503287, + ], }) pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) pd.testing.assert_frame_equal(reverse_transformed_data, data_test) - def test_hypertransformer_with_mutli_column_transformer_and_single_column(self): + def test_hypertransformer_with_mutli_column_transformer_and_single_column( + self, + ): """Test a mutli column transformer used with for a single column.""" # Setup data_test = pd.DataFrame({ 'A': ['1.0', '2.0', '3.0'], 'B2': ['4.0', '5.0', '6.0'], - 'C': [True, False, True] + 'C': [True, False, True], }) dict_config = { 'sdtypes': { 'A': 'categorical', 'B2': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'A': DummyMultiColumnTransformerNumerical(), - ('B2', ): DummyMultiColumnTransformerNumerical(), - 'C': UniformEncoder() - } + ('B2',): DummyMultiColumnTransformerNumerical(), + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1518,7 +1647,11 @@ def test_hypertransformer_with_mutli_column_transformer_and_single_column(self): expected_transformed_data = pd.DataFrame({ 'A': [1.0, 2.0, 3.0], 'B2': [4.0, 5.0, 6.0], - 'C': [0.04206197607326308, 0.8000968077312287, 0.06325519846695522] + 'C': [ + 0.04206197607326308, + 0.8000968077312287, + 0.06325519846695522, + ], }) pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) @@ -1531,13 +1664,13 @@ def test_update_transformers_single_to_multi_column(self): 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'A': None, 'B': UniformEncoder(), - 'C': UniformEncoder() - } + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ 
-1554,12 +1687,12 @@ def test_update_transformers_single_to_multi_column(self): 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'C': UniformEncoder(), ('A', 'B'): DummyMultiColumnTransformerNumerical(), - } + }, }) expected_multi_columns = { @@ -1580,13 +1713,13 @@ def test_update_transformers_multi_to_single_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1596,7 +1729,7 @@ def test_update_transformers_multi_to_single_column(self): # Run ht.update_transformers({ ('A', 'B'): DummyMultiColumnTransformerNumerical(), - 'D': UniformEncoder() + 'D': UniformEncoder(), }) new_config = ht.get_config() @@ -1607,14 +1740,14 @@ def test_update_transformers_multi_to_single_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'E': UniformEncoder(), "('A', 'B')": DummyMultiColumnTransformerNumerical(), 'C': DummyMultiColumnTransformerNumerical(), - 'D': UniformEncoder() - } + 'D': UniformEncoder(), + }, }) expected_multi_columns = { @@ -1633,13 +1766,13 @@ def test_update_transformers_by_sdtype_mutli_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1657,19 +1790,16 @@ def test_update_transformers_by_sdtype_mutli_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), 'C': LabelEncoder(), - "('B', 'D')": 
DummyMultiColumnTransformerNumerical() - } + "('B', 'D')": DummyMultiColumnTransformerNumerical(), + }, }) - expected_multi_columns = { - 'B': ('B', 'D'), - 'D': ('B', 'D') - } + expected_multi_columns = {'B': ('B', 'D'), 'D': ('B', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == expected_multi_columns @@ -1683,13 +1813,13 @@ def test_remove_transformer(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1706,19 +1836,16 @@ def test_remove_transformer(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), "('C', 'D')": DummyMultiColumnTransformerNumerical(), - 'B': None - } + 'B': None, + }, }) - exepected_multi_columns = { - 'C': ('C', 'D'), - 'D': ('C', 'D') - } + exepected_multi_columns = {'C': ('C', 'D'), 'D': ('C', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == exepected_multi_columns @@ -1732,13 +1859,13 @@ def test_remove_transformer_by_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1756,14 +1883,14 @@ def test_remove_transformer_by_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), "('B', 'D')": DummyMultiColumnTransformerNumerical(), - 'C': None - } + 'C': None, + }, }) assert repr(new_config) == 
repr(expected_config) @@ -1777,13 +1904,13 @@ def test_update_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1791,10 +1918,7 @@ def test_update_sdtype(self): ht.set_config(config) # Run - ht.update_sdtypes({ - 'C': 'numerical', - 'A': 'numerical' - }) + ht.update_sdtypes({'C': 'numerical', 'A': 'numerical'}) new_config = ht.get_config() # Assert @@ -1804,19 +1928,16 @@ def test_update_sdtype(self): 'B': 'categorical', 'C': 'numerical', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': FloatFormatter(), 'E': UniformEncoder(), "('B', 'D')": DummyMultiColumnTransformerNumerical(), - 'C': FloatFormatter() - } + 'C': FloatFormatter(), + }, }) - expected_multi_columns = { - 'B': ('B', 'D'), - 'D': ('B', 'D') - } + expected_multi_columns = {'B': ('B', 'D'), 'D': ('B', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == expected_multi_columns @@ -1826,17 +1947,13 @@ def test_with_tuple_returned_by_faker(self): # Setup ht = HyperTransformer() ht.set_config({ - 'sdtypes': { - 'A': 'pii' - }, + 'sdtypes': {'A': 'pii'}, 'transformers': { 'A': AnonymizedFaker(provider_name='currency', function_name='currency') - } + }, }) - ht.fit(pd.DataFrame({ - 'A': ['a', 'b', 'c'] - })) + ht.fit(pd.DataFrame({'A': ['a', 'b', 'c']})) # Run result = ht.create_anonymized_columns(num_rows=10, column_names=['A']) @@ -1844,10 +1961,16 @@ def test_with_tuple_returned_by_faker(self): # Assert expected_results = pd.DataFrame({ 'A': [ - 'KHR, Cambodian riel', 'TVD, Tuvaluan dollar', 'PKR, Pakistani rupee', - 'SVC, Salvadoran colón', 'CVE, Cape Verdean escudo', 'BRL, Brazilian real', - 'RWF, Rwandan franc', 'KZT, Kazakhstani tenge', 'HRK, Croatian kuna', - 'ILS, Israeli 
new shekel' + 'KHR, Cambodian riel', + 'TVD, Tuvaluan dollar', + 'PKR, Pakistani rupee', + 'SVC, Salvadoran colón', + 'CVE, Cape Verdean escudo', + 'BRL, Brazilian real', + 'RWF, Rwandan franc', + 'KZT, Kazakhstani tenge', + 'HRK, Croatian kuna', + 'ILS, Israeli new shekel', ] }) pd.testing.assert_frame_equal(result, expected_results) @@ -1858,7 +1981,7 @@ def test_with_tuple_returned_by_faker(self): 'B': 'categorical', 'D': 'categorical', 'E': 'categorical', - 'C': 'boolean' + 'C': 'boolean', } } expected_transformer_update = { @@ -1867,7 +1990,7 @@ def test_with_tuple_returned_by_faker(self): 'E': UniformEncoder(), 'C': UniformEncoder(), 'B': UniformEncoder(), - 'D': UniformEncoder() + 'D': UniformEncoder(), } } expected_transformer_remove = { @@ -1876,29 +1999,29 @@ def test_with_tuple_returned_by_faker(self): 'E': UniformEncoder(), 'C': None, 'B': UniformEncoder(), - 'D': UniformEncoder() + 'D': UniformEncoder(), } } - expected_update = { - **expected_sdtype, - **expected_transformer_update - } - expected_remove = { - **expected_sdtype, - **expected_transformer_remove - } + expected_update = {**expected_sdtype, **expected_transformer_update} + expected_remove = {**expected_sdtype, **expected_transformer_remove} parametrization = [ ( - 'update_transformers', {'column_name_to_transformer': {'C': UniformEncoder()}}, - expected_update + 'update_transformers', + {'column_name_to_transformer': {'C': UniformEncoder()}}, + expected_update, ), ( 'update_transformers_by_sdtype', - {'sdtype': 'boolean', 'transformer': UniformEncoder()}, expected_update + {'sdtype': 'boolean', 'transformer': UniformEncoder()}, + expected_update, ), ('remove_transformers', {'column_names': 'C'}, expected_remove), - ('remove_transformers_by_sdtype', {'sdtype': 'boolean'}, expected_remove), + ( + 'remove_transformers_by_sdtype', + {'sdtype': 'boolean'}, + expected_remove, + ), ] @pytest.mark.parametrize(('method_name', 'method_input', 'expected_result'), parametrization) @@ -1908,9 +2031,9 @@ 
def test_invalid_multi_column(self, method_name, method_input, expected_result): When a multi column is no longer valid, all these methods should raise a warning and assign the default transformer to the columns. """ + # Setup class BadDummyMultiColumnTransformer(DummyMultiColumnTransformerNumerical): - @classmethod def _validate_sdtypes(cls, columns_to_sdtype): raise TransformerInputError('Invalid sdtype') @@ -1926,8 +2049,8 @@ def _validate_sdtypes(cls, columns_to_sdtype): 'transformers': { 'A': UniformEncoder(), ('B', 'D', 'C'): BadDummyMultiColumnTransformer(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) diff --git a/tests/integration/test_transformers.py b/tests/integration/test_transformers.py index 5fc4d72a0..aeefe2936 100644 --- a/tests/integration/test_transformers.py +++ b/tests/integration/test_transformers.py @@ -16,23 +16,13 @@ TRANSFORMER_ARGS = { 'BinaryEncoder': { 'missing_value_replacement': -1, - 'missing_value_generation': 'from_column' - }, - 'UnixTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'OptimizedTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'FloatFormatter': { - 'missing_value_generation': 'from_column' - }, - 'GaussianNormalizer': { - 'missing_value_generation': 'from_column' - }, - 'ClusterBasedNormalizer': { - 'missing_value_generation': 'from_column' + 'missing_value_generation': 'from_column', }, + 'UnixTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'OptimizedTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'FloatFormatter': {'missing_value_generation': 'from_column'}, + 'GaussianNormalizer': {'missing_value_generation': 'from_column'}, + 'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'}, } # Mapping of rdt sdtype to dtype @@ -68,8 +58,12 @@ def _validate_helper(validator_function, args, steps): def _is_valid_transformer(transformer_name): """Determine if transformer should be tested 
or not.""" invalid_names = [ - 'IdentityTransformer', 'Dummy', 'OrderedLabelEncoder', 'CustomLabelEncoder', - 'OrderedUniformEncoder', 'BaseMultiColumnTransformer' + 'IdentityTransformer', + 'Dummy', + 'OrderedLabelEncoder', + 'CustomLabelEncoder', + 'OrderedUniformEncoder', + 'BaseMultiColumnTransformer', ] return all(invalid_name not in transformer_name for invalid_name in invalid_names) @@ -204,32 +198,21 @@ def _test_transformer_with_hypertransformer(transformer_class, input_data, steps transformer_args = TRANSFORMER_ARGS.get(transformer_class.__name__, {}) hypertransformer = HyperTransformer() if transformer_args: - field_transformers = { - TEST_COL: transformer_class(**transformer_args) - } + field_transformers = {TEST_COL: transformer_class(**transformer_args)} else: - field_transformers = { - TEST_COL: transformer_class() - } + field_transformers = {TEST_COL: transformer_class()} sdtypes = {} for field, transformer in field_transformers.items(): sdtypes[field] = transformer.get_supported_sdtypes()[0] - config = { - 'sdtypes': sdtypes, - 'transformers': field_transformers - } + config = {'sdtypes': sdtypes, 'transformers': field_transformers} hypertransformer.set_config(config) hypertransformer.fit(input_data) transformed = hypertransformer.transform(input_data) - _validate_helper( - _validate_hypertransformer_transformed_data, - [transformed], - steps - ) + _validate_helper(_validate_hypertransformer_transformed_data, [transformed], steps) out = hypertransformer.reverse_transform(transformed) _validate_helper( diff --git a/tests/integration/transformers/pii/test_anonymizer.py b/tests/integration/transformers/pii/test_anonymizer.py index 66244f683..b1577dd4f 100644 --- a/tests/integration/transformers/pii/test_anonymizer.py +++ b/tests/integration/transformers/pii/test_anonymizer.py @@ -13,16 +13,14 @@ def test_default_settings(self): """End to end test with the default settings of the ``AnonymizedFaker``.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 
'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) instance = AnonymizedFaker() transformed = instance.fit_transform(data, 'username') reverse_transform = instance.reverse_transform(transformed) - expected_transformed = pd.DataFrame({ - 'id': [1, 2, 3, 4, 5] - }) + expected_transformed = pd.DataFrame({'id': [1, 2, 3, 4, 5]}) pd.testing.assert_frame_equal(transformed, expected_transformed) assert len(reverse_transform['username']) == 5 @@ -31,16 +29,14 @@ def test_default_settings_with_locales(self): """End to end test with the default settings and locales of the ``AnonymizedFaker``.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) instance = AnonymizedFaker(locales=['en_US', 'en_CA', 'es_ES']) transformed = instance.fit_transform(data, 'username') reverse_transform = instance.reverse_transform(transformed) - expected_transformed = pd.DataFrame({ - 'id': [1, 2, 3, 4, 5] - }) + expected_transformed = pd.DataFrame({'id': [1, 2, 3, 4, 5]}) pd.testing.assert_frame_equal(transformed, expected_transformed) assert len(reverse_transform['username']) == 5 @@ -63,8 +59,8 @@ def test_custom_provider(self): '4149498289355', '213144860944676', '4514775286178', - '213133122335401' - ] + '213133122335401', + ], }) instance = AnonymizedFaker('credit_card', 'credit_card_number') @@ -83,7 +79,7 @@ def test_with_nans(self): """Test with the default settings of the ``AnonymizedFaker`` with ``nan`` values.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = AnonymizedFaker() @@ -102,7 +98,7 @@ def test_with_nans_missing_value_generation_none(self): """End to end test settings missing_value_generation=None.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = 
AnonymizedFaker(missing_value_generation=None) @@ -127,8 +123,8 @@ def test_custom_provider_with_nans(self): np.nan, '213144860944676', '4514775286178', - '213133122335401' - ] + '213133122335401', + ], }) instance = AnonymizedFaker( @@ -153,9 +149,7 @@ def test_cardinality_rule(self): Also ensure that when we call ``reset_randomization`` the generator will be able to create values again. """ - data = pd.DataFrame({ - 'job': np.arange(500) - }) + data = pd.DataFrame({'job': np.arange(500)}) instance = AnonymizedFaker('job', 'job', cardinality_rule='unique') transformed = instance.fit_transform(data, 'job') @@ -178,9 +172,7 @@ def test_cardinality_rule(self): def test_cardinality_rule_match(self): """Test it works with the cardinality rule 'match'.""" # Setup - data = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2] - }) + data = pd.DataFrame({'col': [1, 2, 3, 1, 2]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -193,9 +185,7 @@ def test_cardinality_rule_match(self): def test_cardinality_rule_match_nans(self): """Test it works with the cardinality rule 'match' with nans.""" # Setup - data = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2] - }) + data = pd.DataFrame({'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -209,12 +199,8 @@ def test_cardinality_rule_match_nans(self): def test_cardinality_rule_match_not_enough_unique_values(self): """Test it works with the cardinality rule 'match' and too few values to transform.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2] - }) - data_transform = pd.DataFrame({ - 'col': [1, 1, 1] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]}) + data_transform = pd.DataFrame({'col': [1, 1, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -229,12 +215,8 @@ def test_cardinality_rule_match_not_enough_unique_values(self): def 
test_cardinality_rule_match_too_many_unique(self): """Test it works with the cardinality rule 'match' and more unique values than samples.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, 4, 5, 6] - }) - data_transform = pd.DataFrame({ - 'col': [1, 1, np.nan, 3, 1] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, 4, 5, 6]}) + data_transform = pd.DataFrame({'col': [1, 1, np.nan, 3, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -249,12 +231,8 @@ def test_cardinality_rule_match_too_many_unique(self): def test_cardinality_rule_match_too_many_nans(self): """Test it works with the cardinality rule 'match' and more nans than possible to fit.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, np.nan, np.nan, np.nan] - }) - data_transform = pd.DataFrame({ - 'col': [1, 1, 1, 1] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, np.nan, np.nan, np.nan]}) + data_transform = pd.DataFrame({'col': [1, 1, 1, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -273,9 +251,7 @@ def test_enforce_uniqueness_backwards_compatability(self): expected (can happen when previous transformer version is loaded from a pkl file). 
""" # Setup - data = pd.DataFrame({ - 'job': np.arange(500) - }) + data = pd.DataFrame({'job': np.arange(500)}) instance = AnonymizedFaker('job', 'job', cardinality_rule='match') instance.enforce_uniqueness = True @@ -304,9 +280,7 @@ def test_enforce_uniqueness_backwards_compatability(self): class TestPsuedoAnonymizedFaker: def test_default_settings(self): """End to end test with the default settings of the ``PseudoAnonymizedFaker``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', 'parrot', 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', 'parrot', 'monkey']}) instance = PseudoAnonymizedFaker() @@ -316,7 +290,7 @@ def test_default_settings(self): assert transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) assert unique_animals.intersection(set(instance._mapping_dict)) == set() @@ -324,9 +298,7 @@ def test_default_settings(self): def test_with_nans(self): """Test with the default settings of the ``PseudoAnonymizedFaker`` and ``nans``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', np.nan, 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', np.nan, 'monkey']}) instance = PseudoAnonymizedFaker() @@ -336,7 +308,7 @@ def test_with_nans(self): assert transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) assert unique_animals.intersection(set(instance._mapping_dict)) == set() @@ -344,9 +316,7 @@ def test_with_nans(self): def test_with_custom_provider(self): """End to end test with custom settings of the ``PseudoAnonymizedFaker``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', np.nan, 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', np.nan, 'monkey']}) instance = 
PseudoAnonymizedFaker('credit_card', 'credit_card_number') @@ -356,7 +326,7 @@ def test_with_custom_provider(self): assert transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) assert unique_animals.intersection(set(instance._mapping_dict)) == set() diff --git a/tests/integration/transformers/test_base.py b/tests/integration/transformers/test_base.py index 02a4ef72a..564793f34 100644 --- a/tests/integration/transformers/test_base.py +++ b/tests/integration/transformers/test_base.py @@ -27,9 +27,9 @@ def test_dummy_transformer_series_output(): - The transformed data should be able to reversed to re-produce the input data. """ + # Setup class DummyTransformer(BaseTransformer): - INPUT_SDTYPE = 'boolean' def _fit(self, data): @@ -42,9 +42,7 @@ def _reverse_transform(self, data): return data.round() != 0 # Run - data = pd.DataFrame({ - 'bool': [True, False, True, False] - }) + data = pd.DataFrame({'bool': [True, False, True, False]}) transformer = DummyTransformer() transformed = transformer.fit_transform(data, 'bool') @@ -52,9 +50,7 @@ def _reverse_transform(self, data): reverse = transformer.reverse_transform(transformed) # Assert - expected_transform = pd.DataFrame({ - 'bool': [1., 0., 1., 0.] - }) + expected_transform = pd.DataFrame({'bool': [1.0, 0.0, 1.0, 0.0]}) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data) @@ -82,9 +78,9 @@ def test_dummy_transformer_dataframe_output(): - The transformed data should be able to reversed to re-produce the input data. 
""" + # Setup class DummyTransformer(BaseTransformer): - INPUT_SDTYPE = 'boolean' def __init__(self): @@ -98,13 +94,17 @@ def _fit(self, data): pass def _transform(self, data): - out = pd.DataFrame(dict(zip( - self.output_columns, - [ - data.astype(float).fillna(-1), - data.isna().astype(float) - ] - ))) + out = pd.DataFrame( + dict( + zip( + self.output_columns, + [ + data.astype(float).fillna(-1), + data.isna().astype(float), + ], + ) + ) + ) return out @@ -124,8 +124,8 @@ def _reverse_transform(self, data): # Assert expected_transform = pd.DataFrame({ - 'bool': [1., 0., 1., -1.], - 'bool.null': [0., 0., 0., 1.] + 'bool': [1.0, 0.0, 1.0, -1.0], + 'bool.null': [0.0, 0.0, 0.0, 1.0], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data) @@ -133,6 +133,7 @@ def _reverse_transform(self, data): def test_multi_column_transformer_same_number_of_columns_input_output(): """Test a multi-column transformer when the same of input and output columns.""" + # Setup class AdditionTransformer(BaseMultiColumnTransformer): """This transformer takes 3 columns and return the cumulative sum of each row.""" @@ -141,7 +142,7 @@ def _fit(self, columns_data): self.output_properties = { f'{self.columns[0]}': {'sdtype': 'numerical'}, f'{self.columns[0]}+{self.columns[1]}': {'sdtype': 'numerical'}, - f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': {'sdtype': 'numerical'} + f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': {'sdtype': 'numerical'}, } def _get_prefix(self): @@ -159,13 +160,13 @@ def _reverse_transform(self, data): data_test = pd.DataFrame({ 'col_1': [1, 2, 3], 'col_2': [10, 20, 30], - 'col_3': [100, 200, 300] + 'col_3': [100, 200, 300], }) columns_to_sdtypes = { 'col_1': 'numerical', 'col_2': 'numerical', - 'col_3': 'numerical' + 'col_3': 'numerical', } transformer = AdditionTransformer() @@ -177,7 +178,7 @@ def _reverse_transform(self, data): expected_transform = pd.DataFrame({ 'col_1': [1, 2, 3], 
'col_1+col_2': [11, 22, 33], - 'col_1+col_2+col_3': [111, 222, 333] + 'col_1+col_2+col_3': [111, 222, 333], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -185,6 +186,7 @@ def _reverse_transform(self, data): def test_multi_column_transformer_less_output_than_input_columns(): """Test a multi-column transformer when the output has less columns than the input.""" + class ConcatenateTransformer(BaseMultiColumnTransformer): """This transformer takes 4 columns and concatenate them into 2 columns. The two first and last columns are concatenated together. @@ -195,7 +197,7 @@ def _fit(self, columns_data): self.name_2 = self.columns[2] + '#' + self.columns[3] self.output_properties = { f'{self.name_1}.concatenate_1': {'sdtype': 'categorical'}, - f'{self.name_2}.concatenate_2': {'sdtype': 'categorical'} + f'{self.name_2}.concatenate_2': {'sdtype': 'categorical'}, } def _get_prefix(self): @@ -223,14 +225,14 @@ def _reverse_transform(self, data): 'col_1': ['A', 'B', 'C'], 'col_2': ['D', 'E', 'F'], 'col_3': ['G', 'H', 'I'], - 'col_4': ['J', 'K', 'L'] + 'col_4': ['J', 'K', 'L'], }) columns_to_sdtypes = { 'col_1': 'categorical', 'col_2': 'categorical', 'col_3': 'categorical', - 'col_4': 'categorical' + 'col_4': 'categorical', } transformer = ConcatenateTransformer() @@ -242,7 +244,7 @@ def _reverse_transform(self, data): # Assert expected_transform = pd.DataFrame({ 'col_1#col_2.concatenate_1': ['A#D', 'B#E', 'C#F'], - 'col_3#col_4.concatenate_2': ['G#J', 'H#K', 'I#L'] + 'col_3#col_4.concatenate_2': ['G#J', 'H#K', 'I#L'], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -250,14 +252,14 @@ def _reverse_transform(self, data): def test_multi_column_transformer_more_output_than_input_columns(): """Test a multi-column transformer when the output has more columns than the input.""" - class ExpandTransformer(BaseMultiColumnTransformer): + class 
ExpandTransformer(BaseMultiColumnTransformer): def _fit(self, columns_data): self.output_properties = { f'{self.columns[0]}.first_part_1': {'sdtype': 'categorical'}, f'{self.columns[0]}.second_part_1': {'sdtype': 'categorical'}, f'{self.columns[1]}.first_part_2': {'sdtype': 'categorical'}, - f'{self.columns[1]}.second_part_2': {'sdtype': 'categorical'} + f'{self.columns[1]}.second_part_2': {'sdtype': 'categorical'}, } def _get_prefix(self): @@ -285,10 +287,7 @@ def _reverse_transform(self, data): 'col_2': ['GH', 'IJ', 'KL'], }) - columns_to_sdtypes = { - 'col_1': 'categorical', - 'col_2': 'categorical' - } + columns_to_sdtypes = {'col_1': 'categorical', 'col_2': 'categorical'} transformer = ExpandTransformer() # Run @@ -301,7 +300,7 @@ def _reverse_transform(self, data): 'col_1.first_part_1': ['A', 'C', 'E'], 'col_1.second_part_1': ['B', 'D', 'F'], 'col_2.first_part_2': ['G', 'I', 'K'], - 'col_2.second_part_2': ['H', 'J', 'L'] + 'col_2.second_part_2': ['H', 'J', 'L'], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) diff --git a/tests/integration/transformers/test_boolean.py b/tests/integration/transformers/test_boolean.py index 827802b5b..80857955e 100644 --- a/tests/integration/transformers/test_boolean.py +++ b/tests/integration/transformers/test_boolean.py @@ -5,7 +5,6 @@ class TestBinaryEncoder: - def test_boolean_some_nans(self): """Test BinaryEncoder on input with some nan values. @@ -50,7 +49,7 @@ def test_boolean_missing_value_replacement_mode(self): column = 'bool' transformer = BinaryEncoder( missing_value_replacement='mode', - missing_value_generation='from_column' + missing_value_generation='from_column', ) # Run @@ -60,8 +59,8 @@ def test_boolean_missing_value_replacement_mode(self): # Assert expected_transformed = pd.DataFrame({ - 'bool': [1., 1., 1., 0.], - 'bool.is_null': [0., 0., 1., 0.] 
+ 'bool': [1.0, 1.0, 1.0, 0.0], + 'bool.is_null': [0.0, 0.0, 1.0, 0.0], }) pd.testing.assert_frame_equal(transformed, expected_transformed) pd.testing.assert_frame_equal(reverse, data) @@ -74,10 +73,7 @@ def test_boolean_missing_value_generation_none(self): # Setup data = pd.DataFrame([True, True, None, False], columns=['bool']) column = 'bool' - transformer = BinaryEncoder( - missing_value_replacement='mode', - missing_value_generation=None - ) + transformer = BinaryEncoder(missing_value_replacement='mode', missing_value_generation=None) # Run transformer.fit(data, column) @@ -85,7 +81,7 @@ def test_boolean_missing_value_generation_none(self): reverse = transformer.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({'bool': [1., 1., 1., 0.]}) + expected_transformed = pd.DataFrame({'bool': [1.0, 1.0, 1.0, 0.0]}) expected_reversed = pd.DataFrame({'bool': [True, True, True, False]}) pd.testing.assert_frame_equal(transformed, expected_transformed) pd.testing.assert_frame_equal(reverse, expected_reversed, check_dtype=False) diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 4f77b7fe4..f30f7627a 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -6,8 +6,13 @@ import pandas as pd from rdt.transformers import ( - FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, OrderedUniformEncoder, - UniformEncoder) + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, + UniformEncoder, +) class TestUniformEncoder: @@ -83,7 +88,18 @@ def test__reverse_transform_nans(self): """Test ``reverse_transform`` for data with NaNs.""" # Setup data = pd.DataFrame({ - 'column_name': ['a', 'b', 'c', np.nan, 'c', 'b', 'b', 'a', 'b', np.nan] + 'column_name': [ + 'a', + 'b', + 'c', + np.nan, + 'c', + 'b', + 'b', + 'a', + 'b', + np.nan, + ] }) column = 'column_name' @@ 
-97,6 +113,30 @@ def test__reverse_transform_nans(self): # Asserts pd.testing.assert_series_equal(output[column], data[column]) + def test__reverse_transform_nans_pandas_warning(self): + """Test ``_reverse_transform`` for data with NaNs. + + Here we check that no pandas warning is raised. + """ + # Setup + intervals = {'United-States': [0.0, 0.8], None: [0.8, 0.9], 'Jamaica': [0.9, 0.99]} + data = pd.Series([0.107995, 0.148025, 0.632702], name='native-country', dtype=float) + transformer = UniformEncoder() + transformer.intervals = intervals + transformer.dtype = 'O' + + # Run + with warnings.catch_warnings(record=True) as w: + result = transformer._reverse_transform(data) + + assert len(w) == 0 + + # Asserts + expected_result = pd.Series( + ['United-States', 'United-States', 'United-States'], name='native-country' + ) + pd.testing.assert_series_equal(result, expected_result) + def test_uniform_encoder_unseen_transform_nan(self): """Ensure UniformEncoder works when np.nan to transform wasn't seen during fit.""" # Setup @@ -204,9 +244,7 @@ def test_frequency_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert @@ -488,9 +526,7 @@ def test_one_hot_doesnt_warn(tmp_path): def test_one_hot_categoricals(): """Ensure OneHotEncoder works on categorical data. 
GH#751""" # Setup - test_data = pd.DataFrame(data={ - 'A': ['Yes', 'No', 'Yes', 'Maybe', 'No'] - }) + test_data = pd.DataFrame(data={'A': ['Yes', 'No', 'Yes', 'Maybe', 'No']}) test_data['A'] = test_data['A'].astype('category') transformer = OneHotEncoder() @@ -505,7 +541,7 @@ def test_one_hot_categoricals(): 'A.value1': [0, 1, 0, 0, 1], 'A.value2': [0, 0, 0, 1, 0], }), - check_dtype=False + check_dtype=False, ) # Run @@ -524,7 +560,7 @@ def test_label_numerical_2d_array(): transformer = LabelEncoder() transformer.fit(data, column) - transformed = pd.DataFrame([0., 1., 2., 3.], columns=['column_name']) + transformed = pd.DataFrame([0.0, 1.0, 2.0, 3.0], columns=['column_name']) reverse = transformer.reverse_transform(transformed) pd.testing.assert_frame_equal(reverse, data) @@ -550,9 +586,7 @@ def test_label_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert @@ -662,9 +696,7 @@ def test_ordered_label_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert diff --git a/tests/integration/transformers/test_datetime.py b/tests/integration/transformers/test_datetime.py index 59e189f1a..e09767aa8 100644 --- a/tests/integration/transformers/test_datetime.py +++ b/tests/integration/transformers/test_datetime.py @@ -1,7 +1,10 @@ import numpy as np import pandas as pd -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder +from rdt.transformers.datetime 
import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) class TestUnixTimestampEncoder: @@ -19,7 +22,7 @@ def test_unixtimestampencoder(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -38,7 +41,7 @@ def test_unixtimestampencoder_different_format(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, data) @@ -49,7 +52,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): ute = UnixTimestampEncoder( missing_value_replacement='mean', missing_value_generation=None, - datetime_format='%b %d, %Y' + datetime_format='%b %d, %Y', ) data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) @@ -61,7 +64,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) expected_reversed = pd.DataFrame({ 'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965'] @@ -72,10 +75,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): def test_unixtimestampencoder_with_missing_value_replacement_random(self): """Test that transformed data will replace nans with random values from the data.""" # Setup - ute = UnixTimestampEncoder( - missing_value_replacement='random', - datetime_format='%b %d, %Y' - ) + ute = UnixTimestampEncoder(missing_value_replacement='random', datetime_format='%b %d, %Y') data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) # Run @@ -86,11 
+86,9 @@ def test_unixtimestampencoder_with_missing_value_replacement_random(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [-7.007396e+16, 845510400000000000, -145497600000000000] - }) - expected_reversed = pd.DataFrame({ - 'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965'] + 'column': [-7.007396e16, 845510400000000000, -145497600000000000] }) + expected_reversed = pd.DataFrame({'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965']}) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, expected_reversed) @@ -108,8 +106,8 @@ def test_unixtimestampencoder_with_model_missing_values(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000], - 'column.is_null': [1., 0., 0.] + 'column': [3.500064e17, 845510400000000000, -145497600000000000], + 'column.is_null': [1.0, 0.0, 0.0], }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -129,7 +127,7 @@ def test_unixtimestampencoder_with_integer_datetimes(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [6.958656e+17, 1.856736e+18, 6.547392e+17], + 'column': [6.958656e17, 1.856736e18, 6.547392e17], }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -149,8 +147,8 @@ def test_unixtimestampencoder_with_nans(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [0., 0., 0.], - 'column.is_null': [1., 1., 1.] 
+ 'column': [0.0, 0.0, 0.0], + 'column.is_null': [1.0, 1.0, 1.0], }) pd.testing.assert_frame_equal(expected_transformed, transformed) diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index b391e69b9..8e301ad71 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -2,11 +2,14 @@ import pandas as pd from copulas import univariate -from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer +from rdt.transformers.numerical import ( + ClusterBasedNormalizer, + FloatFormatter, + GaussianNormalizer, +) class TestFloatFormatter: - def test_missing_value_generation_from_column(self): """Test end to end with ``missing_value_generation`` set to ``from_column``. @@ -115,7 +118,9 @@ def test_model_missing_value(self): assert list(transformed.iloc[:, 1]) == [0, 0, 0, 0, 1, 0] np.testing.assert_array_almost_equal(reverse, data, decimal=2) - def test_missing_value_replacement_set_to_random_and_model_missing_values(self): + def test_missing_value_replacement_set_to_random_and_model_missing_values( + self, + ): """Test that we are still able to use ``missing_value_replacement`` when is ``random``.""" # Setup data = pd.DataFrame({'a': [1, 2, 3, np.nan, np.nan, 4]}) @@ -128,8 +133,8 @@ def test_missing_value_replacement_set_to_random_and_model_missing_values(self): # Assert expected_transformed = pd.DataFrame({ - 'a': [1., 2., 3., 2.617107, 1.614805, 4.], - 'a.is_null': [0., 0., 0., 1., 1., 0.] 
+ 'a': [1.0, 2.0, 3.0, 2.617107, 1.614805, 4.0], + 'a.is_null': [0.0, 0.0, 0.0, 1.0, 1.0, 0.0], }) pd.testing.assert_frame_equal(transformed, expected_transformed) pd.testing.assert_frame_equal(reverse, data) @@ -158,7 +163,6 @@ def test_missing_value_replacement_random_all_nans(self): class TestGaussianNormalizer: - def test_stats(self): data = pd.DataFrame(np.random.normal(loc=4, scale=4, size=1000), columns=['a']) column = 'a' @@ -206,7 +210,9 @@ def test_missing_value_generation_random(self): reverse = ct.reverse_transform(transformed) expected = pd.DataFrame( - [1., 1.9999999510423996, 1., 1.9999999510423996, 1.4, 1.], columns=['a']) + [1.0, 1.9999999510423996, 1.0, 1.9999999510423996, 1.4, 1.0], + columns=['a'], + ) pd.testing.assert_frame_equal(reverse, expected) def test_int(self): @@ -299,7 +305,6 @@ def test_uniform_class(self): class TestClusterBasedNormalizer: - def generate_data(self): data1 = np.random.normal(loc=5, scale=1, size=100) data2 = np.random.normal(loc=-5, scale=1, size=100) @@ -327,7 +332,7 @@ def test_some_nulls(self): random_state = np.random.get_state() np.random.set_state(np.random.RandomState(10).get_state()) data = self.generate_data() - mask = np.random.choice([1, 0], data.shape, p=[.1, .9]).astype(bool) + mask = np.random.choice([1, 0], data.shape, p=[0.1, 0.9]).astype(bool) data[mask] = np.nan column = 'col' @@ -395,10 +400,12 @@ def test_out_of_bounds_reverse_transform(self): data = pd.DataFrame({ 'col': [round(i, 2) for i in np.random.uniform(0, 10, size=100)] + [None] }) - reverse_data = pd.DataFrame(data={ - 'col.normalized': np.random.uniform(-10, 10, size=100), - 'col.component': np.random.choice([0.0, 1.0, 2.0, 10.0], size=100) - }) + reverse_data = pd.DataFrame( + data={ + 'col.normalized': np.random.uniform(-10, 10, size=100), + 'col.component': np.random.choice([0.0, 1.0, 2.0, 10.0], size=100), + } + ) transformer = ClusterBasedNormalizer() # Run diff --git a/tests/integration/transformers/test_text.py 
b/tests/integration/transformers/test_text.py index 18190b73e..02085cc92 100644 --- a/tests/integration/transformers/test_text.py +++ b/tests/integration/transformers/test_text.py @@ -7,14 +7,13 @@ from rdt.transformers.text import IDGenerator, RegexGenerator -class TestIDGenerator(): - +class TestIDGenerator: def test_end_to_end(self): """End to end test of the ``IDGenerator``.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -26,18 +25,16 @@ def test_end_to_end(self): reverse_transform_3 = transformer.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'username': ['a', 'b', 'c', 'd', 'e'] - }) + expected_transformed = pd.DataFrame({'username': ['a', 'b', 'c', 'd', 'e']}) expected_reverse_transform = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], - 'id': ['id_100_X', 'id_101_X', 'id_102_X', 'id_103_X', 'id_104_X'] + 'id': ['id_100_X', 'id_101_X', 'id_102_X', 'id_103_X', 'id_104_X'], }) expected_reverse_transform_2 = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], - 'id': ['id_105_X', 'id_106_X', 'id_107_X', 'id_108_X', 'id_109_X'] + 'id': ['id_105_X', 'id_106_X', 'id_107_X', 'id_108_X', 'id_109_X'], }) pd.testing.assert_frame_equal(transformed, expected_transformed) @@ -46,13 +43,13 @@ def test_end_to_end(self): pd.testing.assert_frame_equal(reverse_transform_3, expected_reverse_transform) -class TestRegexGenerator(): +class TestRegexGenerator: def test_regexgenerator(self): """Test ``RegexGenerator`` with the default parameters.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -61,9 +58,7 @@ def test_regexgenerator(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'username': ['a', 'b', 'c', 'd', 'e'] - }) + expected_transformed = pd.DataFrame({'username': 
['a', 'b', 'c', 'd', 'e']}) expected_reverse_transformed = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], 'id': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'], @@ -103,7 +98,7 @@ def test_with_nans(self): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) # Run @@ -129,7 +124,7 @@ def test_data_length_bigger_than_regex(self): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) # Run @@ -155,7 +150,7 @@ def test_input_data_bigger_than_data_length(self): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -184,7 +179,7 @@ def test_called_multiple_times(self): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = RegexGenerator('[a-c]') @@ -240,9 +235,7 @@ def test_called_multiple_times_enforce_uniqueness(self): second_reverse_transform = generator.reverse_transform(transformed_data.head(5)) # Assert - expected_first_reverse_transform = pd.DataFrame({ - 'my_column': ['AAAAA', 'AAAAB', 'AAAAC'] - }) + expected_first_reverse_transform = pd.DataFrame({'my_column': ['AAAAA', 'AAAAB', 'AAAAC']}) expected_second_reverse_transform = pd.DataFrame({ 'my_column': ['AAAAD', 'AAAAE', 'AAAAF', 'AAAAG', 'AAAAH'] }) @@ -254,7 +247,7 @@ def test_pickled(self, tmpdir): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -276,7 +269,13 @@ def test_with_many_possibilities(self): """Test the ``RegexGenerator`` with regex containing many possibilities.""" # Setup data = pd.DataFrame({ - 'id': ['a' * 50, 'a' * 49 + 'b', 'a' * 49 + 'c', 'a' * 49 + 'd', 'a' * 49 + 'e'], + 'id': [ + 'a' * 50, + 'a' * 49 + 'b', + 'a' * 49 + 
'c', + 'a' * 49 + 'd', + 'a' * 49 + 'e', + ], 'username': ['aa', 'bb', 'cc', 'dd', 'ee'], }) @@ -292,7 +291,13 @@ def test_with_many_possibilities(self): expected_reverse_transformed = pd.DataFrame({ 'username': ['aa', 'bb', 'cc', 'dd', 'ee'], - 'id': ['a' * 50, 'a' * 49 + 'b', 'a' * 49 + 'c', 'a' * 49 + 'd', 'a' * 49 + 'e'], + 'id': [ + 'a' * 50, + 'a' * 49 + 'b', + 'a' * 49 + 'c', + 'a' * 49 + 'd', + 'a' * 49 + 'e', + ], }) pd.testing.assert_frame_equal(transformed, expected_transformed) @@ -345,7 +350,8 @@ def test_end_to_end_scrambled(self): ht.detect_initial_config(customers) ht.update_sdtypes({'id': 'text'}) ht.update_transformers({ - 'id': RegexGenerator(regex_format='id_[a-z]', generation_order='scrambled')}) + 'id': RegexGenerator(regex_format='id_[a-z]', generation_order='scrambled') + }) # Run ht.fit(customers) diff --git a/tests/performance/test_performance.py b/tests/performance/test_performance.py index a20274c41..9dd877886 100644 --- a/tests/performance/test_performance.py +++ b/tests/performance/test_performance.py @@ -9,11 +9,17 @@ from rdt.performance.profiling import profile_transformer from rdt.transformers import get_transformers_by_type from rdt.transformers.categorical import ( - CustomLabelEncoder, OrderedLabelEncoder, OrderedUniformEncoder) + CustomLabelEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, +) from rdt.transformers.numerical import ClusterBasedNormalizer SANDBOX_TRANSFORMERS = [ - ClusterBasedNormalizer, OrderedLabelEncoder, CustomLabelEncoder, OrderedUniformEncoder + ClusterBasedNormalizer, + OrderedLabelEncoder, + CustomLabelEncoder, + OrderedUniformEncoder, ] @@ -101,8 +107,14 @@ def _round_to_magnitude(value): raise ValueError('Value is too big') -def find_transformer_boundaries(transformer, dataset_generator, fit_size, - transform_size, iterations=1, multiplier=5): +def find_transformer_boundaries( + transformer, + dataset_generator, + fit_size, + transform_size, + iterations=1, + multiplier=5, +): """Helper function 
to find valid candidate boundaries for performance tests. The function works by: diff --git a/tests/performance/tests/test_profiling.py b/tests/performance/tests/test_profiling.py index 3904c9568..d07c53a22 100644 --- a/tests/performance/tests/test_profiling.py +++ b/tests/performance/tests/test_profiling.py @@ -42,25 +42,34 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): deepcopy_mock.return_value = transformer_mock.return_value # Run - profiling_results = profile_transformer(transformer_mock.return_value, - dataset_gen_mock, 100) + profiling_results = profile_transformer(transformer_mock.return_value, dataset_gen_mock, 100) # Assert expected_output_columns = [ - 'Fit Time', 'Fit Memory', 'Transform Time', 'Transform Memory', - 'Reverse Transform Time', 'Reverse Transform Memory' + 'Fit Time', + 'Fit Memory', + 'Transform Time', + 'Transform Memory', + 'Reverse Transform Time', + 'Reverse Transform Memory', ] assert len(deepcopy_mock.mock_calls) == 10 assert len(transformer_mock.return_value.fit.mock_calls) == 11 assert len(transformer_mock.return_value.transform.mock_calls) == 11 assert len(transformer_mock.return_value.reverse_transform.mock_calls) == 10 - all(np.testing.assert_array_equal(call[1][0], np.ones(100)) for call - in transformer_mock.fit.mock_calls) - all(np.testing.assert_array_equal(call[1][0], np.ones(100)) for call - in transformer_mock.transform.mock_calls) - all(np.testing.assert_array_equal(call[1][0], np.zeros(100)) for call - in transformer_mock.reverse_transform.mock_calls) + all( + np.testing.assert_array_equal(call[1][0], np.ones(100)) + for call in transformer_mock.fit.mock_calls + ) + all( + np.testing.assert_array_equal(call[1][0], np.ones(100)) + for call in transformer_mock.transform.mock_calls + ) + all( + np.testing.assert_array_equal(call[1][0], np.zeros(100)) + for call in transformer_mock.reverse_transform.mock_calls + ) assert expected_output_columns == list(profiling_results.index) @@ -74,7 +83,7 @@ def 
test_profile_transformer(deepcopy_mock, multiprocessor_mock): assert transform_call[2]['args'][0] == transformer_mock.return_value.transform pd.testing.assert_frame_equal( transform_call[2]['args'][1].reset_index(drop=True), - pd.DataFrame({'test': np.ones(100)}) + pd.DataFrame({'test': np.ones(100)}), ) assert reverse_transform_call[2]['args'][0] == transformer_mock.return_value.reverse_transform np.testing.assert_array_equal(reverse_transform_call[2]['args'][1], np.zeros(100)) diff --git a/tests/unit/test___init__.py b/tests/unit/test___init__.py index a33e124b5..be76aaaf6 100644 --- a/tests/unit/test___init__.py +++ b/tests/unit/test___init__.py @@ -1,4 +1,3 @@ - import sys from types import ModuleType from unittest.mock import Mock, patch @@ -25,7 +24,11 @@ def test_get_demo(): demo = get_demo() assert list(demo.columns) == [ - 'last_login', 'email_optin', 'credit_card', 'age', 'dollars_spent' + 'last_login', + 'email_optin', + 'credit_card', + 'age', + 'dollars_spent', ] assert len(demo) == 5 assert list(demo.isna().sum(axis=0)) == [1, 1, 1, 0, 1] @@ -34,23 +37,65 @@ def test_get_demo(): def test_get_demo_many_rows(): demo = get_demo(10) - login_dates = pd.Series([ - '2021-06-26', '2021-02-10', 'NaT', '2020-09-26', '2020-12-22', '2019-11-27', - '2002-05-10', '2014-10-04', '2014-03-19', '2015-09-13' - ], dtype='datetime64[ns]') - email_optin = [False, False, False, True, np.nan, np.nan, False, True, False, False] + login_dates = pd.Series( + [ + '2021-06-26', + '2021-02-10', + 'NaT', + '2020-09-26', + '2020-12-22', + '2019-11-27', + '2002-05-10', + '2014-10-04', + '2014-03-19', + '2015-09-13', + ], + dtype='datetime64[ns]', + ) + email_optin = [ + False, + False, + False, + True, + np.nan, + np.nan, + False, + True, + False, + False, + ] credit_card = [ - 'VISA', 'VISA', 'AMEX', np.nan, 'DISCOVER', 'AMEX', 'AMEX', 'DISCOVER', 'DISCOVER', 'VISA' + 'VISA', + 'VISA', + 'AMEX', + np.nan, + 'DISCOVER', + 'AMEX', + 'AMEX', + 'DISCOVER', + 'DISCOVER', + 'VISA', ] age 
= [29, 18, 21, 45, 32, 50, 93, 75, 39, 66] - dollars_spent = [99.99, np.nan, 2.50, 25.00, 19.99, 52.48, 39.99, 4.67, np.nan, 23.28] + dollars_spent = [ + 99.99, + np.nan, + 2.50, + 25.00, + 19.99, + 52.48, + 39.99, + 4.67, + np.nan, + 23.28, + ] expected = pd.DataFrame({ 'last_login': login_dates, 'email_optin': email_optin, 'credit_card': credit_card, 'age': age, - 'dollars_spent': dollars_spent + 'dollars_spent': dollars_spent, }) pd.testing.assert_frame_equal(demo, expected) @@ -78,6 +123,7 @@ def test__find_addons_module(entry_points_mock, mock_rdt): @patch.object(rdt, 'entry_points') def test__find_addons_type_error(entry_points_mock): """Test it when entry_points raises a TypeError (happens for py38, py39).""" + # Setup def side_effect(arg=None): if arg == 'rdt_modules': @@ -114,13 +160,14 @@ def test__find_addons_object(entry_points_mock, mock_rdt): @patch('rdt.entry_points') def test__find_addons_bad_addon(entry_points_mock, warning_mock): """Test failing to load an add-on generates a warning.""" + # Setup def entry_point_error(): raise ValueError() bad_entry_point = Mock() bad_entry_point.name = 'bad_entry_point' - bad_entry_point.version = 'bad_module' + bad_entry_point.value = 'bad_module' bad_entry_point.load.side_effect = entry_point_error entry_points_mock.return_value = [bad_entry_point] msg = 'Failed to load "bad_entry_point" from "bad_module".' @@ -204,7 +251,7 @@ def test__find_addons_missing_object(entry_points_mock, warning_mock, mock_rdt): bad_entry_point = Mock() bad_entry_point.name = 'rdt.submodule:missing_object.new_method' entry_points_mock.return_value = [bad_entry_point] - msg = ("Failed to set 'rdt.submodule:missing_object.new_method': missing_object.") + msg = "Failed to set 'rdt.submodule:missing_object.new_method': missing_object." 
del mock_rdt.submodule.missing_object diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py index ddf7fff86..f175eec8c 100644 --- a/tests/unit/test_hyper_transformer.py +++ b/tests/unit/test_hyper_transformer.py @@ -8,17 +8,29 @@ from rdt import HyperTransformer from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError, - TransformerProcessingError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, + TransformerProcessingError, +) from rdt.transformers import ( - AnonymizedFaker, BaseMultiColumnTransformer, BinaryEncoder, FloatFormatter, FrequencyEncoder, - LabelEncoder, RegexGenerator, UniformEncoder, UnixTimestampEncoder) + AnonymizedFaker, + BaseMultiColumnTransformer, + BinaryEncoder, + FloatFormatter, + FrequencyEncoder, + LabelEncoder, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, +) from rdt.transformers.base import BaseTransformer from rdt.transformers.numerical import ClusterBasedNormalizer class TestHyperTransformer(TestCase): - def test__add_field_to_set_string(self): """Test the ``_add_field_to_set`` method. 
@@ -85,7 +97,7 @@ def test__validate_field_transformers(self): field_transformers = { 'integer': int_transformer, 'float': float_transformer, - ('integer',): int_transformer + ('integer',): int_transformer, } ht = HyperTransformer() ht.field_transformers = field_transformers @@ -174,12 +186,9 @@ def test__create_multi_column_fields(self): 'a': BinaryEncoder, 'b': UnixTimestampEncoder, ('c', 'd'): UnixTimestampEncoder, - 'e': FloatFormatter - } - ht.field_sdtypes = { - 'f': 'categorical', - ('g', 'h'): 'datetime' + 'e': FloatFormatter, } + ht.field_sdtypes = {'f': 'categorical', ('g', 'h'): 'datetime'} # Run multi_column_fields = ht._create_multi_column_fields() @@ -189,7 +198,7 @@ def test__create_multi_column_fields(self): 'c': ('c', 'd'), 'd': ('c', 'd'), 'g': ('g', 'h'), - 'h': ('g', 'h') + 'h': ('g', 'h'), } assert multi_column_fields == expected @@ -233,7 +242,7 @@ def test__learn_config(self, get_default_transformer_mock): ht.field_sdtypes = { 'datetime': 'datetime', 'pii': 'pii', - 'text': 'text' + 'text': 'text', } ht._unfit = Mock() @@ -277,7 +286,7 @@ def test_detect_initial_config(self, logger_mock): 'col2': ['a', 'b', 'c'], 'col3': [True, False, True], 'col4': pd.to_datetime(['2010-02-01', '2010-01-01', '2010-02-01']), - 'col5': [1, 2, 3] + 'col5': [1, 2, 3], }) # Run @@ -289,7 +298,7 @@ def test_detect_initial_config(self, logger_mock): 'col2': 'categorical', 'col3': 'boolean', 'col4': 'datetime', - 'col5': 'numerical' + 'col5': 'numerical', } field_transformers = {k: repr(v) for (k, v) in ht.field_transformers.items()} @@ -298,7 +307,7 @@ def test_detect_initial_config(self, logger_mock): 'col2': 'UniformEncoder()', 'col3': 'UniformEncoder()', 'col4': 'UnixTimestampEncoder()', - 'col5': 'FloatFormatter()' + 'col5': 'FloatFormatter()', } expected_config = '\n'.join(( @@ -317,13 +326,13 @@ def test_detect_initial_config(self, logger_mock): ' "col4": UnixTimestampEncoder(),', ' "col5": FloatFormatter()', ' }', - '}' + '}', )) 
logger_mock.info.assert_has_calls([ call('Detecting a new config from the data ... SUCCESS'), call('Setting the new config ... SUCCESS'), call('Config:'), - call(expected_config) + call(expected_config), ]) def test__get_columns_to_sdtypes(self): @@ -377,20 +386,20 @@ def test__fit_field_transformer(self): data = pd.DataFrame({'a': [1, 2, 3]}) transformed_data1 = pd.DataFrame({ 'a.out1': ['2', '4', '6'], - 'a.out2': [1, 2, 3] + 'a.out2': [1, 2, 3], }) transformer1 = Mock() transformer2 = Mock() transformer1.get_output_columns.return_value = ['a.out1', 'a.out2'] transformer1.get_next_transformers.return_value = { 'a.out1': transformer2, - 'a.out2': None + 'a.out2': None, } transformer1.transform.return_value = transformed_data1 transformer2.get_output_columns.return_value = ['a.out1'] transformer2.get_next_transformers.return_value = { 'a.out1': None, - 'a.out1.is_null': None + 'a.out1.is_null': None, } transformer2.transform.return_value = transformed_data1 ht = HyperTransformer() @@ -401,7 +410,7 @@ def test__fit_field_transformer(self): # Assert expected = pd.DataFrame({ 'a.out1': ['2', '4', '6'], - 'a.out2': [1, 2, 3] + 'a.out2': [1, 2, 3], }) pd.testing.assert_frame_equal(out, expected) transformer1.fit.assert_called_once() @@ -486,16 +495,10 @@ def test__validate_config(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} + config = {'sdtypes': sdtypes, 'transformers': transformers} # Run error_msg = re.escape( @@ -511,22 +514,18 @@ def test_validate_config_not_unique_field(self): transformers = { 'column1': FloatFormatter(), 'column2': FrequencyEncoder(), - ('column2', 'column3'): None + ('column2', 'column3'): None, } sdtypes = { 'column1': 'numerical', 'column2': 'numerical', - 
'column3': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column3': 'numerical', } + config = {'sdtypes': sdtypes, 'transformers': transformers} # Run error_msg = re.escape( - 'Error: Invalid config. Please provide unique keys for the sdtypes ' - 'and transformers.' + 'Error: Invalid config. Please provide unique keys for the sdtypes ' 'and transformers.' ) with pytest.raises(InvalidConfigError, match=error_msg): HyperTransformer._validate_config(config) @@ -551,17 +550,14 @@ def test__validate_config_no_warning(self, warnings_mock): transformers = { 'column1': FloatFormatter(), 'column2': FrequencyEncoder(), - 'column3': None + 'column3': None, } sdtypes = { 'column1': 'numerical', 'column2': 'categorical', - 'column3': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column3': 'numerical', } + config = {'sdtypes': sdtypes, 'transformers': transformers} # Run HyperTransformer._validate_config(config) @@ -583,16 +579,13 @@ def test__validate_config_invalid_key(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} config = { 'sdtypes': sdtypes, 'transformers': transformers, - 'unexpected': 10 + 'unexpected': 10, } # Run / Assert @@ -617,7 +610,7 @@ def test__validate_config_missing_sdtypes(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() + 'column2': FrequencyEncoder(), } config = { 'transformers': transformers, @@ -643,13 +636,10 @@ def test__validate_config_mismatched_columns(self): - It should raise an error. 
""" # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} transformers = { 'column1': FloatFormatter(), - 'column3': FrequencyEncoder() + 'column3': FrequencyEncoder(), } config = { 'sdtypes': sdtypes, @@ -676,13 +666,10 @@ def test__validate_config_invalid_sdtype(self): - It should raise an error. """ # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'unexpected' - } + sdtypes = {'column1': 'numerical', 'column2': 'unexpected'} transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() + 'column2': FrequencyEncoder(), } config = { 'sdtypes': sdtypes, @@ -709,14 +696,8 @@ def test__validate_config_invalid_transformer(self): - It should raise an error. """ # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } - transformers = { - 'column1': FloatFormatter(), - 'column2': 'unexpected' - } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} + transformers = {'column1': FloatFormatter(), 'column2': 'unexpected'} config = { 'sdtypes': sdtypes, 'transformers': transformers, @@ -749,12 +730,9 @@ def test_get_config(self): ht = HyperTransformer() ht.field_transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - ht.field_sdtypes = { - 'column1': 'numerical', - 'column2': 'categorical' + 'column2': FrequencyEncoder(), } + ht.field_sdtypes = {'column1': 'numerical', 'column2': 'categorical'} # Run config = ht.get_config() @@ -762,7 +740,7 @@ def test_get_config(self): # Assert expected_config = { 'sdtypes': ht.field_sdtypes, - 'transformers': ht.field_transformers + 'transformers': ht.field_transformers, } assert config == expected_config @@ -784,10 +762,7 @@ def test_get_config_empty(self): config = ht.get_config() # Assert - expected_config = { - 'sdtypes': {}, - 'transformers': {} - } + expected_config = {'sdtypes': {}, 'transformers': {}} assert config == expected_config def test_set_config(self): @@ 
-811,16 +786,10 @@ def test_set_config(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'categorical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'categorical'} + config = {'sdtypes': sdtypes, 'transformers': transformers} ht = HyperTransformer() ht._validate_config = Mock() @@ -851,10 +820,7 @@ def test_set_config_already_fitted(self, mock_warnings): """ # Setup - config = { - 'sdtypes': {}, - 'transformers': {} - } + config = {'sdtypes': {}, 'transformers': {}} ht = HyperTransformer() ht._fitted = True ht._validate_config = Mock() @@ -875,7 +841,12 @@ def get_data(self): 'float': [0.1, 0.2, 0.1, 0.1], 'categorical': ['a', 'a', 'b', 'a'], 'bool': [False, False, True, False], - 'datetime': pd.to_datetime(['2010-02-01', '2010-01-01', '2010-02-01', '2010-01-01']) + 'datetime': pd.to_datetime([ + '2010-02-01', + '2010-01-01', + '2010-02-01', + '2010-01-01', + ]), }) def get_transformed_data(self): @@ -884,12 +855,7 @@ def get_transformed_data(self): 'float': [0.1, 0.2, 0.1, 0.1], 'categorical': [0.375, 0.375, 0.875, 0.375], 'bool': [0.0, 0.0, 1.0, 0.0], - 'datetime': [ - 1.2649824e+18, - 1.262304e+18, - 1.2649824e+18, - 1.262304e+18 - ] + 'datetime': [1.2649824e18, 1.262304e18, 1.2649824e18, 1.262304e18], }) def test__validate_detect_config_called(self): @@ -996,7 +962,7 @@ def test_fit(self): 'integer.out': int_out_transformer, 'bool': bool_transformer, 'categorical': categorical_transformer, - 'datetime': datetime_transformer + 'datetime': datetime_transformer, } ht = HyperTransformer() @@ -1018,7 +984,7 @@ def test_fit(self): call(data, 'float', float_transformer), call(data, 'categorical', categorical_transformer), call(data, 'bool', bool_transformer), - call(data, 'datetime', datetime_transformer) + call(data, 'datetime', datetime_transformer), ] 
ht._validate_all_fields_fitted.assert_called_once() ht._validate_detect_config_called.assert_called_once() @@ -1026,6 +992,7 @@ def test_fit(self): def test_fit_with_multi_column_transformer(self): """Test the ``fit`` method with a multi-column transformer.""" + # Setup class MultiColumnTransformer(BaseMultiColumnTransformer): def _fit(self, data): @@ -1045,12 +1012,12 @@ def _reverse_transform(self, data): field_transformers = { ('col1', 'col2'): MultiColumnTransformer(), - 'col3': FloatFormatter() + 'col3': FloatFormatter(), } field_sdtypes = { 'col1': 'numerical', 'col2': 'categorical', - 'col3': 'numerical' + 'col3': 'numerical', } columns_to_sdtype = { @@ -1069,7 +1036,7 @@ def _reverse_transform(self, data): data = pd.DataFrame({ 'col1': [1, 2, 3], 'col2': ['a', 'b', 'c'], - 'col3': [1, 2, 3] + 'col3': [1, 2, 3], }) # Run @@ -1085,6 +1052,7 @@ def test_fit_warns(self): Two chained transformers, where the first generates the columns 'col' and 'col.is_null' and the second takes 'col' and generates 'col.is_null'. """ + # Setup class DummyTransformer2(BaseTransformer): INPUT_SDTYPE = 'numerical' @@ -1093,8 +1061,7 @@ def __init__(self): super().__init__() self.output_properties = {'is_null': {'sdtype': 'float', 'next_transformer': None}} - def _fit(self, _): - ... + def _fit(self, _): ... def _transform(self, data): return data.to_numpy() @@ -1106,19 +1073,23 @@ def __init__(self): super().__init__() self.output_properties = { 'is_null': {'sdtype': 'float', 'next_transformer': None}, - None: {'sdtype': 'float', 'next_transformer': DummyTransformer2()} + None: { + 'sdtype': 'float', + 'next_transformer': DummyTransformer2(), + }, } - def _fit(self, _): - ... + def _fit(self, _): ... 
def _transform(self, data): return np.array([[4, 1], [5, 2], [6, 3]]) ht = HyperTransformer() data = pd.DataFrame({'col': [1, 2, 3]}) - ht.set_config( - {'sdtypes': {'col': 'numerical'}, 'transformers': {'col': DummyTransformer1()}}) + ht.set_config({ + 'sdtypes': {'col': 'numerical'}, + 'transformers': {'col': DummyTransformer1()}, + }) # Run and Assert warn_msg = re.escape( @@ -1188,7 +1159,7 @@ def test_transform(self): float_transformer, categorical_transformer, bool_transformer, - datetime_transformer + datetime_transformer, ] ht.field_sdtypes = {'col1': 'categorical'} ht._input_columns = list(data.columns) @@ -1215,7 +1186,10 @@ def test_fit_updates_field_transformers(self): ff = FloatFormatter() # Run - ht.set_config({'sdtypes': {'col': 'numerical'}, 'transformers': {'col': ff}}) + ht.set_config({ + 'sdtypes': {'col': 'numerical'}, + 'transformers': {'col': ff}, + }) ht.fit(data) # Assert @@ -1237,9 +1211,11 @@ def test_transform_raises_error_no_config(self): ht = HyperTransformer() # Run - expected_msg = ("No config detected. Set the config using 'set_config' or pre-populate " - "it automatically from your data using 'detect_initial_config' prior to " - 'fitting your data.') + expected_msg = ( + "No config detected. Set the config using 'set_config' or pre-populate " + "it automatically from your data using 'detect_initial_config' prior to " + 'fitting your data.' 
+ ) with pytest.raises(ConfigNotSetError, match=expected_msg): ht.transform(data) @@ -1408,15 +1384,11 @@ def test_fit_transform(self): expect_call_args_transform = pd.DataFrame() assert transformer.fit.call_count == expect_call_count_fit - pd.testing.assert_frame_equal( - transformer.fit.call_args[0][0], - expect_call_args_fit - ) + pd.testing.assert_frame_equal(transformer.fit.call_args[0][0], expect_call_args_fit) assert transformer.transform.call_count == expect_call_count_transform pd.testing.assert_frame_equal( - transformer.transform.call_args[0][0], - expect_call_args_transform + transformer.transform.call_args[0][0], expect_call_args_transform ) def test_reset_randomization(self): @@ -1442,7 +1414,7 @@ def test_reset_randomization(self): 'id': transformer_id, 'random_element': transformer_random_element, 'name': transformer_name, - 'label': None + 'label': None, } # Run @@ -1480,8 +1452,7 @@ def test_create_anonymized_columns(self): instance.random_state = {} random_element = AnonymizedFaker( - function_name='random_element', - function_kwargs={'elements': ['a']} + function_name='random_element', function_kwargs={'elements': ['a']} ) random_element.columns = ['random_element'] random_element.output_columns = [] @@ -1494,20 +1465,18 @@ def test_create_anonymized_columns(self): instance.field_transformers = { 'id': regex_id, - 'random_element': random_element + 'random_element': random_element, } # Run output = HyperTransformer.create_anonymized_columns( - instance, - num_rows=5, - column_names=['id', 'random_element'] + instance, num_rows=5, column_names=['id', 'random_element'] ) # Assert expected_output = pd.DataFrame({ 'id': ['id_0', 'id_1', 'id_2', 'id_3', 'id_4'], - 'random_element': ['a', 'a', 'a', 'a', 'a'] + 'random_element': ['a', 'a', 'a', 'a', 'a'], }) pd.testing.assert_frame_equal(output, expected_output) @@ -1628,7 +1597,7 @@ def test_create_anonymized_columns_invalid_transformers(self): instance.field_transformers = { 'datetime': 
FloatFormatter(), - 'random_element': FloatFormatter() + 'random_element': FloatFormatter(), } # Run / Assert @@ -1641,7 +1610,7 @@ def test_create_anonymized_columns_invalid_transformers(self): HyperTransformer.create_anonymized_columns( instance, num_rows=5, - column_names=['datetime', 'random_element'] + column_names=['datetime', 'random_element'], ) def test_reverse_transform(self): @@ -1681,7 +1650,7 @@ def test_reverse_transform(self): float_transformer, categorical_transformer, bool_transformer, - datetime_transformer + datetime_transformer, ] ht._output_columns = list(data.columns) ht._input_columns = list(data.columns) @@ -1740,7 +1709,7 @@ def test_reverse_transform_subset_with_generators(self): ht._transformers_sequence = [ int_transformer, float_transformer, - generator_transformer + generator_transformer, ] ht._output_columns = list(reverse_transformed_data.columns) ht._input_columns = list(reverse_transformed_data.columns) @@ -1769,9 +1738,11 @@ def test_reverse_transform_raises_error_no_config(self): ht = HyperTransformer() # Run - expected_msg = ("No config detected. Set the config using 'set_config' or pre-populate " - "it automatically from your data using 'detect_initial_config' prior to " - 'fitting your data.') + expected_msg = ( + "No config detected. Set the config using 'set_config' or pre-populate " + "it automatically from your data using 'detect_initial_config' prior to " + 'fitting your data.' + ) with pytest.raises(ConfigNotSetError, match=expected_msg): ht.reverse_transform(data) @@ -1971,7 +1942,6 @@ def test_update_transformers_by_sdtype_field_sdtypes_not_fitted(self): ht.field_sdtypes = { 'categorical_column': 'categorical', 'numerical_column': 'numerical', - } transformer = LabelEncoder() @@ -2018,14 +1988,17 @@ def test_update_transformers_by_sdtype_field_sdtypes_fitted(self, mock_warnings) call( "The 'transformer' parameter will no longer be supported in future " "versions of the RDT. 
Please use the 'transformer_name' and " - "'transformer_parameters' parameters instead.", FutureWarning - ) + "'transformer_parameters' parameters instead.", + FutureWarning, + ), ] mock_warnings.warn.assert_has_calls(expected_warnings_msgs) assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) - def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error(self): + def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error( + self, + ): """Passing an incorrect ``sdtype`` should raise an error.""" # Setup ht = HyperTransformer() @@ -2068,7 +2041,9 @@ def test_update_transformers_by_sdtype_bad_transformer_raises_error(self): with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', Mock()) - def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error(self): + def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error( + self, + ): """Test ``update_transformers_by_sdtype`` with a mismatched sdtype and transformer. 
Setup: @@ -2093,7 +2068,9 @@ def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error(self): with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', FloatFormatter()) - def test_update_transformers_by_sdtype_with_transformer_none_transformer_name_none(self): + def test_update_transformers_by_sdtype_with_transformer_none_transformer_name_none( + self, + ): """When ``transformer_name`` and ``transformer`` are both ``None``, it should crash.""" # Setup ht = HyperTransformer() @@ -2115,7 +2092,9 @@ def test_update_transformers_by_sdtype_incorrect_transformer_name(self): with pytest.raises(InvalidConfigError, match=err_msg): ht.update_transformers_by_sdtype('categorical', transformer_name='Transformer') - def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer(self): + def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer( + self, + ): """When ``sdtype`` is not valid for the transformer, it should crash.""" # Setup ht = HyperTransformer() @@ -2137,7 +2116,9 @@ def test_update_transformers_by_sdtype_incorrect_sdtype(self): with pytest.raises(InvalidConfigError, match=err_msg): ht.update_transformers_by_sdtype('bla', transformer_name='LabelEncoder') - def test_update_transformers_by_sdtype_incorrect_transformer_parameters(self): + def test_update_transformers_by_sdtype_incorrect_transformer_parameters( + self, + ): """When ``transformer_parameters`` has invalid values, it should crash.""" # Setup ht = HyperTransformer() @@ -2147,8 +2128,13 @@ def test_update_transformers_by_sdtype_incorrect_transformer_parameters(self): err_msg = re.escape("Invalid parameters ('false', 'order') for the 'LabelEncoder'.") with pytest.raises(TransformerInputError, match=err_msg): ht.update_transformers_by_sdtype( - 'categorical', transformer_name='LabelEncoder', - transformer_parameters={'order_by': [], 'order': [], 'false': []} + 'categorical', + transformer_name='LabelEncoder', + 
transformer_parameters={ + 'order_by': [], + 'order': [], + 'false': [], + }, ) def test_update_transformers_by_sdtype_transformer_name(self): @@ -2199,7 +2185,10 @@ def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mo # Run ht.update_transformers_by_sdtype( - 'categorical', transformer='doesnt matter', transformer_name='LabelEncoder') + 'categorical', + transformer='doesnt matter', + transformer_name='LabelEncoder', + ) # Assert expected_msg = ( @@ -2211,7 +2200,9 @@ def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mo assert ht.field_transformers['numerical_column'] == ff assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) - def test_update_transformers_by_sdtype_with_transformer_name_transformer_parameters(self): + def test_update_transformers_by_sdtype_with_transformer_name_transformer_parameters( + self, + ): """Test setting ``transformer_name`` and ``transformer_parameters`` works. Expect the `field_transformers`` to be updated with an instance of the passed @@ -2233,7 +2224,7 @@ def test_update_transformers_by_sdtype_with_transformer_name_transformer_paramet ht.update_transformers_by_sdtype( 'categorical', transformer_name='LabelEncoder', - transformer_parameters={'order_by': 'alphabetical'} + transformer_parameters={'order_by': 'alphabetical'}, ) # Assert @@ -2310,8 +2301,7 @@ def test_remove_column_in_multi_column_fields_single_column_left(self): assert ht._multi_column_fields == expected_column_in_tuple def test_update_transformers_by_sdtype_with_multi_column_transformer(self): - """Test ``update_transformers_by_sdtype`` with columns use with a multi-column transformer. 
- """ + """Test ``update_transformers_by_sdtype`` with columns use with a multi-column transformer.""" # Setup ht = HyperTransformer() ht.field_transformers = { @@ -2323,13 +2313,10 @@ def test_update_transformers_by_sdtype_with_multi_column_transformer(self): 'A': 'categorical', 'B': 'boolean', 'C': 'categorical', - 'D': 'numerical' + 'D': 'numerical', } - ht._multi_column_fields = { - 'C': ('C', 'D'), - 'D': ('C', 'D') - } + ht._multi_column_fields = {'C': ('C', 'D'), 'D': ('C', 'D')} # Run ht.update_transformers_by_sdtype( @@ -2377,9 +2364,7 @@ def test_update_transformers_fitted(self, mock_warnings): instance.field_transformers = {'my_column': object()} instance._validate_transformers = Mock() transformer = FrequencyEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + column_name_to_transformer = {'my_column': transformer} # Run instance.update_transformers(column_name_to_transformer) @@ -2396,6 +2381,7 @@ def test_update_transformers_fitted(self, mock_warnings): def test__update_transformers_multi_column_valid(self): """Test ``_update_multi_column_transformer`` with a valid multi-column transformer.""" + # Setup class ValidMultiColumnTransformer(BaseMultiColumnTransformer): @classmethod @@ -2438,6 +2424,7 @@ def test__update_transformers_multi_column_invalid(self): The multi column transformer should be removed and its columns assigned to their default transformers. 
""" + # Setup class InvalidMultiColumnTransformer(BaseMultiColumnTransformer): @classmethod @@ -2450,7 +2437,7 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'B': 'boolean', 'C': 'numerical', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', } ht.field_transformers = { 'A': LabelEncoder(), @@ -2544,13 +2531,11 @@ def test_update_transformers_changing_multi_column_transformer(self): } def side_effect(column): - ht._multi_column_fields = { - 'B': ('B',) - } + ht._multi_column_fields = {'B': ('B',)} ht.field_transformers = { 'C': FloatFormatter(), 'B': None, - 'A': UniformEncoder() + 'A': UniformEncoder(), } mock_remove_column_in_multi_column_fields = Mock() @@ -2564,7 +2549,7 @@ def side_effect(column): expected_field_transformers = { 'C': FloatFormatter(), 'B': None, - 'A': UniformEncoder() + 'A': UniformEncoder(), } mock_remove_column_in_multi_column_fields.assert_called_once_with('A') assert str(ht.field_transformers) == str(expected_field_transformers) @@ -2598,9 +2583,7 @@ def test_update_transformers_not_fitted(self, mock_warnings): instance.field_sdtypes = {'my_column': 'boolean'} instance._validate_transformers = Mock() transformer = BinaryEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + column_name_to_transformer = {'my_column': transformer} # Run instance.update_transformers(column_name_to_transformer) @@ -2634,9 +2617,7 @@ def test_update_transformers_no_field_transformers(self): instance = HyperTransformer() instance._fitted = False mock_transformer = Mock() - column_name_to_transformer = { - 'my_column': mock_transformer - } + column_name_to_transformer = {'my_column': mock_transformer} expected_config = instance.get_config() # Run expected_msg = ( @@ -2678,9 +2659,7 @@ def test_update_transformers_mismatch_sdtypes(self): instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() transformer = BinaryEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + 
column_name_to_transformer = {'my_column': transformer} # Run and Assert err_msg = re.escape( @@ -2718,9 +2697,7 @@ def test_update_transformers_transformer_is_none(self): instance.field_transformers = {'my_column': mock_numerical} instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() - column_name_to_transformer = { - 'my_column': None - } + column_name_to_transformer = {'my_column': None} # Run instance.update_transformers(column_name_to_transformer) @@ -2756,9 +2733,7 @@ def test_update_transformers_column_doesnt_exist_in_config(self): instance.field_transformers = {'my_column': mock_numerical} instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() - column_name_to_transformer = { - 'unknown_column': None - } + column_name_to_transformer = {'unknown_column': None} # Run / Assert expected_msg = re.escape( @@ -2793,12 +2768,13 @@ def test_update_sdtypes_fitted(self, mock_warnings, mock_logger): """ # Setup instance = HyperTransformer() - instance.field_transformers = {'a': FrequencyEncoder, 'b': FloatFormatter} + instance.field_transformers = { + 'a': FrequencyEncoder, + 'b': FloatFormatter, + } instance.field_sdtypes = {'my_column': 'categorical'} instance._fitted = True - column_name_to_sdtype = { - 'my_column': 'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -2844,9 +2820,7 @@ def test_update_sdtypes_not_fitted(self, mock_warnings, mock_logger): instance = HyperTransformer() instance._fitted = False instance.field_sdtypes = {'my_column': 'categorical'} - column_name_to_sdtype = { - 'my_column': 'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -2879,9 +2853,7 @@ def test_update_sdtypes_no_field_sdtypes(self): instance = HyperTransformer() instance._fitted = False instance.field_sdtypes = {} - column_name_to_sdtype = { - 'my_column': 
'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} # Run / Assert expected_message = ( @@ -2911,12 +2883,8 @@ def test_update_sdtypes_invalid_sdtype(self): instance._get_supported_sdtypes = Mock() instance._get_supported_sdtypes.return_value = [] instance._fitted = False - instance.field_sdtypes = { - 'my_column': 'categorical' - } - column_name_to_sdtype = { - 'my_column': 'credit_card' - } + instance.field_sdtypes = {'my_column': 'categorical'} + column_name_to_sdtype = {'my_column': 'credit_card'} # Run / Assert expected_message = re.escape( @@ -2943,12 +2911,8 @@ def test_update_sdtypes_invalid_columns(self): """ # Setup instance = HyperTransformer() - instance.field_sdtypes = { - 'my_column': 'categorical' - } - column_name_to_sdtype = { - 'unexpected': 'categorical' - } + instance.field_sdtypes = {'my_column': 'categorical'} + column_name_to_sdtype = {'unexpected': 'categorical'} # Run / Assert expected_message = re.escape( @@ -2990,9 +2954,7 @@ def test_update_sdtypes_different_sdtype(self, mock_warnings, default_mock, mock instance.field_sdtypes = {'a': 'categorical'} transformer_mock = FloatFormatter() default_mock.return_value = transformer_mock - column_name_to_sdtype = { - 'a': 'numerical' - } + column_name_to_sdtype = {'a': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -3039,9 +3001,7 @@ def test_update_sdtypes_different_sdtype_than_transformer(self, mock_warnings, m instance.field_sdtypes = {'a': 'categorical'} transformer = FloatFormatter() instance.field_transformers = {'a': transformer} - column_name_to_sdtype = { - 'a': 'numerical' - } + column_name_to_sdtype = {'a': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -3063,9 +3023,11 @@ def test_update_sdtypes_multi_column_with_supported_sdtypes(self): In this case the multi column transformer supports the new sdtype so the transformer should not be changed. 
""" + # Setup class DummyMultiColumnTransformer(BaseMultiColumnTransformer): """Dummy multi column transformer.""" + SUPPORTED_SDTYPES = ['categorical', 'boolean'] @classmethod @@ -3077,43 +3039,45 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } ht._create_multi_column_fields = Mock( return_value={ 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } ) ht._update_multi_column_transformer = Mock() # Run - ht.update_sdtypes(column_name_to_sdtype={ - 'column2': 'boolean', - 'column1': 'boolean', - 'column4': 'categorical' - }) + ht.update_sdtypes( + column_name_to_sdtype={ + 'column2': 'boolean', + 'column1': 'boolean', + 'column4': 'categorical', + } + ) # Assert expected_field_sdtypes = { 'column1': 'boolean', 'column2': 'boolean', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } expected_field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } assert ht.field_sdtypes == expected_field_sdtypes assert str(ht.field_transformers) == str(expected_field_transformers) @@ -3127,6 +3091,7 @@ def test_update_sdtypes_multi_column_with_unsupported_sdtypes(self): In this case the multi column transformer does not support the new sdtype so the transformer should be changed to the default one. 
""" + # Setup class DummyMultiColumnTransformer(BaseMultiColumnTransformer): """Dummy multi column transformer.""" @@ -3142,16 +3107,16 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3160,17 +3125,19 @@ def _validate_sdtypes(cls, columns_to_sdtypes): ' Assigning a new transformer to it.' ) with pytest.warns(UserWarning, match=expected_warning): - ht.update_sdtypes(column_name_to_sdtype={ - 'column2': 'numerical', - 'column1': 'boolean' - }) + ht.update_sdtypes( + column_name_to_sdtype={ + 'column2': 'numerical', + 'column1': 'boolean', + } + ) # Assert expected_field_sdtypes = { 'column1': 'boolean', 'column2': 'numerical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } expected_field_transformers = { 'column1': UniformEncoder(), @@ -3224,7 +3191,7 @@ def test__validate_transformers(self): column_name_to_transformer = { 'col1': FrequencyEncoder(), 'col2': 'Unexpected', - 'col3': None + 'col3': None, } # Run / Assert @@ -3256,12 +3223,12 @@ def test_remove_transformers(self): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } # Run @@ -3271,7 +3238,7 @@ def test_remove_transformers(self): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': 'transformer' + 'column3': 'transformer', } def test_remove_transformers_unknown_columns(self): @@ -3299,12 +3266,12 
@@ def test_remove_transformers_unknown_columns(self): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } error_msg = re.escape( @@ -3320,7 +3287,7 @@ def test_remove_transformers_unknown_columns(self): assert ht.field_transformers == { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } @patch('rdt.hyper_transformer.warnings') @@ -3350,12 +3317,12 @@ def test_remove_transformers_fitted(self, mock_warnings): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } # Run @@ -3370,7 +3337,7 @@ def test_remove_transformers_fitted(self, mock_warnings): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': None + 'column3': None, } def test_remove_transformers_multi_column(self): @@ -3385,16 +3352,16 @@ def test_remove_transformers_multi_column(self): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': 'transformer', ('column2', 'column3'): 'multi_column_transformer', - 'column4': 'transformer' + 'column4': 'transformer', } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3405,7 +3372,7 @@ def test_remove_transformers_multi_column(self): 'column1': 'transformer', 'column2': 'multi_column_transformer', 'column3': None, - 'column4': None + 'column4': None, } @patch('rdt.hyper_transformer.warnings') @@ -3433,12 +3400,12 @@ def 
test_remove_transformers_by_sdtype(self, mock_warnings): ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } ht.field_sdtypes = { 'column1': 'numerical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } # Run @@ -3448,7 +3415,7 @@ def test_remove_transformers_by_sdtype(self, mock_warnings): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': None + 'column3': None, } expected_warnings_msg = ( 'For this change to take effect, please refit your data using ' @@ -3494,16 +3461,16 @@ def test_remove_transformers_by_sdtype_multi_column(self): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'boolean', - 'column4': 'boolean' + 'column4': 'boolean', } ht.field_transformers = { 'column1': 'transformer', ('column2', 'column3'): 'multi_column_transformer', - 'column4': 'transformer' + 'column4': 'transformer', } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3514,10 +3481,12 @@ def test_remove_transformers_by_sdtype_multi_column(self): 'column1': 'transformer', 'column2': 'multi_column_transformer', 'column3': None, - 'column4': None + 'column4': None, } - def test__fit_field_transformer_multi_column_field_not_ready(self,): + def test__fit_field_transformer_multi_column_field_not_ready( + self, + ): """Test the ``_fit_field_transformer`` method. This tests that the ``_fit_field_transformer`` behaves as expected. @@ -3538,13 +3507,10 @@ def test__fit_field_transformer_multi_column_field_not_ready(self,): outputs of the original transformer. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) transformed_data1 = pd.DataFrame({ 'a.out1': ['1', '2', '3'], - 'b': [4, 5, 6] + 'b': [4, 5, 6], }) transformer1 = Mock() transformer2 = Mock() @@ -3559,10 +3525,7 @@ def test__fit_field_transformer_multi_column_field_not_ready(self,): out = ht._fit_field_transformer(data, 'a', transformer1) # Assert - expected = pd.DataFrame({ - 'a.out1': ['1', '2', '3'], - 'b': [4, 5, 6] - }) + expected = pd.DataFrame({'a.out1': ['1', '2', '3'], 'b': [4, 5, 6]}) pd.testing.assert_frame_equal(out, expected) transformer1.fit.assert_called_once() transformer1.transform.assert_called_once_with(data) diff --git a/tests/unit/transformers/pii/test_anonymization.py b/tests/unit/transformers/pii/test_anonymization.py index 1cfa79ee6..143c018db 100644 --- a/tests/unit/transformers/pii/test_anonymization.py +++ b/tests/unit/transformers/pii/test_anonymization.py @@ -1,11 +1,14 @@ from unittest.mock import Mock, patch from rdt.transformers.pii.anonymization import ( - _detect_provider_name, get_anonymized_transformer, get_faker_instance, is_faker_function) + _detect_provider_name, + get_anonymized_transformer, + get_faker_instance, + is_faker_function, +) class TestAnonimization: - def test__detect_provider_name(self): """Test the ``_detect_provider_name`` method. @@ -48,9 +51,13 @@ def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_f - The return value must be the instance of ``AnonymizedFaker``. 
""" # Setup - output = get_anonymized_transformer('email', transformer_kwargs={ - 'function_kwargs': {'domain': '@gmail.com'}, 'locales': ['en_CA', 'fr_CA'] - }) + output = get_anonymized_transformer( + 'email', + transformer_kwargs={ + 'function_kwargs': {'domain': '@gmail.com'}, + 'locales': ['en_CA', 'fr_CA'], + }, + ) # Assert assert output == mock_anonymized_faker.return_value @@ -58,7 +65,7 @@ def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_f provider_name='internet', function_name='email', function_kwargs={'domain': '@gmail.com'}, - locales=['en_CA', 'fr_CA'] + locales=['en_CA', 'fr_CA'], ) @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') @@ -82,9 +89,13 @@ def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_fak - The return value must be the instance of ``AnonymizedFaker``. """ # Setup - output = get_anonymized_transformer('color', transformer_kwargs={ - 'function_kwargs': {'hue': 'red'}, 'locales': ['en_CA', 'fr_CA'] - }) + output = get_anonymized_transformer( + 'color', + transformer_kwargs={ + 'function_kwargs': {'hue': 'red'}, + 'locales': ['en_CA', 'fr_CA'], + }, + ) # Assert assert output == mock_anonymized_faker.return_value @@ -92,7 +103,7 @@ def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_fak provider_name='color', function_name='color', function_kwargs={'hue': 'red'}, - locales=['en_CA', 'fr_CA'] + locales=['en_CA', 'fr_CA'], ) @patch('rdt.transformers.pii.anonymization.Faker') diff --git a/tests/unit/transformers/pii/test_anonymizer.py b/tests/unit/transformers/pii/test_anonymizer.py index 9e11beb64..7710d3349 100644 --- a/tests/unit/transformers/pii/test_anonymizer.py +++ b/tests/unit/transformers/pii/test_anonymizer.py @@ -11,7 +11,10 @@ from rdt.errors import TransformerInputError, TransformerProcessingError from rdt.transformers.categorical import LabelEncoder -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker 
+from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) class TestAnonymizedFaker: @@ -41,8 +44,9 @@ def test_check_provider_function_baseprovider(self, mock_attrgetter, mock_getatt @patch('rdt.transformers.pii.anonymizer.faker') @patch('rdt.transformers.pii.anonymizer.getattr') @patch('rdt.transformers.pii.anonymizer.attrgetter') - def test_check_provider_function_other_providers(self, mock_attrgetter, mock_getattr, - mock_faker): + def test_check_provider_function_other_providers( + self, mock_attrgetter, mock_getattr, mock_faker + ): """Test that ``getattr`` is being called with ``provider_name`` and ``function_name``. Mock: @@ -390,11 +394,9 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_ instance = AnonymizedFaker( provider_name='credit_card', function_name='credit_card_full', - function_kwargs={ - 'type': 'visa' - }, + function_kwargs={'type': 'visa'}, locales=['en_US', 'fr_FR'], - enforce_uniqueness=True + enforce_uniqueness=True, ) # Assert @@ -409,7 +411,7 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_ call( "The 'enforce_uniqueness' parameter is no longer supported. " "Please use the 'cardinality_rule' parameter instead.", - FutureWarning + FutureWarning, ) ]) @@ -423,8 +425,7 @@ def test___init__no_function_name(self): """ # Run / Assert expected_message = ( - 'Please specify the function name to use from the ' - "'credit_card' provider." + 'Please specify the function name to use from the ' "'credit_card' provider." 
) with pytest.raises(TransformerInputError, match=expected_message): AnonymizedFaker(provider_name='credit_card', locales=['en_US', 'fr_FR']) @@ -442,7 +443,10 @@ def test_get_supported_sdtypes(self, base_mock, issubclass_mock): datetime_mock = Mock() datetime_mock.get_supported_sdtypes.return_value = ['datetime'] boolean_mock = Mock() - boolean_mock.get_supported_sdtypes.return_value = ['boolean', 'categorical'] + boolean_mock.get_supported_sdtypes.return_value = [ + 'boolean', + 'categorical', + ] text_mock = Mock() text_mock.get_supported_sdtypes.return_value = ['text'] phone_mock = Mock() @@ -456,14 +460,18 @@ def test_get_supported_sdtypes(self, base_mock, issubclass_mock): boolean_mock, text_mock, phone_mock, - pii_mock + pii_mock, ] # Run supported_sdtypes = AnonymizedFaker.get_supported_sdtypes() # Assert - assert sorted(supported_sdtypes) == sorted(['phone_number', 'pii', 'text']) + assert sorted(supported_sdtypes) == sorted([ + 'phone_number', + 'pii', + 'text', + ]) @patch('rdt.transformers.pii.anonymizer.BaseTransformer.reset_randomization') @patch('rdt.transformers.pii.anonymizer.faker') @@ -835,10 +843,8 @@ def test___init__custom(self, mock_check_provider_function, mock_faker): instance = PseudoAnonymizedFaker( provider_name='credit_card', function_name='credit_card_full', - function_kwargs={ - 'type': 'visa' - }, - locales=['en_US', 'fr_FR'] + function_kwargs={'type': 'visa'}, + locales=['en_US', 'fr_FR'], ) # Assert @@ -910,7 +916,10 @@ def test__fit(self): assert instance._mapping_dict == {'a': 1, 'b': 2, 'c': 3} assert instance._reverse_mapping_dict == {1: 'a', 2: 'b', 3: 'c'} assert list(instance.output_properties) == [None] - assert list(instance.output_properties[None]) == ['sdtype', 'next_transformer'] + assert list(instance.output_properties[None]) == [ + 'sdtype', + 'next_transformer', + ] assert instance.output_properties[None]['sdtype'] == 'categorical' transformer = instance.output_properties[None]['next_transformer'] diff --git 
a/tests/unit/transformers/test___init__.py b/tests/unit/transformers/test___init__.py index 30c415ed5..e6542704a 100644 --- a/tests/unit/transformers/test___init__.py +++ b/tests/unit/transformers/test___init__.py @@ -1,8 +1,16 @@ import pytest from rdt.transformers import ( - AnonymizedFaker, BinaryEncoder, FloatFormatter, RegexGenerator, UniformEncoder, - UnixTimestampEncoder, get_default_transformers, get_transformer_class, get_transformer_name) + AnonymizedFaker, + BinaryEncoder, + FloatFormatter, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, + get_default_transformers, + get_transformer_class, + get_transformer_name, +) def test_get_transformer_name(): diff --git a/tests/unit/transformers/test__validators.py b/tests/unit/transformers/test__validators.py index dc1c23478..0b1f8750b 100644 --- a/tests/unit/transformers/test__validators.py +++ b/tests/unit/transformers/test__validators.py @@ -4,12 +4,18 @@ import pytest from rdt.errors import TransformerInputError -from rdt.transformers._validators import AddressValidator, BaseValidator, GPSValidator +from rdt.transformers._validators import ( + AddressValidator, + BaseValidator, + GPSValidator, +) class TestBaseValidator: - - @patch('rdt.transformers._validators.BaseValidator.SUPPORTED_SDTYPES', ['numerical']) + @patch( + 'rdt.transformers._validators.BaseValidator.SUPPORTED_SDTYPES', + ['numerical'], + ) @patch('rdt.transformers._validators.BaseValidator.VALIDATION_TYPE', 'Base') def test_validate_supported_sdtypes(self): """Test ``_validate_supported_sdtypes`` method.""" @@ -87,7 +93,7 @@ def test__validate_number_columns(self): 'col_5': 'street_address', 'col_6': 'secondary_address', 'col_7': 'country_code', - 'col_8': 'administrative_unit' + 'col_8': 'administrative_unit', } # Run and Assert @@ -111,7 +117,7 @@ def test__validate_uniqueness_sdtype(self): 'col_1': 'country_code', 'col_2': 'country_code', 'col_3': 'city', - 'col_4': 'city' + 'col_4': 'city', } # Run and Assert @@ -158,7 +164,7 @@ 
def test__validate_administrative_unit(self): } columns_to_sdtypes_invalid = { 'col_1': 'administrative_unit', - 'col_2': 'state' + 'col_2': 'state', } # Run and Assert @@ -190,9 +196,7 @@ def test__validate_sdtypes(self): AddressValidator._validate_number_columns.assert_called_once_with(columns_to_sdtypes) AddressValidator._validate_uniqueness_sdtype.assert_called_once_with(columns_to_sdtypes) AddressValidator._validate_supported_sdtypes.assert_called_once_with(columns_to_sdtypes) - AddressValidator._validate_administrative_unit.assert_called_once_with( - columns_to_sdtypes - ) + AddressValidator._validate_administrative_unit.assert_called_once_with(columns_to_sdtypes) def test__validate_imports_without_address_module(self): """Test ``validate_imports`` when address module doesn't exist.""" diff --git a/tests/unit/transformers/test_base.py b/tests/unit/transformers/test_base.py index 2acf054ea..18aa8ac5e 100644 --- a/tests/unit/transformers/test_base.py +++ b/tests/unit/transformers/test_base.py @@ -7,7 +7,11 @@ import pytest from rdt.errors import TransformerInputError -from rdt.transformers import BaseMultiColumnTransformer, BaseTransformer, NullTransformer +from rdt.transformers import ( + BaseMultiColumnTransformer, + BaseTransformer, + NullTransformer, +) from rdt.transformers.base import random_state, set_random_states @@ -40,7 +44,7 @@ def test_set_random_states(mock_numpy): mock_numpy.random.get_state.assert_called() mock_numpy.random.set_state.assert_has_calls([ call(initial_state_value), - call(first_state) + call(first_state), ]) my_function.assert_called_once_with(mock_numpy.random.RandomState.return_value, 'fit') mock_numpy.random.RandomState.return_value.set_state.assert_called_with(second_state) @@ -91,7 +95,6 @@ def test_random_state_random_states_is_none(mock_set_random_states): class TestBaseTransformer: - def test_set_random_state(self): """Test that the method updates the random state for the correct method.""" # Setup @@ -141,6 +144,7 @@ def 
test_get_subclasses(self): Output: - a list of classes including the ``Child`` class, but NOT including the ``Parent``. """ + # Setup class Parent(BaseTransformer, abc.ABC): pass @@ -166,8 +170,7 @@ def test_get_input_sdtype_raises_warning(self, mock_get_supported_sdtypes): # Run expected_message = ( - '`get_input_sdtype` is deprecated. Please use ' - '`get_supported_sdtypes` instead.' + '`get_input_sdtype` is deprecated. Please use ' '`get_supported_sdtypes` instead.' ) with pytest.warns(FutureWarning, match=expected_message): input_sdtype = BaseTransformer.get_input_sdtype() @@ -189,6 +192,7 @@ def test_get_supported_sdtypes_supported_sdtypes(self): Output: - the list stored in the ``SUPPORTED_SDTYPES`` attribute. """ + # Setup class Dummy(BaseTransformer): SUPPORTED_SDTYPES = ['categorical', 'boolean'] @@ -212,6 +216,7 @@ def test_get_supported_sdtypes_no_supported_sdtypes_provided(self): Output: - A list with the ``INPUT_SDTYPE`` value inside. """ + # Setup class Dummy(BaseTransformer): INPUT_SDTYPE = 'categorical' @@ -230,7 +235,7 @@ def test__get_output_to_property(self): transformer.output_properties = { 'col': {'sdtype': 'float', 'next_transformer': None}, 'ignore': {'next_transformer': None}, - None: {'sdtype': 'categorical', 'next_transformer': None} + None: {'sdtype': 'categorical', 'next_transformer': None}, } # Run @@ -285,10 +290,13 @@ def test_model_missing_values(self, mock_warnings): # Assert assert result is True - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter instead." - ), FutureWarning) + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter instead." 
+ ), + FutureWarning, + ) @patch('rdt.transformers.base.warnings') def test__set_model_missing_values_true(self, mock_warnings): @@ -299,10 +307,13 @@ def test__set_model_missing_values_true(self, mock_warnings): BaseTransformer._set_model_missing_values(instance, True) # Assert - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.'), FutureWarning + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter to select your " + 'strategy.' + ), + FutureWarning, ) instance._set_missing_value_generation.assert_called_once_with('from_column') @@ -315,10 +326,13 @@ def test__set_model_missing_values_false(self, mock_warnings): BaseTransformer._set_model_missing_values(instance, False) # Assert - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.'), FutureWarning + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter to select your " + 'strategy.' + ), + FutureWarning, ) instance._set_missing_value_generation.assert_called_once_with('random') @@ -347,6 +361,7 @@ def test___repr___with_parameters(self): - The class has two parameters in its ``__init__`` method with default values. - The class instance only sets one of them. """ + # Setup class Dummy(BaseTransformer): def __init__(self, param1=None, param2=None, param3=None): @@ -373,6 +388,7 @@ def test__str__(self): - The class has two parameters in its ``__init__`` method with default values. 
- The class instance only sets one of them. """ + # Setup class Dummy(BaseTransformer): def __init__(self, param1=None, param2=None, param3=None): @@ -390,6 +406,7 @@ def __init__(self, param1=None, param2=None, param3=None): def test_get_output_sdtypes(self): """Test the column_prefix gets added to all columns in output_properties.""" + # Setup class Dummy(BaseTransformer): column_prefix = 'column_name' @@ -436,6 +453,7 @@ def test_get_input_columns(self): Output: - List matching the list created in the setup. """ + # Setup class Dummy(BaseTransformer): columns = ['col1', 'col2', 'col3'] @@ -461,6 +479,7 @@ def test_get_output_columns(self): Output: - A list of each output name with the prefix prepended. """ + # Setup class Dummy(BaseTransformer): column_prefix = 'column_name' @@ -468,7 +487,7 @@ class Dummy(BaseTransformer): def __init__(self): self.output_properties = { 'out1': {'sdtype': 'numerical'}, - 'out2': {'sdtype': 'float'} + 'out2': {'sdtype': 'float'}, } dummy_transformer = Dummy() @@ -492,6 +511,7 @@ def test_is_generator(self): Output: - the boolean value stored in ``IS_GENERATOR``. """ + # Setup class Dummy(BaseTransformer): IS_GENERATOR = True @@ -518,11 +538,7 @@ def test__store_columns_list(self): - the ``self.columns`` attribute should be set to the list of the passed columns. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['a', 'b'] base_transformer = BaseTransformer() @@ -546,11 +562,7 @@ def test__store_columns_tuple(self): - the ``self.columns`` attribute should be set to a list of the passed columns. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ('a', 'b') base_transformer = BaseTransformer() @@ -581,7 +593,7 @@ def test__store_columns_tuple_in_the_data(self): data = pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6], - ('a', 'b'): [7, 8, 9] + ('a', 'b'): [7, 8, 9], }) columns = ('a', 'b') base_transformer = BaseTransformer() @@ -596,22 +608,18 @@ def test__store_columns_tuple_in_the_data(self): def test__store_columns_string(self): """Test the ``_store_columns`` method when passed a string. - When the columns are passed as a string, it should be treated as the only column - name passed and stored in the ``columns`` attribute as a one element list. + When the columns are passed as a string, it should be treated as the only column + name passed and stored in the ``columns`` attribute as a one element list. - Input: - - a data frame. - - a string with the name of one of the columns of the dataframe. + Input: + - a data frame. + - a string with the name of one of the columns of the dataframe. - Side effects: - - the ``self.columns`` attribute should be set to a list containing the passed string. + Side effects: + - the ``self.columns`` attribute should be set to a list containing the passed string. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = 'a' base_transformer = BaseTransformer() @@ -659,11 +667,7 @@ def test__get_columns_data_multiple_columns(self): - the passed dataframe, but containing only the passed columns. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['a', 'b'] # Run @@ -690,11 +694,7 @@ def test__get_columns_data_single_column(self): - a pandas series, corresponding to the passed column from the dataframe. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['b'] # Run @@ -720,10 +720,7 @@ def test__add_columns_to_data_series(self): as they were in columns_data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[2, 0, 1]) columns = ['c'] columns_data = pd.Series([7, 8, 9], name='c') @@ -731,11 +728,7 @@ def test__add_columns_to_data_series(self): result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_dataframe(self): @@ -754,26 +747,29 @@ def test__add_columns_to_data_dataframe(self): as they were in columns_data. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + data = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) columns = ['c', 'd'] - columns_data = pd.DataFrame({ - 'c': [7, 8, 9], - 'd': [10, 11, 12] - }) + columns_data = pd.DataFrame({'c': [7, 8, 9], 'd': [10, 11, 12]}) # Run result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9], - 'd': [10, 11, 12] - }, index=[2, 0, 1]) + expected = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': [7, 8, 9], + 'd': [10, 11, 12], + }, + index=[2, 0, 1], + ) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_1d_array(self): @@ -792,10 +788,13 @@ def test__add_columns_to_data_1d_array(self): as they were in columns_data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + data = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) columns = ['c'] columns_data = np.array([7, 8, 9], dtype=np.int64) @@ -803,11 +802,7 @@ def test__add_columns_to_data_1d_array(self): result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_2d_array(self): @@ -826,25 +821,15 @@ def test__add_columns_to_data_2d_array(self): as they were in columns_data. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3]}, index=[2, 0, 1]) columns = ['b', 'c'] - columns_data = np.array([ - [7, 1], - [8, 5], - [9, 9] - ], dtype=np.int64) + columns_data = np.array([[7, 1], [8, 5], [9, 9]], dtype=np.int64) # Run result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [7, 8, 9], - 'c': [1, 5, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [7, 8, 9], 'c': [1, 5, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_none(self): @@ -860,10 +845,7 @@ def test__add_columns_to_data_none(self): - Data should not be changed. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[2, 0, 1]) columns = [] columns_data = None @@ -871,10 +853,13 @@ def test__add_columns_to_data_none(self): result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + expected = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) pd.testing.assert_frame_equal(result, expected) def test__build_output_columns(self): @@ -897,11 +882,7 @@ def test__build_output_columns(self): from the ``get_output_sdtypes`` method. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b'] @@ -909,7 +890,7 @@ class Dummy(BaseTransformer): def __init__(self): self.output_properties = { None: {'sdtype': 'numerical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } dummy_transformer = Dummy() @@ -950,15 +931,15 @@ def test__build_output_columns_generated_already_exist(self): 'b': [7, 8, 9], 'a#b#.is_null': [0, 0, 0], 'a#b#.is_null#': [0, 0, 0], - }) class Dummy(BaseTransformer): def __init__(self): self.output_properties = { None: {'sdtype': 'numerical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } + columns = ['a', 'b'] # Run @@ -973,11 +954,7 @@ def test__fit_raises_error(self): """Test ``_fit`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1008,11 +985,7 @@ def test_fit(self): column names to accepted output sdtypes. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) column = ['a'] class Dummy(BaseTransformer): @@ -1020,7 +993,7 @@ def __init__(self): super().__init__() self.output_properties = { None: {'sdtype': 'categorical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } def _fit(self, data): @@ -1042,11 +1015,7 @@ def test__transform_raises_error(self): """Test ``_transform`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1070,11 +1039,7 @@ def test_transform_incorrect_columns(self): - the original data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b', 'd'] @@ -1112,11 +1077,7 @@ def test_transform_drop_true(self): and should store it in ``self._passed_data``. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b'] @@ -1166,11 +1127,7 @@ def test_fit_transform(self): """ # Setup self = Mock(spec_set=BaseTransformer) - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) column = 'a' # Run @@ -1185,11 +1142,7 @@ def test__reverse_transform_raises_error(self): """Test ``_reverse_transform`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1213,11 +1166,7 @@ def test_reverse_transform_incorrect_columns(self): - the original data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): output_columns = ['a', 'b', 'd'] @@ -1243,7 +1192,7 @@ def test_reverse_transform(self): data = pd.DataFrame({ 'a': [1, 2, 3], 'b.is_null': [4, 5, 6], - 'c': [7, 8, 9] + 'c': [7, 8, 9], }) class Dummy(BaseTransformer): @@ -1275,7 +1224,6 @@ def _reverse_transform(self, data): class TestBaseMultiColumnTransformer: - def test___init__(self): """Test the ``__init__`` method.""" # Setup @@ -1416,9 +1364,7 @@ def test__validate_columns_to_sdtypes(self): 'b': 'categorical', 'd': 'boolean', } - expected_error_msg = re.escape( - 'Columns (d) are not present in the data.' 
- ) + expected_error_msg = re.escape('Columns (d) are not present in the data.') with pytest.raises(ValueError, match=expected_error_msg): transformer._validate_columns_to_sdtypes(data, wrong_columns_to_sdtypes) @@ -1467,9 +1413,7 @@ def test_fit(self): # Assert transformer._validate_columns_to_sdtypes.assert_called_once_with(data, columns_to_sdtypes) - transformer._store_columns.assert_called_once_with( - ['a', 'b'], data - ) + transformer._store_columns.assert_called_once_with(['a', 'b'], data) transformer._set_seed.assert_called_once_with(data) transformer._get_columns_data.assert_called_once_with(data, ['a', 'b']) transformer._fit.assert_called_once_with(data_transformer) diff --git a/tests/unit/transformers/test_boolean.py b/tests/unit/transformers/test_boolean.py index c186b2d4c..2554508ca 100644 --- a/tests/unit/transformers/test_boolean.py +++ b/tests/unit/transformers/test_boolean.py @@ -8,7 +8,6 @@ class TestBinaryEncoder(TestCase): - def test___init__(self): """Test default instance""" # Run @@ -84,13 +83,13 @@ def test__transform_series(self): # Asserts expect_call_count = 1 - expect_call_args = pd.Series([0., 1., None, 1., 0.], dtype=float) + expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg pd.testing.assert_series_equal( transformer.null_transformer.transform.call_args[0][0], - expect_call_args + expect_call_args, ) def test__transform_array(self): @@ -104,13 +103,13 @@ def test__transform_array(self): # Asserts expect_call_count = 1 - expect_call_args = pd.Series([0., 1., None, 1., 0.], dtype=float) + expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg pd.testing.assert_series_equal( 
transformer.null_transformer.transform.call_args[0][0], - expect_call_args + expect_call_args, ) def test__reverse_transform_missing_value_replacement_not_ignore(self): @@ -142,7 +141,7 @@ def test__reverse_transform_missing_value_replacement_not_ignore(self): def test__reverse_transform_series(self): """Test when data is a Series.""" # Setup - data = pd.Series([1., 0., 1.]) + data = pd.Series([1.0, 0.0, 1.0]) # Run transformer = Mock() @@ -157,7 +156,7 @@ def test__reverse_transform_series(self): def test__reverse_transform_not_null_values(self): """Test _reverse_transform not null values correctly""" # Setup - data = np.array([1., 0., 1.]) + data = np.array([1.0, 0.0, 1.0]) # Run transformer = Mock() @@ -174,7 +173,7 @@ def test__reverse_transform_not_null_values(self): def test__reverse_transform_2d_ndarray(self): """Test _reverse_transform not null values correctly""" # Setup - data = np.array([[1.], [0.], [1.]]) + data = np.array([[1.0], [0.0], [1.0]]) # Run transformer = Mock() diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index fc9dcdee8..bd9e27b82 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -8,8 +8,14 @@ from rdt.errors import TransformerInputError from rdt.transformers.categorical import ( - CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, - OrderedUniformEncoder, UniformEncoder) + CustomLabelEncoder, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, + UniformEncoder, +) RE_SSN = re.compile(r'\d\d\d-\d\d-\d\d\d\d') @@ -141,7 +147,7 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = "The data must be numerical if order_by is 'numerical_value'." 
with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -162,7 +168,7 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = "The data must be numerical if order_by is 'numerical_value'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -182,12 +188,12 @@ def test__fit(self): expected_frequencies = { 'foo': 0.5, 'bar': 0.3333333333333333, - 'tar': 0.16666666666666666 + 'tar': 0.16666666666666666, } expected_intervals = { - 'foo': [0., 0.5], + 'foo': [0.0, 0.5], 'bar': [0.5, 0.8333333333333333], - 'tar': [0.8333333333333333, 1.0] + 'tar': [0.8333333333333333, 1.0], } assert transformer.frequencies == expected_frequencies assert transformer.intervals == expected_intervals @@ -204,12 +210,12 @@ def test__transform(self): transformer.frequencies = { 'foo': 0.5, 'bar': 0.3333333333333333, - 'tar': 0.16666666666666666 + 'tar': 0.16666666666666666, } transformer.intervals = { - 'foo': [0., 0.5], + 'foo': [0.0, 0.5], 'bar': [0.5, 0.8333333333333333], - 'tar': [0.8333333333333333, 1.0] + 'tar': [0.8333333333333333, 1.0], } # Run @@ -236,15 +242,13 @@ def test__transform_user_warning(self): data_2 = pd.Series([1, 2, 3, 4, 5, 'a', 7, 8, 'b']) transformer = UniformEncoder() transformer.columns = ['col'] - transformer.frequencies = { - 1: 0.25, 2: 0.25, 3: 0.25, 4: 0.25 - } + transformer.frequencies = {1: 0.25, 2: 0.25, 3: 0.25, 4: 0.25} transformer.intervals = { 1: [0, 0.25], 2: [0.25, 0.5], 3: [0.5, 0.75], - 4: [0.75, 1] + 4: [0.75, 1], } # Run @@ -279,19 +283,35 @@ def test__reverse_transform(self, mock_convert_dtype, mock_check_nan): data = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) transformer = UniformEncoder() transformer.dtype = np.int64 - transformer.frequencies = { - 1: 0.222222, - 2: 0.444444, - 3: 0.333333 - } + 
transformer.frequencies = {1: 0.222222, 2: 0.444444, 3: 0.333333} transformer.intervals = { 1: [0, 0.222222], 2: [0.222222, 0.666666], - 3: [0.666666, 1.0] + 3: [0.666666, 1.0], } - transformed = pd.Series([0.12, 0.254, 0.789, 0.43, 0.56, 0.08, 0.67, 0.98, 0.36]) - mock_convert_dtype.return_value = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) + transformed = pd.Series([ + 0.12, + 0.254, + 0.789, + 0.43, + 0.56, + 0.08, + 0.67, + 0.98, + 0.36, + ]) + mock_convert_dtype.return_value = pd.Series([ + 1, + 2, + 3, + 2, + 2, + 1, + 3, + 3, + 2, + ]) # Run output = transformer._reverse_transform(transformed) @@ -307,23 +327,40 @@ def test__reverse_transform(self, mock_convert_dtype, mock_check_nan): def test__reverse_transform_nans(self): """Test ``_reverse_transform`` for data with NaNs.""" # Setup - data = pd.Series(['a', 'b', 'NaN', np.nan, 'NaN', 'b', 'b', 'a', 'b', np.nan]) + data = pd.Series([ + 'a', + 'b', + 'NaN', + np.nan, + 'NaN', + 'b', + 'b', + 'a', + 'b', + np.nan, + ]) transformer = UniformEncoder() transformer.dtype = object - transformer.frequencies = { - 'a': 0.2, - 'b': 0.4, - 'NaN': 0.2, - np.nan: 0.2 - } + transformer.frequencies = {'a': 0.2, 'b': 0.4, 'NaN': 0.2, np.nan: 0.2} transformer.intervals = { 'a': [0, 0.2], 'b': [0.2, 0.6], 'NaN': [0.6, 0.8], - np.nan: [0.8, 1] + np.nan: [0.8, 1], } - transformed = pd.Series([0.12, 0.254, 0.789, 0.88, 0.69, 0.53, 0.47, 0.08, 0.39, 0.92]) + transformed = pd.Series([ + 0.12, + 0.254, + 0.789, + 0.88, + 0.69, + 0.53, + 0.47, + 0.08, + 0.39, + 0.92, + ]) # Run output = transformer._reverse_transform(transformed) @@ -412,13 +449,13 @@ def test__fit(self): 2.0: 0.2857142857142857, 3.0: 0.14285714285714285, None: 0.14285714285714285, - 1.0: 0.42857142857142855 + 1.0: 0.42857142857142855, } expected_intervals = { 2.0: [0.0, 0.2857142857142857], 3.0: [0.2857142857142857, 0.42857142857142855], None: [0.42857142857142855, 0.5714285714285714], - 1.0: [0.5714285714285714, 1.0] + 1.0: [0.5714285714285714, 1.0], } assert 
transformer.frequencies == expected_frequencies assert transformer.intervals == expected_intervals @@ -524,7 +561,6 @@ def test__transform_error(self): class TestFrequencyEncoder: - def test___setstate__(self): """Test the ``__set_state__`` method. @@ -540,11 +576,7 @@ def test___setstate__(self): transformer = FrequencyEncoder() # Run - transformer.__setstate__({ - 'intervals': { - None: 'abc' - } - }) + transformer.__setstate__({'intervals': {None: 'abc'}}) # Assert assert transformer.__dict__['intervals'][np.nan] == 'abc' @@ -580,33 +612,28 @@ def test__get_intervals(self): # Asserts expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, 'bar': 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', 'bar', 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert result[0] == expected_intervals @@ -634,33 +661,28 @@ def test__get_intervals_nans(self): # Assert expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), np.nan: ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, np.nan: 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', np.nan, 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert result[0] == 
expected_intervals @@ -677,33 +699,28 @@ def test__fit_intervals(self): # Asserts expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, 'bar': 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', 'bar', 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert transformer.intervals == expected_intervals @@ -763,24 +780,19 @@ def test__reverse_transform_series(self, mock_check_nan): pd.testing.assert_series_equal(mock_input_data, rt_data) assert mock_input_dtype == transformer.dtype expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } assert transformer.intervals == expected_intervals @@ -855,7 +867,12 @@ def test__transform_by_category_called(self): data = pd.Series([1, 3, 3, 2, 1]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) @@ -950,6 +967,7 @@ def test__transform_by_category_add_noise_true(self, norm_mock): - ``rvs_mock`` should be called four times, one for each element of the intervals dictionary. 
""" + # Setup def rvs_mock_func(loc, scale, **kwargs): return loc @@ -974,10 +992,30 @@ def rvs_mock_func(loc, scale, **kwargs): expected = np.array([0.875, 0.375, 0.375, 0.625, 0.875]) assert (transformed == expected).all() norm_mock.rvs.assert_has_calls([ - call(0.125, 0.041666666666666664, size=0, random_state=transform_random_state_mock), - call(0.375, 0.041666666666666664, size=2, random_state=transform_random_state_mock), - call(0.625, 0.041666666666666664, size=1, random_state=transform_random_state_mock), - call(0.875, 0.041666666666666664, size=2, random_state=transform_random_state_mock), + call( + 0.125, + 0.041666666666666664, + size=0, + random_state=transform_random_state_mock, + ), + call( + 0.375, + 0.041666666666666664, + size=2, + random_state=transform_random_state_mock, + ), + call( + 0.625, + 0.041666666666666664, + size=1, + random_state=transform_random_state_mock, + ), + call( + 0.875, + 0.041666666666666664, + size=2, + random_state=transform_random_state_mock, + ), ]) def test__transform_by_row_called(self): @@ -999,7 +1037,12 @@ def test__transform_by_row_called(self): data = pd.Series([1, 2, 3, 4]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) @@ -1056,11 +1099,15 @@ def test__reverse_transform_by_category_called(self): transform_data = pd.Series([1, 3, 3, 2, 1]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run - reverse = FrequencyEncoder._reverse_transform( - categorical_transformer_mock, transform_data) + reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, transform_data) # Asserts reverse_arg = 
categorical_transformer_mock._reverse_transform_by_category.call_args[0][0] @@ -1114,7 +1161,7 @@ def test__get_category_from_start(self): transformer = FrequencyEncoder() transformer.starts = pd.DataFrame({ 'start': [0.0, 0.5, 0.7], - 'category': ['a', 'b', 'c'] + 'category': ['a', 'b', 'c'], }).set_index('start') # Run @@ -1142,9 +1189,15 @@ def test__reverse_transform_by_row_called(self): data = pd.Series([1, 2, 3, 4]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) categorical_transformer_mock.starts = pd.DataFrame( - [0., 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category']) + [0.0, 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category'] + ) categorical_transformer_mock._normalize.return_value = data # Run @@ -1176,7 +1229,8 @@ def test__reverse_transform_by_row(self, mock_check_nan): transformer = FrequencyEncoder() transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) transformer.starts = pd.DataFrame( - [4, 3, 2, 1], index=[0., 0.25, 0.5, 0.75], columns=['category']) + [4, 3, 2, 1], index=[0.0, 0.25, 0.5, 0.75], columns=['category'] + ) transformer.intervals = { 4: (0, 0.25, 0.125, 0.041666666666666664), 3: (0.25, 0.5, 0.375, 0.041666666666666664), @@ -1197,7 +1251,6 @@ def test__reverse_transform_by_row(self, mock_check_nan): class TestOneHotEncoder: - def test__prepare_data_empty_lists(self): # Setup ohe = OneHotEncoder() @@ -1414,11 +1467,7 @@ def test__transform_no_nan(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_no_nan_categorical(self): @@ -1445,11 +1494,7 @@ def test__transform_no_nan_categorical(self): out = ohe._transform_helper(data) # Assert - expected = 
np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_nans_encoded(self): @@ -1475,12 +1520,7 @@ def test__transform_nans_encoded(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [0, 0, 1], - [0, 0, 1], - [1, 0, 0], - [0, 1, 0] - ]) + expected = np.array([[0, 0, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) np.testing.assert_array_equal(out, expected) def test__transform_nans_categorical(self): @@ -1509,12 +1549,7 @@ def test__transform_nans_categorical(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [0, 0, 1], - [0, 0, 1], - [1, 0, 0], - [0, 1, 0] - ]) + expected = np.array([[0, 0, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) np.testing.assert_array_equal(out, expected) def test__transform_single_column(self): @@ -1539,11 +1574,7 @@ def test__transform_single_column(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_single_categorical(self): @@ -1571,11 +1602,7 @@ def test__transform_single_categorical(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_zeros(self): @@ -1600,11 +1627,7 @@ def test__transform_zeros(self): out = ohe._transform_helper(pd.Series(['b', 'b', 'b'])) # Assert - expected = np.array([ - [0], - [0], - [0] - ]) + expected = np.array([[0], [0], [0]]) np.testing.assert_array_equal(out, expected) def test__transform_zeros_categorical(self): @@ -1632,11 +1655,7 @@ def test__transform_zeros_categorical(self): out = ohe._transform_helper(pd.Series(['b', 'b', 'b'])) # Assert - expected = np.array([ - [0], - [0], - [0] - ]) + expected = np.array([[0], [0], [0]]) 
np.testing.assert_array_equal(out, expected) def test__transform_unknown_nan(self): @@ -1662,11 +1681,7 @@ def test__transform_unknown_nan(self): out = ohe._transform_helper(pd.Series(['b', 'b', np.nan])) # Assert - expected = np.array([ - [0, 0], - [0, 0], - [0, 1] - ]) + expected = np.array([[0, 0], [0, 0], [0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_no_nans(self): @@ -1689,11 +1704,7 @@ def test__transform_no_nans(self): out = ohe._transform(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_nans(self): @@ -1716,11 +1727,7 @@ def test__transform_nans(self): out = ohe._transform(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_single_column_filled_with_ones(self): @@ -1743,11 +1750,7 @@ def test__transform_single_column_filled_with_ones(self): out = ohe._transform(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_unknown(self): @@ -1782,7 +1785,7 @@ def test__transform_unknown(self): [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], - [0, 0, 0, 0] + [0, 0, 0, 0], ]) np.testing.assert_array_equal(out, expected) @@ -1824,11 +1827,7 @@ def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan): mock_convert_dtype.return_value = data # Run - transformed = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + transformed = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1847,11 +1846,7 @@ def test__reverse_transform_nans(self): ohe._fit(data) # Run - transformed = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + transformed = np.array([[1, 0, 0], [0, 
1, 0], [0, 0, 1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1865,11 +1860,7 @@ def test__reverse_transform_single(self): ohe._fit(data) # Run - transformed = np.array([ - [1], - [1], - [1] - ]) + transformed = np.array([[1], [1], [1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1892,7 +1883,6 @@ def test__reverse_transform_1d(self): class TestLabelEncoder: - def test___init__(self): """Passed arguments must be stored as attributes.""" # Run @@ -2030,7 +2020,7 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = "The data must be numerical if order_by is 'numerical_value'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2054,7 +2044,7 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = "The data must be numerical if order_by is 'numerical_value'." 
with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2122,7 +2112,7 @@ def test__transform(self): transformed = transformer._transform(data) # Assert - expected = pd.Series([0., 1., 2.]) + expected = pd.Series([0.0, 1.0, 2.0]) pd.testing.assert_series_equal(transformed[:-1], expected) assert 0 <= transformed[3] <= 2 @@ -2182,11 +2172,21 @@ def test__transform_unseen_categories(self): # Run with pytest.warns(UserWarning): - transform_data = pd.Series(['a', 2, True, np.nan, np.nan, np.nan, 'b', False, 3]) + transform_data = pd.Series([ + 'a', + 2, + True, + np.nan, + np.nan, + np.nan, + 'b', + False, + 3, + ]) transformed = transformer._transform(transform_data) # Assert - expected = pd.Series([0., 1., 2.]) + expected = pd.Series([0.0, 1.0, 2.0]) pd.testing.assert_series_equal(transformed[:3], expected) assert all(0 <= value < len(fit_data) for value in transformed[3:]) @@ -2264,7 +2264,6 @@ def test__reverse_transform_integer_and_nans(self): class TestOrderedLabelEncoder: - def test___init__(self): """The the ``__init__`` method. 
@@ -2382,7 +2381,6 @@ def test__fit_error(self): class TestCustomLabelEncoder: - def test___init__(self): """Test the warning message for backwards compatibility of ``CustomLabelEncoder``.""" # Setup / Run / Assert diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index d00707233..eb184d4b9 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -5,12 +5,14 @@ import pandas as pd import pytest -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder +from rdt.transformers.datetime import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) from rdt.transformers.null import NullTransformer class TestUnixTimestampEncoder: - def test___init__(self): """Test the ``__init__`` method and the passed arguments are stored as attributes.""" # Run @@ -33,7 +35,7 @@ def test___init__with_model_missing_values(self): transformer = UnixTimestampEncoder( missing_value_replacement='mode', model_missing_values=False, - datetime_format='%M-%d-%Y' + datetime_format='%M-%d-%Y', ) # Asserts @@ -106,7 +108,11 @@ def test__convert_to_datetime_not_convertible_raises_error(self): - a ``TypeError`` is raised. """ # Setup - data = pd.Series(['2020-01-01-can', '2020-02-01-not', '2020-03-01-convert']) + data = pd.Series([ + '2020-01-01-can', + '2020-02-01-not', + '2020-03-01-convert', + ]) transformer = UnixTimestampEncoder() # Run @@ -181,9 +187,15 @@ def test__transform_helper(self): transformed = transformer._transform_helper(data) # Assert - np.testing.assert_allclose(transformed, np.array([ - 1.577837e+18, 1.580515e+18, 1.583021e+18, - ]), rtol=1e-5) + np.testing.assert_allclose( + transformed, + np.array([ + 1.577837e18, + 1.580515e18, + 1.583021e18, + ]), + rtol=1e-5, + ) def test__reverse_transform_helper_nulls(self): """Test the ``_reverse_transform_helper`` with null values. 
@@ -266,7 +278,8 @@ def test__fit(self, null_transformer_mock): assert null_transformer_mock.return_value.fit.call_count == 1 np.testing.assert_allclose( null_transformer_mock.return_value.fit.call_args_list[0][0][0], - np.array([1.577837e+18, 1.580515e+18, 1.583021e+18]), rtol=1e-5 + np.array([1.577837e18, 1.580515e18, 1.583021e18]), + rtol=1e-5, ) def test__fit_enforce_min_max_values(self): @@ -283,8 +296,8 @@ def test__fit_enforce_min_max_values(self): transformer._fit(data) # Assert - assert transformer._min_value == 1.5778368e+18 - assert transformer._max_value == 1.5830208e+18 + assert transformer._min_value == 1.5778368e18 + assert transformer._max_value == 1.5830208e18 def test__fit_calls_transform_helper(self): """Test the ``_fit`` method. @@ -323,7 +336,7 @@ def test__fit_calls_guess_datetime_format(self, mock__guess_datetime_format_for_ # Assert np.testing.assert_array_equal( mock__guess_datetime_format_for_array.call_args[0][0], - np.array(['2020-02-01', '2020-03-01']) + np.array(['2020-02-01', '2020-03-01']), ) assert transformer.datetime_format == '%Y-%m-%d' @@ -364,7 +377,8 @@ def test__transform(self): assert transformer.null_transformer.transform.call_count == 1 np.testing.assert_allclose( transformer.null_transformer.transform.call_args_list[0][0], - np.array([[1.577837e+18, 1.580515e+18, 1.583021e+18]]), rtol=1e-5 + np.array([[1.577837e18, 1.580515e18, 1.583021e18]]), + rtol=1e-5, ) def test__reverse_transform_all_none(self): @@ -390,7 +404,7 @@ def test__reverse_transform(self): """ # Setup ute = UnixTimestampEncoder() - transformed = np.array([1.5778368e+18, 1.5805152e+18, 1.5830208e+18]) + transformed = np.array([1.5778368e18, 1.5805152e18, 1.5830208e18]) ute.null_transformer = NullTransformer('mean') # Run @@ -409,19 +423,29 @@ def test__reverse_transform_enforce_min_max_values(self): # Setup ute = UnixTimestampEncoder(enforce_min_max_values=True) transformed = np.array([ - 1.5678367e+18, 1.5778368e+18, 1.5805152e+18, 1.5830208e+18, 
1.5930209e+18 + 1.5678367e18, + 1.5778368e18, + 1.5805152e18, + 1.5830208e18, + 1.5930209e18, ]) ute.null_transformer = NullTransformer('mean') - ute._min_value = 1.5778368e+18 - ute._max_value = 1.5830208e+18 + ute._min_value = 1.5778368e18 + ute._max_value = 1.5830208e18 # Run output = ute._reverse_transform(transformed) # Assert - expected = pd.Series(pd.to_datetime([ - '2020-01-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-03-01' - ])) + expected = pd.Series( + pd.to_datetime([ + '2020-01-01', + '2020-01-01', + '2020-02-01', + '2020-03-01', + '2020-03-01', + ]) + ) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_datetime_format_dtype_is_datetime(self): @@ -429,7 +453,7 @@ def test__reverse_transform_datetime_format_dtype_is_datetime(self): # Setup ute = UnixTimestampEncoder() ute.datetime_format = '%b %d, %Y' - transformed = np.array([1.5778368e+18, 1.5805152e+18, 1.5830208e+18]) + transformed = np.array([1.5778368e18, 1.5805152e18, 1.5830208e18]) ute._dtype = np.dtype('