From 6e6918d6504bc8d9cf8b2336890c5e0016b4afc5 Mon Sep 17 00:00:00 2001 From: gsheni Date: Fri, 19 Apr 2024 10:46:29 -0400 Subject: [PATCH 01/17] Add ruff --- Makefile | 12 ++++---- pyproject.toml | 76 +++++++++++++++++++++++++++++++------------------- setup.cfg | 32 --------------------- tasks.py | 8 ++---- 4 files changed, 55 insertions(+), 73 deletions(-) delete mode 100644 setup.cfg diff --git a/Makefile b/Makefile index af7ad97b..5e0ba7f4 100644 --- a/Makefile +++ b/Makefile @@ -80,14 +80,14 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a # LINT TARGETS .PHONY: lint -lint: ## check style with flake8 and isort - invoke lint +lint: ## Run all code style checks + ruff check . + ruff format . --check .PHONY: fix-lint -fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find rdt tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive rdt tests - isort --apply --atomic rdt tests +fix-lint: ## fix lint issues using ruff + ruff check --fix . + ruff format . 
# TEST TARGETS diff --git a/pyproject.toml b/pyproject.toml index 6298c8fe..3dcb71e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,35 +71,7 @@ dev = [ 'watchdog>=1.0.1,<5', # style check - 'pycodestyle>=2.7.0,<2.12', - 'pyflakes>=2.3.0,<3.3', - 'flake8>=3.7.7,<8', - 'flake8-absolute-import>=1.0,<2', - 'flake8-builtins>=1.5.3,<3', - 'flake8-comprehensions>=3.6.1,<4', - 'flake8-debugger>=4.0.0,<5', - 'flake8-docstrings>=1.5.0,<2', - 'flake8-eradicate>=1.1.0,<2', - 'flake8-fixme>=1.1.1,<1.2', - 'flake8-mock>=0.3,<1', - 'flake8-multiline-containers>=0.0.18,<0.1', - 'flake8-mutable>=1.2.0,<1.3', - 'flake8-expression-complexity>=0.0.9,<0.1', - 'flake8-print>=4.0.0,<4.1', - 'flake8-pytest-style>=2.0.0,<3', - 'flake8-quotes>=3.3.0,<4', - 'flake8-sfs>=0.0.3,<2', - 'flake8-variables-names>=0.0.4,<0.1', - 'dlint>=0.11.0,<1', - 'isort>=5.13.2,<6', - 'pandas-vet>=0.2.3,<2024', - 'pep8-naming>=0.12.1,<1', - 'pydocstyle>=6.1.1,<7', - 'pylint>=2.5.3,<4', - - # fix style issues - 'autoflake>=1.1,<3', - 'autopep8>=1.4.3,<3', + 'ruff>=0.3.2,<1', # distribute on PyPI 'twine>=1.10.0,<6', @@ -201,3 +173,49 @@ replace = "__version__ = '{new_version}'" [build-system] requires = ['setuptools', 'wheel'] build-backend = 'setuptools.build_meta' + +[tool.ruff] +preview = true +line-length = 79 +src = ["rdt"] +target-version = "py312" +exclude = [ + "docs", + ".tox", + ".git", + "__pycache__", + ".ipynb_checkpoints" +] + +[tool.ruff.lint] +select = [ + # Pyflakes + "F", + # Pycodestyle + "E", + "W", + # isort + "I001" +] +ignore = [ + "E501", + "D107", # Missing docstring in __init__ + "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 +] + +[tool.ruff.lint.pep8-naming] +extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] + +[tool.ruff.lint.isort] +known-first-party = ["rdt"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] + 
+[tool.ruff.format] +quote-style = "single" +indent-style = "space" +preview = true + +[tool.ruff.lint.pydocstyle] +convention = "google" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 01ce2bb6..00000000 --- a/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -[flake8] -max-line-length = 99 -inline-quotes = single -extend-ignore = - D107, - SFS3, - PD005, - # TokenError: unterminated string literal (detected at line 1) - E902 -exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints -per-file-ignores = - tests/contributing.py:T001 - tests/performance/profiling.py:T001 - tests/performance/test_performance.py:T001 - rdt/performance/datasets/datetime.py:A005 - rdt/transformers/datetime.py:A005 - -[aliases] -test = pytest - -[pylint] -extension-pkg-whitelist = numpy -min-similarity-lines = 5 -max-args = 8 -max-attributes = 11 -ignore-comments = yes -ignore-docstrings = yes -ignore-imports = yes -disable = R0801, R0903, R0913, R0914, R1708, C0209, W0223, W0221, W0237, C0411, - W0231 # __init__ method from base class is not called on a child class -ignored-classes = sre_parse - diff --git a/tasks.py b/tasks.py index b0b66ac2..461b9320 100644 --- a/tasks.py +++ b/tasks.py @@ -114,12 +114,8 @@ def readme(c): @task def lint(c): check_dependencies(c) - c.run('flake8 rdt') - c.run('pydocstyle rdt') - c.run('flake8 tests --ignore=D') - c.run('pydocstyle tests') - c.run('isort -c rdt tests') - c.run('pylint rdt tests/performance --rcfile=setup.cfg') + c.run('ruff check .') + c.run('ruff format . 
--check') c.run('pytest tests/code_style.py -v --disable-warnings --no-header') From db188c917981495f9e6a12c6c7b7c8517c8e8fa6 Mon Sep 17 00:00:00 2001 From: gsheni Date: Fri, 19 Apr 2024 10:48:41 -0400 Subject: [PATCH 02/17] run ruff --- rdt/__init__.py | 69 +- rdt/hyper_transformer.py | 203 ++-- rdt/performance/datasets/__init__.py | 9 +- rdt/performance/datasets/boolean.py | 76 +- rdt/performance/datasets/categorical.py | 181 +--- rdt/performance/datasets/datetime.py | 81 +- rdt/performance/datasets/numerical.py | 128 +-- rdt/performance/datasets/pii.py | 24 +- rdt/performance/datasets/text.py | 24 +- rdt/performance/performance.py | 34 +- rdt/performance/profiling.py | 28 +- rdt/transformers/__init__.py | 37 +- rdt/transformers/_validators.py | 49 +- rdt/transformers/base.py | 62 +- rdt/transformers/boolean.py | 20 +- rdt/transformers/categorical.py | 113 ++- rdt/transformers/datetime.py | 78 +- rdt/transformers/null.py | 31 +- rdt/transformers/numerical.py | 124 ++- rdt/transformers/pii/__init__.py | 5 +- rdt/transformers/pii/anonymization.py | 38 +- rdt/transformers/pii/anonymizer.py | 138 ++- rdt/transformers/text.py | 51 +- rdt/transformers/utils.py | 29 +- tasks.py | 31 +- tests/code_style.py | 38 +- tests/contributing.py | 118 ++- tests/datasets/tests/test_boolean.py | 6 - tests/datasets/tests/test_categorical.py | 14 - tests/datasets/tests/test_datetime.py | 6 - tests/datasets/tests/test_numerical.py | 10 - tests/integration/__init__.py | 1 - tests/integration/test_hyper_transformer.py | 924 +++++++++++------- tests/integration/test_transformers.py | 113 ++- .../transformers/pii/test_anonymizer.py | 90 +- tests/integration/transformers/test_base.py | 85 +- .../integration/transformers/test_boolean.py | 16 +- .../transformers/test_categorical.py | 70 +- .../integration/transformers/test_datetime.py | 60 +- .../transformers/test_numerical.py | 50 +- tests/integration/transformers/test_text.py | 136 ++- tests/performance/test_performance.py | 32 +- 
tests/performance/tests/test_profiling.py | 54 +- tests/unit/test___init__.py | 73 +- tests/unit/test_hyper_transformer.py | 651 ++++++------ .../transformers/pii/test_anonymization.py | 39 +- .../unit/transformers/pii/test_anonymizer.py | 138 ++- tests/unit/transformers/test___init__.py | 12 +- tests/unit/transformers/test__validators.py | 106 +- tests/unit/transformers/test_base.py | 353 ++++--- tests/unit/transformers/test_boolean.py | 45 +- tests/unit/transformers/test_categorical.py | 535 +++++----- tests/unit/transformers/test_datetime.py | 132 ++- tests/unit/transformers/test_null.py | 154 ++- tests/unit/transformers/test_numerical.py | 612 ++++++++---- tests/unit/transformers/test_text.py | 101 +- tests/unit/transformers/test_utils.py | 68 +- 57 files changed, 3934 insertions(+), 2571 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index b23d8cd9..a98ad5b3 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -2,7 +2,6 @@ """Top-level package for RDT.""" - __author__ = 'DataCebo, Inc.' 
__email__ = 'info@sdv.dev' __version__ = '1.12.0.dev2' @@ -20,10 +19,7 @@ from rdt import transformers from rdt.hyper_transformer import HyperTransformer -__all__ = [ - 'HyperTransformer', - 'transformers' -] +__all__ = ['HyperTransformer', 'transformers'] RANDOM_SEED = 42 @@ -42,10 +38,13 @@ def get_demo(num_rows=5): pd.DataFrame """ # Hard code first five rows - login_dates = pd.Series([ - '2021-06-26', '2021-02-10', 'NAT', '2020-09-26', '2020-12-22' - ], dtype='datetime64[ns]') - email_optin = pd.Series([False, False, False, True, np.nan], dtype='object') + login_dates = pd.Series( + ['2021-06-26', '2021-02-10', 'NAT', '2020-09-26', '2020-12-22'], + dtype='datetime64[ns]', + ) + email_optin = pd.Series( + [False, False, False, True, np.nan], dtype='object' + ) credit_card = ['VISA', 'VISA', 'AMEX', np.nan, 'DISCOVER'] age = [29, 18, 21, 45, 32] dollars_spent = [99.99, np.nan, 2.50, 25.00, 19.99] @@ -55,7 +54,7 @@ def get_demo(num_rows=5): 'email_optin': email_optin, 'credit_card': credit_card, 'age': age, - 'dollars_spent': dollars_spent + 'dollars_spent': dollars_spent, }) if num_rows <= 5: @@ -67,33 +66,47 @@ def get_demo(num_rows=5): try: num_rows -= 5 - login_dates = np.array([ - np.datetime64('2000-01-01') + np.timedelta64(np.random.randint(0, 10000), 'D') - for _ in range(num_rows) - ], dtype='datetime64[ns]') - login_dates[np.random.random(size=num_rows) > 0.8] = np.datetime64('NaT') + login_dates = np.array( + [ + np.datetime64('2000-01-01') + + np.timedelta64(np.random.randint(0, 10000), 'D') + for _ in range(num_rows) + ], + dtype='datetime64[ns]', + ) + login_dates[np.random.random(size=num_rows) > 0.8] = np.datetime64( + 'NaT' + ) email_optin = pd.Series([True, False, np.nan], dtype='object').sample( - num_rows, replace=True) - credit_card = np.random.choice(['VISA', 'AMEX', np.nan, 'DISCOVER'], size=num_rows) + num_rows, replace=True + ) + credit_card = np.random.choice( + ['VISA', 'AMEX', np.nan, 'DISCOVER'], size=num_rows + ) age = 
np.random.randint(18, 100, size=num_rows) - dollars_spent = np.around(np.random.uniform(0, 100, size=num_rows), decimals=2) + dollars_spent = np.around( + np.random.uniform(0, 100, size=num_rows), decimals=2 + ) dollars_spent[np.random.random(size=num_rows) > 0.8] = np.nan finally: np.random.set_state(random_state) - return pd.concat([ - data, - pd.DataFrame({ - 'last_login': login_dates, - 'email_optin': email_optin, - 'credit_card': credit_card, - 'age': age, - 'dollars_spent': dollars_spent - }) - ], ignore_index=True) + return pd.concat( + [ + data, + pd.DataFrame({ + 'last_login': login_dates, + 'email_optin': email_optin, + 'credit_card': credit_card, + 'age': age, + 'dollars_spent': dollars_spent, + }), + ], + ignore_index=True, + ) def _get_addon_target(addon_path_name): diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index e048b26b..a6feaac8 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -9,11 +9,20 @@ import pandas as pd from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError, - TransformerProcessingError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, + TransformerProcessingError, +) from rdt.transformers import ( - BaseMultiColumnTransformer, BaseTransformer, get_class_by_transformer_name, - get_default_transformer, get_transformers_by_type) + BaseMultiColumnTransformer, + BaseTransformer, + get_class_by_transformer_name, + get_default_transformer, + get_transformers_by_type, +) from rdt.transformers.utils import flatten_column_list LOGGER = logging.getLogger(__name__) @@ -31,14 +40,18 @@ def __repr__(self): config = { 'sdtypes': self['sdtypes'], - 'transformers': {str(k): repr(v) for k, v in self['transformers'].items()} + 'transformers': { + str(k): repr(v) for k, v in self['transformers'].items() + }, } printed = json.dumps(config, indent=4) for transformer in 
self['transformers'].values(): quoted_transformer = f'"{transformer}"' if quoted_transformer in printed: - printed = printed.replace(quoted_transformer, repr(transformer)) + printed = printed.replace( + quoted_transformer, repr(transformer) + ) return printed @@ -59,14 +72,8 @@ class HyperTransformer: 'b': 'boolean', 'M': 'datetime', } - _DEFAULT_OUTPUT_SDTYPES = [ - 'numerical', - 'float', - 'integer' - ] - _REFIT_MESSAGE = ( - "For this change to take effect, please refit your data using 'fit' or 'fit_transform'." - ) + _DEFAULT_OUTPUT_SDTYPES = ['numerical', 'float', 'integer'] + _REFIT_MESSAGE = "For this change to take effect, please refit your data using 'fit' or 'fit_transform'." _DETECT_CONFIG_MESSAGE = ( 'Nothing to update. Use the `detect_initial_config` method to pre-populate all the ' 'sdtypes and transformers from your dataset.' @@ -109,9 +116,11 @@ def _create_multi_column_fields(self): def _validate_field_transformers(self): for field in self.field_transformers: if self._field_in_set(field, self._specified_fields): - raise ValueError(f'Multiple transformers specified for the field {field}. ' - 'Each field can have at most one transformer defined in ' - 'field_transformers.') + raise ValueError( + f'Multiple transformers specified for the field {field}. ' + 'Each field can have at most one transformer defined in ' + 'field_transformers.' 
+ ) self._add_field_to_set(field, self._specified_fields) @@ -131,7 +140,9 @@ def __init__(self): @staticmethod def _field_in_data(field, data): - all_columns_in_data = isinstance(field, tuple) and all(col in data for col in field) + all_columns_in_data = isinstance(field, tuple) and all( + col in data for col in field + ) return field in data or all_columns_in_data @staticmethod @@ -150,7 +161,7 @@ def get_config(self): """ return Config({ 'sdtypes': self.field_sdtypes, - 'transformers': self.field_transformers + 'transformers': self.field_transformers, }) @staticmethod @@ -215,7 +226,9 @@ def _validate_config(config): sdtype_keys = sdtypes.keys() transformer_keys = flatten_column_list(transformers.keys()) - is_transformer_keys_unique = len(transformer_keys) == len(set(transformer_keys)) + is_transformer_keys_unique = len(transformer_keys) == len( + set(transformer_keys) + ) if not is_transformer_keys_unique: raise InvalidConfigError( 'Error: Invalid config. Please provide unique keys for the sdtypes ' @@ -236,7 +249,11 @@ def _validate_config(config): if transformer is None: continue - columns = column_name if isinstance(column_name, tuple) else [column_name] + columns = ( + column_name + if isinstance(column_name, tuple) + else [column_name] + ) for column in columns: sdtype = sdtypes.get(column) if sdtype not in transformer.get_supported_sdtypes(): @@ -250,7 +267,9 @@ def _validate_config(config): def _validate_update_columns(self, update_columns): unknown_columns = self._subset( - flatten_column_list(update_columns), self.field_sdtypes.keys(), not_in=True + flatten_column_list(update_columns), + self.field_sdtypes.keys(), + not_in=True, ) if unknown_columns: raise InvalidConfigError( @@ -279,7 +298,8 @@ def set_config(self, config): warnings.warn(self._REFIT_MESSAGE) def _validate_update_transformers_by_sdtype( - self, sdtype, transformer, transformer_name, transformer_parameters): + self, sdtype, transformer, transformer_name, transformer_parameters + ): if 
not self.field_sdtypes: raise ConfigNotSetError( 'Nothing to update. Use the `detect_initial_config` method to ' @@ -288,7 +308,9 @@ def _validate_update_transformers_by_sdtype( if transformer_name is None: if transformer is None: - raise InvalidConfigError("Missing required parameter 'transformer_name'.") + raise InvalidConfigError( + "Missing required parameter 'transformer_name'." + ) if not isinstance(transformer, BaseTransformer): raise InvalidConfigError( @@ -301,22 +323,32 @@ def _validate_update_transformers_by_sdtype( ) else: - if transformer_name not in get_class_by_transformer_name() or sdtype not in \ - get_class_by_transformer_name()[transformer_name].get_supported_sdtypes(): + if ( + transformer_name not in get_class_by_transformer_name() + or sdtype + not in get_class_by_transformer_name()[ + transformer_name + ].get_supported_sdtypes() + ): raise InvalidConfigError( - f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype.") + f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype." + ) if transformer_parameters is not None: transformer = get_class_by_transformer_name()[transformer_name] valid = inspect.signature(transformer).parameters - invalid_parameters = {arg for arg in transformer_parameters if arg not in valid} + invalid_parameters = { + arg for arg in transformer_parameters if arg not in valid + } if invalid_parameters: raise TransformerInputError( f'Invalid parameters {tuple(sorted(invalid_parameters))} ' f"for the '{transformer_name}'." ) - def _warn_update_transformers_by_sdtype(self, transformer, transformer_name): + def _warn_update_transformers_by_sdtype( + self, transformer, transformer_name + ): if self._fitted: warnings.warn(self._REFIT_MESSAGE) @@ -325,14 +357,15 @@ def _warn_update_transformers_by_sdtype(self, transformer, transformer_name): warnings.warn( "The 'transformer' parameter will no longer be supported in future versions " "of the RDT. 
Using the 'transformer_name' parameter instead.", - FutureWarning + FutureWarning, ) else: warnings.warn( "The 'transformer' parameter will no longer be supported in future versions " "of the RDT. Please use the 'transformer_name' and 'transformer_parameters' " - 'parameters instead.', FutureWarning + 'parameters instead.', + FutureWarning, ) def _remove_column_in_multi_column_fields(self, column): @@ -349,13 +382,15 @@ def _remove_column_in_multi_column_fields(self, column): new_tuple = tuple(item for item in old_tuple if item != column) if len(new_tuple) == 1: - new_tuple, = new_tuple + (new_tuple,) = new_tuple self._multi_column_fields.pop(new_tuple, None) else: for col in new_tuple: self._multi_column_fields[col] = new_tuple - self.field_transformers[new_tuple] = self.field_transformers.pop(old_tuple) + self.field_transformers[new_tuple] = self.field_transformers.pop( + old_tuple + ) def _update_multi_column_transformer(self): """Check that multi-columns mappings are valid and update them otherwise.""" @@ -369,7 +404,7 @@ def _update_multi_column_transformer(self): columns_to_sdtypes = self._get_columns_to_sdtypes(field) try: - transformer._validate_sdtypes( # pylint: disable=protected-access + transformer._validate_sdtypes( # pylint: disable=protected-access columns_to_sdtypes ) except TransformerInputError: @@ -379,12 +414,19 @@ def _update_multi_column_transformer(self): ) del self.field_transformers[field] for column, sdtype in columns_to_sdtypes.items(): - self.field_transformers[column] = deepcopy(get_default_transformer(sdtype)) + self.field_transformers[column] = deepcopy( + get_default_transformer(sdtype) + ) self._multi_column_fields = self._create_multi_column_fields() def update_transformers_by_sdtype( - self, sdtype, transformer=None, transformer_name=None, transformer_parameters=None): + self, + sdtype, + transformer=None, + transformer_name=None, + transformer_parameters=None, + ): """Update the transformers for the specified ``sdtype``. 
Given an ``sdtype`` and a ``transformer``, change all the fields of the ``sdtype`` @@ -403,18 +445,22 @@ def update_transformers_by_sdtype( A dict of the kwargs of the transformer. """ self._validate_update_transformers_by_sdtype( - sdtype, transformer, transformer_name, transformer_parameters) + sdtype, transformer, transformer_name, transformer_parameters + ) self._warn_update_transformers_by_sdtype(transformer, transformer_name) transformer_instance = transformer if transformer_name is not None: if transformer_parameters is not None: - transformer_instance = \ - get_class_by_transformer_name()[transformer_name](**transformer_parameters) + transformer_instance = get_class_by_transformer_name()[ + transformer_name + ](**transformer_parameters) else: - transformer_instance = get_class_by_transformer_name()[transformer_name]() + transformer_instance = get_class_by_transformer_name()[ + transformer_name + ]() for field, field_sdtype in self.field_sdtypes.items(): if field_sdtype == sdtype: @@ -462,7 +508,9 @@ def update_sdtypes(self, column_name_to_sdtype): if column in self._multi_column_fields: self._remove_column_in_multi_column_fields(column) - transformers_to_update[column] = deepcopy(get_default_transformer(sdtype)) + transformers_to_update[column] = deepcopy( + get_default_transformer(sdtype) + ) self.field_sdtypes.update(column_name_to_sdtype) self.field_transformers.update(transformers_to_update) @@ -495,11 +543,19 @@ def update_transformers(self, column_name_to_transformer): self._validate_transformers(column_name_to_transformer) for column_name, transformer in column_name_to_transformer.items(): - columns = column_name if isinstance(column_name, tuple) else (column_name,) + columns = ( + column_name + if isinstance(column_name, tuple) + else (column_name,) + ) for column in columns: if transformer is not None: col_sdtype = self.field_sdtypes.get(column) - if col_sdtype and col_sdtype not in transformer.get_supported_sdtypes(): + if ( + col_sdtype + and 
col_sdtype + not in transformer.get_supported_sdtypes() + ): raise InvalidConfigError( f"Column '{column}' is a {col_sdtype} column, which is " f"incompatible with the '{transformer.get_name()}' transformer." @@ -594,7 +650,9 @@ def _learn_config(self, data): self._set_field_sdtype(data, field) if field not in self.field_transformers: sdtype = self.field_sdtypes[field] - self.field_transformers[field] = deepcopy(get_default_transformer(sdtype)) + self.field_transformers[field] = deepcopy( + get_default_transformer(sdtype) + ) def detect_initial_config(self, data): """Print the configuration of the data. @@ -620,7 +678,7 @@ def detect_initial_config(self, data): config = Config({ 'sdtypes': self.field_sdtypes, - 'transformers': self.field_transformers + 'transformers': self.field_transformers, }) LOGGER.info('Config:') @@ -675,16 +733,19 @@ def _fit_field_transformer(self, data, field, transformer): next_transformers = transformer.get_next_transformers() for column_name, next_transformer in next_transformers.items(): - # If the column is part of a multi-column field, and at least one column # isn't present in the data, then it should not fit the next transformer if self._field_in_data(column_name, data): - data = self._fit_field_transformer(data, column_name, next_transformer) + data = self._fit_field_transformer( + data, column_name, next_transformer + ) return data def _validate_all_fields_fitted(self): - non_fitted_fields = self._specified_fields.difference(self._fitted_fields) + non_fitted_fields = self._specified_fields.difference( + self._fitted_fields + ) if non_fitted_fields: warnings.warn( 'The following fields were specified in the input arguments but not ' @@ -706,7 +767,11 @@ def _validate_detect_config_called(self, data): missing = any(column not in data.columns for column in fields) unknown_columns = self._subset(data.columns, fields, not_in=True) if unknown_columns or missing: - unknown_text = f' (unknown columns: {unknown_columns})' if 
unknown_columns else '' + unknown_text = ( + f' (unknown columns: {unknown_columns})' + if unknown_columns + else '' + ) raise InvalidDataError( 'The data you are trying to fit has different columns than the original ' f'detected data{unknown_text}. Column names and their ' @@ -746,7 +811,9 @@ def fit(self, data): else: field = column - data = self._fit_field_transformer(data, field, self.field_transformers[field]) + data = self._fit_field_transformer( + data, field, self.field_transformers[field] + ) self._validate_all_fields_fitted() self._fitted = True @@ -762,10 +829,16 @@ def _transform(self, data, prevent_subset): self._validate_config_exists() self._validate_fitted() - unknown_columns = self._subset(data.columns, self._input_columns, not_in=True) + unknown_columns = self._subset( + data.columns, self._input_columns, not_in=True + ) if prevent_subset: - contained = all(column in self._input_columns for column in data.columns) - is_subset = contained and len(data.columns) < len(self._input_columns) + contained = all( + column in self._input_columns for column in data.columns + ) + is_subset = contained and len(data.columns) < len( + self._input_columns + ) if unknown_columns or is_subset: raise InvalidDataError( 'The data you are trying to transform has different columns than the original ' @@ -844,9 +917,13 @@ def create_anonymized_columns(self, num_rows, column_names): self._validate_fitted() if not isinstance(num_rows, int) or num_rows <= 0: - raise ValueError("Parameter 'num_rows' must be an integer greater than 0.") + raise ValueError( + "Parameter 'num_rows' must be an integer greater than 0." + ) - unknown_columns = self._subset(column_names, self._input_columns, not_in=True) + unknown_columns = self._subset( + column_names, self._input_columns, not_in=True + ) if unknown_columns: raise InvalidConfigError( f"Unknown column name {unknown_columns}. 
Use 'get_config()' to see a " @@ -875,7 +952,9 @@ def _reverse_transform(self, data, prevent_subset): self._validate_config_exists() self._validate_fitted() - unknown_columns = self._subset(data.columns, self._output_columns, not_in=True) + unknown_columns = self._subset( + data.columns, self._output_columns, not_in=True + ) if unknown_columns: raise InvalidDataError( 'There are unexpected column names in the data you are trying to transform. ' @@ -883,8 +962,12 @@ def _reverse_transform(self, data, prevent_subset): ) if prevent_subset: - contained = all(column in self._output_columns for column in data.columns) - is_subset = contained and len(data.columns) < len(self._output_columns) + contained = all( + column in self._output_columns for column in data.columns + ) + is_subset = contained and len(data.columns) < len( + self._output_columns + ) if is_subset: raise InvalidDataError( 'You must provide a transformed dataset with all the columns from the ' @@ -897,7 +980,9 @@ def _reverse_transform(self, data, prevent_subset): else: for transformer in reversed(self._transformers_sequence): output_columns = transformer.get_output_columns() - if output_columns and set(output_columns).issubset(data.columns): + if output_columns and set(output_columns).issubset( + data.columns + ): data = transformer.reverse_transform(data) reversed_columns = self._subset(self._input_columns, data.columns) diff --git a/rdt/performance/datasets/__init__.py b/rdt/performance/datasets/__init__.py index ca3470b9..eaae213d 100644 --- a/rdt/performance/datasets/__init__.py +++ b/rdt/performance/datasets/__init__.py @@ -2,7 +2,14 @@ from collections import defaultdict -from rdt.performance.datasets import boolean, categorical, datetime, numerical, pii, text +from rdt.performance.datasets import ( + boolean, + categorical, + datetime, + numerical, + pii, + text, +) from rdt.performance.datasets.base import BaseDatasetGenerator __all__ = [ diff --git a/rdt/performance/datasets/boolean.py 
b/rdt/performance/datasets/boolean.py index 27c6eeb4..14ad742f 100644 --- a/rdt/performance/datasets/boolean.py +++ b/rdt/performance/datasets/boolean.py @@ -36,18 +36,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 2e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -71,18 +65,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } @@ -104,18 +92,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -126,7 +108,9 @@ class RandomSkewedBooleanNaNsGenerator(BooleanGenerator): def generate(num_rows): """Generate a ``num_rows`` number of rows.""" percent_null = np.random.randint(MIN_PERCENT, MAX_PERCENT_NULL) - percent_true = np.random.randint(MIN_PERCENT, 100 - percent_null - MIN_PERCENT) + percent_true = np.random.randint( + MIN_PERCENT, 100 - percent_null - MIN_PERCENT + ) percent_false = 100 - percent_null - percent_true return np.random.choice( @@ -139,18 +123,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 
1000.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } @@ -167,18 +145,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 400.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 500.0, - } + }, } @@ -201,16 +173,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-5, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-5, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-5, 'memory': 400.0}, + 'transform': {'time': 1e-5, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-5, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/categorical.py b/rdt/performance/datasets/categorical.py index e78c7065..3ce2e67f 100644 --- a/rdt/performance/datasets/categorical.py +++ b/rdt/performance/datasets/categorical.py @@ -28,18 +28,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -49,24 +43,20 @@ class RandomIntegerNaNsGenerator(CategoricalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans(RandomIntegerGenerator.generate(num_rows).astype(float)) + return add_nans( + RandomIntegerGenerator.generate(num_rows).astype(float) + ) @staticmethod def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 
2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 5e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -83,18 +73,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 2e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -110,18 +94,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 1000.0, - } + }, } @@ -136,12 +114,13 @@ def generate(num_rows): """Generate a ``num_rows`` number of rows.""" cat_size = 5 categories = np.hstack([ - cat.astype('O') for cat in [ + cat.astype('O') + for cat in [ RandomGapDatetimeGenerator.generate(cat_size), np.random.randint(0, 100, cat_size), np.random.uniform(0, 100, cat_size), np.arange(cat_size).astype(str), - np.array([True, False]) + np.array([True, False]), ] ]) @@ -151,18 +130,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 2000.0, - } + }, } @@ -189,18 +162,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 
'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 2000.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 2000.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 2000.0, - } + }, } @@ -217,18 +184,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -238,24 +199,20 @@ class SingleIntegerNaNsGenerator(CategoricalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans(SingleIntegerGenerator.generate(num_rows).astype(float)) + return add_nans( + SingleIntegerGenerator.generate(num_rows).astype(float) + ) @staticmethod def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 500.0, - } + }, } @@ -272,18 +229,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -299,18 +250,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 2e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 2e-05, 'memory': 400.0}, + 
'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 500.0, - } + }, } @@ -326,18 +271,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.0004, - 'memory': 2000.0 - }, - 'transform': { - 'time': 0.0004, - 'memory': 500000.0 - }, + 'fit': {'time': 0.0004, 'memory': 2000.0}, + 'transform': {'time': 0.0004, 'memory': 500000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -353,18 +292,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.0004, - 'memory': 1000.0 - }, - 'transform': { - 'time': 0.0004, - 'memory': 1000000.0 - }, + 'fit': {'time': 0.0004, 'memory': 1000.0}, + 'transform': {'time': 0.0004, 'memory': 1000000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -380,18 +313,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.002, - 'memory': 2000.0 - }, - 'transform': { - 'time': 0.0004, - 'memory': 500000.0 - }, + 'fit': {'time': 0.002, 'memory': 2000.0}, + 'transform': {'time': 0.0004, 'memory': 500000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } @@ -407,16 +334,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 0.001, - 'memory': 1000.0 - }, - 'transform': { - 'time': 0.0005, - 'memory': 1000000.0 - }, + 'fit': {'time': 0.001, 'memory': 1000.0}, + 'transform': {'time': 0.0005, 'memory': 1000000.0}, 'reverse_transform': { 'time': 0.0005, 'memory': 1000000.0, - } + }, } diff --git a/rdt/performance/datasets/datetime.py b/rdt/performance/datasets/datetime.py index 295ae118..6a22981b 100644 --- a/rdt/performance/datasets/datetime.py +++ b/rdt/performance/datasets/datetime.py @@ -31,18 +31,12 @@ def 
generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -61,18 +55,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -89,18 +77,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -119,18 +101,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -143,7 +119,9 @@ def generate(num_rows): today = datetime.datetime.today() delta = datetime.timedelta - today = min(datetime.datetime.today(), pd.Timestamp.max - delta(num_rows)) + today = min( + datetime.datetime.today(), pd.Timestamp.max - delta(num_rows) + ) dates = [delta(i) + today for i in range(num_rows)] return np.array(dates, dtype='datetime64') @@ -152,18 +130,12 @@ def generate(num_rows): def get_performance_thresholds(): 
"""Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } @@ -177,7 +149,10 @@ def generate(num_rows): delta = datetime.timedelta today = datetime.datetime.today() - dates = [min(delta(weeks=i) + today, pd.Timestamp.max) for i in range(num_rows)] + dates = [ + min(delta(weeks=i) + today, pd.Timestamp.max) + for i in range(num_rows) + ] return np.array(dates, dtype='datetime64') @@ -185,16 +160,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 5e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 350.0 - }, + 'fit': {'time': 5e-05, 'memory': 500.0}, + 'transform': {'time': 5e-05, 'memory': 350.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/numerical.py b/rdt/performance/datasets/numerical.py index d092b660..dddc78f0 100644 --- a/rdt/performance/datasets/numerical.py +++ b/rdt/performance/datasets/numerical.py @@ -27,18 +27,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -48,24 +42,20 @@ class RandomIntegerNaNsGenerator(NumericalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans(RandomIntegerGenerator.generate(num_rows).astype(float)) + return add_nans( + RandomIntegerGenerator.generate(num_rows).astype(float) + ) @staticmethod def get_performance_thresholds(): 
"""Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } @@ -83,18 +73,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -104,24 +88,20 @@ class ConstantIntegerNaNsGenerator(NumericalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans(ConstantIntegerGenerator.generate(num_rows).astype(float)) + return add_nans( + ConstantIntegerGenerator.generate(num_rows).astype(float) + ) @staticmethod def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 600.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 600.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } @@ -142,18 +122,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 2000.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 1e-05, 'memory': 2000.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 2000.0, - } + }, } @@ -174,18 +148,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 
3e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 3e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } @@ -201,18 +169,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 1e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 1e-05, 'memory': 400.0, - } + }, } @@ -228,18 +190,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 4e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 4e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 350.0, - } + }, } @@ -255,18 +211,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 5e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 5e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 5e-05, 'memory': 400.0, - } + }, } @@ -282,16 +232,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-03, - 'memory': 2500.0 - }, - 'transform': { - 'time': 3e-05, - 'memory': 400.0 - }, + 'fit': {'time': 1e-03, 'memory': 2500.0}, + 'transform': {'time': 3e-05, 'memory': 400.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 350.0, - } + }, } diff --git a/rdt/performance/datasets/pii.py b/rdt/performance/datasets/pii.py index 523a0673..95c9cbf9 100644 --- a/rdt/performance/datasets/pii.py +++ b/rdt/performance/datasets/pii.py @@ -27,18 +27,12 @@ def 
generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 1e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 3e-05, 'memory': 1000.0, - } + }, } @@ -54,16 +48,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 3e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/datasets/text.py b/rdt/performance/datasets/text.py index 1039ae71..5473d982 100644 --- a/rdt/performance/datasets/text.py +++ b/rdt/performance/datasets/text.py @@ -27,18 +27,12 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 500.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 500.0 - }, + 'fit': {'time': 1e-05, 'memory': 500.0}, + 'transform': {'time': 1e-05, 'memory': 500.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } @@ -54,16 +48,10 @@ def generate(num_rows): def get_performance_thresholds(): """Return the expected threseholds.""" return { - 'fit': { - 'time': 1e-05, - 'memory': 400.0 - }, - 'transform': { - 'time': 1e-05, - 'memory': 1000.0 - }, + 'fit': {'time': 1e-05, 'memory': 400.0}, + 'transform': {'time': 1e-05, 'memory': 1000.0}, 'reverse_transform': { 'time': 2e-05, 'memory': 1000.0, - } + }, } diff --git a/rdt/performance/performance.py b/rdt/performance/performance.py index 989f015f..acff0d8e 100644 --- a/rdt/performance/performance.py +++ b/rdt/performance/performance.py @@ -11,23 +11,13 @@ TRANSFORMER_ARGS = { 'BinaryEncoder': { 
'missing_value_replacement': -1, - 'missing_value_generation': 'from_column' - }, - 'UnixTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'OptimizedTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'FloatFormatter': { - 'missing_value_generation': 'from_column' - }, - 'GaussianNormalizer': { - 'missing_value_generation': 'from_column' - }, - 'ClusterBasedNormalizer': { - 'missing_value_generation': 'from_column' + 'missing_value_generation': 'from_column', }, + 'UnixTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'OptimizedTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'FloatFormatter': {'missing_value_generation': 'from_column'}, + 'GaussianNormalizer': {'missing_value_generation': 'from_column'}, + 'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'}, } @@ -54,7 +44,9 @@ def _get_dataset_sizes(sdtype): return sizes -def evaluate_transformer_performance(transformer, dataset_generator, verbose=False): +def evaluate_transformer_performance( + transformer, dataset_generator, verbose=False +): """Evaluate the given transformer's performance against the given dataset generator. 
Args: @@ -87,11 +79,15 @@ def evaluate_transformer_performance(transformer, dataset_generator, verbose=Fal size = np.array([fit_size, transform_size, transform_size] * 2) performance = performance / size if verbose: - performance = performance.rename(lambda x: x + ' (s)' if 'Time' in x else x + ' (B)') + performance = performance.rename( + lambda x: x + ' (s)' if 'Time' in x else x + ' (B)' + ) performance['Number of fit rows'] = fit_size performance['Number of transform rows'] = transform_size performance['Dataset'] = dataset_generator.__name__ - performance['Transformer'] = f'{transformer.__module__ }.{transformer.get_name()}' + performance['Transformer'] = ( + f'{transformer.__module__}.{transformer.get_name()}' + ) out.append(performance) diff --git a/rdt/performance/profiling.py b/rdt/performance/profiling.py index d3ba9c0c..1ef1922e 100644 --- a/rdt/performance/profiling.py +++ b/rdt/performance/profiling.py @@ -10,7 +10,9 @@ import pandas as pd -def _profile_time(transformer, method_name, dataset, column=None, iterations=10, copy=False): +def _profile_time( + transformer, method_name, dataset, column=None, iterations=10, copy=False +): total_time = 0 for _ in range(iterations): if copy: @@ -47,14 +49,16 @@ def _profile_memory(method, dataset, column=None): peak_memory = ctx.Value('i', 0) profiling_process = ctx.Process( target=_set_memory_for_method, - args=(method, dataset, column, peak_memory) + args=(method, dataset, column, peak_memory), ) profiling_process.start() profiling_process.join() return peak_memory.value -def profile_transformer(transformer, dataset_generator, transform_size, fit_size=None): +def profile_transformer( + transformer, dataset_generator, transform_size, fit_size=None +): """Profile a Transformer on a dataset. 
This function will get the total time and peak memory @@ -82,16 +86,24 @@ def profile_transformer(transformer, dataset_generator, transform_size, fit_size replace = transform_size > fit_size transform_dataset = fit_dataset.sample(transform_size, replace=replace) - fit_time = _profile_time(transformer, 'fit', fit_dataset, column='test', copy=True) + fit_time = _profile_time( + transformer, 'fit', fit_dataset, column='test', copy=True + ) fit_memory = _profile_memory(transformer.fit, fit_dataset, column='test') transformer.fit(fit_dataset, 'test') transform_time = _profile_time(transformer, 'transform', transform_dataset) - transform_memory = _profile_memory(transformer.transform, transform_dataset) + transform_memory = _profile_memory( + transformer.transform, transform_dataset + ) reverse_dataset = transformer.transform(transform_dataset) - reverse_time = _profile_time(transformer, 'reverse_transform', reverse_dataset) - reverse_memory = _profile_memory(transformer.reverse_transform, reverse_dataset) + reverse_time = _profile_time( + transformer, 'reverse_transform', reverse_dataset + ) + reverse_memory = _profile_memory( + transformer.reverse_transform, reverse_dataset + ) return pd.Series({ 'Fit Time': fit_time, @@ -99,5 +111,5 @@ def profile_transformer(transformer, dataset_generator, transform_size, fit_size 'Transform Time': transform_time, 'Transform Memory': transform_memory, 'Reverse Transform Time': reverse_time, - 'Reverse Transform Memory': reverse_memory + 'Reverse Transform Memory': reverse_memory, }) diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index ac214b1d..968639d0 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -9,12 +9,28 @@ from rdt.transformers.base import BaseMultiColumnTransformer, BaseTransformer from rdt.transformers.boolean import BinaryEncoder from rdt.transformers.categorical import ( - CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, - 
OrderedUniformEncoder, UniformEncoder) -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder + CustomLabelEncoder, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, + UniformEncoder, +) +from rdt.transformers.datetime import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) from rdt.transformers.null import NullTransformer -from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker +from rdt.transformers.numerical import ( + ClusterBasedNormalizer, + FloatFormatter, + GaussianNormalizer, +) +from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) from rdt.transformers.text import IDGenerator, RegexGenerator __all__ = [ @@ -64,7 +80,9 @@ def get_transformer_name(transformer): if inspect.isclass(transformer): return transformer.__module__ + '.' + transformer.get_name() - raise ValueError(f'The transformer {transformer} must be passed as a class.') + raise ValueError( + f'The transformer {transformer} must be passed as a class.' + ) TRANSFORMERS = { @@ -95,7 +113,10 @@ def get_class_by_transformer_name(): BaseTransformer: BaseTransformer subclass class object. 
""" - return {class_.get_name(): class_ for class_ in BaseTransformer.get_subclasses()} + return { + class_.get_name(): class_ + for class_ in BaseTransformer.get_subclasses() + } def get_transformer_class(transformer): @@ -144,7 +165,7 @@ def get_default_transformers(): """ transformers_by_type = get_transformers_by_type() defaults = deepcopy(DEFAULT_TRANSFORMERS) - for (sdtype, transformers) in transformers_by_type.items(): + for sdtype, transformers in transformers_by_type.items(): if sdtype not in defaults: defaults[sdtype] = transformers[0]() diff --git a/rdt/transformers/_validators.py b/rdt/transformers/_validators.py index 5c5f7fd8..6e9c0a64 100644 --- a/rdt/transformers/_validators.py +++ b/rdt/transformers/_validators.py @@ -1,4 +1,5 @@ """Validations for multi-column transformers.""" + import importlib from rdt.errors import TransformerInputError @@ -22,9 +23,7 @@ def _validate_supported_sdtypes(cls, columns_to_sdtypes): message += f"Column '{column}' has an unsupported sdtype '{sdtype}'.\n" if message: - message += ( - f'Please provide a column that is compatible with {cls.VALIDATION_TYPE} data.' - ) + message += f'Please provide a column that is compatible with {cls.VALIDATION_TYPE} data.' 
raise TransformerInputError(message) @classmethod @@ -60,8 +59,14 @@ class AddressValidator(BaseValidator): """Validation class for Address data.""" SUPPORTED_SDTYPES = [ - 'country_code', 'administrative_unit', 'city', 'postcode', - 'street_address', 'secondary_address', 'state', 'state_abbr' + 'country_code', + 'administrative_unit', + 'city', + 'postcode', + 'street_address', + 'secondary_address', + 'state', + 'state_abbr', ] VALIDATION_TYPE = 'Address' @@ -83,14 +88,18 @@ def _validate_uniqueness_sdtype(columns_to_sdtypes): sdtypes_to_columns[sdtype].append(column) duplicate_fields = { - value: keys for value, keys in sdtypes_to_columns.items() if len(keys) > 1 + value: keys + for value, keys in sdtypes_to_columns.items() + if len(keys) > 1 } if duplicate_fields: message = '' for sdtype, columns in duplicate_fields.items(): to_print = "', '".join(columns) - message += f"Columns '{to_print}' have the same sdtype '{sdtype}'.\n" + message += ( + f"Columns '{to_print}' have the same sdtype '{sdtype}'.\n" + ) message += 'Your address data cannot have duplicate fields.' raise TransformerInputError(message) @@ -98,7 +107,9 @@ def _validate_uniqueness_sdtype(columns_to_sdtypes): @classmethod def _validate_administrative_unit(cls, columns_to_sdtypes): num_column_administrative_unit = sum( - 1 for itm in columns_to_sdtypes.values() if itm in ['administrative_unit', 'state'] + 1 + for itm in columns_to_sdtypes.values() + if itm in ['administrative_unit', 'state'] ) if num_column_administrative_unit > 1: raise TransformerInputError( @@ -117,12 +128,12 @@ def validate_sdtypes(cls, columns_to_sdtypes): @classmethod def validate_imports(cls): """Check that the address transformers can be imported.""" - error_message = ( - 'You must have SDV Enterprise with the address add-on to use the address features.' - ) + error_message = 'You must have SDV Enterprise with the address add-on to use the address features.' 
try: - address_module = importlib.import_module('rdt.transformers.address') + address_module = importlib.import_module( + 'rdt.transformers.address' + ) except ModuleNotFoundError: raise ImportError(error_message) from None @@ -140,7 +151,9 @@ class GPSValidator(BaseValidator): @staticmethod def _validate_uniqueness_sdtype(columns_to_sdtypes): - sdtypes_to_columns = {sdtype: column for column, sdtype in columns_to_sdtypes.items()} + sdtypes_to_columns = { + sdtype: column for column, sdtype in columns_to_sdtypes.items() + } if len(sdtypes_to_columns) != 2: raise TransformerInputError( 'The GPS columns must have one latitude and on longitude columns sdtypes. ' @@ -156,16 +169,18 @@ def validate_sdtypes(cls, columns_to_sdtypes): @classmethod def validate_imports(cls): """Check that the GPS transformers can be imported.""" - error_message = ( - 'You must have SDV Enterprise with the gps add-on to use the GPS features.' - ) + error_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features.' try: gps_module = importlib.import_module('rdt.transformers.gps') except ModuleNotFoundError: raise ImportError(error_message) from None - required_classes = ['RandomLocationGenerator', 'MetroAreaAnonymizer', 'GPSNoiser'] + required_classes = [ + 'RandomLocationGenerator', + 'MetroAreaAnonymizer', + 'GPSNoiser', + ] for class_name in required_classes: if not hasattr(gps_module, class_name): raise ImportError(error_message) diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py index 91734ec5..b202ef05 100644 --- a/rdt/transformers/base.py +++ b/rdt/transformers/base.py @@ -1,4 +1,5 @@ """BaseTransformer module.""" + import abc import contextlib import hashlib @@ -45,13 +46,16 @@ def random_state(function): function (Callable): The function to wrap around. 
""" + @wraps(function) def wrapper(self, *args, **kwargs): if self.random_states is None: return function(self, *args, **kwargs) method_name = function.__name__ - with set_random_states(self.random_states, method_name, self.set_random_state): + with set_random_states( + self.random_states, method_name, self.set_random_state + ): return function(self, *args, **kwargs) return wrapper @@ -78,11 +82,13 @@ class BaseTransformer: missing_value_generation = None def __init__(self): - self.output_properties = {None: {'sdtype': 'float', 'next_transformer': None}} + self.output_properties = { + None: {'sdtype': 'float', 'next_transformer': None} + } self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': None, - 'reverse_transform': None + 'reverse_transform': None, } def set_random_state(self, state, method_name): @@ -106,7 +112,7 @@ def reset_randomization(self): self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': np.random.RandomState(self.random_seed), - 'reverse_transform': np.random.RandomState(self.random_seed + 1) + 'reverse_transform': np.random.RandomState(self.random_seed + 1), } @property @@ -115,7 +121,7 @@ def model_missing_values(self): warnings.warn( "Future versions of RDT will not support the 'model_missing_values' parameter. " "Please switch to using the 'missing_value_generation' parameter instead.", - FutureWarning + FutureWarning, ) return self.missing_value_generation == 'from_column' @@ -132,18 +138,22 @@ def _set_model_missing_values(self, model_missing_values): warnings.warn( "Future versions of RDT will not support the 'model_missing_values' parameter. 
" "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.', FutureWarning + 'strategy.', + FutureWarning, ) if model_missing_values is True: self._set_missing_value_generation('from_column') elif model_missing_values is False: self._set_missing_value_generation('random') - def _set_missing_value_replacement(self, default, missing_value_replacement): + def _set_missing_value_replacement( + self, default, missing_value_replacement + ): if missing_value_replacement is None: warnings.warn( "Setting 'missing_value_replacement' to 'None' is no longer supported. " - f"Imputing with the '{default}' instead.", FutureWarning + f"Imputing with the '{default}' instead.", + FutureWarning, ) self.missing_value_replacement = default else: @@ -186,7 +196,7 @@ def get_input_sdtype(cls): """ warnings.warn( '`get_input_sdtype` is deprecated. Please use `get_supported_sdtypes` instead.', - FutureWarning + FutureWarning, ) return cls.get_supported_sdtypes()[0] @@ -209,7 +219,9 @@ def _get_output_to_property(self, property_): if output_column is None: output[f'{self.column_prefix}'] = properties[property_] else: - output[f'{self.column_prefix}.{output_column}'] = properties[property_] + output[f'{self.column_prefix}.{output_column}'] = properties[ + property_ + ] return output @@ -294,12 +306,16 @@ def _add_columns_to_data(data, transformed_data, transformed_names): """ if transformed_names: if isinstance(transformed_data, (pd.Series, np.ndarray)): - transformed_data = pd.DataFrame(transformed_data, columns=transformed_names) + transformed_data = pd.DataFrame( + transformed_data, columns=transformed_names + ) # When '#' is added to the column_prefix of a transformer # the columns of transformed_data and transformed_names don't match transformed_data.columns = transformed_names - data = pd.concat([data, transformed_data.set_index(data.index)], axis=1) + data = pd.concat( + [data, transformed_data.set_index(data.index)], axis=1 + ) return data @@ 
-366,12 +382,16 @@ def _set_seed(self, data): for value in data.head(5): hash_value += str(value) - hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) - self.random_seed = hash_value % ((2 ** 32) - 1) # maximum value for a seed + hash_value = int( + hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16 + ) + self.random_seed = hash_value % ( + (2**32) - 1 + ) # maximum value for a seed self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': np.random.RandomState(self.random_seed), - 'reverse_transform': np.random.RandomState(self.random_seed + 1) + 'reverse_transform': np.random.RandomState(self.random_seed + 1), } @random_state @@ -423,7 +443,9 @@ def transform(self, data): columns_data = self._get_columns_data(data, self.columns) transformed_data = self._transform(columns_data) data = data.drop(self.columns, axis=1) - data = self._add_columns_to_data(data, transformed_data, self.output_columns) + data = self._add_columns_to_data( + data, transformed_data, self.output_columns + ) return data @@ -536,7 +558,9 @@ def _get_output_to_property(self, property_): if self.column_prefix is None: output[f'{output_column}'] = properties[property_] else: - output[f'{self.column_prefix}.{output_column}'] = properties[property_] + output[f'{self.column_prefix}.{output_column}'] = properties[ + property_ + ] return output @@ -545,7 +569,9 @@ def _validate_columns_to_sdtypes(self, data, columns_to_sdtypes): missing = set(columns_to_sdtypes.keys()) - set(data.columns) if missing: missing_to_print = ', '.join(missing) - raise ValueError(f'Columns ({missing_to_print}) are not present in the data.') + raise ValueError( + f'Columns ({missing_to_print}) are not present in the data.' 
+ ) @classmethod def _validate_sdtypes(cls, columns_to_sdtypes): diff --git a/rdt/transformers/boolean.py b/rdt/transformers/boolean.py index de2cd5fe..77fd60d1 100644 --- a/rdt/transformers/boolean.py +++ b/rdt/transformers/boolean.py @@ -39,11 +39,17 @@ class BinaryEncoder(BaseTransformer): INPUT_SDTYPE = 'boolean' null_transformer = None - def __init__(self, missing_value_replacement='mode', model_missing_values=None, - missing_value_generation='random'): + def __init__( + self, + missing_value_replacement='mode', + model_missing_values=None, + missing_value_generation='random', + ): super().__init__() self._set_missing_value_generation(missing_value_generation) - self._set_missing_value_replacement('random', missing_value_replacement) + self._set_missing_value_replacement( + 'random', missing_value_replacement + ) if model_missing_values is not None: self._set_model_missing_values(model_missing_values) @@ -55,12 +61,14 @@ def _fit(self, data): Data to fit to. """ self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(data) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] = { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform boolean to float. 
diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 2f07f663..cce5bd50 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -9,7 +9,11 @@ from rdt.errors import TransformerInputError from rdt.transformers.base import BaseTransformer -from rdt.transformers.utils import check_nan_in_transform, fill_nan_with_none, try_convert_to_dtype +from rdt.transformers.utils import ( + check_nan_in_transform, + fill_nan_with_none, + try_convert_to_dtype, +) LOGGER = logging.getLogger(__name__) @@ -55,7 +59,9 @@ def _order_categories(self, unique_data): nans = pd.isna(unique_data) if self.order_by == 'alphabetical': # pylint: disable=invalid-unary-operand-type - if any(map(lambda item: not isinstance(item, str), unique_data[~nans])): # noqa: C417 + if any( + map(lambda item: not isinstance(item, str), unique_data[~nans]) + ): # noqa: C417 raise TransformerInputError( "The data must be of type string if order_by is 'alphabetical'." ) @@ -84,7 +90,9 @@ def _get_message_unseen_categories(cls, unseen_categories): """ categories_to_print = ', '.join(str(x) for x in unseen_categories[:3]) if len(unseen_categories) > 3: - categories_to_print = f'{categories_to_print}, +{len(unseen_categories) - 3} more' + categories_to_print = ( + f'{categories_to_print}, +{len(unseen_categories) - 3} more' + ) return categories_to_print @@ -129,7 +137,9 @@ def _fit(self, data): nan_value = freq[np.nan] if np.nan in freq.index else None freq = freq.reindex(labels, fill_value=nan_value).array - self.frequencies, self.intervals = self._compute_frequencies_intervals(labels, freq) + self.frequencies, self.intervals = self._compute_frequencies_intervals( + labels, freq + ) def _transform(self, data): """Map the category to a continuous value. 
@@ -149,21 +159,27 @@ def _transform(self, data): if unseen_indexes.any(): # Keep the 3 first unseen categories unseen_categories = list(data.loc[unseen_indexes].unique()) - categories_to_print = self._get_message_unseen_categories(unseen_categories) + categories_to_print = self._get_message_unseen_categories( + unseen_categories + ) warnings.warn( f"The data in column '{self.get_input_column()}' contains new categories " f"that did not appear during 'fit' ({categories_to_print}). Assigning " 'them random values. If you want to model new categories, ' "please fit the data again using 'fit'.", - category=UserWarning + category=UserWarning, ) choices = list(self.frequencies.keys()) size = unseen_indexes.size - data_with_none[unseen_indexes] = np.random.choice(choices, size=size) + data_with_none[unseen_indexes] = np.random.choice( + choices, size=size + ) def map_labels(label): - return np.random.uniform(self.intervals[label][0], self.intervals[label][1]) + return np.random.uniform( + self.intervals[label][0], self.intervals[label][1] + ) return data_with_none.map(map_labels).astype(float) @@ -257,16 +273,18 @@ def _fit(self, data): data = fill_nan_with_none(data) self._check_unknown_categories(data) - category_not_seen = (set(self.order.dropna()) != set(data.dropna())) - nans_not_seen = (pd.isna(self.order).any() and not pd.isna(data).any()) + category_not_seen = set(self.order.dropna()) != set(data.dropna()) + nans_not_seen = pd.isna(self.order).any() and not pd.isna(data).any() if category_not_seen or nans_not_seen: unseen_categories = [x for x in self.order if x not in data.array] - categories_to_print = self._get_message_unseen_categories(unseen_categories) + categories_to_print = self._get_message_unseen_categories( + unseen_categories + ) LOGGER.info( "For column '%s', some of the provided category values were not present in the" ' data during fit: (%s).', self.get_input_column(), - categories_to_print + categories_to_print, ) freq = 
data.value_counts(normalize=True, dropna=False) @@ -280,7 +298,9 @@ def _fit(self, data): nan_value = freq[np.nan] if np.nan in freq.index else None freq = freq.reindex(self.order, fill_value=nan_value).array - self.frequencies, self.intervals = self._compute_frequencies_intervals(self.order, freq) + self.frequencies, self.intervals = self._compute_frequencies_intervals( + self.order, freq + ) def _transform(self, data): """Map the category to a continuous value.""" @@ -333,7 +353,7 @@ def __init__(self, add_noise=False): warnings.warn( "The 'FrequencyEncoder' transformer will no longer be supported in future versions " "of the RDT library. Please use the 'UniformEncoder' transformer instead.", - FutureWarning + FutureWarning, ) super().__init__() self.add_noise = add_noise @@ -363,12 +383,15 @@ def tie_breaker(element): if pd.isna(element): return data_is_na.loc[data_is_na == 1].index[0] - return data_with_new_index.loc[data_with_new_index == element].index[0] + return data_with_new_index.loc[ + data_with_new_index == element + ].index[0] - augmented_frequencies[sortable_column_name] = frequencies.index.map(tie_breaker) + augmented_frequencies[sortable_column_name] = frequencies.index.map( + tie_breaker + ) augmented_frequencies = augmented_frequencies.sort_values( - [column_name, sortable_column_name], - ascending=[False, True] + [column_name, sortable_column_name], ascending=[False, True] ) sorted_frequencies = augmented_frequencies[column_name] @@ -393,7 +416,9 @@ def tie_breaker(element): start = end means = pd.Series(means, index=list(frequencies.keys())) - starts = pd.DataFrame(starts, columns=['category', 'start']).set_index('start') + starts = pd.DataFrame(starts, columns=['category', 'start']).set_index( + 'start' + ) return intervals, means, starts @@ -423,7 +448,7 @@ def _clip_noised_transform(result, start, end): def _transform_by_category(self, data): """Transform the data by iterating over the different categories.""" - result = 
np.empty(shape=(len(data), ), dtype=float) + result = np.empty(shape=(len(data),), dtype=float) # loop over categories for category, values in self.intervals.items(): @@ -435,11 +460,14 @@ def _transform_by_category(self, data): if self.add_noise: result[mask] = norm.rvs( - mean, std, + mean, + std, size=mask.sum(), - random_state=self.random_states['transform'] + random_state=self.random_states['transform'], + ) + result[mask] = self._clip_noised_transform( + result[mask], start, end ) - result[mask] = self._clip_noised_transform(result[mask], start, end) else: result[mask] = mean @@ -453,14 +481,21 @@ def _get_value(self, category): start, end, mean, std = self.intervals[category] if self.add_noise: - result = norm.rvs(mean, std, random_state=self.random_states['transform']) + result = norm.rvs( + mean, std, random_state=self.random_states['transform'] + ) return self._clip_noised_transform(result, start, end) return mean def _transform_by_row(self, data): """Transform the data row by row.""" - data = data.infer_objects().fillna(np.nan).apply(self._get_value).to_numpy() + data = ( + data.infer_objects() + .fillna(np.nan) + .apply(self._get_value) + .to_numpy() + ) return data @@ -476,7 +511,9 @@ def _transform(self, data): """ fit_categories = pd.Series(self.intervals.keys()) has_nan = pd.isna(fit_categories).any() - unseen_indexes = ~(data.isin(fit_categories) | (pd.isna(data) & has_nan)) + unseen_indexes = ~( + data.isin(fit_categories) | (pd.isna(data) & has_nan) + ) if unseen_indexes.any(): # Select only the first 5 unseen categories to avoid flooding the console. unseen_categories = set(data[unseen_indexes][:5]) @@ -487,7 +524,9 @@ def _transform(self, data): 'please fit the transformer again with the new data.' 
) - data[unseen_indexes] = np.random.choice(fit_categories, size=unseen_indexes.size) + data[unseen_indexes] = np.random.choice( + fit_categories, size=unseen_indexes.size + ) if len(self.means) < len(data): return self._transform_by_category(data) @@ -495,7 +534,7 @@ def _transform(self, data): def _reverse_transform_by_category(self, data): """Reverse transform the data by iterating over all the categories.""" - result = np.empty(shape=(len(data), ), dtype=self.dtype) + result = np.empty(shape=(len(data),), dtype=self.dtype) # loop over categories for category, values in self.intervals.items(): @@ -644,7 +683,9 @@ def _transform(self, data): """ data = self._prepare_data(data) unique_data = {np.nan if pd.isna(x) else x for x in pd.unique(data)} - unseen_categories = unique_data - {np.nan if pd.isna(x) else x for x in self.dummies} + unseen_categories = unique_data - { + np.nan if pd.isna(x) else x for x in self.dummies + } if unseen_categories: # Select only the first 5 unseen categories to avoid flooding the console. examples_unseen_categories = set(list(unseen_categories)[:5]) @@ -781,7 +822,9 @@ def _transform(self, data): Returns: pd.Series """ - mapped = data.infer_objects().fillna(np.nan).map(self.categories_to_values) + mapped = ( + data.infer_objects().fillna(np.nan).map(self.categories_to_values) + ) is_null = mapped.isna() if is_null.any(): # Select only the first 5 unseen categories to avoid flooding the console. 
@@ -794,8 +837,7 @@ def _transform(self, data): ) mapped[is_null] = np.random.randint( - len(self.categories_to_values), - size=is_null.sum() + len(self.categories_to_values), size=is_null.sum() ) if self.add_noise: @@ -818,7 +860,9 @@ def _reverse_transform(self, data): if self.add_noise: data = np.floor(data) - data = data.clip(min(self.values_to_categories), max(self.values_to_categories)) + data = data.clip( + min(self.values_to_categories), max(self.values_to_categories) + ) data = data.round().map(self.values_to_categories) data = try_convert_to_dtype(data, self.dtype) @@ -906,6 +950,7 @@ class CustomLabelEncoder(OrderedLabelEncoder): def __init__(self, order, add_noise=False): warnings.warn( "The 'CustomLabelEncoder' is renamed to 'OrderedLabelEncoder'. Please update the" - 'name to ensure compatibility with future versions of RDT.', FutureWarning + 'name to ensure compatibility with future versions of RDT.', + FutureWarning, ) super().__init__(order, add_noise) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 28ad451f..7c1e35b4 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -1,4 +1,5 @@ """Transformer for datetime data.""" + import numpy as np import pandas as pd from pandas.api.types import is_datetime64_dtype, is_numeric_dtype @@ -49,9 +50,14 @@ class UnixTimestampEncoder(BaseTransformer): _min_value = None _max_value = None - def __init__(self, missing_value_replacement='mean', model_missing_values=None, - datetime_format=None, missing_value_generation='random', - enforce_min_max_values=False): + def __init__( + self, + missing_value_replacement='mean', + model_missing_values=None, + datetime_format=None, + missing_value_generation='random', + enforce_min_max_values=False, + ): super().__init__() self._set_missing_value_replacement('mean', missing_value_replacement) self._set_missing_value_generation(missing_value_generation) @@ -86,16 +92,22 @@ def _convert_to_datetime(self, data): try: 
pandas_datetime_format = None if self.datetime_format: - pandas_datetime_format = self.datetime_format.replace('%-', '%') + pandas_datetime_format = self.datetime_format.replace( + '%-', '%' + ) data = pd.to_datetime(data, format=pandas_datetime_format) except ValueError as error: - if 'Unknown string' in str(error) or 'Unknown datetime string' in str(error): + if 'Unknown string' in str( + error + ) or 'Unknown datetime string' in str(error): message = 'Data must be of dtype datetime, or castable to datetime.' raise TypeError(message) from None - raise ValueError('Data does not match specified datetime format.') from None + raise ValueError( + 'Data does not match specified datetime format.' + ) from None return data @@ -103,7 +115,11 @@ def _transform_helper(self, datetimes): """Transform datetime values to integer.""" datetimes = self._convert_to_datetime(datetimes) nulls = datetimes.isna() - integers = pd.to_numeric(datetimes, errors='coerce').to_numpy().astype(np.float64) + integers = ( + pd.to_numeric(datetimes, errors='coerce') + .to_numpy() + .astype(np.float64) + ) integers[nulls] = np.nan transformed = pd.Series(integers) @@ -128,7 +144,9 @@ def _fit(self, data): self._dtype = data.dtype if self.datetime_format is None: datetime_array = data[data.notna()].astype(str).to_numpy() - self.datetime_format = _guess_datetime_format_for_array(datetime_array) + self.datetime_format = _guess_datetime_format_for_array( + datetime_array + ) transformed = self._transform_helper(data) if self.enforce_min_max_values: @@ -136,12 +154,14 @@ def _fit(self, data): self._max_value = transformed.max() self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(transformed) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] 
= { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform datetime values to float values. @@ -172,15 +192,22 @@ def _reverse_transform(self, data): data = self._reverse_transform_helper(data) datetime_data = pd.to_datetime(data) if self.datetime_format: - if is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format: + if ( + is_datetime64_dtype(self._dtype) + and '.%f' not in self.datetime_format + ): datetime_data = pd.to_datetime( datetime_data.dt.strftime(self.datetime_format), format=self.datetime_format, ) else: - datetime_data = datetime_data.dt.strftime(self.datetime_format).astype(self._dtype) + datetime_data = datetime_data.dt.strftime( + self.datetime_format + ).astype(self._dtype) elif is_numeric_dtype(self._dtype): - datetime_data = pd.to_numeric(datetime_data.astype('object'), errors='coerce') + datetime_data = pd.to_numeric( + datetime_data.astype('object'), errors='coerce' + ) datetime_data = datetime_data.astype(self._dtype) return datetime_data @@ -229,14 +256,21 @@ class OptimizedTimestampEncoder(UnixTimestampEncoder): divider = None - def __init__(self, missing_value_replacement=None, model_missing_values=None, - datetime_format=None, missing_value_generation='random', - enforce_min_max_values=False): - super().__init__(missing_value_replacement=missing_value_replacement, - missing_value_generation=missing_value_generation, - enforce_min_max_values=enforce_min_max_values, - model_missing_values=model_missing_values, - datetime_format=datetime_format) + def __init__( + self, + missing_value_replacement=None, + model_missing_values=None, + datetime_format=None, + missing_value_generation='random', + enforce_min_max_values=False, + ): + super().__init__( + missing_value_replacement=missing_value_replacement, + missing_value_generation=missing_value_generation, + enforce_min_max_values=enforce_min_max_values, + model_missing_values=model_missing_values, + datetime_format=datetime_format, + ) def 
_find_divider(self, transformed): self.divider = 1 diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index c10b5295..44386151 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -10,7 +10,7 @@ LOGGER = logging.getLogger(__name__) -class NullTransformer(): +class NullTransformer: """Transformer for data that contains Null values. Args: @@ -36,7 +36,9 @@ class NullTransformer(): _missing_value_replacement = None _null_percentage = None - def __init__(self, missing_value_replacement=None, missing_value_generation='random'): + def __init__( + self, missing_value_replacement=None, missing_value_generation='random' + ): self._missing_value_replacement = missing_value_replacement if missing_value_generation not in (None, 'from_column', 'random'): raise TransformerInputError( @@ -76,7 +78,10 @@ def _get_missing_value_replacement(self, data): if self._missing_value_replacement is None: return None - if self._missing_value_replacement in {'mean', 'mode', 'random'} and pd.isna(data).all(): + if ( + self._missing_value_replacement in {'mean', 'mode', 'random'} + and pd.isna(data).all() + ): msg = ( f"'missing_value_replacement' cannot be set to '{self._missing_value_replacement}'" ' when the provided data only contains NaNs. Using 0 instead.' @@ -101,7 +106,9 @@ def fit(self, data): data (pandas.Series): Data to transform. 
""" - self._missing_value_replacement = self._get_missing_value_replacement(data) + self._missing_value_replacement = self._get_missing_value_replacement( + data + ) if self._missing_value_replacement == 'random': self._min_value = data.min() self._max_value = data.max() @@ -135,18 +142,20 @@ def transform(self, data): """ isna = data.isna() if self._missing_value_replacement == 'random': - data_mask = list(np.random.uniform( - low=self._min_value, - high=self._max_value, - size=len(data) - )) + data_mask = list( + np.random.uniform( + low=self._min_value, high=self._max_value, size=len(data) + ) + ) data = data.mask(data.isna(), data_mask) elif isna.any() and self._missing_value_replacement is not None: data = data.infer_objects().fillna(self._missing_value_replacement) if self._missing_value_generation == 'from_column': - return pd.concat([data, isna.astype(np.float64)], axis=1).to_numpy() + return pd.concat( + [data, isna.astype(np.float64)], axis=1 + ).to_numpy() return data.to_numpy() @@ -172,7 +181,7 @@ def reverse_transform(self, data): data = data[:, 0] elif self.nulls: - isna = np.random.random((len(data), )) < self._null_percentage + isna = np.random.random((len(data),)) < self._null_percentage data = pd.Series(data) diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 94be5b7d..4b90b3b9 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -1,4 +1,5 @@ """Transformers for numerical data.""" + import copy import warnings @@ -13,10 +14,10 @@ EPSILON = np.finfo(np.float32).eps INTEGER_BOUNDS = { - 'Int8': (-2**7, 2**7 - 1), - 'Int16': (-2**15, 2**15 - 1), - 'Int32': (-2**31, 2**31 - 1), - 'Int64': (-2**63, 2**63 - 1), + 'Int8': (-(2**7), 2**7 - 1), + 'Int16': (-(2**15), 2**15 - 1), + 'Int32': (-(2**31), 2**31 - 1), + 'Int64': (-(2**63), 2**63 - 1), 'UInt8': (0, 2**8 - 1), 'UInt16': (0, 2**16 - 1), 'UInt32': (0, 2**32 - 1), @@ -73,9 +74,15 @@ class FloatFormatter(BaseTransformer): _min_value = None 
_max_value = None - def __init__(self, missing_value_replacement='mean', model_missing_values=None, - learn_rounding_scheme=False, enforce_min_max_values=False, - computer_representation='Float', missing_value_generation='random'): + def __init__( + self, + missing_value_replacement='mean', + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + computer_representation='Float', + missing_value_generation='random', + ): super().__init__() self.missing_value_replacement = missing_value_replacement self._set_missing_value_generation(missing_value_generation) @@ -87,7 +94,9 @@ def __init__(self, missing_value_replacement='mean', model_missing_values=None, self.enforce_min_max_values = enforce_min_max_values self.computer_representation = computer_representation - def _raise_out_of_bounds_error(self, value, name, bound_type, min_bound, max_bound): + def _raise_out_of_bounds_error( + self, value, name, bound_type, min_bound, max_bound + ): raise ValueError( f"The {bound_type} value in column '{name}' is {value}." f" All values represented by '{self.computer_representation}'" @@ -108,11 +117,13 @@ def _validate_values_within_bounds(self, data): min_bound, max_bound = INTEGER_BOUNDS[self.computer_representation] if min_value < min_bound: self._raise_out_of_bounds_error( - min_value, data.name, 'minimum', min_bound, max_bound) + min_value, data.name, 'minimum', min_bound, max_bound + ) if max_value > max_bound: self._raise_out_of_bounds_error( - max_value, data.name, 'maximum', min_bound, max_bound) + max_value, data.name, 'maximum', min_bound, max_bound + ) def _fit(self, data): """Fit the transformer to the data. 
@@ -132,12 +143,14 @@ def _fit(self, data): self._rounding_digits = learn_rounding_digits(data) self.null_transformer = NullTransformer( - self.missing_value_replacement, - self.missing_value_generation + self.missing_value_replacement, self.missing_value_generation ) self.null_transformer.fit(data) if self.null_transformer.models_missing_values(): - self.output_properties['is_null'] = {'sdtype': 'float', 'next_transformer': None} + self.output_properties['is_null'] = { + 'sdtype': 'float', + 'next_transformer': None, + } def _transform(self, data): """Transform numerical data. @@ -246,13 +259,15 @@ class GaussianNormalizer(FloatFormatter): _DEPRECATED_DISTRIBUTIONS_MAPPING = { 'gaussian': 'norm', 'student_t': 't', - 'truncated_gaussian': 'truncnorm' + 'truncated_gaussian': 'truncnorm', } @staticmethod def _get_distributions(): try: - from copulas import univariate # pylint: disable=import-outside-toplevel + from copulas import ( + univariate, # pylint: disable=import-outside-toplevel + ) except ImportError as error: error.msg += ( '\n\nIt seems like `copulas` is not installed.\n' @@ -270,10 +285,14 @@ def _get_distributions(): 'uniform': univariate.UniformUnivariate, } - def __init__(self, model_missing_values=None, learn_rounding_scheme=False, - enforce_min_max_values=False, distribution='truncated_gaussian', - missing_value_generation='random'): - + def __init__( + self, + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + distribution='truncated_gaussian', + missing_value_generation='random', + ): # Using missing_value_replacement='mean' as the default instead of random # as this may lead to different outcomes in certain synthesizers # affecting the synthesizers directly and this is out of scope for now. 
@@ -282,7 +301,7 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, missing_value_generation=missing_value_generation, missing_value_replacement='mean', learn_rounding_scheme=learn_rounding_scheme, - enforce_min_max_values=enforce_min_max_values + enforce_min_max_values=enforce_min_max_values, ) self._distributions = self._get_distributions() @@ -292,9 +311,11 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, f"Future versions of RDT will not support '{distribution}' as an option. " f"Please use '{self._DEPRECATED_DISTRIBUTIONS_MAPPING[distribution]}' " 'instead.', - FutureWarning + FutureWarning, ) - distribution = self._DEPRECATED_DISTRIBUTIONS_MAPPING[distribution] + distribution = self._DEPRECATED_DISTRIBUTIONS_MAPPING[ + distribution + ] distribution = self._distributions[distribution] @@ -302,11 +323,17 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, def _get_univariate(self): distribution = self._distribution - if any(isinstance(distribution, dist) for dist in self._distributions.values()): + if any( + isinstance(distribution, dist) + for dist in self._distributions.values() + ): return copy.deepcopy(distribution) if isinstance(distribution, tuple): return distribution[0](**distribution[1]) - if isinstance(distribution, type) and distribution in self._distributions.values(): + if ( + isinstance(distribution, type) + and distribution in self._distributions.values() + ): return distribution() raise TypeError(f'Invalid distribution: {distribution}') @@ -423,10 +450,15 @@ class ClusterBasedNormalizer(FloatFormatter): _bgm_transformer = None valid_component_indicator = None - def __init__(self, model_missing_values=None, learn_rounding_scheme=False, - enforce_min_max_values=False, max_clusters=10, weight_threshold=0.005, - missing_value_generation='random'): - + def __init__( + self, + model_missing_values=None, + learn_rounding_scheme=False, + enforce_min_max_values=False, + 
max_clusters=10, + weight_threshold=0.005, + missing_value_generation='random', + ): # Using missing_value_replacement='mean' as the default instead of random # as this may lead to different outcomes in certain synthesizers # affecting the synthesizers directly and this is out of scope for now. @@ -435,7 +467,7 @@ def __init__(self, model_missing_values=None, learn_rounding_scheme=False, missing_value_generation=missing_value_generation, missing_value_replacement='mean', learn_rounding_scheme=learn_rounding_scheme, - enforce_min_max_values=enforce_min_max_values + enforce_min_max_values=enforce_min_max_values, ) self.max_clusters = max_clusters self.weight_threshold = weight_threshold @@ -461,7 +493,7 @@ def _fit(self, data): n_components=self.max_clusters, weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.001, - random_state=self._get_current_random_seed() + random_state=self._get_current_random_seed(), ) super()._fit(data) @@ -473,7 +505,9 @@ def _fit(self, data): warnings.simplefilter('ignore') self._bgm_transformer.fit(data.reshape(-1, 1)) - self.valid_component_indicator = self._bgm_transformer.weights_ > self.weight_threshold + self.valid_component_indicator = ( + self._bgm_transformer.weights_ > self.weight_threshold + ) def _transform(self, data): """Transform the numerical data. 
@@ -492,7 +526,10 @@ def _transform(self, data): data = data.reshape((len(data), 1)) means = self._bgm_transformer.means_.reshape((1, self.max_clusters)) means = means[:, self.valid_component_indicator] - stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self.max_clusters)) + stds = np.sqrt(self._bgm_transformer.covariances_).reshape(( + 1, + self.max_clusters, + )) stds = stds[:, self.valid_component_indicator] # Multiply stds by 4 so that a value will be in the range [-1,1] with 99.99% probability @@ -506,15 +543,21 @@ def _transform(self, data): component_prob_t = component_prob_t / component_prob_t.sum() selected_component[i] = np.random.choice( np.arange(self.valid_component_indicator.sum()), - p=component_prob_t + p=component_prob_t, ) aranged = np.arange(len(data)) - normalized = normalized_values[aranged, selected_component].reshape([-1, 1]) - normalized = np.clip(normalized, -.99, .99) + normalized = normalized_values[aranged, selected_component].reshape([ + -1, + 1, + ]) + normalized = np.clip(normalized, -0.99, 0.99) normalized = normalized[:, 0] rows = [normalized, selected_component] - if self.null_transformer and self.null_transformer.models_missing_values(): + if ( + self.null_transformer + and self.null_transformer.models_missing_values() + ): rows.append(model_missing_values) return np.stack(rows, axis=1) # noqa: PD013 @@ -524,7 +567,9 @@ def _reverse_transform_helper(self, data): means = self._bgm_transformer.means_.reshape([-1]) stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1]) selected_component = data[:, 1].round().astype(int) - selected_component = selected_component.clip(0, self.valid_component_indicator.sum() - 1) + selected_component = selected_component.clip( + 0, self.valid_component_indicator.sum() - 1 + ) std_t = stds[self.valid_component_indicator][selected_component] mean_t = means[self.valid_component_indicator][selected_component] reversed_data = normalized * self.STD_MULTIPLIER * std_t + mean_t @@ -545,7 
+590,10 @@ def _reverse_transform(self, data): data = data.to_numpy() recovered_data = self._reverse_transform_helper(data) - if self.null_transformer and self.null_transformer.models_missing_values(): + if ( + self.null_transformer + and self.null_transformer.models_missing_values() + ): recovered_data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013 return super()._reverse_transform(recovered_data) diff --git a/rdt/transformers/pii/__init__.py b/rdt/transformers/pii/__init__.py index c52ada4b..f2bd3549 100644 --- a/rdt/transformers/pii/__init__.py +++ b/rdt/transformers/pii/__init__.py @@ -1,6 +1,9 @@ """Personal Identifiable Information Transformers module.""" -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker +from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) __all__ = [ 'AnonymizedFaker', diff --git a/rdt/transformers/pii/anonymization.py b/rdt/transformers/pii/anonymization.py index eb4443d3..f3fb6610 100644 --- a/rdt/transformers/pii/anonymization.py +++ b/rdt/transformers/pii/anonymization.py @@ -10,41 +10,23 @@ from rdt.transformers import AnonymizedFaker SDTYPE_ANONYMIZERS = { - 'address': { - 'provider_name': 'address', - 'function_name': 'address' - }, - 'email': { - 'provider_name': 'internet', - 'function_name': 'email' - }, - 'ipv4_address': { - 'provider_name': 'internet', - 'function_name': 'ipv4' - }, - 'ipv6_address': { - 'provider_name': 'internet', - 'function_name': 'ipv6' - }, + 'address': {'provider_name': 'address', 'function_name': 'address'}, + 'email': {'provider_name': 'internet', 'function_name': 'email'}, + 'ipv4_address': {'provider_name': 'internet', 'function_name': 'ipv4'}, + 'ipv6_address': {'provider_name': 'internet', 'function_name': 'ipv6'}, 'mac_address': { 'provider_name': 'internet', - 'function_name': 'mac_address' - }, - 'name': { - 'provider_name': 'person', - 'function_name': 'name' + 'function_name': 'mac_address', }, + 'name': 
{'provider_name': 'person', 'function_name': 'name'}, 'phone_number': { 'provider_name': 'phone_number', - 'function_name': 'phone_number' - }, - 'ssn': { - 'provider_name': 'ssn', - 'function_name': 'ssn' + 'function_name': 'phone_number', }, + 'ssn': {'provider_name': 'ssn', 'function_name': 'ssn'}, 'user_agent_string': { 'provider_name': 'user_agent', - 'function_name': 'user_agent' + 'function_name': 'user_agent', }, } @@ -102,7 +84,7 @@ def get_anonymized_transformer(function_name, transformer_kwargs=None): provider_name = _detect_provider_name(function_name, locales=locales) transformer_kwargs.update({ 'function_name': function_name, - 'provider_name': provider_name + 'provider_name': provider_name, }) return AnonymizedFaker(**transformer_kwargs) diff --git a/rdt/transformers/pii/anonymizer.py b/rdt/transformers/pii/anonymizer.py index 177bd718..10e7da0f 100644 --- a/rdt/transformers/pii/anonymizer.py +++ b/rdt/transformers/pii/anonymizer.py @@ -87,14 +87,18 @@ def check_provider_function(provider_name, function_name): def _check_locales(self): """Check if the locales exist for the provided provider.""" - locales = self.locales if isinstance(self.locales, list) else [self.locales] + locales = ( + self.locales if isinstance(self.locales, list) else [self.locales] + ) missed_locales = [] for locale in locales: provider_name = self.provider_name if self.provider_name.endswith(f'.{locale}'): provider_name = self.provider_name.replace(f'.{locale}', '') - spec = importlib.util.find_spec(f'faker.providers.{provider_name}.{locale}') + spec = importlib.util.find_spec( + f'faker.providers.{provider_name}.{locale}' + ) if spec is None and locale != 'en_US': missed_locales.append(locale) @@ -106,19 +110,28 @@ def _check_locales(self): 'information: https://faker.readthedocs.io/en/master/locales.html' ) - def __init__(self, provider_name=None, function_name=None, function_kwargs=None, - locales=None, cardinality_rule=None, enforce_uniqueness=False, - 
missing_value_generation='random'): + def __init__( + self, + provider_name=None, + function_name=None, + function_kwargs=None, + locales=None, + cardinality_rule=None, + enforce_uniqueness=False, + missing_value_generation='random', + ): super().__init__() self._data_cardinality = None self.data_length = None self.enforce_uniqueness = enforce_uniqueness - self.cardinality_rule = cardinality_rule.lower() if cardinality_rule else None + self.cardinality_rule = ( + cardinality_rule.lower() if cardinality_rule else None + ) if enforce_uniqueness: warnings.warn( "The 'enforce_uniqueness' parameter is no longer supported. " "Please use the 'cardinality_rule' parameter instead.", - FutureWarning + FutureWarning, ) if not self.cardinality_rule: self.cardinality_rule = 'unique' @@ -131,7 +144,9 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None, ) self.function_name = function_name if function_name else 'lexify' - self.function_kwargs = deepcopy(function_kwargs) if function_kwargs else {} + self.function_kwargs = ( + deepcopy(function_kwargs) if function_kwargs else {} + ) self.check_provider_function(self.provider_name, self.function_name) self.output_properties = {None: {'next_transformer': None}} @@ -159,7 +174,11 @@ def get_supported_sdtypes(cls): Accepted input sdtypes of the transformer. 
""" unsupported_sdtypes = { - 'numerical', 'datetime', 'categorical', 'boolean', None + 'numerical', + 'datetime', + 'categorical', + 'boolean', + None, } all_sdtypes = {cls.INPUT_SDTYPE} for transformer in BaseTransformer.get_subclasses(): @@ -183,9 +202,13 @@ def _function(self): else: faker_attr = self.faker except AttributeError: - faker_attr = self.faker.unique if self.enforce_uniqueness else self.faker + faker_attr = ( + self.faker.unique if self.enforce_uniqueness else self.faker + ) - result = getattr(faker_attr, self.function_name)(**self.function_kwargs) + result = getattr(faker_attr, self.function_name)( + **self.function_kwargs + ) if isinstance(result, Iterable) and not isinstance(result, str): result = ', '.join(map(str, result)) @@ -197,8 +220,12 @@ def _set_faker_seed(self, data): for value in data.head(5): hash_value += str(value) - hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) - self._faker_random_seed = hash_value % ((2 ** 32) - 1) # maximum value for a seed + hash_value = int( + hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16 + ) + self._faker_random_seed = hash_value % ( + (2**32) - 1 + ) # maximum value for a seed self.faker.seed_instance(self._faker_random_seed) def _fit(self, data): @@ -222,14 +249,19 @@ def _transform(self, _data): return None def _get_unique_categories(self, samples): - return np.array([self._function() for _ in range(samples)], dtype=object) + return np.array( + [self._function() for _ in range(samples)], dtype=object + ) def _reverse_transform_cardinality_rule_match(self, sample_size): """Reverse transform the data when the cardinality rule is 'match'.""" reverse_transformed = np.array([], dtype=object) if self.missing_value_generation == 'random': num_nans = int(self._nan_frequency * sample_size) - reverse_transformed = np.concatenate([reverse_transformed, np.full(num_nans, np.nan)]) + reverse_transformed = np.concatenate([ + reverse_transformed, + np.full(num_nans, np.nan), + 
]) else: num_nans = 0 @@ -237,13 +269,24 @@ def _reverse_transform_cardinality_rule_match(self, sample_size): return reverse_transformed if sample_size < num_nans + self._data_cardinality: - unique_categories = self._get_unique_categories(sample_size - num_nans) - reverse_transformed = np.concatenate([reverse_transformed, unique_categories]) + unique_categories = self._get_unique_categories( + sample_size - num_nans + ) + reverse_transformed = np.concatenate([ + reverse_transformed, + unique_categories, + ]) else: - unique_categories = self._get_unique_categories(self._data_cardinality) + unique_categories = self._get_unique_categories( + self._data_cardinality + ) num_copies = sample_size - self._data_cardinality - num_nans copies = np.random.choice(unique_categories, num_copies) - reverse_transformed = np.concatenate([reverse_transformed, unique_categories, copies]) + reverse_transformed = np.concatenate([ + reverse_transformed, + unique_categories, + copies, + ]) np.random.shuffle(reverse_transformed) @@ -265,13 +308,18 @@ def _reverse_transform(self, data): sample_size = self.data_length try: - if hasattr(self, 'cardinality_rule') and self.cardinality_rule == 'match': - reverse_transformed = self._reverse_transform_cardinality_rule_match(sample_size) + if ( + hasattr(self, 'cardinality_rule') + and self.cardinality_rule == 'match' + ): + reverse_transformed = ( + self._reverse_transform_cardinality_rule_match(sample_size) + ) else: - reverse_transformed = np.array([ - self._function() - for _ in range(sample_size) - ], dtype=object) + reverse_transformed = np.array( + [self._function() for _ in range(sample_size)], + dtype=object, + ) except faker.exceptions.UniquenessException as exception: raise TransformerProcessingError( @@ -280,9 +328,14 @@ def _reverse_transform(self, data): f"('{self.get_input_column()}')." 
) from exception - if self.missing_value_generation == 'random' and not pd.isna(reverse_transformed).any(): + if ( + self.missing_value_generation == 'random' + and not pd.isna(reverse_transformed).any() + ): num_nans = int(self._nan_frequency * sample_size) - nan_indices = np.random.choice(sample_size, num_nans, replace=False) + nan_indices = np.random.choice( + sample_size, num_nans, replace=False + ) reverse_transformed[nan_indices] = np.nan return reverse_transformed @@ -334,26 +387,37 @@ class PseudoAnonymizedFaker(AnonymizedFaker): def __getstate__(self): """Return a dictionary representation of the instance and warn the user when pickling.""" - warnings.warn(( - 'You are saving the mapping information, which includes the original data. ' - 'Sharing this object with others will also give them access to the original data ' - 'used with this transformer.' - )) + warnings.warn( + ( + 'You are saving the mapping information, which includes the original data. ' + 'Sharing this object with others will also give them access to the original data ' + 'used with this transformer.' 
+ ) + ) return self.__dict__ - def __init__(self, provider_name=None, function_name=None, function_kwargs=None, locales=None): + def __init__( + self, + provider_name=None, + function_name=None, + function_kwargs=None, + locales=None, + ): super().__init__( provider_name=provider_name, function_name=function_name, function_kwargs=function_kwargs, locales=locales, - cardinality_rule='unique' + cardinality_rule='unique', ) self._mapping_dict = {} self._reverse_mapping_dict = {} self.output_properties = { - None: {'sdtype': 'categorical', 'next_transformer': LabelEncoder(add_noise=True)} + None: { + 'sdtype': 'categorical', + 'next_transformer': LabelEncoder(add_noise=True), + } } def get_mapping(self): @@ -375,7 +439,9 @@ def _fit(self, columns_data): unique_values = columns_data[columns_data.notna()].unique() unique_data_length = len(unique_values) try: - generated_values = [self._function() for _ in range(unique_data_length)] + generated_values = [ + self._function() for _ in range(unique_data_length) + ] except faker.exceptions.UniquenessException as exception: raise TransformerProcessingError( 'The Faker function you specified is not able to generate ' diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index c6ef8c9a..6e00194a 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -1,4 +1,5 @@ """Transformers for text data.""" + import logging import warnings @@ -64,7 +65,10 @@ def _reverse_transform(self, data): prefix_str = self.prefix if self.prefix is not None else '' suffix_str = self.suffix if self.suffix is not None else '' - values = [f'{prefix_str}{start + idx}{suffix_str}' for idx in range(len(data))] + values = [ + f'{prefix_str}{start + idx}{suffix_str}' + for idx in range(len(data)) + ] self._counter += len(data) return pd.Series(values) @@ -116,8 +120,12 @@ def __setstate__(self, state): state['generator'] = generator self.__dict__ = state - def __init__(self, regex_format='[A-Za-z]{5}', enforce_uniqueness=False, - 
generation_order='alphanumeric'): + def __init__( + self, + regex_format='[A-Za-z]{5}', + enforce_uniqueness=False, + generation_order='alphanumeric', + ): super().__init__() self.output_properties = {None: {'next_transformer': None}} self.enforce_uniqueness = enforce_uniqueness @@ -127,14 +135,18 @@ def __init__(self, regex_format='[A-Za-z]{5}', enforce_uniqueness=False, self.generator_size = None self.generated = None if generation_order not in ['alphanumeric', 'scrambled']: - raise ValueError("generation_order must be one of 'alphanumeric' or 'scrambled'.") + raise ValueError( + "generation_order must be one of 'alphanumeric' or 'scrambled'." + ) self.generation_order = generation_order def reset_randomization(self): """Create a new generator and reset the generated values counter.""" super().reset_randomization() - self.generator, self.generator_size = strings_from_regex(self.regex_format) + self.generator, self.generator_size = strings_from_regex( + self.regex_format + ) self.generated = 0 def _fit(self, data): @@ -171,8 +183,10 @@ def _warn_not_enough_unique_values(self, sample_size): LOGGER.info( "The data has %s rows but the regex for '%s' can only create %s unique values." 
" Some values in '%s' may be repeated.", - sample_size, self.get_input_column(), self.generator_size, - self.get_input_column() + sample_size, + self.get_input_column(), + self.generator_size, + self.get_input_column(), ) remaining = self.generator_size - self.generated @@ -205,7 +219,9 @@ def _reverse_transform(self, data): remaining = self.generator_size if remaining >= sample_size: - reverse_transformed = [next(self.generator) for _ in range(sample_size)] + reverse_transformed = [ + next(self.generator) for _ in range(sample_size) + ] self.generated += sample_size else: @@ -216,21 +232,28 @@ def _reverse_transform(self, data): try: remaining_samples = sample_size - len(reverse_transformed) start = int(generated_values[-1]) + 1 - reverse_transformed.extend( - [str(i) for i in range(start, start + remaining_samples)]) + reverse_transformed.extend([ + str(i) for i in range(start, start + remaining_samples) + ]) except ValueError: counter = 0 while len(reverse_transformed) < sample_size: - remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend( - [f'{i}({counter})' for i in generated_values[:remaining_samples]]) + remaining_samples = sample_size - len( + reverse_transformed + ) + reverse_transformed.extend([ + f'{i}({counter})' + for i in generated_values[:remaining_samples] + ]) counter += 1 else: while len(reverse_transformed) < sample_size: remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend(generated_values[:remaining_samples]) + reverse_transformed.extend( + generated_values[:remaining_samples] + ) if getattr(self, 'generation_order', 'alphanumeric') == 'scrambled': np.random.shuffle(reverse_transformed) diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index 75c21960..f9e33f95 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -29,14 +29,18 @@ def _in(options, max_repeat): generators.append(generator) sizes.append(size) - return (value for generator in 
generators for value in generator), np.sum(sizes) + return (value for generator in generators for value in generator), np.sum( + sizes + ) def _range(options, max_repeat): del max_repeat min_value, max_value = options max_value += 1 - return (chr(value) for value in range(min_value, max_value)), max_value - min_value + return ( + chr(value) for value in range(min_value, max_value) + ), max_value - min_value def _any(options, max_repeat): @@ -57,18 +61,16 @@ def _max_repeat(options, max_repeat): sizes = [] for repeat in range(min_, max_ + 1): if repeat: - sizes.append(pow(int(size), repeat, 2 ** 63 - 1)) + sizes.append(pow(int(size), repeat, 2**63 - 1)) repeat_generators = [ (_GENERATORS[option](args, max_repeat)[0], option, args) for _ in range(repeat) ] generators.append(_from_generators(repeat_generators, max_repeat)) - return ( - value - for generator in generators - for value in generator - ), np.sum(sizes) + int(min_ == 0) + return (value for generator in generators for value in generator), np.sum( + sizes + ) + int(min_ == 0) def _category_chars(regex): @@ -113,7 +115,7 @@ def _from_generators(generators, max_repeat): value = next(generator) generated.append(value) previous[index] = value - generated.extend(previous[index + 1:]) + generated.extend(previous[index + 1 :]) break except StopIteration: generator = _GENERATORS[option](args, max_repeat)[0] @@ -157,7 +159,9 @@ def strings_from_regex(regex, max_repeat=16): generators.append((generator, option, args)) sizes.append(size) - return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128).real + return _from_generators(generators, max_repeat), np.prod( + sizes, dtype=np.complex128 + ).real def fill_nan_with_none(data): @@ -273,5 +277,8 @@ def learn_rounding_digits(data): return decimal # Can't round, not equal after MAX_DECIMALS digits of precision - LOGGER.info("No rounding scheme detected for column '%s'. 
Data will not be rounded.", name) + LOGGER.info( + "No rounding scheme detected for column '%s'. Data will not be rounded.", + name, + ) return None diff --git a/tasks.py b/tasks.py index 461b9320..53921d6b 100644 --- a/tasks.py +++ b/tasks.py @@ -11,12 +11,11 @@ from packaging.requirements import Requirement from packaging.version import Version - COMPARISONS = { '>=': operator.ge, '>': operator.gt, '<': operator.lt, - '<=': operator.le + '<=': operator.le, } @@ -58,19 +57,37 @@ def _get_minimum_versions(dependencies, python_version): req = Requirement(dependency) if ';' in dependency: marker = req.marker - if marker and not marker.evaluate({'python_version': python_version}): + if marker and not marker.evaluate({ + 'python_version': python_version + }): continue # Skip this dependency if the marker does not apply to the current Python version if req.name not in min_versions: - min_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), None) + min_version = next( + ( + spec.version + for spec in req.specifier + if spec.operator in ('>=', '==') + ), + None, + ) if min_version: min_versions[req.name] = f'{req.name}=={min_version}' elif '@' not in min_versions[req.name]: existing_version = Version(min_versions[req.name].split('==')[1]) - new_version = next((spec.version for spec in req.specifier if spec.operator in ('>=', '==')), existing_version) + new_version = next( + ( + spec.version + for spec in req.specifier + if spec.operator in ('>=', '==') + ), + existing_version, + ) if new_version > existing_version: - min_versions[req.name] = f'{req.name}=={new_version}' # Change when a valid newer version is found + min_versions[req.name] = ( + f'{req.name}=={new_version}' # Change when a valid newer version is found + ) return list(min_versions.values()) @@ -85,7 +102,7 @@ def install_minimum(c): minimum_versions = _get_minimum_versions(dependencies, python_version) if minimum_versions: - c.run(f'python -m pip install {" 
".join(minimum_versions)}') + c.run('python -m pip install ' + ' '.join(minimum_versions)) @task diff --git a/tests/code_style.py b/tests/code_style.py index bb459a31..1186d09c 100644 --- a/tests/code_style.py +++ b/tests/code_style.py @@ -28,7 +28,9 @@ def validate_transformer_module(transformer): elif transformer_folder.parent.match('transformers'): is_valid = True - assert is_valid, 'The transformer module is not placed inside a valid path.' + assert ( + is_valid + ), 'The transformer module is not placed inside a valid path.' def validate_transformer_importable_from_parent_module(transformer): @@ -37,7 +39,9 @@ def validate_transformer_importable_from_parent_module(transformer): module = getattr(transformer, '__module__', '') module = module.rsplit('.', 1)[0] imported_transformer = getattr(importlib.import_module(module), name, None) - assert imported_transformer is not None, f'Could not import {name} from {module}' + assert ( + imported_transformer is not None + ), f'Could not import {name} from {module}' def get_test_location(transformer): @@ -48,10 +52,16 @@ def get_test_location(transformer): test_location = None if transformer_folder.match('transformers'): - test_location = rdt_unit_test_path / 'transformers' / f'test_{transformer_file.name}' + test_location = ( + rdt_unit_test_path + / 'transformers' + / f'test_{transformer_file.name}' + ) elif transformer_folder.parent.match('transformers'): - test_location = rdt_unit_test_path / 'transformers' / transformer_folder.name + test_location = ( + rdt_unit_test_path / 'transformers' / transformer_folder.name + ) test_location = test_location / f'test_{transformer_file.name}' return test_location @@ -74,7 +84,9 @@ def _load_module_from_path(path): if module_path.name == 'transformers': module_path = f'rdt.transformers.{module_name}' elif module_path.parent.name == 'transformers': - module_path = f'rdt.transformers.{module_path.parent.name}.{module_name}' + module_path = ( +
f'rdt.transformers.{module_path.parent.name}.{module_name}' + ) spec = importlib.util.spec_from_file_location(module_path, path) module = importlib.util.module_from_spec(spec) @@ -91,11 +103,11 @@ def validate_test_names(transformer): test_class = getattr(module, f'Test{transformer.get_name()}', None) assert test_class is not None, 'The expected test class was not found.' - test_functions = inspect.getmembers(test_class, predicate=inspect.isfunction) + test_functions = inspect.getmembers( + test_class, predicate=inspect.isfunction + ) test_functions = [ - test - for test, _ in test_functions - if test.startswith('test') + test for test, _ in test_functions if test.startswith('test') ] assert test_functions, 'No test functions found within the test module.' @@ -110,8 +122,8 @@ def validate_test_names(transformer): for test in test_functions: count = len(valid_test_functions) for transformer_function in transformer_functions: - simple_test = fr'test_{transformer_function}' - described_test = fr'test_{transformer_function}_' + simple_test = rf'test_{transformer_function}' + described_test = rf'test_{transformer_function}_' if test.startswith(described_test): valid_test_functions.append(test) elif test.startswith(simple_test): @@ -121,7 +133,9 @@ def validate_test_names(transformer): assert len(valid_test_functions) > count, fail_message -@pytest.mark.parametrize('transformer', TRANSFORMERS.values(), ids=TRANSFORMERS.keys()) # noqa +@pytest.mark.parametrize( + 'transformer', TRANSFORMERS.values(), ids=TRANSFORMERS.keys() +) # noqa def test_transformer_code_style(transformer): """Validate a transformer.""" if not inspect.isclass(transformer): diff --git a/tests/contributing.py b/tests/contributing.py index 5a59cb31..a484efb7 100644 --- a/tests/contributing.py +++ b/tests/contributing.py @@ -15,9 +15,14 @@ from rdt.performance.datasets import get_dataset_generators_by_type from rdt.transformers import get_transformer_class, get_transformers_by_type from tests.code_style 
import ( - get_test_location, validate_test_location, validate_test_names, validate_transformer_addon, - validate_transformer_importable_from_parent_module, validate_transformer_module, - validate_transformer_subclass) + get_test_location, + validate_test_location, + validate_test_names, + validate_transformer_addon, + validate_transformer_importable_from_parent_module, + validate_transformer_module, + validate_transformer_subclass, +) from tests.integration.test_transformers import validate_transformer from tests.performance import validate_performance @@ -56,7 +61,7 @@ 'rdt/transformers/', 'tests/unit/transformers/', 'tests/integration/transformers/', - 'tests/datasets/' + 'tests/datasets/', ] @@ -78,7 +83,9 @@ def validate_transformer_integration(transformer): if isinstance(transformer, str): transformer = get_transformer_class(transformer) - print(f'Validating Integration Tests for transformer {transformer.get_name()}\n') + print( + f'Validating Integration Tests for transformer {transformer.get_name()}\n' + ) steps = [] validation_error = None @@ -87,7 +94,9 @@ def validate_transformer_integration(transformer): try: validate_transformer(transformer, steps=steps) except Exception as error: - error_trace = ''.join(traceback.TracebackException.from_exception(error).format()) + error_trace = ''.join( + traceback.TracebackException.from_exception(error).format() + ) for check in CHECK_DETAILS: if check in error_trace: @@ -116,17 +125,22 @@ def validate_transformer_integration(transformer): else: result_summaries.append([check, 'Yes', details]) - summary = pd.DataFrame(result_summaries, columns=['Check', 'Correct', 'Details']) + summary = pd.DataFrame( + result_summaries, columns=['Check', 'Correct', 'Details'] + ) print(tabulate(summary, headers='keys', showindex=False)) return validation_error is None and error_trace is None -def _validate_third_party_code_style(command, tag, success_message, - error_message, transformer_path): +def 
_validate_third_party_code_style( + command, tag, success_message, error_message, transformer_path +): run_command = command.split(' ') run_command.append(transformer_path) - output_capture = subprocess.run(run_command, capture_output=True).stdout.decode() + output_capture = subprocess.run( + run_command, capture_output=True + ).stdout.decode() if output_capture: return { 'Check': tag, @@ -142,7 +156,9 @@ def _validate_third_party_code_style(command, tag, success_message, } -def _custom_validation(function, tag, success_message, error_message, transformer): +def _custom_validation( + function, tag, success_message, error_message, transformer +): try: function(transformer) return { @@ -156,7 +172,7 @@ def _custom_validation(function, tag, success_message, error_message, transforme 'Check': tag, 'Correct': 'No', 'Details': error_message, - 'output_capture': error + 'output_capture': error, } @@ -167,29 +183,29 @@ def _validate_third_party_checks(transformer_path): 'flake8', 'Code follows PEP8 standards.', 'Code must follow PEP8 standards.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'isort -c', 'isort', 'Imports are properly sorted.', 'Imports are not properly sorted.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'pylint --rcfile=setup.cfg ', 'pylint', 'Code is properly formatted and structured.', 'Code is not properly formatted and structured.', - transformer_path + transformer_path, ), _validate_third_party_code_style( 'pydocstyle', 'pydocstyle', 'The docstrings are properly written.', 'The docstrings are not properly written.', - transformer_path - ) + transformer_path, + ), ] return results @@ -202,43 +218,43 @@ def _validate_custom_checks(transformer): 'Transformer is subclass', 'The transformer is subclass of ``BaseTransformer``.', 'The transformer must be a subclass of ``BaseTransformer``.', - transformer + transformer, ), _custom_validation( validate_transformer_module, 'Valid module', 'The 
transformer is placed inside a valid module.', 'The transformer is not placed inside a valid module.', - transformer + transformer, ), _custom_validation( validate_test_location, 'Valid test module', 'The transformer tests are placed inside the valid module.', 'The transformer tests are not placed inside the valid module.', - transformer + transformer, ), _custom_validation( validate_test_names, 'Valid test function names', 'The transformer tests are named correctly.', 'The transformer tests are not named properly.', - transformer + transformer, ), _custom_validation( validate_transformer_addon, 'Valid transformer addon', 'The addon is configured properly.', 'The addon is not configured properly.', - transformer + transformer, ), _custom_validation( validate_transformer_importable_from_parent_module, 'Importable from module', 'The transformer can be imported from the parent module.', 'The transformer can not be imported from the parent module.', - transformer - ) + transformer, + ), ] return results @@ -265,7 +281,7 @@ def validate_transformer_code_style(transformer): transformer_path = inspect.getfile(transformer) print(f'Validating source file {transformer_path}') - results = (_validate_third_party_checks(transformer_path)) + results = _validate_third_party_checks(transformer_path) results.extend(_validate_custom_checks(transformer)) errors = [ @@ -330,9 +346,13 @@ def validate_transformer_unit_tests(transformer): score = cov.report(show_missing=True) rounded_score = round(score / 100, 3) if rounded_score < 1.0: - print(f'\nERROR: The unit tests only cover {round(score, 3)}% of your code.') + print( + f'\nERROR: The unit tests only cover {round(score, 3)}% of your code.' + ) else: - print(f'\nSUCCESS: The unit tests cover {round(score, 3)}% of your code.') + print( + f'\nSUCCESS: The unit tests cover {round(score, 3)}% of your code.' 
+ ) cov.html_report() print('\nFull coverage report here:\n') @@ -370,7 +390,9 @@ def validate_transformer_performance(transformer): total_results = pd.DataFrame() for current_transformer in transformers: for dataset_generator in dataset_generators: - performance = evaluate_transformer_performance(current_transformer, dataset_generator) + performance = evaluate_transformer_performance( + current_transformer, dataset_generator + ) valid = validate_performance(performance, dataset_generator) results = pd.DataFrame({ @@ -387,13 +409,17 @@ def validate_transformer_performance(transformer): else: print('ERROR: One or more Performance Tests were NOT successful.') - other_results = total_results[total_results.transformer != transformer.get_name()] + other_results = total_results[ + total_results.transformer != transformer.get_name() + ] average = other_results.groupby('Evaluation Metric')['Value'].mean() - total_results = total_results[total_results.transformer == transformer.get_name()] + total_results = total_results[ + total_results.transformer == transformer.get_name() + ] final_results = total_results.groupby('Evaluation Metric').agg({ 'Value': 'mean', - 'Valid': 'any' + 'Valid': 'any', }) final_results = final_results.rename(columns={'Valid': 'Acceptable'}) final_results['Units'] = np.where( @@ -401,9 +427,12 @@ def validate_transformer_performance(transformer): 's / row', 'B / row', ) - final_results['Acceptable'] = np.where(final_results['Acceptable'], 'Yes', 'No') - final_results['Compared to Average'] = final_results['Value'].div(average).replace( - np.inf, np.nan) + final_results['Acceptable'] = np.where( + final_results['Acceptable'], 'Yes', 'No' + ) + final_results['Compared to Average'] = ( + final_results['Value'].div(average).replace(np.inf, np.nan) + ) return final_results.reset_index() @@ -421,7 +450,9 @@ def check_clean_repository(): if any other file has been modified outside of that range. 
""" run_command = 'git diff --name-only main'.split(' ') - output_capture = subprocess.run(run_command, capture_output=True).stdout.decode() + output_capture = subprocess.run( + run_command, capture_output=True + ).stdout.decode() output_capture = output_capture.splitlines() validated_paths = [] @@ -432,7 +463,7 @@ def check_clean_repository(): if any([ file_path.match(valid_path), file_path.parent.match(valid_path), - file_path.parent.parent.match(valid_path) + file_path.parent.parent.match(valid_path), ]): validated_paths.append(True) @@ -483,13 +514,13 @@ def validate_pull_request(transformer): 'Code Style', code_style, 'Code Style is acceptable.', - 'Code Style is unacceptable!' + 'Code Style is unacceptable!', ), _build_validation_dict( 'Unit Tests', unit_bool, 'The unit tests are correct and run successfully.', - 'The unit tests did not run successfully or the coverage is not a 100%.' + 'The unit tests did not run successfully or the coverage is not a 100%.', ), _build_validation_dict( 'Integration tests', @@ -501,15 +532,14 @@ def validate_pull_request(transformer): 'Performance Tests', performance_bool, 'The performance of the transformer is acceptable.', - 'The performance of the transformer is unacceptable!' + 'The performance of the transformer is unacceptable!', ), _build_validation_dict( 'Clean Repository', clean_repository, 'There are no unexpected changes in the repository.', - 'There are unexpected changes in the repository!' 
+ 'There are unexpected changes in the repository!', ), - ] results = pd.DataFrame(results) @@ -519,7 +549,7 @@ def validate_pull_request(transformer): unit_bool, integration_tests, performance_bool, - clean_repository + clean_repository, ]) print('\n') @@ -527,7 +557,9 @@ def validate_pull_request(transformer): if success: print('\nSUCCESS: The Pull Request can be made!') - print('You can now commit all your changes, push to GitHub and create a Pull Request.') + print( + 'You can now commit all your changes, push to GitHub and create a Pull Request.' + ) else: print('\nERROR: The Pull Request can not be made!') print('Fix the reported errors and try again.') diff --git a/tests/datasets/tests/test_boolean.py b/tests/datasets/tests/test_boolean.py index 6362f2f2..ef8af023 100644 --- a/tests/datasets/tests/test_boolean.py +++ b/tests/datasets/tests/test_boolean.py @@ -6,7 +6,6 @@ class TestRandomBooleanGenerator: - def test_generate(self): """Test the `RandomBooleanGenerator.generate` method. @@ -26,7 +25,6 @@ def test_generate(self): class TestRandomBooleanNaNsGenerator: - def test_generate(self): """Test the `RandomBooleanNaNsGenerator.generate` method. @@ -46,7 +44,6 @@ def test_generate(self): class TestRandomSkewedBooleanGenerator: - def test_generate(self): """Test the `RandomSkewedBooleanGenerator.generate` method. @@ -66,7 +63,6 @@ def test_generate(self): class TestRandomSkewedBooleanNaNsGenerator: - def test_generate(self): """Test the `RandomSkewedBooleanNaNsGenerator.generate` method. @@ -87,7 +83,6 @@ def test_generate(self): class TestConstantBooleanGenerator: - def test_generate(self): """Test the `ConstantBooleanGenerator.generate` method. 
@@ -108,7 +103,6 @@ def test_generate(self): class TestConstantBooleanNaNsGenerator: - def test(self): output = boolean.ConstantBooleanNaNsGenerator.generate(NUM_ROWS) assert len(output) == NUM_ROWS diff --git a/tests/datasets/tests/test_categorical.py b/tests/datasets/tests/test_categorical.py index 649c1f9f..35e81b7b 100644 --- a/tests/datasets/tests/test_categorical.py +++ b/tests/datasets/tests/test_categorical.py @@ -5,7 +5,6 @@ class TestRandomIntegerGenerator: - def test(self): output = categorical.RandomIntegerGenerator.generate(10) assert len(output) == 10 @@ -15,7 +14,6 @@ def test(self): class TestRandomIntegerNaNsGenerator: - def test(self): output = categorical.RandomIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -25,7 +23,6 @@ def test(self): class TestRandomStringGenerator: - def test(self): output = categorical.RandomStringGenerator.generate(10) assert len(output) == 10 @@ -35,7 +32,6 @@ def test(self): class TestRandomStringNaNsGenerator: - def test(self): output = categorical.RandomStringNaNsGenerator.generate(10) assert len(output) == 10 @@ -45,7 +41,6 @@ def test(self): class TestRandomMixedGenerator: - def test(self): output = categorical.RandomMixedGenerator.generate(10) assert len(output) == 10 @@ -54,7 +49,6 @@ def test(self): class TestRandomMixedNaNsGenerator: - def test(self): output = categorical.RandomMixedNaNsGenerator.generate(10) assert len(output) == 10 @@ -63,7 +57,6 @@ def test(self): class TestSingleIntegerGenerator: - def test(self): output = categorical.SingleIntegerGenerator.generate(10) assert len(output) == 10 @@ -73,7 +66,6 @@ def test(self): class TestSingleIntegerNaNsGenerator: - def test(self): output = categorical.SingleIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -83,7 +75,6 @@ def test(self): class TestSingleStringGenerator: - def test(self): output = categorical.SingleStringGenerator.generate(10) assert len(output) == 10 @@ -93,7 +84,6 @@ def test(self): class 
TestSingleStringNaNsGenerator: - def test(self): output = categorical.SingleStringNaNsGenerator.generate(10) assert len(output) == 10 @@ -103,7 +93,6 @@ def test(self): class TestUniqueIntegerGenerator: - def test(self): output = categorical.UniqueIntegerGenerator.generate(10) assert len(output) == 10 @@ -113,7 +102,6 @@ def test(self): class TestUniqueIntegerNaNsGenerator: - def test(self): output = categorical.UniqueIntegerNaNsGenerator.generate(10) nulls = np.isnan(output).sum() @@ -125,7 +113,6 @@ def test(self): class TestUniqueStringGenerator: - def test(self): output = categorical.UniqueStringGenerator.generate(10) assert len(output) == 10 @@ -135,7 +122,6 @@ def test(self): class TestUniqueStringNaNsGenerator: - def test(self): output = categorical.UniqueStringNaNsGenerator.generate(10) nulls = sum(pd.isna(output)) diff --git a/tests/datasets/tests/test_datetime.py b/tests/datasets/tests/test_datetime.py index 46551198..9a3d5941 100644 --- a/tests/datasets/tests/test_datetime.py +++ b/tests/datasets/tests/test_datetime.py @@ -7,7 +7,6 @@ class TestRandomGapDatetimeGenerator: - def test(self): output = datetime.RandomGapDatetimeGenerator.generate(10) assert len(output) == 10 @@ -17,7 +16,6 @@ def test(self): class TestRandomGapSecondsDatetimeGenerator: - def test(self): output = datetime.RandomGapSecondsDatetimeGenerator.generate(10) assert len(output) == 10 @@ -27,7 +25,6 @@ def test(self): class TestRandomGapDatetimeNaNsGenerator: - def test(self): output = datetime.RandomGapDatetimeNaNsGenerator.generate(10) assert len(output) == 10 @@ -37,7 +34,6 @@ def test(self): class TestEqualGapHoursDatetimeGenerator: - def test(self): output = datetime.EqualGapHoursDatetimeGenerator.generate(10) assert len(output) == 10 @@ -47,7 +43,6 @@ def test(self): class TestEqualGapDaysDatetimeGenerator: - def test(self): output = datetime.EqualGapDaysDatetimeGenerator.generate(10) assert len(output) == 10 @@ -57,7 +52,6 @@ def test(self): class 
TestEqualGapWeeksDatetimeGenerator: - def test(self): output = datetime.EqualGapWeeksDatetimeGenerator.generate(10) assert len(output) == 10 diff --git a/tests/datasets/tests/test_numerical.py b/tests/datasets/tests/test_numerical.py index 6b687d8f..7948317e 100644 --- a/tests/datasets/tests/test_numerical.py +++ b/tests/datasets/tests/test_numerical.py @@ -5,7 +5,6 @@ class TestRandomIntegerGenerator: - def test(self): output = numerical.RandomIntegerGenerator.generate(10) assert len(output) == 10 @@ -15,7 +14,6 @@ def test(self): class TestRandomIntegerNaNsGenerator: - def test(self): output = numerical.RandomIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -25,7 +23,6 @@ def test(self): class TestConstantIntegerGenerator: - def test(self): output = numerical.ConstantIntegerGenerator.generate(10) assert len(output) == 10 @@ -35,7 +32,6 @@ def test(self): class TestConstantIntegerNaNsGenerator: - def test(self): output = numerical.ConstantIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -45,7 +41,6 @@ def test(self): class TestAlmostConstantIntegerGenerator: - def test(self): output = numerical.AlmostConstantIntegerGenerator.generate(10) assert len(output) == 10 @@ -55,7 +50,6 @@ def test(self): class TestAlmostConstantIntegerNaNsGenerator: - def test(self): output = numerical.AlmostConstantIntegerNaNsGenerator.generate(10) assert len(output) == 10 @@ -65,7 +59,6 @@ def test(self): class TestNormalGenerator: - def test(self): output = numerical.NormalGenerator.generate(10) assert len(output) == 10 @@ -75,7 +68,6 @@ def test(self): class TestNormalNaNsGenerator: - def test(self): output = numerical.NormalNaNsGenerator.generate(10) assert len(output) == 10 @@ -85,7 +77,6 @@ def test(self): class TestBigNormalGenerator: - def test(self): output = numerical.BigNormalGenerator.generate(10) assert len(output) == 10 @@ -95,7 +86,6 @@ def test(self): class TestBigNormalNaNsGenerator: - def test(self): output = 
numerical.BigNormalNaNsGenerator.generate(10) assert len(output) == 10 diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py index e3447307..b0d37eed 100644 --- a/tests/integration/__init__.py +++ b/tests/integration/__init__.py @@ -1,6 +1,5 @@ """RDT integration testing package.""" - from tests.integration.test_transformers import validate_transformer __all__ = [ diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index 7fee69c4..c8725735 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -8,20 +8,35 @@ from rdt import get_demo from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, +) from rdt.hyper_transformer import Config, HyperTransformer from rdt.transformers import ( - AnonymizedFaker, BaseMultiColumnTransformer, BaseTransformer, BinaryEncoder, - ClusterBasedNormalizer, FloatFormatter, FrequencyEncoder, LabelEncoder, OneHotEncoder, - RegexGenerator, UniformEncoder, UnixTimestampEncoder, get_default_transformer, - get_default_transformers) + AnonymizedFaker, + BaseMultiColumnTransformer, + BaseTransformer, + BinaryEncoder, + ClusterBasedNormalizer, + FloatFormatter, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, + get_default_transformer, + get_default_transformers, +) from rdt.transformers.datetime import OptimizedTimestampEncoder from rdt.transformers.numerical import GaussianNormalizer from rdt.transformers.pii.anonymizer import PseudoAnonymizedFaker class DummyTransformerNumerical(BaseTransformer): - INPUT_SDTYPE = 'categorical' def _fit(self, data): @@ -35,13 +50,15 @@ def _reverse_transform(self, data): class DummyTransformerNotMLReady(BaseTransformer): - INPUT_SDTYPE = 'datetime' def 
__init__(self): super().__init__() self.output_properties = { - None: {'sdtype': 'datetime', 'next_transformer': FrequencyEncoder()} + None: { + 'sdtype': 'datetime', + 'next_transformer': FrequencyEncoder(), + } } def _fit(self, data): @@ -65,7 +82,8 @@ def _fit(self, data): column: { 'sdtype': 'numerical', 'next_transformer': None, - } for column in self.columns + } + for column in self.columns } @classmethod @@ -96,64 +114,79 @@ def get_input_data(): '2010-01-01', '2010-01-01', ]) - data = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], - 'categorical': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, False, False, True, False, False, True, False], - 'datetime': datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + data = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], + 'categorical': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'a'], + 'bool': [False, False, False, True, False, False, True, False], + 'datetime': datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) return data def get_transformed_data(): datetimes = [ - 1.264982e+18, - 1.264982e+18, - 1.262304e+18, - 1.262304e+18, - 1.262304e+18, - 1.264982e+18, - 1.262304e+18, - 1.262304e+18, + 1.264982e18, + 1.264982e18, + 1.262304e18, + 1.262304e18, + 1.262304e18, + 1.264982e18, + 1.262304e18, + 1.262304e18, ] - return pd.DataFrame({ - 'integer': [1., 2., 1., 3., 1., 4., 2., 3.], - 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], - 'categorical': [ - 0.239836, - 0.233842, - 0.654524, - 0.994903, - 0.371298, - 0.659559, - 0.270355, - 0.120638, - ], - 'bool': [ - 0.667087, - 0.238123, - 0.345841, - 0.842023, - 0.478896, - 0.495079, - 0.775272, - 0.675913, - ], - 'datetime': datetimes, - 'names': [ - 0.159704, - 0.684242, - 0.719619, - 0.458355, 
- 0.536445, - 0.991478, - 0.078868, - 0.575187, - ] - }, index=TEST_DATA_INDEX) + return pd.DataFrame( + { + 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], + 'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3], + 'categorical': [ + 0.239836, + 0.233842, + 0.654524, + 0.994903, + 0.371298, + 0.659559, + 0.270355, + 0.120638, + ], + 'bool': [ + 0.667087, + 0.238123, + 0.345841, + 0.842023, + 0.478896, + 0.495079, + 0.775272, + 0.675913, + ], + 'datetime': datetimes, + 'names': [ + 0.159704, + 0.684242, + 0.719619, + 0.458355, + 0.536445, + 0.991478, + 0.078868, + 0.575187, + ], + }, + index=TEST_DATA_INDEX, + ) def get_reversed_data(): @@ -189,14 +222,35 @@ def test_default_inputs(self): '2010-01-01', '2010-01-01', ]) - data = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, np.nan, 0.3], - 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, np.nan, False, True, False, np.nan, True, False], - 'datetime': datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + data = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, np.nan, 0.3], + 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], + 'bool': [ + False, + np.nan, + False, + True, + False, + np.nan, + True, + False, + ], + 'datetime': datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) # Run ht = HyperTransformer() @@ -206,50 +260,62 @@ def test_default_inputs(self): reverse_transformed = ht.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], - 'float': [0.1, 0.2, 0.1, 0.20000000000000004, 0.1, 0.4, 0.20000000000000004, 0.3], - 'categorical': [ - 0.239836, - 0.233842, - 0.634841, - 0.996602, - 0.371298, - 0.773039, - 0.270355, - 0.120638, - ], - 'bool': [ - 
0.444725, - 0.579374, - 0.230561, - 0.842023, - 0.319264, - 0.665026, - 0.775272, - 0.450609, - ], - 'datetime': [ - 1.2630692571428572e+18, - 1.2649824e+18, - 1.262304e+18, - 1.262304e+18, - 1.262304e+18, - 1.2649824e+18, - 1.262304e+18, - 1.262304e+18 - ], - 'names': [ - 0.159704, - 0.684242, - 0.719619, - 0.458355, - 0.536445, - 0.991478, - 0.078868, - 0.575187, - ] - }, index=TEST_DATA_INDEX) + expected_transformed = pd.DataFrame( + { + 'integer': [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 2.0, 3.0], + 'float': [ + 0.1, + 0.2, + 0.1, + 0.20000000000000004, + 0.1, + 0.4, + 0.20000000000000004, + 0.3, + ], + 'categorical': [ + 0.239836, + 0.233842, + 0.634841, + 0.996602, + 0.371298, + 0.773039, + 0.270355, + 0.120638, + ], + 'bool': [ + 0.444725, + 0.579374, + 0.230561, + 0.842023, + 0.319264, + 0.665026, + 0.775272, + 0.450609, + ], + 'datetime': [ + 1.2630692571428572e18, + 1.2649824e18, + 1.262304e18, + 1.262304e18, + 1.262304e18, + 1.2649824e18, + 1.262304e18, + 1.262304e18, + ], + 'names': [ + 0.159704, + 0.684242, + 0.719619, + 0.458355, + 0.536445, + 0.991478, + 0.078868, + 0.575187, + ], + }, + index=TEST_DATA_INDEX, + ) pd.testing.assert_frame_equal(transformed, expected_transformed) reversed_datetimes = pd.to_datetime([ @@ -262,23 +328,35 @@ def test_default_inputs(self): '2010-01-01', '2010-01-01', ]) - expected_reversed = pd.DataFrame({ - 'integer': [1, 2, 1, 3, 1, 4, 2, 3], - 'float': [ - 0.100000, - np.nan, - np.nan, - 0.20000000000000004, - 0.100000, - 0.400000, - np.nan, - 0.300000, - ], - 'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], - 'bool': [False, False, False, True, False, False, True, False], - 'datetime': reversed_datetimes, - 'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'], - }, index=TEST_DATA_INDEX) + expected_reversed = pd.DataFrame( + { + 'integer': [1, 2, 1, 3, 1, 4, 2, 3], + 'float': [ + 0.100000, + np.nan, + np.nan, + 0.20000000000000004, + 0.100000, + 0.400000, + np.nan, + 0.300000, + ], + 'categorical': 
['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'], + 'bool': [False, False, False, True, False, False, True, False], + 'datetime': reversed_datetimes, + 'names': [ + 'Jon', + 'Arya', + 'Arya', + 'Jon', + 'Jon', + 'Sansa', + 'Jon', + 'Jon', + ], + }, + index=TEST_DATA_INDEX, + ) for row in range(reverse_transformed.shape[0]): for column in range(reverse_transformed.shape[1]): expected = expected_reversed.iloc[row, column] @@ -289,7 +367,9 @@ def test_default_inputs(self): assert isinstance(ht.field_transformers['float'], FloatFormatter) assert isinstance(ht.field_transformers['categorical'], UniformEncoder) assert isinstance(ht.field_transformers['bool'], UniformEncoder) - assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder) + assert isinstance( + ht.field_transformers['datetime'], UnixTimestampEncoder + ) assert isinstance(ht.field_transformers['names'], UniformEncoder) get_default_transformers.cache_clear() @@ -323,7 +403,7 @@ def test_field_transformers(self): 'categorical': 'categorical', 'bool': 'boolean', 'datetime': 'datetime', - 'names': 'categorical' + 'names': 'categorical', }, 'transformers': { 'integer': FloatFormatter(missing_value_replacement='mean'), @@ -331,8 +411,8 @@ def test_field_transformers(self): 'categorical': UniformEncoder(), 'bool': UniformEncoder(), 'datetime': DummyTransformerNotMLReady(), - 'names': UniformEncoder() - } + 'names': UniformEncoder(), + }, } data = get_input_data() @@ -347,7 +427,16 @@ def test_field_transformers(self): # Assert expected_transformed = get_transformed_data() - transformed_datetimes = [0.8125, 0.8125, 0.3125, 0.3125, 0.3125, 0.8125, 0.3125, 0.3125] + transformed_datetimes = [ + 0.8125, + 0.8125, + 0.3125, + 0.3125, + 0.3125, + 0.8125, + 0.3125, + 0.3125, + ] expected_transformed['datetime'] = transformed_datetimes pd.testing.assert_frame_equal(transformed, expected_transformed) @@ -358,15 +447,13 @@ def test_single_category(self): """Test that categorical variables with a single value are 
supported.""" # Setup ht = HyperTransformer() - data = pd.DataFrame({ - 'a': ['a', 'a', 'a'] - }) + data = pd.DataFrame({'a': ['a', 'a', 'a']}) # Run ht.detect_initial_config(data) - ht.update_transformers(column_name_to_transformer={ - 'a': OneHotEncoder() - }) + ht.update_transformers( + column_name_to_transformer={'a': OneHotEncoder()} + ) ht.fit(data) transformed = ht.transform(data) reverse = ht.reverse_transform(transformed) @@ -387,7 +474,7 @@ def test_categorical_encoders_with_booleans(self): 'email_confirmed': FrequencyEncoder(), 'subscribed': OneHotEncoder(), 'paid': LabelEncoder(), - } + }, } ht = HyperTransformer() @@ -456,8 +543,10 @@ def test_multiple_fits_different_data(self): reverse2 = ht.reverse_transform(transformed2) # Assert - expected_transformed = pd.DataFrame( - {'col2': [1., 2., 3.], 'col1': [1.0, 0.0, 0.0]}) + expected_transformed = pd.DataFrame({ + 'col2': [1.0, 2.0, 3.0], + 'col1': [1.0, 0.0, 0.0], + }) pd.testing.assert_frame_equal(transformed1, expected_transformed) pd.testing.assert_frame_equal(transformed2, expected_transformed) pd.testing.assert_frame_equal(reverse1, new_data) @@ -484,8 +573,10 @@ def test_multiple_fits_different_columns(self): reverse2 = ht.reverse_transform(transformed2) # Assert - expected_transformed = pd.DataFrame( - {'col3': [1., 2., 3.], 'col4': [1.0, 0.0, 0.0]}) + expected_transformed = pd.DataFrame({ + 'col3': [1.0, 2.0, 3.0], + 'col4': [1.0, 0.0, 0.0], + }) pd.testing.assert_frame_equal(transformed1, expected_transformed) pd.testing.assert_frame_equal(transformed2, expected_transformed) pd.testing.assert_frame_equal(reverse1, new_data) @@ -502,10 +593,12 @@ def test_multiple_fits_with_set_config(self): # Run ht.detect_initial_config(data) - ht.set_config(config={ - 'sdtypes': {'integer': 'categorical'}, - 'transformers': {'integer': FrequencyEncoder()} - }) + ht.set_config( + config={ + 'sdtypes': {'integer': 'categorical'}, + 'transformers': {'integer': FrequencyEncoder()}, + } + ) ht.fit(data) 
transformed1 = ht.transform(data) reverse1 = ht.reverse_transform(transformed1) @@ -533,10 +626,12 @@ def test_multiple_detect_configs_with_set_config(self): transformed1 = ht.transform(data) reverse1 = ht.reverse_transform(transformed1) - ht.set_config(config={ - 'sdtypes': {'integers': 'categorical'}, - 'transformers': {'integers': FrequencyEncoder()} - }) + ht.set_config( + config={ + 'sdtypes': {'integers': 'categorical'}, + 'transformers': {'integers': FrequencyEncoder()}, + } + ) ht.detect_initial_config(data) ht.fit(data) @@ -743,7 +838,7 @@ def test_transform_subset(self): transformed = ht.transform_subset(subset) # Assert - expected = pd.DataFrame({'col1': [1., 2.]}) + expected = pd.DataFrame({'col1': [1.0, 2.0]}) pd.testing.assert_frame_equal(transformed, expected) def test_reverse_transform_subset(self): @@ -797,14 +892,13 @@ def test_with_multiple_supported_sdtypes(self): data = pd.DataFrame({ 'user': ['John', 'Doe', 'John Doe', 'Doe John'], 'id': list(range(4)), - 'subscribed': [True, False, True, False] + 'subscribed': [True, False, True, False], }) ht = HyperTransformer() ht.detect_initial_config(data) ht.update_transformers_by_sdtype( - sdtype='boolean', - transformer=FrequencyEncoder(add_noise=True) + sdtype='boolean', transformer=FrequencyEncoder(add_noise=True) ) # Run @@ -842,27 +936,26 @@ def test_reverse_transform_subset_and_generators(self): ht.detect_initial_config(customers) # credit_card and id are pii and text columns - ht.update_sdtypes({ - 'credit_card': 'pii', - 'id': 'text' - }) + ht.update_sdtypes({'credit_card': 'pii', 'id': 'text'}) ht.update_transformers({ 'credit_card': AnonymizedFaker(), - 'id': RegexGenerator(regex_format='id_[a-z]') + 'id': RegexGenerator(regex_format='id_[a-z]'), }) # Run ht.fit(customers) transformed = ht.transform(customers) - reverse_transformed = ht.reverse_transform_subset(transformed[['last_login']]) + reverse_transformed = ht.reverse_transform_subset( + transformed[['last_login']] + ) # Assert 
expected_transformed_columns = [ 'last_login', 'email_optin', 'age', - 'dollars_spent' + 'dollars_spent', ] assert all(expected_transformed_columns == transformed.columns) assert reverse_transformed.columns == ['last_login'] @@ -874,9 +967,7 @@ def test_set_config_with_supported_sdtypes(self): 'transformers': { 'boolean_col': FrequencyEncoder(add_noise=True), }, - 'sdtypes': { - 'boolean_col': 'boolean' - } + 'sdtypes': {'boolean_col': 'boolean'}, } ht = HyperTransformer() @@ -889,12 +980,12 @@ def test_chained_transformers(self): When the specified transformer indicates a next transformer, they should each be applied in order during the transform step, and then reversed during the reverse_transform. """ + # Setup class DoublingTransformer(BaseTransformer): INPUT_SDTYPE = 'numerical' - def _fit(self, data): - ... + def _fit(self, data): ... def _transform(self, data): return data * 2 @@ -909,17 +1000,17 @@ def _reverse_transform(self, data): transformer1.output_properties[None]['next_transformer'] = transformer2 ht = HyperTransformer() - data = pd.DataFrame({'col': [1., 2, -1, 3, 1]}) + data = pd.DataFrame({'col': [1.0, 2, -1, 3, 1]}) # Run and Assert ht.set_config({ 'sdtypes': {'col': 'numerical'}, - 'transformers': {'col': transformer1} + 'transformers': {'col': transformer1}, }) ht.fit(data) transformed = ht.transform(data) - expected_transform = pd.DataFrame({'col': [8., 16, -8, 24, 8]}) + expected_transform = pd.DataFrame({'col': [8.0, 16, -8, 24, 8]}) pd.testing.assert_frame_equal(transformed, expected_transform) reverse_transformed = ht.reverse_transform(transformed) @@ -931,6 +1022,7 @@ def test_chained_transformers_various_transformers(self): When the specified transformer indicates a next transformer, they should each be applied in order during the transform step, and then reversed during the reverse_transform. 
""" + # Setup class AB(BaseTransformer): INPUT_SDTYPE = 'categorical' @@ -950,7 +1042,9 @@ def _transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.a'].str[:-1] + new_data[f'{self.column_prefix}'] = data[ + f'{self.column_prefix}.a' + ].str[:-1] return new_data class CD(BaseTransformer): @@ -959,7 +1053,7 @@ class CD(BaseTransformer): def _fit(self, data): self.output_properties = { 'c': {'sdtype': 'categorical', 'next_transformer': None}, - 'd': {'sdtype': 'categorical', 'next_transformer': E()} + 'd': {'sdtype': 'categorical', 'next_transformer': E()}, } def _transform(self, data): @@ -970,7 +1064,9 @@ def _transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.c'].str[:-1] + new_data[f'{self.column_prefix}'] = data[ + f'{self.column_prefix}.c' + ].str[:-1] return new_data class E(BaseTransformer): @@ -979,7 +1075,7 @@ class E(BaseTransformer): def _fit(self, data): self.output_properties = { None: {'sdtype': 'categorical', 'next_transformer': None}, - 'e': {'sdtype': 'categorical', 'next_transformer': None} + 'e': {'sdtype': 'categorical', 'next_transformer': None}, } def _transform(self, data): @@ -989,20 +1085,26 @@ def _transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.e'].str[:-1] + new_data[f'{self.column_prefix}'] = data[ + f'{self.column_prefix}.e' + ].str[:-1] return new_data ht = HyperTransformer() data = pd.DataFrame({ 'col': ['a', 'b', 'c'], 'col.a': ['1', '2', '3'], - 'col#': ['_', '_', '_'] + 'col#': ['_', '_', '_'], }) # Run and Assert ht.set_config({ - 'sdtypes': {'col': 'categorical', 'col.a': 'categorical', 'col#': 'categorical'}, - 'transformers': {'col': AB(), 'col.a': AB(), 'col#': E()} + 'sdtypes': { + 'col': 'categorical', + 'col.a': 'categorical', + 
'col#': 'categorical', + }, + 'transformers': {'col': AB(), 'col.a': AB(), 'col#': E()}, }) ht.fit(data) transformed = ht.transform(data) @@ -1045,7 +1147,10 @@ def test_field_transformers_correctly_set(self): # if a transformer was set, it should use the provided instance fe = FrequencyEncoder() - ht.set_config({'sdtypes': {'col': 'categorical'}, 'transformers': {'col': fe}}) + ht.set_config({ + 'sdtypes': {'col': 'categorical'}, + 'transformers': {'col': fe}, + }) ht.fit(data) transformer = ht.get_config()['transformers']['col'] assert transformer is fe @@ -1058,7 +1163,9 @@ def test_field_transformers_correctly_set(self): transformer = ht.get_config()['transformers']['col'] assert transformer is fe - ht.update_transformers_by_sdtype('categorical', transformer_name='FrequencyEncoder') + ht.update_transformers_by_sdtype( + 'categorical', transformer_name='FrequencyEncoder' + ) transformer = ht.get_config()['transformers']['col'] transformer.new_attribute3 = 'abc' ht.fit(data) @@ -1076,17 +1183,19 @@ def _get_hyper_transformer_with_random_transformers(self, data): ht.update_sdtypes({ 'credit_card': 'pii', 'name': 'text', - 'signup_day': 'datetime' + 'signup_day': 'datetime', }) ht.update_transformers({ - 'credit_card': AnonymizedFaker('credit_card', 'credit_card_number'), + 'credit_card': AnonymizedFaker( + 'credit_card', 'credit_card_number' + ), 'balance': ClusterBasedNormalizer(max_clusters=3), - 'name': RegexGenerator() + 'name': RegexGenerator(), }) ht.update_transformers_by_sdtype( 'categorical', transformer_name='FrequencyEncoder', - transformer_parameters={'add_noise': True} + transformer_parameters={'add_noise': True}, ) return ht @@ -1103,12 +1212,24 @@ def test_reset_randomization(self): """ # Setup data = pd.DataFrame({ - 'credit_card': ['123456789', '987654321', '192837645', '918273465', '123789456'], + 'credit_card': [ + '123456789', + '987654321', + '192837645', + '918273465', + '123789456', + ], 'age': [18, 25, 54, 60, 31], 'name': ['Bob', 'Jane', 
'Jack', 'Jill', 'Joe'], - 'signup_day': ['1/1/2020', np.nan, '4/1/2019', '12/1/2008', '5/16/2016'], + 'signup_day': [ + '1/1/2020', + np.nan, + '4/1/2019', + '12/1/2008', + '5/16/2016', + ], 'balance': [250, 5400, 150000, np.nan, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) ht1 = self._get_hyper_transformer_with_random_transformers(data) ht2 = self._get_hyper_transformer_with_random_transformers(data) @@ -1117,14 +1238,18 @@ def test_reset_randomization(self): expected_first_transformed = pd.DataFrame({ 'age': [18.0, 25.0, 54.0, 60.0, 31.0], 'signup_day': [ - 1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18 + 1.5778368e18, + 1.45584e18, + 1.5540768e18, + 1.2280896e18, + 1.4633568e18, ], 'balance.normalized': [ -2.693016e-01, -2.467182e-01, 3.873711e-01, 9.571797e-17, - 1.286486e-01 + 1.286486e-01, ], 'balance.component': [0.0, 0, 0, 0, 0], 'card_type': [ @@ -1133,19 +1258,23 @@ def test_reset_randomization(self): 0.639794, 0.862760, 0.263703, - ] + ], }) expected_second_transformed = pd.DataFrame({ 'age': [18.0, 25.0, 54.0, 60.0, 31.0], 'signup_day': [ - 1.5778368e+18, 1.45584e+18, 1.5540768e+18, 1.2280896e+18, 1.4633568e+18 + 1.5778368e18, + 1.45584e18, + 1.5540768e18, + 1.2280896e18, + 1.4633568e18, ], 'balance.normalized': [ -2.693016e-01, -2.467182e-01, 3.873711e-01, 9.571797e-17, - 1.286486e-01 + 1.286486e-01, ], 'balance.component': [0.0, 0, 0, 0, 0], 'card_type': [ @@ -1154,7 +1283,7 @@ def test_reset_randomization(self): 0.714735, 0.939781, 0.251442, - ] + ], }) ht1.fit(data) @@ -1163,9 +1292,15 @@ def test_reset_randomization(self): first_transformed2 = ht2.transform(data) second_transformed1 = ht1.transform(data) - pd.testing.assert_frame_equal(first_transformed1, expected_first_transformed) - pd.testing.assert_frame_equal(first_transformed2, expected_first_transformed) - pd.testing.assert_frame_equal(second_transformed1, 
expected_second_transformed) + pd.testing.assert_frame_equal( + first_transformed1, expected_first_transformed + ) + pd.testing.assert_frame_equal( + first_transformed2, expected_first_transformed + ) + pd.testing.assert_frame_equal( + second_transformed1, expected_second_transformed + ) # test reverse transforming multiple times with different tranformers expected_first_reverse = pd.DataFrame({ @@ -1178,9 +1313,15 @@ def test_reset_randomization(self): ], 'age': [18, 25, 54, 60, 31], 'name': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'], - 'signup_day': ['01/01/2020', '02/19/2016', '04/01/2019', np.nan, np.nan], + 'signup_day': [ + '01/01/2020', + '02/19/2016', + '04/01/2019', + np.nan, + np.nan, + ], 'balance': [250, 5400, 150000, 61662.5, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) expected_second_reverse = pd.DataFrame({ 'credit_card': [ @@ -1192,9 +1333,15 @@ def test_reset_randomization(self): ], 'age': [18, 25, 54, 60, 31], 'name': ['AAAAF', 'AAAAG', 'AAAAH', 'AAAAI', 'AAAAJ'], - 'signup_day': ['01/01/2020', np.nan, '04/01/2019', '12/01/2008', np.nan], + 'signup_day': [ + '01/01/2020', + np.nan, + '04/01/2019', + '12/01/2008', + np.nan, + ], 'balance': [np.nan, 5400, np.nan, 61662.5, 91000], - 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'] + 'card_type': ['Visa', 'Visa', 'Master Card', 'Amex', 'Visa'], }) first_reverse1 = ht1.reverse_transform(first_transformed1) first_reverse2 = ht2.reverse_transform(first_transformed1) @@ -1207,7 +1354,9 @@ def test_reset_randomization(self): ht1.reset_randomization() transformed_post_reset = ht1.reverse_transform(first_transformed1) - pd.testing.assert_frame_equal(transformed_post_reset, expected_first_reverse) + pd.testing.assert_frame_equal( + transformed_post_reset, expected_first_reverse + ) def test_cluster_based_normalizer_randomization(self): """Test that the ``ClusterBasedNormalizer`` handles randomization 
correctly. @@ -1220,20 +1369,18 @@ def test_cluster_based_normalizer_randomization(self): data = get_demo(100) ht = HyperTransformer() ht.detect_initial_config(data) - ht.update_transformers({ - 'age': ClusterBasedNormalizer() - }) + ht.update_transformers({'age': ClusterBasedNormalizer()}) ht.fit(data) transformed1 = ht.transform(data) transformed2 = ht.transform(data) - assert any(transformed1['age.normalized'] != transformed2['age.normalized']) + assert any( + transformed1['age.normalized'] != transformed2['age.normalized'] + ) ht2 = HyperTransformer() ht2.detect_initial_config(data) - ht2.update_transformers({ - 'age': ClusterBasedNormalizer() - }) + ht2.update_transformers({'age': ClusterBasedNormalizer()}) ht2.fit(data) pd.testing.assert_frame_equal(transformed1, ht2.transform(data)) @@ -1252,29 +1399,38 @@ def test_anonymized_faker(self): # Run - simple run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'id2': AnonymizedFaker() + 'id2': AnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) reverse_transformed1 = ht.reverse_transform(transformed) # Assert - assert reverse_transformed1['id1'].tolist() != reverse_transformed1['id2'].tolist() + assert ( + reverse_transformed1['id1'].tolist() + != reverse_transformed1['id2'].tolist() + ) # Run - make sure transforming again returns different values than the original transform transformed = ht.transform(data) reverse_transformed2 = ht.reverse_transform(transformed) # Assert - assert reverse_transformed2['id1'].tolist() != reverse_transformed2['id2'].tolist() - assert reverse_transformed1['id1'].tolist() != reverse_transformed2['id1'].tolist() - assert reverse_transformed1['id2'].tolist() != reverse_transformed2['id2'].tolist() + assert ( + reverse_transformed2['id1'].tolist() + != reverse_transformed2['id2'].tolist() + ) + assert ( + 
reverse_transformed1['id1'].tolist() + != reverse_transformed2['id1'].tolist() + ) + assert ( + reverse_transformed1['id2'].tolist() + != reverse_transformed2['id2'].tolist() + ) # Run - make sure resetting randomization works ht.reset_randomization() @@ -1282,7 +1438,9 @@ def test_anonymized_faker(self): reverse_transformed3 = ht.reverse_transform(transformed) # Assert - pd.testing.assert_frame_equal(reverse_transformed1, reverse_transformed3) + pd.testing.assert_frame_equal( + reverse_transformed1, reverse_transformed3 + ) def test_anonymized_faker_text(self): """Test ``AnonymizedFaker`` with text column.""" @@ -1295,20 +1453,19 @@ def test_anonymized_faker_text(self): # Run - simple run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'info': 'text' - }) + ht.update_sdtypes({'id1': 'pii', 'info': 'text'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'info': AnonymizedFaker() + 'info': AnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) reverse_transformed = ht.reverse_transform(transformed) # Assert - assert all(reverse_transformed['info'].apply(lambda s: isinstance(s, str))) + assert all( + reverse_transformed['info'].apply(lambda s: isinstance(s, str)) + ) def test_pseudo_anonymized_faker(self): """Test ``PseudoAnonymizedFaker`` generates different values for different columns.""" @@ -1321,38 +1478,38 @@ def test_pseudo_anonymized_faker(self): # Run ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': PseudoAnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) reverse_transformed1 = ht.reverse_transform(transformed) # Assert - assert reverse_transformed1['id1'].tolist() != reverse_transformed1['id2'].tolist() + assert ( + reverse_transformed1['id1'].tolist() + != reverse_transformed1['id2'].tolist() + ) # Run - run it 
again on the exact same data ht = HyperTransformer() ht.detect_initial_config(data) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': PseudoAnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data) transformed = ht.transform(data) reverse_transformed2 = ht.reverse_transform(transformed) # Assert - different instances of the same transformer should return the same result - assert reverse_transformed1['id1'].tolist() == reverse_transformed2['id1'].tolist() + assert ( + reverse_transformed1['id1'].tolist() + == reverse_transformed2['id1'].tolist() + ) def test_anonymized_faker_different_tables(self): """Test ``AnonymizedFaker`` generates different values for columns with same name.""" @@ -1369,13 +1526,10 @@ def test_anonymized_faker_different_tables(self): # Run on data1 ht.detect_initial_config(data1) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data1) transformed = ht.transform(data1) @@ -1383,21 +1537,24 @@ def test_anonymized_faker_different_tables(self): # Run on data2 ht.detect_initial_config(data2) - ht.update_sdtypes({ - 'id1': 'pii', - 'id2': 'pii' - }) + ht.update_sdtypes({'id1': 'pii', 'id2': 'pii'}) ht.update_transformers({ 'id1': AnonymizedFaker(), - 'id2': PseudoAnonymizedFaker() + 'id2': PseudoAnonymizedFaker(), }) ht.fit(data2) transformed = ht.transform(data2) reverse_transformed2 = ht.reverse_transform(transformed) # Assert - assert reverse_transformed1['id1'].tolist() != reverse_transformed2['id1'].tolist() - assert reverse_transformed1['id2'].tolist() != reverse_transformed2['id2'].tolist() + assert ( + reverse_transformed1['id1'].tolist() + != reverse_transformed2['id1'].tolist() + ) + assert ( + 
reverse_transformed1['id2'].tolist() + != reverse_transformed2['id2'].tolist() + ) def test_random_seed(self): # Setup @@ -1408,10 +1565,30 @@ def test_random_seed(self): 'num4': [1, np.nan, 2] * 10, 'num5': [1, np.nan, 2] * 10, 'num6': [1, np.nan, 2] * 10, - 'date1': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date2': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date3': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, - 'date4': [np.datetime64('2020-10-10'), np.datetime64('2021-11-11'), np.nan] * 10, + 'date1': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date2': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date3': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, + 'date4': [ + np.datetime64('2020-10-10'), + np.datetime64('2021-11-11'), + np.nan, + ] + * 10, }) ht = HyperTransformer() @@ -1435,11 +1612,26 @@ def test_random_seed(self): reversed1 = ht.reverse_transform(transformed1) # Assert - assert reversed1['num1'].isna().tolist() != reversed1['num2'].isna().tolist() - assert reversed1['num3'].isna().tolist() != reversed1['num4'].isna().tolist() - assert reversed1['num5'].isna().tolist() != reversed1['num6'].isna().tolist() - assert reversed1['date1'].isna().tolist() != reversed1['date2'].isna().tolist() - assert reversed1['date3'].isna().tolist() != reversed1['date4'].isna().tolist() + assert ( + reversed1['num1'].isna().tolist() + != reversed1['num2'].isna().tolist() + ) + assert ( + reversed1['num3'].isna().tolist() + != reversed1['num4'].isna().tolist() + ) + assert ( + reversed1['num5'].isna().tolist() + != reversed1['num6'].isna().tolist() + ) + assert ( + reversed1['date1'].isna().tolist() + != reversed1['date2'].isna().tolist() + ) + assert ( + reversed1['date3'].isna().tolist() + != reversed1['date4'].isna().tolist() + ) # Run 
ht.reset_randomization() @@ -1455,18 +1647,18 @@ def test_hypertransformer_with_mutli_column_transformer_end_to_end(self): data_test = pd.DataFrame({ 'A': ['1.0', '2.0', '3.0'], 'B': ['4.0', '5.0', '6.0'], - 'C': [True, False, True] + 'C': [True, False, True], }) dict_config = { 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { ('A', 'B'): DummyMultiColumnTransformerNumerical(), - 'C': UniformEncoder() - } + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1480,31 +1672,39 @@ def test_hypertransformer_with_mutli_column_transformer_end_to_end(self): expected_transformed_data = pd.DataFrame({ 'A': [1.0, 2.0, 3.0], 'B': [4.0, 5.0, 6.0], - 'C': [0.10333535312718026, 0.6697388922326716, 0.18775548909503287] + 'C': [ + 0.10333535312718026, + 0.6697388922326716, + 0.18775548909503287, + ], }) - pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) + pd.testing.assert_frame_equal( + transformed_data, expected_transformed_data + ) pd.testing.assert_frame_equal(reverse_transformed_data, data_test) - def test_hypertransformer_with_mutli_column_transformer_and_single_column(self): + def test_hypertransformer_with_mutli_column_transformer_and_single_column( + self, + ): """Test a mutli column transformer used with for a single column.""" # Setup data_test = pd.DataFrame({ 'A': ['1.0', '2.0', '3.0'], 'B2': ['4.0', '5.0', '6.0'], - 'C': [True, False, True] + 'C': [True, False, True], }) dict_config = { 'sdtypes': { 'A': 'categorical', 'B2': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'A': DummyMultiColumnTransformerNumerical(), - ('B2', ): DummyMultiColumnTransformerNumerical(), - 'C': UniformEncoder() - } + ('B2',): DummyMultiColumnTransformerNumerical(), + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1518,10 +1718,16 @@ def 
test_hypertransformer_with_mutli_column_transformer_and_single_column(self): expected_transformed_data = pd.DataFrame({ 'A': [1.0, 2.0, 3.0], 'B2': [4.0, 5.0, 6.0], - 'C': [0.04206197607326308, 0.8000968077312287, 0.06325519846695522] + 'C': [ + 0.04206197607326308, + 0.8000968077312287, + 0.06325519846695522, + ], }) - pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) + pd.testing.assert_frame_equal( + transformed_data, expected_transformed_data + ) pd.testing.assert_frame_equal(reverse_transformed_data, data_test) def test_update_transformers_single_to_multi_column(self): @@ -1531,13 +1737,13 @@ def test_update_transformers_single_to_multi_column(self): 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'A': None, 'B': UniformEncoder(), - 'C': UniformEncoder() - } + 'C': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1554,12 +1760,12 @@ def test_update_transformers_single_to_multi_column(self): 'sdtypes': { 'A': 'categorical', 'B': 'categorical', - 'C': 'boolean' + 'C': 'boolean', }, 'transformers': { 'C': UniformEncoder(), ('A', 'B'): DummyMultiColumnTransformerNumerical(), - } + }, }) expected_multi_columns = { @@ -1580,13 +1786,13 @@ def test_update_transformers_multi_to_single_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1596,7 +1802,7 @@ def test_update_transformers_multi_to_single_column(self): # Run ht.update_transformers({ ('A', 'B'): DummyMultiColumnTransformerNumerical(), - 'D': UniformEncoder() + 'D': UniformEncoder(), }) new_config = ht.get_config() @@ -1607,14 +1813,14 @@ def test_update_transformers_multi_to_single_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 
'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'E': UniformEncoder(), "('A', 'B')": DummyMultiColumnTransformerNumerical(), 'C': DummyMultiColumnTransformerNumerical(), - 'D': UniformEncoder() - } + 'D': UniformEncoder(), + }, }) expected_multi_columns = { @@ -1633,13 +1839,13 @@ def test_update_transformers_by_sdtype_mutli_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1647,7 +1853,9 @@ def test_update_transformers_by_sdtype_mutli_column(self): ht.set_config(config) # Run - ht.update_transformers_by_sdtype('boolean', transformer_name='LabelEncoder') + ht.update_transformers_by_sdtype( + 'boolean', transformer_name='LabelEncoder' + ) new_config = ht.get_config() # Assert @@ -1657,19 +1865,16 @@ def test_update_transformers_by_sdtype_mutli_column(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), 'C': LabelEncoder(), - "('B', 'D')": DummyMultiColumnTransformerNumerical() - } + "('B', 'D')": DummyMultiColumnTransformerNumerical(), + }, }) - expected_multi_columns = { - 'B': ('B', 'D'), - 'D': ('B', 'D') - } + expected_multi_columns = {'B': ('B', 'D'), 'D': ('B', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == expected_multi_columns @@ -1683,13 +1888,13 @@ def test_remove_transformer(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) ht = HyperTransformer() @@ -1706,19 +1911,16 @@ 
def test_remove_transformer(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), "('C', 'D')": DummyMultiColumnTransformerNumerical(), - 'B': None - } + 'B': None, + }, }) - exepected_multi_columns = { - 'C': ('C', 'D'), - 'D': ('C', 'D') - } + exepected_multi_columns = {'C': ('C', 'D'), 'D': ('C', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == exepected_multi_columns @@ -1732,13 +1934,13 @@ def test_remove_transformer_by_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1756,14 +1958,14 @@ def test_remove_transformer_by_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), 'E': UniformEncoder(), "('B', 'D')": DummyMultiColumnTransformerNumerical(), - 'C': None - } + 'C': None, + }, }) assert repr(new_config) == repr(expected_config) @@ -1777,13 +1979,13 @@ def test_update_sdtype(self): 'B': 'categorical', 'C': 'boolean', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', }, 'transformers': { 'A': UniformEncoder(), ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) @@ -1791,10 +1993,7 @@ def test_update_sdtype(self): ht.set_config(config) # Run - ht.update_sdtypes({ - 'C': 'numerical', - 'A': 'numerical' - }) + ht.update_sdtypes({'C': 'numerical', 'A': 'numerical'}) new_config = ht.get_config() # Assert @@ -1804,19 +2003,16 @@ def test_update_sdtype(self): 'B': 'categorical', 'C': 'numerical', 'D': 'categorical', - 'E': 'categorical' + 'E': 
'categorical', }, 'transformers': { 'A': FloatFormatter(), 'E': UniformEncoder(), "('B', 'D')": DummyMultiColumnTransformerNumerical(), - 'C': FloatFormatter() - } + 'C': FloatFormatter(), + }, }) - expected_multi_columns = { - 'B': ('B', 'D'), - 'D': ('B', 'D') - } + expected_multi_columns = {'B': ('B', 'D'), 'D': ('B', 'D')} assert repr(new_config) == repr(expected_config) assert ht._multi_column_fields == expected_multi_columns @@ -1826,17 +2022,15 @@ def test_with_tuple_returned_by_faker(self): # Setup ht = HyperTransformer() ht.set_config({ - 'sdtypes': { - 'A': 'pii' - }, + 'sdtypes': {'A': 'pii'}, 'transformers': { - 'A': AnonymizedFaker(provider_name='currency', function_name='currency') - } + 'A': AnonymizedFaker( + provider_name='currency', function_name='currency' + ) + }, }) - ht.fit(pd.DataFrame({ - 'A': ['a', 'b', 'c'] - })) + ht.fit(pd.DataFrame({'A': ['a', 'b', 'c']})) # Run result = ht.create_anonymized_columns(num_rows=10, column_names=['A']) @@ -1844,10 +2038,16 @@ def test_with_tuple_returned_by_faker(self): # Assert expected_results = pd.DataFrame({ 'A': [ - 'KHR, Cambodian riel', 'TVD, Tuvaluan dollar', 'PKR, Pakistani rupee', - 'SVC, Salvadoran colón', 'CVE, Cape Verdean escudo', 'BRL, Brazilian real', - 'RWF, Rwandan franc', 'KZT, Kazakhstani tenge', 'HRK, Croatian kuna', - 'ILS, Israeli new shekel' + 'KHR, Cambodian riel', + 'TVD, Tuvaluan dollar', + 'PKR, Pakistani rupee', + 'SVC, Salvadoran colón', + 'CVE, Cape Verdean escudo', + 'BRL, Brazilian real', + 'RWF, Rwandan franc', + 'KZT, Kazakhstani tenge', + 'HRK, Croatian kuna', + 'ILS, Israeli new shekel', ] }) pd.testing.assert_frame_equal(result, expected_results) @@ -1858,7 +2058,7 @@ def test_with_tuple_returned_by_faker(self): 'B': 'categorical', 'D': 'categorical', 'E': 'categorical', - 'C': 'boolean' + 'C': 'boolean', } } expected_transformer_update = { @@ -1867,7 +2067,7 @@ def test_with_tuple_returned_by_faker(self): 'E': UniformEncoder(), 'C': UniformEncoder(), 'B': 
UniformEncoder(), - 'D': UniformEncoder() + 'D': UniformEncoder(), } } expected_transformer_remove = { @@ -1876,41 +2076,47 @@ def test_with_tuple_returned_by_faker(self): 'E': UniformEncoder(), 'C': None, 'B': UniformEncoder(), - 'D': UniformEncoder() + 'D': UniformEncoder(), } } - expected_update = { - **expected_sdtype, - **expected_transformer_update - } - expected_remove = { - **expected_sdtype, - **expected_transformer_remove - } + expected_update = {**expected_sdtype, **expected_transformer_update} + expected_remove = {**expected_sdtype, **expected_transformer_remove} parametrization = [ ( - 'update_transformers', {'column_name_to_transformer': {'C': UniformEncoder()}}, - expected_update + 'update_transformers', + {'column_name_to_transformer': {'C': UniformEncoder()}}, + expected_update, ), ( 'update_transformers_by_sdtype', - {'sdtype': 'boolean', 'transformer': UniformEncoder()}, expected_update + {'sdtype': 'boolean', 'transformer': UniformEncoder()}, + expected_update, ), ('remove_transformers', {'column_names': 'C'}, expected_remove), - ('remove_transformers_by_sdtype', {'sdtype': 'boolean'}, expected_remove), + ( + 'remove_transformers_by_sdtype', + {'sdtype': 'boolean'}, + expected_remove, + ), ] - @pytest.mark.parametrize(('method_name', 'method_input', 'expected_result'), parametrization) - def test_invalid_multi_column(self, method_name, method_input, expected_result): + @pytest.mark.parametrize( + ('method_name', 'method_input', 'expected_result'), parametrization + ) + def test_invalid_multi_column( + self, method_name, method_input, expected_result + ): """Test the ``update`` and ``remove`` methods with invalid multi column transformer. When a multi column is no longer valid, all these methods should raise a warning and assign the default transformer to the columns. 
""" - # Setup - class BadDummyMultiColumnTransformer(DummyMultiColumnTransformerNumerical): + # Setup + class BadDummyMultiColumnTransformer( + DummyMultiColumnTransformerNumerical + ): @classmethod def _validate_sdtypes(cls, columns_to_sdtype): raise TransformerInputError('Invalid sdtype') @@ -1926,8 +2132,8 @@ def _validate_sdtypes(cls, columns_to_sdtype): 'transformers': { 'A': UniformEncoder(), ('B', 'D', 'C'): BadDummyMultiColumnTransformer(), - 'E': UniformEncoder() - } + 'E': UniformEncoder(), + }, } config = Config(dict_config) diff --git a/tests/integration/test_transformers.py b/tests/integration/test_transformers.py index 5fc4d72a..716a6a5d 100644 --- a/tests/integration/test_transformers.py +++ b/tests/integration/test_transformers.py @@ -16,23 +16,13 @@ TRANSFORMER_ARGS = { 'BinaryEncoder': { 'missing_value_replacement': -1, - 'missing_value_generation': 'from_column' - }, - 'UnixTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'OptimizedTimestampEncoder': { - 'missing_value_generation': 'from_column' - }, - 'FloatFormatter': { - 'missing_value_generation': 'from_column' - }, - 'GaussianNormalizer': { - 'missing_value_generation': 'from_column' - }, - 'ClusterBasedNormalizer': { - 'missing_value_generation': 'from_column' + 'missing_value_generation': 'from_column', }, + 'UnixTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'OptimizedTimestampEncoder': {'missing_value_generation': 'from_column'}, + 'FloatFormatter': {'missing_value_generation': 'from_column'}, + 'GaussianNormalizer': {'missing_value_generation': 'from_column'}, + 'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'}, } # Mapping of rdt sdtype to dtype @@ -68,10 +58,16 @@ def _validate_helper(validator_function, args, steps): def _is_valid_transformer(transformer_name): """Determine if transformer should be tested or not.""" invalid_names = [ - 'IdentityTransformer', 'Dummy', 'OrderedLabelEncoder', 'CustomLabelEncoder', - 
'OrderedUniformEncoder', 'BaseMultiColumnTransformer' + 'IdentityTransformer', + 'Dummy', + 'OrderedLabelEncoder', + 'CustomLabelEncoder', + 'OrderedUniformEncoder', + 'BaseMultiColumnTransformer', ] - return all(invalid_name not in transformer_name for invalid_name in invalid_names) + return all( + invalid_name not in transformer_name for invalid_name in invalid_names + ) def _get_all_transformers(): @@ -101,7 +97,9 @@ def _find_dataset_generators(sdtype, generators): if sdtype is None: primary_generators = [] for primary_sdtype in PRIMARY_SDTYPES: - primary_generators.extend(_find_dataset_generators(primary_sdtype, generators)) + primary_generators.extend( + _find_dataset_generators(primary_sdtype, generators) + ) return primary_generators @@ -110,7 +108,9 @@ def _find_dataset_generators(sdtype, generators): def _validate_dataset_generators(dataset_generators): """Check that the number of dataset generators is greater than zero.""" - assert len(dataset_generators) > 0, 'There are no associated dataset generators.' + assert ( + len(dataset_generators) > 0 + ), 'There are no associated dataset generators.' def _validate_transformed_data(transformer, transformed_data): @@ -119,20 +119,32 @@ def _validate_transformed_data(transformer, transformed_data): transformed_dtypes = transformed_data.dtypes for column, expected_sdtype in expected_sdtypes.items(): - message = f'Column {column} is expected but not found in transformed data.' + message = ( + f'Column {column} is expected but not found in transformed data.' 
+ ) assert column in transformed_data, message - message = f'Column {column} is not the expected sdtype {expected_sdtype}' - assert transformed_dtypes[column].kind in SDTYPE_TO_DTYPES[expected_sdtype], message - - -def _validate_reverse_transformed_data(transformer, reversed_data, input_dtype): + message = ( + f'Column {column} is not the expected sdtype {expected_sdtype}' + ) + assert ( + transformed_dtypes[column].kind + in SDTYPE_TO_DTYPES[expected_sdtype] + ), message + + +def _validate_reverse_transformed_data( + transformer, reversed_data, input_dtype +): """Check that the reverse transformed data is the expected dtype. Expect that the dtype is equal to the dtype of the input data. """ expected_sdtype = transformer.get_supported_sdtypes()[0] message = f'Reverse transformed data is not the expected sdtype {expected_sdtype}' - assert reversed_data.dtypes[TEST_COL].kind in SDTYPE_TO_DTYPES[expected_sdtype], message + assert ( + reversed_data.dtypes[TEST_COL].kind + in SDTYPE_TO_DTYPES[expected_sdtype] + ), message def _test_transformer_with_dataset(transformer_class, input_data, steps): @@ -173,20 +185,30 @@ def _test_transformer_with_dataset(transformer_class, input_data, steps): def _validate_hypertransformer_transformed_data(transformed_data): """Check that the transformed data is not null and of type float.""" - assert transformed_data.notna().all(axis=None), 'Transformed data has nulls.' + assert transformed_data.notna().all( + axis=None + ), 'Transformed data has nulls.' for dtype in transformed_data.dtypes: - assert dtype.kind in SDTYPE_TO_DTYPES['numerical'], 'Transformed data is not numerical.' + assert ( + dtype.kind in SDTYPE_TO_DTYPES['numerical'] + ), 'Transformed data is not numerical.' 
-def _validate_hypertransformer_reverse_transformed_data(transformer, reversed_data): +def _validate_hypertransformer_reverse_transformed_data( + transformer, reversed_data +): """Check that the reverse transformed data has the same dtype as the input.""" expected_sdtype = transformer().get_supported_sdtypes()[0] message = f'Reversed transformed data is not the expected sdtype {expected_sdtype}' - assert reversed_data.dtype.kind in SDTYPE_TO_DTYPES[expected_sdtype], message + assert ( + reversed_data.dtype.kind in SDTYPE_TO_DTYPES[expected_sdtype] + ), message -def _test_transformer_with_hypertransformer(transformer_class, input_data, steps): +def _test_transformer_with_hypertransformer( + transformer_class, input_data, steps +): """Test the given transformer in the hypertransformer. Run the provided transformer using the hypertransformer using the provided @@ -204,31 +226,22 @@ def _test_transformer_with_hypertransformer(transformer_class, input_data, steps transformer_args = TRANSFORMER_ARGS.get(transformer_class.__name__, {}) hypertransformer = HyperTransformer() if transformer_args: - field_transformers = { - TEST_COL: transformer_class(**transformer_args) - } + field_transformers = {TEST_COL: transformer_class(**transformer_args)} else: - field_transformers = { - TEST_COL: transformer_class() - } + field_transformers = {TEST_COL: transformer_class()} sdtypes = {} for field, transformer in field_transformers.items(): sdtypes[field] = transformer.get_supported_sdtypes()[0] - config = { - 'sdtypes': sdtypes, - 'transformers': field_transformers - } + config = {'sdtypes': sdtypes, 'transformers': field_transformers} hypertransformer.set_config(config) hypertransformer.fit(input_data) transformed = hypertransformer.transform(input_data) _validate_helper( - _validate_hypertransformer_transformed_data, - [transformed], - steps + _validate_hypertransformer_transformed_data, [transformed], steps ) out = hypertransformer.reverse_transform(transformed) @@ -259,9 +272,13 
@@ def validate_transformer(transformer, steps=None, subtests=None): data = pd.DataFrame({TEST_COL: dg.generate(DATA_SIZE)}) if subtests: - with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg): + with subtests.test( + msg=f'test_transformer_with_dataset_{dg}', generator=dg + ): _test_transformer_with_dataset(transformer, data, steps) - _test_transformer_with_hypertransformer(transformer, data, steps) + _test_transformer_with_hypertransformer( + transformer, data, steps + ) else: _test_transformer_with_dataset(transformer, data, steps) _test_transformer_with_hypertransformer(transformer, data, steps) diff --git a/tests/integration/transformers/pii/test_anonymizer.py b/tests/integration/transformers/pii/test_anonymizer.py index 66244f68..ecb65765 100644 --- a/tests/integration/transformers/pii/test_anonymizer.py +++ b/tests/integration/transformers/pii/test_anonymizer.py @@ -13,16 +13,14 @@ def test_default_settings(self): """End to end test with the default settings of the ``AnonymizedFaker``.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) instance = AnonymizedFaker() transformed = instance.fit_transform(data, 'username') reverse_transform = instance.reverse_transform(transformed) - expected_transformed = pd.DataFrame({ - 'id': [1, 2, 3, 4, 5] - }) + expected_transformed = pd.DataFrame({'id': [1, 2, 3, 4, 5]}) pd.testing.assert_frame_equal(transformed, expected_transformed) assert len(reverse_transform['username']) == 5 @@ -31,16 +29,14 @@ def test_default_settings_with_locales(self): """End to end test with the default settings and locales of the ``AnonymizedFaker``.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) instance = AnonymizedFaker(locales=['en_US', 'en_CA', 'es_ES']) transformed = instance.fit_transform(data, 'username') reverse_transform = 
instance.reverse_transform(transformed) - expected_transformed = pd.DataFrame({ - 'id': [1, 2, 3, 4, 5] - }) + expected_transformed = pd.DataFrame({'id': [1, 2, 3, 4, 5]}) pd.testing.assert_frame_equal(transformed, expected_transformed) assert len(reverse_transform['username']) == 5 @@ -63,8 +59,8 @@ def test_custom_provider(self): '4149498289355', '213144860944676', '4514775286178', - '213133122335401' - ] + '213133122335401', + ], }) instance = AnonymizedFaker('credit_card', 'credit_card_number') @@ -83,7 +79,7 @@ def test_with_nans(self): """Test with the default settings of the ``AnonymizedFaker`` with ``nan`` values.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = AnonymizedFaker() @@ -102,7 +98,7 @@ def test_with_nans_missing_value_generation_none(self): """End to end test settings missing_value_generation=None.""" data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = AnonymizedFaker(missing_value_generation=None) @@ -127,8 +123,8 @@ def test_custom_provider_with_nans(self): np.nan, '213144860944676', '4514775286178', - '213133122335401' - ] + '213133122335401', + ], }) instance = AnonymizedFaker( @@ -153,9 +149,7 @@ def test_cardinality_rule(self): Also ensure that when we call ``reset_randomization`` the generator will be able to create values again. 
""" - data = pd.DataFrame({ - 'job': np.arange(500) - }) + data = pd.DataFrame({'job': np.arange(500)}) instance = AnonymizedFaker('job', 'job', cardinality_rule='unique') transformed = instance.fit_transform(data, 'job') @@ -178,9 +172,7 @@ def test_cardinality_rule(self): def test_cardinality_rule_match(self): """Test it works with the cardinality rule 'match'.""" # Setup - data = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2] - }) + data = pd.DataFrame({'col': [1, 2, 3, 1, 2]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -193,9 +185,7 @@ def test_cardinality_rule_match(self): def test_cardinality_rule_match_nans(self): """Test it works with the cardinality rule 'match' with nans.""" # Setup - data = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2] - }) + data = pd.DataFrame({'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -212,9 +202,7 @@ def test_cardinality_rule_match_not_enough_unique_values(self): data_fit = pd.DataFrame({ 'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2] }) - data_transform = pd.DataFrame({ - 'col': [1, 1, 1] - }) + data_transform = pd.DataFrame({'col': [1, 1, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -229,12 +217,8 @@ def test_cardinality_rule_match_not_enough_unique_values(self): def test_cardinality_rule_match_too_many_unique(self): """Test it works with the cardinality rule 'match' and more unique values than samples.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, 4, 5, 6] - }) - data_transform = pd.DataFrame({ - 'col': [1, 1, np.nan, 3, 1] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, 4, 5, 6]}) + data_transform = pd.DataFrame({'col': [1, 1, np.nan, 3, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -249,12 +233,8 @@ def test_cardinality_rule_match_too_many_unique(self): def test_cardinality_rule_match_too_many_nans(self): """Test it works with the cardinality rule 'match' and more nans than 
possible to fit.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, np.nan, np.nan, np.nan] - }) - data_transform = pd.DataFrame({ - 'col': [1, 1, 1, 1] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, np.nan, np.nan, np.nan]}) + data_transform = pd.DataFrame({'col': [1, 1, 1, 1]}) instance = AnonymizedFaker(cardinality_rule='match') # Run @@ -273,9 +253,7 @@ def test_enforce_uniqueness_backwards_compatability(self): expected (can happen when previous transformer version is loaded from a pkl file). """ # Setup - data = pd.DataFrame({ - 'job': np.arange(500) - }) + data = pd.DataFrame({'job': np.arange(500)}) instance = AnonymizedFaker('job', 'job', cardinality_rule='match') instance.enforce_uniqueness = True @@ -304,9 +282,7 @@ def test_enforce_uniqueness_backwards_compatability(self): class TestPsuedoAnonymizedFaker: def test_default_settings(self): """End to end test with the default settings of the ``PseudoAnonymizedFaker``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', 'parrot', 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', 'parrot', 'monkey']}) instance = PseudoAnonymizedFaker() @@ -316,17 +292,17 @@ def test_default_settings(self): assert transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) - assert unique_animals.intersection(set(instance._mapping_dict)) == set() + assert ( + unique_animals.intersection(set(instance._mapping_dict)) == set() + ) assert len(reverse_transformed) == len(transformed) == 4 def test_with_nans(self): """Test with the default settings of the ``PseudoAnonymizedFaker`` and ``nans``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', np.nan, 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', np.nan, 'monkey']}) instance = PseudoAnonymizedFaker() @@ -336,17 +312,17 @@ def test_with_nans(self): assert 
transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) - assert unique_animals.intersection(set(instance._mapping_dict)) == set() + assert ( + unique_animals.intersection(set(instance._mapping_dict)) == set() + ) assert len(reverse_transformed) == len(transformed) == 4 def test_with_custom_provider(self): """End to end test with custom settings of the ``PseudoAnonymizedFaker``.""" - data = pd.DataFrame({ - 'animals': ['cat', 'dog', np.nan, 'monkey'] - }) + data = pd.DataFrame({'animals': ['cat', 'dog', np.nan, 'monkey']}) instance = PseudoAnonymizedFaker('credit_card', 'credit_card_number') @@ -356,8 +332,10 @@ def test_with_custom_provider(self): assert transformed.columns == ['animals'] pd.testing.assert_series_equal( reverse_transformed['animals'].map(instance._reverse_mapping_dict), - data['animals'] + data['animals'], ) unique_animals = set(reverse_transformed['animals']) - assert unique_animals.intersection(set(instance._mapping_dict)) == set() + assert ( + unique_animals.intersection(set(instance._mapping_dict)) == set() + ) assert len(reverse_transformed) == len(transformed) == 4 diff --git a/tests/integration/transformers/test_base.py b/tests/integration/transformers/test_base.py index 02a4ef72..afe600d0 100644 --- a/tests/integration/transformers/test_base.py +++ b/tests/integration/transformers/test_base.py @@ -27,9 +27,9 @@ def test_dummy_transformer_series_output(): - The transformed data should be able to reversed to re-produce the input data. 
""" + # Setup class DummyTransformer(BaseTransformer): - INPUT_SDTYPE = 'boolean' def _fit(self, data): @@ -42,9 +42,7 @@ def _reverse_transform(self, data): return data.round() != 0 # Run - data = pd.DataFrame({ - 'bool': [True, False, True, False] - }) + data = pd.DataFrame({'bool': [True, False, True, False]}) transformer = DummyTransformer() transformed = transformer.fit_transform(data, 'bool') @@ -52,9 +50,7 @@ def _reverse_transform(self, data): reverse = transformer.reverse_transform(transformed) # Assert - expected_transform = pd.DataFrame({ - 'bool': [1., 0., 1., 0.] - }) + expected_transform = pd.DataFrame({'bool': [1.0, 0.0, 1.0, 0.0]}) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data) @@ -82,9 +78,9 @@ def test_dummy_transformer_dataframe_output(): - The transformed data should be able to reversed to re-produce the input data. """ + # Setup class DummyTransformer(BaseTransformer): - INPUT_SDTYPE = 'boolean' def __init__(self): @@ -98,13 +94,17 @@ def _fit(self, data): pass def _transform(self, data): - out = pd.DataFrame(dict(zip( - self.output_columns, - [ - data.astype(float).fillna(-1), - data.isna().astype(float) - ] - ))) + out = pd.DataFrame( + dict( + zip( + self.output_columns, + [ + data.astype(float).fillna(-1), + data.isna().astype(float), + ], + ) + ) + ) return out @@ -124,8 +124,8 @@ def _reverse_transform(self, data): # Assert expected_transform = pd.DataFrame({ - 'bool': [1., 0., 1., -1.], - 'bool.null': [0., 0., 0., 1.] 
+ 'bool': [1.0, 0.0, 1.0, -1.0], + 'bool.null': [0.0, 0.0, 0.0, 1.0], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data) @@ -133,6 +133,7 @@ def _reverse_transform(self, data): def test_multi_column_transformer_same_number_of_columns_input_output(): """Test a multi-column transformer when the same of input and output columns.""" + # Setup class AdditionTransformer(BaseMultiColumnTransformer): """This transformer takes 3 columns and return the cumulative sum of each row.""" @@ -140,8 +141,12 @@ class AdditionTransformer(BaseMultiColumnTransformer): def _fit(self, columns_data): self.output_properties = { f'{self.columns[0]}': {'sdtype': 'numerical'}, - f'{self.columns[0]}+{self.columns[1]}': {'sdtype': 'numerical'}, - f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': {'sdtype': 'numerical'} + f'{self.columns[0]}+{self.columns[1]}': { + 'sdtype': 'numerical' + }, + f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': { + 'sdtype': 'numerical' + }, } def _get_prefix(self): @@ -159,13 +164,13 @@ def _reverse_transform(self, data): data_test = pd.DataFrame({ 'col_1': [1, 2, 3], 'col_2': [10, 20, 30], - 'col_3': [100, 200, 300] + 'col_3': [100, 200, 300], }) columns_to_sdtypes = { 'col_1': 'numerical', 'col_2': 'numerical', - 'col_3': 'numerical' + 'col_3': 'numerical', } transformer = AdditionTransformer() @@ -177,7 +182,7 @@ def _reverse_transform(self, data): expected_transform = pd.DataFrame({ 'col_1': [1, 2, 3], 'col_1+col_2': [11, 22, 33], - 'col_1+col_2+col_3': [111, 222, 333] + 'col_1+col_2+col_3': [111, 222, 333], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -185,6 +190,7 @@ def _reverse_transform(self, data): def test_multi_column_transformer_less_output_than_input_columns(): """Test a multi-column transformer when the output has less columns than the input.""" + class 
ConcatenateTransformer(BaseMultiColumnTransformer): """This transformer takes 4 columns and concatenate them into 2 columns. The two first and last columns are concatenated together. @@ -195,7 +201,7 @@ def _fit(self, columns_data): self.name_2 = self.columns[2] + '#' + self.columns[3] self.output_properties = { f'{self.name_1}.concatenate_1': {'sdtype': 'categorical'}, - f'{self.name_2}.concatenate_2': {'sdtype': 'categorical'} + f'{self.name_2}.concatenate_2': {'sdtype': 'categorical'}, } def _get_prefix(self): @@ -212,10 +218,14 @@ def _reverse_transform(self, data): column_names = list(data.columns) col1, col2 = column_names[0].split('#') - result[[col1, col2]] = result[column_names[0]].str.split('#', expand=True) + result[[col1, col2]] = result[column_names[0]].str.split( + '#', expand=True + ) col3, col4 = column_names[1].split('#') - result[[col3, col4]] = result[column_names[1]].str.split('#', expand=True) + result[[col3, col4]] = result[column_names[1]].str.split( + '#', expand=True + ) return result.drop(columns=column_names) @@ -223,14 +233,14 @@ def _reverse_transform(self, data): 'col_1': ['A', 'B', 'C'], 'col_2': ['D', 'E', 'F'], 'col_3': ['G', 'H', 'I'], - 'col_4': ['J', 'K', 'L'] + 'col_4': ['J', 'K', 'L'], }) columns_to_sdtypes = { 'col_1': 'categorical', 'col_2': 'categorical', 'col_3': 'categorical', - 'col_4': 'categorical' + 'col_4': 'categorical', } transformer = ConcatenateTransformer() @@ -242,7 +252,7 @@ def _reverse_transform(self, data): # Assert expected_transform = pd.DataFrame({ 'col_1#col_2.concatenate_1': ['A#D', 'B#E', 'C#F'], - 'col_3#col_4.concatenate_2': ['G#J', 'H#K', 'I#L'] + 'col_3#col_4.concatenate_2': ['G#J', 'H#K', 'I#L'], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) @@ -250,14 +260,14 @@ def _reverse_transform(self, data): def test_multi_column_transformer_more_output_than_input_columns(): """Test a multi-column transformer when the output has more 
columns than the input.""" - class ExpandTransformer(BaseMultiColumnTransformer): + class ExpandTransformer(BaseMultiColumnTransformer): def _fit(self, columns_data): self.output_properties = { f'{self.columns[0]}.first_part_1': {'sdtype': 'categorical'}, f'{self.columns[0]}.second_part_1': {'sdtype': 'categorical'}, f'{self.columns[1]}.first_part_2': {'sdtype': 'categorical'}, - f'{self.columns[1]}.second_part_2': {'sdtype': 'categorical'} + f'{self.columns[1]}.second_part_2': {'sdtype': 'categorical'}, } def _get_prefix(self): @@ -273,8 +283,12 @@ def _transform(self, data): def _reverse_transform(self, data): result = data.copy() - reverse_1 = result[self.output_columns[0]] + result[self.output_columns[1]] - reverse_2 = result[self.output_columns[2]] + result[self.output_columns[3]] + reverse_1 = ( + result[self.output_columns[0]] + result[self.output_columns[1]] + ) + reverse_2 = ( + result[self.output_columns[2]] + result[self.output_columns[3]] + ) result[self.columns[0]] = reverse_1 result[self.columns[1]] = reverse_2 @@ -285,10 +299,7 @@ def _reverse_transform(self, data): 'col_2': ['GH', 'IJ', 'KL'], }) - columns_to_sdtypes = { - 'col_1': 'categorical', - 'col_2': 'categorical' - } + columns_to_sdtypes = {'col_1': 'categorical', 'col_2': 'categorical'} transformer = ExpandTransformer() # Run @@ -301,7 +312,7 @@ def _reverse_transform(self, data): 'col_1.first_part_1': ['A', 'C', 'E'], 'col_1.second_part_1': ['B', 'D', 'F'], 'col_2.first_part_2': ['G', 'I', 'K'], - 'col_2.second_part_2': ['H', 'J', 'L'] + 'col_2.second_part_2': ['H', 'J', 'L'], }) pd.testing.assert_frame_equal(expected_transform, transformed) pd.testing.assert_frame_equal(reverse, data_test) diff --git a/tests/integration/transformers/test_boolean.py b/tests/integration/transformers/test_boolean.py index 827802b5..dd217e37 100644 --- a/tests/integration/transformers/test_boolean.py +++ b/tests/integration/transformers/test_boolean.py @@ -5,7 +5,6 @@ class TestBinaryEncoder: - def 
test_boolean_some_nans(self): """Test BinaryEncoder on input with some nan values. @@ -50,7 +49,7 @@ def test_boolean_missing_value_replacement_mode(self): column = 'bool' transformer = BinaryEncoder( missing_value_replacement='mode', - missing_value_generation='from_column' + missing_value_generation='from_column', ) # Run @@ -60,8 +59,8 @@ def test_boolean_missing_value_replacement_mode(self): # Assert expected_transformed = pd.DataFrame({ - 'bool': [1., 1., 1., 0.], - 'bool.is_null': [0., 0., 1., 0.] + 'bool': [1.0, 1.0, 1.0, 0.0], + 'bool.is_null': [0.0, 0.0, 1.0, 0.0], }) pd.testing.assert_frame_equal(transformed, expected_transformed) pd.testing.assert_frame_equal(reverse, data) @@ -75,8 +74,7 @@ def test_boolean_missing_value_generation_none(self): data = pd.DataFrame([True, True, None, False], columns=['bool']) column = 'bool' transformer = BinaryEncoder( - missing_value_replacement='mode', - missing_value_generation=None + missing_value_replacement='mode', missing_value_generation=None ) # Run @@ -85,7 +83,9 @@ def test_boolean_missing_value_generation_none(self): reverse = transformer.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({'bool': [1., 1., 1., 0.]}) + expected_transformed = pd.DataFrame({'bool': [1.0, 1.0, 1.0, 0.0]}) expected_reversed = pd.DataFrame({'bool': [True, True, True, False]}) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse, expected_reversed, check_dtype=False) + pd.testing.assert_frame_equal( + reverse, expected_reversed, check_dtype=False + ) diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 4f77b7fe..3bb67253 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -6,8 +6,13 @@ import pandas as pd from rdt.transformers import ( - FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, 
OrderedUniformEncoder, - UniformEncoder) + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, + UniformEncoder, +) class TestUniformEncoder: @@ -59,7 +64,9 @@ def test__reverse_transform(self): output = transformer.reverse_transform(transformed) # Asserts - pd.testing.assert_series_equal(output['column_name'], data['column_name']) + pd.testing.assert_series_equal( + output['column_name'], data['column_name'] + ) def test__reverse_transform_negative_transformed_values(self): """Test the ``reverse_transform``.""" @@ -83,7 +90,18 @@ def test__reverse_transform_nans(self): """Test ``reverse_transform`` for data with NaNs.""" # Setup data = pd.DataFrame({ - 'column_name': ['a', 'b', 'c', np.nan, 'c', 'b', 'b', 'a', 'b', np.nan] + 'column_name': [ + 'a', + 'b', + 'c', + np.nan, + 'c', + 'b', + 'b', + 'a', + 'b', + np.nan, + ] }) column = 'column_name' @@ -101,7 +119,9 @@ def test_uniform_encoder_unseen_transform_nan(self): """Ensure UniformEncoder works when np.nan to transform wasn't seen during fit.""" # Setup fit_data = pd.DataFrame([1.0, 2.0, 3.0], columns=['column_name']) - transform_data = pd.DataFrame([1, 2, 3, np.nan], columns=['column_name']) + transform_data = pd.DataFrame( + [1, 2, 3, np.nan], columns=['column_name'] + ) column = 'column_name' transformer = UniformEncoder() @@ -156,7 +176,9 @@ def test_order(self): def test_string(self): """Test that the transformer works with string labels.""" # Setup - data = pd.DataFrame({'column_name': ['b', 'a', 'c', 'a', np.nan, 'b', 'b']}) + data = pd.DataFrame({ + 'column_name': ['b', 'a', 'c', 'a', np.nan, 'b', 'b'] + }) transformer = OrderedUniformEncoder(order=['a', 'c', np.nan, 'b']) column = 'column_name' @@ -223,7 +245,9 @@ def test_frequency_encoder_numerical_nans_no_warning(): def test_frequency_encoder_unseen_transform_data(): """Ensure FrequencyEncoder works when data to transform wasn't seen during fit.""" - fit_data = pd.DataFrame([1, 2, float('nan'), np.nan], 
columns=['column_name']) + fit_data = pd.DataFrame( + [1, 2, float('nan'), np.nan], columns=['column_name'] + ) transform_data = pd.DataFrame([1, 2, np.nan, 3], columns=['column_name']) column = 'column_name' @@ -415,7 +439,9 @@ def test_frequency_encoder_mixed_more_rows(): # setup data = pd.DataFrame([True, 'a', 1, None], columns=['column_name']) column = 'column_name' - transform_data = pd.DataFrame(['a', 1, None, 'a', True, 1], columns=['column_name']) + transform_data = pd.DataFrame( + ['a', 1, None, 'a', True, 1], columns=['column_name'] + ) transformer = FrequencyEncoder() # run @@ -439,7 +465,9 @@ def test_frequency_encoder_noise(): - The reverse transformed data """ # setup - data = pd.DataFrame(np.random.choice(a=range(100), size=10000), columns=['column_name']) + data = pd.DataFrame( + np.random.choice(a=range(100), size=10000), columns=['column_name'] + ) column = 'column_name' transformer = FrequencyEncoder(add_noise=True) @@ -468,7 +496,9 @@ def test_one_hot_numerical_nans(): def test_one_hot_doesnt_warn(tmp_path): """Ensure OneHotEncoder doesn't warn when saving and loading GH#616.""" # Setup - data = pd.DataFrame({'column_name': [1.0, 2.0, np.nan, 2.0, 3.0, np.nan, 3.0]}) + data = pd.DataFrame({ + 'column_name': [1.0, 2.0, np.nan, 2.0, 3.0, np.nan, 3.0] + }) ohe = OneHotEncoder() # Run @@ -488,9 +518,7 @@ def test_one_hot_doesnt_warn(tmp_path): def test_one_hot_categoricals(): """Ensure OneHotEncoder works on categorical data. 
GH#751""" # Setup - test_data = pd.DataFrame(data={ - 'A': ['Yes', 'No', 'Yes', 'Maybe', 'No'] - }) + test_data = pd.DataFrame(data={'A': ['Yes', 'No', 'Yes', 'Maybe', 'No']}) test_data['A'] = test_data['A'].astype('category') transformer = OneHotEncoder() @@ -505,7 +533,7 @@ def test_one_hot_categoricals(): 'A.value1': [0, 1, 0, 0, 1], 'A.value2': [0, 0, 0, 1, 0], }), - check_dtype=False + check_dtype=False, ) # Run @@ -524,7 +552,7 @@ def test_label_numerical_2d_array(): transformer = LabelEncoder() transformer.fit(data, column) - transformed = pd.DataFrame([0., 1., 2., 3.], columns=['column_name']) + transformed = pd.DataFrame([0.0, 1.0, 2.0, 3.0], columns=['column_name']) reverse = transformer.reverse_transform(transformed) pd.testing.assert_frame_equal(reverse, data) @@ -576,7 +604,9 @@ def test_label_encoder_order_by_numerical(): - Transformed data should map labels to values based on numerical order. """ - data = pd.DataFrame([5, np.nan, 3.11, 100, 67.8, -2.5], columns=['column_name']) + data = pd.DataFrame( + [5, np.nan, 3.11, 100, 67.8, -2.5], columns=['column_name'] + ) transformer = LabelEncoder(order_by='numerical_value') transformer.fit(data, 'column_name') @@ -598,7 +628,9 @@ def test_label_encoder_order_by_alphabetical(): - Transformed data should map labels to values based on alphabetical order. 
""" - data = pd.DataFrame(['one', 'two', np.nan, 'three', 'four'], columns=['column_name']) + data = pd.DataFrame( + ['one', 'two', np.nan, 'three', 'four'], columns=['column_name'] + ) transformer = LabelEncoder(order_by='alphabetical') transformer.fit(data, 'column_name') @@ -644,7 +676,9 @@ def test_ordered_label_encoder_nans(): - Reverse transformed data should match the input """ - data = pd.DataFrame(['two', 3, 1, np.nan, 'zero', None], columns=['column_name']) + data = pd.DataFrame( + ['two', 3, 1, np.nan, 'zero', None], columns=['column_name'] + ) transformer = OrderedLabelEncoder(order=['zero', 1, 'two', 3, None]) transformer.fit(data, 'column_name') diff --git a/tests/integration/transformers/test_datetime.py b/tests/integration/transformers/test_datetime.py index 59e189f1..d64a45c6 100644 --- a/tests/integration/transformers/test_datetime.py +++ b/tests/integration/transformers/test_datetime.py @@ -1,7 +1,10 @@ import numpy as np import pandas as pd -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder +from rdt.transformers.datetime import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) class TestUnixTimestampEncoder: @@ -9,7 +12,9 @@ def test_unixtimestampencoder(self): """Test the ``UnixTimestampEncoder`` end to end.""" # Setup ute = UnixTimestampEncoder(missing_value_replacement='mean') - data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) + data = pd.DataFrame({ + 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) + }) # Run ute.fit(data, column='column') @@ -19,7 +24,7 @@ def test_unixtimestampencoder(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -27,7 +32,9 @@ def test_unixtimestampencoder(self): def test_unixtimestampencoder_different_format(self): 
"""Test the ``UnixTimestampEncoder`` with a unique datetime format.""" - ute = UnixTimestampEncoder(missing_value_replacement='mean', datetime_format='%b %d, %Y') + ute = UnixTimestampEncoder( + missing_value_replacement='mean', datetime_format='%b %d, %Y' + ) data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) # Run @@ -38,7 +45,7 @@ def test_unixtimestampencoder_different_format(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, data) @@ -49,7 +56,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): ute = UnixTimestampEncoder( missing_value_replacement='mean', missing_value_generation=None, - datetime_format='%b %d, %Y' + datetime_format='%b %d, %Y', ) data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) @@ -61,7 +68,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000] + 'column': [3.500064e17, 845510400000000000, -145497600000000000] }) expected_reversed = pd.DataFrame({ 'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965'] @@ -73,8 +80,7 @@ def test_unixtimestampencoder_with_missing_value_replacement_random(self): """Test that transformed data will replace nans with random values from the data.""" # Setup ute = UnixTimestampEncoder( - missing_value_replacement='random', - datetime_format='%b %d, %Y' + missing_value_replacement='random', datetime_format='%b %d, %Y' ) data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) @@ -86,7 +92,7 @@ def test_unixtimestampencoder_with_missing_value_replacement_random(self): # Asserts expect_transformed = pd.DataFrame({ - 'column': [-7.007396e+16, 845510400000000000, 
-145497600000000000] + 'column': [-7.007396e16, 845510400000000000, -145497600000000000] }) expected_reversed = pd.DataFrame({ 'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965'] @@ -98,7 +104,9 @@ def test_unixtimestampencoder_with_model_missing_values(self): """Test that `model_missing_values` is accepted by the transformer.""" # Setup ute = UnixTimestampEncoder('mean', True) - data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) + data = pd.DataFrame({ + 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) + }) # Run ute.fit(data, column='column') @@ -108,8 +116,8 @@ def test_unixtimestampencoder_with_model_missing_values(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [3.500064e+17, 845510400000000000, -145497600000000000], - 'column.is_null': [1., 0., 0.] + 'column': [3.500064e17, 845510400000000000, -145497600000000000], + 'column.is_null': [1.0, 0.0, 0.0], }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -129,7 +137,7 @@ def test_unixtimestampencoder_with_integer_datetimes(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [6.958656e+17, 1.856736e+18, 6.547392e+17], + 'column': [6.958656e17, 1.856736e18, 6.547392e17], }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -149,8 +157,8 @@ def test_unixtimestampencoder_with_nans(self): # Asserts expected_transformed = pd.DataFrame({ - 'column': [0., 0., 0.], - 'column.is_null': [1., 1., 1.] 
+ 'column': [0.0, 0.0, 0.0], + 'column.is_null': [1.0, 1.0, 1.0], }) pd.testing.assert_frame_equal(expected_transformed, transformed) @@ -160,15 +168,21 @@ def test_with_enforce_min_max_values_true(self): """Test that the transformer properly clipped out of bounds values.""" # Setup ute = UnixTimestampEncoder(enforce_min_max_values=True) - data = pd.DataFrame({'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965']}) + data = pd.DataFrame({ + 'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965'] + }) ute.fit(data, column='column') # Run transformed = ute.transform(data) min_val = transformed['column'].min() max_val = transformed['column'].max() - transformed.loc[transformed['column'] == min_val, 'column'] = min_val - 1e17 - transformed.loc[transformed['column'] == max_val, 'column'] = max_val + 1e17 + transformed.loc[transformed['column'] == min_val, 'column'] = ( + min_val - 1e17 + ) + transformed.loc[transformed['column'] == max_val, 'column'] = ( + max_val + 1e17 + ) reverted = ute.reverse_transform(transformed) # Asserts @@ -180,7 +194,9 @@ def test_with_enforce_min_max_values_true(self): class TestOptimizedTimestampEncoder: def test_optimizedtimestampencoder(self): ote = OptimizedTimestampEncoder(missing_value_replacement='mean') - data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) + data = pd.DataFrame({ + 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) + }) # Run ote.fit(data, column='column') @@ -189,6 +205,8 @@ def test_optimizedtimestampencoder(self): reverted = ote.reverse_transform(transformed) # Asserts - expect_transformed = pd.DataFrame({'column': [4051.0, 9786.0, -1684.0]}) + expect_transformed = pd.DataFrame({ + 'column': [4051.0, 9786.0, -1684.0] + }) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, data) diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index b391e69b..ced5be5e 
100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -2,11 +2,14 @@ import pandas as pd from copulas import univariate -from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer +from rdt.transformers.numerical import ( + ClusterBasedNormalizer, + FloatFormatter, + GaussianNormalizer, +) class TestFloatFormatter: - def test_missing_value_generation_from_column(self): """Test end to end with ``missing_value_generation`` set to ``from_column``. @@ -115,7 +118,9 @@ def test_model_missing_value(self): assert list(transformed.iloc[:, 1]) == [0, 0, 0, 0, 1, 0] np.testing.assert_array_almost_equal(reverse, data, decimal=2) - def test_missing_value_replacement_set_to_random_and_model_missing_values(self): + def test_missing_value_replacement_set_to_random_and_model_missing_values( + self, + ): """Test that we are still able to use ``missing_value_replacement`` when is ``random``.""" # Setup data = pd.DataFrame({'a': [1, 2, 3, np.nan, np.nan, 4]}) @@ -128,8 +133,8 @@ def test_missing_value_replacement_set_to_random_and_model_missing_values(self): # Assert expected_transformed = pd.DataFrame({ - 'a': [1., 2., 3., 2.617107, 1.614805, 4.], - 'a.is_null': [0., 0., 0., 1., 1., 0.] 
+ 'a': [1.0, 2.0, 3.0, 2.617107, 1.614805, 4.0], + 'a.is_null': [0.0, 0.0, 0.0, 1.0, 1.0, 0.0], }) pd.testing.assert_frame_equal(transformed, expected_transformed) pd.testing.assert_frame_equal(reverse, data) @@ -154,13 +159,16 @@ def test_missing_value_replacement_random_all_nans(self): expected_transformed = pd.DataFrame({'a': [0.0] * 10}) expected_reverse_transformed = pd.DataFrame({'a': [np.nan] * 10}) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transformed, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transformed, expected_reverse_transformed + ) class TestGaussianNormalizer: - def test_stats(self): - data = pd.DataFrame(np.random.normal(loc=4, scale=4, size=1000), columns=['a']) + data = pd.DataFrame( + np.random.normal(loc=4, scale=4, size=1000), columns=['a'] + ) column = 'a' ct = GaussianNormalizer() @@ -206,7 +214,9 @@ def test_missing_value_generation_random(self): reverse = ct.reverse_transform(transformed) expected = pd.DataFrame( - [1., 1.9999999510423996, 1., 1.9999999510423996, 1.4, 1.], columns=['a']) + [1.0, 1.9999999510423996, 1.0, 1.9999999510423996, 1.4, 1.0], + columns=['a'], + ) pd.testing.assert_frame_equal(reverse, expected) def test_int(self): @@ -299,7 +309,6 @@ def test_uniform_class(self): class TestClusterBasedNormalizer: - def generate_data(self): data1 = np.random.normal(loc=5, scale=1, size=100) data2 = np.random.normal(loc=-5, scale=1, size=100) @@ -327,11 +336,13 @@ def test_some_nulls(self): random_state = np.random.get_state() np.random.set_state(np.random.RandomState(10).get_state()) data = self.generate_data() - mask = np.random.choice([1, 0], data.shape, p=[.1, .9]).astype(bool) + mask = np.random.choice([1, 0], data.shape, p=[0.1, 0.9]).astype(bool) data[mask] = np.nan column = 'col' - bgmm_transformer = ClusterBasedNormalizer(missing_value_generation='from_column') + bgmm_transformer = ClusterBasedNormalizer( + 
missing_value_generation='from_column' + ) bgmm_transformer.fit(data, column) transformed = bgmm_transformer.transform(data) @@ -393,12 +404,17 @@ def test_out_of_bounds_reverse_transform(self): """Test that the reverse transform works when the data is out of bounds GH#672.""" # Setup data = pd.DataFrame({ - 'col': [round(i, 2) for i in np.random.uniform(0, 10, size=100)] + [None] - }) - reverse_data = pd.DataFrame(data={ - 'col.normalized': np.random.uniform(-10, 10, size=100), - 'col.component': np.random.choice([0.0, 1.0, 2.0, 10.0], size=100) + 'col': [round(i, 2) for i in np.random.uniform(0, 10, size=100)] + + [None] }) + reverse_data = pd.DataFrame( + data={ + 'col.normalized': np.random.uniform(-10, 10, size=100), + 'col.component': np.random.choice( + [0.0, 1.0, 2.0, 10.0], size=100 + ), + } + ) transformer = ClusterBasedNormalizer() # Run diff --git a/tests/integration/transformers/test_text.py b/tests/integration/transformers/test_text.py index 18190b73..47986d83 100644 --- a/tests/integration/transformers/test_text.py +++ b/tests/integration/transformers/test_text.py @@ -7,18 +7,19 @@ from rdt.transformers.text import IDGenerator, RegexGenerator -class TestIDGenerator(): - +class TestIDGenerator: def test_end_to_end(self): """End to end test of the ``IDGenerator``.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run - transformer = IDGenerator(prefix='id_', starting_value=100, suffix='_X') + transformer = IDGenerator( + prefix='id_', starting_value=100, suffix='_X' + ) transformed = transformer.fit_transform(data, 'id') reverse_transform = transformer.reverse_transform(transformed) reverse_transform_2 = transformer.reverse_transform(transformed) @@ -32,27 +33,33 @@ def test_end_to_end(self): expected_reverse_transform = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], - 'id': ['id_100_X', 'id_101_X', 'id_102_X', 'id_103_X', 'id_104_X'] + 'id': ['id_100_X', 
'id_101_X', 'id_102_X', 'id_103_X', 'id_104_X'], }) expected_reverse_transform_2 = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], - 'id': ['id_105_X', 'id_106_X', 'id_107_X', 'id_108_X', 'id_109_X'] + 'id': ['id_105_X', 'id_106_X', 'id_107_X', 'id_108_X', 'id_109_X'], }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transform) - pd.testing.assert_frame_equal(reverse_transform_2, expected_reverse_transform_2) - pd.testing.assert_frame_equal(reverse_transform_3, expected_reverse_transform) - - -class TestRegexGenerator(): + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transform + ) + pd.testing.assert_frame_equal( + reverse_transform_2, expected_reverse_transform_2 + ) + pd.testing.assert_frame_equal( + reverse_transform_3, expected_reverse_transform + ) + + +class TestRegexGenerator: def test_regexgenerator(self): """Test ``RegexGenerator`` with the default parameters.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -70,7 +77,9 @@ def test_regexgenerator(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def test_with_custom_regex(self): """Test the ``RegexGenerator`` with a custom regex format.""" @@ -96,14 +105,16 @@ def test_with_custom_regex(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def test_with_nans(self): """Test the ``RegexGenerator`` with a custom regex format and ``nans``.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 
'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) # Run @@ -122,14 +133,16 @@ def test_with_nans(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def test_data_length_bigger_than_regex(self): """Test the ``RegexGenerator`` with short regex and more data length.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) # Run @@ -148,14 +161,16 @@ def test_data_length_bigger_than_regex(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def test_input_data_bigger_than_data_length(self): """Test the ``RegexGenerator`` with input dataframe bigger than the learned data length.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -173,7 +188,9 @@ def test_input_data_bigger_than_data_length(self): 'username': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'], }) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def test_called_multiple_times(self): """Test the ``RegexGenerator`` with short regex and called multiple times. 
@@ -184,7 +201,7 @@ def test_called_multiple_times(self): # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', np.nan, 'c', 'd', 'e'] + 'username': ['a', np.nan, 'c', 'd', 'e'], }) instance = RegexGenerator('[a-c]') @@ -206,27 +223,37 @@ def test_called_multiple_times(self): 'id': [1, 2, 3, 4, 5], 'username': ['a', 'b', 'c', 'a', 'b'], }) - pd.testing.assert_frame_equal(first_reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + first_reverse_transform, expected_reverse_transformed + ) # Reverse Transform Again - second_reverse_transform = instance.reverse_transform(transformed.head(1)) + second_reverse_transform = instance.reverse_transform( + transformed.head(1) + ) # Assert Reverse Transform expected_reverse_transformed = pd.DataFrame({ 'id': [1], 'username': ['a'], }) - pd.testing.assert_frame_equal(second_reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + second_reverse_transform, expected_reverse_transformed + ) # Reverse Transform Again - third_reverse_transform = instance.reverse_transform(transformed.head(1)) + third_reverse_transform = instance.reverse_transform( + transformed.head(1) + ) # Assert Reverse Transform expected_reverse_transformed = pd.DataFrame({ 'id': [1], 'username': ['b'], }) - pd.testing.assert_frame_equal(third_reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + third_reverse_transform, expected_reverse_transformed + ) def test_called_multiple_times_enforce_uniqueness(self): """Test that calling multiple times with ``enforce_uniqueness`` returns unique values.""" @@ -236,8 +263,12 @@ def test_called_multiple_times_enforce_uniqueness(self): # Run transformed_data = generator.fit_transform(data, 'my_column') - first_reverse_transform = generator.reverse_transform(transformed_data.head(3)) - second_reverse_transform = generator.reverse_transform(transformed_data.head(5)) + first_reverse_transform = generator.reverse_transform( 
+ transformed_data.head(3) + ) + second_reverse_transform = generator.reverse_transform( + transformed_data.head(5) + ) # Assert expected_first_reverse_transform = pd.DataFrame({ @@ -246,15 +277,19 @@ def test_called_multiple_times_enforce_uniqueness(self): expected_second_reverse_transform = pd.DataFrame({ 'my_column': ['AAAAD', 'AAAAE', 'AAAAF', 'AAAAG', 'AAAAH'] }) - pd.testing.assert_frame_equal(first_reverse_transform, expected_first_reverse_transform) - pd.testing.assert_frame_equal(second_reverse_transform, expected_second_reverse_transform) + pd.testing.assert_frame_equal( + first_reverse_transform, expected_first_reverse_transform + ) + pd.testing.assert_frame_equal( + second_reverse_transform, expected_second_reverse_transform + ) def test_pickled(self, tmpdir): """Test that ensures that ``RegexGenerator`` can be pickled.""" # Setup data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], - 'username': ['a', 'b', 'c', 'd', 'e'] + 'username': ['a', 'b', 'c', 'd', 'e'], }) # Run @@ -276,7 +311,13 @@ def test_with_many_possibilities(self): """Test the ``RegexGenerator`` with regex containing many possibilities.""" # Setup data = pd.DataFrame({ - 'id': ['a' * 50, 'a' * 49 + 'b', 'a' * 49 + 'c', 'a' * 49 + 'd', 'a' * 49 + 'e'], + 'id': [ + 'a' * 50, + 'a' * 49 + 'b', + 'a' * 49 + 'c', + 'a' * 49 + 'd', + 'a' * 49 + 'e', + ], 'username': ['aa', 'bb', 'cc', 'dd', 'ee'], }) @@ -292,11 +333,19 @@ def test_with_many_possibilities(self): expected_reverse_transformed = pd.DataFrame({ 'username': ['aa', 'bb', 'cc', 'dd', 'ee'], - 'id': ['a' * 50, 'a' * 49 + 'b', 'a' * 49 + 'c', 'a' * 49 + 'd', 'a' * 49 + 'e'], + 'id': [ + 'a' * 50, + 'a' * 49 + 'b', + 'a' * 49 + 'c', + 'a' * 49 + 'd', + 'a' * 49 + 'e', + ], }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) + pd.testing.assert_frame_equal( + reverse_transform, expected_reverse_transformed + ) def 
test_enforce_uniqueness_not_enough_values_categorical(self): """Test with enforce_uniqueness=True but insufficient regex values.""" @@ -311,7 +360,9 @@ def test_enforce_uniqueness_not_enough_values_categorical(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected = pd.DataFrame({'id': ['id_a', 'id_b', 'id_a(0)', 'id_b(0)', 'id_a(1)']}) + expected = pd.DataFrame({ + 'id': ['id_a', 'id_b', 'id_a(0)', 'id_b(0)', 'id_a(1)'] + }) pd.testing.assert_frame_equal(reverse_transform, expected) def test_enforce_uniqueness_not_enough_values_numerical(self): @@ -327,7 +378,9 @@ def test_enforce_uniqueness_not_enough_values_numerical(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected = pd.DataFrame({'id': ['2', '3', '4', '5', '6']}, dtype=object) + expected = pd.DataFrame( + {'id': ['2', '3', '4', '5', '6']}, dtype=object + ) pd.testing.assert_frame_equal(reverse_transform, expected) @@ -345,7 +398,10 @@ def test_end_to_end_scrambled(self): ht.detect_initial_config(customers) ht.update_sdtypes({'id': 'text'}) ht.update_transformers({ - 'id': RegexGenerator(regex_format='id_[a-z]', generation_order='scrambled')}) + 'id': RegexGenerator( + regex_format='id_[a-z]', generation_order='scrambled' + ) + }) # Run ht.fit(customers) @@ -353,5 +409,7 @@ def test_end_to_end_scrambled(self): reverse_transformed = ht.reverse_transform(transformed) # Assert - expected_id = pd.Series(['id_b', 'id_a', 'id_c', 'id_e', 'id_d'], name='id') + expected_id = pd.Series( + ['id_b', 'id_a', 'id_c', 'id_e', 'id_d'], name='id' + ) pd.testing.assert_series_equal(reverse_transformed['id'], expected_id) diff --git a/tests/performance/test_performance.py b/tests/performance/test_performance.py index a20274c4..6aab074c 100644 --- a/tests/performance/test_performance.py +++ b/tests/performance/test_performance.py @@ -9,11 +9,17 @@ from rdt.performance.profiling import profile_transformer from rdt.transformers import get_transformers_by_type from 
rdt.transformers.categorical import ( - CustomLabelEncoder, OrderedLabelEncoder, OrderedUniformEncoder) + CustomLabelEncoder, + OrderedLabelEncoder, + OrderedUniformEncoder, +) from rdt.transformers.numerical import ClusterBasedNormalizer SANDBOX_TRANSFORMERS = [ - ClusterBasedNormalizer, OrderedLabelEncoder, CustomLabelEncoder, OrderedUniformEncoder + ClusterBasedNormalizer, + OrderedLabelEncoder, + CustomLabelEncoder, + OrderedUniformEncoder, ] @@ -64,7 +70,9 @@ def validate_performance(performance, dataset_generator, should_assert=False): out.append(valid) if should_assert and not valid: - raise AssertionError(f'{function} {metric}: {value} > {expected_metric}') + raise AssertionError( + f'{function} {metric}: {value} > {expected_metric}' + ) return out @@ -84,7 +92,9 @@ def test_performance(transformer, dataset_generator): dataset_generator (rdt.tests.dataset.BaseDatasetGenerator): The dataset generator to performance tests against. """ - performance = evaluate_transformer_performance(transformer, dataset_generator) + performance = evaluate_transformer_performance( + transformer, dataset_generator + ) validate_performance(performance, dataset_generator, should_assert=True) @@ -101,8 +111,14 @@ def _round_to_magnitude(value): raise ValueError('Value is too big') -def find_transformer_boundaries(transformer, dataset_generator, fit_size, - transform_size, iterations=1, multiplier=5): +def find_transformer_boundaries( + transformer, + dataset_generator, + fit_size, + transform_size, + iterations=1, + multiplier=5, +): """Helper function to find valid candidate boundaries for performance tests. The function works by: @@ -134,7 +150,9 @@ def find_transformer_boundaries(transformer, dataset_generator, fit_size, Candidate values for each metric. 
""" results = [ - profile_transformer(transformer, dataset_generator, transform_size, fit_size) + profile_transformer( + transformer, dataset_generator, transform_size, fit_size + ) for _ in range(iterations) ] means = pd.DataFrame(results).mean(axis=0) diff --git a/tests/performance/tests/test_profiling.py b/tests/performance/tests/test_profiling.py index 3904c956..4fc229b8 100644 --- a/tests/performance/tests/test_profiling.py +++ b/tests/performance/tests/test_profiling.py @@ -42,25 +42,38 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): deepcopy_mock.return_value = transformer_mock.return_value # Run - profiling_results = profile_transformer(transformer_mock.return_value, - dataset_gen_mock, 100) + profiling_results = profile_transformer( + transformer_mock.return_value, dataset_gen_mock, 100 + ) # Assert expected_output_columns = [ - 'Fit Time', 'Fit Memory', 'Transform Time', 'Transform Memory', - 'Reverse Transform Time', 'Reverse Transform Memory' + 'Fit Time', + 'Fit Memory', + 'Transform Time', + 'Transform Memory', + 'Reverse Transform Time', + 'Reverse Transform Memory', ] assert len(deepcopy_mock.mock_calls) == 10 assert len(transformer_mock.return_value.fit.mock_calls) == 11 assert len(transformer_mock.return_value.transform.mock_calls) == 11 - assert len(transformer_mock.return_value.reverse_transform.mock_calls) == 10 + assert ( + len(transformer_mock.return_value.reverse_transform.mock_calls) == 10 + ) - all(np.testing.assert_array_equal(call[1][0], np.ones(100)) for call - in transformer_mock.fit.mock_calls) - all(np.testing.assert_array_equal(call[1][0], np.ones(100)) for call - in transformer_mock.transform.mock_calls) - all(np.testing.assert_array_equal(call[1][0], np.zeros(100)) for call - in transformer_mock.reverse_transform.mock_calls) + all( + np.testing.assert_array_equal(call[1][0], np.ones(100)) + for call in transformer_mock.fit.mock_calls + ) + all( + np.testing.assert_array_equal(call[1][0], np.ones(100)) + for 
call in transformer_mock.transform.mock_calls + ) + all( + np.testing.assert_array_equal(call[1][0], np.zeros(100)) + for call in transformer_mock.reverse_transform.mock_calls + ) assert expected_output_columns == list(profiling_results.index) @@ -70,11 +83,20 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): reverse_transform_call = process_mock.mock_calls[6] assert fit_call[2]['args'][0] == transformer_mock.return_value.fit - pd.testing.assert_frame_equal(fit_call[2]['args'][1], pd.DataFrame({'test': np.ones(100)})) - assert transform_call[2]['args'][0] == transformer_mock.return_value.transform + pd.testing.assert_frame_equal( + fit_call[2]['args'][1], pd.DataFrame({'test': np.ones(100)}) + ) + assert ( + transform_call[2]['args'][0] == transformer_mock.return_value.transform + ) pd.testing.assert_frame_equal( transform_call[2]['args'][1].reset_index(drop=True), - pd.DataFrame({'test': np.ones(100)}) + pd.DataFrame({'test': np.ones(100)}), + ) + assert ( + reverse_transform_call[2]['args'][0] + == transformer_mock.return_value.reverse_transform + ) + np.testing.assert_array_equal( + reverse_transform_call[2]['args'][1], np.zeros(100) ) - assert reverse_transform_call[2]['args'][0] == transformer_mock.return_value.reverse_transform - np.testing.assert_array_equal(reverse_transform_call[2]['args'][1], np.zeros(100)) diff --git a/tests/unit/test___init__.py b/tests/unit/test___init__.py index a33e124b..0a22f6a1 100644 --- a/tests/unit/test___init__.py +++ b/tests/unit/test___init__.py @@ -1,4 +1,3 @@ - import sys from types import ModuleType from unittest.mock import Mock, patch @@ -25,7 +24,11 @@ def test_get_demo(): demo = get_demo() assert list(demo.columns) == [ - 'last_login', 'email_optin', 'credit_card', 'age', 'dollars_spent' + 'last_login', + 'email_optin', + 'credit_card', + 'age', + 'dollars_spent', ] assert len(demo) == 5 assert list(demo.isna().sum(axis=0)) == [1, 1, 1, 0, 1] @@ -34,23 +37,65 @@ def test_get_demo(): def 
test_get_demo_many_rows(): demo = get_demo(10) - login_dates = pd.Series([ - '2021-06-26', '2021-02-10', 'NaT', '2020-09-26', '2020-12-22', '2019-11-27', - '2002-05-10', '2014-10-04', '2014-03-19', '2015-09-13' - ], dtype='datetime64[ns]') - email_optin = [False, False, False, True, np.nan, np.nan, False, True, False, False] + login_dates = pd.Series( + [ + '2021-06-26', + '2021-02-10', + 'NaT', + '2020-09-26', + '2020-12-22', + '2019-11-27', + '2002-05-10', + '2014-10-04', + '2014-03-19', + '2015-09-13', + ], + dtype='datetime64[ns]', + ) + email_optin = [ + False, + False, + False, + True, + np.nan, + np.nan, + False, + True, + False, + False, + ] credit_card = [ - 'VISA', 'VISA', 'AMEX', np.nan, 'DISCOVER', 'AMEX', 'AMEX', 'DISCOVER', 'DISCOVER', 'VISA' + 'VISA', + 'VISA', + 'AMEX', + np.nan, + 'DISCOVER', + 'AMEX', + 'AMEX', + 'DISCOVER', + 'DISCOVER', + 'VISA', ] age = [29, 18, 21, 45, 32, 50, 93, 75, 39, 66] - dollars_spent = [99.99, np.nan, 2.50, 25.00, 19.99, 52.48, 39.99, 4.67, np.nan, 23.28] + dollars_spent = [ + 99.99, + np.nan, + 2.50, + 25.00, + 19.99, + 52.48, + 39.99, + 4.67, + np.nan, + 23.28, + ] expected = pd.DataFrame({ 'last_login': login_dates, 'email_optin': email_optin, 'credit_card': credit_card, 'age': age, - 'dollars_spent': dollars_spent + 'dollars_spent': dollars_spent, }) pd.testing.assert_frame_equal(demo, expected) @@ -78,6 +123,7 @@ def test__find_addons_module(entry_points_mock, mock_rdt): @patch.object(rdt, 'entry_points') def test__find_addons_type_error(entry_points_mock): """Test it when entry_points raises a TypeError (happens for py38, py39).""" + # Setup def side_effect(arg=None): if arg == 'rdt_modules': @@ -114,6 +160,7 @@ def test__find_addons_object(entry_points_mock, mock_rdt): @patch('rdt.entry_points') def test__find_addons_bad_addon(entry_points_mock, warning_mock): """Test failing to load an add-on generates a warning.""" + # Setup def entry_point_error(): raise ValueError() @@ -198,13 +245,15 @@ def 
test__find_addons_module_and_object(entry_points_mock, warning_mock): @patch('warnings.warn') @patch.object(rdt, 'entry_points') -def test__find_addons_missing_object(entry_points_mock, warning_mock, mock_rdt): +def test__find_addons_missing_object( + entry_points_mock, warning_mock, mock_rdt +): """Test incorrect add-on name generates a warning.""" # Setup bad_entry_point = Mock() bad_entry_point.name = 'rdt.submodule:missing_object.new_method' entry_points_mock.return_value = [bad_entry_point] - msg = ("Failed to set 'rdt.submodule:missing_object.new_method': missing_object.") + msg = "Failed to set 'rdt.submodule:missing_object.new_method': missing_object." del mock_rdt.submodule.missing_object diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py index ddf7fff8..43a6b53e 100644 --- a/tests/unit/test_hyper_transformer.py +++ b/tests/unit/test_hyper_transformer.py @@ -8,17 +8,29 @@ from rdt import HyperTransformer from rdt.errors import ( - ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError, - TransformerProcessingError) + ConfigNotSetError, + InvalidConfigError, + InvalidDataError, + NotFittedError, + TransformerInputError, + TransformerProcessingError, +) from rdt.transformers import ( - AnonymizedFaker, BaseMultiColumnTransformer, BinaryEncoder, FloatFormatter, FrequencyEncoder, - LabelEncoder, RegexGenerator, UniformEncoder, UnixTimestampEncoder) + AnonymizedFaker, + BaseMultiColumnTransformer, + BinaryEncoder, + FloatFormatter, + FrequencyEncoder, + LabelEncoder, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, +) from rdt.transformers.base import BaseTransformer from rdt.transformers.numerical import ClusterBasedNormalizer class TestHyperTransformer(TestCase): - def test__add_field_to_set_string(self): """Test the ``_add_field_to_set`` method. 
@@ -85,7 +97,7 @@ def test__validate_field_transformers(self): field_transformers = { 'integer': int_transformer, 'float': float_transformer, - ('integer',): int_transformer + ('integer',): int_transformer, } ht = HyperTransformer() ht.field_transformers = field_transformers @@ -99,7 +111,9 @@ def test__validate_field_transformers(self): with pytest.raises(ValueError, match=error_msg): ht._validate_field_transformers() - @patch('rdt.hyper_transformer.HyperTransformer._validate_field_transformers') + @patch( + 'rdt.hyper_transformer.HyperTransformer._validate_field_transformers' + ) def test___init__(self, validation_mock): """Test create new instance of HyperTransformer""" # Run @@ -174,12 +188,9 @@ def test__create_multi_column_fields(self): 'a': BinaryEncoder, 'b': UnixTimestampEncoder, ('c', 'd'): UnixTimestampEncoder, - 'e': FloatFormatter - } - ht.field_sdtypes = { - 'f': 'categorical', - ('g', 'h'): 'datetime' + 'e': FloatFormatter, } + ht.field_sdtypes = {'f': 'categorical', ('g', 'h'): 'datetime'} # Run multi_column_fields = ht._create_multi_column_fields() @@ -189,7 +200,7 @@ def test__create_multi_column_fields(self): 'c': ('c', 'd'), 'd': ('c', 'd'), 'g': ('g', 'h'), - 'h': ('g', 'h') + 'h': ('g', 'h'), } assert multi_column_fields == expected @@ -233,7 +244,7 @@ def test__learn_config(self, get_default_transformer_mock): ht.field_sdtypes = { 'datetime': 'datetime', 'pii': 'pii', - 'text': 'text' + 'text': 'text', } ht._unfit = Mock() @@ -252,10 +263,14 @@ def test__learn_config(self, get_default_transformer_mock): } assert isinstance(ht.field_transformers['integer'], FloatFormatter) - assert isinstance(ht.field_transformers['float'], ClusterBasedNormalizer) + assert isinstance( + ht.field_transformers['float'], ClusterBasedNormalizer + ) assert isinstance(ht.field_transformers['categorical'], LabelEncoder) assert isinstance(ht.field_transformers['bool'], LabelEncoder) - assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder) + assert 
isinstance( + ht.field_transformers['datetime'], UnixTimestampEncoder + ) assert isinstance(ht.field_transformers['pii'], AnonymizedFaker) assert isinstance(ht.field_transformers['text'], RegexGenerator) ht._unfit.assert_called_once() @@ -277,7 +292,7 @@ def test_detect_initial_config(self, logger_mock): 'col2': ['a', 'b', 'c'], 'col3': [True, False, True], 'col4': pd.to_datetime(['2010-02-01', '2010-01-01', '2010-02-01']), - 'col5': [1, 2, 3] + 'col5': [1, 2, 3], }) # Run @@ -289,16 +304,18 @@ def test_detect_initial_config(self, logger_mock): 'col2': 'categorical', 'col3': 'boolean', 'col4': 'datetime', - 'col5': 'numerical' + 'col5': 'numerical', } - field_transformers = {k: repr(v) for (k, v) in ht.field_transformers.items()} + field_transformers = { + k: repr(v) for (k, v) in ht.field_transformers.items() + } assert field_transformers == { 'col1': 'FloatFormatter()', 'col2': 'UniformEncoder()', 'col3': 'UniformEncoder()', 'col4': 'UnixTimestampEncoder()', - 'col5': 'FloatFormatter()' + 'col5': 'FloatFormatter()', } expected_config = '\n'.join(( @@ -317,13 +334,13 @@ def test_detect_initial_config(self, logger_mock): ' "col4": UnixTimestampEncoder(),', ' "col5": FloatFormatter()', ' }', - '}' + '}', )) logger_mock.info.assert_has_calls([ call('Detecting a new config from the data ... SUCCESS'), call('Setting the new config ... 
SUCCESS'), call('Config:'), - call(expected_config) + call(expected_config), ]) def test__get_columns_to_sdtypes(self): @@ -377,20 +394,20 @@ def test__fit_field_transformer(self): data = pd.DataFrame({'a': [1, 2, 3]}) transformed_data1 = pd.DataFrame({ 'a.out1': ['2', '4', '6'], - 'a.out2': [1, 2, 3] + 'a.out2': [1, 2, 3], }) transformer1 = Mock() transformer2 = Mock() transformer1.get_output_columns.return_value = ['a.out1', 'a.out2'] transformer1.get_next_transformers.return_value = { 'a.out1': transformer2, - 'a.out2': None + 'a.out2': None, } transformer1.transform.return_value = transformed_data1 transformer2.get_output_columns.return_value = ['a.out1'] transformer2.get_next_transformers.return_value = { 'a.out1': None, - 'a.out1.is_null': None + 'a.out1.is_null': None, } transformer2.transform.return_value = transformed_data1 ht = HyperTransformer() @@ -401,7 +418,7 @@ def test__fit_field_transformer(self): # Assert expected = pd.DataFrame({ 'a.out1': ['2', '4', '6'], - 'a.out2': [1, 2, 3] + 'a.out2': [1, 2, 3], }) pd.testing.assert_frame_equal(out, expected) transformer1.fit.assert_called_once() @@ -486,16 +503,10 @@ def test__validate_config(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} + config = {'sdtypes': sdtypes, 'transformers': transformers} # Run error_msg = re.escape( @@ -511,17 +522,14 @@ def test_validate_config_not_unique_field(self): transformers = { 'column1': FloatFormatter(), 'column2': FrequencyEncoder(), - ('column2', 'column3'): None + ('column2', 'column3'): None, } sdtypes = { 'column1': 'numerical', 'column2': 'numerical', - 'column3': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column3': 'numerical', } + config = 
{'sdtypes': sdtypes, 'transformers': transformers} # Run error_msg = re.escape( @@ -551,17 +559,14 @@ def test__validate_config_no_warning(self, warnings_mock): transformers = { 'column1': FloatFormatter(), 'column2': FrequencyEncoder(), - 'column3': None + 'column3': None, } sdtypes = { 'column1': 'numerical', 'column2': 'categorical', - 'column3': 'numerical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column3': 'numerical', } + config = {'sdtypes': sdtypes, 'transformers': transformers} # Run HyperTransformer._validate_config(config) @@ -583,16 +588,13 @@ def test__validate_config_invalid_key(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} config = { 'sdtypes': sdtypes, 'transformers': transformers, - 'unexpected': 10 + 'unexpected': 10, } # Run / Assert @@ -617,7 +619,7 @@ def test__validate_config_missing_sdtypes(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() + 'column2': FrequencyEncoder(), } config = { 'transformers': transformers, @@ -643,13 +645,10 @@ def test__validate_config_mismatched_columns(self): - It should raise an error. """ # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} transformers = { 'column1': FloatFormatter(), - 'column3': FrequencyEncoder() + 'column3': FrequencyEncoder(), } config = { 'sdtypes': sdtypes, @@ -676,13 +675,10 @@ def test__validate_config_invalid_sdtype(self): - It should raise an error. 
""" # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'unexpected' - } + sdtypes = {'column1': 'numerical', 'column2': 'unexpected'} transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() + 'column2': FrequencyEncoder(), } config = { 'sdtypes': sdtypes, @@ -709,14 +705,8 @@ def test__validate_config_invalid_transformer(self): - It should raise an error. """ # Setup - sdtypes = { - 'column1': 'numerical', - 'column2': 'numerical' - } - transformers = { - 'column1': FloatFormatter(), - 'column2': 'unexpected' - } + sdtypes = {'column1': 'numerical', 'column2': 'numerical'} + transformers = {'column1': FloatFormatter(), 'column2': 'unexpected'} config = { 'sdtypes': sdtypes, 'transformers': transformers, @@ -749,12 +739,9 @@ def test_get_config(self): ht = HyperTransformer() ht.field_transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - ht.field_sdtypes = { - 'column1': 'numerical', - 'column2': 'categorical' + 'column2': FrequencyEncoder(), } + ht.field_sdtypes = {'column1': 'numerical', 'column2': 'categorical'} # Run config = ht.get_config() @@ -762,7 +749,7 @@ def test_get_config(self): # Assert expected_config = { 'sdtypes': ht.field_sdtypes, - 'transformers': ht.field_transformers + 'transformers': ht.field_transformers, } assert config == expected_config @@ -784,10 +771,7 @@ def test_get_config_empty(self): config = ht.get_config() # Assert - expected_config = { - 'sdtypes': {}, - 'transformers': {} - } + expected_config = {'sdtypes': {}, 'transformers': {}} assert config == expected_config def test_set_config(self): @@ -811,16 +795,10 @@ def test_set_config(self): # Setup transformers = { 'column1': FloatFormatter(), - 'column2': FrequencyEncoder() - } - sdtypes = { - 'column1': 'numerical', - 'column2': 'categorical' - } - config = { - 'sdtypes': sdtypes, - 'transformers': transformers + 'column2': FrequencyEncoder(), } + sdtypes = {'column1': 'numerical', 'column2': 'categorical'} + config = 
{'sdtypes': sdtypes, 'transformers': transformers} ht = HyperTransformer() ht._validate_config = Mock() @@ -851,10 +829,7 @@ def test_set_config_already_fitted(self, mock_warnings): """ # Setup - config = { - 'sdtypes': {}, - 'transformers': {} - } + config = {'sdtypes': {}, 'transformers': {}} ht = HyperTransformer() ht._fitted = True ht._validate_config = Mock() @@ -875,7 +850,12 @@ def get_data(self): 'float': [0.1, 0.2, 0.1, 0.1], 'categorical': ['a', 'a', 'b', 'a'], 'bool': [False, False, True, False], - 'datetime': pd.to_datetime(['2010-02-01', '2010-01-01', '2010-02-01', '2010-01-01']) + 'datetime': pd.to_datetime([ + '2010-02-01', + '2010-01-01', + '2010-02-01', + '2010-01-01', + ]), }) def get_transformed_data(self): @@ -884,12 +864,7 @@ def get_transformed_data(self): 'float': [0.1, 0.2, 0.1, 0.1], 'categorical': [0.375, 0.375, 0.875, 0.375], 'bool': [0.0, 0.0, 1.0, 0.0], - 'datetime': [ - 1.2649824e+18, - 1.262304e+18, - 1.2649824e+18, - 1.262304e+18 - ] + 'datetime': [1.2649824e18, 1.262304e18, 1.2649824e18, 1.262304e18], }) def test__validate_detect_config_called(self): @@ -996,7 +971,7 @@ def test_fit(self): 'integer.out': int_out_transformer, 'bool': bool_transformer, 'categorical': categorical_transformer, - 'datetime': datetime_transformer + 'datetime': datetime_transformer, } ht = HyperTransformer() @@ -1018,7 +993,7 @@ def test_fit(self): call(data, 'float', float_transformer), call(data, 'categorical', categorical_transformer), call(data, 'bool', bool_transformer), - call(data, 'datetime', datetime_transformer) + call(data, 'datetime', datetime_transformer), ] ht._validate_all_fields_fitted.assert_called_once() ht._validate_detect_config_called.assert_called_once() @@ -1026,6 +1001,7 @@ def test_fit(self): def test_fit_with_multi_column_transformer(self): """Test the ``fit`` method with a multi-column transformer.""" + # Setup class MultiColumnTransformer(BaseMultiColumnTransformer): def _fit(self, data): @@ -1045,12 +1021,12 @@ def 
_reverse_transform(self, data): field_transformers = { ('col1', 'col2'): MultiColumnTransformer(), - 'col3': FloatFormatter() + 'col3': FloatFormatter(), } field_sdtypes = { 'col1': 'numerical', 'col2': 'categorical', - 'col3': 'numerical' + 'col3': 'numerical', } columns_to_sdtype = { @@ -1069,7 +1045,7 @@ def _reverse_transform(self, data): data = pd.DataFrame({ 'col1': [1, 2, 3], 'col2': ['a', 'b', 'c'], - 'col3': [1, 2, 3] + 'col3': [1, 2, 3], }) # Run @@ -1085,16 +1061,18 @@ def test_fit_warns(self): Two chained transformers, where the first generates the columns 'col' and 'col.is_null' and the second takes 'col' and generates 'col.is_null'. """ + # Setup class DummyTransformer2(BaseTransformer): INPUT_SDTYPE = 'numerical' def __init__(self): super().__init__() - self.output_properties = {'is_null': {'sdtype': 'float', 'next_transformer': None}} + self.output_properties = { + 'is_null': {'sdtype': 'float', 'next_transformer': None} + } - def _fit(self, _): - ... + def _fit(self, _): ... def _transform(self, data): return data.to_numpy() @@ -1106,19 +1084,23 @@ def __init__(self): super().__init__() self.output_properties = { 'is_null': {'sdtype': 'float', 'next_transformer': None}, - None: {'sdtype': 'float', 'next_transformer': DummyTransformer2()} + None: { + 'sdtype': 'float', + 'next_transformer': DummyTransformer2(), + }, } - def _fit(self, _): - ... + def _fit(self, _): ... 
def _transform(self, data): return np.array([[4, 1], [5, 2], [6, 3]]) ht = HyperTransformer() data = pd.DataFrame({'col': [1, 2, 3]}) - ht.set_config( - {'sdtypes': {'col': 'numerical'}, 'transformers': {'col': DummyTransformer1()}}) + ht.set_config({ + 'sdtypes': {'col': 'numerical'}, + 'transformers': {'col': DummyTransformer1()}, + }) # Run and Assert warn_msg = re.escape( @@ -1136,7 +1118,9 @@ def test_fit_warns_columns_in_data(self): ht = HyperTransformer() data = pd.DataFrame({'col': [1, np.nan, 3], 'col.is_null': [1, 2, 3]}) ht.detect_initial_config(data) - ht.field_transformers['col'] = FloatFormatter(model_missing_values=True) + ht.field_transformers['col'] = FloatFormatter( + model_missing_values=True + ) # Run and Assert warn_msg = re.escape( @@ -1188,7 +1172,7 @@ def test_transform(self): float_transformer, categorical_transformer, bool_transformer, - datetime_transformer + datetime_transformer, ] ht.field_sdtypes = {'col1': 'categorical'} ht._input_columns = list(data.columns) @@ -1215,7 +1199,10 @@ def test_fit_updates_field_transformers(self): ff = FloatFormatter() # Run - ht.set_config({'sdtypes': {'col': 'numerical'}, 'transformers': {'col': ff}}) + ht.set_config({ + 'sdtypes': {'col': 'numerical'}, + 'transformers': {'col': ff}, + }) ht.fit(data) # Assert @@ -1237,9 +1224,11 @@ def test_transform_raises_error_no_config(self): ht = HyperTransformer() # Run - expected_msg = ("No config detected. Set the config using 'set_config' or pre-populate " - "it automatically from your data using 'detect_initial_config' prior to " - 'fitting your data.') + expected_msg = ( + "No config detected. Set the config using 'set_config' or pre-populate " + "it automatically from your data using 'detect_initial_config' prior to " + 'fitting your data.' 
+ ) with pytest.raises(ConfigNotSetError, match=expected_msg): ht.transform(data) @@ -1409,14 +1398,12 @@ def test_fit_transform(self): assert transformer.fit.call_count == expect_call_count_fit pd.testing.assert_frame_equal( - transformer.fit.call_args[0][0], - expect_call_args_fit + transformer.fit.call_args[0][0], expect_call_args_fit ) assert transformer.transform.call_count == expect_call_count_transform pd.testing.assert_frame_equal( - transformer.transform.call_args[0][0], - expect_call_args_transform + transformer.transform.call_args[0][0], expect_call_args_transform ) def test_reset_randomization(self): @@ -1442,7 +1429,7 @@ def test_reset_randomization(self): 'id': transformer_id, 'random_element': transformer_random_element, 'name': transformer_name, - 'label': None + 'label': None, } # Run @@ -1480,12 +1467,13 @@ def test_create_anonymized_columns(self): instance.random_state = {} random_element = AnonymizedFaker( - function_name='random_element', - function_kwargs={'elements': ['a']} + function_name='random_element', function_kwargs={'elements': ['a']} ) random_element.columns = ['random_element'] random_element.output_columns = [] - random_element.set_random_state(np.random.RandomState(42), 'reverse_transform') + random_element.set_random_state( + np.random.RandomState(42), 'reverse_transform' + ) regex_id = RegexGenerator(regex_format='id_[0-9]') regex_id.reset_randomization() @@ -1494,20 +1482,18 @@ def test_create_anonymized_columns(self): instance.field_transformers = { 'id': regex_id, - 'random_element': random_element + 'random_element': random_element, } # Run output = HyperTransformer.create_anonymized_columns( - instance, - num_rows=5, - column_names=['id', 'random_element'] + instance, num_rows=5, column_names=['id', 'random_element'] ) # Assert expected_output = pd.DataFrame({ 'id': ['id_0', 'id_1', 'id_2', 'id_3', 'id_4'], - 'random_element': ['a', 'a', 'a', 'a', 'a'] + 'random_element': ['a', 'a', 'a', 'a', 'a'], }) 
pd.testing.assert_frame_equal(output, expected_output) @@ -1559,15 +1545,23 @@ def test_create_anonymized_columns_num_rows_error(self): instance._modified_config = False # Run / Assert - error_msg = re.escape("Parameter 'num_rows' must be an integer greater than 0.") + error_msg = re.escape( + "Parameter 'num_rows' must be an integer greater than 0." + ) with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns(instance, num_rows='a', column_names=['a']) + HyperTransformer.create_anonymized_columns( + instance, num_rows='a', column_names=['a'] + ) with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns(instance, num_rows=0, column_names=['a']) + HyperTransformer.create_anonymized_columns( + instance, num_rows=0, column_names=['a'] + ) with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns(instance, num_rows=-1, column_names=['a']) + HyperTransformer.create_anonymized_columns( + instance, num_rows=-1, column_names=['a'] + ) def test_create_anonymized_columns_invalid_columns(self): """Test ``create_anonymized_columns``. @@ -1601,7 +1595,9 @@ def test_create_anonymized_columns_invalid_columns(self): 'a list of valid column names.' ) with pytest.raises(InvalidConfigError, match=error_msg): - instance.create_anonymized_columns(num_rows=10, column_names=['credit_card', 'id']) + instance.create_anonymized_columns( + num_rows=10, column_names=['credit_card', 'id'] + ) def test_create_anonymized_columns_invalid_transformers(self): """Test ``create_anonymized_columǹs`` with transformers that do not generate data. 
@@ -1628,7 +1624,7 @@ def test_create_anonymized_columns_invalid_transformers(self): instance.field_transformers = { 'datetime': FloatFormatter(), - 'random_element': FloatFormatter() + 'random_element': FloatFormatter(), } # Run / Assert @@ -1641,7 +1637,7 @@ def test_create_anonymized_columns_invalid_transformers(self): HyperTransformer.create_anonymized_columns( instance, num_rows=5, - column_names=['datetime', 'random_element'] + column_names=['datetime', 'random_element'], ) def test_reverse_transform(self): @@ -1681,7 +1677,7 @@ def test_reverse_transform(self): float_transformer, categorical_transformer, bool_transformer, - datetime_transformer + datetime_transformer, ] ht._output_columns = list(data.columns) ht._input_columns = list(data.columns) @@ -1731,7 +1727,9 @@ def test_reverse_transform_subset_with_generators(self): reverse_transformed_data = self.get_transformed_data() float_transformer.reverse_transform = lambda x: x - int_transformer.reverse_transform.return_value = reverse_transformed_data + int_transformer.reverse_transform.return_value = ( + reverse_transformed_data + ) ht = HyperTransformer() ht._validate_config_exists = Mock() @@ -1740,16 +1738,20 @@ def test_reverse_transform_subset_with_generators(self): ht._transformers_sequence = [ int_transformer, float_transformer, - generator_transformer + generator_transformer, ] ht._output_columns = list(reverse_transformed_data.columns) ht._input_columns = list(reverse_transformed_data.columns) # Run - reverse_transformed = ht.reverse_transform_subset(reverse_transformed_data) + reverse_transformed = ht.reverse_transform_subset( + reverse_transformed_data + ) # Assert - pd.testing.assert_frame_equal(reverse_transformed, reverse_transformed_data) + pd.testing.assert_frame_equal( + reverse_transformed, reverse_transformed_data + ) int_transformer.reverse_transform.assert_called_once() generator_transformer.reverse_transform.assert_not_called() @@ -1769,9 +1771,11 @@ def 
test_reverse_transform_raises_error_no_config(self): ht = HyperTransformer() # Run - expected_msg = ("No config detected. Set the config using 'set_config' or pre-populate " - "it automatically from your data using 'detect_initial_config' prior to " - 'fitting your data.') + expected_msg = ( + "No config detected. Set the config using 'set_config' or pre-populate " + "it automatically from your data using 'detect_initial_config' prior to " + 'fitting your data.' + ) with pytest.raises(ConfigNotSetError, match=expected_msg): ht.reverse_transform(data) @@ -1827,9 +1831,7 @@ def test_reverse_transform_with_subset(self): data = pd.DataFrame({'col1': [1, 2]}) # Run / Assert - expected_msg = ( - 'You must provide a transformed dataset with all the columns from the original data.' - ) + expected_msg = 'You must provide a transformed dataset with all the columns from the original data.' with pytest.raises(InvalidDataError, match=expected_msg): ht.reverse_transform(data) @@ -1890,7 +1892,9 @@ def test_reverse_transform_subset(self): ht.reverse_transform_subset(data) # Assert - ht._reverse_transform.assert_called_once_with(data, prevent_subset=False) + ht._reverse_transform.assert_called_once_with( + data, prevent_subset=False + ) def test_reverse_transform_subset_with_unknown_columns(self): """Test the ``reverse_transform_subset`` method with unknown columns. 
@@ -1971,7 +1975,6 @@ def test_update_transformers_by_sdtype_field_sdtypes_not_fitted(self): ht.field_sdtypes = { 'categorical_column': 'categorical', 'numerical_column': 'numerical', - } transformer = LabelEncoder() @@ -1979,11 +1982,17 @@ def test_update_transformers_by_sdtype_field_sdtypes_not_fitted(self): ht.update_transformers_by_sdtype('categorical', transformer) # Assert - assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) - assert isinstance(ht.field_transformers['numerical_column'], FloatFormatter) + assert isinstance( + ht.field_transformers['categorical_column'], LabelEncoder + ) + assert isinstance( + ht.field_transformers['numerical_column'], FloatFormatter + ) @patch('rdt.hyper_transformer.warnings') - def test_update_transformers_by_sdtype_field_sdtypes_fitted(self, mock_warnings): + def test_update_transformers_by_sdtype_field_sdtypes_fitted( + self, mock_warnings + ): """Test ``update_transformers_by_sdtype`` if ``HyperTransformer`` has aleady been fit. Ensure that the ``field_transformers`` that have the input ``sdtype`` have been updated and @@ -2018,14 +2027,19 @@ def test_update_transformers_by_sdtype_field_sdtypes_fitted(self, mock_warnings) call( "The 'transformer' parameter will no longer be supported in future " "versions of the RDT. 
Please use the 'transformer_name' and " - "'transformer_parameters' parameters instead.", FutureWarning - ) + "'transformer_parameters' parameters instead.", + FutureWarning, + ), ] mock_warnings.warn.assert_has_calls(expected_warnings_msgs) - assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) + assert isinstance( + ht.field_transformers['categorical_column'], LabelEncoder + ) - def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error(self): + def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error( + self, + ): """Passing an incorrect ``sdtype`` should raise an error.""" # Setup ht = HyperTransformer() @@ -2041,7 +2055,9 @@ def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error(self): # Run / Assert expected_msg = "Invalid transformer name 'LabelEncoder' for the 'fake_type' sdtype." with pytest.raises(InvalidConfigError, match=expected_msg): - ht.update_transformers_by_sdtype('fake_type', transformer_name='LabelEncoder') + ht.update_transformers_by_sdtype( + 'fake_type', transformer_name='LabelEncoder' + ) def test_update_transformers_by_sdtype_bad_transformer_raises_error(self): """Test ``update_transformers_by_sdtype`` with an object that isn't a transformer instance. @@ -2064,11 +2080,15 @@ def test_update_transformers_by_sdtype_bad_transformer_raises_error(self): } # Run / Assert - expected_msg = 'Invalid transformer. Please input an rdt transformer object.' + expected_msg = ( + 'Invalid transformer. Please input an rdt transformer object.' + ) with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', Mock()) - def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error(self): + def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error( + self, + ): """Test ``update_transformers_by_sdtype`` with a mismatched sdtype and transformer. 
Setup: @@ -2089,11 +2109,15 @@ def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error(self): } # Run / Assert - expected_msg = "The transformer you've assigned is incompatible with the sdtype." + expected_msg = ( + "The transformer you've assigned is incompatible with the sdtype." + ) with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', FloatFormatter()) - def test_update_transformers_by_sdtype_with_transformer_none_transformer_name_none(self): + def test_update_transformers_by_sdtype_with_transformer_none_transformer_name_none( + self, + ): """When ``transformer_name`` and ``transformer`` are both ``None``, it should crash.""" # Setup ht = HyperTransformer() @@ -2113,9 +2137,13 @@ def test_update_transformers_by_sdtype_incorrect_transformer_name(self): # Run and Assert err_msg = "Invalid transformer name 'Transformer' for the 'categorical' sdtype." with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype('categorical', transformer_name='Transformer') + ht.update_transformers_by_sdtype( + 'categorical', transformer_name='Transformer' + ) - def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer(self): + def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer( + self, + ): """When ``sdtype`` is not valid for the transformer, it should crash.""" # Setup ht = HyperTransformer() @@ -2124,7 +2152,9 @@ def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer(self): # Run and Assert err_msg = "Invalid transformer name 'LabelEncoder' for the 'numerical' sdtype." 
with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype('numerical', transformer_name='LabelEncoder') + ht.update_transformers_by_sdtype( + 'numerical', transformer_name='LabelEncoder' + ) def test_update_transformers_by_sdtype_incorrect_sdtype(self): """When ``sdtype`` is invalid, it should crash.""" @@ -2133,22 +2163,35 @@ def test_update_transformers_by_sdtype_incorrect_sdtype(self): ht.field_sdtypes = {'doesnt matter'} # Run and Assert - err_msg = "Invalid transformer name 'LabelEncoder' for the 'bla' sdtype." + err_msg = ( + "Invalid transformer name 'LabelEncoder' for the 'bla' sdtype." + ) with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype('bla', transformer_name='LabelEncoder') + ht.update_transformers_by_sdtype( + 'bla', transformer_name='LabelEncoder' + ) - def test_update_transformers_by_sdtype_incorrect_transformer_parameters(self): + def test_update_transformers_by_sdtype_incorrect_transformer_parameters( + self, + ): """When ``transformer_parameters`` has invalid values, it should crash.""" # Setup ht = HyperTransformer() ht.field_sdtypes = {'doesnt matter'} # Run and Assert - err_msg = re.escape("Invalid parameters ('false', 'order') for the 'LabelEncoder'.") + err_msg = re.escape( + "Invalid parameters ('false', 'order') for the 'LabelEncoder'." 
+ ) with pytest.raises(TransformerInputError, match=err_msg): ht.update_transformers_by_sdtype( - 'categorical', transformer_name='LabelEncoder', - transformer_parameters={'order_by': [], 'order': [], 'false': []} + 'categorical', + transformer_name='LabelEncoder', + transformer_parameters={ + 'order_by': [], + 'order': [], + 'false': [], + }, ) def test_update_transformers_by_sdtype_transformer_name(self): @@ -2170,15 +2213,21 @@ def test_update_transformers_by_sdtype_transformer_name(self): } # Run - ht.update_transformers_by_sdtype('categorical', transformer_name='LabelEncoder') + ht.update_transformers_by_sdtype( + 'categorical', transformer_name='LabelEncoder' + ) # Assert assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) + assert isinstance( + ht.field_transformers['categorical_column'], LabelEncoder + ) @patch('rdt.hyper_transformer.warnings') - def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mock_warning): + def test_update_transformers_by_sdtype_transformer_name_and_transformer( + self, mock_warning + ): """Test setting ``transformer_name`` ignores ``transformer`` parameter. 
Expect the ``transformer`` parameter to be ignored, a warning to be raised, @@ -2199,7 +2248,10 @@ def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mo # Run ht.update_transformers_by_sdtype( - 'categorical', transformer='doesnt matter', transformer_name='LabelEncoder') + 'categorical', + transformer='doesnt matter', + transformer_name='LabelEncoder', + ) # Assert expected_msg = ( @@ -2209,9 +2261,13 @@ def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mo mock_warning.warn.assert_called_once_with(expected_msg, FutureWarning) assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) + assert isinstance( + ht.field_transformers['categorical_column'], LabelEncoder + ) - def test_update_transformers_by_sdtype_with_transformer_name_transformer_parameters(self): + def test_update_transformers_by_sdtype_with_transformer_name_transformer_parameters( + self, + ): """Test setting ``transformer_name`` and ``transformer_parameters`` works. 
Expect the `field_transformers`` to be updated with an instance of the passed @@ -2233,14 +2289,19 @@ def test_update_transformers_by_sdtype_with_transformer_name_transformer_paramet ht.update_transformers_by_sdtype( 'categorical', transformer_name='LabelEncoder', - transformer_parameters={'order_by': 'alphabetical'} + transformer_parameters={'order_by': 'alphabetical'}, ) # Assert assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) - assert ht.field_transformers['categorical_column'].order_by == 'alphabetical' + assert isinstance( + ht.field_transformers['categorical_column'], LabelEncoder + ) + assert ( + ht.field_transformers['categorical_column'].order_by + == 'alphabetical' + ) def test_create_multi_column_fields(self): """Test ``_create_multi_column_fields``.""" @@ -2310,8 +2371,7 @@ def test_remove_column_in_multi_column_fields_single_column_left(self): assert ht._multi_column_fields == expected_column_in_tuple def test_update_transformers_by_sdtype_with_multi_column_transformer(self): - """Test ``update_transformers_by_sdtype`` with columns use with a multi-column transformer. 
- """ + """Test ``update_transformers_by_sdtype`` with columns use with a multi-column transformer.""" # Setup ht = HyperTransformer() ht.field_transformers = { @@ -2323,13 +2383,10 @@ def test_update_transformers_by_sdtype_with_multi_column_transformer(self): 'A': 'categorical', 'B': 'boolean', 'C': 'categorical', - 'D': 'numerical' + 'D': 'numerical', } - ht._multi_column_fields = { - 'C': ('C', 'D'), - 'D': ('C', 'D') - } + ht._multi_column_fields = {'C': ('C', 'D'), 'D': ('C', 'D')} # Run ht.update_transformers_by_sdtype( @@ -2377,9 +2434,7 @@ def test_update_transformers_fitted(self, mock_warnings): instance.field_transformers = {'my_column': object()} instance._validate_transformers = Mock() transformer = FrequencyEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + column_name_to_transformer = {'my_column': transformer} # Run instance.update_transformers(column_name_to_transformer) @@ -2392,10 +2447,13 @@ def test_update_transformers_fitted(self, mock_warnings): mock_warnings.warn.assert_called_once_with(expected_message) assert instance.field_transformers['my_column'] == transformer - instance._validate_transformers.assert_called_once_with(column_name_to_transformer) + instance._validate_transformers.assert_called_once_with( + column_name_to_transformer + ) def test__update_transformers_multi_column_valid(self): """Test ``_update_multi_column_transformer`` with a valid multi-column transformer.""" + # Setup class ValidMultiColumnTransformer(BaseMultiColumnTransformer): @classmethod @@ -2438,6 +2496,7 @@ def test__update_transformers_multi_column_invalid(self): The multi column transformer should be removed and its columns assigned to their default transformers. 
""" + # Setup class InvalidMultiColumnTransformer(BaseMultiColumnTransformer): @classmethod @@ -2450,7 +2509,7 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'B': 'boolean', 'C': 'numerical', 'D': 'categorical', - 'E': 'categorical' + 'E': 'categorical', } ht.field_transformers = { 'A': LabelEncoder(), @@ -2544,18 +2603,18 @@ def test_update_transformers_changing_multi_column_transformer(self): } def side_effect(column): - ht._multi_column_fields = { - 'B': ('B',) - } + ht._multi_column_fields = {'B': ('B',)} ht.field_transformers = { 'C': FloatFormatter(), 'B': None, - 'A': UniformEncoder() + 'A': UniformEncoder(), } mock_remove_column_in_multi_column_fields = Mock() mock_remove_column_in_multi_column_fields.side_effect = side_effect - ht._remove_column_in_multi_column_fields = mock_remove_column_in_multi_column_fields + ht._remove_column_in_multi_column_fields = ( + mock_remove_column_in_multi_column_fields + ) # Run ht.update_transformers(column_name_to_transformer) @@ -2564,7 +2623,7 @@ def side_effect(column): expected_field_transformers = { 'C': FloatFormatter(), 'B': None, - 'A': UniformEncoder() + 'A': UniformEncoder(), } mock_remove_column_in_multi_column_fields.assert_called_once_with('A') assert str(ht.field_transformers) == str(expected_field_transformers) @@ -2598,9 +2657,7 @@ def test_update_transformers_not_fitted(self, mock_warnings): instance.field_sdtypes = {'my_column': 'boolean'} instance._validate_transformers = Mock() transformer = BinaryEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + column_name_to_transformer = {'my_column': transformer} # Run instance.update_transformers(column_name_to_transformer) @@ -2608,7 +2665,9 @@ def test_update_transformers_not_fitted(self, mock_warnings): # Assert mock_warnings.warn.assert_not_called() assert instance.field_transformers['my_column'] == transformer - instance._validate_transformers.assert_called_once_with(column_name_to_transformer) + 
instance._validate_transformers.assert_called_once_with( + column_name_to_transformer + ) def test_update_transformers_no_field_transformers(self): """Test update transformers. @@ -2634,9 +2693,7 @@ def test_update_transformers_no_field_transformers(self): instance = HyperTransformer() instance._fitted = False mock_transformer = Mock() - column_name_to_transformer = { - 'my_column': mock_transformer - } + column_name_to_transformer = {'my_column': mock_transformer} expected_config = instance.get_config() # Run expected_msg = ( @@ -2678,9 +2735,7 @@ def test_update_transformers_mismatch_sdtypes(self): instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() transformer = BinaryEncoder() - column_name_to_transformer = { - 'my_column': transformer - } + column_name_to_transformer = {'my_column': transformer} # Run and Assert err_msg = re.escape( @@ -2690,7 +2745,9 @@ def test_update_transformers_mismatch_sdtypes(self): with pytest.raises(InvalidConfigError, match=err_msg): instance.update_transformers(column_name_to_transformer) - instance._validate_transformers.assert_called_once_with(column_name_to_transformer) + instance._validate_transformers.assert_called_once_with( + column_name_to_transformer + ) def test_update_transformers_transformer_is_none(self): """Test update transformers. 
@@ -2718,16 +2775,16 @@ def test_update_transformers_transformer_is_none(self): instance.field_transformers = {'my_column': mock_numerical} instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() - column_name_to_transformer = { - 'my_column': None - } + column_name_to_transformer = {'my_column': None} # Run instance.update_transformers(column_name_to_transformer) # Assert assert instance.field_transformers == {'my_column': None} - instance._validate_transformers.assert_called_once_with(column_name_to_transformer) + instance._validate_transformers.assert_called_once_with( + column_name_to_transformer + ) def test_update_transformers_column_doesnt_exist_in_config(self): """Test update transformers. @@ -2756,9 +2813,7 @@ def test_update_transformers_column_doesnt_exist_in_config(self): instance.field_transformers = {'my_column': mock_numerical} instance.field_sdtypes = {'my_column': 'categorical'} instance._validate_transformers = Mock() - column_name_to_transformer = { - 'unknown_column': None - } + column_name_to_transformer = {'unknown_column': None} # Run / Assert expected_msg = re.escape( @@ -2793,12 +2848,13 @@ def test_update_sdtypes_fitted(self, mock_warnings, mock_logger): """ # Setup instance = HyperTransformer() - instance.field_transformers = {'a': FrequencyEncoder, 'b': FloatFormatter} + instance.field_transformers = { + 'a': FrequencyEncoder, + 'b': FloatFormatter, + } instance.field_sdtypes = {'my_column': 'categorical'} instance._fitted = True - column_name_to_sdtype = { - 'my_column': 'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -2844,9 +2900,7 @@ def test_update_sdtypes_not_fitted(self, mock_warnings, mock_logger): instance = HyperTransformer() instance._fitted = False instance.field_sdtypes = {'my_column': 'categorical'} - column_name_to_sdtype = { - 'my_column': 'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} 
# Run instance.update_sdtypes(column_name_to_sdtype) @@ -2879,9 +2933,7 @@ def test_update_sdtypes_no_field_sdtypes(self): instance = HyperTransformer() instance._fitted = False instance.field_sdtypes = {} - column_name_to_sdtype = { - 'my_column': 'numerical' - } + column_name_to_sdtype = {'my_column': 'numerical'} # Run / Assert expected_message = ( @@ -2911,12 +2963,8 @@ def test_update_sdtypes_invalid_sdtype(self): instance._get_supported_sdtypes = Mock() instance._get_supported_sdtypes.return_value = [] instance._fitted = False - instance.field_sdtypes = { - 'my_column': 'categorical' - } - column_name_to_sdtype = { - 'my_column': 'credit_card' - } + instance.field_sdtypes = {'my_column': 'categorical'} + column_name_to_sdtype = {'my_column': 'credit_card'} # Run / Assert expected_message = re.escape( @@ -2943,12 +2991,8 @@ def test_update_sdtypes_invalid_columns(self): """ # Setup instance = HyperTransformer() - instance.field_sdtypes = { - 'my_column': 'categorical' - } - column_name_to_sdtype = { - 'unexpected': 'categorical' - } + instance.field_sdtypes = {'my_column': 'categorical'} + column_name_to_sdtype = {'unexpected': 'categorical'} # Run / Assert expected_message = re.escape( @@ -2961,7 +3005,9 @@ def test_update_sdtypes_invalid_columns(self): @patch('rdt.hyper_transformer.LOGGER') @patch('rdt.hyper_transformer.get_default_transformer') @patch('rdt.hyper_transformer.warnings') - def test_update_sdtypes_different_sdtype(self, mock_warnings, default_mock, mock_logger): + def test_update_sdtypes_different_sdtype( + self, mock_warnings, default_mock, mock_logger + ): """Test ``update_sdtypes``. 
Ensure that the method properly updates the ``self.field_sdtypes`` and changes the @@ -2990,9 +3036,7 @@ def test_update_sdtypes_different_sdtype(self, mock_warnings, default_mock, mock instance.field_sdtypes = {'a': 'categorical'} transformer_mock = FloatFormatter() default_mock.return_value = transformer_mock - column_name_to_sdtype = { - 'a': 'numerical' - } + column_name_to_sdtype = {'a': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -3009,7 +3053,9 @@ def test_update_sdtypes_different_sdtype(self, mock_warnings, default_mock, mock @patch('rdt.hyper_transformer.LOGGER') @patch('rdt.hyper_transformer.warnings') - def test_update_sdtypes_different_sdtype_than_transformer(self, mock_warnings, mock_logger): + def test_update_sdtypes_different_sdtype_than_transformer( + self, mock_warnings, mock_logger + ): """Test ``update_sdtypes``. Ensure that the method properly updates the ``self.field_sdtypes`` but doesn't change @@ -3039,9 +3085,7 @@ def test_update_sdtypes_different_sdtype_than_transformer(self, mock_warnings, m instance.field_sdtypes = {'a': 'categorical'} transformer = FloatFormatter() instance.field_transformers = {'a': transformer} - column_name_to_sdtype = { - 'a': 'numerical' - } + column_name_to_sdtype = {'a': 'numerical'} # Run instance.update_sdtypes(column_name_to_sdtype) @@ -3063,9 +3107,11 @@ def test_update_sdtypes_multi_column_with_supported_sdtypes(self): In this case the multi column transformer supports the new sdtype so the transformer should not be changed. 
""" + # Setup class DummyMultiColumnTransformer(BaseMultiColumnTransformer): """Dummy multi column transformer.""" + SUPPORTED_SDTYPES = ['categorical', 'boolean'] @classmethod @@ -3077,43 +3123,45 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } ht._create_multi_column_fields = Mock( return_value={ 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } ) ht._update_multi_column_transformer = Mock() # Run - ht.update_sdtypes(column_name_to_sdtype={ - 'column2': 'boolean', - 'column1': 'boolean', - 'column4': 'categorical' - }) + ht.update_sdtypes( + column_name_to_sdtype={ + 'column2': 'boolean', + 'column1': 'boolean', + 'column4': 'categorical', + } + ) # Assert expected_field_sdtypes = { 'column1': 'boolean', 'column2': 'boolean', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } expected_field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } assert ht.field_sdtypes == expected_field_sdtypes assert str(ht.field_transformers) == str(expected_field_transformers) @@ -3127,6 +3175,7 @@ def test_update_sdtypes_multi_column_with_unsupported_sdtypes(self): In this case the multi column transformer does not support the new sdtype so the transformer should be changed to the default one. 
""" + # Setup class DummyMultiColumnTransformer(BaseMultiColumnTransformer): """Dummy multi column transformer.""" @@ -3142,16 +3191,16 @@ def _validate_sdtypes(cls, columns_to_sdtypes): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': UniformEncoder(), ('column2', 'column3'): DummyMultiColumnTransformer(), - 'column4': None + 'column4': None, } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3160,17 +3209,19 @@ def _validate_sdtypes(cls, columns_to_sdtypes): ' Assigning a new transformer to it.' ) with pytest.warns(UserWarning, match=expected_warning): - ht.update_sdtypes(column_name_to_sdtype={ - 'column2': 'numerical', - 'column1': 'boolean' - }) + ht.update_sdtypes( + column_name_to_sdtype={ + 'column2': 'numerical', + 'column1': 'boolean', + } + ) # Assert expected_field_sdtypes = { 'column1': 'boolean', 'column2': 'numerical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } expected_field_transformers = { 'column1': UniformEncoder(), @@ -3224,7 +3275,7 @@ def test__validate_transformers(self): column_name_to_transformer = { 'col1': FrequencyEncoder(), 'col2': 'Unexpected', - 'col3': None + 'col3': None, } # Run / Assert @@ -3256,12 +3307,12 @@ def test_remove_transformers(self): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } # Run @@ -3271,7 +3322,7 @@ def test_remove_transformers(self): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': 'transformer' + 'column3': 'transformer', } def test_remove_transformers_unknown_columns(self): @@ -3299,12 +3350,12 
@@ def test_remove_transformers_unknown_columns(self): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } error_msg = re.escape( @@ -3320,7 +3371,7 @@ def test_remove_transformers_unknown_columns(self): assert ht.field_transformers == { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } @patch('rdt.hyper_transformer.warnings') @@ -3350,12 +3401,12 @@ def test_remove_transformers_fitted(self, mock_warnings): ht.field_sdtypes = { 'column1': 'categorical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } # Run @@ -3370,7 +3421,7 @@ def test_remove_transformers_fitted(self, mock_warnings): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': None + 'column3': None, } def test_remove_transformers_multi_column(self): @@ -3385,16 +3436,16 @@ def test_remove_transformers_multi_column(self): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'categorical', - 'column4': 'categorical' + 'column4': 'categorical', } ht.field_transformers = { 'column1': 'transformer', ('column2', 'column3'): 'multi_column_transformer', - 'column4': 'transformer' + 'column4': 'transformer', } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3405,7 +3456,7 @@ def test_remove_transformers_multi_column(self): 'column1': 'transformer', 'column2': 'multi_column_transformer', 'column3': None, - 'column4': None + 'column4': None, } @patch('rdt.hyper_transformer.warnings') @@ -3433,12 +3484,12 @@ def 
test_remove_transformers_by_sdtype(self, mock_warnings): ht.field_transformers = { 'column1': 'transformer', 'column2': 'transformer', - 'column3': 'transformer' + 'column3': 'transformer', } ht.field_sdtypes = { 'column1': 'numerical', 'column2': 'categorical', - 'column3': 'categorical' + 'column3': 'categorical', } # Run @@ -3448,7 +3499,7 @@ def test_remove_transformers_by_sdtype(self, mock_warnings): assert ht.field_transformers == { 'column1': 'transformer', 'column2': None, - 'column3': None + 'column3': None, } expected_warnings_msg = ( 'For this change to take effect, please refit your data using ' @@ -3494,16 +3545,16 @@ def test_remove_transformers_by_sdtype_multi_column(self): 'column1': 'categorical', 'column2': 'categorical', 'column3': 'boolean', - 'column4': 'boolean' + 'column4': 'boolean', } ht.field_transformers = { 'column1': 'transformer', ('column2', 'column3'): 'multi_column_transformer', - 'column4': 'transformer' + 'column4': 'transformer', } ht._multi_column_fields = { 'column2': ('column2', 'column3'), - 'column3': ('column2', 'column3') + 'column3': ('column2', 'column3'), } # Run @@ -3514,10 +3565,12 @@ def test_remove_transformers_by_sdtype_multi_column(self): 'column1': 'transformer', 'column2': 'multi_column_transformer', 'column3': None, - 'column4': None + 'column4': None, } - def test__fit_field_transformer_multi_column_field_not_ready(self,): + def test__fit_field_transformer_multi_column_field_not_ready( + self, + ): """Test the ``_fit_field_transformer`` method. This tests that the ``_fit_field_transformer`` behaves as expected. @@ -3538,18 +3591,17 @@ def test__fit_field_transformer_multi_column_field_not_ready(self,): outputs of the original transformer. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) transformed_data1 = pd.DataFrame({ 'a.out1': ['1', '2', '3'], - 'b': [4, 5, 6] + 'b': [4, 5, 6], }) transformer1 = Mock() transformer2 = Mock() transformer1.get_output_columns.return_value = ['a.out1'] - transformer1.get_next_transformers.return_value = {('a.out1', 'b.out1'): transformer2} + transformer1.get_next_transformers.return_value = { + ('a.out1', 'b.out1'): transformer2 + } transformer1.transform.return_value = transformed_data1 ht = HyperTransformer() ht._multi_column_fields = Mock() @@ -3559,10 +3611,7 @@ def test__fit_field_transformer_multi_column_field_not_ready(self,): out = ht._fit_field_transformer(data, 'a', transformer1) # Assert - expected = pd.DataFrame({ - 'a.out1': ['1', '2', '3'], - 'b': [4, 5, 6] - }) + expected = pd.DataFrame({'a.out1': ['1', '2', '3'], 'b': [4, 5, 6]}) pd.testing.assert_frame_equal(out, expected) transformer1.fit.assert_called_once() transformer1.transform.assert_called_once_with(data) diff --git a/tests/unit/transformers/pii/test_anonymization.py b/tests/unit/transformers/pii/test_anonymization.py index 1cfa79ee..a357034c 100644 --- a/tests/unit/transformers/pii/test_anonymization.py +++ b/tests/unit/transformers/pii/test_anonymization.py @@ -1,11 +1,14 @@ from unittest.mock import Mock, patch from rdt.transformers.pii.anonymization import ( - _detect_provider_name, get_anonymized_transformer, get_faker_instance, is_faker_function) + _detect_provider_name, + get_anonymized_transformer, + get_faker_instance, + is_faker_function, +) class TestAnonimization: - def test__detect_provider_name(self): """Test the ``_detect_provider_name`` method. 
@@ -28,7 +31,9 @@ def test__detect_provider_name(self): assert state_provider == 'address.en_US' @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') - def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_faker): + def test_get_anonymized_transformer_with_existing_sdtype( + self, mock_anonymized_faker + ): """Test the ``get_anonymized_transformer`` method. Test that when calling with an existing ``sdtype`` / ``function_name`` from the @@ -48,9 +53,13 @@ def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_f - The return value must be the instance of ``AnonymizedFaker``. """ # Setup - output = get_anonymized_transformer('email', transformer_kwargs={ - 'function_kwargs': {'domain': '@gmail.com'}, 'locales': ['en_CA', 'fr_CA'] - }) + output = get_anonymized_transformer( + 'email', + transformer_kwargs={ + 'function_kwargs': {'domain': '@gmail.com'}, + 'locales': ['en_CA', 'fr_CA'], + }, + ) # Assert assert output == mock_anonymized_faker.return_value @@ -58,11 +67,13 @@ def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_f provider_name='internet', function_name='email', function_kwargs={'domain': '@gmail.com'}, - locales=['en_CA', 'fr_CA'] + locales=['en_CA', 'fr_CA'], ) @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') - def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_faker): + def test_get_anonymized_transformer_with_custom_sdtype( + self, mock_anonymized_faker + ): """Test the ``get_anonymized_transformer`` method. Test that when calling with a custom ``sdtype`` / ``function_name`` that does not belong @@ -82,9 +93,13 @@ def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_fak - The return value must be the instance of ``AnonymizedFaker``. 
""" # Setup - output = get_anonymized_transformer('color', transformer_kwargs={ - 'function_kwargs': {'hue': 'red'}, 'locales': ['en_CA', 'fr_CA'] - }) + output = get_anonymized_transformer( + 'color', + transformer_kwargs={ + 'function_kwargs': {'hue': 'red'}, + 'locales': ['en_CA', 'fr_CA'], + }, + ) # Assert assert output == mock_anonymized_faker.return_value @@ -92,7 +107,7 @@ def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_fak provider_name='color', function_name='color', function_kwargs={'hue': 'red'}, - locales=['en_CA', 'fr_CA'] + locales=['en_CA', 'fr_CA'], ) @patch('rdt.transformers.pii.anonymization.Faker') diff --git a/tests/unit/transformers/pii/test_anonymizer.py b/tests/unit/transformers/pii/test_anonymizer.py index 9e11beb6..ae96688f 100644 --- a/tests/unit/transformers/pii/test_anonymizer.py +++ b/tests/unit/transformers/pii/test_anonymizer.py @@ -11,7 +11,10 @@ from rdt.errors import TransformerInputError, TransformerProcessingError from rdt.transformers.categorical import LabelEncoder -from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker +from rdt.transformers.pii.anonymizer import ( + AnonymizedFaker, + PseudoAnonymizedFaker, +) class TestAnonymizedFaker: @@ -20,7 +23,9 @@ class TestAnonymizedFaker: @patch('rdt.transformers.pii.anonymizer.faker') @patch('rdt.transformers.pii.anonymizer.getattr') @patch('rdt.transformers.pii.anonymizer.attrgetter') - def test_check_provider_function_baseprovider(self, mock_attrgetter, mock_getattr, mock_faker): + def test_check_provider_function_baseprovider( + self, mock_attrgetter, mock_getattr, mock_faker + ): """Test that ``getattr`` is being called with ``BaseProvider`` and ``function_name``. 
Mock: @@ -32,17 +37,22 @@ def test_check_provider_function_baseprovider(self, mock_attrgetter, mock_getatt mock_getattr.side_effect = ['provider', None] # Run - AnonymizedFaker.check_provider_function('BaseProvider', 'function_name') + AnonymizedFaker.check_provider_function( + 'BaseProvider', 'function_name' + ) # Assert assert mock_attrgetter.call_args_list[0] == call('BaseProvider') - assert mock_getattr.call_args_list[0] == call('module', 'function_name') + assert mock_getattr.call_args_list[0] == call( + 'module', 'function_name' + ) @patch('rdt.transformers.pii.anonymizer.faker') @patch('rdt.transformers.pii.anonymizer.getattr') @patch('rdt.transformers.pii.anonymizer.attrgetter') - def test_check_provider_function_other_providers(self, mock_attrgetter, mock_getattr, - mock_faker): + def test_check_provider_function_other_providers( + self, mock_attrgetter, mock_getattr, mock_faker + ): """Test that ``getattr`` is being called with ``provider_name`` and ``function_name``. Mock: @@ -54,12 +64,16 @@ def test_check_provider_function_other_providers(self, mock_attrgetter, mock_get mock_getattr.side_effect = ['provider_class', None] # Run - AnonymizedFaker.check_provider_function('provider_name', 'function_name') + AnonymizedFaker.check_provider_function( + 'provider_name', 'function_name' + ) # Assert assert mock_attrgetter.call_args_list[0] == call('provider_name') assert mock_getattr.call_args_list[0] == call('module', 'Provider') - assert mock_getattr.call_args_list[1] == call('provider_class', 'function_name') + assert mock_getattr.call_args_list[1] == call( + 'provider_class', 'function_name' + ) def test_check_provider_function_raise_attribute_error(self): """Test that ``check_provider_function`` raises an ``AttributeError``. 
@@ -76,7 +90,9 @@ def test_check_provider_function_raise_attribute_error(self): # Run with pytest.raises(TransformerProcessingError, match=expected_message): - AnonymizedFaker.check_provider_function('TestProvider', 'TestFunction') + AnonymizedFaker.check_provider_function( + 'TestProvider', 'TestFunction' + ) def test__function_cardinality_rule_none(self): """Test that ``_function`` does not use ``faker.unique``. @@ -258,7 +274,9 @@ def test__check_locales(self, mock_warnings, mock_importlib): @patch('rdt.transformers.pii.anonymizer.importlib') @patch('rdt.transformers.pii.anonymizer.warnings') - def test__check_locales_provider_ending_with_locale(self, mock_warnings, mock_importlib): + def test__check_locales_provider_ending_with_locale( + self, mock_warnings, mock_importlib + ): """Test that check locales does not warn the user if the provider ends with the locale. Mock: @@ -280,7 +298,9 @@ def test__check_locales_provider_ending_with_locale(self, mock_warnings, mock_im @patch('rdt.transformers.pii.anonymizer.importlib') @patch('rdt.transformers.pii.anonymizer.warnings') - def test__check_locales_provider_ending_with_wrong_locale(self, mock_warnings, mock_importlib): + def test__check_locales_provider_ending_with_wrong_locale( + self, mock_warnings, mock_importlib + ): """Test that check locales warns the user. If the provider ends with the given locale but is not separated by a dot this will warn @@ -310,7 +330,9 @@ def test__check_locales_provider_ending_with_wrong_locale(self, mock_warnings, m mock_warnings.warn.assert_called_once_with(expected_message) @patch('rdt.transformers.pii.anonymizer.faker') - @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') + @patch( + 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' + ) def test___init__default(self, mock_check_provider_function, mock_faker): """Test the default instantiation of the transformer. 
@@ -336,7 +358,9 @@ def test___init__default(self, mock_check_provider_function, mock_faker): instance = AnonymizedFaker() # Assert - mock_check_provider_function.assert_called_once_with('BaseProvider', 'lexify') + mock_check_provider_function.assert_called_once_with( + 'BaseProvider', 'lexify' + ) assert instance.provider_name == 'BaseProvider' assert instance.function_name == 'lexify' assert instance.function_kwargs == {} @@ -362,9 +386,13 @@ def test___init__error_missing_value_generation(self): AnonymizedFaker(missing_value_generation='invalid') @patch('rdt.transformers.pii.anonymizer.faker') - @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') + @patch( + 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' + ) @patch('rdt.transformers.pii.anonymizer.warnings') - def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_faker): + def test___init__custom( + self, mock_warnings, mock_check_provider_function, mock_faker + ): """Test the instantiation of the transformer with custom parameters. 
Test that the transformer can be instantiated with a custom provider and function, and @@ -390,15 +418,15 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_ instance = AnonymizedFaker( provider_name='credit_card', function_name='credit_card_full', - function_kwargs={ - 'type': 'visa' - }, + function_kwargs={'type': 'visa'}, locales=['en_US', 'fr_FR'], - enforce_uniqueness=True + enforce_uniqueness=True, ) # Assert - mock_check_provider_function.assert_called_once_with('credit_card', 'credit_card_full') + mock_check_provider_function.assert_called_once_with( + 'credit_card', 'credit_card_full' + ) assert instance.provider_name == 'credit_card' assert instance.function_name == 'credit_card_full' assert instance.function_kwargs == {'type': 'visa'} @@ -409,7 +437,7 @@ def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_ call( "The 'enforce_uniqueness' parameter is no longer supported. " "Please use the 'cardinality_rule' parameter instead.", - FutureWarning + FutureWarning, ) ]) @@ -427,7 +455,9 @@ def test___init__no_function_name(self): "'credit_card' provider." 
) with pytest.raises(TransformerInputError, match=expected_message): - AnonymizedFaker(provider_name='credit_card', locales=['en_US', 'fr_FR']) + AnonymizedFaker( + provider_name='credit_card', locales=['en_US', 'fr_FR'] + ) @patch('rdt.transformers.pii.anonymizer.issubclass') @patch('rdt.transformers.pii.anonymizer.BaseTransformer') @@ -442,7 +472,10 @@ def test_get_supported_sdtypes(self, base_mock, issubclass_mock): datetime_mock = Mock() datetime_mock.get_supported_sdtypes.return_value = ['datetime'] boolean_mock = Mock() - boolean_mock.get_supported_sdtypes.return_value = ['boolean', 'categorical'] + boolean_mock.get_supported_sdtypes.return_value = [ + 'boolean', + 'categorical', + ] text_mock = Mock() text_mock.get_supported_sdtypes.return_value = ['text'] phone_mock = Mock() @@ -456,16 +489,22 @@ def test_get_supported_sdtypes(self, base_mock, issubclass_mock): boolean_mock, text_mock, phone_mock, - pii_mock + pii_mock, ] # Run supported_sdtypes = AnonymizedFaker.get_supported_sdtypes() # Assert - assert sorted(supported_sdtypes) == sorted(['phone_number', 'pii', 'text']) + assert sorted(supported_sdtypes) == sorted([ + 'phone_number', + 'pii', + 'text', + ]) - @patch('rdt.transformers.pii.anonymizer.BaseTransformer.reset_randomization') + @patch( + 'rdt.transformers.pii.anonymizer.BaseTransformer.reset_randomization' + ) @patch('rdt.transformers.pii.anonymizer.faker') def test_reset_randomization(self, mock_faker, mock_base_reset): """Test that this function creates a new faker instance.""" @@ -505,7 +544,9 @@ def test__fit(self): # Assert assert transformer.data_length == 5 - assert transformer.output_properties == {None: {'next_transformer': None}} + assert transformer.output_properties == { + None: {'next_transformer': None} + } assert transformer._nan_frequency == 0.4 assert transformer._data_cardinality == 3 @@ -575,7 +616,9 @@ def test__reverse_transform_match_cardinality(self): AnonymizedFaker._reverse_transform(instance, None) # Assert - 
instance._reverse_transform_cardinality_rule_match.assert_called_once_with(3) + instance._reverse_transform_cardinality_rule_match.assert_called_once_with( + 3 + ) def test__reverse_transform_cardinality_rule_match_only_nans(self): """Test it with only nans.""" @@ -675,7 +718,9 @@ def test__reverse_transform_not_enough_unique_values(self): - Raises an error. """ # Setup - instance = AnonymizedFaker('misc', 'boolean', cardinality_rule='unique') + instance = AnonymizedFaker( + 'misc', 'boolean', cardinality_rule='unique' + ) data = pd.Series(['a', 'b', 'c', 'd']) instance.columns = ['a'] @@ -774,8 +819,12 @@ def test___getstate__(self, mock_warnings): mock_warnings.warn.assert_called_once_with(expected_warning_msg) @patch('rdt.transformers.pii.anonymizer.faker') - @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') - def test___init__super_attrs(self, mock_check_provider_function, mock_faker): + @patch( + 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' + ) + def test___init__super_attrs( + self, mock_check_provider_function, mock_faker + ): """Test that initializing an instance is calling properly the ``super`` class. Mock: @@ -808,7 +857,9 @@ def test___init__super_attrs(self, mock_check_provider_function, mock_faker): mock_faker.Faker.assert_called_once_with(None) @patch('rdt.transformers.pii.anonymizer.faker') - @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') + @patch( + 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' + ) def test___init__custom(self, mock_check_provider_function, mock_faker): """Test the instantiation of the transformer with custom parameters. 
@@ -835,16 +886,16 @@ def test___init__custom(self, mock_check_provider_function, mock_faker): instance = PseudoAnonymizedFaker( provider_name='credit_card', function_name='credit_card_full', - function_kwargs={ - 'type': 'visa' - }, - locales=['en_US', 'fr_FR'] + function_kwargs={'type': 'visa'}, + locales=['en_US', 'fr_FR'], ) # Assert assert instance._mapping_dict == {} assert instance._reverse_mapping_dict == {} - mock_check_provider_function.assert_called_once_with('credit_card', 'credit_card_full') + mock_check_provider_function.assert_called_once_with( + 'credit_card', 'credit_card_full' + ) assert instance.provider_name == 'credit_card' assert instance.function_name == 'credit_card_full' assert instance.function_kwargs == {'type': 'visa'} @@ -910,7 +961,10 @@ def test__fit(self): assert instance._mapping_dict == {'a': 1, 'b': 2, 'c': 3} assert instance._reverse_mapping_dict == {1: 'a', 2: 'b', 3: 'c'} assert list(instance.output_properties) == [None] - assert list(instance.output_properties[None]) == ['sdtype', 'next_transformer'] + assert list(instance.output_properties[None]) == [ + 'sdtype', + 'next_transformer', + ] assert instance.output_properties[None]['sdtype'] == 'categorical' transformer = instance.output_properties[None]['next_transformer'] @@ -976,7 +1030,9 @@ def test__transform(self): result = instance._transform(data) # Assert - pd.testing.assert_series_equal(result, pd.Series(['z', 'y', 'x'], name='col')) + pd.testing.assert_series_equal( + result, pd.Series(['z', 'y', 'x'], name='col') + ) def test__transform_with_new_values(self): """Test the ``_transform`` method. 
@@ -1039,4 +1095,6 @@ def test__reverse_transform(self): reverse_transformed = instance._reverse_transform(data) # Assert - pd.testing.assert_series_equal(reverse_transformed, pd.Series(['a', 'b', 'c'], name='col')) + pd.testing.assert_series_equal( + reverse_transformed, pd.Series(['a', 'b', 'c'], name='col') + ) diff --git a/tests/unit/transformers/test___init__.py b/tests/unit/transformers/test___init__.py index 30c415ed..e6542704 100644 --- a/tests/unit/transformers/test___init__.py +++ b/tests/unit/transformers/test___init__.py @@ -1,8 +1,16 @@ import pytest from rdt.transformers import ( - AnonymizedFaker, BinaryEncoder, FloatFormatter, RegexGenerator, UniformEncoder, - UnixTimestampEncoder, get_default_transformers, get_transformer_class, get_transformer_name) + AnonymizedFaker, + BinaryEncoder, + FloatFormatter, + RegexGenerator, + UniformEncoder, + UnixTimestampEncoder, + get_default_transformers, + get_transformer_class, + get_transformer_name, +) def test_get_transformer_name(): diff --git a/tests/unit/transformers/test__validators.py b/tests/unit/transformers/test__validators.py index dc1c2347..ad7250aa 100644 --- a/tests/unit/transformers/test__validators.py +++ b/tests/unit/transformers/test__validators.py @@ -4,13 +4,21 @@ import pytest from rdt.errors import TransformerInputError -from rdt.transformers._validators import AddressValidator, BaseValidator, GPSValidator +from rdt.transformers._validators import ( + AddressValidator, + BaseValidator, + GPSValidator, +) class TestBaseValidator: - - @patch('rdt.transformers._validators.BaseValidator.SUPPORTED_SDTYPES', ['numerical']) - @patch('rdt.transformers._validators.BaseValidator.VALIDATION_TYPE', 'Base') + @patch( + 'rdt.transformers._validators.BaseValidator.SUPPORTED_SDTYPES', + ['numerical'], + ) + @patch( + 'rdt.transformers._validators.BaseValidator.VALIDATION_TYPE', 'Base' + ) def test_validate_supported_sdtypes(self): """Test ``_validate_supported_sdtypes`` method.""" # Setup @@ -33,7 +41,9 
@@ def test_validate_supported_sdtypes(self): # Run and Assert BaseValidator._validate_supported_sdtypes(columns_to_sdtypes_valid) with pytest.raises(TransformerInputError, match=expected_message): - BaseValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) + BaseValidator._validate_supported_sdtypes( + columns_to_sdtypes_invalid + ) def test_validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -87,7 +97,7 @@ def test__validate_number_columns(self): 'col_5': 'street_address', 'col_6': 'secondary_address', 'col_7': 'country_code', - 'col_8': 'administrative_unit' + 'col_8': 'administrative_unit', } # Run and Assert @@ -97,8 +107,12 @@ def test__validate_number_columns(self): 'Address transformers takes up to 7 columns to transform. Please provide address' ' data with valid fields.' ) - with pytest.raises(TransformerInputError, match=re.escape(expected_message)): - AddressValidator._validate_number_columns(column_to_sdtypes_invalid) + with pytest.raises( + TransformerInputError, match=re.escape(expected_message) + ): + AddressValidator._validate_number_columns( + column_to_sdtypes_invalid + ) def test__validate_uniqueness_sdtype(self): """Test ``_validate_uniqueness_sdtype`` method.""" @@ -111,7 +125,7 @@ def test__validate_uniqueness_sdtype(self): 'col_1': 'country_code', 'col_2': 'country_code', 'col_3': 'city', - 'col_4': 'city' + 'col_4': 'city', } # Run and Assert @@ -123,7 +137,9 @@ def test__validate_uniqueness_sdtype(self): 'Your address data cannot have duplicate fields.' ) with pytest.raises(TransformerInputError, match=expected_message): - AddressValidator._validate_uniqueness_sdtype(columns_to_sdtypes_invalid) + AddressValidator._validate_uniqueness_sdtype( + columns_to_sdtypes_invalid + ) def test__validate_supported_sdtype(self): """Test ``_validate_supported_sdtype`` method.""" @@ -147,7 +163,9 @@ def test__validate_supported_sdtype(self): 'Please provide a column that is compatible with Address data.' 
) with pytest.raises(TransformerInputError, match=expected_message): - AddressValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) + AddressValidator._validate_supported_sdtypes( + columns_to_sdtypes_invalid + ) def test__validate_administrative_unit(self): """Test ``_validate_administrative_unit`` method.""" @@ -158,18 +176,24 @@ def test__validate_administrative_unit(self): } columns_to_sdtypes_invalid = { 'col_1': 'administrative_unit', - 'col_2': 'state' + 'col_2': 'state', } # Run and Assert - AddressValidator._validate_administrative_unit(columns_to_sdtypes_valid) + AddressValidator._validate_administrative_unit( + columns_to_sdtypes_valid + ) expected_message = ( "The AddressValidator can have up to 1 column with sdtype 'state'" " or 'administrative_unit'. Please provide address data with valid fields." ) - with pytest.raises(TransformerInputError, match=re.escape(expected_message)): - AddressValidator._validate_administrative_unit(columns_to_sdtypes_invalid) + with pytest.raises( + TransformerInputError, match=re.escape(expected_message) + ): + AddressValidator._validate_administrative_unit( + columns_to_sdtypes_invalid + ) def test__validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -187,9 +211,15 @@ def test__validate_sdtypes(self): AddressValidator.validate_sdtypes(columns_to_sdtypes) # Assert - AddressValidator._validate_number_columns.assert_called_once_with(columns_to_sdtypes) - AddressValidator._validate_uniqueness_sdtype.assert_called_once_with(columns_to_sdtypes) - AddressValidator._validate_supported_sdtypes.assert_called_once_with(columns_to_sdtypes) + AddressValidator._validate_number_columns.assert_called_once_with( + columns_to_sdtypes + ) + AddressValidator._validate_uniqueness_sdtype.assert_called_once_with( + columns_to_sdtypes + ) + AddressValidator._validate_supported_sdtypes.assert_called_once_with( + columns_to_sdtypes + ) AddressValidator._validate_administrative_unit.assert_called_once_with( 
columns_to_sdtypes ) @@ -197,14 +227,14 @@ def test__validate_sdtypes(self): def test__validate_imports_without_address_module(self): """Test ``validate_imports`` when address module doesn't exist.""" # Run and Assert - expected_message = ( - 'You must have SDV Enterprise with the address add-on to use the address features' - ) + expected_message = 'You must have SDV Enterprise with the address add-on to use the address features' with pytest.raises(ImportError, match=expected_message): AddressValidator.validate_imports() @patch('rdt.transformers._validators.importlib.import_module') - def test__validate_imports_without_premium_features(self, mock_import_module): + def test__validate_imports_without_premium_features( + self, mock_import_module + ): """Test ``validate_imports`` when the user doesn't have the transformers.""" # Setup mock_address = Mock() @@ -213,9 +243,7 @@ def test__validate_imports_without_premium_features(self, mock_import_module): mock_import_module.return_value = mock_address # Run and Assert - expected_message = ( - 'You must have SDV Enterprise with the address add-on to use the address features' - ) + expected_message = 'You must have SDV Enterprise with the address add-on to use the address features' with pytest.raises(ImportError, match=expected_message): AddressValidator.validate_imports() @@ -241,7 +269,9 @@ def test__validate_uniqueness_sdtype(self): 'Please provide GPS data with valid fields.' ) with pytest.raises(TransformerInputError, match=expected_message): - GPSValidator._validate_uniqueness_sdtype(columns_to_sdtypes_invalid) + GPSValidator._validate_uniqueness_sdtype( + columns_to_sdtypes_invalid + ) def test__validate_supported_sdtype(self): """Test ``_validate_supported_sdtype`` method.""" @@ -263,7 +293,9 @@ def test__validate_supported_sdtype(self): 'Please provide a column that is compatible with GPS data.' 
) with pytest.raises(TransformerInputError, match=expected_message): - GPSValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) + GPSValidator._validate_supported_sdtypes( + columns_to_sdtypes_invalid + ) def test__validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -279,20 +311,24 @@ def test__validate_sdtypes(self): GPSValidator.validate_sdtypes(columns_to_sdtypes) # Assert - GPSValidator._validate_uniqueness_sdtype.assert_called_once_with(columns_to_sdtypes) - GPSValidator._validate_supported_sdtypes.assert_called_once_with(columns_to_sdtypes) + GPSValidator._validate_uniqueness_sdtype.assert_called_once_with( + columns_to_sdtypes + ) + GPSValidator._validate_supported_sdtypes.assert_called_once_with( + columns_to_sdtypes + ) def test_validate_import_gps_transformers_without_gps_module(self): """Test ``validate_imports`` when gps module doesn't exist.""" # Run and Assert - expected_message = ( - 'You must have SDV Enterprise with the gps add-on to use the GPS features' - ) + expected_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features' with pytest.raises(ImportError, match=expected_message): GPSValidator.validate_imports() @patch('rdt.transformers._validators.importlib.import_module') - def test_validate_import_gps_transformers_without_premium_features(self, mock_import_module): + def test_validate_import_gps_transformers_without_premium_features( + self, mock_import_module + ): """Test ``validate_imports`` when the user doesn't have the transformers.""" # Setup mock_gps = Mock() @@ -302,8 +338,6 @@ def test_validate_import_gps_transformers_without_premium_features(self, mock_im mock_import_module.return_value = mock_gps # Run and Assert - expected_message = ( - 'You must have SDV Enterprise with the gps add-on to use the GPS features' - ) + expected_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features' with pytest.raises(ImportError, match=expected_message): 
GPSValidator.validate_imports() diff --git a/tests/unit/transformers/test_base.py b/tests/unit/transformers/test_base.py index 2acf054e..7e3aa185 100644 --- a/tests/unit/transformers/test_base.py +++ b/tests/unit/transformers/test_base.py @@ -7,7 +7,11 @@ import pytest from rdt.errors import TransformerInputError -from rdt.transformers import BaseMultiColumnTransformer, BaseTransformer, NullTransformer +from rdt.transformers import ( + BaseMultiColumnTransformer, + BaseTransformer, + NullTransformer, +) from rdt.transformers.base import random_state, set_random_states @@ -40,10 +44,14 @@ def test_set_random_states(mock_numpy): mock_numpy.random.get_state.assert_called() mock_numpy.random.set_state.assert_has_calls([ call(initial_state_value), - call(first_state) + call(first_state), ]) - my_function.assert_called_once_with(mock_numpy.random.RandomState.return_value, 'fit') - mock_numpy.random.RandomState.return_value.set_state.assert_called_with(second_state) + my_function.assert_called_once_with( + mock_numpy.random.RandomState.return_value, 'fit' + ) + mock_numpy.random.RandomState.return_value.set_state.assert_called_with( + second_state + ) @patch('rdt.transformers.base.set_random_states') @@ -66,7 +74,9 @@ def test_random_state(mock_set_random_states): wrapped_function(instance) # Assert - mock_set_random_states.assert_called_once_with({}, 'name', mock_set_random_state) + mock_set_random_states.assert_called_once_with( + {}, 'name', mock_set_random_state + ) my_function.assert_called_once() @@ -91,7 +101,6 @@ def test_random_state_random_states_is_none(mock_set_random_states): class TestBaseTransformer: - def test_set_random_state(self): """Test that the method updates the random state for the correct method.""" # Setup @@ -141,6 +150,7 @@ def test_get_subclasses(self): Output: - a list of classes including the ``Child`` class, but NOT including the ``Parent``. 
""" + # Setup class Parent(BaseTransformer, abc.ABC): pass @@ -189,6 +199,7 @@ def test_get_supported_sdtypes_supported_sdtypes(self): Output: - the list stored in the ``SUPPORTED_SDTYPES`` attribute. """ + # Setup class Dummy(BaseTransformer): SUPPORTED_SDTYPES = ['categorical', 'boolean'] @@ -212,6 +223,7 @@ def test_get_supported_sdtypes_no_supported_sdtypes_provided(self): Output: - A list with the ``INPUT_SDTYPE`` value inside. """ + # Setup class Dummy(BaseTransformer): INPUT_SDTYPE = 'categorical' @@ -230,7 +242,7 @@ def test__get_output_to_property(self): transformer.output_properties = { 'col': {'sdtype': 'float', 'next_transformer': None}, 'ignore': {'next_transformer': None}, - None: {'sdtype': 'categorical', 'next_transformer': None} + None: {'sdtype': 'categorical', 'next_transformer': None}, } # Run @@ -248,8 +260,12 @@ def test__set_missing_value_generation(self): # Run BaseTransformer._set_missing_value_generation(instance_none, None) - BaseTransformer._set_missing_value_generation(instance_random, 'random') - BaseTransformer._set_missing_value_generation(instance_from_column, 'from_column') + BaseTransformer._set_missing_value_generation( + instance_random, 'random' + ) + BaseTransformer._set_missing_value_generation( + instance_from_column, 'from_column' + ) # Assert assert instance_none.missing_value_generation is None @@ -285,10 +301,13 @@ def test_model_missing_values(self, mock_warnings): # Assert assert result is True - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter instead." - ), FutureWarning) + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter instead." 
+ ), + FutureWarning, + ) @patch('rdt.transformers.base.warnings') def test__set_model_missing_values_true(self, mock_warnings): @@ -299,12 +318,17 @@ def test__set_model_missing_values_true(self, mock_warnings): BaseTransformer._set_model_missing_values(instance, True) # Assert - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.'), FutureWarning + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter to select your " + 'strategy.' + ), + FutureWarning, + ) + instance._set_missing_value_generation.assert_called_once_with( + 'from_column' ) - instance._set_missing_value_generation.assert_called_once_with('from_column') @patch('rdt.transformers.base.warnings') def test__set_model_missing_values_false(self, mock_warnings): @@ -315,12 +339,17 @@ def test__set_model_missing_values_false(self, mock_warnings): BaseTransformer._set_model_missing_values(instance, False) # Assert - mock_warnings.warn.assert_called_once_with(( - "Future versions of RDT will not support the 'model_missing_values' parameter. " - "Please switch to using the 'missing_value_generation' parameter to select your " - 'strategy.'), FutureWarning + mock_warnings.warn.assert_called_once_with( + ( + "Future versions of RDT will not support the 'model_missing_values' parameter. " + "Please switch to using the 'missing_value_generation' parameter to select your " + 'strategy.' + ), + FutureWarning, + ) + instance._set_missing_value_generation.assert_called_once_with( + 'random' ) - instance._set_missing_value_generation.assert_called_once_with('random') def test___repr___no_parameters(self): """Test that the ``__str__`` method returns the class name. 
@@ -347,6 +376,7 @@ def test___repr___with_parameters(self): - The class has two parameters in its ``__init__`` method with default values. - The class instance only sets one of them. """ + # Setup class Dummy(BaseTransformer): def __init__(self, param1=None, param2=None, param3=None): @@ -373,6 +403,7 @@ def test__str__(self): - The class has two parameters in its ``__init__`` method with default values. - The class instance only sets one of them. """ + # Setup class Dummy(BaseTransformer): def __init__(self, param1=None, param2=None, param3=None): @@ -390,6 +421,7 @@ def __init__(self, param1=None, param2=None, param3=None): def test_get_output_sdtypes(self): """Test the column_prefix gets added to all columns in output_properties.""" + # Setup class Dummy(BaseTransformer): column_prefix = 'column_name' @@ -414,7 +446,9 @@ class Dummy(BaseTransformer): column_prefix = 'column_name' def __init__(self): - self.output_properties = {None: {'next_transformer': transformer}} + self.output_properties = { + None: {'next_transformer': transformer} + } dummy_transformer = Dummy() @@ -436,6 +470,7 @@ def test_get_input_columns(self): Output: - List matching the list created in the setup. """ + # Setup class Dummy(BaseTransformer): columns = ['col1', 'col2', 'col3'] @@ -461,6 +496,7 @@ def test_get_output_columns(self): Output: - A list of each output name with the prefix prepended. """ + # Setup class Dummy(BaseTransformer): column_prefix = 'column_name' @@ -468,7 +504,7 @@ class Dummy(BaseTransformer): def __init__(self): self.output_properties = { 'out1': {'sdtype': 'numerical'}, - 'out2': {'sdtype': 'float'} + 'out2': {'sdtype': 'float'}, } dummy_transformer = Dummy() @@ -492,6 +528,7 @@ def test_is_generator(self): Output: - the boolean value stored in ``IS_GENERATOR``. """ + # Setup class Dummy(BaseTransformer): IS_GENERATOR = True @@ -518,11 +555,7 @@ def test__store_columns_list(self): - the ``self.columns`` attribute should be set to the list of the passed columns. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['a', 'b'] base_transformer = BaseTransformer() @@ -546,11 +579,7 @@ def test__store_columns_tuple(self): - the ``self.columns`` attribute should be set to a list of the passed columns. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ('a', 'b') base_transformer = BaseTransformer() @@ -581,7 +610,7 @@ def test__store_columns_tuple_in_the_data(self): data = pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6], - ('a', 'b'): [7, 8, 9] + ('a', 'b'): [7, 8, 9], }) columns = ('a', 'b') base_transformer = BaseTransformer() @@ -596,22 +625,18 @@ def test__store_columns_tuple_in_the_data(self): def test__store_columns_string(self): """Test the ``_store_columns`` method when passed a string. - When the columns are passed as a string, it should be treated as the only column - name passed and stored in the ``columns`` attribute as a one element list. + When the columns are passed as a string, it should be treated as the only column + name passed and stored in the ``columns`` attribute as a one element list. - Input: - - a data frame. - - a string with the name of one of the columns of the dataframe. + Input: + - a data frame. + - a string with the name of one of the columns of the dataframe. - Side effects: - - the ``self.columns`` attribute should be set to a list containing the passed string. + Side effects: + - the ``self.columns`` attribute should be set to a list containing the passed string. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = 'a' base_transformer = BaseTransformer() @@ -659,11 +684,7 @@ def test__get_columns_data_multiple_columns(self): - the passed dataframe, but containing only the passed columns. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['a', 'b'] # Run @@ -690,11 +711,7 @@ def test__get_columns_data_single_column(self): - a pandas series, corresponding to the passed column from the dataframe. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) columns = ['b'] # Run @@ -720,22 +737,19 @@ def test__add_columns_to_data_series(self): as they were in columns_data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[2, 0, 1]) columns = ['c'] columns_data = pd.Series([7, 8, 9], name='c') # Run - result = BaseTransformer._add_columns_to_data(data, columns_data, columns) + result = BaseTransformer._add_columns_to_data( + data, columns_data, columns + ) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame( + {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1] + ) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_dataframe(self): @@ -754,26 +768,31 @@ def test__add_columns_to_data_dataframe(self): as they were in columns_data. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + data = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) columns = ['c', 'd'] - columns_data = pd.DataFrame({ - 'c': [7, 8, 9], - 'd': [10, 11, 12] - }) + columns_data = pd.DataFrame({'c': [7, 8, 9], 'd': [10, 11, 12]}) # Run - result = BaseTransformer._add_columns_to_data(data, columns_data, columns) + result = BaseTransformer._add_columns_to_data( + data, columns_data, columns + ) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9], - 'd': [10, 11, 12] - }, index=[2, 0, 1]) + expected = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': [7, 8, 9], + 'd': [10, 11, 12], + }, + index=[2, 0, 1], + ) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_1d_array(self): @@ -792,22 +811,25 @@ def test__add_columns_to_data_1d_array(self): as they were in columns_data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + data = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) columns = ['c'] columns_data = np.array([7, 8, 9], dtype=np.int64) # Run - result = BaseTransformer._add_columns_to_data(data, columns_data, columns) + result = BaseTransformer._add_columns_to_data( + data, columns_data, columns + ) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame( + {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1] + ) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_2d_array(self): @@ -826,25 +848,19 @@ def test__add_columns_to_data_2d_array(self): as they were in columns_data. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3]}, index=[2, 0, 1]) columns = ['b', 'c'] - columns_data = np.array([ - [7, 1], - [8, 5], - [9, 9] - ], dtype=np.int64) + columns_data = np.array([[7, 1], [8, 5], [9, 9]], dtype=np.int64) # Run - result = BaseTransformer._add_columns_to_data(data, columns_data, columns) + result = BaseTransformer._add_columns_to_data( + data, columns_data, columns + ) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [7, 8, 9], - 'c': [1, 5, 9] - }, index=[2, 0, 1]) + expected = pd.DataFrame( + {'a': [1, 2, 3], 'b': [7, 8, 9], 'c': [1, 5, 9]}, index=[2, 0, 1] + ) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_none(self): @@ -860,21 +876,23 @@ def test__add_columns_to_data_none(self): - Data should not be changed. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6] - }, index=[2, 0, 1]) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[2, 0, 1]) columns = [] columns_data = None # Run - result = BaseTransformer._add_columns_to_data(data, columns_data, columns) + result = BaseTransformer._add_columns_to_data( + data, columns_data, columns + ) # Assert - expected = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - }, index=[2, 0, 1]) + expected = pd.DataFrame( + { + 'a': [1, 2, 3], + 'b': [4, 5, 6], + }, + index=[2, 0, 1], + ) pd.testing.assert_frame_equal(result, expected) def test__build_output_columns(self): @@ -897,11 +915,7 @@ def test__build_output_columns(self): from the ``get_output_sdtypes`` method. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b'] @@ -909,7 +923,7 @@ class Dummy(BaseTransformer): def __init__(self): self.output_properties = { None: {'sdtype': 'numerical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } dummy_transformer = Dummy() @@ -950,15 +964,15 @@ def test__build_output_columns_generated_already_exist(self): 'b': [7, 8, 9], 'a#b#.is_null': [0, 0, 0], 'a#b#.is_null#': [0, 0, 0], - }) class Dummy(BaseTransformer): def __init__(self): self.output_properties = { None: {'sdtype': 'numerical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } + columns = ['a', 'b'] # Run @@ -973,11 +987,7 @@ def test__fit_raises_error(self): """Test ``_fit`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1008,11 +1018,7 @@ def test_fit(self): column names to accepted output sdtypes. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) column = ['a'] class Dummy(BaseTransformer): @@ -1020,7 +1026,7 @@ def __init__(self): super().__init__() self.output_properties = { None: {'sdtype': 'categorical'}, - 'is_null': {'sdtype': 'float'} + 'is_null': {'sdtype': 'float'}, } def _fit(self, data): @@ -1034,7 +1040,9 @@ def _fit(self, data): # Assert expected_data = pd.Series([1, 2, 3], name='a') assert dummy_transformer.columns == ['a'] - pd.testing.assert_series_equal(dummy_transformer._passed_data, expected_data) + pd.testing.assert_series_equal( + dummy_transformer._passed_data, expected_data + ) assert dummy_transformer.column_prefix == 'a' assert dummy_transformer.output_columns == ['a', 'a.is_null'] @@ -1042,11 +1050,7 @@ def test__transform_raises_error(self): """Test ``_transform`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1070,11 +1074,7 @@ def test_transform_incorrect_columns(self): - the original data. """ # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b', 'd'] @@ -1082,7 +1082,9 @@ class Dummy(BaseTransformer): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state(np.random.RandomState(42), 'transform') + dummy_transformer.set_random_state( + np.random.RandomState(42), 'transform' + ) transformed_data = dummy_transformer.transform(data) # Assert @@ -1112,11 +1114,7 @@ def test_transform_drop_true(self): and should store it in ``self._passed_data``. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): columns = ['a', 'b'] @@ -1129,7 +1127,9 @@ def _transform(self, data): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state(np.random.RandomState(42), 'transform') + dummy_transformer.set_random_state( + np.random.RandomState(42), 'transform' + ) transformed_data = dummy_transformer.transform(data) # Assert @@ -1137,7 +1137,9 @@ def _transform(self, data): 'a': [1, 2, 3], 'b': [4, 5, 6], }) - pd.testing.assert_frame_equal(dummy_transformer._passed_data, expected_passed) + pd.testing.assert_frame_equal( + dummy_transformer._passed_data, expected_passed + ) expected_transformed = pd.DataFrame({ 'c': [7, 8, 9], @@ -1166,11 +1168,7 @@ def test_fit_transform(self): """ # Setup self = Mock(spec_set=BaseTransformer) - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) column = 'a' # Run @@ -1185,11 +1183,7 @@ def test__reverse_transform_raises_error(self): """Test ``_reverse_transform`` raises ``NotImplementedError``.""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) transformer = BaseTransformer() # Run / Assert @@ -1213,11 +1207,7 @@ def test_reverse_transform_incorrect_columns(self): - the original data. 
""" # Setup - data = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': [4, 5, 6], - 'c': [7, 8, 9] - }) + data = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) class Dummy(BaseTransformer): output_columns = ['a', 'b', 'd'] @@ -1225,7 +1215,9 @@ class Dummy(BaseTransformer): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state(np.random.RandomState(42), 'reverse_transform') + dummy_transformer.set_random_state( + np.random.RandomState(42), 'reverse_transform' + ) transformed_data = dummy_transformer.reverse_transform(data) # Assert @@ -1243,7 +1235,7 @@ def test_reverse_transform(self): data = pd.DataFrame({ 'a': [1, 2, 3], 'b.is_null': [4, 5, 6], - 'c': [7, 8, 9] + 'c': [7, 8, 9], }) class Dummy(BaseTransformer): @@ -1256,7 +1248,9 @@ def _reverse_transform(self, data): # Run dummy_transformer = Dummy() - dummy_transformer.set_random_state(np.random.RandomState(42), 'reverse_transform') + dummy_transformer.set_random_state( + np.random.RandomState(42), 'reverse_transform' + ) transformed_data = dummy_transformer.reverse_transform(data) # Assert @@ -1264,7 +1258,9 @@ def _reverse_transform(self, data): 'a': [1, 2, 3], 'b.is_null': [4, 5, 6], }) - pd.testing.assert_frame_equal(dummy_transformer._passed_data, expected_passed) + pd.testing.assert_frame_equal( + dummy_transformer._passed_data, expected_passed + ) expected_transformed = pd.DataFrame({ 'c': [7, 8, 9], @@ -1275,7 +1271,6 @@ def _reverse_transform(self, data): class TestBaseMultiColumnTransformer: - def test___init__(self): """Test the ``__init__`` method.""" # Setup @@ -1420,7 +1415,9 @@ def test__validate_columns_to_sdtypes(self): 'Columns (d) are not present in the data.' 
) with pytest.raises(ValueError, match=expected_error_msg): - transformer._validate_columns_to_sdtypes(data, wrong_columns_to_sdtypes) + transformer._validate_columns_to_sdtypes( + data, wrong_columns_to_sdtypes + ) def test__validate_sdtypes(self): """Test the ``_validate_sdtypes`` method.""" @@ -1466,10 +1463,10 @@ def test_fit(self): transformer.fit(data, columns_to_sdtypes) # Assert - transformer._validate_columns_to_sdtypes.assert_called_once_with(data, columns_to_sdtypes) - transformer._store_columns.assert_called_once_with( - ['a', 'b'], data + transformer._validate_columns_to_sdtypes.assert_called_once_with( + data, columns_to_sdtypes ) + transformer._store_columns.assert_called_once_with(['a', 'b'], data) transformer._set_seed.assert_called_once_with(data) transformer._get_columns_data.assert_called_once_with(data, ['a', 'b']) transformer._fit.assert_called_once_with(data_transformer) diff --git a/tests/unit/transformers/test_boolean.py b/tests/unit/transformers/test_boolean.py index c186b2d4..7ce9ae82 100644 --- a/tests/unit/transformers/test_boolean.py +++ b/tests/unit/transformers/test_boolean.py @@ -8,7 +8,6 @@ class TestBinaryEncoder(TestCase): - def test___init__(self): """Test default instance""" # Run @@ -18,7 +17,9 @@ def test___init__(self): error_message = 'Unexpected missing_value_replacement' error_generation = 'Unexpected missing_value_generation' assert transformer.missing_value_replacement == 'mode', error_message - assert transformer.missing_value_generation == 'random', error_generation + assert ( + transformer.missing_value_generation == 'random' + ), error_generation def test___init___model_missing_value_passed(self): """Test when model missing value is passed to the init.""" @@ -39,7 +40,9 @@ def test__fit_missing_value_replacement_not_ignore(self): # Asserts error_msg = 'Unexpected fill value' - assert transformer.null_transformer._missing_value_replacement == 0, error_msg + assert ( + 
transformer.null_transformer._missing_value_replacement == 0 + ), error_msg def test__fit_array(self): """Test _fit with numpy.array""" @@ -52,7 +55,9 @@ def test__fit_array(self): # Asserts error_msg = 'Unexpected fill value' - assert transformer.null_transformer._missing_value_replacement == 0, error_msg + assert ( + transformer.null_transformer._missing_value_replacement == 0 + ), error_msg def test__fit_missing_value_generation_from_column(self): """Test output_properties contains 'is_null' column. @@ -84,13 +89,16 @@ def test__transform_series(self): # Asserts expect_call_count = 1 - expect_call_args = pd.Series([0., 1., None, 1., 0.], dtype=float) + expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' - assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg + assert ( + transformer.null_transformer.transform.call_count + == expect_call_count + ), error_msg pd.testing.assert_series_equal( transformer.null_transformer.transform.call_args[0][0], - expect_call_args + expect_call_args, ) def test__transform_array(self): @@ -104,13 +112,16 @@ def test__transform_array(self): # Asserts expect_call_count = 1 - expect_call_args = pd.Series([0., 1., None, 1., 0.], dtype=float) + expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' - assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg + assert ( + transformer.null_transformer.transform.call_count + == expect_call_count + ), error_msg pd.testing.assert_series_equal( transformer.null_transformer.transform.call_args[0][0], - expect_call_args + expect_call_args, ) def test__reverse_transform_missing_value_replacement_not_ignore(self): @@ -122,7 +133,9 @@ def test__reverse_transform_missing_value_replacement_not_ignore(self): # Run transformer = Mock() transformer.missing_value_replacement = 0 - 
transformer.null_transformer.reverse_transform.return_value = transformed_data + transformer.null_transformer.reverse_transform.return_value = ( + transformed_data + ) result = BinaryEncoder._reverse_transform(transformer, data) @@ -136,13 +149,15 @@ def test__reverse_transform_missing_value_replacement_not_ignore(self): 'NullTransformer.reverse_transform should not be called when ' 'missing_value_replacement is ignore' ) - reverse_transform_call_count = transformer.null_transformer.reverse_transform.call_count + reverse_transform_call_count = ( + transformer.null_transformer.reverse_transform.call_count + ) assert reverse_transform_call_count == expect_call_count, error_msg def test__reverse_transform_series(self): """Test when data is a Series.""" # Setup - data = pd.Series([1., 0., 1.]) + data = pd.Series([1.0, 0.0, 1.0]) # Run transformer = Mock() @@ -157,7 +172,7 @@ def test__reverse_transform_series(self): def test__reverse_transform_not_null_values(self): """Test _reverse_transform not null values correctly""" # Setup - data = np.array([1., 0., 1.]) + data = np.array([1.0, 0.0, 1.0]) # Run transformer = Mock() @@ -174,7 +189,7 @@ def test__reverse_transform_not_null_values(self): def test__reverse_transform_2d_ndarray(self): """Test _reverse_transform not null values correctly""" # Setup - data = np.array([[1.], [0.], [1.]]) + data = np.array([[1.0], [0.0], [1.0]]) # Run transformer = Mock() diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index fc9dcdee..c9a5d8d2 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -8,8 +8,14 @@ from rdt.errors import TransformerInputError from rdt.transformers.categorical import ( - CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder, - OrderedUniformEncoder, UniformEncoder) + CustomLabelEncoder, + FrequencyEncoder, + LabelEncoder, + OneHotEncoder, + OrderedLabelEncoder, + 
OrderedUniformEncoder, + UniformEncoder, +) RE_SSN = re.compile(r'\d\d\d-\d\d-\d\d\d\d') @@ -51,7 +57,9 @@ def test__order_categories_alphabetical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal(ordered, np.array(['four', 'one', 'three', 'two'])) + np.testing.assert_array_equal( + ordered, np.array(['four', 'one', 'three', 'two']) + ) def test__order_categories_alphabetical_with_nans(self): """Test the ``_order_categories`` method when ``order_by`` is 'alphabetical'. @@ -71,7 +79,9 @@ def test__order_categories_alphabetical_with_nans(self): ordered = transformer._order_categories(arr) # Assert - expected = np.array(['four', 'one', 'three', 'two', np.nan], dtype='object') + expected = np.array( + ['four', 'one', 'three', 'two', np.nan], dtype='object' + ) pd.testing.assert_series_equal(pd.Series(ordered), pd.Series(expected)) def test__order_categories_alphabetical_float_error(self): @@ -85,7 +95,9 @@ def test__order_categories_alphabetical_float_error(self): arr = np.array([1, 2, 3, 4]) # Run / Assert - message = "The data must be of type string if order_by is 'alphabetical'." + message = ( + "The data must be of type string if order_by is 'alphabetical'." + ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -100,7 +112,9 @@ def test__order_categories_alphabetical_nonstring_object_error(self): arr = np.array([True, False, None]) # Run / Assert - message = "The data must be of type string if order_by is 'alphabetical'." + message = ( + "The data must be of type string if order_by is 'alphabetical'." 
+ ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -122,7 +136,9 @@ def test__order_categories_numerical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal(ordered, np.array([-2.5, 3.11, 5, 67.8, 100, None])) + np.testing.assert_array_equal( + ordered, np.array([-2.5, 3.11, 5, 67.8, 100, None]) + ) def test__order_categories_numerical_error(self): """Test the ``_order_categories`` method when ``order_by`` is 'numerical_value'. @@ -141,7 +157,9 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = ( + "The data must be numerical if order_by is 'numerical_value'." + ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -162,7 +180,9 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = ( + "The data must be numerical if order_by is 'numerical_value'." 
+ ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -182,12 +202,12 @@ def test__fit(self): expected_frequencies = { 'foo': 0.5, 'bar': 0.3333333333333333, - 'tar': 0.16666666666666666 + 'tar': 0.16666666666666666, } expected_intervals = { - 'foo': [0., 0.5], + 'foo': [0.0, 0.5], 'bar': [0.5, 0.8333333333333333], - 'tar': [0.8333333333333333, 1.0] + 'tar': [0.8333333333333333, 1.0], } assert transformer.frequencies == expected_frequencies assert transformer.intervals == expected_intervals @@ -204,12 +224,12 @@ def test__transform(self): transformer.frequencies = { 'foo': 0.5, 'bar': 0.3333333333333333, - 'tar': 0.16666666666666666 + 'tar': 0.16666666666666666, } transformer.intervals = { - 'foo': [0., 0.5], + 'foo': [0.0, 0.5], 'bar': [0.5, 0.8333333333333333], - 'tar': [0.8333333333333333, 1.0] + 'tar': [0.8333333333333333, 1.0], } # Run @@ -217,8 +237,12 @@ def test__transform(self): # Asserts for key in transformer.intervals: - assert (transformed.loc[data == key] >= transformer.intervals[key][0]).all() - assert (transformed.loc[data == key] < transformer.intervals[key][1]).all() + assert ( + transformed.loc[data == key] >= transformer.intervals[key][0] + ).all() + assert ( + transformed.loc[data == key] < transformer.intervals[key][1] + ).all() def test__transform_user_warning(self): """Test the ``transform`` with unknown data. 
@@ -236,15 +260,13 @@ def test__transform_user_warning(self): data_2 = pd.Series([1, 2, 3, 4, 5, 'a', 7, 8, 'b']) transformer = UniformEncoder() transformer.columns = ['col'] - transformer.frequencies = { - 1: 0.25, 2: 0.25, 3: 0.25, 4: 0.25 - } + transformer.frequencies = {1: 0.25, 2: 0.25, 3: 0.25, 4: 0.25} transformer.intervals = { 1: [0, 0.25], 2: [0.25, 0.5], 3: [0.5, 0.75], - 4: [0.75, 1] + 4: [0.75, 1], } # Run @@ -279,19 +301,35 @@ def test__reverse_transform(self, mock_convert_dtype, mock_check_nan): data = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) transformer = UniformEncoder() transformer.dtype = np.int64 - transformer.frequencies = { - 1: 0.222222, - 2: 0.444444, - 3: 0.333333 - } + transformer.frequencies = {1: 0.222222, 2: 0.444444, 3: 0.333333} transformer.intervals = { 1: [0, 0.222222], 2: [0.222222, 0.666666], - 3: [0.666666, 1.0] + 3: [0.666666, 1.0], } - transformed = pd.Series([0.12, 0.254, 0.789, 0.43, 0.56, 0.08, 0.67, 0.98, 0.36]) - mock_convert_dtype.return_value = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) + transformed = pd.Series([ + 0.12, + 0.254, + 0.789, + 0.43, + 0.56, + 0.08, + 0.67, + 0.98, + 0.36, + ]) + mock_convert_dtype.return_value = pd.Series([ + 1, + 2, + 3, + 2, + 2, + 1, + 3, + 3, + 2, + ]) # Run output = transformer._reverse_transform(transformed) @@ -307,23 +345,40 @@ def test__reverse_transform(self, mock_convert_dtype, mock_check_nan): def test__reverse_transform_nans(self): """Test ``_reverse_transform`` for data with NaNs.""" # Setup - data = pd.Series(['a', 'b', 'NaN', np.nan, 'NaN', 'b', 'b', 'a', 'b', np.nan]) + data = pd.Series([ + 'a', + 'b', + 'NaN', + np.nan, + 'NaN', + 'b', + 'b', + 'a', + 'b', + np.nan, + ]) transformer = UniformEncoder() transformer.dtype = object - transformer.frequencies = { - 'a': 0.2, - 'b': 0.4, - 'NaN': 0.2, - np.nan: 0.2 - } + transformer.frequencies = {'a': 0.2, 'b': 0.4, 'NaN': 0.2, np.nan: 0.2} transformer.intervals = { 'a': [0, 0.2], 'b': [0.2, 0.6], 'NaN': [0.6, 0.8], - np.nan: [0.8, 1] 
+ np.nan: [0.8, 1], } - transformed = pd.Series([0.12, 0.254, 0.789, 0.88, 0.69, 0.53, 0.47, 0.08, 0.39, 0.92]) + transformed = pd.Series([ + 0.12, + 0.254, + 0.789, + 0.88, + 0.69, + 0.53, + 0.47, + 0.08, + 0.39, + 0.92, + ]) # Run output = transformer._reverse_transform(transformed) @@ -369,7 +424,9 @@ def test___init__(self): transformer = OrderedUniformEncoder(order=['b', 'c', 'a', None]) # Asserts - pd.testing.assert_series_equal(transformer.order, pd.Series(['b', 'c', 'a', np.nan])) + pd.testing.assert_series_equal( + transformer.order, pd.Series(['b', 'c', 'a', np.nan]) + ) def test___init___duplicate_categories(self): """Test the ``__init__`` method errors if duplicate categories provided. @@ -390,13 +447,17 @@ def test___repr___default(self): The order should be printed as instead of the actual order. """ # Setup - transformer = OrderedUniformEncoder(order=['VISA', 'AMEX', 'DISCOVER', None]) + transformer = OrderedUniformEncoder( + order=['VISA', 'AMEX', 'DISCOVER', None] + ) # Run stringified_transformer = transformer.__repr__() # Assert - assert stringified_transformer == 'OrderedUniformEncoder(order=)' + assert ( + stringified_transformer == 'OrderedUniformEncoder(order=)' + ) def test__fit(self): """Test the ``_fit`` method.""" @@ -412,13 +473,13 @@ def test__fit(self): 2.0: 0.2857142857142857, 3.0: 0.14285714285714285, None: 0.14285714285714285, - 1.0: 0.42857142857142855 + 1.0: 0.42857142857142855, } expected_intervals = { 2.0: [0.0, 0.2857142857142857], 3.0: [0.2857142857142857, 0.42857142857142855], None: [0.42857142857142855, 0.5714285714285714], - 1.0: [0.5714285714285714, 1.0] + 1.0: [0.5714285714285714, 1.0], } assert transformer.frequencies == expected_frequencies assert transformer.intervals == expected_intervals @@ -502,8 +563,12 @@ def test__transform(self): # Asserts for key in transformer.intervals: - assert (transformed.loc[data == key] >= transformer.intervals[key][0]).all() - assert (transformed.loc[data == key] < 
transformer.intervals[key][1]).all() + assert ( + transformed.loc[data == key] >= transformer.intervals[key][0] + ).all() + assert ( + transformed.loc[data == key] < transformer.intervals[key][1] + ).all() def test__transform_error(self): """Test the ``_transform`` method checks that data is in ``self.order``. @@ -524,7 +589,6 @@ def test__transform_error(self): class TestFrequencyEncoder: - def test___setstate__(self): """Test the ``__set_state__`` method. @@ -540,11 +604,7 @@ def test___setstate__(self): transformer = FrequencyEncoder() # Run - transformer.__setstate__({ - 'intervals': { - None: 'abc' - } - }) + transformer.__setstate__({'intervals': {None: 'abc'}}) # Assert assert transformer.__dict__['intervals'][np.nan] == 'abc' @@ -580,33 +640,28 @@ def test__get_intervals(self): # Asserts expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, 'bar': 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', 'bar', 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert result[0] == expected_intervals @@ -634,33 +689,28 @@ def test__get_intervals_nans(self): # Assert expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), np.nan: ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, np.nan: 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 
0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', np.nan, 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert result[0] == expected_intervals @@ -677,33 +727,28 @@ def test__fit_intervals(self): # Asserts expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } expected_means = pd.Series({ 'foo': 0.25, 'bar': 0.6666666666666666, - 'tar': 0.9166666666666666 + 'tar': 0.9166666666666666, }) expected_starts = pd.DataFrame({ 'category': ['foo', 'bar', 'tar'], - 'start': [0, 0.5, 0.8333333333333333] + 'start': [0, 0.5, 0.8333333333333333], }).set_index('start') assert transformer.intervals == expected_intervals @@ -763,24 +808,19 @@ def test__reverse_transform_series(self, mock_check_nan): pd.testing.assert_series_equal(mock_input_data, rt_data) assert mock_input_dtype == transformer.dtype expected_intervals = { - 'foo': ( - 0, - 0.5, - 0.25, - 0.5 / 6 - ), + 'foo': (0, 0.5, 0.25, 0.5 / 6), 'bar': ( 0.5, 0.8333333333333333, 0.6666666666666666, - 0.05555555555555555 + 0.05555555555555555, ), 'tar': ( 0.8333333333333333, 0.9999999999999999, 0.9166666666666666, - 0.027777777777777776 - ) + 0.027777777777777776, + ), } assert transformer.intervals == expected_intervals @@ -855,14 +895,26 @@ def test__transform_by_category_called(self): data = pd.Series([1, 3, 3, 2, 1]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run - transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) + transformed = FrequencyEncoder._transform( + 
categorical_transformer_mock, data + ) # Asserts - categorical_transformer_mock._transform_by_category.assert_called_once_with(data) - assert transformed == categorical_transformer_mock._transform_by_category.return_value + categorical_transformer_mock._transform_by_category.assert_called_once_with( + data + ) + assert ( + transformed + == categorical_transformer_mock._transform_by_category.return_value + ) def test__transform_by_category(self): """Test the `_transform_by_category` method with numerical data. @@ -950,6 +1002,7 @@ def test__transform_by_category_add_noise_true(self, norm_mock): - ``rvs_mock`` should be called four times, one for each element of the intervals dictionary. """ + # Setup def rvs_mock_func(loc, scale, **kwargs): return loc @@ -974,10 +1027,30 @@ def rvs_mock_func(loc, scale, **kwargs): expected = np.array([0.875, 0.375, 0.375, 0.625, 0.875]) assert (transformed == expected).all() norm_mock.rvs.assert_has_calls([ - call(0.125, 0.041666666666666664, size=0, random_state=transform_random_state_mock), - call(0.375, 0.041666666666666664, size=2, random_state=transform_random_state_mock), - call(0.625, 0.041666666666666664, size=1, random_state=transform_random_state_mock), - call(0.875, 0.041666666666666664, size=2, random_state=transform_random_state_mock), + call( + 0.125, + 0.041666666666666664, + size=0, + random_state=transform_random_state_mock, + ), + call( + 0.375, + 0.041666666666666664, + size=2, + random_state=transform_random_state_mock, + ), + call( + 0.625, + 0.041666666666666664, + size=1, + random_state=transform_random_state_mock, + ), + call( + 0.875, + 0.041666666666666664, + size=2, + random_state=transform_random_state_mock, + ), ]) def test__transform_by_row_called(self): @@ -999,14 +1072,26 @@ def test__transform_by_row_called(self): data = pd.Series([1, 2, 3, 4]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = 
pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run - transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) + transformed = FrequencyEncoder._transform( + categorical_transformer_mock, data + ) # Asserts - categorical_transformer_mock._transform_by_row.assert_called_once_with(data) - assert transformed == categorical_transformer_mock._transform_by_row.return_value + categorical_transformer_mock._transform_by_row.assert_called_once_with( + data + ) + assert ( + transformed + == categorical_transformer_mock._transform_by_row.return_value + ) def test__transform_by_row(self): """Test the `_transform_by_row` method with numerical data. @@ -1056,16 +1141,27 @@ def test__reverse_transform_by_category_called(self): transform_data = pd.Series([1, 3, 3, 2, 1]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) # Run reverse = FrequencyEncoder._reverse_transform( - categorical_transformer_mock, transform_data) + categorical_transformer_mock, transform_data + ) # Asserts - reverse_arg = categorical_transformer_mock._reverse_transform_by_category.call_args[0][0] + reverse_arg = categorical_transformer_mock._reverse_transform_by_category.call_args[ + 0 + ][0] np.testing.assert_array_equal(reverse_arg, transform_data.clip(0, 1)) - assert reverse == categorical_transformer_mock._reverse_transform_by_category.return_value + assert ( + reverse + == categorical_transformer_mock._reverse_transform_by_category.return_value + ) def test__reverse_transform_by_category(self): """Test the _reverse_transform_by_category method with numerical data. 
@@ -1084,7 +1180,9 @@ def test__reverse_transform_by_category(self): transformed = pd.Series([0.875, 0.375, 0.375, 0.625, 0.875]) transformer = FrequencyEncoder() - transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) + transformer.means = pd.Series( + [0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1] + ) transformer.intervals = { 4: (0, 0.25, 0.125, 0.041666666666666664), 3: (0.25, 0.5, 0.375, 0.041666666666666664), @@ -1114,7 +1212,7 @@ def test__get_category_from_start(self): transformer = FrequencyEncoder() transformer.starts = pd.DataFrame({ 'start': [0.0, 0.5, 0.7], - 'category': ['a', 'b', 'c'] + 'category': ['a', 'b', 'c'], }).set_index('start') # Run @@ -1142,18 +1240,33 @@ def test__reverse_transform_by_row_called(self): data = pd.Series([1, 2, 3, 4]) categorical_transformer_mock = Mock() - categorical_transformer_mock.means = pd.Series([0.125, 0.375, 0.625, 0.875]) + categorical_transformer_mock.means = pd.Series([ + 0.125, + 0.375, + 0.625, + 0.875, + ]) categorical_transformer_mock.starts = pd.DataFrame( - [0., 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category']) + [0.0, 0.25, 0.5, 0.75], index=[4, 3, 2, 1], columns=['category'] + ) categorical_transformer_mock._normalize.return_value = data # Run - reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data) + reverse = FrequencyEncoder._reverse_transform( + categorical_transformer_mock, data + ) # Asserts - reverse_arg = categorical_transformer_mock._reverse_transform_by_row.call_args[0][0] + reverse_arg = ( + categorical_transformer_mock._reverse_transform_by_row.call_args[ + 0 + ][0] + ) np.testing.assert_array_equal(reverse_arg, data.clip(0, 1)) - assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value + assert ( + reverse + == categorical_transformer_mock._reverse_transform_by_row.return_value + ) @patch('rdt.transformers.categorical.check_nan_in_transform') def test__reverse_transform_by_row(self, 
mock_check_nan): @@ -1174,9 +1287,12 @@ def test__reverse_transform_by_row(self, mock_check_nan): transformed = pd.Series([0.875, 0.625, 0.375, 0.125]) transformer = FrequencyEncoder() - transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) + transformer.means = pd.Series( + [0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1] + ) transformer.starts = pd.DataFrame( - [4, 3, 2, 1], index=[0., 0.25, 0.5, 0.75], columns=['category']) + [4, 3, 2, 1], index=[0.0, 0.25, 0.5, 0.75], columns=['category'] + ) transformer.intervals = { 4: (0, 0.25, 0.125, 0.041666666666666664), 3: (0.25, 0.5, 0.375, 0.041666666666666664), @@ -1197,7 +1313,6 @@ def test__reverse_transform_by_row(self, mock_check_nan): class TestOneHotEncoder: - def test__prepare_data_empty_lists(self): # Setup ohe = OneHotEncoder() @@ -1414,11 +1529,7 @@ def test__transform_no_nan(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_no_nan_categorical(self): @@ -1445,11 +1556,7 @@ def test__transform_no_nan_categorical(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_nans_encoded(self): @@ -1475,12 +1582,7 @@ def test__transform_nans_encoded(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [0, 0, 1], - [0, 0, 1], - [1, 0, 0], - [0, 1, 0] - ]) + expected = np.array([[0, 0, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) np.testing.assert_array_equal(out, expected) def test__transform_nans_categorical(self): @@ -1509,12 +1611,7 @@ def test__transform_nans_categorical(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [0, 0, 1], - [0, 0, 1], - [1, 0, 0], - [0, 1, 0] - ]) + 
expected = np.array([[0, 0, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]]) np.testing.assert_array_equal(out, expected) def test__transform_single_column(self): @@ -1539,11 +1636,7 @@ def test__transform_single_column(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_single_categorical(self): @@ -1571,11 +1664,7 @@ def test__transform_single_categorical(self): out = ohe._transform_helper(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_zeros(self): @@ -1600,11 +1689,7 @@ def test__transform_zeros(self): out = ohe._transform_helper(pd.Series(['b', 'b', 'b'])) # Assert - expected = np.array([ - [0], - [0], - [0] - ]) + expected = np.array([[0], [0], [0]]) np.testing.assert_array_equal(out, expected) def test__transform_zeros_categorical(self): @@ -1632,11 +1717,7 @@ def test__transform_zeros_categorical(self): out = ohe._transform_helper(pd.Series(['b', 'b', 'b'])) # Assert - expected = np.array([ - [0], - [0], - [0] - ]) + expected = np.array([[0], [0], [0]]) np.testing.assert_array_equal(out, expected) def test__transform_unknown_nan(self): @@ -1662,11 +1743,7 @@ def test__transform_unknown_nan(self): out = ohe._transform_helper(pd.Series(['b', 'b', np.nan])) # Assert - expected = np.array([ - [0, 0], - [0, 0], - [0, 1] - ]) + expected = np.array([[0, 0], [0, 0], [0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_no_nans(self): @@ -1689,11 +1766,7 @@ def test__transform_no_nans(self): out = ohe._transform(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_nans(self): @@ -1716,11 +1789,7 @@ def test__transform_nans(self): out = 
ohe._transform(data) # Assert - expected = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) np.testing.assert_array_equal(out, expected) def test__transform_single_column_filled_with_ones(self): @@ -1743,11 +1812,7 @@ def test__transform_single_column_filled_with_ones(self): out = ohe._transform(data) # Assert - expected = np.array([ - [1], - [1], - [1] - ]) + expected = np.array([[1], [1], [1]]) np.testing.assert_array_equal(out, expected) def test__transform_unknown(self): @@ -1782,7 +1847,7 @@ def test__transform_unknown(self): [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], - [0, 0, 0, 0] + [0, 0, 0, 0], ]) np.testing.assert_array_equal(out, expected) @@ -1816,7 +1881,9 @@ def test__transform_numeric(self): @patch('rdt.transformers.categorical.check_nan_in_transform') @patch('rdt.transformers.categorical.try_convert_to_dtype') - def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan): + def test__reverse_transform_no_nans( + self, mock_convert_dtype, mock_check_nan + ): # Setup ohe = OneHotEncoder() data = pd.Series(['a', 'b', 'c']) @@ -1824,11 +1891,7 @@ def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan): mock_convert_dtype.return_value = data # Run - transformed = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + transformed = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1847,11 +1910,7 @@ def test__reverse_transform_nans(self): ohe._fit(data) # Run - transformed = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]) + transformed = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1865,11 +1924,7 @@ def test__reverse_transform_single(self): ohe._fit(data) # Run - transformed = np.array([ - [1], - [1], - [1] - ]) + transformed = np.array([[1], [1], [1]]) out = ohe._reverse_transform(transformed) # Assert @@ -1892,11 +1947,12 @@ def 
test__reverse_transform_1d(self): class TestLabelEncoder: - def test___init__(self): """Passed arguments must be stored as attributes.""" # Run - transformer = LabelEncoder(add_noise='add_noise_value', order_by='alphabetical') + transformer = LabelEncoder( + add_noise='add_noise_value', order_by='alphabetical' + ) # Asserts assert transformer.add_noise == 'add_noise_value' @@ -1939,7 +1995,9 @@ def test__order_categories_alphabetical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal(ordered, np.array(['four', 'one', 'three', 'two'])) + np.testing.assert_array_equal( + ordered, np.array(['four', 'one', 'three', 'two']) + ) def test__order_categories_alphabetical_with_nans(self): """Test the ``_order_categories`` method when ``order_by`` is 'alphabetical'. @@ -1961,7 +2019,9 @@ def test__order_categories_alphabetical_with_nans(self): ordered = transformer._order_categories(arr) # Assert - expected = np.array(['four', 'one', 'three', 'two', np.nan], dtype='object') + expected = np.array( + ['four', 'one', 'three', 'two', np.nan], dtype='object' + ) pd.testing.assert_series_equal(pd.Series(ordered), pd.Series(expected)) def test__order_categories_alphabetical_error(self): @@ -1984,7 +2044,9 @@ def test__order_categories_alphabetical_error(self): arr = np.array([1, 2, 3, 4]) # Run / Assert - message = "The data must be of type string if order_by is 'alphabetical'." + message = ( + "The data must be of type string if order_by is 'alphabetical'." 
+ ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2008,7 +2070,9 @@ def test__order_categories_numerical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal(ordered, np.array([-2.5, 3.11, 5, 67.8, 100, np.nan])) + np.testing.assert_array_equal( + ordered, np.array([-2.5, 3.11, 5, 67.8, 100, np.nan]) + ) def test__order_categories_numerical_error(self): """Test the ``_order_categories`` method when ``order_by`` is 'numerical_value'. @@ -2030,7 +2094,9 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = ( + "The data must be numerical if order_by is 'numerical_value'." + ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2054,7 +2120,9 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ("The data must be numerical if order_by is 'numerical_value'.") + message = ( + "The data must be numerical if order_by is 'numerical_value'." 
+ ) with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2122,7 +2190,7 @@ def test__transform(self): transformed = transformer._transform(data) # Assert - expected = pd.Series([0., 1., 2.]) + expected = pd.Series([0.0, 1.0, 2.0]) pd.testing.assert_series_equal(transformed[:-1], expected) assert 0 <= transformed[3] <= 2 @@ -2182,11 +2250,21 @@ def test__transform_unseen_categories(self): # Run with pytest.warns(UserWarning): - transform_data = pd.Series(['a', 2, True, np.nan, np.nan, np.nan, 'b', False, 3]) + transform_data = pd.Series([ + 'a', + 2, + True, + np.nan, + np.nan, + np.nan, + 'b', + False, + 3, + ]) transformed = transformer._transform(transform_data) # Assert - expected = pd.Series([0., 1., 2.]) + expected = pd.Series([0.0, 1.0, 2.0]) pd.testing.assert_series_equal(transformed[:3], expected) assert all(0 <= value < len(fit_data) for value in transformed[3:]) @@ -2216,7 +2294,9 @@ def test__reverse_transform_clips_values(self): @patch('rdt.transformers.categorical.check_nan_in_transform') @patch('rdt.transformers.categorical.try_convert_to_dtype') - def test__reverse_transform_add_noise(self, mock_convert_dtype, mock_check_nan): + def test__reverse_transform_add_noise( + self, mock_convert_dtype, mock_check_nan + ): """Test the ``_reverse_transform`` method with ``add_noise``. Test that the method correctly reverse transforms the data @@ -2264,18 +2344,21 @@ def test__reverse_transform_integer_and_nans(self): class TestOrderedLabelEncoder: - def test___init__(self): """The the ``__init__`` method. Passed arguments must be stored as attributes. 
""" # Run - transformer = OrderedLabelEncoder(order=['b', 'c', 'a', None], add_noise='add_noise_value') + transformer = OrderedLabelEncoder( + order=['b', 'c', 'a', None], add_noise='add_noise_value' + ) # Asserts assert transformer.add_noise == 'add_noise_value' - pd.testing.assert_series_equal(transformer.order, pd.Series(['b', 'c', 'a', np.nan])) + pd.testing.assert_series_equal( + transformer.order, pd.Series(['b', 'c', 'a', np.nan]) + ) def test___init___duplicate_categories(self): """The the ``__init__`` method with duplicate categories in the order parameter. @@ -2288,7 +2371,9 @@ def test___init___duplicate_categories(self): 'Please drop the duplicates to proceed.' ) with pytest.raises(TransformerInputError, match=expected_msg): - OrderedLabelEncoder(order=['b', 'c', 'a', 'a'], add_noise='add_noise_value') + OrderedLabelEncoder( + order=['b', 'c', 'a', 'a'], add_noise='add_noise_value' + ) def test___repr___default(self): """Test that the ``__repr__`` method prints the custom order. @@ -2296,7 +2381,9 @@ def test___repr___default(self): The order should be printed as instead of the actual order. """ # Setup - transformer = OrderedLabelEncoder(order=['VISA', 'AMEX', 'DISCOVER', None]) + transformer = OrderedLabelEncoder( + order=['VISA', 'AMEX', 'DISCOVER', None] + ) # Run stringified_transformer = transformer.__repr__() @@ -2311,13 +2398,18 @@ def test___repr___add_noise_true(self): is provided, it should be printed too. """ # Setup - transformer = OrderedLabelEncoder(order=['VISA', 'AMEX', 'DISCOVER', None], add_noise=True) + transformer = OrderedLabelEncoder( + order=['VISA', 'AMEX', 'DISCOVER', None], add_noise=True + ) # Run stringified_transformer = transformer.__repr__() # Assert - assert stringified_transformer == 'OrderedLabelEncoder(order=, add_noise=True)' + assert ( + stringified_transformer + == 'OrderedLabelEncoder(order=, add_noise=True)' + ) def test__fit(self): """Test the ``_fit`` method. 
@@ -2348,10 +2440,14 @@ def test__fit(self): expected_values_to_categories = {0: 2, 1: 3, 2: np.nan, 3: 1} expected_categories_to_values = {2: 0, 3: 1, 1: 3, np.nan: 2} for key, value in transformer.values_to_categories.items(): - assert value == expected_values_to_categories[key] or pd.isna(value) + assert value == expected_values_to_categories[key] or pd.isna( + value + ) for key, value in transformer.categories_to_values.items(): - assert value == expected_categories_to_values.get(key) or pd.isna(key) + assert value == expected_categories_to_values.get(key) or pd.isna( + key + ) def test__fit_error(self): """Test the ``_fit`` method checks that data is in ``self.order``. @@ -2382,7 +2478,6 @@ def test__fit_error(self): class TestCustomLabelEncoder: - def test___init__(self): """Test the warning message for backwards compatibility of ``CustomLabelEncoder``.""" # Setup / Run / Assert diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index d0070723..06359452 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -5,12 +5,14 @@ import pandas as pd import pytest -from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder +from rdt.transformers.datetime import ( + OptimizedTimestampEncoder, + UnixTimestampEncoder, +) from rdt.transformers.null import NullTransformer class TestUnixTimestampEncoder: - def test___init__(self): """Test the ``__init__`` method and the passed arguments are stored as attributes.""" # Run @@ -33,7 +35,7 @@ def test___init__with_model_missing_values(self): transformer = UnixTimestampEncoder( missing_value_replacement='mode', model_missing_values=False, - datetime_format='%M-%d-%Y' + datetime_format='%M-%d-%Y', ) # Asserts @@ -62,7 +64,9 @@ def test__convert_to_datetime(self): converted_data = transformer._convert_to_datetime(data) # Assert - expected_data = pd.Series(pd.to_datetime(['2020-01-01', '2020-02-01', 
'2020-03-01'])) + expected_data = pd.Series( + pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) + ) pd.testing.assert_series_equal(expected_data, converted_data) def test__convert_to_datetime_format(self): @@ -89,7 +93,9 @@ def test__convert_to_datetime_format(self): converted_data = transformer._convert_to_datetime(data) # Assert - expected_data = pd.Series(pd.to_datetime(['01Feb2020', '02Mar2020', '03Jan2010'])) + expected_data = pd.Series( + pd.to_datetime(['01Feb2020', '02Mar2020', '03Jan2010']) + ) pd.testing.assert_series_equal(expected_data, converted_data) def test__convert_to_datetime_not_convertible_raises_error(self): @@ -106,11 +112,17 @@ def test__convert_to_datetime_not_convertible_raises_error(self): - a ``TypeError`` is raised. """ # Setup - data = pd.Series(['2020-01-01-can', '2020-02-01-not', '2020-03-01-convert']) + data = pd.Series([ + '2020-01-01-can', + '2020-02-01-not', + '2020-03-01-convert', + ]) transformer = UnixTimestampEncoder() # Run - error_message = 'Data must be of dtype datetime, or castable to datetime.' + error_message = ( + 'Data must be of dtype datetime, or castable to datetime.' + ) with pytest.raises(TypeError, match=error_message): transformer._convert_to_datetime(data) @@ -181,9 +193,15 @@ def test__transform_helper(self): transformed = transformer._transform_helper(data) # Assert - np.testing.assert_allclose(transformed, np.array([ - 1.577837e+18, 1.580515e+18, 1.583021e+18, - ]), rtol=1e-5) + np.testing.assert_allclose( + transformed, + np.array([ + 1.577837e18, + 1.580515e18, + 1.583021e18, + ]), + rtol=1e-5, + ) def test__reverse_transform_helper_nulls(self): """Test the ``_reverse_transform_helper`` with null values. 
@@ -205,14 +223,18 @@ def test__reverse_transform_helper_nulls(self): data = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) transformer = UnixTimestampEncoder(missing_value_replacement='mean') transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = pd.Series([1, 2, 3]) + transformer.null_transformer.reverse_transform.return_value = ( + pd.Series([1, 2, 3]) + ) # Run transformer._reverse_transform_helper(data) # Assert transformer.null_transformer.reverse_transform.assert_called_once() - datetimes = transformer.null_transformer.reverse_transform.mock_calls[0][1][0] + datetimes = transformer.null_transformer.reverse_transform.mock_calls[ + 0 + ][1][0] np.testing.assert_array_equal(data.to_numpy(), datetimes) def test__reverse_transform_helper_model_missing_values_true(self): @@ -235,14 +257,18 @@ def test__reverse_transform_helper_model_missing_values_true(self): data = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) transformer = UnixTimestampEncoder(model_missing_values=True) transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = pd.Series([1, 2, 3]) + transformer.null_transformer.reverse_transform.return_value = ( + pd.Series([1, 2, 3]) + ) # Run transformer._reverse_transform_helper(data) # Assert transformer.null_transformer.reverse_transform.assert_called_once() - datetimes = transformer.null_transformer.reverse_transform.mock_calls[0][1][0] + datetimes = transformer.null_transformer.reverse_transform.mock_calls[ + 0 + ][1][0] np.testing.assert_array_equal(data.to_numpy(), datetimes) @patch('rdt.transformers.datetime.NullTransformer') @@ -266,7 +292,8 @@ def test__fit(self, null_transformer_mock): assert null_transformer_mock.return_value.fit.call_count == 1 np.testing.assert_allclose( null_transformer_mock.return_value.fit.call_args_list[0][0][0], - np.array([1.577837e+18, 1.580515e+18, 1.583021e+18]), rtol=1e-5 + np.array([1.577837e18, 
1.580515e18, 1.583021e18]), + rtol=1e-5, ) def test__fit_enforce_min_max_values(self): @@ -283,8 +310,8 @@ def test__fit_enforce_min_max_values(self): transformer._fit(data) # Assert - assert transformer._min_value == 1.5778368e+18 - assert transformer._max_value == 1.5830208e+18 + assert transformer._min_value == 1.5778368e18 + assert transformer._max_value == 1.5830208e18 def test__fit_calls_transform_helper(self): """Test the ``_fit`` method. @@ -307,7 +334,9 @@ def test__fit_calls_transform_helper(self): } @patch('rdt.transformers.datetime._guess_datetime_format_for_array') - def test__fit_calls_guess_datetime_format(self, mock__guess_datetime_format_for_array): + def test__fit_calls_guess_datetime_format( + self, mock__guess_datetime_format_for_array + ): """Test the ``_fit`` method. The ``_fit`` method should call the ``_transform_helper`` method. @@ -323,7 +352,7 @@ def test__fit_calls_guess_datetime_format(self, mock__guess_datetime_format_for_ # Assert np.testing.assert_array_equal( mock__guess_datetime_format_for_array.call_args[0][0], - np.array(['2020-02-01', '2020-03-01']) + np.array(['2020-02-01', '2020-03-01']), ) assert transformer.datetime_format == '%Y-%m-%d' @@ -334,7 +363,9 @@ def test__fit_missing_value_generation(self): column. 
""" # Setup - transformer = UnixTimestampEncoder(missing_value_generation='from_column') + transformer = UnixTimestampEncoder( + missing_value_generation='from_column' + ) data = pd.Series(['2020-02-01', np.nan]) # Run @@ -364,7 +395,8 @@ def test__transform(self): assert transformer.null_transformer.transform.call_count == 1 np.testing.assert_allclose( transformer.null_transformer.transform.call_args_list[0][0], - np.array([[1.577837e+18, 1.580515e+18, 1.583021e+18]]), rtol=1e-5 + np.array([[1.577837e18, 1.580515e18, 1.583021e18]]), + rtol=1e-5, ) def test__reverse_transform_all_none(self): @@ -390,14 +422,16 @@ def test__reverse_transform(self): """ # Setup ute = UnixTimestampEncoder() - transformed = np.array([1.5778368e+18, 1.5805152e+18, 1.5830208e+18]) + transformed = np.array([1.5778368e18, 1.5805152e18, 1.5830208e18]) ute.null_transformer = NullTransformer('mean') # Run output = ute._reverse_transform(transformed) # Assert - expected = pd.Series(pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])) + expected = pd.Series( + pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) + ) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_enforce_min_max_values(self): @@ -409,19 +443,29 @@ def test__reverse_transform_enforce_min_max_values(self): # Setup ute = UnixTimestampEncoder(enforce_min_max_values=True) transformed = np.array([ - 1.5678367e+18, 1.5778368e+18, 1.5805152e+18, 1.5830208e+18, 1.5930209e+18 + 1.5678367e18, + 1.5778368e18, + 1.5805152e18, + 1.5830208e18, + 1.5930209e18, ]) ute.null_transformer = NullTransformer('mean') - ute._min_value = 1.5778368e+18 - ute._max_value = 1.5830208e+18 + ute._min_value = 1.5778368e18 + ute._max_value = 1.5830208e18 # Run output = ute._reverse_transform(transformed) # Assert - expected = pd.Series(pd.to_datetime([ - '2020-01-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-03-01' - ])) + expected = pd.Series( + pd.to_datetime([ + '2020-01-01', + '2020-01-01', + '2020-02-01', + 
'2020-03-01', + '2020-03-01', + ]) + ) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_datetime_format_dtype_is_datetime(self): @@ -429,7 +473,7 @@ def test__reverse_transform_datetime_format_dtype_is_datetime(self): # Setup ute = UnixTimestampEncoder() ute.datetime_format = '%b %d, %Y' - transformed = np.array([1.5778368e+18, 1.5805152e+18, 1.5830208e+18]) + transformed = np.array([1.5778368e18, 1.5805152e18, 1.5830208e18]) ute._dtype = np.dtype(' Date: Fri, 19 Apr 2024 11:02:25 -0400 Subject: [PATCH 03/17] fix quote --- tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tasks.py b/tasks.py index 53921d6b..296c9939 100644 --- a/tasks.py +++ b/tasks.py @@ -102,7 +102,8 @@ def install_minimum(c): minimum_versions = _get_minimum_versions(dependencies, python_version) if minimum_versions: - c.run(f'python -m pip install {' '.join(minimum_versions)}') + install_deps = ' '.join(minimum_versions) + c.run(f'python -m pip install {install_deps}') @task From 7f309bd16e760622cc15fa27975cb2fac5c990c5 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Fri, 19 Apr 2024 11:50:47 -0500 Subject: [PATCH 04/17] =?UTF-8?q?Bump=20version:=201.12.0=20=E2=86=92=201.?= =?UTF-8?q?12.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- rdt/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9ddbc16b..416f97a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -166,7 +166,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.0" +current_version = "1.12.1.dev0" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' 
serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/rdt/__init__.py b/rdt/__init__.py index cd03c1c9..3d21df5d 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = 'DataCebo, Inc.' __email__ = 'info@sdv.dev' -__version__ = '1.12.0' +__version__ = '1.12.1.dev0' import sys From 3c002f3a31c8fa54f75a30d5db163eaafa43161e Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Mon, 22 Apr 2024 08:22:06 -0700 Subject: [PATCH 05/17] Fix warning (#811) --- rdt/__init__.py | 2 +- tests/unit/test___init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rdt/__init__.py b/rdt/__init__.py index 3d21df5d..dffca04a 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -159,7 +159,7 @@ def _find_addons(): try: addon = entry_point.load() except Exception: # pylint: disable=broad-exception-caught - msg = f'Failed to load "{entry_point.name}" from "{entry_point.version}".' + msg = f'Failed to load "{entry_point.name}" from "{entry_point.value}".' warnings.warn(msg) continue diff --git a/tests/unit/test___init__.py b/tests/unit/test___init__.py index a33e124b..ab4dbf5d 100644 --- a/tests/unit/test___init__.py +++ b/tests/unit/test___init__.py @@ -120,7 +120,7 @@ def entry_point_error(): bad_entry_point = Mock() bad_entry_point.name = 'bad_entry_point' - bad_entry_point.version = 'bad_module' + bad_entry_point.value = 'bad_module' bad_entry_point.load.side_effect = entry_point_error entry_points_mock.return_value = [bad_entry_point] msg = 'Failed to load "bad_entry_point" from "bad_module".' 
From f381930a451081639310a0c25cbf5823af66513e Mon Sep 17 00:00:00 2001 From: gsheni Date: Thu, 25 Apr 2024 11:46:20 -0400 Subject: [PATCH 06/17] lint --- pyproject.toml | 15 +- rdt/__init__.py | 19 +- rdt/hyper_transformer.py | 140 ++++--------- rdt/performance/datasets/boolean.py | 4 +- rdt/performance/datasets/categorical.py | 8 +- rdt/performance/datasets/datetime.py | 9 +- rdt/performance/datasets/numerical.py | 8 +- rdt/performance/performance.py | 12 +- rdt/performance/profiling.py | 24 +-- rdt/transformers/__init__.py | 9 +- rdt/transformers/_validators.py | 28 ++- rdt/transformers/base.py | 44 ++--- rdt/transformers/boolean.py | 4 +- rdt/transformers/categorical.py | 89 +++------ rdt/transformers/datetime.py | 35 +--- rdt/transformers/null.py | 21 +- rdt/transformers/numerical.py | 36 +--- rdt/transformers/pii/anonymizer.py | 66 ++----- rdt/transformers/text.py | 28 +-- rdt/transformers/utils.py | 21 +- tasks.py | 16 +- tests/code_style.py | 34 +--- tests/contributing.py | 52 ++--- tests/integration/test_hyper_transformer.py | 153 ++++---------- tests/integration/test_transformers.py | 70 ++----- .../transformers/pii/test_anonymizer.py | 16 +- tests/integration/transformers/test_base.py | 24 +-- .../integration/transformers/test_boolean.py | 8 +- .../transformers/test_categorical.py | 52 ++--- .../integration/transformers/test_datetime.py | 40 +--- .../transformers/test_numerical.py | 19 +- tests/integration/transformers/test_text.py | 104 +++------- tests/performance/test_performance.py | 12 +- tests/performance/tests/test_profiling.py | 25 +-- tests/unit/test___init__.py | 4 +- tests/unit/test_hyper_transformer.py | 180 +++++------------ .../transformers/pii/test_anonymization.py | 8 +- .../unit/transformers/pii/test_anonymizer.py | 99 +++------- tests/unit/transformers/test__validators.py | 92 +++------ tests/unit/transformers/test_base.py | 107 +++------- tests/unit/transformers/test_boolean.py | 30 +-- tests/unit/transformers/test_categorical.py | 187 
+++++------------- tests/unit/transformers/test_datetime.py | 48 ++--- tests/unit/transformers/test_null.py | 113 +++-------- tests/unit/transformers/test_numerical.py | 124 +++--------- tests/unit/transformers/test_text.py | 72 ++----- tests/unit/transformers/test_utils.py | 13 +- 47 files changed, 591 insertions(+), 1731 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3dcb71e7..51b77988 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,7 +176,8 @@ build-backend = 'setuptools.build_meta' [tool.ruff] preview = true -line-length = 79 +line-length = 100 +indent-width = 4 src = ["rdt"] target-version = "py312" exclude = [ @@ -203,6 +204,13 @@ ignore = [ "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 ] +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +preview = true +docstring-code-format = true +docstring-code-line-length = "dynamic" + [tool.ruff.lint.pep8-naming] extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] @@ -212,10 +220,5 @@ known-first-party = ["rdt"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] -[tool.ruff.format] -quote-style = "single" -indent-style = "space" -preview = true - [tool.ruff.lint.pydocstyle] convention = "google" \ No newline at end of file diff --git a/rdt/__init__.py b/rdt/__init__.py index a98ad5b3..86ad4e24 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -42,9 +42,7 @@ def get_demo(num_rows=5): ['2021-06-26', '2021-02-10', 'NAT', '2020-09-26', '2020-12-22'], dtype='datetime64[ns]', ) - email_optin = pd.Series( - [False, False, False, True, np.nan], dtype='object' - ) + email_optin = pd.Series([False, False, False, True, np.nan], dtype='object') credit_card = ['VISA', 'VISA', 'AMEX', np.nan, 'DISCOVER'] age = [29, 18, 21, 45, 32] dollars_spent = [99.99, np.nan, 2.50, 25.00, 19.99] @@ -68,27 +66,20 @@ def get_demo(num_rows=5): login_dates = 
np.array( [ - np.datetime64('2000-01-01') - + np.timedelta64(np.random.randint(0, 10000), 'D') + np.datetime64('2000-01-01') + np.timedelta64(np.random.randint(0, 10000), 'D') for _ in range(num_rows) ], dtype='datetime64[ns]', ) - login_dates[np.random.random(size=num_rows) > 0.8] = np.datetime64( - 'NaT' - ) + login_dates[np.random.random(size=num_rows) > 0.8] = np.datetime64('NaT') email_optin = pd.Series([True, False, np.nan], dtype='object').sample( num_rows, replace=True ) - credit_card = np.random.choice( - ['VISA', 'AMEX', np.nan, 'DISCOVER'], size=num_rows - ) + credit_card = np.random.choice(['VISA', 'AMEX', np.nan, 'DISCOVER'], size=num_rows) age = np.random.randint(18, 100, size=num_rows) - dollars_spent = np.around( - np.random.uniform(0, 100, size=num_rows), decimals=2 - ) + dollars_spent = np.around(np.random.uniform(0, 100, size=num_rows), decimals=2) dollars_spent[np.random.random(size=num_rows) > 0.8] = np.nan finally: diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index a6feaac8..652597c5 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -40,18 +40,14 @@ def __repr__(self): config = { 'sdtypes': self['sdtypes'], - 'transformers': { - str(k): repr(v) for k, v in self['transformers'].items() - }, + 'transformers': {str(k): repr(v) for k, v in self['transformers'].items()}, } printed = json.dumps(config, indent=4) for transformer in self['transformers'].values(): quoted_transformer = f'"{transformer}"' if quoted_transformer in printed: - printed = printed.replace( - quoted_transformer, repr(transformer) - ) + printed = printed.replace(quoted_transformer, repr(transformer)) return printed @@ -73,7 +69,9 @@ class HyperTransformer: 'M': 'datetime', } _DEFAULT_OUTPUT_SDTYPES = ['numerical', 'float', 'integer'] - _REFIT_MESSAGE = "For this change to take effect, please refit your data using 'fit' or 'fit_transform'." 
+ _REFIT_MESSAGE = ( + "For this change to take effect, please refit your data using 'fit' or 'fit_transform'." + ) _DETECT_CONFIG_MESSAGE = ( 'Nothing to update. Use the `detect_initial_config` method to pre-populate all the ' 'sdtypes and transformers from your dataset.' @@ -99,11 +97,7 @@ def _field_in_set(field, field_set): @staticmethod def _subset(input_list, other_list, not_in=False): - return [ - element - for element in input_list - if (element in other_list) ^ not_in - ] + return [element for element in input_list if (element in other_list) ^ not_in] def _create_multi_column_fields(self): multi_column_fields = {} @@ -140,9 +134,7 @@ def __init__(self): @staticmethod def _field_in_data(field, data): - all_columns_in_data = isinstance(field, tuple) and all( - col in data for col in field - ) + all_columns_in_data = isinstance(field, tuple) and all(col in data for col in field) return field in data or all_columns_in_data @staticmethod @@ -226,9 +218,7 @@ def _validate_config(config): sdtype_keys = sdtypes.keys() transformer_keys = flatten_column_list(transformers.keys()) - is_transformer_keys_unique = len(transformer_keys) == len( - set(transformer_keys) - ) + is_transformer_keys_unique = len(transformer_keys) == len(set(transformer_keys)) if not is_transformer_keys_unique: raise InvalidConfigError( 'Error: Invalid config. Please provide unique keys for the sdtypes ' @@ -249,11 +239,7 @@ def _validate_config(config): if transformer is None: continue - columns = ( - column_name - if isinstance(column_name, tuple) - else [column_name] - ) + columns = column_name if isinstance(column_name, tuple) else [column_name] for column in columns: sdtype = sdtypes.get(column) if sdtype not in transformer.get_supported_sdtypes(): @@ -308,9 +294,7 @@ def _validate_update_transformers_by_sdtype( if transformer_name is None: if transformer is None: - raise InvalidConfigError( - "Missing required parameter 'transformer_name'." 
- ) + raise InvalidConfigError("Missing required parameter 'transformer_name'.") if not isinstance(transformer, BaseTransformer): raise InvalidConfigError( @@ -326,9 +310,7 @@ def _validate_update_transformers_by_sdtype( if ( transformer_name not in get_class_by_transformer_name() or sdtype - not in get_class_by_transformer_name()[ - transformer_name - ].get_supported_sdtypes() + not in get_class_by_transformer_name()[transformer_name].get_supported_sdtypes() ): raise InvalidConfigError( f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype." @@ -337,18 +319,14 @@ def _validate_update_transformers_by_sdtype( if transformer_parameters is not None: transformer = get_class_by_transformer_name()[transformer_name] valid = inspect.signature(transformer).parameters - invalid_parameters = { - arg for arg in transformer_parameters if arg not in valid - } + invalid_parameters = {arg for arg in transformer_parameters if arg not in valid} if invalid_parameters: raise TransformerInputError( f'Invalid parameters {tuple(sorted(invalid_parameters))} ' f"for the '{transformer_name}'." 
) - def _warn_update_transformers_by_sdtype( - self, transformer, transformer_name - ): + def _warn_update_transformers_by_sdtype(self, transformer, transformer_name): if self._fitted: warnings.warn(self._REFIT_MESSAGE) @@ -388,9 +366,7 @@ def _remove_column_in_multi_column_fields(self, column): for col in new_tuple: self._multi_column_fields[col] = new_tuple - self.field_transformers[new_tuple] = self.field_transformers.pop( - old_tuple - ) + self.field_transformers[new_tuple] = self.field_transformers.pop(old_tuple) def _update_multi_column_transformer(self): """Check that multi-columns mappings are valid and update them otherwise.""" @@ -414,9 +390,7 @@ def _update_multi_column_transformer(self): ) del self.field_transformers[field] for column, sdtype in columns_to_sdtypes.items(): - self.field_transformers[column] = deepcopy( - get_default_transformer(sdtype) - ) + self.field_transformers[column] = deepcopy(get_default_transformer(sdtype)) self._multi_column_fields = self._create_multi_column_fields() @@ -453,14 +427,12 @@ def update_transformers_by_sdtype( if transformer_name is not None: if transformer_parameters is not None: - transformer_instance = get_class_by_transformer_name()[ - transformer_name - ](**transformer_parameters) + transformer_instance = get_class_by_transformer_name()[transformer_name]( + **transformer_parameters + ) else: - transformer_instance = get_class_by_transformer_name()[ - transformer_name - ]() + transformer_instance = get_class_by_transformer_name()[transformer_name]() for field, field_sdtype in self.field_sdtypes.items(): if field_sdtype == sdtype: @@ -508,9 +480,7 @@ def update_sdtypes(self, column_name_to_sdtype): if column in self._multi_column_fields: self._remove_column_in_multi_column_fields(column) - transformers_to_update[column] = deepcopy( - get_default_transformer(sdtype) - ) + transformers_to_update[column] = deepcopy(get_default_transformer(sdtype)) self.field_sdtypes.update(column_name_to_sdtype) 
self.field_transformers.update(transformers_to_update) @@ -543,19 +513,11 @@ def update_transformers(self, column_name_to_transformer): self._validate_transformers(column_name_to_transformer) for column_name, transformer in column_name_to_transformer.items(): - columns = ( - column_name - if isinstance(column_name, tuple) - else (column_name,) - ) + columns = column_name if isinstance(column_name, tuple) else (column_name,) for column in columns: if transformer is not None: col_sdtype = self.field_sdtypes.get(column) - if ( - col_sdtype - and col_sdtype - not in transformer.get_supported_sdtypes() - ): + if col_sdtype and col_sdtype not in transformer.get_supported_sdtypes(): raise InvalidConfigError( f"Column '{column}' is a {col_sdtype} column, which is " f"incompatible with the '{transformer.get_name()}' transformer." @@ -650,9 +612,7 @@ def _learn_config(self, data): self._set_field_sdtype(data, field) if field not in self.field_transformers: sdtype = self.field_sdtypes[field] - self.field_transformers[field] = deepcopy( - get_default_transformer(sdtype) - ) + self.field_transformers[field] = deepcopy(get_default_transformer(sdtype)) def detect_initial_config(self, data): """Print the configuration of the data. 
@@ -736,16 +696,12 @@ def _fit_field_transformer(self, data, field, transformer): # If the column is part of a multi-column field, and at least one column # isn't present in the data, then it should not fit the next transformer if self._field_in_data(column_name, data): - data = self._fit_field_transformer( - data, column_name, next_transformer - ) + data = self._fit_field_transformer(data, column_name, next_transformer) return data def _validate_all_fields_fitted(self): - non_fitted_fields = self._specified_fields.difference( - self._fitted_fields - ) + non_fitted_fields = self._specified_fields.difference(self._fitted_fields) if non_fitted_fields: warnings.warn( 'The following fields were specified in the input arguments but not ' @@ -767,11 +723,7 @@ def _validate_detect_config_called(self, data): missing = any(column not in data.columns for column in fields) unknown_columns = self._subset(data.columns, fields, not_in=True) if unknown_columns or missing: - unknown_text = ( - f' (unknown columns: {unknown_columns})' - if unknown_columns - else '' - ) + unknown_text = f' (unknown columns: {unknown_columns})' if unknown_columns else '' raise InvalidDataError( 'The data you are trying to fit has different columns than the original ' f'detected data{unknown_text}. 
Column names and their ' @@ -811,9 +763,7 @@ def fit(self, data): else: field = column - data = self._fit_field_transformer( - data, field, self.field_transformers[field] - ) + data = self._fit_field_transformer(data, field, self.field_transformers[field]) self._validate_all_fields_fitted() self._fitted = True @@ -829,16 +779,10 @@ def _transform(self, data, prevent_subset): self._validate_config_exists() self._validate_fitted() - unknown_columns = self._subset( - data.columns, self._input_columns, not_in=True - ) + unknown_columns = self._subset(data.columns, self._input_columns, not_in=True) if prevent_subset: - contained = all( - column in self._input_columns for column in data.columns - ) - is_subset = contained and len(data.columns) < len( - self._input_columns - ) + contained = all(column in self._input_columns for column in data.columns) + is_subset = contained and len(data.columns) < len(self._input_columns) if unknown_columns or is_subset: raise InvalidDataError( 'The data you are trying to transform has different columns than the original ' @@ -917,13 +861,9 @@ def create_anonymized_columns(self, num_rows, column_names): self._validate_fitted() if not isinstance(num_rows, int) or num_rows <= 0: - raise ValueError( - "Parameter 'num_rows' must be an integer greater than 0." - ) + raise ValueError("Parameter 'num_rows' must be an integer greater than 0.") - unknown_columns = self._subset( - column_names, self._input_columns, not_in=True - ) + unknown_columns = self._subset(column_names, self._input_columns, not_in=True) if unknown_columns: raise InvalidConfigError( f"Unknown column name {unknown_columns}. 
Use 'get_config()' to see a " @@ -952,9 +892,7 @@ def _reverse_transform(self, data, prevent_subset): self._validate_config_exists() self._validate_fitted() - unknown_columns = self._subset( - data.columns, self._output_columns, not_in=True - ) + unknown_columns = self._subset(data.columns, self._output_columns, not_in=True) if unknown_columns: raise InvalidDataError( 'There are unexpected column names in the data you are trying to transform. ' @@ -962,12 +900,8 @@ def _reverse_transform(self, data, prevent_subset): ) if prevent_subset: - contained = all( - column in self._output_columns for column in data.columns - ) - is_subset = contained and len(data.columns) < len( - self._output_columns - ) + contained = all(column in self._output_columns for column in data.columns) + is_subset = contained and len(data.columns) < len(self._output_columns) if is_subset: raise InvalidDataError( 'You must provide a transformed dataset with all the columns from the ' @@ -980,9 +914,7 @@ def _reverse_transform(self, data, prevent_subset): else: for transformer in reversed(self._transformers_sequence): output_columns = transformer.get_output_columns() - if output_columns and set(output_columns).issubset( - data.columns - ): + if output_columns and set(output_columns).issubset(data.columns): data = transformer.reverse_transform(data) reversed_columns = self._subset(self._input_columns, data.columns) diff --git a/rdt/performance/datasets/boolean.py b/rdt/performance/datasets/boolean.py index 14ad742f..40d98cb9 100644 --- a/rdt/performance/datasets/boolean.py +++ b/rdt/performance/datasets/boolean.py @@ -108,9 +108,7 @@ class RandomSkewedBooleanNaNsGenerator(BooleanGenerator): def generate(num_rows): """Generate a ``num_rows`` number of rows.""" percent_null = np.random.randint(MIN_PERCENT, MAX_PERCENT_NULL) - percent_true = np.random.randint( - MIN_PERCENT, 100 - percent_null - MIN_PERCENT - ) + percent_true = np.random.randint(MIN_PERCENT, 100 - percent_null - MIN_PERCENT) 
percent_false = 100 - percent_null - percent_true return np.random.choice( diff --git a/rdt/performance/datasets/categorical.py b/rdt/performance/datasets/categorical.py index 3ce2e67f..c2ce6928 100644 --- a/rdt/performance/datasets/categorical.py +++ b/rdt/performance/datasets/categorical.py @@ -43,9 +43,7 @@ class RandomIntegerNaNsGenerator(CategoricalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans( - RandomIntegerGenerator.generate(num_rows).astype(float) - ) + return add_nans(RandomIntegerGenerator.generate(num_rows).astype(float)) @staticmethod def get_performance_thresholds(): @@ -199,9 +197,7 @@ class SingleIntegerNaNsGenerator(CategoricalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans( - SingleIntegerGenerator.generate(num_rows).astype(float) - ) + return add_nans(SingleIntegerGenerator.generate(num_rows).astype(float)) @staticmethod def get_performance_thresholds(): diff --git a/rdt/performance/datasets/datetime.py b/rdt/performance/datasets/datetime.py index 6a22981b..a6bd738a 100644 --- a/rdt/performance/datasets/datetime.py +++ b/rdt/performance/datasets/datetime.py @@ -119,9 +119,7 @@ def generate(num_rows): today = datetime.datetime.today() delta = datetime.timedelta - today = min( - datetime.datetime.today(), pd.Timestamp.max - delta(num_rows) - ) + today = min(datetime.datetime.today(), pd.Timestamp.max - delta(num_rows)) dates = [delta(i) + today for i in range(num_rows)] return np.array(dates, dtype='datetime64') @@ -149,10 +147,7 @@ def generate(num_rows): delta = datetime.timedelta today = datetime.datetime.today() - dates = [ - min(delta(weeks=i) + today, pd.Timestamp.max) - for i in range(num_rows) - ] + dates = [min(delta(weeks=i) + today, pd.Timestamp.max) for i in range(num_rows)] return np.array(dates, dtype='datetime64') diff --git a/rdt/performance/datasets/numerical.py 
b/rdt/performance/datasets/numerical.py index dddc78f0..2d5a973e 100644 --- a/rdt/performance/datasets/numerical.py +++ b/rdt/performance/datasets/numerical.py @@ -42,9 +42,7 @@ class RandomIntegerNaNsGenerator(NumericalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans( - RandomIntegerGenerator.generate(num_rows).astype(float) - ) + return add_nans(RandomIntegerGenerator.generate(num_rows).astype(float)) @staticmethod def get_performance_thresholds(): @@ -88,9 +86,7 @@ class ConstantIntegerNaNsGenerator(NumericalGenerator): @staticmethod def generate(num_rows): """Generate a ``num_rows`` number of rows.""" - return add_nans( - ConstantIntegerGenerator.generate(num_rows).astype(float) - ) + return add_nans(ConstantIntegerGenerator.generate(num_rows).astype(float)) @staticmethod def get_performance_thresholds(): diff --git a/rdt/performance/performance.py b/rdt/performance/performance.py index acff0d8e..d4bdbaf2 100644 --- a/rdt/performance/performance.py +++ b/rdt/performance/performance.py @@ -44,9 +44,7 @@ def _get_dataset_sizes(sdtype): return sizes -def evaluate_transformer_performance( - transformer, dataset_generator, verbose=False -): +def evaluate_transformer_performance(transformer, dataset_generator, verbose=False): """Evaluate the given transformer's performance against the given dataset generator. 
Args: @@ -79,15 +77,11 @@ def evaluate_transformer_performance( size = np.array([fit_size, transform_size, transform_size] * 2) performance = performance / size if verbose: - performance = performance.rename( - lambda x: x + ' (s)' if 'Time' in x else x + ' (B)' - ) + performance = performance.rename(lambda x: x + ' (s)' if 'Time' in x else x + ' (B)') performance['Number of fit rows'] = fit_size performance['Number of transform rows'] = transform_size performance['Dataset'] = dataset_generator.__name__ - performance['Transformer'] = ( - f'{transformer.__module__}.{transformer.get_name()}' - ) + performance['Transformer'] = f'{transformer.__module__}.{transformer.get_name()}' out.append(performance) diff --git a/rdt/performance/profiling.py b/rdt/performance/profiling.py index 1ef1922e..9b2598b8 100644 --- a/rdt/performance/profiling.py +++ b/rdt/performance/profiling.py @@ -10,9 +10,7 @@ import pandas as pd -def _profile_time( - transformer, method_name, dataset, column=None, iterations=10, copy=False -): +def _profile_time(transformer, method_name, dataset, column=None, iterations=10, copy=False): total_time = 0 for _ in range(iterations): if copy: @@ -56,9 +54,7 @@ def _profile_memory(method, dataset, column=None): return peak_memory.value -def profile_transformer( - transformer, dataset_generator, transform_size, fit_size=None -): +def profile_transformer(transformer, dataset_generator, transform_size, fit_size=None): """Profile a Transformer on a dataset. 
This function will get the total time and peak memory @@ -86,24 +82,16 @@ def profile_transformer( replace = transform_size > fit_size transform_dataset = fit_dataset.sample(transform_size, replace=replace) - fit_time = _profile_time( - transformer, 'fit', fit_dataset, column='test', copy=True - ) + fit_time = _profile_time(transformer, 'fit', fit_dataset, column='test', copy=True) fit_memory = _profile_memory(transformer.fit, fit_dataset, column='test') transformer.fit(fit_dataset, 'test') transform_time = _profile_time(transformer, 'transform', transform_dataset) - transform_memory = _profile_memory( - transformer.transform, transform_dataset - ) + transform_memory = _profile_memory(transformer.transform, transform_dataset) reverse_dataset = transformer.transform(transform_dataset) - reverse_time = _profile_time( - transformer, 'reverse_transform', reverse_dataset - ) - reverse_memory = _profile_memory( - transformer.reverse_transform, reverse_dataset - ) + reverse_time = _profile_time(transformer, 'reverse_transform', reverse_dataset) + reverse_memory = _profile_memory(transformer.reverse_transform, reverse_dataset) return pd.Series({ 'Fit Time': fit_time, diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index 968639d0..4f8f79b6 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -80,9 +80,7 @@ def get_transformer_name(transformer): if inspect.isclass(transformer): return transformer.__module__ + '.' + transformer.get_name() - raise ValueError( - f'The transformer {transformer} must be passed as a class.' - ) + raise ValueError(f'The transformer {transformer} must be passed as a class.') TRANSFORMERS = { @@ -113,10 +111,7 @@ def get_class_by_transformer_name(): BaseTransformer: BaseTransformer subclass class object. 
""" - return { - class_.get_name(): class_ - for class_ in BaseTransformer.get_subclasses() - } + return {class_.get_name(): class_ for class_ in BaseTransformer.get_subclasses()} def get_transformer_class(transformer): diff --git a/rdt/transformers/_validators.py b/rdt/transformers/_validators.py index 6e9c0a64..2cc22a4c 100644 --- a/rdt/transformers/_validators.py +++ b/rdt/transformers/_validators.py @@ -23,7 +23,9 @@ def _validate_supported_sdtypes(cls, columns_to_sdtypes): message += f"Column '{column}' has an unsupported sdtype '{sdtype}'.\n" if message: - message += f'Please provide a column that is compatible with {cls.VALIDATION_TYPE} data.' + message += ( + f'Please provide a column that is compatible with {cls.VALIDATION_TYPE} data.' + ) raise TransformerInputError(message) @classmethod @@ -88,18 +90,14 @@ def _validate_uniqueness_sdtype(columns_to_sdtypes): sdtypes_to_columns[sdtype].append(column) duplicate_fields = { - value: keys - for value, keys in sdtypes_to_columns.items() - if len(keys) > 1 + value: keys for value, keys in sdtypes_to_columns.items() if len(keys) > 1 } if duplicate_fields: message = '' for sdtype, columns in duplicate_fields.items(): to_print = "', '".join(columns) - message += ( - f"Columns '{to_print}' have the same sdtype '{sdtype}'.\n" - ) + message += f"Columns '{to_print}' have the same sdtype '{sdtype}'.\n" message += 'Your address data cannot have duplicate fields.' 
raise TransformerInputError(message) @@ -107,9 +105,7 @@ def _validate_uniqueness_sdtype(columns_to_sdtypes): @classmethod def _validate_administrative_unit(cls, columns_to_sdtypes): num_column_administrative_unit = sum( - 1 - for itm in columns_to_sdtypes.values() - if itm in ['administrative_unit', 'state'] + 1 for itm in columns_to_sdtypes.values() if itm in ['administrative_unit', 'state'] ) if num_column_administrative_unit > 1: raise TransformerInputError( @@ -128,12 +124,12 @@ def validate_sdtypes(cls, columns_to_sdtypes): @classmethod def validate_imports(cls): """Check that the address transformers can be imported.""" - error_message = 'You must have SDV Enterprise with the address add-on to use the address features.' + error_message = ( + 'You must have SDV Enterprise with the address add-on to use the address features.' + ) try: - address_module = importlib.import_module( - 'rdt.transformers.address' - ) + address_module = importlib.import_module('rdt.transformers.address') except ModuleNotFoundError: raise ImportError(error_message) from None @@ -151,9 +147,7 @@ class GPSValidator(BaseValidator): @staticmethod def _validate_uniqueness_sdtype(columns_to_sdtypes): - sdtypes_to_columns = { - sdtype: column for column, sdtype in columns_to_sdtypes.items() - } + sdtypes_to_columns = {sdtype: column for column, sdtype in columns_to_sdtypes.items()} if len(sdtypes_to_columns) != 2: raise TransformerInputError( 'The GPS columns must have one latitude and on longitude columns sdtypes. 
' diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py index b202ef05..d1520b21 100644 --- a/rdt/transformers/base.py +++ b/rdt/transformers/base.py @@ -53,9 +53,7 @@ def wrapper(self, *args, **kwargs): return function(self, *args, **kwargs) method_name = function.__name__ - with set_random_states( - self.random_states, method_name, self.set_random_state - ): + with set_random_states(self.random_states, method_name, self.set_random_state): return function(self, *args, **kwargs) return wrapper @@ -82,9 +80,7 @@ class BaseTransformer: missing_value_generation = None def __init__(self): - self.output_properties = { - None: {'sdtype': 'float', 'next_transformer': None} - } + self.output_properties = {None: {'sdtype': 'float', 'next_transformer': None}} self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': None, @@ -146,9 +142,7 @@ def _set_model_missing_values(self, model_missing_values): elif model_missing_values is False: self._set_missing_value_generation('random') - def _set_missing_value_replacement( - self, default, missing_value_replacement - ): + def _set_missing_value_replacement(self, default, missing_value_replacement): if missing_value_replacement is None: warnings.warn( "Setting 'missing_value_replacement' to 'None' is no longer supported. 
" @@ -219,9 +213,7 @@ def _get_output_to_property(self, property_): if output_column is None: output[f'{self.column_prefix}'] = properties[property_] else: - output[f'{self.column_prefix}.{output_column}'] = properties[ - property_ - ] + output[f'{self.column_prefix}.{output_column}'] = properties[property_] return output @@ -306,16 +298,12 @@ def _add_columns_to_data(data, transformed_data, transformed_names): """ if transformed_names: if isinstance(transformed_data, (pd.Series, np.ndarray)): - transformed_data = pd.DataFrame( - transformed_data, columns=transformed_names - ) + transformed_data = pd.DataFrame(transformed_data, columns=transformed_names) # When '#' is added to the column_prefix of a transformer # the columns of transformed_data and transformed_names don't match transformed_data.columns = transformed_names - data = pd.concat( - [data, transformed_data.set_index(data.index)], axis=1 - ) + data = pd.concat([data, transformed_data.set_index(data.index)], axis=1) return data @@ -382,12 +370,8 @@ def _set_seed(self, data): for value in data.head(5): hash_value += str(value) - hash_value = int( - hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16 - ) - self.random_seed = hash_value % ( - (2**32) - 1 - ) # maximum value for a seed + hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) + self.random_seed = hash_value % ((2**32) - 1) # maximum value for a seed self.random_states = { 'fit': self.INITIAL_FIT_STATE, 'transform': np.random.RandomState(self.random_seed), @@ -443,9 +427,7 @@ def transform(self, data): columns_data = self._get_columns_data(data, self.columns) transformed_data = self._transform(columns_data) data = data.drop(self.columns, axis=1) - data = self._add_columns_to_data( - data, transformed_data, self.output_columns - ) + data = self._add_columns_to_data(data, transformed_data, self.output_columns) return data @@ -558,9 +540,7 @@ def _get_output_to_property(self, property_): if self.column_prefix is None: 
output[f'{output_column}'] = properties[property_] else: - output[f'{self.column_prefix}.{output_column}'] = properties[ - property_ - ] + output[f'{self.column_prefix}.{output_column}'] = properties[property_] return output @@ -569,9 +549,7 @@ def _validate_columns_to_sdtypes(self, data, columns_to_sdtypes): missing = set(columns_to_sdtypes.keys()) - set(data.columns) if missing: missing_to_print = ', '.join(missing) - raise ValueError( - f'Columns ({missing_to_print}) are not present in the data.' - ) + raise ValueError(f'Columns ({missing_to_print}) are not present in the data.') @classmethod def _validate_sdtypes(cls, columns_to_sdtypes): diff --git a/rdt/transformers/boolean.py b/rdt/transformers/boolean.py index 77fd60d1..5ae13c5a 100644 --- a/rdt/transformers/boolean.py +++ b/rdt/transformers/boolean.py @@ -47,9 +47,7 @@ def __init__( ): super().__init__() self._set_missing_value_generation(missing_value_generation) - self._set_missing_value_replacement( - 'random', missing_value_replacement - ) + self._set_missing_value_replacement('random', missing_value_replacement) if model_missing_values is not None: self._set_model_missing_values(model_missing_values) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index cce5bd50..98dfc99b 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -59,9 +59,7 @@ def _order_categories(self, unique_data): nans = pd.isna(unique_data) if self.order_by == 'alphabetical': # pylint: disable=invalid-unary-operand-type - if any( - map(lambda item: not isinstance(item, str), unique_data[~nans]) - ): # noqa: C417 + if any(map(lambda item: not isinstance(item, str), unique_data[~nans])): # noqa: C417 raise TransformerInputError( "The data must be of type string if order_by is 'alphabetical'." 
) @@ -90,9 +88,7 @@ def _get_message_unseen_categories(cls, unseen_categories): """ categories_to_print = ', '.join(str(x) for x in unseen_categories[:3]) if len(unseen_categories) > 3: - categories_to_print = ( - f'{categories_to_print}, +{len(unseen_categories) - 3} more' - ) + categories_to_print = f'{categories_to_print}, +{len(unseen_categories) - 3} more' return categories_to_print @@ -137,9 +133,7 @@ def _fit(self, data): nan_value = freq[np.nan] if np.nan in freq.index else None freq = freq.reindex(labels, fill_value=nan_value).array - self.frequencies, self.intervals = self._compute_frequencies_intervals( - labels, freq - ) + self.frequencies, self.intervals = self._compute_frequencies_intervals(labels, freq) def _transform(self, data): """Map the category to a continuous value. @@ -159,9 +153,7 @@ def _transform(self, data): if unseen_indexes.any(): # Keep the 3 first unseen categories unseen_categories = list(data.loc[unseen_indexes].unique()) - categories_to_print = self._get_message_unseen_categories( - unseen_categories - ) + categories_to_print = self._get_message_unseen_categories(unseen_categories) warnings.warn( f"The data in column '{self.get_input_column()}' contains new categories " f"that did not appear during 'fit' ({categories_to_print}). 
Assigning " @@ -172,14 +164,10 @@ def _transform(self, data): choices = list(self.frequencies.keys()) size = unseen_indexes.size - data_with_none[unseen_indexes] = np.random.choice( - choices, size=size - ) + data_with_none[unseen_indexes] = np.random.choice(choices, size=size) def map_labels(label): - return np.random.uniform( - self.intervals[label][0], self.intervals[label][1] - ) + return np.random.uniform(self.intervals[label][0], self.intervals[label][1]) return data_with_none.map(map_labels).astype(float) @@ -277,9 +265,7 @@ def _fit(self, data): nans_not_seen = pd.isna(self.order).any() and not pd.isna(data).any() if category_not_seen or nans_not_seen: unseen_categories = [x for x in self.order if x not in data.array] - categories_to_print = self._get_message_unseen_categories( - unseen_categories - ) + categories_to_print = self._get_message_unseen_categories(unseen_categories) LOGGER.info( "For column '%s', some of the provided category values were not present in the" ' data during fit: (%s).', @@ -298,9 +284,7 @@ def _fit(self, data): nan_value = freq[np.nan] if np.nan in freq.index else None freq = freq.reindex(self.order, fill_value=nan_value).array - self.frequencies, self.intervals = self._compute_frequencies_intervals( - self.order, freq - ) + self.frequencies, self.intervals = self._compute_frequencies_intervals(self.order, freq) def _transform(self, data): """Map the category to a continuous value.""" @@ -383,13 +367,9 @@ def tie_breaker(element): if pd.isna(element): return data_is_na.loc[data_is_na == 1].index[0] - return data_with_new_index.loc[ - data_with_new_index == element - ].index[0] + return data_with_new_index.loc[data_with_new_index == element].index[0] - augmented_frequencies[sortable_column_name] = frequencies.index.map( - tie_breaker - ) + augmented_frequencies[sortable_column_name] = frequencies.index.map(tie_breaker) augmented_frequencies = augmented_frequencies.sort_values( [column_name, sortable_column_name], ascending=[False, 
True] ) @@ -416,9 +396,7 @@ def tie_breaker(element): start = end means = pd.Series(means, index=list(frequencies.keys())) - starts = pd.DataFrame(starts, columns=['category', 'start']).set_index( - 'start' - ) + starts = pd.DataFrame(starts, columns=['category', 'start']).set_index('start') return intervals, means, starts @@ -465,9 +443,7 @@ def _transform_by_category(self, data): size=mask.sum(), random_state=self.random_states['transform'], ) - result[mask] = self._clip_noised_transform( - result[mask], start, end - ) + result[mask] = self._clip_noised_transform(result[mask], start, end) else: result[mask] = mean @@ -481,21 +457,14 @@ def _get_value(self, category): start, end, mean, std = self.intervals[category] if self.add_noise: - result = norm.rvs( - mean, std, random_state=self.random_states['transform'] - ) + result = norm.rvs(mean, std, random_state=self.random_states['transform']) return self._clip_noised_transform(result, start, end) return mean def _transform_by_row(self, data): """Transform the data row by row.""" - data = ( - data.infer_objects() - .fillna(np.nan) - .apply(self._get_value) - .to_numpy() - ) + data = data.infer_objects().fillna(np.nan).apply(self._get_value).to_numpy() return data @@ -511,9 +480,7 @@ def _transform(self, data): """ fit_categories = pd.Series(self.intervals.keys()) has_nan = pd.isna(fit_categories).any() - unseen_indexes = ~( - data.isin(fit_categories) | (pd.isna(data) & has_nan) - ) + unseen_indexes = ~(data.isin(fit_categories) | (pd.isna(data) & has_nan)) if unseen_indexes.any(): # Select only the first 5 unseen categories to avoid flooding the console. unseen_categories = set(data[unseen_indexes][:5]) @@ -524,9 +491,7 @@ def _transform(self, data): 'please fit the transformer again with the new data.' 
) - data[unseen_indexes] = np.random.choice( - fit_categories, size=unseen_indexes.size - ) + data[unseen_indexes] = np.random.choice(fit_categories, size=unseen_indexes.size) if len(self.means) < len(data): return self._transform_by_category(data) @@ -683,9 +648,7 @@ def _transform(self, data): """ data = self._prepare_data(data) unique_data = {np.nan if pd.isna(x) else x for x in pd.unique(data)} - unseen_categories = unique_data - { - np.nan if pd.isna(x) else x for x in self.dummies - } + unseen_categories = unique_data - {np.nan if pd.isna(x) else x for x in self.dummies} if unseen_categories: # Select only the first 5 unseen categories to avoid flooding the console. examples_unseen_categories = set(list(unseen_categories)[:5]) @@ -803,8 +766,7 @@ def _fit(self, data): unique_data = self._order_categories(unique_data) self.values_to_categories = dict(enumerate(unique_data)) self.categories_to_values = { - category: value - for value, category in self.values_to_categories.items() + category: value for value, category in self.values_to_categories.items() } def _transform(self, data): @@ -822,9 +784,7 @@ def _transform(self, data): Returns: pd.Series """ - mapped = ( - data.infer_objects().fillna(np.nan).map(self.categories_to_values) - ) + mapped = data.infer_objects().fillna(np.nan).map(self.categories_to_values) is_null = mapped.isna() if is_null.any(): # Select only the first 5 unseen categories to avoid flooding the console. @@ -836,9 +796,7 @@ def _transform(self, data): 'please fit the transformer again with the new data.' 
) - mapped[is_null] = np.random.randint( - len(self.categories_to_values), size=is_null.sum() - ) + mapped[is_null] = np.random.randint(len(self.categories_to_values), size=is_null.sum()) if self.add_noise: mapped = mapped.astype(float) @@ -860,9 +818,7 @@ def _reverse_transform(self, data): if self.add_noise: data = np.floor(data) - data = data.clip( - min(self.values_to_categories), max(self.values_to_categories) - ) + data = data.clip(min(self.values_to_categories), max(self.values_to_categories)) data = data.round().map(self.values_to_categories) data = try_convert_to_dtype(data, self.dtype) @@ -936,8 +892,7 @@ def _fit(self, data): self.values_to_categories = dict(enumerate(self.order)) self.categories_to_values = { - category: value - for value, category in self.values_to_categories.items() + category: value for value, category in self.values_to_categories.items() } diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 7c1e35b4..d66c0610 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -92,22 +92,16 @@ def _convert_to_datetime(self, data): try: pandas_datetime_format = None if self.datetime_format: - pandas_datetime_format = self.datetime_format.replace( - '%-', '%' - ) + pandas_datetime_format = self.datetime_format.replace('%-', '%') data = pd.to_datetime(data, format=pandas_datetime_format) except ValueError as error: - if 'Unknown string' in str( - error - ) or 'Unknown datetime string' in str(error): + if 'Unknown string' in str(error) or 'Unknown datetime string' in str(error): message = 'Data must be of dtype datetime, or castable to datetime.' raise TypeError(message) from None - raise ValueError( - 'Data does not match specified datetime format.' 
- ) from None + raise ValueError('Data does not match specified datetime format.') from None return data @@ -115,11 +109,7 @@ def _transform_helper(self, datetimes): """Transform datetime values to integer.""" datetimes = self._convert_to_datetime(datetimes) nulls = datetimes.isna() - integers = ( - pd.to_numeric(datetimes, errors='coerce') - .to_numpy() - .astype(np.float64) - ) + integers = pd.to_numeric(datetimes, errors='coerce').to_numpy().astype(np.float64) integers[nulls] = np.nan transformed = pd.Series(integers) @@ -144,9 +134,7 @@ def _fit(self, data): self._dtype = data.dtype if self.datetime_format is None: datetime_array = data[data.notna()].astype(str).to_numpy() - self.datetime_format = _guess_datetime_format_for_array( - datetime_array - ) + self.datetime_format = _guess_datetime_format_for_array(datetime_array) transformed = self._transform_helper(data) if self.enforce_min_max_values: @@ -192,22 +180,15 @@ def _reverse_transform(self, data): data = self._reverse_transform_helper(data) datetime_data = pd.to_datetime(data) if self.datetime_format: - if ( - is_datetime64_dtype(self._dtype) - and '.%f' not in self.datetime_format - ): + if is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format: datetime_data = pd.to_datetime( datetime_data.dt.strftime(self.datetime_format), format=self.datetime_format, ) else: - datetime_data = datetime_data.dt.strftime( - self.datetime_format - ).astype(self._dtype) + datetime_data = datetime_data.dt.strftime(self.datetime_format).astype(self._dtype) elif is_numeric_dtype(self._dtype): - datetime_data = pd.to_numeric( - datetime_data.astype('object'), errors='coerce' - ) + datetime_data = pd.to_numeric(datetime_data.astype('object'), errors='coerce') datetime_data = datetime_data.astype(self._dtype) return datetime_data diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index 44386151..ed583a48 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -36,9 +36,7 @@ class 
NullTransformer: _missing_value_replacement = None _null_percentage = None - def __init__( - self, missing_value_replacement=None, missing_value_generation='random' - ): + def __init__(self, missing_value_replacement=None, missing_value_generation='random'): self._missing_value_replacement = missing_value_replacement if missing_value_generation not in (None, 'from_column', 'random'): raise TransformerInputError( @@ -78,10 +76,7 @@ def _get_missing_value_replacement(self, data): if self._missing_value_replacement is None: return None - if ( - self._missing_value_replacement in {'mean', 'mode', 'random'} - and pd.isna(data).all() - ): + if self._missing_value_replacement in {'mean', 'mode', 'random'} and pd.isna(data).all(): msg = ( f"'missing_value_replacement' cannot be set to '{self._missing_value_replacement}'" ' when the provided data only contains NaNs. Using 0 instead.' @@ -106,9 +101,7 @@ def fit(self, data): data (pandas.Series): Data to transform. """ - self._missing_value_replacement = self._get_missing_value_replacement( - data - ) + self._missing_value_replacement = self._get_missing_value_replacement(data) if self._missing_value_replacement == 'random': self._min_value = data.min() self._max_value = data.max() @@ -143,9 +136,7 @@ def transform(self, data): isna = data.isna() if self._missing_value_replacement == 'random': data_mask = list( - np.random.uniform( - low=self._min_value, high=self._max_value, size=len(data) - ) + np.random.uniform(low=self._min_value, high=self._max_value, size=len(data)) ) data = data.mask(data.isna(), data_mask) @@ -153,9 +144,7 @@ def transform(self, data): data = data.infer_objects().fillna(self._missing_value_replacement) if self._missing_value_generation == 'from_column': - return pd.concat( - [data, isna.astype(np.float64)], axis=1 - ).to_numpy() + return pd.concat([data, isna.astype(np.float64)], axis=1).to_numpy() return data.to_numpy() diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 
4b90b3b9..1c6d1ddf 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -94,9 +94,7 @@ def __init__( self.enforce_min_max_values = enforce_min_max_values self.computer_representation = computer_representation - def _raise_out_of_bounds_error( - self, value, name, bound_type, min_bound, max_bound - ): + def _raise_out_of_bounds_error(self, value, name, bound_type, min_bound, max_bound): raise ValueError( f"The {bound_type} value in column '{name}' is {value}." f" All values represented by '{self.computer_representation}'" @@ -313,9 +311,7 @@ def __init__( 'instead.', FutureWarning, ) - distribution = self._DEPRECATED_DISTRIBUTIONS_MAPPING[ - distribution - ] + distribution = self._DEPRECATED_DISTRIBUTIONS_MAPPING[distribution] distribution = self._distributions[distribution] @@ -323,17 +319,11 @@ def __init__( def _get_univariate(self): distribution = self._distribution - if any( - isinstance(distribution, dist) - for dist in self._distributions.values() - ): + if any(isinstance(distribution, dist) for dist in self._distributions.values()): return copy.deepcopy(distribution) if isinstance(distribution, tuple): return distribution[0](**distribution[1]) - if ( - isinstance(distribution, type) - and distribution in self._distributions.values() - ): + if isinstance(distribution, type) and distribution in self._distributions.values(): return distribution() raise TypeError(f'Invalid distribution: {distribution}') @@ -505,9 +495,7 @@ def _fit(self, data): warnings.simplefilter('ignore') self._bgm_transformer.fit(data.reshape(-1, 1)) - self.valid_component_indicator = ( - self._bgm_transformer.weights_ > self.weight_threshold - ) + self.valid_component_indicator = self._bgm_transformer.weights_ > self.weight_threshold def _transform(self, data): """Transform the numerical data. 
@@ -554,10 +542,7 @@ def _transform(self, data): normalized = np.clip(normalized, -0.99, 0.99) normalized = normalized[:, 0] rows = [normalized, selected_component] - if ( - self.null_transformer - and self.null_transformer.models_missing_values() - ): + if self.null_transformer and self.null_transformer.models_missing_values(): rows.append(model_missing_values) return np.stack(rows, axis=1) # noqa: PD013 @@ -567,9 +552,7 @@ def _reverse_transform_helper(self, data): means = self._bgm_transformer.means_.reshape([-1]) stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1]) selected_component = data[:, 1].round().astype(int) - selected_component = selected_component.clip( - 0, self.valid_component_indicator.sum() - 1 - ) + selected_component = selected_component.clip(0, self.valid_component_indicator.sum() - 1) std_t = stds[self.valid_component_indicator][selected_component] mean_t = means[self.valid_component_indicator][selected_component] reversed_data = normalized * self.STD_MULTIPLIER * std_t + mean_t @@ -590,10 +573,7 @@ def _reverse_transform(self, data): data = data.to_numpy() recovered_data = self._reverse_transform_helper(data) - if ( - self.null_transformer - and self.null_transformer.models_missing_values() - ): + if self.null_transformer and self.null_transformer.models_missing_values(): recovered_data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013 return super()._reverse_transform(recovered_data) diff --git a/rdt/transformers/pii/anonymizer.py b/rdt/transformers/pii/anonymizer.py index 10e7da0f..c4dab9a5 100644 --- a/rdt/transformers/pii/anonymizer.py +++ b/rdt/transformers/pii/anonymizer.py @@ -87,18 +87,14 @@ def check_provider_function(provider_name, function_name): def _check_locales(self): """Check if the locales exist for the provided provider.""" - locales = ( - self.locales if isinstance(self.locales, list) else [self.locales] - ) + locales = self.locales if isinstance(self.locales, list) else [self.locales] 
missed_locales = [] for locale in locales: provider_name = self.provider_name if self.provider_name.endswith(f'.{locale}'): provider_name = self.provider_name.replace(f'.{locale}', '') - spec = importlib.util.find_spec( - f'faker.providers.{provider_name}.{locale}' - ) + spec = importlib.util.find_spec(f'faker.providers.{provider_name}.{locale}') if spec is None and locale != 'en_US': missed_locales.append(locale) @@ -124,9 +120,7 @@ def __init__( self._data_cardinality = None self.data_length = None self.enforce_uniqueness = enforce_uniqueness - self.cardinality_rule = ( - cardinality_rule.lower() if cardinality_rule else None - ) + self.cardinality_rule = cardinality_rule.lower() if cardinality_rule else None if enforce_uniqueness: warnings.warn( "The 'enforce_uniqueness' parameter is no longer supported. " @@ -144,9 +138,7 @@ def __init__( ) self.function_name = function_name if function_name else 'lexify' - self.function_kwargs = ( - deepcopy(function_kwargs) if function_kwargs else {} - ) + self.function_kwargs = deepcopy(function_kwargs) if function_kwargs else {} self.check_provider_function(self.provider_name, self.function_name) self.output_properties = {None: {'next_transformer': None}} @@ -202,13 +194,9 @@ def _function(self): else: faker_attr = self.faker except AttributeError: - faker_attr = ( - self.faker.unique if self.enforce_uniqueness else self.faker - ) + faker_attr = self.faker.unique if self.enforce_uniqueness else self.faker - result = getattr(faker_attr, self.function_name)( - **self.function_kwargs - ) + result = getattr(faker_attr, self.function_name)(**self.function_kwargs) if isinstance(result, Iterable) and not isinstance(result, str): result = ', '.join(map(str, result)) @@ -220,12 +208,8 @@ def _set_faker_seed(self, data): for value in data.head(5): hash_value += str(value) - hash_value = int( - hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16 - ) - self._faker_random_seed = hash_value % ( - (2**32) - 1 - ) # maximum value 
for a seed + hash_value = int(hashlib.sha256(hash_value.encode('utf-8')).hexdigest(), 16) + self._faker_random_seed = hash_value % ((2**32) - 1) # maximum value for a seed self.faker.seed_instance(self._faker_random_seed) def _fit(self, data): @@ -249,9 +233,7 @@ def _transform(self, _data): return None def _get_unique_categories(self, samples): - return np.array( - [self._function() for _ in range(samples)], dtype=object - ) + return np.array([self._function() for _ in range(samples)], dtype=object) def _reverse_transform_cardinality_rule_match(self, sample_size): """Reverse transform the data when the cardinality rule is 'match'.""" @@ -269,17 +251,13 @@ def _reverse_transform_cardinality_rule_match(self, sample_size): return reverse_transformed if sample_size < num_nans + self._data_cardinality: - unique_categories = self._get_unique_categories( - sample_size - num_nans - ) + unique_categories = self._get_unique_categories(sample_size - num_nans) reverse_transformed = np.concatenate([ reverse_transformed, unique_categories, ]) else: - unique_categories = self._get_unique_categories( - self._data_cardinality - ) + unique_categories = self._get_unique_categories(self._data_cardinality) num_copies = sample_size - self._data_cardinality - num_nans copies = np.random.choice(unique_categories, num_copies) reverse_transformed = np.concatenate([ @@ -308,13 +286,8 @@ def _reverse_transform(self, data): sample_size = self.data_length try: - if ( - hasattr(self, 'cardinality_rule') - and self.cardinality_rule == 'match' - ): - reverse_transformed = ( - self._reverse_transform_cardinality_rule_match(sample_size) - ) + if hasattr(self, 'cardinality_rule') and self.cardinality_rule == 'match': + reverse_transformed = self._reverse_transform_cardinality_rule_match(sample_size) else: reverse_transformed = np.array( [self._function() for _ in range(sample_size)], @@ -328,14 +301,9 @@ def _reverse_transform(self, data): f"('{self.get_input_column()}')." 
) from exception - if ( - self.missing_value_generation == 'random' - and not pd.isna(reverse_transformed).any() - ): + if self.missing_value_generation == 'random' and not pd.isna(reverse_transformed).any(): num_nans = int(self._nan_frequency * sample_size) - nan_indices = np.random.choice( - sample_size, num_nans, replace=False - ) + nan_indices = np.random.choice(sample_size, num_nans, replace=False) reverse_transformed[nan_indices] = np.nan return reverse_transformed @@ -439,9 +407,7 @@ def _fit(self, columns_data): unique_values = columns_data[columns_data.notna()].unique() unique_data_length = len(unique_values) try: - generated_values = [ - self._function() for _ in range(unique_data_length) - ] + generated_values = [self._function() for _ in range(unique_data_length)] except faker.exceptions.UniquenessException as exception: raise TransformerProcessingError( 'The Faker function you specified is not able to generate ' diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index 6e00194a..dd74ddcb 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -65,10 +65,7 @@ def _reverse_transform(self, data): prefix_str = self.prefix if self.prefix is not None else '' suffix_str = self.suffix if self.suffix is not None else '' - values = [ - f'{prefix_str}{start + idx}{suffix_str}' - for idx in range(len(data)) - ] + values = [f'{prefix_str}{start + idx}{suffix_str}' for idx in range(len(data))] self._counter += len(data) return pd.Series(values) @@ -135,18 +132,14 @@ def __init__( self.generator_size = None self.generated = None if generation_order not in ['alphanumeric', 'scrambled']: - raise ValueError( - "generation_order must be one of 'alphanumeric' or 'scrambled'." 
- ) + raise ValueError("generation_order must be one of 'alphanumeric' or 'scrambled'.") self.generation_order = generation_order def reset_randomization(self): """Create a new generator and reset the generated values counter.""" super().reset_randomization() - self.generator, self.generator_size = strings_from_regex( - self.regex_format - ) + self.generator, self.generator_size = strings_from_regex(self.regex_format) self.generated = 0 def _fit(self, data): @@ -219,9 +212,7 @@ def _reverse_transform(self, data): remaining = self.generator_size if remaining >= sample_size: - reverse_transformed = [ - next(self.generator) for _ in range(sample_size) - ] + reverse_transformed = [next(self.generator) for _ in range(sample_size)] self.generated += sample_size else: @@ -239,21 +230,16 @@ def _reverse_transform(self, data): except ValueError: counter = 0 while len(reverse_transformed) < sample_size: - remaining_samples = sample_size - len( - reverse_transformed - ) + remaining_samples = sample_size - len(reverse_transformed) reverse_transformed.extend([ - f'{i}({counter})' - for i in generated_values[:remaining_samples] + f'{i}({counter})' for i in generated_values[:remaining_samples] ]) counter += 1 else: while len(reverse_transformed) < sample_size: remaining_samples = sample_size - len(reverse_transformed) - reverse_transformed.extend( - generated_values[:remaining_samples] - ) + reverse_transformed.extend(generated_values[:remaining_samples]) if getattr(self, 'generation_order', 'alphanumeric') == 'scrambled': np.random.shuffle(reverse_transformed) diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index f9e33f95..81989fa6 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -29,18 +29,14 @@ def _in(options, max_repeat): generators.append(generator) sizes.append(size) - return (value for generator in generators for value in generator), np.sum( - sizes - ) + return (value for generator in generators for value in generator), 
np.sum(sizes) def _range(options, max_repeat): del max_repeat min_value, max_value = options max_value += 1 - return ( - chr(value) for value in range(min_value, max_value) - ), max_value - min_value + return (chr(value) for value in range(min_value, max_value)), max_value - min_value def _any(options, max_repeat): @@ -63,14 +59,13 @@ def _max_repeat(options, max_repeat): if repeat: sizes.append(pow(int(size), repeat, 2**63 - 1)) repeat_generators = [ - (_GENERATORS[option](args, max_repeat)[0], option, args) - for _ in range(repeat) + (_GENERATORS[option](args, max_repeat)[0], option, args) for _ in range(repeat) ] generators.append(_from_generators(repeat_generators, max_repeat)) - return (value for generator in generators for value in generator), np.sum( - sizes - ) + int(min_ == 0) + return (value for generator in generators for value in generator), np.sum(sizes) + int( + min_ == 0 + ) def _category_chars(regex): @@ -159,9 +154,7 @@ def strings_from_regex(regex, max_repeat=16): generators.append((generator, option, args)) sizes.append(size) - return _from_generators(generators, max_repeat), np.prod( - sizes, dtype=np.complex128 - ).real + return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128).real def fill_nan_with_none(data): diff --git a/tasks.py b/tasks.py index 296c9939..ae69f892 100644 --- a/tasks.py +++ b/tasks.py @@ -57,18 +57,12 @@ def _get_minimum_versions(dependencies, python_version): req = Requirement(dependency) if ';' in dependency: marker = req.marker - if marker and not marker.evaluate({ - 'python_version': python_version - }): + if marker and not marker.evaluate({'python_version': python_version}): continue # Skip this dependency if the marker does not apply to the current Python version if req.name not in min_versions: min_version = next( - ( - spec.version - for spec in req.specifier - if spec.operator in ('>=', '==') - ), + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), None, ) if 
min_version: @@ -77,11 +71,7 @@ def _get_minimum_versions(dependencies, python_version): elif '@' not in min_versions[req.name]: existing_version = Version(min_versions[req.name].split('==')[1]) new_version = next( - ( - spec.version - for spec in req.specifier - if spec.operator in ('>=', '==') - ), + (spec.version for spec in req.specifier if spec.operator in ('>=', '==')), existing_version, ) if new_version > existing_version: diff --git a/tests/code_style.py b/tests/code_style.py index 1186d09c..7924dee8 100644 --- a/tests/code_style.py +++ b/tests/code_style.py @@ -28,9 +28,7 @@ def validate_transformer_module(transformer): elif transformer_folder.parent.match('transformers'): is_valid = True - assert ( - is_valid - ), 'The transformer module is not placed inside a valid path.' + assert is_valid, 'The transformer module is not placed inside a valid path.' def validate_transformer_importable_from_parent_module(transformer): @@ -39,9 +37,7 @@ def validate_transformer_importable_from_parent_module(transformer): module = getattr(transformer, '__module__', '') module = module.rsplit('.', 1)[0] imported_transformer = getattr(importlib.import_module(module), name, None) - assert ( - imported_transformer is not None - ), f'Could not import {name} from {module}' + assert imported_transformer is not None, f'Could not import {name} from {module}' def get_test_location(transformer): @@ -52,16 +48,10 @@ def get_test_location(transformer): test_location = None if transformer_folder.match('transformers'): - test_location = ( - rdt_unit_test_path - / 'transformers' - / f'test_{transformer_file.name}' - ) + test_location = rdt_unit_test_path / 'transformers' / f'test_{transformer_file.name}' elif transformer_folder.parent.match('transformers'): - test_location = ( - rdt_unit_test_path / 'transformers' / transformer_folder.name - ) + test_location = rdt_unit_test_path / 'transformers' / transformer_folder.name test_location = test_location / f'test_{transformer_file.name}' 
return test_location @@ -84,9 +74,7 @@ def _load_module_from_path(path): if module_path.name == 'transformers': module_path = f'rdt.transformers.{module_name}' elif module_path.parent.name == 'transformers': - module_path = ( - f'rdt.transformers.{module_path.parent.name}.{module_name}' - ) + module_path = f'rdt.transformers.{module_path.parent.name}.{module_name}' spec = importlib.util.spec_from_file_location(module_path, path) module = importlib.util.module_from_spec(spec) @@ -103,12 +91,8 @@ def validate_test_names(transformer): test_class = getattr(module, f'Test{transformer.get_name()}', None) assert test_class is not None, 'The expected test class was not found.' - test_functions = inspect.getmembers( - test_class, predicate=inspect.isfunction - ) - test_functions = [ - test for test, _ in test_functions if test.startswith('test') - ] + test_functions = inspect.getmembers(test_class, predicate=inspect.isfunction) + test_functions = [test for test, _ in test_functions if test.startswith('test')] assert test_functions, 'No test functions found within the test module.' 
@@ -133,9 +117,7 @@ def validate_test_names(transformer): assert len(valid_test_functions) > count, fail_message -@pytest.mark.parametrize( - 'transformer', TRANSFORMERS.values(), ids=TRANSFORMERS.keys() -) # noqa +@pytest.mark.parametrize('transformer', TRANSFORMERS.values(), ids=TRANSFORMERS.keys()) # noqa def test_transformer_code_style(transformer): """Validate a transformer.""" if not inspect.isclass(transformer): diff --git a/tests/contributing.py b/tests/contributing.py index a484efb7..c4e7754e 100644 --- a/tests/contributing.py +++ b/tests/contributing.py @@ -83,9 +83,7 @@ def validate_transformer_integration(transformer): if isinstance(transformer, str): transformer = get_transformer_class(transformer) - print( - f'Validating Integration Tests for transformer {transformer.get_name()}\n' - ) + print(f'Validating Integration Tests for transformer {transformer.get_name()}\n') steps = [] validation_error = None @@ -94,9 +92,7 @@ def validate_transformer_integration(transformer): try: validate_transformer(transformer, steps=steps) except Exception as error: - error_trace = ''.join( - traceback.TracebackException.from_exception(error).format() - ) + error_trace = ''.join(traceback.TracebackException.from_exception(error).format()) for check in CHECK_DETAILS: if check in error_trace: @@ -125,9 +121,7 @@ def validate_transformer_integration(transformer): else: result_summaries.append([check, 'Yes', details]) - summary = pd.DataFrame( - result_summaries, columns=['Check', 'Correct', 'Details'] - ) + summary = pd.DataFrame(result_summaries, columns=['Check', 'Correct', 'Details']) print(tabulate(summary, headers='keys', showindex=False)) return validation_error is None and error_trace is None @@ -138,9 +132,7 @@ def _validate_third_party_code_style( ): run_command = command.split(' ') run_command.append(transformer_path) - output_capture = subprocess.run( - run_command, capture_output=True - ).stdout.decode() + output_capture = subprocess.run(run_command, 
capture_output=True).stdout.decode() if output_capture: return { 'Check': tag, @@ -156,9 +148,7 @@ def _validate_third_party_code_style( } -def _custom_validation( - function, tag, success_message, error_message, transformer -): +def _custom_validation(function, tag, success_message, error_message, transformer): try: function(transformer) return { @@ -346,13 +336,9 @@ def validate_transformer_unit_tests(transformer): score = cov.report(show_missing=True) rounded_score = round(score / 100, 3) if rounded_score < 1.0: - print( - f'\nERROR: The unit tests only cover {round(score, 3)}% of your code.' - ) + print(f'\nERROR: The unit tests only cover {round(score, 3)}% of your code.') else: - print( - f'\nSUCCESS: The unit tests cover {round(score, 3)}% of your code.' - ) + print(f'\nSUCCESS: The unit tests cover {round(score, 3)}% of your code.') cov.html_report() print('\nFull coverage report here:\n') @@ -390,9 +376,7 @@ def validate_transformer_performance(transformer): total_results = pd.DataFrame() for current_transformer in transformers: for dataset_generator in dataset_generators: - performance = evaluate_transformer_performance( - current_transformer, dataset_generator - ) + performance = evaluate_transformer_performance(current_transformer, dataset_generator) valid = validate_performance(performance, dataset_generator) results = pd.DataFrame({ @@ -409,14 +393,10 @@ def validate_transformer_performance(transformer): else: print('ERROR: One or more Performance Tests were NOT successful.') - other_results = total_results[ - total_results.transformer != transformer.get_name() - ] + other_results = total_results[total_results.transformer != transformer.get_name()] average = other_results.groupby('Evaluation Metric')['Value'].mean() - total_results = total_results[ - total_results.transformer == transformer.get_name() - ] + total_results = total_results[total_results.transformer == transformer.get_name()] final_results = total_results.groupby('Evaluation 
Metric').agg({ 'Value': 'mean', 'Valid': 'any', @@ -427,9 +407,7 @@ def validate_transformer_performance(transformer): 's / row', 'B / row', ) - final_results['Acceptable'] = np.where( - final_results['Acceptable'], 'Yes', 'No' - ) + final_results['Acceptable'] = np.where(final_results['Acceptable'], 'Yes', 'No') final_results['Compared to Average'] = ( final_results['Value'].div(average).replace(np.inf, np.nan) ) @@ -450,9 +428,7 @@ def check_clean_repository(): if any other file has been modified outside of that range. """ run_command = 'git diff --name-only main'.split(' ') - output_capture = subprocess.run( - run_command, capture_output=True - ).stdout.decode() + output_capture = subprocess.run(run_command, capture_output=True).stdout.decode() output_capture = output_capture.splitlines() validated_paths = [] @@ -557,9 +533,7 @@ def validate_pull_request(transformer): if success: print('\nSUCCESS: The Pull Request can be made!') - print( - 'You can now commit all your changes, push to GitHub and create a Pull Request.' 
- ) + print('You can now commit all your changes, push to GitHub and create a Pull Request.') else: print('\nERROR: The Pull Request can not be made!') print('Fix the reported errors and try again.') diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index c8725735..a87b48fe 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -367,9 +367,7 @@ def test_default_inputs(self): assert isinstance(ht.field_transformers['float'], FloatFormatter) assert isinstance(ht.field_transformers['categorical'], UniformEncoder) assert isinstance(ht.field_transformers['bool'], UniformEncoder) - assert isinstance( - ht.field_transformers['datetime'], UnixTimestampEncoder - ) + assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder) assert isinstance(ht.field_transformers['names'], UniformEncoder) get_default_transformers.cache_clear() @@ -451,9 +449,7 @@ def test_single_category(self): # Run ht.detect_initial_config(data) - ht.update_transformers( - column_name_to_transformer={'a': OneHotEncoder()} - ) + ht.update_transformers(column_name_to_transformer={'a': OneHotEncoder()}) ht.fit(data) transformed = ht.transform(data) reverse = ht.reverse_transform(transformed) @@ -946,9 +942,7 @@ def test_reverse_transform_subset_and_generators(self): # Run ht.fit(customers) transformed = ht.transform(customers) - reverse_transformed = ht.reverse_transform_subset( - transformed[['last_login']] - ) + reverse_transformed = ht.reverse_transform_subset(transformed[['last_login']]) # Assert expected_transformed_columns = [ @@ -1042,9 +1036,7 @@ def _transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[ - f'{self.column_prefix}.a' - ].str[:-1] + new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.a'].str[:-1] return new_data class CD(BaseTransformer): @@ -1064,9 +1056,7 @@ def 
_transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[ - f'{self.column_prefix}.c' - ].str[:-1] + new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.c'].str[:-1] return new_data class E(BaseTransformer): @@ -1085,9 +1075,7 @@ def _transform(self, data): def _reverse_transform(self, data): new_data = pd.DataFrame() - new_data[f'{self.column_prefix}'] = data[ - f'{self.column_prefix}.e' - ].str[:-1] + new_data[f'{self.column_prefix}'] = data[f'{self.column_prefix}.e'].str[:-1] return new_data ht = HyperTransformer() @@ -1163,9 +1151,7 @@ def test_field_transformers_correctly_set(self): transformer = ht.get_config()['transformers']['col'] assert transformer is fe - ht.update_transformers_by_sdtype( - 'categorical', transformer_name='FrequencyEncoder' - ) + ht.update_transformers_by_sdtype('categorical', transformer_name='FrequencyEncoder') transformer = ht.get_config()['transformers']['col'] transformer.new_attribute3 = 'abc' ht.fit(data) @@ -1186,9 +1172,7 @@ def _get_hyper_transformer_with_random_transformers(self, data): 'signup_day': 'datetime', }) ht.update_transformers({ - 'credit_card': AnonymizedFaker( - 'credit_card', 'credit_card_number' - ), + 'credit_card': AnonymizedFaker('credit_card', 'credit_card_number'), 'balance': ClusterBasedNormalizer(max_clusters=3), 'name': RegexGenerator(), }) @@ -1292,15 +1276,9 @@ def test_reset_randomization(self): first_transformed2 = ht2.transform(data) second_transformed1 = ht1.transform(data) - pd.testing.assert_frame_equal( - first_transformed1, expected_first_transformed - ) - pd.testing.assert_frame_equal( - first_transformed2, expected_first_transformed - ) - pd.testing.assert_frame_equal( - second_transformed1, expected_second_transformed - ) + pd.testing.assert_frame_equal(first_transformed1, expected_first_transformed) + pd.testing.assert_frame_equal(first_transformed2, expected_first_transformed) + 
pd.testing.assert_frame_equal(second_transformed1, expected_second_transformed) # test reverse transforming multiple times with different tranformers expected_first_reverse = pd.DataFrame({ @@ -1354,9 +1332,7 @@ def test_reset_randomization(self): ht1.reset_randomization() transformed_post_reset = ht1.reverse_transform(first_transformed1) - pd.testing.assert_frame_equal( - transformed_post_reset, expected_first_reverse - ) + pd.testing.assert_frame_equal(transformed_post_reset, expected_first_reverse) def test_cluster_based_normalizer_randomization(self): """Test that the ``ClusterBasedNormalizer`` handles randomization correctly. @@ -1374,9 +1350,7 @@ def test_cluster_based_normalizer_randomization(self): transformed1 = ht.transform(data) transformed2 = ht.transform(data) - assert any( - transformed1['age.normalized'] != transformed2['age.normalized'] - ) + assert any(transformed1['age.normalized'] != transformed2['age.normalized']) ht2 = HyperTransformer() ht2.detect_initial_config(data) @@ -1409,28 +1383,16 @@ def test_anonymized_faker(self): reverse_transformed1 = ht.reverse_transform(transformed) # Assert - assert ( - reverse_transformed1['id1'].tolist() - != reverse_transformed1['id2'].tolist() - ) + assert reverse_transformed1['id1'].tolist() != reverse_transformed1['id2'].tolist() # Run - make sure transforming again returns different values than the original transform transformed = ht.transform(data) reverse_transformed2 = ht.reverse_transform(transformed) # Assert - assert ( - reverse_transformed2['id1'].tolist() - != reverse_transformed2['id2'].tolist() - ) - assert ( - reverse_transformed1['id1'].tolist() - != reverse_transformed2['id1'].tolist() - ) - assert ( - reverse_transformed1['id2'].tolist() - != reverse_transformed2['id2'].tolist() - ) + assert reverse_transformed2['id1'].tolist() != reverse_transformed2['id2'].tolist() + assert reverse_transformed1['id1'].tolist() != reverse_transformed2['id1'].tolist() + assert 
reverse_transformed1['id2'].tolist() != reverse_transformed2['id2'].tolist() # Run - make sure resetting randomization works ht.reset_randomization() @@ -1438,9 +1400,7 @@ def test_anonymized_faker(self): reverse_transformed3 = ht.reverse_transform(transformed) # Assert - pd.testing.assert_frame_equal( - reverse_transformed1, reverse_transformed3 - ) + pd.testing.assert_frame_equal(reverse_transformed1, reverse_transformed3) def test_anonymized_faker_text(self): """Test ``AnonymizedFaker`` with text column.""" @@ -1463,9 +1423,7 @@ def test_anonymized_faker_text(self): reverse_transformed = ht.reverse_transform(transformed) # Assert - assert all( - reverse_transformed['info'].apply(lambda s: isinstance(s, str)) - ) + assert all(reverse_transformed['info'].apply(lambda s: isinstance(s, str))) def test_pseudo_anonymized_faker(self): """Test ``PseudoAnonymizedFaker`` generates different values for different columns.""" @@ -1488,10 +1446,7 @@ def test_pseudo_anonymized_faker(self): reverse_transformed1 = ht.reverse_transform(transformed) # Assert - assert ( - reverse_transformed1['id1'].tolist() - != reverse_transformed1['id2'].tolist() - ) + assert reverse_transformed1['id1'].tolist() != reverse_transformed1['id2'].tolist() # Run - run it again on the exact same data ht = HyperTransformer() @@ -1506,10 +1461,7 @@ def test_pseudo_anonymized_faker(self): reverse_transformed2 = ht.reverse_transform(transformed) # Assert - different instances of the same transformer should return the same result - assert ( - reverse_transformed1['id1'].tolist() - == reverse_transformed2['id1'].tolist() - ) + assert reverse_transformed1['id1'].tolist() == reverse_transformed2['id1'].tolist() def test_anonymized_faker_different_tables(self): """Test ``AnonymizedFaker`` generates different values for columns with same name.""" @@ -1547,14 +1499,8 @@ def test_anonymized_faker_different_tables(self): reverse_transformed2 = ht.reverse_transform(transformed) # Assert - assert ( - 
reverse_transformed1['id1'].tolist() - != reverse_transformed2['id1'].tolist() - ) - assert ( - reverse_transformed1['id2'].tolist() - != reverse_transformed2['id2'].tolist() - ) + assert reverse_transformed1['id1'].tolist() != reverse_transformed2['id1'].tolist() + assert reverse_transformed1['id2'].tolist() != reverse_transformed2['id2'].tolist() def test_random_seed(self): # Setup @@ -1612,26 +1558,11 @@ def test_random_seed(self): reversed1 = ht.reverse_transform(transformed1) # Assert - assert ( - reversed1['num1'].isna().tolist() - != reversed1['num2'].isna().tolist() - ) - assert ( - reversed1['num3'].isna().tolist() - != reversed1['num4'].isna().tolist() - ) - assert ( - reversed1['num5'].isna().tolist() - != reversed1['num6'].isna().tolist() - ) - assert ( - reversed1['date1'].isna().tolist() - != reversed1['date2'].isna().tolist() - ) - assert ( - reversed1['date3'].isna().tolist() - != reversed1['date4'].isna().tolist() - ) + assert reversed1['num1'].isna().tolist() != reversed1['num2'].isna().tolist() + assert reversed1['num3'].isna().tolist() != reversed1['num4'].isna().tolist() + assert reversed1['num5'].isna().tolist() != reversed1['num6'].isna().tolist() + assert reversed1['date1'].isna().tolist() != reversed1['date2'].isna().tolist() + assert reversed1['date3'].isna().tolist() != reversed1['date4'].isna().tolist() # Run ht.reset_randomization() @@ -1679,9 +1610,7 @@ def test_hypertransformer_with_mutli_column_transformer_end_to_end(self): ], }) - pd.testing.assert_frame_equal( - transformed_data, expected_transformed_data - ) + pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) pd.testing.assert_frame_equal(reverse_transformed_data, data_test) def test_hypertransformer_with_mutli_column_transformer_and_single_column( @@ -1725,9 +1654,7 @@ def test_hypertransformer_with_mutli_column_transformer_and_single_column( ], }) - pd.testing.assert_frame_equal( - transformed_data, expected_transformed_data - ) + 
pd.testing.assert_frame_equal(transformed_data, expected_transformed_data) pd.testing.assert_frame_equal(reverse_transformed_data, data_test) def test_update_transformers_single_to_multi_column(self): @@ -1853,9 +1780,7 @@ def test_update_transformers_by_sdtype_mutli_column(self): ht.set_config(config) # Run - ht.update_transformers_by_sdtype( - 'boolean', transformer_name='LabelEncoder' - ) + ht.update_transformers_by_sdtype('boolean', transformer_name='LabelEncoder') new_config = ht.get_config() # Assert @@ -2024,9 +1949,7 @@ def test_with_tuple_returned_by_faker(self): ht.set_config({ 'sdtypes': {'A': 'pii'}, 'transformers': { - 'A': AnonymizedFaker( - provider_name='currency', function_name='currency' - ) + 'A': AnonymizedFaker(provider_name='currency', function_name='currency') }, }) @@ -2101,12 +2024,8 @@ def test_with_tuple_returned_by_faker(self): ), ] - @pytest.mark.parametrize( - ('method_name', 'method_input', 'expected_result'), parametrization - ) - def test_invalid_multi_column( - self, method_name, method_input, expected_result - ): + @pytest.mark.parametrize(('method_name', 'method_input', 'expected_result'), parametrization) + def test_invalid_multi_column(self, method_name, method_input, expected_result): """Test the ``update`` and ``remove`` methods with invalid multi column transformer. 
When a multi column is no longer valid, all these methods should raise a warning @@ -2114,9 +2033,7 @@ def test_invalid_multi_column( """ # Setup - class BadDummyMultiColumnTransformer( - DummyMultiColumnTransformerNumerical - ): + class BadDummyMultiColumnTransformer(DummyMultiColumnTransformerNumerical): @classmethod def _validate_sdtypes(cls, columns_to_sdtype): raise TransformerInputError('Invalid sdtype') diff --git a/tests/integration/test_transformers.py b/tests/integration/test_transformers.py index 716a6a5d..aeefe293 100644 --- a/tests/integration/test_transformers.py +++ b/tests/integration/test_transformers.py @@ -65,9 +65,7 @@ def _is_valid_transformer(transformer_name): 'OrderedUniformEncoder', 'BaseMultiColumnTransformer', ] - return all( - invalid_name not in transformer_name for invalid_name in invalid_names - ) + return all(invalid_name not in transformer_name for invalid_name in invalid_names) def _get_all_transformers(): @@ -97,9 +95,7 @@ def _find_dataset_generators(sdtype, generators): if sdtype is None: primary_generators = [] for primary_sdtype in PRIMARY_SDTYPES: - primary_generators.extend( - _find_dataset_generators(primary_sdtype, generators) - ) + primary_generators.extend(_find_dataset_generators(primary_sdtype, generators)) return primary_generators @@ -108,9 +104,7 @@ def _find_dataset_generators(sdtype, generators): def _validate_dataset_generators(dataset_generators): """Check that the number of dataset generators is greater than zero.""" - assert ( - len(dataset_generators) > 0 - ), 'There are no associated dataset generators.' + assert len(dataset_generators) > 0, 'There are no associated dataset generators.' 
def _validate_transformed_data(transformer, transformed_data): @@ -119,32 +113,20 @@ def _validate_transformed_data(transformer, transformed_data): transformed_dtypes = transformed_data.dtypes for column, expected_sdtype in expected_sdtypes.items(): - message = ( - f'Column {column} is expected but not found in transformed data.' - ) + message = f'Column {column} is expected but not found in transformed data.' assert column in transformed_data, message - message = ( - f'Column {column} is not the expected sdtype {expected_sdtype}' - ) - assert ( - transformed_dtypes[column].kind - in SDTYPE_TO_DTYPES[expected_sdtype] - ), message - - -def _validate_reverse_transformed_data( - transformer, reversed_data, input_dtype -): + message = f'Column {column} is not the expected sdtype {expected_sdtype}' + assert transformed_dtypes[column].kind in SDTYPE_TO_DTYPES[expected_sdtype], message + + +def _validate_reverse_transformed_data(transformer, reversed_data, input_dtype): """Check that the reverse transformed data is the expected dtype. Expect that the dtype is equal to the dtype of the input data. """ expected_sdtype = transformer.get_supported_sdtypes()[0] message = f'Reverse transformed data is not the expected sdtype {expected_sdtype}' - assert ( - reversed_data.dtypes[TEST_COL].kind - in SDTYPE_TO_DTYPES[expected_sdtype] - ), message + assert reversed_data.dtypes[TEST_COL].kind in SDTYPE_TO_DTYPES[expected_sdtype], message def _test_transformer_with_dataset(transformer_class, input_data, steps): @@ -185,30 +167,20 @@ def _test_transformer_with_dataset(transformer_class, input_data, steps): def _validate_hypertransformer_transformed_data(transformed_data): """Check that the transformed data is not null and of type float.""" - assert transformed_data.notna().all( - axis=None - ), 'Transformed data has nulls.' + assert transformed_data.notna().all(axis=None), 'Transformed data has nulls.' 
for dtype in transformed_data.dtypes: - assert ( - dtype.kind in SDTYPE_TO_DTYPES['numerical'] - ), 'Transformed data is not numerical.' + assert dtype.kind in SDTYPE_TO_DTYPES['numerical'], 'Transformed data is not numerical.' -def _validate_hypertransformer_reverse_transformed_data( - transformer, reversed_data -): +def _validate_hypertransformer_reverse_transformed_data(transformer, reversed_data): """Check that the reverse transformed data has the same dtype as the input.""" expected_sdtype = transformer().get_supported_sdtypes()[0] message = f'Reversed transformed data is not the expected sdtype {expected_sdtype}' - assert ( - reversed_data.dtype.kind in SDTYPE_TO_DTYPES[expected_sdtype] - ), message + assert reversed_data.dtype.kind in SDTYPE_TO_DTYPES[expected_sdtype], message -def _test_transformer_with_hypertransformer( - transformer_class, input_data, steps -): +def _test_transformer_with_hypertransformer(transformer_class, input_data, steps): """Test the given transformer in the hypertransformer. 
Run the provided transformer using the hypertransformer using the provided @@ -240,9 +212,7 @@ def _test_transformer_with_hypertransformer( hypertransformer.fit(input_data) transformed = hypertransformer.transform(input_data) - _validate_helper( - _validate_hypertransformer_transformed_data, [transformed], steps - ) + _validate_helper(_validate_hypertransformer_transformed_data, [transformed], steps) out = hypertransformer.reverse_transform(transformed) _validate_helper( @@ -272,13 +242,9 @@ def validate_transformer(transformer, steps=None, subtests=None): data = pd.DataFrame({TEST_COL: dg.generate(DATA_SIZE)}) if subtests: - with subtests.test( - msg=f'test_transformer_with_dataset_{dg}', generator=dg - ): + with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg): _test_transformer_with_dataset(transformer, data, steps) - _test_transformer_with_hypertransformer( - transformer, data, steps - ) + _test_transformer_with_hypertransformer(transformer, data, steps) else: _test_transformer_with_dataset(transformer, data, steps) _test_transformer_with_hypertransformer(transformer, data, steps) diff --git a/tests/integration/transformers/pii/test_anonymizer.py b/tests/integration/transformers/pii/test_anonymizer.py index ecb65765..b1577dd4 100644 --- a/tests/integration/transformers/pii/test_anonymizer.py +++ b/tests/integration/transformers/pii/test_anonymizer.py @@ -199,9 +199,7 @@ def test_cardinality_rule_match_nans(self): def test_cardinality_rule_match_not_enough_unique_values(self): """Test it works with the cardinality rule 'match' and too few values to transform.""" # Setup - data_fit = pd.DataFrame({ - 'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2] - }) + data_fit = pd.DataFrame({'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]}) data_transform = pd.DataFrame({'col': [1, 1, 1]}) instance = AnonymizedFaker(cardinality_rule='match') @@ -295,9 +293,7 @@ def test_default_settings(self): data['animals'], ) unique_animals = 
set(reverse_transformed['animals']) - assert ( - unique_animals.intersection(set(instance._mapping_dict)) == set() - ) + assert unique_animals.intersection(set(instance._mapping_dict)) == set() assert len(reverse_transformed) == len(transformed) == 4 def test_with_nans(self): @@ -315,9 +311,7 @@ def test_with_nans(self): data['animals'], ) unique_animals = set(reverse_transformed['animals']) - assert ( - unique_animals.intersection(set(instance._mapping_dict)) == set() - ) + assert unique_animals.intersection(set(instance._mapping_dict)) == set() assert len(reverse_transformed) == len(transformed) == 4 def test_with_custom_provider(self): @@ -335,7 +329,5 @@ def test_with_custom_provider(self): data['animals'], ) unique_animals = set(reverse_transformed['animals']) - assert ( - unique_animals.intersection(set(instance._mapping_dict)) == set() - ) + assert unique_animals.intersection(set(instance._mapping_dict)) == set() assert len(reverse_transformed) == len(transformed) == 4 diff --git a/tests/integration/transformers/test_base.py b/tests/integration/transformers/test_base.py index afe600d0..564793f3 100644 --- a/tests/integration/transformers/test_base.py +++ b/tests/integration/transformers/test_base.py @@ -141,12 +141,8 @@ class AdditionTransformer(BaseMultiColumnTransformer): def _fit(self, columns_data): self.output_properties = { f'{self.columns[0]}': {'sdtype': 'numerical'}, - f'{self.columns[0]}+{self.columns[1]}': { - 'sdtype': 'numerical' - }, - f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': { - 'sdtype': 'numerical' - }, + f'{self.columns[0]}+{self.columns[1]}': {'sdtype': 'numerical'}, + f'{self.columns[0]}+{self.columns[1]}+{self.columns[2]}': {'sdtype': 'numerical'}, } def _get_prefix(self): @@ -218,14 +214,10 @@ def _reverse_transform(self, data): column_names = list(data.columns) col1, col2 = column_names[0].split('#') - result[[col1, col2]] = result[column_names[0]].str.split( - '#', expand=True - ) + result[[col1, col2]] = 
result[column_names[0]].str.split('#', expand=True) col3, col4 = column_names[1].split('#') - result[[col3, col4]] = result[column_names[1]].str.split( - '#', expand=True - ) + result[[col3, col4]] = result[column_names[1]].str.split('#', expand=True) return result.drop(columns=column_names) @@ -283,12 +275,8 @@ def _transform(self, data): def _reverse_transform(self, data): result = data.copy() - reverse_1 = ( - result[self.output_columns[0]] + result[self.output_columns[1]] - ) - reverse_2 = ( - result[self.output_columns[2]] + result[self.output_columns[3]] - ) + reverse_1 = result[self.output_columns[0]] + result[self.output_columns[1]] + reverse_2 = result[self.output_columns[2]] + result[self.output_columns[3]] result[self.columns[0]] = reverse_1 result[self.columns[1]] = reverse_2 diff --git a/tests/integration/transformers/test_boolean.py b/tests/integration/transformers/test_boolean.py index dd217e37..80857955 100644 --- a/tests/integration/transformers/test_boolean.py +++ b/tests/integration/transformers/test_boolean.py @@ -73,9 +73,7 @@ def test_boolean_missing_value_generation_none(self): # Setup data = pd.DataFrame([True, True, None, False], columns=['bool']) column = 'bool' - transformer = BinaryEncoder( - missing_value_replacement='mode', missing_value_generation=None - ) + transformer = BinaryEncoder(missing_value_replacement='mode', missing_value_generation=None) # Run transformer.fit(data, column) @@ -86,6 +84,4 @@ def test_boolean_missing_value_generation_none(self): expected_transformed = pd.DataFrame({'bool': [1.0, 1.0, 1.0, 0.0]}) expected_reversed = pd.DataFrame({'bool': [True, True, True, False]}) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse, expected_reversed, check_dtype=False - ) + pd.testing.assert_frame_equal(reverse, expected_reversed, check_dtype=False) diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py 
index 3bb67253..04959750 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -64,9 +64,7 @@ def test__reverse_transform(self): output = transformer.reverse_transform(transformed) # Asserts - pd.testing.assert_series_equal( - output['column_name'], data['column_name'] - ) + pd.testing.assert_series_equal(output['column_name'], data['column_name']) def test__reverse_transform_negative_transformed_values(self): """Test the ``reverse_transform``.""" @@ -119,9 +117,7 @@ def test_uniform_encoder_unseen_transform_nan(self): """Ensure UniformEncoder works when np.nan to transform wasn't seen during fit.""" # Setup fit_data = pd.DataFrame([1.0, 2.0, 3.0], columns=['column_name']) - transform_data = pd.DataFrame( - [1, 2, 3, np.nan], columns=['column_name'] - ) + transform_data = pd.DataFrame([1, 2, 3, np.nan], columns=['column_name']) column = 'column_name' transformer = UniformEncoder() @@ -176,9 +172,7 @@ def test_order(self): def test_string(self): """Test that the transformer works with string labels.""" # Setup - data = pd.DataFrame({ - 'column_name': ['b', 'a', 'c', 'a', np.nan, 'b', 'b'] - }) + data = pd.DataFrame({'column_name': ['b', 'a', 'c', 'a', np.nan, 'b', 'b']}) transformer = OrderedUniformEncoder(order=['a', 'c', np.nan, 'b']) column = 'column_name' @@ -226,9 +220,7 @@ def test_frequency_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert @@ -245,9 +237,7 @@ def test_frequency_encoder_numerical_nans_no_warning(): def test_frequency_encoder_unseen_transform_data(): """Ensure FrequencyEncoder works when data to transform wasn't seen during fit.""" - fit_data = pd.DataFrame( - [1, 2, 
float('nan'), np.nan], columns=['column_name'] - ) + fit_data = pd.DataFrame([1, 2, float('nan'), np.nan], columns=['column_name']) transform_data = pd.DataFrame([1, 2, np.nan, 3], columns=['column_name']) column = 'column_name' @@ -439,9 +429,7 @@ def test_frequency_encoder_mixed_more_rows(): # setup data = pd.DataFrame([True, 'a', 1, None], columns=['column_name']) column = 'column_name' - transform_data = pd.DataFrame( - ['a', 1, None, 'a', True, 1], columns=['column_name'] - ) + transform_data = pd.DataFrame(['a', 1, None, 'a', True, 1], columns=['column_name']) transformer = FrequencyEncoder() # run @@ -465,9 +453,7 @@ def test_frequency_encoder_noise(): - The reverse transformed data """ # setup - data = pd.DataFrame( - np.random.choice(a=range(100), size=10000), columns=['column_name'] - ) + data = pd.DataFrame(np.random.choice(a=range(100), size=10000), columns=['column_name']) column = 'column_name' transformer = FrequencyEncoder(add_noise=True) @@ -496,9 +482,7 @@ def test_one_hot_numerical_nans(): def test_one_hot_doesnt_warn(tmp_path): """Ensure OneHotEncoder doesn't warn when saving and loading GH#616.""" # Setup - data = pd.DataFrame({ - 'column_name': [1.0, 2.0, np.nan, 2.0, 3.0, np.nan, 3.0] - }) + data = pd.DataFrame({'column_name': [1.0, 2.0, np.nan, 2.0, 3.0, np.nan, 3.0]}) ohe = OneHotEncoder() # Run @@ -578,9 +562,7 @@ def test_label_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert @@ -604,9 +586,7 @@ def test_label_encoder_order_by_numerical(): - Transformed data should map labels to values based on numerical order. 
""" - data = pd.DataFrame( - [5, np.nan, 3.11, 100, 67.8, -2.5], columns=['column_name'] - ) + data = pd.DataFrame([5, np.nan, 3.11, 100, 67.8, -2.5], columns=['column_name']) transformer = LabelEncoder(order_by='numerical_value') transformer.fit(data, 'column_name') @@ -628,9 +608,7 @@ def test_label_encoder_order_by_alphabetical(): - Transformed data should map labels to values based on alphabetical order. """ - data = pd.DataFrame( - ['one', 'two', np.nan, 'three', 'four'], columns=['column_name'] - ) + data = pd.DataFrame(['one', 'two', np.nan, 'three', 'four'], columns=['column_name']) transformer = LabelEncoder(order_by='alphabetical') transformer.fit(data, 'column_name') @@ -676,9 +654,7 @@ def test_ordered_label_encoder_nans(): - Reverse transformed data should match the input """ - data = pd.DataFrame( - ['two', 3, 1, np.nan, 'zero', None], columns=['column_name'] - ) + data = pd.DataFrame(['two', 3, 1, np.nan, 'zero', None], columns=['column_name']) transformer = OrderedLabelEncoder(order=['zero', 1, 'two', 3, None]) transformer.fit(data, 'column_name') @@ -696,9 +672,7 @@ def test_ordered_label_encoder_numerical_nans_no_warning(): Related to Issue #793 (https://github.com/sdv-dev/RDT/issues/793) """ # Setup - data = pd.DataFrame({ - 'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object') - }) + data = pd.DataFrame({'column_name': pd.Series([1, 2, float('nan'), np.nan], dtype='object')}) column = 'column_name' # Run and Assert diff --git a/tests/integration/transformers/test_datetime.py b/tests/integration/transformers/test_datetime.py index d64a45c6..e09767aa 100644 --- a/tests/integration/transformers/test_datetime.py +++ b/tests/integration/transformers/test_datetime.py @@ -12,9 +12,7 @@ def test_unixtimestampencoder(self): """Test the ``UnixTimestampEncoder`` end to end.""" # Setup ute = UnixTimestampEncoder(missing_value_replacement='mean') - data = pd.DataFrame({ - 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) - }) + 
data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) # Run ute.fit(data, column='column') @@ -32,9 +30,7 @@ def test_unixtimestampencoder(self): def test_unixtimestampencoder_different_format(self): """Test the ``UnixTimestampEncoder`` with a unique datetime format.""" - ute = UnixTimestampEncoder( - missing_value_replacement='mean', datetime_format='%b %d, %Y' - ) + ute = UnixTimestampEncoder(missing_value_replacement='mean', datetime_format='%b %d, %Y') data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) # Run @@ -79,9 +75,7 @@ def test_unixtimestampencoder_with_missing_value_generation_none(self): def test_unixtimestampencoder_with_missing_value_replacement_random(self): """Test that transformed data will replace nans with random values from the data.""" # Setup - ute = UnixTimestampEncoder( - missing_value_replacement='random', datetime_format='%b %d, %Y' - ) + ute = UnixTimestampEncoder(missing_value_replacement='random', datetime_format='%b %d, %Y') data = pd.DataFrame({'column': [None, 'Oct 17, 1996', 'May 23, 1965']}) # Run @@ -94,9 +88,7 @@ def test_unixtimestampencoder_with_missing_value_replacement_random(self): expect_transformed = pd.DataFrame({ 'column': [-7.007396e16, 845510400000000000, -145497600000000000] }) - expected_reversed = pd.DataFrame({ - 'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965'] - }) + expected_reversed = pd.DataFrame({'column': [np.nan, 'Oct 17, 1996', 'May 23, 1965']}) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, expected_reversed) @@ -104,9 +96,7 @@ def test_unixtimestampencoder_with_model_missing_values(self): """Test that `model_missing_values` is accepted by the transformer.""" # Setup ute = UnixTimestampEncoder('mean', True) - data = pd.DataFrame({ - 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) - }) + data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) # Run 
ute.fit(data, column='column') @@ -168,21 +158,15 @@ def test_with_enforce_min_max_values_true(self): """Test that the transformer properly clipped out of bounds values.""" # Setup ute = UnixTimestampEncoder(enforce_min_max_values=True) - data = pd.DataFrame({ - 'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965'] - }) + data = pd.DataFrame({'column': ['Feb 03, 1981', 'Oct 17, 1996', 'May 23, 1965']}) ute.fit(data, column='column') # Run transformed = ute.transform(data) min_val = transformed['column'].min() max_val = transformed['column'].max() - transformed.loc[transformed['column'] == min_val, 'column'] = ( - min_val - 1e17 - ) - transformed.loc[transformed['column'] == max_val, 'column'] = ( - max_val + 1e17 - ) + transformed.loc[transformed['column'] == min_val, 'column'] = min_val - 1e17 + transformed.loc[transformed['column'] == max_val, 'column'] = max_val + 1e17 reverted = ute.reverse_transform(transformed) # Asserts @@ -194,9 +178,7 @@ def test_with_enforce_min_max_values_true(self): class TestOptimizedTimestampEncoder: def test_optimizedtimestampencoder(self): ote = OptimizedTimestampEncoder(missing_value_replacement='mean') - data = pd.DataFrame({ - 'column': pd.to_datetime([None, '1996-10-17', '1965-05-23']) - }) + data = pd.DataFrame({'column': pd.to_datetime([None, '1996-10-17', '1965-05-23'])}) # Run ote.fit(data, column='column') @@ -205,8 +187,6 @@ def test_optimizedtimestampencoder(self): reverted = ote.reverse_transform(transformed) # Asserts - expect_transformed = pd.DataFrame({ - 'column': [4051.0, 9786.0, -1684.0] - }) + expect_transformed = pd.DataFrame({'column': [4051.0, 9786.0, -1684.0]}) pd.testing.assert_frame_equal(expect_transformed, transformed) pd.testing.assert_frame_equal(reverted, data) diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index ced5be5e..8e301ad7 100644 --- a/tests/integration/transformers/test_numerical.py +++ 
b/tests/integration/transformers/test_numerical.py @@ -159,16 +159,12 @@ def test_missing_value_replacement_random_all_nans(self): expected_transformed = pd.DataFrame({'a': [0.0] * 10}) expected_reverse_transformed = pd.DataFrame({'a': [np.nan] * 10}) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transformed, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transformed, expected_reverse_transformed) class TestGaussianNormalizer: def test_stats(self): - data = pd.DataFrame( - np.random.normal(loc=4, scale=4, size=1000), columns=['a'] - ) + data = pd.DataFrame(np.random.normal(loc=4, scale=4, size=1000), columns=['a']) column = 'a' ct = GaussianNormalizer() @@ -340,9 +336,7 @@ def test_some_nulls(self): data[mask] = np.nan column = 'col' - bgmm_transformer = ClusterBasedNormalizer( - missing_value_generation='from_column' - ) + bgmm_transformer = ClusterBasedNormalizer(missing_value_generation='from_column') bgmm_transformer.fit(data, column) transformed = bgmm_transformer.transform(data) @@ -404,15 +398,12 @@ def test_out_of_bounds_reverse_transform(self): """Test that the reverse transform works when the data is out of bounds GH#672.""" # Setup data = pd.DataFrame({ - 'col': [round(i, 2) for i in np.random.uniform(0, 10, size=100)] - + [None] + 'col': [round(i, 2) for i in np.random.uniform(0, 10, size=100)] + [None] }) reverse_data = pd.DataFrame( data={ 'col.normalized': np.random.uniform(-10, 10, size=100), - 'col.component': np.random.choice( - [0.0, 1.0, 2.0, 10.0], size=100 - ), + 'col.component': np.random.choice([0.0, 1.0, 2.0, 10.0], size=100), } ) transformer = ClusterBasedNormalizer() diff --git a/tests/integration/transformers/test_text.py b/tests/integration/transformers/test_text.py index 47986d83..02085cc9 100644 --- a/tests/integration/transformers/test_text.py +++ b/tests/integration/transformers/test_text.py @@ -17,9 +17,7 @@ def test_end_to_end(self): }) # Run - 
transformer = IDGenerator( - prefix='id_', starting_value=100, suffix='_X' - ) + transformer = IDGenerator(prefix='id_', starting_value=100, suffix='_X') transformed = transformer.fit_transform(data, 'id') reverse_transform = transformer.reverse_transform(transformed) reverse_transform_2 = transformer.reverse_transform(transformed) @@ -27,9 +25,7 @@ def test_end_to_end(self): reverse_transform_3 = transformer.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'username': ['a', 'b', 'c', 'd', 'e'] - }) + expected_transformed = pd.DataFrame({'username': ['a', 'b', 'c', 'd', 'e']}) expected_reverse_transform = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], @@ -42,15 +38,9 @@ def test_end_to_end(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transform - ) - pd.testing.assert_frame_equal( - reverse_transform_2, expected_reverse_transform_2 - ) - pd.testing.assert_frame_equal( - reverse_transform_3, expected_reverse_transform - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transform) + pd.testing.assert_frame_equal(reverse_transform_2, expected_reverse_transform_2) + pd.testing.assert_frame_equal(reverse_transform_3, expected_reverse_transform) class TestRegexGenerator: @@ -68,18 +58,14 @@ def test_regexgenerator(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected_transformed = pd.DataFrame({ - 'username': ['a', 'b', 'c', 'd', 'e'] - }) + expected_transformed = pd.DataFrame({'username': ['a', 'b', 'c', 'd', 'e']}) expected_reverse_transformed = pd.DataFrame({ 'username': ['a', 'b', 'c', 'd', 'e'], 'id': ['AAAAA', 'AAAAB', 'AAAAC', 'AAAAD', 'AAAAE'], }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, 
expected_reverse_transformed) def test_with_custom_regex(self): """Test the ``RegexGenerator`` with a custom regex format.""" @@ -105,9 +91,7 @@ def test_with_custom_regex(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) def test_with_nans(self): """Test the ``RegexGenerator`` with a custom regex format and ``nans``.""" @@ -133,9 +117,7 @@ def test_with_nans(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) def test_data_length_bigger_than_regex(self): """Test the ``RegexGenerator`` with short regex and more data length.""" @@ -161,9 +143,7 @@ def test_data_length_bigger_than_regex(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) def test_input_data_bigger_than_data_length(self): """Test the ``RegexGenerator`` with input dataframe bigger than the learned data length.""" @@ -188,9 +168,7 @@ def test_input_data_bigger_than_data_length(self): 'username': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'], }) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) def test_called_multiple_times(self): """Test the ``RegexGenerator`` with short regex and called multiple times. 
@@ -223,37 +201,27 @@ def test_called_multiple_times(self): 'id': [1, 2, 3, 4, 5], 'username': ['a', 'b', 'c', 'a', 'b'], }) - pd.testing.assert_frame_equal( - first_reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(first_reverse_transform, expected_reverse_transformed) # Reverse Transform Again - second_reverse_transform = instance.reverse_transform( - transformed.head(1) - ) + second_reverse_transform = instance.reverse_transform(transformed.head(1)) # Assert Reverse Transform expected_reverse_transformed = pd.DataFrame({ 'id': [1], 'username': ['a'], }) - pd.testing.assert_frame_equal( - second_reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(second_reverse_transform, expected_reverse_transformed) # Reverse Transform Again - third_reverse_transform = instance.reverse_transform( - transformed.head(1) - ) + third_reverse_transform = instance.reverse_transform(transformed.head(1)) # Assert Reverse Transform expected_reverse_transformed = pd.DataFrame({ 'id': [1], 'username': ['b'], }) - pd.testing.assert_frame_equal( - third_reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(third_reverse_transform, expected_reverse_transformed) def test_called_multiple_times_enforce_uniqueness(self): """Test that calling multiple times with ``enforce_uniqueness`` returns unique values.""" @@ -263,26 +231,16 @@ def test_called_multiple_times_enforce_uniqueness(self): # Run transformed_data = generator.fit_transform(data, 'my_column') - first_reverse_transform = generator.reverse_transform( - transformed_data.head(3) - ) - second_reverse_transform = generator.reverse_transform( - transformed_data.head(5) - ) + first_reverse_transform = generator.reverse_transform(transformed_data.head(3)) + second_reverse_transform = generator.reverse_transform(transformed_data.head(5)) # Assert - expected_first_reverse_transform = pd.DataFrame({ - 'my_column': ['AAAAA', 'AAAAB', 'AAAAC'] - }) + 
expected_first_reverse_transform = pd.DataFrame({'my_column': ['AAAAA', 'AAAAB', 'AAAAC']}) expected_second_reverse_transform = pd.DataFrame({ 'my_column': ['AAAAD', 'AAAAE', 'AAAAF', 'AAAAG', 'AAAAH'] }) - pd.testing.assert_frame_equal( - first_reverse_transform, expected_first_reverse_transform - ) - pd.testing.assert_frame_equal( - second_reverse_transform, expected_second_reverse_transform - ) + pd.testing.assert_frame_equal(first_reverse_transform, expected_first_reverse_transform) + pd.testing.assert_frame_equal(second_reverse_transform, expected_second_reverse_transform) def test_pickled(self, tmpdir): """Test that ensures that ``RegexGenerator`` can be pickled.""" @@ -343,9 +301,7 @@ def test_with_many_possibilities(self): }) pd.testing.assert_frame_equal(transformed, expected_transformed) - pd.testing.assert_frame_equal( - reverse_transform, expected_reverse_transformed - ) + pd.testing.assert_frame_equal(reverse_transform, expected_reverse_transformed) def test_enforce_uniqueness_not_enough_values_categorical(self): """Test with enforce_uniqueness=True but insufficient regex values.""" @@ -360,9 +316,7 @@ def test_enforce_uniqueness_not_enough_values_categorical(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected = pd.DataFrame({ - 'id': ['id_a', 'id_b', 'id_a(0)', 'id_b(0)', 'id_a(1)'] - }) + expected = pd.DataFrame({'id': ['id_a', 'id_b', 'id_a(0)', 'id_b(0)', 'id_a(1)']}) pd.testing.assert_frame_equal(reverse_transform, expected) def test_enforce_uniqueness_not_enough_values_numerical(self): @@ -378,9 +332,7 @@ def test_enforce_uniqueness_not_enough_values_numerical(self): reverse_transform = instance.reverse_transform(transformed) # Assert - expected = pd.DataFrame( - {'id': ['2', '3', '4', '5', '6']}, dtype=object - ) + expected = pd.DataFrame({'id': ['2', '3', '4', '5', '6']}, dtype=object) pd.testing.assert_frame_equal(reverse_transform, expected) @@ -398,9 +350,7 @@ def test_end_to_end_scrambled(self): 
ht.detect_initial_config(customers) ht.update_sdtypes({'id': 'text'}) ht.update_transformers({ - 'id': RegexGenerator( - regex_format='id_[a-z]', generation_order='scrambled' - ) + 'id': RegexGenerator(regex_format='id_[a-z]', generation_order='scrambled') }) # Run @@ -409,7 +359,5 @@ def test_end_to_end_scrambled(self): reverse_transformed = ht.reverse_transform(transformed) # Assert - expected_id = pd.Series( - ['id_b', 'id_a', 'id_c', 'id_e', 'id_d'], name='id' - ) + expected_id = pd.Series(['id_b', 'id_a', 'id_c', 'id_e', 'id_d'], name='id') pd.testing.assert_series_equal(reverse_transformed['id'], expected_id) diff --git a/tests/performance/test_performance.py b/tests/performance/test_performance.py index 6aab074c..9dd87788 100644 --- a/tests/performance/test_performance.py +++ b/tests/performance/test_performance.py @@ -70,9 +70,7 @@ def validate_performance(performance, dataset_generator, should_assert=False): out.append(valid) if should_assert and not valid: - raise AssertionError( - f'{function} {metric}: {value} > {expected_metric}' - ) + raise AssertionError(f'{function} {metric}: {value} > {expected_metric}') return out @@ -92,9 +90,7 @@ def test_performance(transformer, dataset_generator): dataset_generator (rdt.tests.dataset.BaseDatasetGenerator): The dataset generator to performance tests against. """ - performance = evaluate_transformer_performance( - transformer, dataset_generator - ) + performance = evaluate_transformer_performance(transformer, dataset_generator) validate_performance(performance, dataset_generator, should_assert=True) @@ -150,9 +146,7 @@ def find_transformer_boundaries( Candidate values for each metric. 
""" results = [ - profile_transformer( - transformer, dataset_generator, transform_size, fit_size - ) + profile_transformer(transformer, dataset_generator, transform_size, fit_size) for _ in range(iterations) ] means = pd.DataFrame(results).mean(axis=0) diff --git a/tests/performance/tests/test_profiling.py b/tests/performance/tests/test_profiling.py index 4fc229b8..d07c53a2 100644 --- a/tests/performance/tests/test_profiling.py +++ b/tests/performance/tests/test_profiling.py @@ -42,9 +42,7 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): deepcopy_mock.return_value = transformer_mock.return_value # Run - profiling_results = profile_transformer( - transformer_mock.return_value, dataset_gen_mock, 100 - ) + profiling_results = profile_transformer(transformer_mock.return_value, dataset_gen_mock, 100) # Assert expected_output_columns = [ @@ -58,9 +56,7 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): assert len(deepcopy_mock.mock_calls) == 10 assert len(transformer_mock.return_value.fit.mock_calls) == 11 assert len(transformer_mock.return_value.transform.mock_calls) == 11 - assert ( - len(transformer_mock.return_value.reverse_transform.mock_calls) == 10 - ) + assert len(transformer_mock.return_value.reverse_transform.mock_calls) == 10 all( np.testing.assert_array_equal(call[1][0], np.ones(100)) @@ -83,20 +79,11 @@ def test_profile_transformer(deepcopy_mock, multiprocessor_mock): reverse_transform_call = process_mock.mock_calls[6] assert fit_call[2]['args'][0] == transformer_mock.return_value.fit - pd.testing.assert_frame_equal( - fit_call[2]['args'][1], pd.DataFrame({'test': np.ones(100)}) - ) - assert ( - transform_call[2]['args'][0] == transformer_mock.return_value.transform - ) + pd.testing.assert_frame_equal(fit_call[2]['args'][1], pd.DataFrame({'test': np.ones(100)})) + assert transform_call[2]['args'][0] == transformer_mock.return_value.transform pd.testing.assert_frame_equal( 
transform_call[2]['args'][1].reset_index(drop=True), pd.DataFrame({'test': np.ones(100)}), ) - assert ( - reverse_transform_call[2]['args'][0] - == transformer_mock.return_value.reverse_transform - ) - np.testing.assert_array_equal( - reverse_transform_call[2]['args'][1], np.zeros(100) - ) + assert reverse_transform_call[2]['args'][0] == transformer_mock.return_value.reverse_transform + np.testing.assert_array_equal(reverse_transform_call[2]['args'][1], np.zeros(100)) diff --git a/tests/unit/test___init__.py b/tests/unit/test___init__.py index 0a22f6a1..ee39b083 100644 --- a/tests/unit/test___init__.py +++ b/tests/unit/test___init__.py @@ -245,9 +245,7 @@ def test__find_addons_module_and_object(entry_points_mock, warning_mock): @patch('warnings.warn') @patch.object(rdt, 'entry_points') -def test__find_addons_missing_object( - entry_points_mock, warning_mock, mock_rdt -): +def test__find_addons_missing_object(entry_points_mock, warning_mock, mock_rdt): """Test incorrect add-on name generates a warning.""" # Setup bad_entry_point = Mock() diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py index 43a6b53e..f175eec8 100644 --- a/tests/unit/test_hyper_transformer.py +++ b/tests/unit/test_hyper_transformer.py @@ -111,9 +111,7 @@ def test__validate_field_transformers(self): with pytest.raises(ValueError, match=error_msg): ht._validate_field_transformers() - @patch( - 'rdt.hyper_transformer.HyperTransformer._validate_field_transformers' - ) + @patch('rdt.hyper_transformer.HyperTransformer._validate_field_transformers') def test___init__(self, validation_mock): """Test create new instance of HyperTransformer""" # Run @@ -263,14 +261,10 @@ def test__learn_config(self, get_default_transformer_mock): } assert isinstance(ht.field_transformers['integer'], FloatFormatter) - assert isinstance( - ht.field_transformers['float'], ClusterBasedNormalizer - ) + assert isinstance(ht.field_transformers['float'], ClusterBasedNormalizer) assert 
isinstance(ht.field_transformers['categorical'], LabelEncoder) assert isinstance(ht.field_transformers['bool'], LabelEncoder) - assert isinstance( - ht.field_transformers['datetime'], UnixTimestampEncoder - ) + assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder) assert isinstance(ht.field_transformers['pii'], AnonymizedFaker) assert isinstance(ht.field_transformers['text'], RegexGenerator) ht._unfit.assert_called_once() @@ -307,9 +301,7 @@ def test_detect_initial_config(self, logger_mock): 'col5': 'numerical', } - field_transformers = { - k: repr(v) for (k, v) in ht.field_transformers.items() - } + field_transformers = {k: repr(v) for (k, v) in ht.field_transformers.items()} assert field_transformers == { 'col1': 'FloatFormatter()', 'col2': 'UniformEncoder()', @@ -533,8 +525,7 @@ def test_validate_config_not_unique_field(self): # Run error_msg = re.escape( - 'Error: Invalid config. Please provide unique keys for the sdtypes ' - 'and transformers.' + 'Error: Invalid config. Please provide unique keys for the sdtypes ' 'and transformers.' ) with pytest.raises(InvalidConfigError, match=error_msg): HyperTransformer._validate_config(config) @@ -1068,9 +1059,7 @@ class DummyTransformer2(BaseTransformer): def __init__(self): super().__init__() - self.output_properties = { - 'is_null': {'sdtype': 'float', 'next_transformer': None} - } + self.output_properties = {'is_null': {'sdtype': 'float', 'next_transformer': None}} def _fit(self, _): ... 
@@ -1118,9 +1107,7 @@ def test_fit_warns_columns_in_data(self): ht = HyperTransformer() data = pd.DataFrame({'col': [1, np.nan, 3], 'col.is_null': [1, 2, 3]}) ht.detect_initial_config(data) - ht.field_transformers['col'] = FloatFormatter( - model_missing_values=True - ) + ht.field_transformers['col'] = FloatFormatter(model_missing_values=True) # Run and Assert warn_msg = re.escape( @@ -1397,9 +1384,7 @@ def test_fit_transform(self): expect_call_args_transform = pd.DataFrame() assert transformer.fit.call_count == expect_call_count_fit - pd.testing.assert_frame_equal( - transformer.fit.call_args[0][0], expect_call_args_fit - ) + pd.testing.assert_frame_equal(transformer.fit.call_args[0][0], expect_call_args_fit) assert transformer.transform.call_count == expect_call_count_transform pd.testing.assert_frame_equal( @@ -1471,9 +1456,7 @@ def test_create_anonymized_columns(self): ) random_element.columns = ['random_element'] random_element.output_columns = [] - random_element.set_random_state( - np.random.RandomState(42), 'reverse_transform' - ) + random_element.set_random_state(np.random.RandomState(42), 'reverse_transform') regex_id = RegexGenerator(regex_format='id_[0-9]') regex_id.reset_randomization() @@ -1545,23 +1528,15 @@ def test_create_anonymized_columns_num_rows_error(self): instance._modified_config = False # Run / Assert - error_msg = re.escape( - "Parameter 'num_rows' must be an integer greater than 0." 
- ) + error_msg = re.escape("Parameter 'num_rows' must be an integer greater than 0.") with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns( - instance, num_rows='a', column_names=['a'] - ) + HyperTransformer.create_anonymized_columns(instance, num_rows='a', column_names=['a']) with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns( - instance, num_rows=0, column_names=['a'] - ) + HyperTransformer.create_anonymized_columns(instance, num_rows=0, column_names=['a']) with pytest.raises(ValueError, match=error_msg): - HyperTransformer.create_anonymized_columns( - instance, num_rows=-1, column_names=['a'] - ) + HyperTransformer.create_anonymized_columns(instance, num_rows=-1, column_names=['a']) def test_create_anonymized_columns_invalid_columns(self): """Test ``create_anonymized_columns``. @@ -1595,9 +1570,7 @@ def test_create_anonymized_columns_invalid_columns(self): 'a list of valid column names.' ) with pytest.raises(InvalidConfigError, match=error_msg): - instance.create_anonymized_columns( - num_rows=10, column_names=['credit_card', 'id'] - ) + instance.create_anonymized_columns(num_rows=10, column_names=['credit_card', 'id']) def test_create_anonymized_columns_invalid_transformers(self): """Test ``create_anonymized_columǹs`` with transformers that do not generate data. 
@@ -1727,9 +1700,7 @@ def test_reverse_transform_subset_with_generators(self): reverse_transformed_data = self.get_transformed_data() float_transformer.reverse_transform = lambda x: x - int_transformer.reverse_transform.return_value = ( - reverse_transformed_data - ) + int_transformer.reverse_transform.return_value = reverse_transformed_data ht = HyperTransformer() ht._validate_config_exists = Mock() @@ -1744,14 +1715,10 @@ def test_reverse_transform_subset_with_generators(self): ht._input_columns = list(reverse_transformed_data.columns) # Run - reverse_transformed = ht.reverse_transform_subset( - reverse_transformed_data - ) + reverse_transformed = ht.reverse_transform_subset(reverse_transformed_data) # Assert - pd.testing.assert_frame_equal( - reverse_transformed, reverse_transformed_data - ) + pd.testing.assert_frame_equal(reverse_transformed, reverse_transformed_data) int_transformer.reverse_transform.assert_called_once() generator_transformer.reverse_transform.assert_not_called() @@ -1831,7 +1798,9 @@ def test_reverse_transform_with_subset(self): data = pd.DataFrame({'col1': [1, 2]}) # Run / Assert - expected_msg = 'You must provide a transformed dataset with all the columns from the original data.' + expected_msg = ( + 'You must provide a transformed dataset with all the columns from the original data.' + ) with pytest.raises(InvalidDataError, match=expected_msg): ht.reverse_transform(data) @@ -1892,9 +1861,7 @@ def test_reverse_transform_subset(self): ht.reverse_transform_subset(data) # Assert - ht._reverse_transform.assert_called_once_with( - data, prevent_subset=False - ) + ht._reverse_transform.assert_called_once_with(data, prevent_subset=False) def test_reverse_transform_subset_with_unknown_columns(self): """Test the ``reverse_transform_subset`` method with unknown columns. 
@@ -1982,17 +1949,11 @@ def test_update_transformers_by_sdtype_field_sdtypes_not_fitted(self): ht.update_transformers_by_sdtype('categorical', transformer) # Assert - assert isinstance( - ht.field_transformers['categorical_column'], LabelEncoder - ) - assert isinstance( - ht.field_transformers['numerical_column'], FloatFormatter - ) + assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) + assert isinstance(ht.field_transformers['numerical_column'], FloatFormatter) @patch('rdt.hyper_transformer.warnings') - def test_update_transformers_by_sdtype_field_sdtypes_fitted( - self, mock_warnings - ): + def test_update_transformers_by_sdtype_field_sdtypes_fitted(self, mock_warnings): """Test ``update_transformers_by_sdtype`` if ``HyperTransformer`` has aleady been fit. Ensure that the ``field_transformers`` that have the input ``sdtype`` have been updated and @@ -2033,9 +1994,7 @@ def test_update_transformers_by_sdtype_field_sdtypes_fitted( ] mock_warnings.warn.assert_has_calls(expected_warnings_msgs) - assert isinstance( - ht.field_transformers['categorical_column'], LabelEncoder - ) + assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error( self, @@ -2055,9 +2014,7 @@ def test_update_transformers_by_sdtype_unsupported_sdtype_raises_error( # Run / Assert expected_msg = "Invalid transformer name 'LabelEncoder' for the 'fake_type' sdtype." with pytest.raises(InvalidConfigError, match=expected_msg): - ht.update_transformers_by_sdtype( - 'fake_type', transformer_name='LabelEncoder' - ) + ht.update_transformers_by_sdtype('fake_type', transformer_name='LabelEncoder') def test_update_transformers_by_sdtype_bad_transformer_raises_error(self): """Test ``update_transformers_by_sdtype`` with an object that isn't a transformer instance. 
@@ -2080,9 +2037,7 @@ def test_update_transformers_by_sdtype_bad_transformer_raises_error(self): } # Run / Assert - expected_msg = ( - 'Invalid transformer. Please input an rdt transformer object.' - ) + expected_msg = 'Invalid transformer. Please input an rdt transformer object.' with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', Mock()) @@ -2109,9 +2064,7 @@ def test_update_transformers_by_sdtype_mismatched_sdtype_raises_error( } # Run / Assert - expected_msg = ( - "The transformer you've assigned is incompatible with the sdtype." - ) + expected_msg = "The transformer you've assigned is incompatible with the sdtype." with pytest.raises(InvalidConfigError, match=expected_msg): ht.update_transformers_by_sdtype('categorical', FloatFormatter()) @@ -2137,9 +2090,7 @@ def test_update_transformers_by_sdtype_incorrect_transformer_name(self): # Run and Assert err_msg = "Invalid transformer name 'Transformer' for the 'categorical' sdtype." with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype( - 'categorical', transformer_name='Transformer' - ) + ht.update_transformers_by_sdtype('categorical', transformer_name='Transformer') def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer( self, @@ -2152,9 +2103,7 @@ def test_update_transformers_by_sdtype_incorrect_sdtype_for_transformer( # Run and Assert err_msg = "Invalid transformer name 'LabelEncoder' for the 'numerical' sdtype." 
with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype( - 'numerical', transformer_name='LabelEncoder' - ) + ht.update_transformers_by_sdtype('numerical', transformer_name='LabelEncoder') def test_update_transformers_by_sdtype_incorrect_sdtype(self): """When ``sdtype`` is invalid, it should crash.""" @@ -2163,13 +2112,9 @@ def test_update_transformers_by_sdtype_incorrect_sdtype(self): ht.field_sdtypes = {'doesnt matter'} # Run and Assert - err_msg = ( - "Invalid transformer name 'LabelEncoder' for the 'bla' sdtype." - ) + err_msg = "Invalid transformer name 'LabelEncoder' for the 'bla' sdtype." with pytest.raises(InvalidConfigError, match=err_msg): - ht.update_transformers_by_sdtype( - 'bla', transformer_name='LabelEncoder' - ) + ht.update_transformers_by_sdtype('bla', transformer_name='LabelEncoder') def test_update_transformers_by_sdtype_incorrect_transformer_parameters( self, @@ -2180,9 +2125,7 @@ def test_update_transformers_by_sdtype_incorrect_transformer_parameters( ht.field_sdtypes = {'doesnt matter'} # Run and Assert - err_msg = re.escape( - "Invalid parameters ('false', 'order') for the 'LabelEncoder'." 
- ) + err_msg = re.escape("Invalid parameters ('false', 'order') for the 'LabelEncoder'.") with pytest.raises(TransformerInputError, match=err_msg): ht.update_transformers_by_sdtype( 'categorical', @@ -2213,21 +2156,15 @@ def test_update_transformers_by_sdtype_transformer_name(self): } # Run - ht.update_transformers_by_sdtype( - 'categorical', transformer_name='LabelEncoder' - ) + ht.update_transformers_by_sdtype('categorical', transformer_name='LabelEncoder') # Assert assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance( - ht.field_transformers['categorical_column'], LabelEncoder - ) + assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) @patch('rdt.hyper_transformer.warnings') - def test_update_transformers_by_sdtype_transformer_name_and_transformer( - self, mock_warning - ): + def test_update_transformers_by_sdtype_transformer_name_and_transformer(self, mock_warning): """Test setting ``transformer_name`` ignores ``transformer`` parameter. 
Expect the ``transformer`` parameter to be ignored, a warning to be raised, @@ -2261,9 +2198,7 @@ def test_update_transformers_by_sdtype_transformer_name_and_transformer( mock_warning.warn.assert_called_once_with(expected_msg, FutureWarning) assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance( - ht.field_transformers['categorical_column'], LabelEncoder - ) + assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) def test_update_transformers_by_sdtype_with_transformer_name_transformer_parameters( self, @@ -2295,13 +2230,8 @@ def test_update_transformers_by_sdtype_with_transformer_name_transformer_paramet # Assert assert len(ht.field_transformers) == 2 assert ht.field_transformers['numerical_column'] == ff - assert isinstance( - ht.field_transformers['categorical_column'], LabelEncoder - ) - assert ( - ht.field_transformers['categorical_column'].order_by - == 'alphabetical' - ) + assert isinstance(ht.field_transformers['categorical_column'], LabelEncoder) + assert ht.field_transformers['categorical_column'].order_by == 'alphabetical' def test_create_multi_column_fields(self): """Test ``_create_multi_column_fields``.""" @@ -2447,9 +2377,7 @@ def test_update_transformers_fitted(self, mock_warnings): mock_warnings.warn.assert_called_once_with(expected_message) assert instance.field_transformers['my_column'] == transformer - instance._validate_transformers.assert_called_once_with( - column_name_to_transformer - ) + instance._validate_transformers.assert_called_once_with(column_name_to_transformer) def test__update_transformers_multi_column_valid(self): """Test ``_update_multi_column_transformer`` with a valid multi-column transformer.""" @@ -2612,9 +2540,7 @@ def side_effect(column): mock_remove_column_in_multi_column_fields = Mock() mock_remove_column_in_multi_column_fields.side_effect = side_effect - ht._remove_column_in_multi_column_fields = ( - 
mock_remove_column_in_multi_column_fields - ) + ht._remove_column_in_multi_column_fields = mock_remove_column_in_multi_column_fields # Run ht.update_transformers(column_name_to_transformer) @@ -2665,9 +2591,7 @@ def test_update_transformers_not_fitted(self, mock_warnings): # Assert mock_warnings.warn.assert_not_called() assert instance.field_transformers['my_column'] == transformer - instance._validate_transformers.assert_called_once_with( - column_name_to_transformer - ) + instance._validate_transformers.assert_called_once_with(column_name_to_transformer) def test_update_transformers_no_field_transformers(self): """Test update transformers. @@ -2745,9 +2669,7 @@ def test_update_transformers_mismatch_sdtypes(self): with pytest.raises(InvalidConfigError, match=err_msg): instance.update_transformers(column_name_to_transformer) - instance._validate_transformers.assert_called_once_with( - column_name_to_transformer - ) + instance._validate_transformers.assert_called_once_with(column_name_to_transformer) def test_update_transformers_transformer_is_none(self): """Test update transformers. @@ -2782,9 +2704,7 @@ def test_update_transformers_transformer_is_none(self): # Assert assert instance.field_transformers == {'my_column': None} - instance._validate_transformers.assert_called_once_with( - column_name_to_transformer - ) + instance._validate_transformers.assert_called_once_with(column_name_to_transformer) def test_update_transformers_column_doesnt_exist_in_config(self): """Test update transformers. @@ -3005,9 +2925,7 @@ def test_update_sdtypes_invalid_columns(self): @patch('rdt.hyper_transformer.LOGGER') @patch('rdt.hyper_transformer.get_default_transformer') @patch('rdt.hyper_transformer.warnings') - def test_update_sdtypes_different_sdtype( - self, mock_warnings, default_mock, mock_logger - ): + def test_update_sdtypes_different_sdtype(self, mock_warnings, default_mock, mock_logger): """Test ``update_sdtypes``. 
Ensure that the method properly updates the ``self.field_sdtypes`` and changes the @@ -3053,9 +2971,7 @@ def test_update_sdtypes_different_sdtype( @patch('rdt.hyper_transformer.LOGGER') @patch('rdt.hyper_transformer.warnings') - def test_update_sdtypes_different_sdtype_than_transformer( - self, mock_warnings, mock_logger - ): + def test_update_sdtypes_different_sdtype_than_transformer(self, mock_warnings, mock_logger): """Test ``update_sdtypes``. Ensure that the method properly updates the ``self.field_sdtypes`` but doesn't change @@ -3599,9 +3515,7 @@ def test__fit_field_transformer_multi_column_field_not_ready( transformer1 = Mock() transformer2 = Mock() transformer1.get_output_columns.return_value = ['a.out1'] - transformer1.get_next_transformers.return_value = { - ('a.out1', 'b.out1'): transformer2 - } + transformer1.get_next_transformers.return_value = {('a.out1', 'b.out1'): transformer2} transformer1.transform.return_value = transformed_data1 ht = HyperTransformer() ht._multi_column_fields = Mock() diff --git a/tests/unit/transformers/pii/test_anonymization.py b/tests/unit/transformers/pii/test_anonymization.py index a357034c..143c018d 100644 --- a/tests/unit/transformers/pii/test_anonymization.py +++ b/tests/unit/transformers/pii/test_anonymization.py @@ -31,9 +31,7 @@ def test__detect_provider_name(self): assert state_provider == 'address.en_US' @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') - def test_get_anonymized_transformer_with_existing_sdtype( - self, mock_anonymized_faker - ): + def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_faker): """Test the ``get_anonymized_transformer`` method. 
Test that when calling with an existing ``sdtype`` / ``function_name`` from the @@ -71,9 +69,7 @@ def test_get_anonymized_transformer_with_existing_sdtype( ) @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') - def test_get_anonymized_transformer_with_custom_sdtype( - self, mock_anonymized_faker - ): + def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_faker): """Test the ``get_anonymized_transformer`` method. Test that when calling with a custom ``sdtype`` / ``function_name`` that does not belong diff --git a/tests/unit/transformers/pii/test_anonymizer.py b/tests/unit/transformers/pii/test_anonymizer.py index ae96688f..7710d334 100644 --- a/tests/unit/transformers/pii/test_anonymizer.py +++ b/tests/unit/transformers/pii/test_anonymizer.py @@ -23,9 +23,7 @@ class TestAnonymizedFaker: @patch('rdt.transformers.pii.anonymizer.faker') @patch('rdt.transformers.pii.anonymizer.getattr') @patch('rdt.transformers.pii.anonymizer.attrgetter') - def test_check_provider_function_baseprovider( - self, mock_attrgetter, mock_getattr, mock_faker - ): + def test_check_provider_function_baseprovider(self, mock_attrgetter, mock_getattr, mock_faker): """Test that ``getattr`` is being called with ``BaseProvider`` and ``function_name``. 
Mock: @@ -37,15 +35,11 @@ def test_check_provider_function_baseprovider( mock_getattr.side_effect = ['provider', None] # Run - AnonymizedFaker.check_provider_function( - 'BaseProvider', 'function_name' - ) + AnonymizedFaker.check_provider_function('BaseProvider', 'function_name') # Assert assert mock_attrgetter.call_args_list[0] == call('BaseProvider') - assert mock_getattr.call_args_list[0] == call( - 'module', 'function_name' - ) + assert mock_getattr.call_args_list[0] == call('module', 'function_name') @patch('rdt.transformers.pii.anonymizer.faker') @patch('rdt.transformers.pii.anonymizer.getattr') @@ -64,16 +58,12 @@ def test_check_provider_function_other_providers( mock_getattr.side_effect = ['provider_class', None] # Run - AnonymizedFaker.check_provider_function( - 'provider_name', 'function_name' - ) + AnonymizedFaker.check_provider_function('provider_name', 'function_name') # Assert assert mock_attrgetter.call_args_list[0] == call('provider_name') assert mock_getattr.call_args_list[0] == call('module', 'Provider') - assert mock_getattr.call_args_list[1] == call( - 'provider_class', 'function_name' - ) + assert mock_getattr.call_args_list[1] == call('provider_class', 'function_name') def test_check_provider_function_raise_attribute_error(self): """Test that ``check_provider_function`` raises an ``AttributeError``. @@ -90,9 +80,7 @@ def test_check_provider_function_raise_attribute_error(self): # Run with pytest.raises(TransformerProcessingError, match=expected_message): - AnonymizedFaker.check_provider_function( - 'TestProvider', 'TestFunction' - ) + AnonymizedFaker.check_provider_function('TestProvider', 'TestFunction') def test__function_cardinality_rule_none(self): """Test that ``_function`` does not use ``faker.unique``. 
@@ -274,9 +262,7 @@ def test__check_locales(self, mock_warnings, mock_importlib): @patch('rdt.transformers.pii.anonymizer.importlib') @patch('rdt.transformers.pii.anonymizer.warnings') - def test__check_locales_provider_ending_with_locale( - self, mock_warnings, mock_importlib - ): + def test__check_locales_provider_ending_with_locale(self, mock_warnings, mock_importlib): """Test that check locales does not warn the user if the provider ends with the locale. Mock: @@ -298,9 +284,7 @@ def test__check_locales_provider_ending_with_locale( @patch('rdt.transformers.pii.anonymizer.importlib') @patch('rdt.transformers.pii.anonymizer.warnings') - def test__check_locales_provider_ending_with_wrong_locale( - self, mock_warnings, mock_importlib - ): + def test__check_locales_provider_ending_with_wrong_locale(self, mock_warnings, mock_importlib): """Test that check locales warns the user. If the provider ends with the given locale but is not separated by a dot this will warn @@ -330,9 +314,7 @@ def test__check_locales_provider_ending_with_wrong_locale( mock_warnings.warn.assert_called_once_with(expected_message) @patch('rdt.transformers.pii.anonymizer.faker') - @patch( - 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' - ) + @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') def test___init__default(self, mock_check_provider_function, mock_faker): """Test the default instantiation of the transformer. 
@@ -358,9 +340,7 @@ def test___init__default(self, mock_check_provider_function, mock_faker): instance = AnonymizedFaker() # Assert - mock_check_provider_function.assert_called_once_with( - 'BaseProvider', 'lexify' - ) + mock_check_provider_function.assert_called_once_with('BaseProvider', 'lexify') assert instance.provider_name == 'BaseProvider' assert instance.function_name == 'lexify' assert instance.function_kwargs == {} @@ -386,13 +366,9 @@ def test___init__error_missing_value_generation(self): AnonymizedFaker(missing_value_generation='invalid') @patch('rdt.transformers.pii.anonymizer.faker') - @patch( - 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' - ) + @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') @patch('rdt.transformers.pii.anonymizer.warnings') - def test___init__custom( - self, mock_warnings, mock_check_provider_function, mock_faker - ): + def test___init__custom(self, mock_warnings, mock_check_provider_function, mock_faker): """Test the instantiation of the transformer with custom parameters. Test that the transformer can be instantiated with a custom provider and function, and @@ -424,9 +400,7 @@ def test___init__custom( ) # Assert - mock_check_provider_function.assert_called_once_with( - 'credit_card', 'credit_card_full' - ) + mock_check_provider_function.assert_called_once_with('credit_card', 'credit_card_full') assert instance.provider_name == 'credit_card' assert instance.function_name == 'credit_card_full' assert instance.function_kwargs == {'type': 'visa'} @@ -451,13 +425,10 @@ def test___init__no_function_name(self): """ # Run / Assert expected_message = ( - 'Please specify the function name to use from the ' - "'credit_card' provider." + 'Please specify the function name to use from the ' "'credit_card' provider." 
) with pytest.raises(TransformerInputError, match=expected_message): - AnonymizedFaker( - provider_name='credit_card', locales=['en_US', 'fr_FR'] - ) + AnonymizedFaker(provider_name='credit_card', locales=['en_US', 'fr_FR']) @patch('rdt.transformers.pii.anonymizer.issubclass') @patch('rdt.transformers.pii.anonymizer.BaseTransformer') @@ -502,9 +473,7 @@ def test_get_supported_sdtypes(self, base_mock, issubclass_mock): 'text', ]) - @patch( - 'rdt.transformers.pii.anonymizer.BaseTransformer.reset_randomization' - ) + @patch('rdt.transformers.pii.anonymizer.BaseTransformer.reset_randomization') @patch('rdt.transformers.pii.anonymizer.faker') def test_reset_randomization(self, mock_faker, mock_base_reset): """Test that this function creates a new faker instance.""" @@ -544,9 +513,7 @@ def test__fit(self): # Assert assert transformer.data_length == 5 - assert transformer.output_properties == { - None: {'next_transformer': None} - } + assert transformer.output_properties == {None: {'next_transformer': None}} assert transformer._nan_frequency == 0.4 assert transformer._data_cardinality == 3 @@ -616,9 +583,7 @@ def test__reverse_transform_match_cardinality(self): AnonymizedFaker._reverse_transform(instance, None) # Assert - instance._reverse_transform_cardinality_rule_match.assert_called_once_with( - 3 - ) + instance._reverse_transform_cardinality_rule_match.assert_called_once_with(3) def test__reverse_transform_cardinality_rule_match_only_nans(self): """Test it with only nans.""" @@ -718,9 +683,7 @@ def test__reverse_transform_not_enough_unique_values(self): - Raises an error. 
""" # Setup - instance = AnonymizedFaker( - 'misc', 'boolean', cardinality_rule='unique' - ) + instance = AnonymizedFaker('misc', 'boolean', cardinality_rule='unique') data = pd.Series(['a', 'b', 'c', 'd']) instance.columns = ['a'] @@ -819,12 +782,8 @@ def test___getstate__(self, mock_warnings): mock_warnings.warn.assert_called_once_with(expected_warning_msg) @patch('rdt.transformers.pii.anonymizer.faker') - @patch( - 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' - ) - def test___init__super_attrs( - self, mock_check_provider_function, mock_faker - ): + @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') + def test___init__super_attrs(self, mock_check_provider_function, mock_faker): """Test that initializing an instance is calling properly the ``super`` class. Mock: @@ -857,9 +816,7 @@ def test___init__super_attrs( mock_faker.Faker.assert_called_once_with(None) @patch('rdt.transformers.pii.anonymizer.faker') - @patch( - 'rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function' - ) + @patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function') def test___init__custom(self, mock_check_provider_function, mock_faker): """Test the instantiation of the transformer with custom parameters. 
@@ -893,9 +850,7 @@ def test___init__custom(self, mock_check_provider_function, mock_faker): # Assert assert instance._mapping_dict == {} assert instance._reverse_mapping_dict == {} - mock_check_provider_function.assert_called_once_with( - 'credit_card', 'credit_card_full' - ) + mock_check_provider_function.assert_called_once_with('credit_card', 'credit_card_full') assert instance.provider_name == 'credit_card' assert instance.function_name == 'credit_card_full' assert instance.function_kwargs == {'type': 'visa'} @@ -1030,9 +985,7 @@ def test__transform(self): result = instance._transform(data) # Assert - pd.testing.assert_series_equal( - result, pd.Series(['z', 'y', 'x'], name='col') - ) + pd.testing.assert_series_equal(result, pd.Series(['z', 'y', 'x'], name='col')) def test__transform_with_new_values(self): """Test the ``_transform`` method. @@ -1095,6 +1048,4 @@ def test__reverse_transform(self): reverse_transformed = instance._reverse_transform(data) # Assert - pd.testing.assert_series_equal( - reverse_transformed, pd.Series(['a', 'b', 'c'], name='col') - ) + pd.testing.assert_series_equal(reverse_transformed, pd.Series(['a', 'b', 'c'], name='col')) diff --git a/tests/unit/transformers/test__validators.py b/tests/unit/transformers/test__validators.py index ad7250aa..0b1f8750 100644 --- a/tests/unit/transformers/test__validators.py +++ b/tests/unit/transformers/test__validators.py @@ -16,9 +16,7 @@ class TestBaseValidator: 'rdt.transformers._validators.BaseValidator.SUPPORTED_SDTYPES', ['numerical'], ) - @patch( - 'rdt.transformers._validators.BaseValidator.VALIDATION_TYPE', 'Base' - ) + @patch('rdt.transformers._validators.BaseValidator.VALIDATION_TYPE', 'Base') def test_validate_supported_sdtypes(self): """Test ``_validate_supported_sdtypes`` method.""" # Setup @@ -41,9 +39,7 @@ def test_validate_supported_sdtypes(self): # Run and Assert BaseValidator._validate_supported_sdtypes(columns_to_sdtypes_valid) with pytest.raises(TransformerInputError, 
match=expected_message): - BaseValidator._validate_supported_sdtypes( - columns_to_sdtypes_invalid - ) + BaseValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) def test_validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -107,12 +103,8 @@ def test__validate_number_columns(self): 'Address transformers takes up to 7 columns to transform. Please provide address' ' data with valid fields.' ) - with pytest.raises( - TransformerInputError, match=re.escape(expected_message) - ): - AddressValidator._validate_number_columns( - column_to_sdtypes_invalid - ) + with pytest.raises(TransformerInputError, match=re.escape(expected_message)): + AddressValidator._validate_number_columns(column_to_sdtypes_invalid) def test__validate_uniqueness_sdtype(self): """Test ``_validate_uniqueness_sdtype`` method.""" @@ -137,9 +129,7 @@ def test__validate_uniqueness_sdtype(self): 'Your address data cannot have duplicate fields.' ) with pytest.raises(TransformerInputError, match=expected_message): - AddressValidator._validate_uniqueness_sdtype( - columns_to_sdtypes_invalid - ) + AddressValidator._validate_uniqueness_sdtype(columns_to_sdtypes_invalid) def test__validate_supported_sdtype(self): """Test ``_validate_supported_sdtype`` method.""" @@ -163,9 +153,7 @@ def test__validate_supported_sdtype(self): 'Please provide a column that is compatible with Address data.' 
) with pytest.raises(TransformerInputError, match=expected_message): - AddressValidator._validate_supported_sdtypes( - columns_to_sdtypes_invalid - ) + AddressValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) def test__validate_administrative_unit(self): """Test ``_validate_administrative_unit`` method.""" @@ -180,20 +168,14 @@ def test__validate_administrative_unit(self): } # Run and Assert - AddressValidator._validate_administrative_unit( - columns_to_sdtypes_valid - ) + AddressValidator._validate_administrative_unit(columns_to_sdtypes_valid) expected_message = ( "The AddressValidator can have up to 1 column with sdtype 'state'" " or 'administrative_unit'. Please provide address data with valid fields." ) - with pytest.raises( - TransformerInputError, match=re.escape(expected_message) - ): - AddressValidator._validate_administrative_unit( - columns_to_sdtypes_invalid - ) + with pytest.raises(TransformerInputError, match=re.escape(expected_message)): + AddressValidator._validate_administrative_unit(columns_to_sdtypes_invalid) def test__validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -211,30 +193,22 @@ def test__validate_sdtypes(self): AddressValidator.validate_sdtypes(columns_to_sdtypes) # Assert - AddressValidator._validate_number_columns.assert_called_once_with( - columns_to_sdtypes - ) - AddressValidator._validate_uniqueness_sdtype.assert_called_once_with( - columns_to_sdtypes - ) - AddressValidator._validate_supported_sdtypes.assert_called_once_with( - columns_to_sdtypes - ) - AddressValidator._validate_administrative_unit.assert_called_once_with( - columns_to_sdtypes - ) + AddressValidator._validate_number_columns.assert_called_once_with(columns_to_sdtypes) + AddressValidator._validate_uniqueness_sdtype.assert_called_once_with(columns_to_sdtypes) + AddressValidator._validate_supported_sdtypes.assert_called_once_with(columns_to_sdtypes) + 
AddressValidator._validate_administrative_unit.assert_called_once_with(columns_to_sdtypes) def test__validate_imports_without_address_module(self): """Test ``validate_imports`` when address module doesn't exist.""" # Run and Assert - expected_message = 'You must have SDV Enterprise with the address add-on to use the address features' + expected_message = ( + 'You must have SDV Enterprise with the address add-on to use the address features' + ) with pytest.raises(ImportError, match=expected_message): AddressValidator.validate_imports() @patch('rdt.transformers._validators.importlib.import_module') - def test__validate_imports_without_premium_features( - self, mock_import_module - ): + def test__validate_imports_without_premium_features(self, mock_import_module): """Test ``validate_imports`` when the user doesn't have the transformers.""" # Setup mock_address = Mock() @@ -243,7 +217,9 @@ def test__validate_imports_without_premium_features( mock_import_module.return_value = mock_address # Run and Assert - expected_message = 'You must have SDV Enterprise with the address add-on to use the address features' + expected_message = ( + 'You must have SDV Enterprise with the address add-on to use the address features' + ) with pytest.raises(ImportError, match=expected_message): AddressValidator.validate_imports() @@ -269,9 +245,7 @@ def test__validate_uniqueness_sdtype(self): 'Please provide GPS data with valid fields.' ) with pytest.raises(TransformerInputError, match=expected_message): - GPSValidator._validate_uniqueness_sdtype( - columns_to_sdtypes_invalid - ) + GPSValidator._validate_uniqueness_sdtype(columns_to_sdtypes_invalid) def test__validate_supported_sdtype(self): """Test ``_validate_supported_sdtype`` method.""" @@ -293,9 +267,7 @@ def test__validate_supported_sdtype(self): 'Please provide a column that is compatible with GPS data.' 
) with pytest.raises(TransformerInputError, match=expected_message): - GPSValidator._validate_supported_sdtypes( - columns_to_sdtypes_invalid - ) + GPSValidator._validate_supported_sdtypes(columns_to_sdtypes_invalid) def test__validate_sdtypes(self): """Test ``validate_sdtypes`` method.""" @@ -311,24 +283,20 @@ def test__validate_sdtypes(self): GPSValidator.validate_sdtypes(columns_to_sdtypes) # Assert - GPSValidator._validate_uniqueness_sdtype.assert_called_once_with( - columns_to_sdtypes - ) - GPSValidator._validate_supported_sdtypes.assert_called_once_with( - columns_to_sdtypes - ) + GPSValidator._validate_uniqueness_sdtype.assert_called_once_with(columns_to_sdtypes) + GPSValidator._validate_supported_sdtypes.assert_called_once_with(columns_to_sdtypes) def test_validate_import_gps_transformers_without_gps_module(self): """Test ``validate_imports`` when gps module doesn't exist.""" # Run and Assert - expected_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features' + expected_message = ( + 'You must have SDV Enterprise with the gps add-on to use the GPS features' + ) with pytest.raises(ImportError, match=expected_message): GPSValidator.validate_imports() @patch('rdt.transformers._validators.importlib.import_module') - def test_validate_import_gps_transformers_without_premium_features( - self, mock_import_module - ): + def test_validate_import_gps_transformers_without_premium_features(self, mock_import_module): """Test ``validate_imports`` when the user doesn't have the transformers.""" # Setup mock_gps = Mock() @@ -338,6 +306,8 @@ def test_validate_import_gps_transformers_without_premium_features( mock_import_module.return_value = mock_gps # Run and Assert - expected_message = 'You must have SDV Enterprise with the gps add-on to use the GPS features' + expected_message = ( + 'You must have SDV Enterprise with the gps add-on to use the GPS features' + ) with pytest.raises(ImportError, match=expected_message): 
GPSValidator.validate_imports() diff --git a/tests/unit/transformers/test_base.py b/tests/unit/transformers/test_base.py index 7e3aa185..18aa8ac5 100644 --- a/tests/unit/transformers/test_base.py +++ b/tests/unit/transformers/test_base.py @@ -46,12 +46,8 @@ def test_set_random_states(mock_numpy): call(initial_state_value), call(first_state), ]) - my_function.assert_called_once_with( - mock_numpy.random.RandomState.return_value, 'fit' - ) - mock_numpy.random.RandomState.return_value.set_state.assert_called_with( - second_state - ) + my_function.assert_called_once_with(mock_numpy.random.RandomState.return_value, 'fit') + mock_numpy.random.RandomState.return_value.set_state.assert_called_with(second_state) @patch('rdt.transformers.base.set_random_states') @@ -74,9 +70,7 @@ def test_random_state(mock_set_random_states): wrapped_function(instance) # Assert - mock_set_random_states.assert_called_once_with( - {}, 'name', mock_set_random_state - ) + mock_set_random_states.assert_called_once_with({}, 'name', mock_set_random_state) my_function.assert_called_once() @@ -176,8 +170,7 @@ def test_get_input_sdtype_raises_warning(self, mock_get_supported_sdtypes): # Run expected_message = ( - '`get_input_sdtype` is deprecated. Please use ' - '`get_supported_sdtypes` instead.' + '`get_input_sdtype` is deprecated. Please use ' '`get_supported_sdtypes` instead.' 
) with pytest.warns(FutureWarning, match=expected_message): input_sdtype = BaseTransformer.get_input_sdtype() @@ -260,12 +253,8 @@ def test__set_missing_value_generation(self): # Run BaseTransformer._set_missing_value_generation(instance_none, None) - BaseTransformer._set_missing_value_generation( - instance_random, 'random' - ) - BaseTransformer._set_missing_value_generation( - instance_from_column, 'from_column' - ) + BaseTransformer._set_missing_value_generation(instance_random, 'random') + BaseTransformer._set_missing_value_generation(instance_from_column, 'from_column') # Assert assert instance_none.missing_value_generation is None @@ -326,9 +315,7 @@ def test__set_model_missing_values_true(self, mock_warnings): ), FutureWarning, ) - instance._set_missing_value_generation.assert_called_once_with( - 'from_column' - ) + instance._set_missing_value_generation.assert_called_once_with('from_column') @patch('rdt.transformers.base.warnings') def test__set_model_missing_values_false(self, mock_warnings): @@ -347,9 +334,7 @@ def test__set_model_missing_values_false(self, mock_warnings): ), FutureWarning, ) - instance._set_missing_value_generation.assert_called_once_with( - 'random' - ) + instance._set_missing_value_generation.assert_called_once_with('random') def test___repr___no_parameters(self): """Test that the ``__str__`` method returns the class name. 
@@ -446,9 +431,7 @@ class Dummy(BaseTransformer): column_prefix = 'column_name' def __init__(self): - self.output_properties = { - None: {'next_transformer': transformer} - } + self.output_properties = {None: {'next_transformer': transformer}} dummy_transformer = Dummy() @@ -742,14 +725,10 @@ def test__add_columns_to_data_series(self): columns_data = pd.Series([7, 8, 9], name='c') # Run - result = BaseTransformer._add_columns_to_data( - data, columns_data, columns - ) + result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame( - {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1] - ) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_dataframe(self): @@ -779,9 +758,7 @@ def test__add_columns_to_data_dataframe(self): columns_data = pd.DataFrame({'c': [7, 8, 9], 'd': [10, 11, 12]}) # Run - result = BaseTransformer._add_columns_to_data( - data, columns_data, columns - ) + result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert expected = pd.DataFrame( @@ -822,14 +799,10 @@ def test__add_columns_to_data_1d_array(self): columns_data = np.array([7, 8, 9], dtype=np.int64) # Run - result = BaseTransformer._add_columns_to_data( - data, columns_data, columns - ) + result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame( - {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1] - ) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_2d_array(self): @@ -853,14 +826,10 @@ def test__add_columns_to_data_2d_array(self): columns_data = np.array([[7, 1], [8, 5], [9, 9]], dtype=np.int64) # Run - result = BaseTransformer._add_columns_to_data( - data, columns_data, columns - ) + result = 
BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert - expected = pd.DataFrame( - {'a': [1, 2, 3], 'b': [7, 8, 9], 'c': [1, 5, 9]}, index=[2, 0, 1] - ) + expected = pd.DataFrame({'a': [1, 2, 3], 'b': [7, 8, 9], 'c': [1, 5, 9]}, index=[2, 0, 1]) pd.testing.assert_frame_equal(result, expected) def test__add_columns_to_data_none(self): @@ -881,9 +850,7 @@ def test__add_columns_to_data_none(self): columns_data = None # Run - result = BaseTransformer._add_columns_to_data( - data, columns_data, columns - ) + result = BaseTransformer._add_columns_to_data(data, columns_data, columns) # Assert expected = pd.DataFrame( @@ -1040,9 +1007,7 @@ def _fit(self, data): # Assert expected_data = pd.Series([1, 2, 3], name='a') assert dummy_transformer.columns == ['a'] - pd.testing.assert_series_equal( - dummy_transformer._passed_data, expected_data - ) + pd.testing.assert_series_equal(dummy_transformer._passed_data, expected_data) assert dummy_transformer.column_prefix == 'a' assert dummy_transformer.output_columns == ['a', 'a.is_null'] @@ -1082,9 +1047,7 @@ class Dummy(BaseTransformer): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state( - np.random.RandomState(42), 'transform' - ) + dummy_transformer.set_random_state(np.random.RandomState(42), 'transform') transformed_data = dummy_transformer.transform(data) # Assert @@ -1127,9 +1090,7 @@ def _transform(self, data): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state( - np.random.RandomState(42), 'transform' - ) + dummy_transformer.set_random_state(np.random.RandomState(42), 'transform') transformed_data = dummy_transformer.transform(data) # Assert @@ -1137,9 +1098,7 @@ def _transform(self, data): 'a': [1, 2, 3], 'b': [4, 5, 6], }) - pd.testing.assert_frame_equal( - dummy_transformer._passed_data, expected_passed - ) + pd.testing.assert_frame_equal(dummy_transformer._passed_data, expected_passed) expected_transformed = pd.DataFrame({ 'c': [7, 8, 9], @@ -1215,9 +1174,7 @@ 
class Dummy(BaseTransformer): dummy_transformer = Dummy() # Run - dummy_transformer.set_random_state( - np.random.RandomState(42), 'reverse_transform' - ) + dummy_transformer.set_random_state(np.random.RandomState(42), 'reverse_transform') transformed_data = dummy_transformer.reverse_transform(data) # Assert @@ -1248,9 +1205,7 @@ def _reverse_transform(self, data): # Run dummy_transformer = Dummy() - dummy_transformer.set_random_state( - np.random.RandomState(42), 'reverse_transform' - ) + dummy_transformer.set_random_state(np.random.RandomState(42), 'reverse_transform') transformed_data = dummy_transformer.reverse_transform(data) # Assert @@ -1258,9 +1213,7 @@ def _reverse_transform(self, data): 'a': [1, 2, 3], 'b.is_null': [4, 5, 6], }) - pd.testing.assert_frame_equal( - dummy_transformer._passed_data, expected_passed - ) + pd.testing.assert_frame_equal(dummy_transformer._passed_data, expected_passed) expected_transformed = pd.DataFrame({ 'c': [7, 8, 9], @@ -1411,13 +1364,9 @@ def test__validate_columns_to_sdtypes(self): 'b': 'categorical', 'd': 'boolean', } - expected_error_msg = re.escape( - 'Columns (d) are not present in the data.' 
- ) + expected_error_msg = re.escape('Columns (d) are not present in the data.') with pytest.raises(ValueError, match=expected_error_msg): - transformer._validate_columns_to_sdtypes( - data, wrong_columns_to_sdtypes - ) + transformer._validate_columns_to_sdtypes(data, wrong_columns_to_sdtypes) def test__validate_sdtypes(self): """Test the ``_validate_sdtypes`` method.""" @@ -1463,9 +1412,7 @@ def test_fit(self): transformer.fit(data, columns_to_sdtypes) # Assert - transformer._validate_columns_to_sdtypes.assert_called_once_with( - data, columns_to_sdtypes - ) + transformer._validate_columns_to_sdtypes.assert_called_once_with(data, columns_to_sdtypes) transformer._store_columns.assert_called_once_with(['a', 'b'], data) transformer._set_seed.assert_called_once_with(data) transformer._get_columns_data.assert_called_once_with(data, ['a', 'b']) diff --git a/tests/unit/transformers/test_boolean.py b/tests/unit/transformers/test_boolean.py index 7ce9ae82..2554508c 100644 --- a/tests/unit/transformers/test_boolean.py +++ b/tests/unit/transformers/test_boolean.py @@ -17,9 +17,7 @@ def test___init__(self): error_message = 'Unexpected missing_value_replacement' error_generation = 'Unexpected missing_value_generation' assert transformer.missing_value_replacement == 'mode', error_message - assert ( - transformer.missing_value_generation == 'random' - ), error_generation + assert transformer.missing_value_generation == 'random', error_generation def test___init___model_missing_value_passed(self): """Test when model missing value is passed to the init.""" @@ -40,9 +38,7 @@ def test__fit_missing_value_replacement_not_ignore(self): # Asserts error_msg = 'Unexpected fill value' - assert ( - transformer.null_transformer._missing_value_replacement == 0 - ), error_msg + assert transformer.null_transformer._missing_value_replacement == 0, error_msg def test__fit_array(self): """Test _fit with numpy.array""" @@ -55,9 +51,7 @@ def test__fit_array(self): # Asserts error_msg = 'Unexpected 
fill value' - assert ( - transformer.null_transformer._missing_value_replacement == 0 - ), error_msg + assert transformer.null_transformer._missing_value_replacement == 0, error_msg def test__fit_missing_value_generation_from_column(self): """Test output_properties contains 'is_null' column. @@ -92,10 +86,7 @@ def test__transform_series(self): expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' - assert ( - transformer.null_transformer.transform.call_count - == expect_call_count - ), error_msg + assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg pd.testing.assert_series_equal( transformer.null_transformer.transform.call_args[0][0], expect_call_args, @@ -115,10 +106,7 @@ def test__transform_array(self): expect_call_args = pd.Series([0.0, 1.0, None, 1.0, 0.0], dtype=float) error_msg = 'NullTransformer.transform must be called one time' - assert ( - transformer.null_transformer.transform.call_count - == expect_call_count - ), error_msg + assert transformer.null_transformer.transform.call_count == expect_call_count, error_msg pd.testing.assert_series_equal( transformer.null_transformer.transform.call_args[0][0], expect_call_args, @@ -133,9 +121,7 @@ def test__reverse_transform_missing_value_replacement_not_ignore(self): # Run transformer = Mock() transformer.missing_value_replacement = 0 - transformer.null_transformer.reverse_transform.return_value = ( - transformed_data - ) + transformer.null_transformer.reverse_transform.return_value = transformed_data result = BinaryEncoder._reverse_transform(transformer, data) @@ -149,9 +135,7 @@ def test__reverse_transform_missing_value_replacement_not_ignore(self): 'NullTransformer.reverse_transform should not be called when ' 'missing_value_replacement is ignore' ) - reverse_transform_call_count = ( - transformer.null_transformer.reverse_transform.call_count - ) + reverse_transform_call_count = 
transformer.null_transformer.reverse_transform.call_count assert reverse_transform_call_count == expect_call_count, error_msg def test__reverse_transform_series(self): diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index c9a5d8d2..bd9e27b8 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -57,9 +57,7 @@ def test__order_categories_alphabetical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal( - ordered, np.array(['four', 'one', 'three', 'two']) - ) + np.testing.assert_array_equal(ordered, np.array(['four', 'one', 'three', 'two'])) def test__order_categories_alphabetical_with_nans(self): """Test the ``_order_categories`` method when ``order_by`` is 'alphabetical'. @@ -79,9 +77,7 @@ def test__order_categories_alphabetical_with_nans(self): ordered = transformer._order_categories(arr) # Assert - expected = np.array( - ['four', 'one', 'three', 'two', np.nan], dtype='object' - ) + expected = np.array(['four', 'one', 'three', 'two', np.nan], dtype='object') pd.testing.assert_series_equal(pd.Series(ordered), pd.Series(expected)) def test__order_categories_alphabetical_float_error(self): @@ -95,9 +91,7 @@ def test__order_categories_alphabetical_float_error(self): arr = np.array([1, 2, 3, 4]) # Run / Assert - message = ( - "The data must be of type string if order_by is 'alphabetical'." - ) + message = "The data must be of type string if order_by is 'alphabetical'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -112,9 +106,7 @@ def test__order_categories_alphabetical_nonstring_object_error(self): arr = np.array([True, False, None]) # Run / Assert - message = ( - "The data must be of type string if order_by is 'alphabetical'." - ) + message = "The data must be of type string if order_by is 'alphabetical'." 
with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -136,9 +128,7 @@ def test__order_categories_numerical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal( - ordered, np.array([-2.5, 3.11, 5, 67.8, 100, None]) - ) + np.testing.assert_array_equal(ordered, np.array([-2.5, 3.11, 5, 67.8, 100, None])) def test__order_categories_numerical_error(self): """Test the ``_order_categories`` method when ``order_by`` is 'numerical_value'. @@ -157,9 +147,7 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ( - "The data must be numerical if order_by is 'numerical_value'." - ) + message = "The data must be numerical if order_by is 'numerical_value'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -180,9 +168,7 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ( - "The data must be numerical if order_by is 'numerical_value'." - ) + message = "The data must be numerical if order_by is 'numerical_value'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -237,12 +223,8 @@ def test__transform(self): # Asserts for key in transformer.intervals: - assert ( - transformed.loc[data == key] >= transformer.intervals[key][0] - ).all() - assert ( - transformed.loc[data == key] < transformer.intervals[key][1] - ).all() + assert (transformed.loc[data == key] >= transformer.intervals[key][0]).all() + assert (transformed.loc[data == key] < transformer.intervals[key][1]).all() def test__transform_user_warning(self): """Test the ``transform`` with unknown data. 
@@ -424,9 +406,7 @@ def test___init__(self): transformer = OrderedUniformEncoder(order=['b', 'c', 'a', None]) # Asserts - pd.testing.assert_series_equal( - transformer.order, pd.Series(['b', 'c', 'a', np.nan]) - ) + pd.testing.assert_series_equal(transformer.order, pd.Series(['b', 'c', 'a', np.nan])) def test___init___duplicate_categories(self): """Test the ``__init__`` method errors if duplicate categories provided. @@ -447,17 +427,13 @@ def test___repr___default(self): The order should be printed as instead of the actual order. """ # Setup - transformer = OrderedUniformEncoder( - order=['VISA', 'AMEX', 'DISCOVER', None] - ) + transformer = OrderedUniformEncoder(order=['VISA', 'AMEX', 'DISCOVER', None]) # Run stringified_transformer = transformer.__repr__() # Assert - assert ( - stringified_transformer == 'OrderedUniformEncoder(order=)' - ) + assert stringified_transformer == 'OrderedUniformEncoder(order=)' def test__fit(self): """Test the ``_fit`` method.""" @@ -563,12 +539,8 @@ def test__transform(self): # Asserts for key in transformer.intervals: - assert ( - transformed.loc[data == key] >= transformer.intervals[key][0] - ).all() - assert ( - transformed.loc[data == key] < transformer.intervals[key][1] - ).all() + assert (transformed.loc[data == key] >= transformer.intervals[key][0]).all() + assert (transformed.loc[data == key] < transformer.intervals[key][1]).all() def test__transform_error(self): """Test the ``_transform`` method checks that data is in ``self.order``. 
@@ -903,18 +875,11 @@ def test__transform_by_category_called(self): ]) # Run - transformed = FrequencyEncoder._transform( - categorical_transformer_mock, data - ) + transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) # Asserts - categorical_transformer_mock._transform_by_category.assert_called_once_with( - data - ) - assert ( - transformed - == categorical_transformer_mock._transform_by_category.return_value - ) + categorical_transformer_mock._transform_by_category.assert_called_once_with(data) + assert transformed == categorical_transformer_mock._transform_by_category.return_value def test__transform_by_category(self): """Test the `_transform_by_category` method with numerical data. @@ -1080,18 +1045,11 @@ def test__transform_by_row_called(self): ]) # Run - transformed = FrequencyEncoder._transform( - categorical_transformer_mock, data - ) + transformed = FrequencyEncoder._transform(categorical_transformer_mock, data) # Asserts - categorical_transformer_mock._transform_by_row.assert_called_once_with( - data - ) - assert ( - transformed - == categorical_transformer_mock._transform_by_row.return_value - ) + categorical_transformer_mock._transform_by_row.assert_called_once_with(data) + assert transformed == categorical_transformer_mock._transform_by_row.return_value def test__transform_by_row(self): """Test the `_transform_by_row` method with numerical data. 
@@ -1149,19 +1107,12 @@ def test__reverse_transform_by_category_called(self): ]) # Run - reverse = FrequencyEncoder._reverse_transform( - categorical_transformer_mock, transform_data - ) + reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, transform_data) # Asserts - reverse_arg = categorical_transformer_mock._reverse_transform_by_category.call_args[ - 0 - ][0] + reverse_arg = categorical_transformer_mock._reverse_transform_by_category.call_args[0][0] np.testing.assert_array_equal(reverse_arg, transform_data.clip(0, 1)) - assert ( - reverse - == categorical_transformer_mock._reverse_transform_by_category.return_value - ) + assert reverse == categorical_transformer_mock._reverse_transform_by_category.return_value def test__reverse_transform_by_category(self): """Test the _reverse_transform_by_category method with numerical data. @@ -1180,9 +1131,7 @@ def test__reverse_transform_by_category(self): transformed = pd.Series([0.875, 0.375, 0.375, 0.625, 0.875]) transformer = FrequencyEncoder() - transformer.means = pd.Series( - [0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1] - ) + transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) transformer.intervals = { 4: (0, 0.25, 0.125, 0.041666666666666664), 3: (0.25, 0.5, 0.375, 0.041666666666666664), @@ -1252,21 +1201,12 @@ def test__reverse_transform_by_row_called(self): categorical_transformer_mock._normalize.return_value = data # Run - reverse = FrequencyEncoder._reverse_transform( - categorical_transformer_mock, data - ) + reverse = FrequencyEncoder._reverse_transform(categorical_transformer_mock, data) # Asserts - reverse_arg = ( - categorical_transformer_mock._reverse_transform_by_row.call_args[ - 0 - ][0] - ) + reverse_arg = categorical_transformer_mock._reverse_transform_by_row.call_args[0][0] np.testing.assert_array_equal(reverse_arg, data.clip(0, 1)) - assert ( - reverse - == categorical_transformer_mock._reverse_transform_by_row.return_value - ) + assert reverse 
== categorical_transformer_mock._reverse_transform_by_row.return_value @patch('rdt.transformers.categorical.check_nan_in_transform') def test__reverse_transform_by_row(self, mock_check_nan): @@ -1287,9 +1227,7 @@ def test__reverse_transform_by_row(self, mock_check_nan): transformed = pd.Series([0.875, 0.625, 0.375, 0.125]) transformer = FrequencyEncoder() - transformer.means = pd.Series( - [0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1] - ) + transformer.means = pd.Series([0.125, 0.375, 0.625, 0.875], index=[4, 3, 2, 1]) transformer.starts = pd.DataFrame( [4, 3, 2, 1], index=[0.0, 0.25, 0.5, 0.75], columns=['category'] ) @@ -1881,9 +1819,7 @@ def test__transform_numeric(self): @patch('rdt.transformers.categorical.check_nan_in_transform') @patch('rdt.transformers.categorical.try_convert_to_dtype') - def test__reverse_transform_no_nans( - self, mock_convert_dtype, mock_check_nan - ): + def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan): # Setup ohe = OneHotEncoder() data = pd.Series(['a', 'b', 'c']) @@ -1950,9 +1886,7 @@ class TestLabelEncoder: def test___init__(self): """Passed arguments must be stored as attributes.""" # Run - transformer = LabelEncoder( - add_noise='add_noise_value', order_by='alphabetical' - ) + transformer = LabelEncoder(add_noise='add_noise_value', order_by='alphabetical') # Asserts assert transformer.add_noise == 'add_noise_value' @@ -1995,9 +1929,7 @@ def test__order_categories_alphabetical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal( - ordered, np.array(['four', 'one', 'three', 'two']) - ) + np.testing.assert_array_equal(ordered, np.array(['four', 'one', 'three', 'two'])) def test__order_categories_alphabetical_with_nans(self): """Test the ``_order_categories`` method when ``order_by`` is 'alphabetical'. 
@@ -2019,9 +1951,7 @@ def test__order_categories_alphabetical_with_nans(self): ordered = transformer._order_categories(arr) # Assert - expected = np.array( - ['four', 'one', 'three', 'two', np.nan], dtype='object' - ) + expected = np.array(['four', 'one', 'three', 'two', np.nan], dtype='object') pd.testing.assert_series_equal(pd.Series(ordered), pd.Series(expected)) def test__order_categories_alphabetical_error(self): @@ -2044,9 +1974,7 @@ def test__order_categories_alphabetical_error(self): arr = np.array([1, 2, 3, 4]) # Run / Assert - message = ( - "The data must be of type string if order_by is 'alphabetical'." - ) + message = "The data must be of type string if order_by is 'alphabetical'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2070,9 +1998,7 @@ def test__order_categories_numerical(self): ordered = transformer._order_categories(arr) # Assert - np.testing.assert_array_equal( - ordered, np.array([-2.5, 3.11, 5, 67.8, 100, np.nan]) - ) + np.testing.assert_array_equal(ordered, np.array([-2.5, 3.11, 5, 67.8, 100, np.nan])) def test__order_categories_numerical_error(self): """Test the ``_order_categories`` method when ``order_by`` is 'numerical_value'. @@ -2094,9 +2020,7 @@ def test__order_categories_numerical_error(self): arr = np.array(['one', 'two', 'three', 'four']) # Run / Assert - message = ( - "The data must be numerical if order_by is 'numerical_value'." - ) + message = "The data must be numerical if order_by is 'numerical_value'." with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2120,9 +2044,7 @@ def test__order_categories_numerical_different_dtype_error(self): arr = np.array([True, False, False, True]) # Run / Assert - message = ( - "The data must be numerical if order_by is 'numerical_value'." - ) + message = "The data must be numerical if order_by is 'numerical_value'." 
with pytest.raises(TransformerInputError, match=message): transformer._order_categories(arr) @@ -2294,9 +2216,7 @@ def test__reverse_transform_clips_values(self): @patch('rdt.transformers.categorical.check_nan_in_transform') @patch('rdt.transformers.categorical.try_convert_to_dtype') - def test__reverse_transform_add_noise( - self, mock_convert_dtype, mock_check_nan - ): + def test__reverse_transform_add_noise(self, mock_convert_dtype, mock_check_nan): """Test the ``_reverse_transform`` method with ``add_noise``. Test that the method correctly reverse transforms the data @@ -2350,15 +2270,11 @@ def test___init__(self): Passed arguments must be stored as attributes. """ # Run - transformer = OrderedLabelEncoder( - order=['b', 'c', 'a', None], add_noise='add_noise_value' - ) + transformer = OrderedLabelEncoder(order=['b', 'c', 'a', None], add_noise='add_noise_value') # Asserts assert transformer.add_noise == 'add_noise_value' - pd.testing.assert_series_equal( - transformer.order, pd.Series(['b', 'c', 'a', np.nan]) - ) + pd.testing.assert_series_equal(transformer.order, pd.Series(['b', 'c', 'a', np.nan])) def test___init___duplicate_categories(self): """The the ``__init__`` method with duplicate categories in the order parameter. @@ -2371,9 +2287,7 @@ def test___init___duplicate_categories(self): 'Please drop the duplicates to proceed.' ) with pytest.raises(TransformerInputError, match=expected_msg): - OrderedLabelEncoder( - order=['b', 'c', 'a', 'a'], add_noise='add_noise_value' - ) + OrderedLabelEncoder(order=['b', 'c', 'a', 'a'], add_noise='add_noise_value') def test___repr___default(self): """Test that the ``__repr__`` method prints the custom order. @@ -2381,9 +2295,7 @@ def test___repr___default(self): The order should be printed as instead of the actual order. 
""" # Setup - transformer = OrderedLabelEncoder( - order=['VISA', 'AMEX', 'DISCOVER', None] - ) + transformer = OrderedLabelEncoder(order=['VISA', 'AMEX', 'DISCOVER', None]) # Run stringified_transformer = transformer.__repr__() @@ -2398,18 +2310,13 @@ def test___repr___add_noise_true(self): is provided, it should be printed too. """ # Setup - transformer = OrderedLabelEncoder( - order=['VISA', 'AMEX', 'DISCOVER', None], add_noise=True - ) + transformer = OrderedLabelEncoder(order=['VISA', 'AMEX', 'DISCOVER', None], add_noise=True) # Run stringified_transformer = transformer.__repr__() # Assert - assert ( - stringified_transformer - == 'OrderedLabelEncoder(order=, add_noise=True)' - ) + assert stringified_transformer == 'OrderedLabelEncoder(order=, add_noise=True)' def test__fit(self): """Test the ``_fit`` method. @@ -2440,14 +2347,10 @@ def test__fit(self): expected_values_to_categories = {0: 2, 1: 3, 2: np.nan, 3: 1} expected_categories_to_values = {2: 0, 3: 1, 1: 3, np.nan: 2} for key, value in transformer.values_to_categories.items(): - assert value == expected_values_to_categories[key] or pd.isna( - value - ) + assert value == expected_values_to_categories[key] or pd.isna(value) for key, value in transformer.categories_to_values.items(): - assert value == expected_categories_to_values.get(key) or pd.isna( - key - ) + assert value == expected_categories_to_values.get(key) or pd.isna(key) def test__fit_error(self): """Test the ``_fit`` method checks that data is in ``self.order``. 
diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index 06359452..eb184d4b 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -64,9 +64,7 @@ def test__convert_to_datetime(self): converted_data = transformer._convert_to_datetime(data) # Assert - expected_data = pd.Series( - pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) - ) + expected_data = pd.Series(pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])) pd.testing.assert_series_equal(expected_data, converted_data) def test__convert_to_datetime_format(self): @@ -93,9 +91,7 @@ def test__convert_to_datetime_format(self): converted_data = transformer._convert_to_datetime(data) # Assert - expected_data = pd.Series( - pd.to_datetime(['01Feb2020', '02Mar2020', '03Jan2010']) - ) + expected_data = pd.Series(pd.to_datetime(['01Feb2020', '02Mar2020', '03Jan2010'])) pd.testing.assert_series_equal(expected_data, converted_data) def test__convert_to_datetime_not_convertible_raises_error(self): @@ -120,9 +116,7 @@ def test__convert_to_datetime_not_convertible_raises_error(self): transformer = UnixTimestampEncoder() # Run - error_message = ( - 'Data must be of dtype datetime, or castable to datetime.' - ) + error_message = 'Data must be of dtype datetime, or castable to datetime.' 
with pytest.raises(TypeError, match=error_message): transformer._convert_to_datetime(data) @@ -223,18 +217,14 @@ def test__reverse_transform_helper_nulls(self): data = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) transformer = UnixTimestampEncoder(missing_value_replacement='mean') transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = ( - pd.Series([1, 2, 3]) - ) + transformer.null_transformer.reverse_transform.return_value = pd.Series([1, 2, 3]) # Run transformer._reverse_transform_helper(data) # Assert transformer.null_transformer.reverse_transform.assert_called_once() - datetimes = transformer.null_transformer.reverse_transform.mock_calls[ - 0 - ][1][0] + datetimes = transformer.null_transformer.reverse_transform.mock_calls[0][1][0] np.testing.assert_array_equal(data.to_numpy(), datetimes) def test__reverse_transform_helper_model_missing_values_true(self): @@ -257,18 +247,14 @@ def test__reverse_transform_helper_model_missing_values_true(self): data = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) transformer = UnixTimestampEncoder(model_missing_values=True) transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = ( - pd.Series([1, 2, 3]) - ) + transformer.null_transformer.reverse_transform.return_value = pd.Series([1, 2, 3]) # Run transformer._reverse_transform_helper(data) # Assert transformer.null_transformer.reverse_transform.assert_called_once() - datetimes = transformer.null_transformer.reverse_transform.mock_calls[ - 0 - ][1][0] + datetimes = transformer.null_transformer.reverse_transform.mock_calls[0][1][0] np.testing.assert_array_equal(data.to_numpy(), datetimes) @patch('rdt.transformers.datetime.NullTransformer') @@ -334,9 +320,7 @@ def test__fit_calls_transform_helper(self): } @patch('rdt.transformers.datetime._guess_datetime_format_for_array') - def test__fit_calls_guess_datetime_format( - self, mock__guess_datetime_format_for_array - 
): + def test__fit_calls_guess_datetime_format(self, mock__guess_datetime_format_for_array): """Test the ``_fit`` method. The ``_fit`` method should call the ``_transform_helper`` method. @@ -363,9 +347,7 @@ def test__fit_missing_value_generation(self): column. """ # Setup - transformer = UnixTimestampEncoder( - missing_value_generation='from_column' - ) + transformer = UnixTimestampEncoder(missing_value_generation='from_column') data = pd.Series(['2020-02-01', np.nan]) # Run @@ -429,9 +411,7 @@ def test__reverse_transform(self): output = ute._reverse_transform(transformed) # Assert - expected = pd.Series( - pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']) - ) + expected = pd.Series(pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_enforce_min_max_values(self): @@ -481,9 +461,7 @@ def test__reverse_transform_datetime_format_dtype_is_datetime(self): output = ute._reverse_transform(transformed) # Assert - expected = pd.Series( - pd.to_datetime(['Jan 01, 2020', 'Feb 01, 2020', 'Mar 01, 2020']) - ) + expected = pd.Series(pd.to_datetime(['Jan 01, 2020', 'Feb 01, 2020', 'Mar 01, 2020'])) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_datetime_format(self): @@ -621,9 +599,7 @@ def test__reverse_transform_helper(self): transformer = OptimizedTimestampEncoder() transformer.divider = 1000 transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.side_effect = ( - lambda x: x - ) + transformer.null_transformer.reverse_transform.side_effect = lambda x: x # Run multiplied = transformer._reverse_transform_helper(data) diff --git a/tests/unit/transformers/test_null.py b/tests/unit/transformers/test_null.py index 11fdfb2c..d3820946 100644 --- a/tests/unit/transformers/test_null.py +++ b/tests/unit/transformers/test_null.py @@ -43,10 +43,7 @@ def test___init__not_default(self): transformer = 
NullTransformer('a_missing_value_replacement', None) # Assert - assert ( - transformer._missing_value_replacement - == 'a_missing_value_replacement' - ) + assert transformer._missing_value_replacement == 'a_missing_value_replacement' assert transformer._missing_value_generation is None assert transformer._min_value is None assert transformer._max_value is None @@ -70,9 +67,7 @@ def test_models_missing_values(self): ``True``. """ # Setup - transformer = NullTransformer( - 'something', missing_value_generation='from_column' - ) + transformer = NullTransformer('something', missing_value_generation='from_column') # Run models_missing_values = transformer.models_missing_values() @@ -87,18 +82,12 @@ def test_models_missing_values_missing_value_generation_is_none(self): ``False``. """ # Setup - none_transformer = NullTransformer( - 'something', missing_value_generation=None - ) - random_transformer = NullTransformer( - 'something', missing_value_generation='random' - ) + none_transformer = NullTransformer('something', missing_value_generation=None) + random_transformer = NullTransformer('something', missing_value_generation='random') # Run none_models_missing_values = none_transformer.models_missing_values() - random_models_missing_values = ( - random_transformer.models_missing_values() - ) + random_models_missing_values = random_transformer.models_missing_values() # Assert assert none_models_missing_values is False @@ -126,9 +115,7 @@ def test__get_missing_value_replacement_scalar(self): # Run data = pd.Series([1, np.nan, 3], name='abc') - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert assert missing_value_replacement == 'a_missing_value_replacement' @@ -155,9 +142,7 @@ def test__get_missing_value_replacement_none_numerical(self): # Run data = pd.Series([1, 2, np.nan], name='abc') - missing_value_replacement = 
transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert assert missing_value_replacement == 1.5 @@ -184,9 +169,7 @@ def test__get_missing_value_replacement_none_not_numerical(self): # Run data = pd.Series(['a', 'b', 'b', np.nan], name='abc') - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert assert missing_value_replacement == 'b' @@ -213,9 +196,7 @@ def test__get_missing_value_replacement_mean(self): # Run data = pd.Series([1, 2, np.nan], name='abc') - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert assert missing_value_replacement == 1.5 @@ -228,9 +209,7 @@ def test__get_missing_value_replacement_mean_only_nans(self, logger_mock): data = pd.Series([float('nan'), None, np.nan], name='abc') # Run - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert logger_mock.info.assert_called_once_with( @@ -261,9 +240,7 @@ def test__get_missing_value_replacement_mode(self): # Run data = pd.Series([1, 2, 2, np.nan], name='abc') - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert assert missing_value_replacement == 2 @@ -276,9 +253,7 @@ def test__get_missing_value_replacement_mode_only_nans(self, logger_mock): data = pd.Series([float('nan'), None, np.nan], name='abc') # Run - missing_value_replacement = transformer._get_missing_value_replacement( - data - ) + missing_value_replacement = transformer._get_missing_value_replacement(data) # Assert logger_mock.info.assert_called_once_with( @@ -382,61 +357,29 @@ def 
test_fit_with_multiple_missing_value_generations(self): missing_value_generation_none_str.fit(nulls_str) # Assert - assert ( - missing_value_generation_random_nulls._missing_value_generation - == 'random' - ) + assert missing_value_generation_random_nulls._missing_value_generation == 'random' assert missing_value_generation_random_nulls.nulls - assert ( - missing_value_generation_random_nulls._missing_value_replacement - == 'b' - ) + assert missing_value_generation_random_nulls._missing_value_replacement == 'b' - assert ( - missing_value_generation_random_no_nulls._missing_value_generation - == 'random' - ) + assert missing_value_generation_random_no_nulls._missing_value_generation == 'random' assert not missing_value_generation_random_no_nulls.nulls - assert ( - missing_value_generation_random_no_nulls._missing_value_replacement - == 'b' - ) + assert missing_value_generation_random_no_nulls._missing_value_replacement == 'b' - assert ( - missing_value_generation_column_nulls._missing_value_generation - == 'from_column' - ) + assert missing_value_generation_column_nulls._missing_value_generation == 'from_column' assert missing_value_generation_column_nulls.nulls - assert ( - missing_value_generation_column_nulls._missing_value_replacement - == 2 - ) + assert missing_value_generation_column_nulls._missing_value_replacement == 2 - assert ( - missing_value_generation_column_no_nulls._missing_value_generation - is None - ) + assert missing_value_generation_column_no_nulls._missing_value_generation is None assert not missing_value_generation_column_no_nulls.nulls - assert ( - missing_value_generation_column_no_nulls._missing_value_replacement - == 2.5 - ) + assert missing_value_generation_column_no_nulls._missing_value_replacement == 2.5 - assert ( - missing_value_generation_none_int._missing_value_generation is None - ) + assert missing_value_generation_none_int._missing_value_generation is None assert missing_value_generation_none_int.nulls is None - assert ( - 
missing_value_generation_none_int._missing_value_replacement == 2 - ) + assert missing_value_generation_none_int._missing_value_replacement == 2 - assert ( - missing_value_generation_none_str._missing_value_generation is None - ) + assert missing_value_generation_none_str._missing_value_generation is None assert missing_value_generation_none_str.nulls is None - assert ( - missing_value_generation_none_str._missing_value_replacement == 'b' - ) + assert missing_value_generation_none_str._missing_value_replacement == 'b' def test_transform__missing_value_generation_from_column(self): """Test transform when ``_missing_value_generation`` is set to ``from_column``. @@ -487,9 +430,7 @@ def test_transform__missing_value_generation_random(self): pd.testing.assert_series_equal(modified_input_data, input_data) @patch('rdt.transformers.null.np.random.uniform') - def test_transform__missing_value_replacement_random( - self, mock_np_random_uniform - ): + def test_transform__missing_value_replacement_random(self, mock_np_random_uniform): """Test transform when ``_missing_value_replacement`` is set to ``random``.""" # Setup transformer = NullTransformer(missing_value_replacement='random') @@ -564,9 +505,7 @@ def test_reverse_transform__missing_value_generation_from_column_no_nulls( pd.testing.assert_series_equal(expected_output, output) @patch('rdt.transformers.null.np.random') - def test_reverse_transform__missing_value_generation_random_with_nulls( - self, random_mock - ): + def test_reverse_transform__missing_value_generation_random_with_nulls(self, random_mock): """Test reverse_transform when ``missing_value_generation`` is ``random`` and nulls. 
When ``missing_value_generation`` is ``random`` and there are nulls, a ``_null_percentage`` diff --git a/tests/unit/transformers/test_numerical.py b/tests/unit/transformers/test_numerical.py index e120ca5d..c248b8ec 100644 --- a/tests/unit/transformers/test_numerical.py +++ b/tests/unit/transformers/test_numerical.py @@ -20,9 +20,7 @@ class TestFloatFormatter(TestCase): def test___init__super_attrs(self): """super() arguments are properly passed and set as attributes.""" - nt = FloatFormatter( - missing_value_replacement='mode', missing_value_generation='random' - ) + nt = FloatFormatter(missing_value_replacement='mode', missing_value_generation='random') assert nt.missing_value_replacement == 'mode' assert nt.missing_value_generation == 'random' @@ -136,9 +134,7 @@ def test__fit(self): """ # Setup data = pd.Series([1.5, None, 2.5]) - transformer = FloatFormatter( - missing_value_replacement='missing_value_replacement' - ) + transformer = FloatFormatter(missing_value_replacement='missing_value_replacement') transformer._validate_values_within_bounds = Mock() # Run @@ -146,16 +142,12 @@ def test__fit(self): # Asserts expected = 'missing_value_replacement' - assert ( - transformer.null_transformer._missing_value_replacement == expected - ) + assert transformer.null_transformer._missing_value_replacement == expected assert is_float_dtype(transformer._dtype) assert transformer.output_properties == { None: {'sdtype': 'float', 'next_transformer': None} } - transformer._validate_values_within_bounds.assert_called_once_with( - data - ) + transformer._validate_values_within_bounds.assert_called_once_with(data) assert transformer.output_properties == { None: {'sdtype': 'float', 'next_transformer': None}, } @@ -211,9 +203,7 @@ def test__fit_learn_rounding_scheme_true(self): ]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', learn_rounding_scheme=True - ) + transformer = FloatFormatter(missing_value_replacement='mean', learn_rounding_scheme=True) 
transformer._fit(data) # Asserts @@ -237,9 +227,7 @@ def test__fit_learn_rounding_scheme_true_max_decimals(self): data = pd.Series([0.000000000000001]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', learn_rounding_scheme=True - ) + transformer = FloatFormatter(missing_value_replacement='mean', learn_rounding_scheme=True) transformer._fit(data) # Asserts @@ -262,9 +250,7 @@ def test__fit_learn_rounding_scheme_true_inf(self): data = pd.Series([15000, 4000, 60000, np.inf]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', learn_rounding_scheme=True - ) + transformer = FloatFormatter(missing_value_replacement='mean', learn_rounding_scheme=True) transformer._fit(data) # Asserts @@ -285,9 +271,7 @@ def test__fit_learn_rounding_scheme_true_max_zero(self): data = pd.Series([0, 0, 0]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', learn_rounding_scheme=True - ) + transformer = FloatFormatter(missing_value_replacement='mean', learn_rounding_scheme=True) transformer._fit(data) # Asserts @@ -309,9 +293,7 @@ def test__fit_enforce_min_max_values_false(self): data = pd.Series([1.5, None, 2.5]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', enforce_min_max_values=False - ) + transformer = FloatFormatter(missing_value_replacement='mean', enforce_min_max_values=False) transformer._fit(data) # Asserts @@ -333,9 +315,7 @@ def test__fit_enforce_min_max_values_true(self): data = pd.Series([-100, -5000, 0, None, 100, 4000]) # Run - transformer = FloatFormatter( - missing_value_replacement='mean', enforce_min_max_values=True - ) + transformer = FloatFormatter(missing_value_replacement='mean', enforce_min_max_values=True) transformer._fit(data) # Asserts @@ -386,9 +366,7 @@ def test__transform(self): transformer._transform(data) # Assert - transformer._validate_values_within_bounds.assert_called_once_with( - data - ) + 
transformer._validate_values_within_bounds.assert_called_once_with(data) assert transformer.null_transformer.transform.call_count == 1 def test__reverse_transform_learn_rounding_scheme_false(self): @@ -650,9 +628,7 @@ def test__reverse_transform_enforce_min_max_values(self): result = transformer._reverse_transform(data) # Asserts - np.testing.assert_array_equal( - result, np.array([-300, -300, -300, -250, 0, 125, 400, 400]) - ) + np.testing.assert_array_equal(result, np.array([-300, -300, -300, -250, 0, 125, 400, 400])) def test__reverse_transform_enforce_min_max_values_with_nulls(self): """Test ``_reverse_transform`` with nulls and ``enforce_min_max_values`` set to ``True``. @@ -694,15 +670,11 @@ def test__reverse_transform_enforce_min_max_values_with_nulls(self): transformer._min_value = -300 transformer.enforce_min_max_values = True transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = ( - expected_data - ) + transformer.null_transformer.reverse_transform.return_value = expected_data result = transformer._reverse_transform(data) # Asserts - null_transformer_calls = ( - transformer.null_transformer.reverse_transform.mock_calls - ) + null_transformer_calls = transformer.null_transformer.reverse_transform.mock_calls np.testing.assert_array_equal(null_transformer_calls[0][1][0], data) np.testing.assert_array_equal(result, expected_data) @@ -802,9 +774,7 @@ def custom_import(name, *args): return __py_import__(name, *args) with patch('builtins.__import__', side_effect=custom_import): - with pytest.raises( - ImportError, match=r'pip install rdt\[copulas\]' - ): + with pytest.raises(ImportError, match=r'pip install rdt\[copulas\]'): GaussianNormalizer._get_distributions() def test__get_distributions(self): @@ -954,9 +924,7 @@ def test__fit(self): # Assert ct._get_univariate.return_value.fit.assert_called_once() call_value = ct._get_univariate.return_value.fit.call_args_list[0] - np.testing.assert_array_equal( - 
call_value[0][0], np.array([0.0, 0.5, 1.0]) - ) + np.testing.assert_array_equal(call_value[0][0], np.array([0.0, 0.5, 1.0])) assert ct.output_properties == { None: {'sdtype': 'float', 'next_transformer': None}, } @@ -977,9 +945,7 @@ def test__fit_missing_value_generation_from_column(self): # Assert ct._get_univariate.return_value.fit.assert_called_once() call_value = ct._get_univariate.return_value.fit.call_args_list[0] - np.testing.assert_array_equal( - call_value[0][0], np.array([0.0, 0.5, 1.0]) - ) + np.testing.assert_array_equal(call_value[0][0], np.array([0.0, 0.5, 1.0])) assert ct.output_properties == { None: {'sdtype': 'float', 'next_transformer': None}, 'is_null': {'sdtype': 'float', 'next_transformer': None}, @@ -1053,9 +1019,7 @@ def test__transform(self): ct = GaussianNormalizer() ct._univariate = Mock() ct._univariate.cdf.return_value = np.array([0.25, 0.5, 0.75, 0.5]) - ct.null_transformer = NullTransformer( - 'mean', missing_value_generation='from_column' - ) + ct.null_transformer = NullTransformer('mean', missing_value_generation='from_column') ct.null_transformer.fit(data) # Run @@ -1076,9 +1040,7 @@ def test__transform_missing_value_generation_is_random(self): ct = GaussianNormalizer() ct._univariate = Mock() ct._univariate.cdf.return_value = np.array([0.25, 0.5, 0.75, 0.5]) - ct.null_transformer = NullTransformer( - 'mean', missing_value_generation='random' - ) + ct.null_transformer = NullTransformer('mean', missing_value_generation='random') # Run ct.null_transformer.fit(data) @@ -1127,9 +1089,7 @@ def test__reverse_transform_missing_value_generation(self): ct = GaussianNormalizer() ct._univariate = Mock() ct._univariate.ppf.return_value = np.array([0.0, 1.0, 2.0, 1.0]) - ct.null_transformer = NullTransformer( - None, missing_value_generation='random' - ) + ct.null_transformer = NullTransformer(None, missing_value_generation='random') # Run ct.null_transformer.fit(expected) @@ -1143,9 +1103,7 @@ class TestClusterBasedNormalizer(TestCase): def 
test__get_current_random_seed_random_states_is_none(self): """Test that the method returns 0 if ``instance.random_states`` is None.""" # Setup - transformer = ClusterBasedNormalizer( - max_clusters=10, weight_threshold=0.005 - ) + transformer = ClusterBasedNormalizer(max_clusters=10, weight_threshold=0.005) transformer.random_states = None # Run @@ -1176,9 +1134,7 @@ def test__fit(self, mock_bgm): # Setup bgm_instance = mock_bgm.return_value bgm_instance.weights_ = np.array([10.0, 5.0, 0.0]) - transformer = ClusterBasedNormalizer( - max_clusters=10, weight_threshold=0.005 - ) + transformer = ClusterBasedNormalizer(max_clusters=10, weight_threshold=0.005) mock_state = Mock() transformer.random_states['fit'] = mock_state mock_state.get_state.return_value = [None, [0]] @@ -1255,9 +1211,7 @@ def test__fit_catch_warnings(self, mock_warnings, mock_bgm): # Setup bgm_instance = mock_bgm.return_value bgm_instance.weights_ = np.array([10.0, 5.0, 0.0]) - transformer = ClusterBasedNormalizer( - max_clusters=10, weight_threshold=0.005 - ) + transformer = ClusterBasedNormalizer(max_clusters=10, weight_threshold=0.005) data = pd.Series(np.random.random(size=100)) # Run @@ -1347,9 +1301,7 @@ def test__transform(self): 0.10703034, 0.05709835, ]) - np.testing.assert_allclose( - output[:, 0], expected_normalized, rtol=1e-3 - ) + np.testing.assert_allclose(output[:, 0], expected_normalized, rtol=1e-3) expected_component = np.array([ 1.0, @@ -1415,9 +1367,7 @@ def test__transform_missing_value_replacement(self): transformer._bgm_transformer.predict_proba.return_value = probabilities transformer.valid_component_indicator = np.array([True, True, False]) - transformer.null_transformer = NullTransformer( - 0.0, missing_value_generation='from_column' - ) + transformer.null_transformer = NullTransformer(0.0, missing_value_generation='from_column') data = pd.Series([ 0.01, np.nan, @@ -1450,9 +1400,7 @@ def test__transform_missing_value_replacement(self): -0.046177, 0.1226, ]) - 
np.testing.assert_allclose( - output[:, 0], expected_normalized, rtol=1e-3 - ) + np.testing.assert_allclose(output[:, 0], expected_normalized, rtol=1e-3) expected_component = np.array([ 0.0, @@ -1585,9 +1533,7 @@ def test__reverse_transform(self): 0.97, ]) transformer.null_transformer = Mock() - transformer.null_transformer.reverse_transform.return_value = ( - reversed_data - ) + transformer.null_transformer.reverse_transform.return_value = reversed_data transformer._reverse_transform_helper = Mock() transformer._reverse_transform_helper.return_value = reversed_data @@ -1641,9 +1587,7 @@ def test__reverse_transform(self): ], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0], ]).transpose() - np.testing.assert_allclose( - transformer._reverse_transform_helper.call_args[0][0], call_data - ) + np.testing.assert_allclose(transformer._reverse_transform_helper.call_args[0][0], call_data) def test__reverse_transform_missing_value_replacement_missing_value_replacement_from_col( self, @@ -1654,9 +1598,7 @@ def test__reverse_transform_missing_value_replacement_missing_value_replacement_ appropriate output when passed a numpy array containing ``np.nan`` values. """ # Setup - transformer = ClusterBasedNormalizer( - missing_value_generation='from_column', max_clusters=3 - ) + transformer = ClusterBasedNormalizer(missing_value_generation='from_column', max_clusters=3) transformer.output_columns = ['col.normalized', 'col.component'] transformer._reverse_transform_helper = Mock() transformer._reverse_transform_helper.return_value = np.array([ @@ -1740,9 +1682,7 @@ def test__reverse_transform_missing_value_replacement_missing_value_replacement_ appropriate output when passed a numpy array containing ``np.nan`` values. 
""" # Setup - transformer = ClusterBasedNormalizer( - missing_value_generation='from_column', max_clusters=3 - ) + transformer = ClusterBasedNormalizer(missing_value_generation='from_column', max_clusters=3) transformer.output_columns = ['col.normalized', 'col.component'] transformer._reverse_transform_helper = Mock() transformer._reverse_transform_helper.return_value = np.array([ @@ -1758,9 +1698,7 @@ def test__reverse_transform_missing_value_replacement_missing_value_replacement_ 0.62239389, ]) - transformer.null_transformer = NullTransformer( - 'mean', missing_value_generation='random' - ) + transformer.null_transformer = NullTransformer('mean', missing_value_generation='random') transformer.null_transformer.fit(pd.Series([0, np.nan])) transformer.null_transformer.reverse_transform = Mock() transformer.null_transformer.reverse_transform.return_value = np.array([ diff --git a/tests/unit/transformers/test_text.py b/tests/unit/transformers/test_text.py index e5c001fe..466cad7d 100644 --- a/tests/unit/transformers/test_text.py +++ b/tests/unit/transformers/test_text.py @@ -41,9 +41,7 @@ def test___init__default(self): assert transformer.starting_value == 0 assert transformer.suffix is None assert transformer._counter == 0 - assert transformer.output_properties == { - None: {'next_transformer': None} - } + assert transformer.output_properties == {None: {'next_transformer': None}} def test___init__with_parameters(self): """Test the ``__init__`` method with paremeters.""" @@ -51,42 +49,32 @@ def test___init__with_parameters(self): transformer_prefix = IDGenerator(prefix='prefix_') transformer_suffix = IDGenerator(suffix='_suffix') transformer_starting_value = IDGenerator(starting_value=10) - transformer_all = IDGenerator( - prefix='prefix_', starting_value=10, suffix='_suffix' - ) + transformer_all = IDGenerator(prefix='prefix_', starting_value=10, suffix='_suffix') # Assert assert transformer_prefix.prefix == 'prefix_' assert transformer_prefix.starting_value == 0 
assert transformer_prefix.suffix is None assert transformer_prefix._counter == 0 - assert transformer_prefix.output_properties == { - None: {'next_transformer': None} - } + assert transformer_prefix.output_properties == {None: {'next_transformer': None}} assert transformer_suffix.prefix is None assert transformer_suffix.starting_value == 0 assert transformer_suffix.suffix == '_suffix' assert transformer_suffix._counter == 0 - assert transformer_suffix.output_properties == { - None: {'next_transformer': None} - } + assert transformer_suffix.output_properties == {None: {'next_transformer': None}} assert transformer_starting_value.prefix is None assert transformer_starting_value.starting_value == 10 assert transformer_starting_value.suffix is None assert transformer_starting_value._counter == 0 - assert transformer_starting_value.output_properties == { - None: {'next_transformer': None} - } + assert transformer_starting_value.output_properties == {None: {'next_transformer': None}} assert transformer_all.prefix == 'prefix_' assert transformer_all.starting_value == 10 assert transformer_all.suffix == '_suffix' assert transformer_all._counter == 0 - assert transformer_all.output_properties == { - None: {'next_transformer': None} - } + assert transformer_all.output_properties == {None: {'next_transformer': None}} def test_reset_randomization(self): """Test the ``reset_randomization`` method.""" @@ -139,9 +127,7 @@ def test__reverse_transform(self): def test__reverse_transform_with_everything(self): """Test the ``_reverse_transform`` method with all parameters.""" # Setup - transformer = IDGenerator( - prefix='prefix_', starting_value=100, suffix='_suffix' - ) + transformer = IDGenerator(prefix='prefix_', starting_value=100, suffix='_suffix') # Run result = transformer._reverse_transform(np.array([1, 2, 3])) @@ -183,9 +169,7 @@ def test___getstate__(self): } @patch('rdt.transformers.text.strings_from_regex') - def test___setstate__generated_and_generator_size( - self, 
mock_strings_from_regex - ): + def test___setstate__generated_and_generator_size(self, mock_strings_from_regex): """Test that ``__setstate__`` will initialize a generator and wind it forward.""" # Setup state = { @@ -281,17 +265,13 @@ def test___init__custom(self): def test___init__bad_value_generation_order(self): """Test that an error is raised if a bad value is given for `generation_order`.""" # Run and Assert - error_message = ( - "generation_order must be one of 'alphanumeric' or 'scrambled'." - ) + error_message = "generation_order must be one of 'alphanumeric' or 'scrambled'." with pytest.raises(ValueError, match=error_message): RegexGenerator(generation_order='afdsfd') @patch('rdt.transformers.text.BaseTransformer.reset_randomization') @patch('rdt.transformers.text.strings_from_regex') - def test_reset_randomization( - self, mock_strings_from_regex, mock_base_reset - ): + def test_reset_randomization(self, mock_strings_from_regex, mock_base_reset): """Test that this method creates a new generator. This method should create a new ``instance.generator``, ``instance.generator_size`` and @@ -496,9 +476,7 @@ def test__reverse_transform_generator_size_of_input_data(self): assert instance.generated == 4 @patch('rdt.transformers.text.warnings') - def test__reverse_transform_not_enough_unique_values_enforce_uniqueness( - self, mock_warnings - ): + def test__reverse_transform_not_enough_unique_values_enforce_uniqueness(self, mock_warnings): """Test it when there are not enough unique values to generate.""" # Setup instance = RegexGenerator('[A-E]', enforce_uniqueness=True) @@ -518,9 +496,7 @@ def test__reverse_transform_not_enough_unique_values_enforce_uniqueness( "The regex for 'a' can only generate 5 " 'unique values. Additional values may not exactly follow the provided regex.' 
) - np.testing.assert_array_equal( - out, np.array(['A', 'B', 'C', 'D', 'E', 'A(0)']) - ) + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A(0)'])) def test__reverse_transform_not_enough_unique_values(self): """Test it when there are not enough unique values to generate.""" @@ -538,14 +514,10 @@ def test__reverse_transform_not_enough_unique_values(self): out = instance._reverse_transform(columns_data) # Assert - np.testing.assert_array_equal( - out, np.array(['A', 'B', 'C', 'D', 'E', 'A']) - ) + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'A'])) @patch('rdt.transformers.text.warnings') - def test__reverse_transform_not_enough_unique_values_numerical( - self, mock_warnings - ): + def test__reverse_transform_not_enough_unique_values_numerical(self, mock_warnings): """Test it when there are not enough unique values to generate.""" # Setup instance = RegexGenerator('[1-3]', enforce_uniqueness=True) @@ -565,14 +537,10 @@ def test__reverse_transform_not_enough_unique_values_numerical( "The regex for 'a' can only generate 3 " 'unique values. Additional values may not exactly follow the provided regex.' ) - np.testing.assert_array_equal( - out, np.array(['1', '2', '3', '4', '5', '6']) - ) + np.testing.assert_array_equal(out, np.array(['1', '2', '3', '4', '5', '6'])) @patch('rdt.transformers.text.warnings') - def test__reverse_transform_enforce_uniqueness_not_enough_remaining( - self, mock_warnings - ): + def test__reverse_transform_enforce_uniqueness_not_enough_remaining(self, mock_warnings): """Test the case when there are not enough unique values remaining.""" # Setup instance = RegexGenerator('[A-Z]', enforce_uniqueness=True) @@ -592,9 +560,7 @@ def test__reverse_transform_enforce_uniqueness_not_enough_remaining( 'The regex generator is not able to generate 6 new unique ' 'values (only 1 unique values left).' 
) - np.testing.assert_array_equal( - out, np.array(['A', 'B', 'C', 'D', 'E', 'F']) - ) + np.testing.assert_array_equal(out, np.array(['A', 'B', 'C', 'D', 'E', 'F'])) @patch('rdt.transformers.text.LOGGER') def test__reverse_transform_info_message(self, mock_logger): @@ -622,6 +588,4 @@ def test__reverse_transform_info_message(self, mock_logger): ) expected_args = (6, 'a', 5, 'a') - mock_logger.info.assert_called_once_with( - expected_format, *expected_args - ) + mock_logger.info.assert_called_once_with(expected_format, *expected_args) diff --git a/tests/unit/transformers/test_utils.py b/tests/unit/transformers/test_utils.py index 7a81ae93..2a2df8be 100644 --- a/tests/unit/transformers/test_utils.py +++ b/tests/unit/transformers/test_utils.py @@ -76,10 +76,7 @@ def test_strings_from_regex_very_large_regex(): very_large_regex = '[0-9a-zA-Z]{9}-[0-9a-zA-Z]{4}-[0-9a-zA-Z]{9}-[0-9a-zA-Z]{9}-[0-9a-z]{12}' generator, size = strings_from_regex(very_large_regex, max_repeat=16) - assert ( - size - == 173689027553046619421110743915454114823342474255318764491341273608665169920 - ) + assert size == 173689027553046619421110743915454114823342474255318764491341273608665169920 [next(generator) for _ in range(100_000)] @@ -176,14 +173,10 @@ def test_try_convert_to_dtype(): # Run output_convertibe = try_convert_to_dtype(data_int_with_nan, 'str') output_int_with_nan = try_convert_to_dtype(data_int_with_nan, 'int') - with pytest.raises( - ValueError, match="could not convert string to float: 'a'" - ): + with pytest.raises(ValueError, match="could not convert string to float: 'a'"): try_convert_to_dtype(data_not_convertible, 'int') - with pytest.raises( - ValueError, match="could not convert string to float: 'a'" - ): + with pytest.raises(ValueError, match="could not convert string to float: 'a'"): try_convert_to_dtype(data_not_convertible, 'float') # Assert From 2c4a754ab473d822d529cd81624d07a3520a6bd9 Mon Sep 17 00:00:00 2001 From: gsheni Date: Thu, 25 Apr 2024 12:11:52 -0400 Subject: 
[PATCH 07/17] undo invoke --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5e0ba7f4..e5bd6b7e 100644 --- a/Makefile +++ b/Makefile @@ -81,8 +81,7 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint lint: ## Run all code style checks - ruff check . - ruff format . --check + invoke lint .PHONY: fix-lint fix-lint: ## fix lint issues using ruff From 1f997dec43f6f91e4fd72b351a672d6287c47581 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Tue, 30 Apr 2024 08:32:40 +0100 Subject: [PATCH 08/17] Only run unit and integration tests on oldest and latest python versions for macos (#814) --- .github/workflows/integration.yml | 7 ++++++- .github/workflows/minimum.yml | 7 ++++++- .github/workflows/unit.yml | 7 ++++++- pyproject.toml | 9 ++++----- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a7ad18f3..81afd6a8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml index 1d0d9ecc..e125c803 100644 --- a/.github/workflows/minimum.yml +++ b/.github/workflows/minimum.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: 
actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml index b031ebe6..1b9a911b 100644 --- a/.github/workflows/unit.yml +++ b/.github/workflows/unit.yml @@ -11,7 +11,12 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + include: + - os: macos-latest + python-version: '3.8' + - os: macos-latest + python-version: '3.12' steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index 7c1e04a7..828967f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,17 +21,16 @@ license = { text = 'BSL-1.1' } requires-python = '>=3.8,<3.13' readme = 'README.md' dependencies = [ - "numpy>=1.20.0;python_version<'3.10'", + "numpy>=1.21.0;python_version<'3.10'", "numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'", "numpy>=1.26.0;python_version>='3.12'", - "pandas>=1.1.3;python_version<'3.10'", - "pandas>=1.3.4;python_version>='3.10' and python_version<'3.11'", + "pandas>=1.4.0;python_version<'3.11'", "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'", "pandas>=2.1.1;python_version>='3.12'", - "scipy>=1.5.4;python_version<'3.10'", + "scipy>=1.7.3;python_version<'3.10'", "scipy>=1.9.2;python_version>='3.10' and python_version<'3.12'", "scipy>=1.12.0;python_version>='3.12'", - "scikit-learn>=0.24;python_version<'3.10'", + "scikit-learn>=1.0.2;python_version<'3.10'", "scikit-learn>=1.1.0;python_version>='3.10' and python_version<'3.11'", "scikit-learn>=1.1.3;python_version>='3.11' and python_version<'3.12'", "scikit-learn>=1.3.1;python_version>='3.12'", From fb59579c514294f4d8878a667c5c0c482f5d7898 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Wed, 1 May 2024 10:32:25 -0400 Subject: [PATCH 09/17] Automated Latest Dependency Updates (#817) 
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 6ab9f163..8a3d0e93 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,4 +1,4 @@ -Faker==24.11.0 +Faker==25.0.0 copulas==0.11.0 numpy==1.26.4 pandas==2.2.2 From be6152f5c1ebd4571cd43ce84766b33d07bf5fbe Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Wed, 1 May 2024 17:56:02 -0400 Subject: [PATCH 10/17] Latest Code Analysis (#798) Co-authored-by: amontanez24 --- static_code_analysis.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static_code_analysis.txt b/static_code_analysis.txt index 0013959c..550bd343 100644 --- a/static_code_analysis.txt +++ b/static_code_analysis.txt @@ -1,4 +1,4 @@ -Run started:2024-04-11 03:56:15.289402 +Run started:2024-04-16 22:10:36.007657 Test results: No issues identified. From d750307f7bdecdac4ad27307faccf9d49c454dc0 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Fri, 3 May 2024 11:09:33 -0400 Subject: [PATCH 11/17] Latest Code Analysis (#810) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Andrew Montanez --- static_code_analysis.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static_code_analysis.txt b/static_code_analysis.txt index 550bd343..7259f63c 100644 --- a/static_code_analysis.txt +++ b/static_code_analysis.txt @@ -4,7 +4,7 @@ Test results: No issues identified. 
Code scanned: - Total lines of code: 5503 + Total lines of code: 5515 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 From 88bf32de4b317e7bac33ad8eccb20c86b0553f15 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Tue, 7 May 2024 11:22:04 -0400 Subject: [PATCH 12/17] Automated Latest Dependency Updates (#818) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 8a3d0e93..3995bb04 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,4 +1,4 @@ -Faker==25.0.0 +Faker==25.0.1 copulas==0.11.0 numpy==1.26.4 pandas==2.2.2 From e1dde65f6a85ed3f76c6de6f004e46837634fa30 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Wed, 8 May 2024 16:53:28 -0500 Subject: [PATCH 13/17] Refactoring code for Enterprise issue #529 (#815) --- rdt/transformers/text.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index dd74ddcb..2d02650d 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -211,14 +211,18 @@ def _reverse_transform(self, data): self.reset_randomization() remaining = self.generator_size - if remaining >= sample_size: - reverse_transformed = [next(self.generator) for _ in range(sample_size)] - self.generated += sample_size - - else: - generated_values = list(self.generator) - reverse_transformed = generated_values[:] - self.generated = self.generator_size + generated_values = [] + while len(generated_values) < sample_size: + try: + generated_values.append(next(self.generator)) + self.generated += 1 + except (RuntimeError, StopIteration): + # Can't generate more rows without collision so breaking out of loop + break + + reverse_transformed = 
generated_values[:] + + if len(reverse_transformed) < sample_size: if self.enforce_uniqueness: try: remaining_samples = sample_size - len(reverse_transformed) From ecf749959276dacd54efc474edfdc5f7804e133e Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Wed, 8 May 2024 16:56:11 -0500 Subject: [PATCH 14/17] =?UTF-8?q?Bump=20version:=201.12.1.dev0=20=E2=86=92?= =?UTF-8?q?=201.12.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- rdt/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 828967f7..9093c784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.1.dev0" +current_version = "1.12.1.dev1" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/rdt/__init__.py b/rdt/__init__.py index 9a6ff95e..146d8970 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -4,7 +4,7 @@ __author__ = 'DataCebo, Inc.' 
__email__ = 'info@sdv.dev' -__version__ = '1.12.1.dev0' +__version__ = '1.12.1.dev1' import sys From 36b291dcb6cda2f1fbd087efcc10aebef70cac74 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Thu, 9 May 2024 20:15:28 +0100 Subject: [PATCH 15/17] Fix pandas FutureWarning in UniformEncoder (#820) --- rdt/transformers/categorical.py | 4 +++- .../transformers/test_categorical.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 98dfc99b..e4c2993d 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -197,7 +197,9 @@ def _reverse_transform(self, data): labels.append(key) result = pd.cut(data, bins=bins, labels=labels, include_lowest=True) - result = result.replace(nan_name, np.nan) + if nan_name in result.cat.categories: + result = result.cat.remove_categories(nan_name) + result = try_convert_to_dtype(result, self.dtype) return result diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py index 04959750..f30f7627 100644 --- a/tests/integration/transformers/test_categorical.py +++ b/tests/integration/transformers/test_categorical.py @@ -113,6 +113,30 @@ def test__reverse_transform_nans(self): # Asserts pd.testing.assert_series_equal(output[column], data[column]) + def test__reverse_transform_nans_pandas_warning(self): + """Test ``_reverse_transform`` for data with NaNs. + + Here we check that no pandas warning is raised. 
+ """ + # Setup + intervals = {'United-States': [0.0, 0.8], None: [0.8, 0.9], 'Jamaica': [0.9, 0.99]} + data = pd.Series([0.107995, 0.148025, 0.632702], name='native-country', dtype=float) + transformer = UniformEncoder() + transformer.intervals = intervals + transformer.dtype = 'O' + + # Run + with warnings.catch_warnings(record=True) as w: + result = transformer._reverse_transform(data) + + assert len(w) == 0 + + # Asserts + expected_result = pd.Series( + ['United-States', 'United-States', 'United-States'], name='native-country' + ) + pd.testing.assert_series_equal(result, expected_result) + def test_uniform_encoder_unseen_transform_nan(self): """Ensure UniformEncoder works when np.nan to transform wasn't seen during fit.""" # Setup From 4c6a71b2b0307dd2776a3f901129cf31ba06dbde Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 9 May 2024 15:06:21 -0500 Subject: [PATCH 16/17] =?UTF-8?q?Bump=20version:=201.12.1.dev1=20=E2=86=92?= =?UTF-8?q?=201.12.1.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- rdt/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9093c784..f3500968 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.1.dev1" +current_version = "1.12.1.dev2" parse = '(?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/rdt/__init__.py b/rdt/__init__.py index 146d8970..caeb32c9 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -4,7 +4,7 @@ __author__ = 'DataCebo, Inc.' 
__email__ = 'info@sdv.dev' -__version__ = '1.12.1.dev1' +__version__ = '1.12.1.dev2' import sys From dc79d97ab0f1dab7dbc65ceeb22970c823324564 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 9 May 2024 16:03:36 -0500 Subject: [PATCH 17/17] 1.12.1 Release Notes (#821) --- HISTORY.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 19c8109b..e6253ff6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,22 @@ # History +## 1.12.1 - 2024-05-09 + +This release handles a pandas warning that was showing up in the `UniformEncoder`. + +### Bugs Fixed + +* Fix pandas FutureWarning in UniformEncoder - Issue [#819](https://github.com/sdv-dev/RDT/issues/819) by @R-Palazzo + +### Maintenance + +* Switch to using ruff for Python linting and code formatting - Issue [#765](https://github.com/sdv-dev/RDT/issues/765) by @gsheni +* Only run unit and integration tests on oldest and latest python versions for macos - Issue [#812](https://github.com/sdv-dev/RDT/issues/812) by @R-Palazzo + +### Internal + +* Refactoring code for Enterprise issue #529 - PR[#815](https://github.com/sdv-dev/RDT/pull/815) by @amontanez24 + ## 1.12.0 - 2024-04-19 This release adds a new parameter to the `RegexGenerator` called `generation_order`. This parameter lets users change if they want the generated values for the regex to come out in alphanumeric or scrambled order. Additionally, warnings that were disrupting the progress bar are handled.