diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index b15bf4adc..000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Generate Docs - -on: - push: - branches: [ stable ] - -jobs: - - docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - name: Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - - name: Build - run: | - python -m pip install --upgrade pip - pip install -e .[dev] - make docs - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{secrets.GITHUB_TOKEN}} - publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a5c5f7283..579c1b37d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,24 +5,78 @@ on: - pull_request jobs: - build: + lint: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e lint + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] # skip windows bc rundoc fails steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e readme + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: 
[ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install tox tox-gh-actions + - name: Test with tox + run: tox -e pytest + minimum: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions - name: Test with tox - run: tox + run: tox -e minimum diff --git a/.travis.yml b/.travis.yml index 388b7d3a1..bd6bd1740 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,21 @@ # Config file for automatic testing at travis-ci.org +os: linux dist: bionic language: python python: - - 3.8 - - 3.7 - 3.6 - - 3.5 + - 3.7 + - 3.8 +env: + - TOXENV=lint + - TOXENV=readme + - TOXENV=pytest + - TOXENV=minimum # Command to install dependencies -install: pip install -U tox-travis codecov +install: + - pip install -U tox-travis codecov after_success: codecov -# Command to run tests script: tox diff --git a/HISTORY.md b/HISTORY.md index 267a8bf17..6745f5d82 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,54 @@ # History +## 0.2.10 - 2020-12-18 + +This release adds a new argument to the `HyperTransformer` which gives control over +which transformers to use by default for each `dtype` if no specific transformer +has been specified for the field. + +This is also the first version to be officially released on conda. 
+ +### Issues closed + +* Add `dtype_transformers` argument to HyperTransformer - Issue [#148](https://github.com/sdv-dev/RDT/issues/148) by @csala +* Makes Copulas an optional dependency - Issue [#144](https://github.com/sdv-dev/RDT/issues/144) by @fealho + +## 0.2.9 - 2020-11-27 + +This release fixes a bug that prevented the `CategoricalTransformer` from working properly +when being passed data that contained numerical data only, without any strings, but also +contained `None` or `NaN` values. + +### Issues closed + +* KeyError: nan - CategoricalTransformer fails on numerical + nan data only - Issue [#142](https://github.com/sdv-dev/RDT/issues/142) by @csala + +## 0.2.8 - 2020-11-20 + +This release fixes a few minor bugs, including some which prevented RDT from fully working +on Windows systems. + +Thanks to these fixes, as well as a new testing infrastructure that has been set up, from now +on RDT is officially supported on Windows systems, as well as on the Linux and macOS systems +which were previously supported. + +### Issues closed + +* TypeError: unsupported operand type(s) for: 'NoneType' and 'int' - Issue [#132](https://github.com/sdv-dev/RDT/issues/132) by @csala +* Example does not work on Windows - Issue [#114](https://github.com/sdv-dev/RDT/issues/114) by @csala +* OneHotEncodingTransformer producing all zeros - Issue [#135](https://github.com/sdv-dev/RDT/issues/135) by @fealho +* OneHotEncodingTransformer support for lists and lists of lists - Issue [#137](https://github.com/sdv-dev/RDT/issues/137) by @fealho + +## 0.2.7 - 2020-10-16 + +In this release we drop the support for the now officially dead Python 3.5 +and introduce a new feature in the DatetimeTransformer which reduces the dimensionality +of the generated numerical values while also ensuring that the reverted datetimes +maintain the same level of time unit precision as the original ones. 
+ +* Drop Py35 support - Issue [#129](https://github.com/sdv-dev/RDT/issues/129) by @csala +* Add option to drop constant parts of the datetimes - Issue [#130](https://github.com/sdv-dev/RDT/issues/130) by @csala + ## 0.2.6 - 2020-10-05 * Add GaussianCopulaTransformer - Issue [#125](https://github.com/sdv-dev/RDT/issues/125) by @csala diff --git a/Makefile b/Makefile index f2cb5c603..779a40589 100644 --- a/Makefile +++ b/Makefile @@ -86,10 +86,7 @@ install-develop: clean-build clean-pyc ## install the package in editable mode a .PHONY: lint lint: ## check style with flake8 and isort - flake8 rdt - flake8 tests --ignore=D - isort -c --recursive rdt tests - pylint rdt --rcfile=setup.cfg + invoke lint .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort @@ -102,20 +99,15 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test-unit test-unit: ## run tests quickly with the default Python - python -m pytest --cov=rdt + invoke pytest .PHONY: test-readme test-readme: ## run the readme snippets - rm -rf tests/readme_test && mkdir tests/readme_test - cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md - rm -rf tests/readme_test + invoke readme .PHONY: test test: test-unit test-readme ## test everything that needs test dependencies -.PHONY: test-devel -test-devel: lint docs ## test everything that needs development dependencies - .PHONY: test-all test-all: ## test using tox tox -r diff --git a/README.md b/README.md index c97eebdbb..86d13f80e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) [![PyPi Shield](https://img.shields.io/pypi/v/RDT.svg)](https://pypi.python.org/pypi/RDT) -[![Travis CI Shield](https://travis-ci.org/sdv-dev/RDT.svg?branch=master)](https://travis-ci.org/sdv-dev/RDT) +[![Travis CI 
Shield](https://travis-ci.com/sdv-dev/RDT.svg?branch=master)](https://travis-ci.com/sdv-dev/RDT) [![Coverage Status](https://codecov.io/gh/sdv-dev/RDT/branch/master/graph/badge.svg)](https://codecov.io/gh/sdv-dev/RDT) [![Downloads](https://pepy.tech/badge/rdt)](https://pepy.tech/project/rdt) @@ -24,7 +24,8 @@ the transformations in order to revert them as needed. ## Requirements -**RDT** has been developed and tested on [Python 3.5, 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +**RDT** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +on GNU/Linux, macOS and Windows systems. Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid @@ -44,6 +45,16 @@ This will pull and install the latest stable release from [PyPi](https://pypi.or If you want to install from source or contribute to the project please read the [Contributing Guide](CONTRIBUTING.rst). +## Install with conda + +**RDT** can also be installed using [conda](https://docs.conda.io/en/latest/): + +```bash +conda install -c sdv-dev -c conda-forge rdt +``` + +This will pull and install the latest stable release from [Anaconda](https://anaconda.org/). + # Quickstart diff --git a/conda/README.md b/conda/README.md new file mode 100644 index 000000000..60cd558ee --- /dev/null +++ b/conda/README.md @@ -0,0 +1,29 @@ +## Instructions + +These are instructions to deploy the latest version of **RDT** to [conda](https://docs.conda.io/en/latest/). +It should be done after every new release. + +## Update the recipe +Prior to making the release on PyPI, you should update the meta.yaml to reflect any changes in the dependencies. +Note that you do not need to edit the version number as that is managed by bumpversion. + +## Make the PyPI release +Follow the standard release instructions to make a PyPI release. Then, return here to make the conda release. 
+ +## Build a package +As part of the PyPI release, you will have updated the stable branch. You should now check out the stable +branch and build the conda package. + +```bash +git checkout stable +cd conda +conda build -c sdv-dev -c conda-forge . +``` + +## Upload to Anaconda +Finally, you can upload the resulting package to Anaconda. + +```bash +anaconda login +anaconda upload -u sdv-dev +``` \ No newline at end of file diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 000000000..ca3cdfdaf --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,43 @@ +{% set name = 'rdt' %} +{% set version = '0.2.10.dev1' %} + +package: + name: "{{ name|lower }}" + version: "{{ version }}" + +source: + url: "https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz" + +build: + number: 0 + noarch: python + script: "{{ PYTHON }} -m pip install ." + +requirements: + host: + - faker >=1.0.1,<4.15.0 + - numpy >=1.17.4,<2 + - pandas >=1.1,<1.1.5 + - pip + - python + - scipy >=1.4,<2 + - pytest-runner + run: + - faker >=1.0.1,<4.15.0 + - numpy >=1.17.4,<2 + - pandas >=1.1,<1.1.5 + - python + - scipy >=1.4,<2 + +about: + home: "https://github.com/sdv-dev/RDT" + license: MIT + license_family: MIT + license_file: + summary: "Reversible Data Transforms" + doc_url: + dev_url: + +extra: + recipe-maintainers: + - sdv-dev diff --git a/rdt/__init__.py b/rdt/__init__.py index ed43d66f6..9aa0a6073 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.6' +__version__ = '0.2.10.dev1' import numpy as np import pandas as pd diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index fd576152d..cdec5ab7d 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -1,12 +1,14 @@ """Hyper transformer module.""" import re +import warnings +from copy import deepcopy import numpy as np from rdt.transformers import ( - 
BooleanTransformer, CategoricalTransformer, DatetimeTransformer, NumericalTransformer, - load_transformers) + BooleanTransformer, CategoricalTransformer, DatetimeTransformer, LabelEncodingTransformer, + NumericalTransformer, OneHotEncodingTransformer, load_transformers) class HyperTransformer: @@ -28,6 +30,10 @@ class HyperTransformer: dtypes (list or None): List of column data types to use when building the ``transformers`` dict automatically. If not passed, the ``DataFrame.dtypes`` are used. + dtype_transformers (dict or None): + Transformer templates to use for each dtype. Passed as a dictionary of + dtype kinds ('i', 'f', 'O', 'b', 'M') and transformer names, classes + or instances. Example: Create a simple ``HyperTransformer`` instance that will decide which transformers @@ -53,12 +59,35 @@ class HyperTransformer: >>> ht = HyperTransformer(transformers) """ - def __init__(self, transformers=None, copy=True, anonymize=None, dtypes=None): + _TRANSFORMER_TEMPLATES = { + 'numerical': NumericalTransformer, + 'integer': NumericalTransformer(dtype=int), + 'float': NumericalTransformer(dtype=float), + 'categorical': CategoricalTransformer, + 'categorical_fuzzy': CategoricalTransformer(fuzzy=True), + 'one_hot_encoding': OneHotEncodingTransformer(error_on_unknown=False), + 'label_encoding': LabelEncodingTransformer, + 'boolean': BooleanTransformer, + 'datetime': DatetimeTransformer, + } + _DTYPE_TRANSFORMERS = { + 'i': 'numerical', + 'f': 'numerical', + 'O': 'categorical', + 'b': 'boolean', + 'M': 'datetime', + } + + def __init__(self, transformers=None, copy=True, anonymize=None, + dtypes=None, dtype_transformers=None): self.transformers = transformers self._transformers = dict() self.copy = copy self.anonymize = anonymize or dict() self.dtypes = dtypes + self.dtype_transformers = self._DTYPE_TRANSFORMERS.copy() + if dtype_transformers: + self.dtype_transformers.update(dtype_transformers) def _analyze(self, data): """Build a ``dict`` with column names and transformers 
from a given ``pandas.DataFrame``. @@ -100,22 +129,26 @@ def _analyze(self, data): kind = np.dtype(dtype).kind except TypeError: # probably category - kind = dtype - - if kind == 'i': - transformer = NumericalTransformer(dtype=int) - elif kind == 'f': - transformer = NumericalTransformer(dtype=float) - elif kind in ('O', 'category'): - anonymize = self.anonymize.get(name) - transformer = CategoricalTransformer(anonymize=anonymize) - elif kind == 'b': - transformer = BooleanTransformer() - elif kind == 'M': - transformer = DatetimeTransformer() - else: + kind = 'O' + + transformer_template = self.dtype_transformers[kind] + if not transformer_template: raise ValueError('Unsupported dtype: {}'.format(dtype)) + if isinstance(transformer_template, str): + transformer_template = self._TRANSFORMER_TEMPLATES[transformer_template] + + if not isinstance(transformer_template, type): + transformer = deepcopy(transformer_template) + elif self.anonymize and transformer_template == CategoricalTransformer: + warnings.warn( + 'Categorical anonymization is deprecated and will be removed from RDT soon.', + DeprecationWarning + ) + transformer = CategoricalTransformer(anonymize=self.anonymize) + else: + transformer = transformer_template() + transformers[name] = transformer return transformers diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index 8d2f64e93..146ee3747 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -115,6 +115,9 @@ def _get_intervals(data): end = start + prob mean = (start + end) / 2 std = prob / 6 + if pd.isnull(value): + value = np.nan + intervals[value] = (start, end, mean, std) start = end @@ -144,7 +147,11 @@ def fit(self, data): def _get_value(self, category): """Get the value that represents this category.""" + if pd.isnull(category): + category = np.nan + mean, std = self.intervals[category][2:] + if self.fuzzy: return norm.rvs(mean, std) @@ -218,20 +225,56 @@ class 
OneHotEncodingTransformer(BaseTransformer): is found and 0s on the rest. Null values are considered just another category. + + Args: + error_on_unknown (bool): + If a value that was not seen during the fit stage is passed to + transform, then an error will be raised if this is True. """ dummy_na = None dummies = None + def __init__(self, error_on_unknown=True): + self.error_on_unknown = error_on_unknown + + @staticmethod + def _prepare_data(data): + """Transform data to appropriate format. + + If data is a valid list or a list of lists, transforms it into an np.array, + otherwise returns it. + + Args: + data (pandas.Series, numpy.ndarray, list or list of lists): + Data to prepare. + + Returns: + pandas.Series or numpy.ndarray + """ + if isinstance(data, list): + data = np.array(data) + + if len(data.shape) > 2: + raise ValueError('Unexpected format.') + if len(data.shape) == 2: + if data.shape[1] != 1: + raise ValueError('Unexpected format.') + + data = data[:, 0] + + return data + def fit(self, data): """Fit the transformer to the data. Get the pandas `dummies` which will be used later on for OneHotEncoding. Args: - data (pandas.Series or numpy.ndarray): + data (pandas.Series, numpy.ndarray, list or list of lists): Data to fit the transformer to. """ + data = self._prepare_data(data) self.dummy_na = pd.isnull(data).any() self.dummies = list(pd.get_dummies(data, dummy_na=self.dummy_na).columns) @@ -239,14 +282,20 @@ def transform(self, data): """Replace each category with the OneHot vectors. Args: - data (pandas.Series or numpy.ndarray): + data (pandas.Series, numpy.ndarray, list or list of lists): Data to transform. 
Returns: numpy.ndarray: """ + data = self._prepare_data(data) dummies = pd.get_dummies(data, dummy_na=self.dummy_na) - return dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) + array = dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) + for i, row in enumerate(array): + if np.all(row == 0) and self.error_on_unknown: + raise ValueError(f'The value {data[i]} was not seen during the fit stage.') + + return array def reverse_transform(self, data): """Convert float values back to the original categorical values. diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 8988e07b8..a2f6fa296 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -26,23 +26,45 @@ class DatetimeTransformer(BaseTransformer): If ``True``, always create the new column whether there are null values or not. If ``False``, do not create the new column. Defaults to ``None``. + strip_constant (bool): + Whether to optimize the output values by finding the smallest time unit that + is not zero on the training datetimes and dividing the generated numerical + values by the value of the next smallest time unit. This, apart from reducing the + orders of magnitude of the transformed values, ensures that reverted values always + are zero on the lower time units. 
""" null_transformer = None + divider = None - def __init__(self, nan='mean', null_column=None): + def __init__(self, nan='mean', null_column=None, strip_constant=False): self.nan = nan self.null_column = null_column + self.strip_constant = strip_constant - @staticmethod - def _transform(datetimes): + def _find_divider(self, transformed): + self.divider = 1 + multipliers = [10] * 9 + [60, 60, 24] + for multiplier in multipliers: + candidate = self.divider * multiplier + if np.mod(transformed, candidate).any(): + break + + self.divider = candidate + + def _transform(self, datetimes): """Transform datetime values to integer.""" nulls = datetimes.isnull() integers = np.zeros(len(datetimes)) - integers[~nulls] = datetimes[~nulls].astype(int).astype(float).values + integers[~nulls] = datetimes[~nulls].astype(np.int64).astype(np.float64).values integers[nulls] = np.nan - return pd.Series(integers) + transformed = pd.Series(integers) + if self.strip_constant: + self._find_divider(transformed) + transformed = transformed.floordiv(self.divider) + + return transformed def fit(self, data): """Fit the transformer to the data. 
@@ -55,7 +77,7 @@ def fit(self, data): data = pd.Series(data) transformed = self._transform(data) - self.null_transformer = NullTransformer(self.nan, self.null_column) + self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True) self.null_transformer.fit(transformed) def transform(self, data): @@ -88,5 +110,8 @@ def reverse_transform(self, data): if self.nan is not None: data = self.null_transformer.reverse_transform(data) - data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(int) + data[pd.notnull(data)] = np.round(data[pd.notnull(data)]).astype(np.int64) + if self.strip_constant: + data = data.astype(float) * self.divider + return pd.to_datetime(data) diff --git a/rdt/transformers/null.py b/rdt/transformers/null.py index 887e0e861..f2455a71f 100644 --- a/rdt/transformers/null.py +++ b/rdt/transformers/null.py @@ -111,6 +111,9 @@ def reverse_transform(self, data): data = pd.Series(data) if isnull.any(): + if self.copy: + data = data.copy() + data.iloc[isnull] = np.nan return data diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index b8a514110..a6ea1c79c 100644 --- a/rdt/transformers/numerical.py +++ b/rdt/transformers/numerical.py @@ -1,15 +1,15 @@ """Transformers for numerical data.""" - import copy import numpy as np import pandas as pd import scipy -from copulas import EPSILON, univariate from rdt.transformers.base import BaseTransformer from rdt.transformers.null import NullTransformer +EPSILON = np.finfo(np.float32).eps + class NumericalTransformer(BaseTransformer): """Transformer for numerical data. @@ -85,7 +85,7 @@ def reverse_transform(self, data): Data to transform. Returns: - pandas.Series + numpy.ndarray """ if self.nan is not None: data = self.null_transformer.reverse_transform(data) @@ -161,64 +161,77 @@ class GaussianCopulaTransformer(NumericalTransformer): * ``truncated_gaussian``: Use a Truncated Gaussian distribution. 
""" - _DISTRIBUTIONS = { - 'univariate': univariate.Univariate, - 'parametric': ( - univariate.Univariate, { - 'parametric': univariate.ParametricType.PARAMETRIC, - }, - ), - 'bounded': ( - univariate.Univariate, - { - 'bounded': univariate.BoundedType.BOUNDED, - }, - ), - 'semi_bounded': ( - univariate.Univariate, - { - 'bounded': univariate.BoundedType.SEMI_BOUNDED, - }, - ), - 'parametric_bounded': ( - univariate.Univariate, - { - 'parametric': univariate.ParametricType.PARAMETRIC, - 'bounded': univariate.BoundedType.BOUNDED, - }, - ), - 'parametric_semi_bounded': ( - univariate.Univariate, - { - 'parametric': univariate.ParametricType.PARAMETRIC, - 'bounded': univariate.BoundedType.SEMI_BOUNDED, - }, - ), - 'gaussian': univariate.GaussianUnivariate, - 'gamma': univariate.GammaUnivariate, - 'beta': univariate.BetaUnivariate, - 'student_t': univariate.StudentTUnivariate, - 'gaussian_kde': univariate.GaussianKDE, - 'truncated_gaussian': univariate.TruncatedGaussian, - } - _univariate = None def __init__(self, dtype=None, nan='mean', null_column=None, distribution='parametric'): super().__init__(dtype=dtype, nan=nan, null_column=null_column) + self._distributions = self._get_distributions() if isinstance(distribution, str): - distribution = self._DISTRIBUTIONS[distribution] + distribution = self._distributions[distribution] self._distribution = distribution + @staticmethod + def _get_distributions(): + try: + from copulas import univariate # pylint: disable=import-outside-toplevel + except ImportError as error: + error.msg += ( + '\n\nIt seems like `copulas` is not installed.\n' + 'Please install it using:\n\n pip install rdt[copulas]' + ) + raise + + return { + 'univariate': univariate.Univariate, + 'parametric': ( + univariate.Univariate, { + 'parametric': univariate.ParametricType.PARAMETRIC, + }, + ), + 'bounded': ( + univariate.Univariate, + { + 'bounded': univariate.BoundedType.BOUNDED, + }, + ), + 'semi_bounded': ( + univariate.Univariate, + { + 'bounded': 
univariate.BoundedType.SEMI_BOUNDED, + }, + ), + 'parametric_bounded': ( + univariate.Univariate, + { + 'parametric': univariate.ParametricType.PARAMETRIC, + 'bounded': univariate.BoundedType.BOUNDED, + }, + ), + 'parametric_semi_bounded': ( + univariate.Univariate, + { + 'parametric': univariate.ParametricType.PARAMETRIC, + 'bounded': univariate.BoundedType.SEMI_BOUNDED, + }, + ), + 'gaussian': univariate.GaussianUnivariate, + 'gamma': univariate.GammaUnivariate, + 'beta': univariate.BetaUnivariate, + 'student_t': univariate.StudentTUnivariate, + 'gaussian_kde': univariate.GaussianKDE, + 'truncated_gaussian': univariate.TruncatedGaussian, + } + def _get_univariate(self): distribution = self._distribution - if isinstance(distribution, univariate.Univariate): + if isinstance(distribution, self._distributions['univariate']): return copy.deepcopy(distribution) if isinstance(distribution, tuple): return distribution[0](**distribution[1]) - if isinstance(distribution, type) and issubclass(distribution, univariate.Univariate): + if isinstance(distribution, type) and \ + issubclass(distribution, self._distributions['univariate']): return distribution() raise TypeError('Invalid distribution: {}'.format(distribution)) diff --git a/setup.cfg b/setup.cfg index a21d7c635..f7a5ce8fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.6 +current_version = 0.2.10.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
@@ -24,6 +24,10 @@ replace = version='{new_version}' search = __version__ = '{current_version}' replace = __version__ = '{new_version}' +[bumpversion:file:conda/meta.yaml] +search = version = '{current_version}' +replace = version = '{new_version}' + [bdist_wheel] universal = 1 @@ -52,6 +56,7 @@ ignore = D107, D407 [pylint] extension-pkg-whitelist = numpy min-similarity-lines = 5 +max-args = 8 ignore-comments = yes ignore-docstrings = yes ignore-imports = yes diff --git a/setup.py b/setup.py index 09c644fbd..0d2c17173 100644 --- a/setup.py +++ b/setup.py @@ -12,13 +12,15 @@ history = history_file.read() install_requires = [ - 'numpy>=1.15.4,<2', - 'pandas>=0.21,<2', - 'scipy>=1.1.0,<2', - 'Faker>=1.0.1,<2', - 'copulas>=0.3.0,<0.4', + 'numpy>=1.17.4,<2', + 'pandas>=1.1,<1.1.5', + 'scipy>=1.4,<2', + 'Faker>=1.0.1,<4.15', ] +copulas_requires = [ + 'copulas>=0.3.3,<0.4', +] setup_requires = [ 'pytest-runner>=2.11.1', ] @@ -28,8 +30,11 @@ 'pytest-cov>=2.6.0', 'jupyter>=1.0.0,<2', 'rundoc>=0.4.3,<0.5', + 'copulas>=0.3.3,<0.4', ] + + development_requires = [ # general 'bumpversion>=0.5.3,<0.6', @@ -61,6 +66,9 @@ # Advanced testing 'coverage>=4.5.1,<6', 'tox>=2.9.1,<4', + + # Invoking test commands + 'invoke' ] setup( @@ -72,15 +80,15 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - description='Reversible Data Transformsi', + description='Reversible Data Transforms', extras_require={ - 'test': tests_require, - 'dev': development_requires + tests_require, + 'copulas': copulas_requires, + 'test': tests_require + copulas_requires, + 'dev': development_requires + tests_require + copulas_requires, }, include_package_data=True, install_requires=install_requires, @@ -90,11 +98,11 @@ long_description_content_type='text/markdown', name='rdt', 
packages=find_packages(include=['rdt', 'rdt.*']), - python_requires='>=3.5,<3.9', + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.6', + version='0.2.10.dev1', zip_safe=False, ) diff --git a/tasks.py b/tasks.py new file mode 100644 index 000000000..13af582a2 --- /dev/null +++ b/tasks.py @@ -0,0 +1,82 @@ +import os +import re +import shutil +import stat +from pathlib import Path + +from invoke import task + + +@task +def pytest(c): + c.run('python -m pytest --cov=rdt') + + +@task +def install_minimum(c): + with open('setup.py', 'r') as setup_py: + lines = setup_py.read().splitlines() + + versions = [] + started = False + for line in lines: + if started: + if line == ']': + started = False + continue + + line = line.strip() + line = re.sub(r',?<=?[\d.]*,?', '', line) + line = re.sub(r'>=?', '==', line) + line = re.sub(r"""['",]""", '', line) + versions.append(line) + + elif line.startswith('install_requires = [') or \ + line.startswith('copulas_requires = ['): + started = True + + c.run(f'python -m pip install {" ".join(versions)}') + + +@task +def minimum(c): + install_minimum(c) + c.run('python -m pip check') + c.run('python -m pytest') + + +@task +def readme(c): + test_path = Path('tests/readme_test') + if test_path.exists() and test_path.is_dir(): + shutil.rmtree(test_path) + + cwd = os.getcwd() + os.makedirs(test_path, exist_ok=True) + shutil.copy('README.md', test_path / 'README.md') + os.chdir(test_path) + c.run('rundoc run --single-session python3 -t python3 README.md') + os.chdir(cwd) + shutil.rmtree(test_path) + + +@task +def lint(c): + c.run('flake8 rdt') + c.run('flake8 tests --ignore=D') + c.run('isort -c --recursive rdt tests') + c.run('pylint rdt --rcfile=setup.cfg') + + +def remove_readonly(func, path, _): + "Clear the readonly bit and reattempt the removal" + os.chmod(path, stat.S_IWRITE) + func(path) + + +@task +def rmdir(c, path): + 
try: + shutil.rmtree(path, onerror=remove_readonly) + except PermissionError: + pass diff --git a/tests/integration/test_hyper_transformer.py b/tests/integration/test_hyper_transformer.py index 3ce56adcf..bd83b447b 100644 --- a/tests/integration/test_hyper_transformer.py +++ b/tests/integration/test_hyper_transformer.py @@ -45,13 +45,13 @@ def get_transformers(): 'integer': { 'class': 'NumericalTransformer', 'kwargs': { - 'dtype': int, + 'dtype': np.int64, } }, 'float': { 'class': 'NumericalTransformer', 'kwargs': { - 'dtype': float, + 'dtype': np.float64, } }, 'categorical': { diff --git a/tests/integration/transformers/test_categorical.py b/tests/integration/transformers/test_categorical.py new file mode 100644 index 000000000..461f94d0c --- /dev/null +++ b/tests/integration/transformers/test_categorical.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd + +from rdt.transformers import CategoricalTransformer + + +def test_categorical_numerical_nans(): + """Ensure CategoricalTransformers work on numerical + nan only columns.""" + + data = pd.Series([1, 2, float('nan'), np.nan]) + + ct = CategoricalTransformer() + ct.fit(data) + transformed = ct.transform(data) + reverse = ct.reverse_transform(transformed) + + pd.testing.assert_series_equal(reverse, data) diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index 07f1caf33..535a6c317 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -1,75 +1,127 @@ import numpy as np -from rdt.transformers.numerical import GaussianCopulaTransformer +from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer -def test_copula_transformer_stats(): - data = np.random.normal(loc=4, scale=4, size=1000) +class TestNumericalTransformer: - ct = GaussianCopulaTransformer() - transformed = ct.fit_transform(data) + def test_null_column(self): + data = np.array([1, 2, 1, 2, 
np.nan, 1]) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (1000, ) + nt = NumericalTransformer() + transformed = nt.fit_transform(data) - np.testing.assert_almost_equal(transformed.mean(), 0, decimal=1) - np.testing.assert_almost_equal(transformed.std(), 1, decimal=1) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] - reverse = ct.reverse_transform(transformed) + reverse = nt.reverse_transform(transformed) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_not_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) -def test_copula_transformer_null_column(): - data = np.array([1, 2, 1, 2, np.nan, 1]) + nt = NumericalTransformer(null_column=False) + transformed = nt.fit_transform(data) - ct = GaussianCopulaTransformer() - transformed = ct.fit_transform(data) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, ) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, 2) - assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] + reverse = nt.reverse_transform(transformed) - reverse = ct.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_int(self): + data = np.array([1, 2, 1, 2, 1]) + nt = NumericalTransformer(dtype=int) + transformed = nt.fit_transform(data) -def test_copula_transformer_not_null_column(): - data = np.array([1, 2, 1, 2, np.nan, 1]) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (5, ) - ct = GaussianCopulaTransformer(null_column=False) - transformed = ct.fit_transform(data) + reverse = nt.reverse_transform(transformed) + assert list(reverse) == [1, 2, 1, 2, 1] - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, ) + def 
test_int_nan(self): + data = np.array([1, 2, 1, 2, 1, np.nan]) - reverse = ct.reverse_transform(transformed) + nt = NumericalTransformer(dtype=int) + transformed = nt.fit_transform(data) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + reverse = nt.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) -def test_copula_transformer_int(): - data = np.array([1, 2, 1, 2, 1]) - ct = GaussianCopulaTransformer(dtype=int) - transformed = ct.fit_transform(data) +class TestGaussianCopulaTransformer: - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (5, ) + def test_stats(self): + data = np.random.normal(loc=4, scale=4, size=1000) - reverse = ct.reverse_transform(transformed) - assert list(reverse) == [1, 2, 1, 2, 1] + ct = GaussianCopulaTransformer() + transformed = ct.fit_transform(data) + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (1000, ) -def test_copula_transformer_int_nan(): - data = np.array([1, 2, 1, 2, 1, np.nan]) + np.testing.assert_almost_equal(transformed.mean(), 0, decimal=1) + np.testing.assert_almost_equal(transformed.std(), 1, decimal=1) - ct = GaussianCopulaTransformer(dtype=int) - transformed = ct.fit_transform(data) + reverse = ct.reverse_transform(transformed) - assert isinstance(transformed, np.ndarray) - assert transformed.shape == (6, 2) + np.testing.assert_array_almost_equal(reverse, data, decimal=1) - reverse = ct.reverse_transform(transformed) - np.testing.assert_array_almost_equal(reverse, data, decimal=2) + def test_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) + + ct = GaussianCopulaTransformer() + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + assert list(transformed[:, 1]) == [0, 0, 0, 0, 1, 0] + + reverse = ct.reverse_transform(transformed) + + 
np.testing.assert_array_almost_equal(reverse, data, decimal=2) + + def test_not_null_column(self): + data = np.array([1, 2, 1, 2, np.nan, 1]) + + ct = GaussianCopulaTransformer(null_column=False) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, ) + + reverse = ct.reverse_transform(transformed) + + np.testing.assert_array_almost_equal(reverse, data, decimal=2) + + def test_int(self): + data = np.array([1, 2, 1, 2, 1]) + + ct = GaussianCopulaTransformer(dtype=int) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (5, ) + + reverse = ct.reverse_transform(transformed) + assert list(reverse) == [1, 2, 1, 2, 1] + + def test_int_nan(self): + data = np.array([1, 2, 1, 2, 1, np.nan]) + + ct = GaussianCopulaTransformer(dtype=int) + transformed = ct.fit_transform(data) + + assert isinstance(transformed, np.ndarray) + assert transformed.shape == (6, 2) + + reverse = ct.reverse_transform(transformed) + np.testing.assert_array_almost_equal(reverse, data, decimal=2) diff --git a/tests/test_hyper_transformer.py b/tests/test_hyper_transformer.py index cdd9a7c24..5cfd884b7 100644 --- a/tests/test_hyper_transformer.py +++ b/tests/test_hyper_transformer.py @@ -1,5 +1,5 @@ from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import Mock import numpy as np import pandas as pd @@ -7,7 +7,7 @@ from rdt import HyperTransformer from rdt.transformers import ( - BooleanTransformer, CategoricalTransformer, DatetimeTransformer, NumericalTransformer) + BooleanTransformer, DatetimeTransformer, NumericalTransformer, OneHotEncodingTransformer) class TestHyperTransformerTransformer(TestCase): @@ -22,123 +22,33 @@ def test___init__(self): self.assertEqual(ht.anonymize, dict()) self.assertEqual(ht.dtypes, None) - def test__analyze_int(self): - """Test _analyze int dtype""" + def test__analyze(self): + """Test _analyze""" # Setup - 
data = pd.DataFrame({ - 'integers': [1, 2, 3, 4, 5, None, 6, 7, 8, 9, 0] - }) - - dtypes = [int] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = NumericalTransformer - - self.assertIsInstance(result['integers'], expect_class) - - def test__analyze_float(self): - """Test _analyze float dtype""" - # Setup - data = pd.DataFrame({ - 'floats': [1.1, 2.2, 3.3, 4.4, 5.5, None, 6.6, 7.7, 8.8, 9.9, 0.0] - }) - - dtypes = [float] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = NumericalTransformer - - self.assertIsInstance(result['floats'], expect_class) - - def test__analyze_object(self): - """Test _analyze object dtype""" - # Setup - data = pd.DataFrame({ - 'objects': ['foo', 'bar', None, 'tar'] - }) - - dtypes = [np.object] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = CategoricalTransformer - - self.assertIsInstance(result['objects'], expect_class) - - def test__analyze_bool(self): - """Test _analyze bool dtype""" - # Setup - data = pd.DataFrame({ - 'booleans': [True, False, None, False, True] - }) - - dtypes = [bool] + hp = HyperTransformer(dtype_transformers={'O': 'one_hot_encoding'}) # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) - - # Asserts - expect_class = BooleanTransformer - - self.assertIsInstance(result['booleans'], expect_class) - - def test__analyze_datetime64(self): - """Test _analyze datetime64 dtype""" - # Setup data = pd.DataFrame({ - 'datetimes': ['1965-05-23', None, '1997-10-17'] + 'int': [1, 2, None], + 'float': [1.0, 2.0, None], + 'object': ['foo', 'bar', None], + 'category': [1, 2, None], + 'bool': [True, False, None], + 'datetime': pd.to_datetime(['1965-05-23', None, 
'1997-10-17']), }) - - data['datetimes'] = pd.to_datetime(data['datetimes'], format='%Y-%m-%d', errors='coerce') - - dtypes = [np.datetime64] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - result = HyperTransformer._analyze(transformer, data) + data['category'] = data['category'].astype('category') + result = hp._analyze(data) # Asserts - expect_class = DatetimeTransformer - - self.assertIsInstance(result['datetimes'], expect_class) - - @patch('rdt.hyper_transformer.np.dtype', new=Mock()) - def test__analyze_raise_error(self): - """Test _analyze raise error""" - # Setup - data = Mock() - data.columns = ['foo'] - - dtypes = [Mock()] - - # Run - transformer = Mock() - transformer.dtypes = dtypes - - with self.assertRaises(ValueError): - HyperTransformer._analyze(transformer, data) + assert isinstance(result, dict) + assert set(result.keys()) == {'int', 'float', 'object', 'category', 'bool', 'datetime'} + + assert isinstance(result['int'], NumericalTransformer) + assert isinstance(result['float'], NumericalTransformer) + assert isinstance(result['object'], OneHotEncodingTransformer) + assert isinstance(result['category'], OneHotEncodingTransformer) + assert isinstance(result['bool'], BooleanTransformer) + assert isinstance(result['datetime'], DatetimeTransformer) def test_fit_with_analyze(self): """Test fit and analyze the transformers""" diff --git a/tests/transformers/test_boolean.py b/tests/transformers/test_boolean.py index ef49d1f1a..201f3ff9b 100644 --- a/tests/transformers/test_boolean.py +++ b/tests/transformers/test_boolean.py @@ -180,5 +180,5 @@ def test_reverse_transform_not_null_values(self): # Asserts expected = np.array([True, False, True]) - assert type(result) == pd.Series + assert isinstance(result, pd.Series) np.testing.assert_equal(result.values, expected) diff --git a/tests/transformers/test_categorical.py b/tests/transformers/test_categorical.py index 971c9111f..873033399 100644 --- a/tests/transformers/test_categorical.py +++ 
b/tests/transformers/test_categorical.py @@ -283,6 +283,48 @@ def test_reversible_mixed(self): class TestOneHotEncodingTransformer: + def test__prepare_data_empty_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + data = [[], [], []] + + # Assert + with pytest.raises(ValueError): + ohet._prepare_data(data) + + def test__prepare_data_nested_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + data = [[[]]] + + # Assert + with pytest.raises(ValueError): + ohet._prepare_data(data) + + def test__prepare_data_list_of_lists(self): + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = [['a'], ['b'], ['c']] + out = ohet._prepare_data(data) + + # Assert + expected = np.array(['a', 'b', 'c']) + np.testing.assert_array_equal(out, expected) + + def test__prepare_data_pandas_series(self): + # Setup + ohet = OneHotEncodingTransformer() + + # Run + data = pd.Series(['a', 'b', 'c']) + out = ohet._prepare_data(data) + + # Assert + expected = pd.Series(['a', 'b', 'c']) + np.testing.assert_array_equal(out, expected) + def test_fit_no_nans(self): # Setup ohet = OneHotEncodingTransformer() @@ -367,6 +409,16 @@ def test_transform_single(self): ]) np.testing.assert_array_equal(out, expected) + def test_transform_all_zeros(self): + # Setup + ohet = OneHotEncodingTransformer() + data = pd.Series(['a']) + ohet.fit(data) + + # Assert + with np.testing.assert_raises(ValueError): + ohet.transform(['b']) + def test_reverse_transform_no_nans(self): # Setup ohet = OneHotEncodingTransformer() diff --git a/tests/transformers/test_datetime.py b/tests/transformers/test_datetime.py index 1bce84a37..e35a0eb14 100644 --- a/tests/transformers/test_datetime.py +++ b/tests/transformers/test_datetime.py @@ -1,164 +1,51 @@ -from unittest import TestCase -from unittest.mock import Mock - import numpy as np import pandas as pd from rdt.transformers import DatetimeTransformer -class TestDatetimeTransformer(TestCase): - - def test___init__(self): - """Test default instance""" - # 
Run - transformer = DatetimeTransformer() - - # Asserts - self.assertEqual(transformer.nan, 'mean', "Unexpected nan") - self.assertIsNone(transformer.null_column, "null_column is None by default") - self.assertIsNone(transformer.null_transformer, "null_transformer is None by default") - - def test__transform(self): - """Test transform datetimes series to integer""" - # Setup - data = pd.Series([None, '1996-10-17', '1965-05-23']) - data = pd.to_datetime(data) - - # Run - result = DatetimeTransformer._transform(data) - - # Asserts - expect = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - pd.testing.assert_series_equal(result, expect) - - def test_fit(self): - """Test fit nan custom value with numpy.array""" - # Setup - data = pd.to_datetime([None, '1996-10-17', '1965-05-23']).values - - # Run - transformer = DatetimeTransformer(nan='nan') - transformer.fit(data) - - # Asserts - expect_nan = 'nan' - expect_fill_value = 'nan' - - self.assertEqual( - transformer.nan, - expect_nan, - 'Unexpected nan' - ) - self.assertEqual( - transformer.null_transformer.fill_value, - expect_fill_value, - "Data mean is wrong" - ) - - def test_transform_array(self): - """Test tranform datetime arary""" - # Setup - data = pd.to_datetime([None, '1996-10-17', '1965-05-23']).values - - data_transform = pd.Series([np.nan, 845510400000000000, -145497600000000000]) +class TestDatetimeTransformer: - # Run - transformer = Mock() - transformer._transform.return_value = data_transform - - DatetimeTransformer.transform(transformer, data) - - # Asserts - exp_call_data = pd.Series([None, '1996-10-17', '1965-05-23']) - expect_call_args = pd.to_datetime(exp_call_data) - expect_call_count = 1 - - pd.testing.assert_series_equal( - transformer._transform.call_args[0][0], - expect_call_args - ) - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "NullTransformer.transform must be called only once." 
- ) - - def test_transform_series(self): - """Test transform datetime series""" - # Setup - data = pd.Series([None, '1996-10-17', '1965-05-23']) - data = pd.to_datetime(data) - - data_transform = pd.Series([np.nan, 845510400000000000, -145497600000000000]) + def test_no_strip(self): + dtt = DatetimeTransformer(strip_constant=False) + data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23'])) # Run - transformer = Mock() - transformer._transform.return_value = data_transform - - DatetimeTransformer.transform(transformer, data) + transformed = dtt.fit_transform(data.copy().to_numpy()) + reverted = dtt.reverse_transform(transformed) # Asserts - exp_call_data = pd.Series([None, '1996-10-17', '1965-05-23']) - expect_call_args = pd.to_datetime(exp_call_data) - expect_call_count = 1 - - pd.testing.assert_series_equal( - transformer._transform.call_args[0][0], - expect_call_args - ) - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "NullTransformer.transform must be called only once." 
- ) - - def test_reverse_transform_nan_not_ignore(self): - """Test reverse_transform with nan not equal to ignore""" - # Setup - data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) + expect_trans = np.array([ + [350006400000000000, 1.0], + [845510400000000000, 0.0], + [-145497600000000000, 0.0] + ]) + np.testing.assert_almost_equal(expect_trans, transformed) + pd.testing.assert_series_equal(reverted, data) - reversed_data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) + def test_strip(self): + dtt = DatetimeTransformer(strip_constant=True) + data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23'])) # Run - transformer = Mock() - transformer.nan = 'mean' - transformer.null_transformer.reverse_transform.return_value = reversed_data - - DatetimeTransformer.reverse_transform(transformer, data) + transformed = dtt.fit_transform(data.copy().to_numpy()) + reverted = dtt.reverse_transform(transformed) # Asserts - expect_reverse_call_count = 1 - - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expect_reverse_call_count, - "NullTransformer.reverse_transform must be called when nan is not ignore" - ) - - def test_reverse_transform_nan_ignore(self): - """Test reverse_transform with nan equal to ignore""" - # Setup - data = pd.Series([np.nan, 845510400000000000, -145497600000000000]) - - # Run - transformer = Mock() - transformer.nan = None + expect_trans = np.array([ + [4051.0, 1.0], + [9786.0, 0.0], + [-1684.0, 0.0] + ]) + np.testing.assert_almost_equal(expect_trans, transformed) + pd.testing.assert_series_equal(reverted, data) - result = DatetimeTransformer.reverse_transform(transformer, data) + def test_reverse_transform_all_none(self): + dt = pd.to_datetime(['2020-01-01']) + dtt = DatetimeTransformer(strip_constant=True) + dtt.fit(dt) - # Asserts - expect = pd.Series([ - np.nan, - pd.to_datetime(845510400000000000), - pd.to_datetime(-145497600000000000) - ]) - expect_reverse_call_count = 0 + 
output = dtt.reverse_transform(np.array([None])) - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expect_reverse_call_count, - "NullTransformer.reverse_transform won't be called when nan is ignore" - ) + expected = pd.to_datetime(['NaT']) + pd.testing.assert_series_equal(output.to_series(), expected.to_series()) diff --git a/tests/transformers/test_numerical.py b/tests/transformers/test_numerical.py index 89161052b..b799be321 100644 --- a/tests/transformers/test_numerical.py +++ b/tests/transformers/test_numerical.py @@ -1,9 +1,7 @@ from unittest import TestCase -from unittest.mock import Mock, patch import copulas import numpy as np -import pandas as pd import pytest from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer @@ -11,15 +9,13 @@ class TestNumericalTransformer(TestCase): - def test___init__(self): - """Test default instance""" - # Run - transformer = NumericalTransformer() + def test___init__super_attrs(self): + """super() arguments are properly passed and set as attributes.""" + nt = NumericalTransformer(dtype='int', nan='mode', null_column=False) - # Asserts - self.assertEqual(transformer.nan, 'mean', "Unexpected nan") - self.assertIsNone(transformer.null_column, "null_column is None by default") - self.assertIsNone(transformer.dtype, "dtype is None by default") + assert nt.dtype == 'int' + assert nt.nan == 'mode' + assert nt.null_column is False def test_fit(self): """Test fit nan mean with numpy.array""" @@ -34,125 +30,8 @@ def test_fit(self): expect_fill_value = 'nan' expect_dtype = np.float - self.assertEqual( - transformer.null_transformer.fill_value, - expect_fill_value, - "Data mean is wrong" - ) - - self.assertEqual( - transformer._dtype, - expect_dtype, - "Expected dtype: float" - ) - - def test_transform_array(self): - """Test transform numpy.array""" - # Setup - data = np.array([1.5, None, 2.5]) - - # Run - transformer = Mock() 
- NumericalTransformer.transform(transformer, data) - - # Asserts - expect_call_count = 1 - - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "Transform must be called only once" - ) - - def test_transform_series(self): - """Test transform pandas.Series""" - # Setup - data = pd.Series([1.5, None, 2.5]) - - # Run - transformer = Mock() - NumericalTransformer.transform(transformer, data) - - # Asserts - expect_call_count = 1 - - self.assertEqual( - transformer.null_transformer.transform.call_count, - expect_call_count, - "Transform must be called only once" - ) - - def test_reverse_transform_nan_ignore(self): - """Test reverse_transform with nan equal to ignore""" - # Setup - data = pd.Series([1.5, None, 2.5]) - - # Run - transformer = Mock() - transformer.nan = None - transformer._dtype = np.float - - result = NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expect = pd.Series([1.5, None, 2.5]) - expected_reverse_transform_call_count = 0 - - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform can't be called when nan is ignore" - ) - - def test_reverse_transform_nan_not_ignore(self): - """Test reverse_transform with nan not equal to ignore""" - # Setup - data = pd.Series([1.5, 2.0, 2.5]) - reversed_data = pd.Series([1.5, 2.0, 2.5]) - - # Run - transformer = Mock() - transformer.nan = 'mean' - transformer._dtype = np.float - transformer.null_transformer.nulls = False - transformer.null_transformer.reverse_transform.return_value = reversed_data - - NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expected_reverse_transform_call_count = 1 - - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform must be called at least 
once" - ) - - @patch('numpy.round') - def test_reverse_transform_dtype_int(self, numpy_mock): - """Test reverse_transform with dtype equal to int""" - # Setup - numpy_mock.return_value = pd.Series([3, 2, 3]) - data = pd.Series([3.0, 2.0, 3.0]) - - # Run - transformer = Mock() - transformer.nan = None - transformer._dtype = np.int - - result = NumericalTransformer.reverse_transform(transformer, data) - - # Asserts - expect = pd.Series([3, 2, 3]) - expected_reverse_transform_call_count = 0 - - pd.testing.assert_series_equal(result, expect) - self.assertEqual( - transformer.null_transformer.reverse_transform.call_count, - expected_reverse_transform_call_count, - "NullTransformer.reverse_transform must be called at least once" - ) + assert transformer.null_transformer.fill_value == expect_fill_value + assert transformer._dtype == expect_dtype class TestGaussianCopulaTransformer: diff --git a/tox.ini b/tox.ini index c8ace7f65..83282478a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,31 +1,34 @@ [tox] -envlist = py3{5,6,7,8}, test-devel +envlist = py3{6,7,8}-{lint,readme,pytest,minimum} [travis] python = - 3.8: py38, test-devel - 3.7: py37 - 3.6: py36 - 3.5: py35 + 3.8: py38-lint, py38-readme, py38-pytest, py38-minimum + 3.7: py37-lint, py37-readme, py37-pytest, py37-minimum + 3.6: py36-lint, py36-readme, py36-pytest, py36-minimum [gh-actions] python = - 3.8: py38, test-devel - 3.7: py37, - 3.6: py36 - 3.5: py35 + 3.8: py38-lint, py38-readme, py38-pytest, py38-minimum + 3.7: py37-lint, py37-readme, py37-pytest, py37-minimum + 3.6: py36-lint, py36-readme, py36-pytest, py36-minimum [testenv] passenv = CI TRAVIS TRAVIS_* -skipsdist = true -skip_install = true -commands_pre = - /usr/bin/env pip install .[test] +skipsdist = false +skip_install = false +deps = + invoke + readme: rundoc + tutorials: jupyter +extras = + lint: dev + pytest: test + minimum: test + tutorials: ctgan commands = - /usr/bin/env make test - -[testenv:test-devel] -commands_pre = - /usr/bin/env pip install 
.[dev] -commands = - /usr/bin/env make test-devel + lint: invoke lint + readme: invoke readme + pytest: invoke pytest + minimum: invoke minimum + invoke rmdir {envdir}