Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho committed Aug 17, 2021
2 parents 148ae9c + 4c866f9 commit c1c883d
Show file tree
Hide file tree
Showing 14 changed files with 261 additions and 78 deletions.
10 changes: 10 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# History

## 0.5.2 - 2021-08-16

This release fixes a couple of bugs introduced by the previous release regarding the
`OneHotEncoder` and the `BooleanTransformer`.

### Issues closed

* BooleanTransformer.reverse_transform sometimes crashes with TypeError - Issue [#210](https://github.com/sdv-dev/RDT/issues/210) by @katxiao
* OneHotEncoder causing shape misalignment in CopulaGAN, CTGAN, and TVAE - Issue [#208](https://github.com/sdv-dev/RDT/issues/208) by @sarahmish

## 0.5.1 - 2021-08-11

This release improves the overall performance of the library, both in terms of memory and time consumption.
Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set name = 'rdt' %}
{% set version = '0.5.1' %}
{% set version = '0.5.2.dev1' %}

package:
name: "{{ name|lower }}"
Expand Down
2 changes: 1 addition & 1 deletion rdt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

__author__ = """MIT Data To AI Lab"""
__email__ = '[email protected]'
__version__ = '0.5.1'
__version__ = '0.5.2.dev1'

import numpy as np
import pandas as pd
Expand Down
4 changes: 2 additions & 2 deletions rdt/transformers/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def fit(self, data):
if isinstance(data, np.ndarray):
data = pd.Series(data)

self.null_transformer = NullTransformer(self.nan, self.null_column)
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(data)

def transform(self, data):
Expand Down Expand Up @@ -85,4 +85,4 @@ def reverse_transform(self, data):

data = pd.Series(data)

return np.round(data).astype('boolean').astype('object')
return np.round(data).clip(0, 1).astype('boolean').astype('object')
42 changes: 21 additions & 21 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,11 @@ class OneHotEncodingTransformer(BaseTransformer):
"""

dummies = None
dummy_na = None
num_dummies = None
dummy_encoded = False
indexer = None
decoder = None
_dummy_na = None
_num_dummies = None
_dummy_encoded = False
_indexer = None
_uniques = None

def __init__(self, error_on_unknown=True):
self.error_on_unknown = error_on_unknown
Expand Down Expand Up @@ -297,19 +297,19 @@ def _prepare_data(data):
return data

def _transform(self, data):
if self.dummy_encoded:
coder = self.indexer
codes = pd.Categorical(data, categories=self.dummies).codes
if self._dummy_encoded:
coder = self._indexer
codes = pd.Categorical(data, categories=self._uniques).codes
else:
coder = self.dummies
coder = self._uniques
codes = data

rows = len(data)
dummies = np.broadcast_to(coder, (rows, self.num_dummies))
coded = np.broadcast_to(codes, (self.num_dummies, rows)).T
dummies = np.broadcast_to(coder, (rows, self._num_dummies))
coded = np.broadcast_to(codes, (self._num_dummies, rows)).T
array = (coded == dummies).astype(int)

if self.dummy_na:
if self._dummy_na:
null = np.zeros((rows, 1), dtype=int)
null[pd.isnull(data)] = 1
array = np.append(array, null, axis=1)
Expand All @@ -328,17 +328,17 @@ def fit(self, data):
data = self._prepare_data(data)

null = pd.isnull(data)
self.dummy_na = null.any()
self.dummies = list(pd.unique(data[~null]))
self.num_dummies = len(self.dummies)
self.indexer = list(range(self.num_dummies))
self.decoder = self.dummies.copy()
self._uniques = list(pd.unique(data[~null]))
self._dummy_na = null.any()
self._num_dummies = len(self._uniques)
self._indexer = list(range(self._num_dummies))
self.dummies = self._uniques.copy()

if not np.issubdtype(data.dtype, np.number):
self.dummy_encoded = True
self._dummy_encoded = True

if self.dummy_na:
self.decoder.append(np.nan)
if self._dummy_na:
self.dummies.append(np.nan)

def transform(self, data):
"""Replace each category with the OneHot vectors.
Expand Down Expand Up @@ -375,7 +375,7 @@ def reverse_transform(self, data):
data = data.reshape(-1, 1)

indices = np.argmax(data, axis=1)
return pd.Series(indices).map(self.decoder.__getitem__)
return pd.Series(indices).map(self.dummies.__getitem__)


class LabelEncodingTransformer(BaseTransformer):
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.1
current_version = 0.5.2.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/sdv-dev/RDT',
version='0.5.1',
version='0.5.2.dev1',
zip_safe=False,
)
83 changes: 83 additions & 0 deletions tests/integration/transformers/test_boolean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import numpy as np
import pandas as pd

from rdt.transformers import BooleanTransformer


class TestBooleanTransformer:
"""Round-trip (fit, transform, reverse_transform) tests for ``BooleanTransformer``."""

def test_boolean_some_nans(self):
"""Test BooleanTransformer on input with some nan values.

Ensure that the BooleanTransformer can fit, transform, and reverse
transform on boolean data with Nones. Expect that the reverse
transformed data is the same as the input.

Input:
- boolean data with None values
Output:
- The reversed transformed data
"""
# Setup
data = pd.Series([True, False, None, False])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformed = transformer.transform(data)
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)

def test_boolean_all_nans(self):
"""Test BooleanTransformer on input with all nan values.

Ensure that the BooleanTransformer can fit, transform, and reverse
transform on boolean data with all Nones. Expect that the reverse
transformed data is the same as the input.

Input:
- 4 rows of all None values
Output:
- The reversed transformed data
"""
# Setup
data = pd.Series([None, None, None, None])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformed = transformer.transform(data)
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)

def test_boolean_input_unchanged(self):
"""Test that ``reverse_transform`` does not modify the data it is given.

Ensure that the BooleanTransformer can fit, transform, and reverse
transform on boolean data with some Nones. Expect that the reverse
transformed data is the same as the input and that the intermediate
transformed data is unchanged.

Input:
- boolean data with None values
Output:
- The reversed transformed data
Side effects:
- The intermediate transformed data is unchanged.
"""
# Setup
data = pd.Series([True, False, None, False])
transformer = BooleanTransformer()

# Run
transformer.fit(data)
transformed = transformer.transform(data)
# Snapshot the transformed data before reversing so that any in-place
# modification made by ``reverse_transform`` can be detected below.
unchanged_transformed = transformed.copy()
reverse = transformer.reverse_transform(transformed)

# Assert
pd.testing.assert_series_equal(reverse, data)
np.testing.assert_array_equal(unchanged_transformed, transformed)
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
"memory": 1000000.0
},
"reverse_transform": {
"time": 0.003,
"time": 0.01,
"memory": 1000000.0
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"memory": 1000000.0
},
"reverse_transform": {
"time": 0.002,
"time": 0.01,
"memory": 500000.0
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"memory": 1000000.0
},
"reverse_transform": {
"time": 0.002,
"time": 0.01,
"memory": 500000.0
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"memory": 1000000.0
},
"reverse_transform": {
"time": 0.002,
"time": 0.01,
"memory": 500000.0
}
}
}
}
51 changes: 51 additions & 0 deletions tests/unit/transformers/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,54 @@ def test_reverse_transform_2d_ndarray(self):

assert isinstance(result, pd.Series)
np.testing.assert_equal(result.values, expected)

def test_reverse_transform_float_values(self):
"""Test the ``reverse_transform`` method with decimals.

Expect that the ``reverse_transform`` method handles decimal inputs
correctly by rounding them.

Input:
- Transformed data with decimal values.
Output:
- Reversed transformed data.
"""
# Setup
data = np.array([1.2, 0.32, 1.01])
# A Mock stands in for ``self``; ``nan`` is the only attribute set on it.
transformer = Mock()
transformer.nan = None

# Run
result = BooleanTransformer.reverse_transform(transformer, data)

# Asserts
# 1.2 and 1.01 round to 1 (True); 0.32 rounds to 0 (False).
expected = np.array([True, False, True])

assert isinstance(result, pd.Series)
np.testing.assert_equal(result.values, expected)

def test_reverse_transform_float_values_out_of_range(self):
"""Test the ``reverse_transform`` method with decimals that are out of range.

Expect that the ``reverse_transform`` method handles decimal inputs
correctly by rounding them. If the rounded decimal inputs are < 0 or > 1,
expect them to be clipped.

Input:
- Transformed data with decimal values, some of which round to < 0 or > 1.
Output:
- Reversed transformed data.
"""
# Setup
# 1.9 rounds to 2 and -0.7 rounds to -1, both outside [0, 1]; 1.01 rounds to 1.
data = np.array([1.9, -0.7, 1.01])
# A Mock stands in for ``self``; ``nan`` is the only attribute set on it.
transformer = Mock()
transformer.nan = None

# Run
result = BooleanTransformer.reverse_transform(transformer, data)

# Asserts
# The out-of-range values are expected to come back as True (from 2) and False (from -1).
expected = np.array([True, False, True])

assert isinstance(result, pd.Series)
np.testing.assert_equal(result.values, expected)
Loading

0 comments on commit c1c883d

Please sign in to comment.