Skip to content

Commit

Permalink
Allow encoders to return numeric features (#352)
Browse files Browse the repository at this point in the history
* Allow encoders to return numeric features

* Update changelog

* fix enum objects

* add assert_never

---------

Co-authored-by: Egor Baturin <[email protected]>
  • Loading branch information
egoriyaa and Egor Baturin authored May 30, 2024
1 parent 4ba2c0e commit d08e6fa
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 14 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Allow `RNNModel` to work with categorical features ([#334](https://github.com/etna-team/etna/pull/334))
- Allow `DeepARNativeModel` and `MLPModel` to work with categorical features ([#336](https://github.com/etna-team/etna/pull/336))
- Allow `DeepState` to work with categorical features ([#342](https://github.com/etna-team/etna/pull/342))
-
- Allow encoders to return numeric features ([#352](https://github.com/etna-team/etna/pull/352))
-
-
-
Expand Down
72 changes: 62 additions & 10 deletions etna/transforms/encoders/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from sklearn import preprocessing
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode
from typing_extensions import assert_never

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
Expand All @@ -22,9 +23,26 @@ class ImputerMode(str, Enum):
mean = "mean"
none = "none"

@classmethod
def _missing_(cls, value):
    """Reject a value that does not correspond to any member.

    Raises
    ------
    ValueError:
        Always; the message names the rejected value.
    """
    message = f"The strategy '{value}' doesn't exist"
    raise ValueError(message)


class ReturnType(str, Enum):
    """Data types that returned columns can have.

    Members subclass ``str``, so each one compares equal to its plain string value.
    """

    categorical = "categorical"
    numeric = "numeric"

    @classmethod
    def _missing_(cls, value):
        """Reject a value that does not correspond to any member.

        Raises
        ------
        NotImplementedError:
            Always; the message lists the supported values.
        """
        supported = ", ".join(repr(member.value) for member in cls)
        raise NotImplementedError(f"{value} is not a valid {cls.__name__}. Supported types: {supported}")


class _LabelEncoder(preprocessing.LabelEncoder):
def transform(self, y: pd.Series, strategy: str):
def transform(self, y: pd.Series, strategy: ImputerMode):
diff = _check_unknown(y, known_values=self.classes_)

is_new_index = np.isin(y, diff)
Expand All @@ -34,14 +52,14 @@ def transform(self, y: pd.Series, strategy: str):
float
)

if strategy == ImputerMode.none:
if strategy is ImputerMode.none:
filling_value = None
elif strategy == ImputerMode.new_value:
elif strategy is ImputerMode.new_value:
filling_value = -1
elif strategy == ImputerMode.mean:
elif strategy is ImputerMode.mean:
filling_value = np.mean(encoded[~np.isin(y, diff)])
else:
raise ValueError(f"The strategy '{strategy}' doesn't exist")
assert_never(strategy)

encoded[is_new_index] = filling_value
return encoded
Expand All @@ -50,7 +68,13 @@ def transform(self, y: pd.Series, strategy: str):
class LabelEncoderTransform(IrreversibleTransform):
"""Encode categorical feature with value between 0 and n_classes-1."""

def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: str = ImputerMode.mean):
def __init__(
self,
in_column: str,
out_column: Optional[str] = None,
strategy: str = ImputerMode.mean,
return_type: str = ReturnType.categorical,
):
"""
Init LabelEncoderTransform.
Expand All @@ -68,12 +92,19 @@ def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: s
- If "mean", then replace missing values using the mean in encoded column
- If "none", then replace missing values with None
return_type:
Data type of returned columns:
- If "categorical", then returned columns will have "category" data type
- If "numeric", then returned columns will have float data type
"""
super().__init__(required_features=[in_column])
self.in_column = in_column
self.out_column = out_column
self.strategy = strategy
self.strategy = ImputerMode(strategy)
self.return_type = ReturnType(return_type)
self.le = _LabelEncoder()
self.in_column_regressor: Optional[bool] = None

Expand Down Expand Up @@ -123,7 +154,15 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
out_column = self._get_column_name()
result_df = TSDataset.to_flatten(df)
result_df[out_column] = self.le.transform(result_df[self.in_column], self.strategy)
result_df[out_column] = result_df[out_column].astype("category")

if self.return_type is ReturnType.categorical:
return_type = "category"
elif self.return_type is ReturnType.numeric:
return_type = "float"
else:
assert_never(self.return_type)

result_df[out_column] = result_df[out_column].astype(return_type)
result_df = TSDataset.to_dataset(result_df)
return result_df

Expand Down Expand Up @@ -155,7 +194,7 @@ class OneHotEncoderTransform(IrreversibleTransform):
encoded columns for this feature will be all zeros.
"""

def __init__(self, in_column: str, out_column: Optional[str] = None):
def __init__(self, in_column: str, out_column: Optional[str] = None, return_type: str = ReturnType.categorical):
"""
Init OneHotEncoderTransform.
Expand All @@ -165,10 +204,17 @@ def __init__(self, in_column: str, out_column: Optional[str] = None):
Name of column to be encoded
out_column:
Prefix of names of added columns. If not given, use ``self.__repr__()``
return_type:
Data type of returned columns:
- If "categorical", then returned columns will have "category" data type
- If "numeric", then returned columns will have float data type
"""
super().__init__(required_features=[in_column])
self.in_column = in_column
self.out_column = out_column
self.return_type = ReturnType(return_type)
self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=int)
self.in_column_regressor: Optional[bool] = None

Expand Down Expand Up @@ -219,7 +265,13 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
x = result_df[[self.in_column]]
out_columns = self._get_out_column_names()
result_df[out_columns] = self.ohe.transform(X=x)
result_df[out_columns] = result_df[out_columns].astype("category")

if self.return_type == ReturnType.categorical:
return_type = "category"
elif self.return_type == ReturnType.numeric:
return_type = "float"

result_df[out_columns] = result_df[out_columns].astype(return_type)
result_df = TSDataset.to_dataset(result_df)
return result_df

Expand Down
42 changes: 39 additions & 3 deletions tests/test_transforms/test_encoders/test_categorical_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,24 @@ def test_label_encoder_simple(dtype):
assert df_transformed.equals(df_expected)


@pytest.mark.parametrize("return_type, expected_type", [("categorical", "category"), ("numeric", "float64")])
def test_label_encoder_return_type(return_type, expected_type):
    """Check that LabelEncoderTransform produces a column with the requested dtype."""
    ts, _ = get_ts_for_label_encoding()
    transform = LabelEncoderTransform(in_column=f"regressor_0", out_column="test", return_type=return_type)
    transform.fit(ts)
    transformed_ts = transform.transform(deepcopy(ts))
    # Only the encoded column of a single segment is inspected.
    encoded_column = transformed_ts.to_pandas()["segment_0"]["test"]
    assert encoded_column.dtypes.name == expected_type


@pytest.mark.parametrize("return_type", ["int", "all"])
def test_label_encoder_wrong_return_type(return_type):
    """Check that LabelEncoderTransform rejects an unsupported ``return_type``.

    The name is specific to the label encoder on purpose: this module defines
    ``test_wrong_mode_type`` again further down for ``OneHotEncoderTransform``, and
    when two module-level functions share a name only the last one is left for
    pytest to collect.
    """
    # The error message is built from the enum's class name, which is ``ReturnType``.
    with pytest.raises(NotImplementedError, match=f"{return_type} is not a valid ReturnType."):
        _ = LabelEncoderTransform(in_column="regressor_0", out_column="test", return_type=return_type)


@pytest.mark.parametrize("dtype", ["float", "int", "str", "category"])
def test_ohe_encoder_simple(dtype):
"""Test that OneHotEncoderTransform works correct in a simple case."""
Expand All @@ -165,13 +183,31 @@ def test_ohe_encoder_simple(dtype):
assert df_transformed.equals(df_expected)


@pytest.mark.parametrize("return_type, expected_type", [("categorical", "category"), ("numeric", "float64")])
def test_ohe_encoder_return_type(return_type, expected_type):
    """Check that every column produced by OneHotEncoderTransform has the requested dtype."""
    ts, _ = get_ts_for_label_encoding()
    transform = OneHotEncoderTransform(in_column=f"regressor_0", out_column="test", return_type=return_type)
    transform.fit(ts)
    out_columns = transform._get_out_column_names()
    # Only the encoded columns of a single segment are inspected.
    encoded = transform.transform(deepcopy(ts)).to_pandas()["segment_0"][out_columns]
    for column_name in encoded.columns:
        assert encoded[column_name].dtypes.name == expected_type


@pytest.mark.parametrize("return_type", ["int", "all"])
def test_wrong_mode_type(return_type):
    """Check that OneHotEncoderTransform rejects an unsupported ``return_type``."""
    expected_message = f"{return_type} is not a valid ReturnType."
    with pytest.raises(NotImplementedError, match=expected_message):
        _ = OneHotEncoderTransform(in_column=f"regressor_0", out_column="test", return_type=return_type)


def test_value_error_label_encoder(ts_for_label_encoding):
"""Test LabelEncoderTransform with wrong strategy."""
ts, _ = ts_for_label_encoding
with pytest.raises(ValueError, match="The strategy"):
with pytest.raises(ValueError, match="The strategy 'fake_strategy' doesn't exist"):
le = LabelEncoderTransform(in_column="target", strategy="fake_strategy")
le.fit(ts)
le.transform(ts)


@pytest.mark.parametrize(
Expand Down

0 comments on commit d08e6fa

Please sign in to comment.