Skip to content

Commit

Permalink
Add MeanEncoderTransform (#413)
Browse files Browse the repository at this point in the history
* add MeanEncoderTransform

* fix fit

* fix code

* final

* fix mode name

* resolve changelog

* resolve changelog

* fix all

* add comments

* satisfy mypy

* add tests, fix docs

* fix

* fix

---------

Co-authored-by: Egor Baturin <[email protected]>
  • Loading branch information
egoriyaa and Egor Baturin authored Jul 12, 2024
1 parent 1bd32a7 commit 12f19fb
Show file tree
Hide file tree
Showing 8 changed files with 684 additions and 1 deletion.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398))
- Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
- Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415))
-
- Add `MeanEncoderTransform` ([#413](https://github.com/etna-team/etna/pull/413))

### Changed
- Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396))
Expand Down
1 change: 1 addition & 0 deletions docs/source/api_reference/transforms.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ Categorical encoding transforms:
:template: class.rst

SegmentEncoderTransform
MeanEncoderTransform
MeanSegmentEncoderTransform
LabelEncoderTransform
OneHotEncoderTransform
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from etna.transforms.embeddings import EmbeddingSegmentTransform
from etna.transforms.embeddings import EmbeddingWindowTransform
from etna.transforms.encoders import LabelEncoderTransform
from etna.transforms.encoders import MeanEncoderTransform
from etna.transforms.encoders import MeanSegmentEncoderTransform
from etna.transforms.encoders import OneHotEncoderTransform
from etna.transforms.encoders import SegmentEncoderTransform
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform
from etna.transforms.encoders.mean_encoder import MeanEncoderTransform
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform
313 changes: 313 additions & 0 deletions etna/transforms/encoders/mean_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
import reprlib
from enum import Enum
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
from typing import cast

import numpy as np
import pandas as pd
from bottleneck import nanmean

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.distributions import FloatDistribution
from etna.transforms import IrreversibleTransform


class EncoderMode(str, Enum):
    """Encoding strategies: statistics per segment or pooled across all segments."""

    per_segment = "per-segment"
    macro = "macro"

    @classmethod
    def _missing_(cls, value):
        """Reject a value that does not correspond to any member.

        Raises
        ------
        ValueError:
            Always, naming the unknown strategy.
        """
        message = f"The strategy '{value}' doesn't exist"
        raise ValueError(message)


class MissingMode(str, Enum):
    """Strategies for handling missing values of the encoded feature."""

    category = "category"
    global_mean = "global_mean"

    @classmethod
    def _missing_(cls, value):
        """Reject a value that does not correspond to any member.

        Raises
        ------
        NotImplementedError:
            Always, listing the supported values.
        """
        supported = ", ".join(repr(member.value) for member in cls)
        raise NotImplementedError(f"{value} is not a valid {cls.__name__}. Supported types: {supported}")


class MeanEncoderTransform(IrreversibleTransform):
    """
    Makes encoding of categorical feature.

    For timestamps up to and including the last timestamp seen in ``fit`` transformations are made
    using the formula below:

    .. math::
        \\frac{TargetSum + RunningMean * Smoothing}{FeatureCount + Smoothing}

    where

    * TargetSum is the sum of target up to the current timestamp for the current category, not including the current timestamp
    * RunningMean is target mean up to the current timestamp, not including the current timestamp
    * FeatureCount is the number of categories with the same value as in the current timestamp, not including the current timestamp

    For future timestamps:

    * for known categories encoding are filled with global mean of target for these categories calculated during ``fit``
    * for unknown categories encoding are filled with global mean of target in the whole dataset calculated during ``fit``

    All types of NaN values are considered as one category.
    """

    # shorthand for building (segment, feature) column selectors
    idx = pd.IndexSlice

    def __init__(
        self,
        in_column: str,
        out_column: str,
        mode: Union[EncoderMode, str] = "per-segment",
        handle_missing: Union[MissingMode, str] = MissingMode.category,
        smoothing: Union[int, float] = 1,
    ):
        """
        Init MeanEncoderTransform.

        Parameters
        ----------
        in_column:
            categorical column to apply transform
        out_column:
            name of added column
        mode:
            mode to encode segments

            * 'per-segment' - statistics are calculated across each segment individually

            * 'macro' - statistics are calculated across all segments. In this mode transform can work with new segments that were not seen during ``fit``

        handle_missing:
            mode to handle missing values in ``in_column``

            * 'category' - NaNs are interpreted as a separate category

            * 'global_mean' - NaNs are filled with the running mean

        smoothing:
            smoothing parameter; it is added to the denominator of the encoding formula, so with
            ``smoothing=0`` the first occurrence of a category has a zero denominator

        Raises
        ------
        ValueError:
            If ``mode`` isn't a valid :py:class:`EncoderMode` value.
        NotImplementedError:
            If ``handle_missing`` isn't a valid :py:class:`MissingMode` value.
        """
        super().__init__(required_features=["target", in_column])
        self.in_column = in_column
        self.out_column = out_column
        self.mode = EncoderMode(mode)
        self.handle_missing = MissingMode(handle_missing)
        self.smoothing = smoothing

        # Statistics learned in ``_fit``; ``None`` means "not fitted yet".
        self._global_means: Optional[Union[float, Dict[str, float]]] = None
        self._global_means_category: Optional[Union[Dict[str, float], Dict[str, Dict[str, float]]]] = None
        self._last_timestamp: Union[pd.Timestamp, int, None] = None

    def _fit(self, df: pd.DataFrame) -> "MeanEncoderTransform":
        """
        Fit encoder.

        Stores the mean target (per segment or overall, depending on ``mode``), the mean target
        of every category and the last timestamp of ``df``. Missing values of ``in_column`` are
        normalized to ``np.nan`` in ``df`` itself.

        Parameters
        ----------
        df:
            dataframe with data to fit expanding mean target encoder.

        Returns
        -------
        :
            Fitted transform
        """
        # unify all kinds of missing values so that they form a single category
        df.loc[:, self.idx[:, self.in_column]] = df.loc[:, self.idx[:, self.in_column]].fillna(np.nan)

        if self.mode is EncoderMode.per_segment:
            segments = df.columns.get_level_values("segment").unique().tolist()
            # axis=0: one mean per target column, i.e. per segment
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=0)
            global_means = dict(zip(segments, global_means))

            global_means_category = {}
            for segment in segments:
                segment_df = TSDataset.to_flatten(df.loc[:, self.idx[segment, :]])
                global_means_category[segment] = (
                    segment_df[[self.in_column, "target"]]
                    .groupby(self.in_column, dropna=False)
                    .mean()
                    .to_dict()["target"]
                )
        else:
            # axis=None: a single mean over the targets of all segments
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=None)

            segment_df = TSDataset.to_flatten(df)
            global_means_category = (
                segment_df[[self.in_column, "target"]].groupby(self.in_column, dropna=False).mean().to_dict()["target"]
            )

        self._global_means = global_means
        self._global_means_category = global_means_category
        self._last_timestamp = df.index[-1]

        return self

    @staticmethod
    def _count_macro_running_mean(df: pd.DataFrame, n_segments: int) -> pd.Series:
        """
        Compute the running mean of target over all segments, excluding the current timestamp.

        Parameters
        ----------
        df:
            flattened dataframe with ``target`` and ``timestamp`` columns.
            NOTE(review): the ``iloc[::n_segments]`` stride assumes rows are ordered by timestamp
            with exactly ``n_segments`` consecutive rows per timestamp - confirm against the caller.
        n_segments:
            number of segments in ``df``

        Returns
        -------
        :
            series indexed like ``df``; the rows of the first timestamp are NaN
        """
        y = df["target"]
        timestamp_count = y.groupby(df["timestamp"]).transform("count")
        timestamp_sum = y.groupby(df["timestamp"]).transform("sum")
        # take one row per timestamp, accumulate over time and broadcast back to every segment
        expanding_mean = timestamp_sum.iloc[::n_segments].cumsum() / timestamp_count.iloc[::n_segments].cumsum()
        expanding_mean = expanding_mean.repeat(n_segments)
        # first timestamp is NaN
        expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
        return expanding_mean

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get encoded values for the segment.

        Timestamps not later than the last timestamp seen in ``fit`` are encoded with running
        statistics computed from ``df``; later timestamps are encoded with the statistics stored
        during ``fit``.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        :
            result dataframe

        Raises
        ------
        ValueError:
            If transform isn't fitted.
        NotImplementedError:
            If there are segments that weren't present during training (``'per-segment'`` mode only).
        """
        if self._global_means is None:
            raise ValueError("The transform isn't fitted!")

        segments = df.columns.get_level_values("segment").unique().tolist()
        n_segments = len(segments)
        if self.mode is EncoderMode.per_segment:
            fitted_means = cast(Dict[str, float], self._global_means)
            new_segments = set(segments) - fitted_means.keys()
            if len(new_segments) > 0:
                raise NotImplementedError(
                    f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
                )
        # unify all kinds of missing values so that they form a single category
        df.loc[:, self.idx[:, self.in_column]] = df.loc[:, self.idx[:, self.in_column]].fillna(np.nan)

        future_timestamps = df.index[df.index > self._last_timestamp]
        intersected_timestamps = df.index[df.index <= self._last_timestamp]

        intersected_df = df.loc[intersected_timestamps, self.idx[:, :]]
        future_df = df.loc[future_timestamps, self.idx[:, :]]

        if len(intersected_df) > 0:
            if self.mode is EncoderMode.per_segment:
                for segment in segments:
                    segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
                    y = segment_df["target"]
                    # string keys keep NaN as a regular group
                    category_key = segment_df[self.in_column].astype(str)
                    # first timestamp is NaN
                    expanding_mean = y.expanding().mean().shift()
                    # cumcount not including current timestamp
                    cumcount = y.groupby(category_key).agg("cumcount")
                    # cumsum not including current timestamp
                    cumsum = y.groupby(category_key).transform(lambda x: x.shift().cumsum()).fillna(0)
                    feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
                    if self.handle_missing is MissingMode.global_mean:
                        nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
                        feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]
                    intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values

            else:
                flatten = TSDataset.to_flatten(intersected_df)
                # rows are reordered, index labels are kept as produced by ``to_flatten``
                flatten = flatten.sort_values(["timestamp", "segment"])
                running_mean = self._count_macro_running_mean(flatten, n_segments)

                temp = pd.DataFrame(index=flatten.index, columns=["cumsum", "cumcount"], dtype=float)

                timestamps = intersected_df.index
                categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel())

                # per-category statistics accumulated over the already processed timestamps
                cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories})
                # index labels of the current timestamp, one per segment
                # NOTE(review): presumably ``to_flatten`` stacks segments one after another
                # under a 0..N-1 index - confirm against ``TSDataset.to_flatten``
                cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
                for _ in range(len(timestamps)):
                    timestamp_df = flatten.loc[cur_timestamp_idx]
                    # statistics from previous timestamp
                    cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
                    cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)
                    # map categories for current timestamp to statistics
                    temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict)
                    temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict)
                    # count statistics for current timestamp
                    stats = (
                        timestamp_df["target"]
                        .groupby(timestamp_df[self.in_column], dropna=False)
                        .agg(["count", "sum"])
                        .reset_index()
                    )
                    # sum current and previous statistics
                    cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
                    # move to the labels of the next timestamp
                    cur_timestamp_idx += 1

                feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)
                if self.handle_missing is MissingMode.global_mean:
                    nan_feature_index = flatten[flatten[self.in_column].isnull()].index
                    feature.loc[nan_feature_index] = running_mean.loc[nan_feature_index]

                # values follow the (timestamp, segment) row order of ``flatten``
                feature = pd.DataFrame(
                    feature.values.reshape(len(timestamps), n_segments),
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=intersected_df.index,
                )
                intersected_df = pd.concat([intersected_df, feature], axis=1)

        if len(future_df) > 0:
            n_timestamps = len(future_df.index)
            if self.mode is EncoderMode.per_segment:
                category_means = cast(Dict[str, Dict[str, float]], self._global_means_category)
                segment_means = cast(Dict[str, float], self._global_means)
                for segment in segments:
                    segment_df = TSDataset.to_flatten(future_df.loc[:, self.idx[segment, :]])
                    feature = segment_df[self.in_column].map(category_means[segment])
                    # categories without a stored mean fall back to the mean of the segment
                    feature = feature.fillna(segment_means[segment])
                    future_df.loc[:, self.idx[segment, self.out_column]] = feature.values
            else:
                flatten = TSDataset.to_flatten(future_df)
                feature = flatten[self.in_column].map(self._global_means_category)
                # categories without a stored mean fall back to the overall mean
                feature = feature.fillna(self._global_means)
                # NOTE(review): the reshape assumes ``to_flatten`` returns rows grouped by
                # segment - confirm against ``TSDataset.to_flatten``
                feature = pd.DataFrame(
                    feature.values.reshape(len(segments), n_timestamps).T,
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=future_df.index,
                )
                future_df = pd.concat([future_df, feature], axis=1)

        intersected_df = intersected_df.sort_index(axis=1)
        future_df = future_df.sort_index(axis=1)
        transformed_df = pd.concat((intersected_df, future_df), axis=0)
        return transformed_df

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform."""
        return [self.out_column]

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        This grid tunes ``smoothing`` parameter. Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        return {"smoothing": FloatDistribution(low=0, high=2)}
Loading

0 comments on commit 12f19fb

Please sign in to comment.