@staticmethod
@numba.njit()
def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Compute shifted per-category cumulative sum and count of non-NaN target values.

    For every position the returned statistics aggregate only the non-NaN targets
    found at earlier positions that share the same category; the value at the
    position itself is never included.

    Parameters
    ----------
    target:
        One-dimensional array of target values, NaN marks a missing observation.
    categories:
        Array of category labels aligned element-wise with ``target``.

    Returns
    -------
    :
        Pair ``(cumsum, cumcount)`` of arrays shaped like ``target``. Both hold NaN
        at positions for which the category has no earlier non-NaN target.
    """
    # ``np.nan`` is used throughout: the ``np.NaN`` alias was removed in NumPy 2.0.
    ans_cumsum = np.full_like(target, np.nan)
    ans_cumcount = np.full_like(target, np.nan)
    unique_categories = np.unique(categories)
    for category in unique_categories:
        idx = np.where(category == categories)[0]
        t = target[idx]

        # Mask for valid (non-NaN) target values
        valid = ~np.isnan(t)

        # Cumulative statistics over valid values only; NaN targets contribute nothing.
        cumsum = np.cumsum(np.where(valid, t, 0))
        # float64 keeps counts exact (float32 is exact for integers only up to 2**24).
        cumcount = np.cumsum(valid).astype(np.float64)

        # Shift statistics by 1 to get statistics not including current index
        cumsum = np.roll(cumsum, 1)
        cumcount = np.roll(cumcount, 1)

        # ``np.roll`` wraps the last element to the front; the first occurrence
        # of a category has no history, so overwrite it.
        cumsum[0] = np.nan
        cumcount[0] = np.nan

        # Handle positions with no previous valid values.
        # ``cumsum`` must be masked first: the mask is derived from ``cumcount``.
        cumsum[cumcount == 0] = np.nan
        cumcount[cumcount == 0] = np.nan

        # Assign the computed values back to the answer arrays
        ans_cumsum[idx] = cumsum
        ans_cumcount[idx] = cumcount
    return ans_cumsum, ans_cumcount
intersected_df.index categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel()) - cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories}) + cumstats = pd.DataFrame(data={"sum": np.NaN, "count": np.NaN, self.in_column: categories}) cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps)) for _ in range(len(timestamps)): timestamp_df = flatten.loc[cur_timestamp_idx] + # statistics from previous timestamp cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values) cumcount_dict = dict(cumstats[[self.in_column, "count"]].values) + # map categories for current timestamp to statistics temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict) temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict) + # count statistics for current timestamp stats = ( timestamp_df["target"] @@ -254,8 +296,14 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: .agg(["count", "sum"]) .reset_index() ) + # statistics become zeros for categories with target=NaN + stats = stats.replace({"count": 0, "sum": 0}, np.NaN) + # sum current and previous statistics cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum() + # zeros appear for categories that weren't updated in previous line and whose statistics were NaN + cumstats = cumstats.replace({"count": 0, "sum": 0}, np.NaN) + cur_timestamp_idx += 1 feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing) diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py index 6b2ad6279..973bcae05 100644 --- a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py @@ -31,7 +31,7 @@ def category_ts() -> TSDataset: def expected_micro_category_ts() -> TSDataset: df = 
generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.75, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN] ts = TSDataset(df, freq="D") return ts @@ -41,7 +41,7 @@ def expected_micro_category_ts() -> TSDataset: def expected_micro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.5, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 1.5, 1.5, 2.5, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, 8.0] ts = TSDataset(df, freq="D") return ts @@ -61,7 +61,7 @@ def expected_micro_category_make_future_ts() -> TSDataset: def expected_macro_category_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 4, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 4.275] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 4.875, 4, 4.851] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 4.27] ts = TSDataset(df, freq="D") return ts @@ -71,7 +71,7 @@ def expected_macro_category_ts() -> TSDataset: def expected_macro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 5, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 5.55] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 4, 4.875, 5, 4.85] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 5.55] ts = TSDataset(df, freq="D") return ts @@ -104,7 +104,7 @@ def ts_begin_nan() -> TSDataset: 
@pytest.fixture
def multiple_nan_target_category_ts() -> TSDataset:
    """Fixture with segment having multiple NaN targets:

    * For `regressor="A"` set of NaN timestamp goes before first notna value
    * For `regressor="B"` set of NaN timestamp goes after first notna value
    """
    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8)
    # ``np.nan`` only: the ``np.NaN`` alias was removed in NumPy 2.0.
    df["target"] = [np.nan, 1.5, np.nan, 3.0, 4.0, np.nan, np.nan, np.nan]

    df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=9)
    df_exog.rename(columns={"target": "regressor"}, inplace=True)
    df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "B", "A", "A"]

    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")

    return ts


@pytest.fixture
def expected_multiple_nan_target_category_ts() -> TSDataset:
    """Expected `regressor_mean` column for `multiple_nan_target_category_ts`."""
    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8)
    df.rename(columns={"target": "regressor_mean"}, inplace=True)
    df["regressor_mean"] = [np.nan, np.nan, np.nan, np.nan, 1.5, 2.75, 2.75, 3.0]

    ts = TSDataset(df=df, freq="D")

    return ts


@pytest.fixture
def mean_segment_encoder_ts() -> TSDataset:
    """Fixture with one segment, a NaN in the middle of the target and a constant categorical feature."""
    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
    df["target"] = [0, 1, np.nan, 3, 4]

    df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10)
    df_exog.rename(columns={"target": "segment_feature"}, inplace=True)
    df_exog["segment_feature"] = "segment_0"

    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")

    return ts


@pytest.fixture
def expected_mean_segment_encoder_ts() -> TSDataset:
    """Expected `segment_mean` column for `mean_segment_encoder_ts`."""
    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
    df.rename(columns={"target": "segment_mean"}, inplace=True)
    df["segment_mean"] = [np.nan, 0, 0.5, 0.5, 1.33]

    ts = TSDataset(df=df, freq="D")

    return ts


@pytest.fixture
def multiple_nan_target_two_segments_ts() -> TSDataset:
    """Fixture with two segments having multiple NaN targets:

    * For `regressor="A"` set of NaN timestamp goes before first notna value
    * For `regressor="B"` set of NaN timestamp goes after first notna value
    """
    df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
    df["target"] = [np.nan, 2, np.nan, 4, np.nan, 5] + [np.nan, 7, np.nan, np.nan, 10, 11]

    df_exog = generate_ar_df(start_time="2001-01-01", periods=7, n_segments=2)
    df_exog.rename(columns={"target": "regressor"}, inplace=True)
    df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "A"] + ["A", "B", "A", "B", "A", "B", "A"]

    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")

    return ts


@pytest.fixture
def expected_multiple_nan_target_two_segments_ts() -> TSDataset:
    """Expected `regressor_mean` columns for `multiple_nan_target_two_segments_ts`."""
    df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
    df.rename(columns={"target": "regressor_mean"}, inplace=True)
    df["regressor_mean"] = [np.nan, np.nan, np.nan, np.nan, 4.5, 4.5] + [np.nan, np.nan, np.nan, 4.5, 4, 4.5]

    ts = TSDataset(df=df, freq="D")

    return ts


def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_encoder_ts):
    """Check per-segment encoding of a constant category when the target contains NaN."""
    mean_encoder = MeanEncoderTransform(
        in_column="segment_feature",
        mode="per-segment",
        handle_missing="category",
        smoothing=0,
        out_column="segment_mean",
    )
    mean_encoder.fit_transform(mean_segment_encoder_ts)
    encoded = mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]]
    assert_frame_equal(encoded, expected_mean_segment_encoder_ts.df, atol=0.01)


def test_multiple_nan_target_category_ts(multiple_nan_target_category_ts, expected_multiple_nan_target_category_ts):
    """Check per-segment encoding when categories have several NaN targets."""
    mean_encoder = MeanEncoderTransform(
        in_column="regressor",
        mode="per-segment",
        handle_missing="category",
        smoothing=0,
        out_column="regressor_mean",
    )
    mean_encoder.fit_transform(multiple_nan_target_category_ts)
    encoded = multiple_nan_target_category_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]]
    assert_frame_equal(encoded, expected_multiple_nan_target_category_ts.df, atol=0.01)


def test_multiple_nan_target_two_segments_ts(
    multiple_nan_target_two_segments_ts, expected_multiple_nan_target_two_segments_ts
):
    """Check macro encoding over two segments when categories have several NaN targets."""
    mean_encoder = MeanEncoderTransform(
        in_column="regressor",
        mode="macro",
        handle_missing="category",
        smoothing=0,
        out_column="regressor_mean",
    )
    mean_encoder.fit_transform(multiple_nan_target_two_segments_ts)
    encoded = multiple_nan_target_two_segments_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]]
    assert_frame_equal(encoded, expected_multiple_nan_target_two_segments_ts.df, atol=0.01)


def test_save_load(category_ts):
    mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor")
    assert_transformation_equals_loaded_original(transform=mean_encoder, ts=category_ts)