diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d505bdeb..6e3d5c379 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,7 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add parameter `missing_mode` into `MAE` metric ([#523](https://github.com/etna-team/etna/pull/523))
 - Add parameter `missing_mode` into `MAPE` and `SMAPE` metrics ([#524](https://github.com/etna-team/etna/pull/524))
 -
--
+- Update `aggregate_metrics_df` to work with `None` values ([#522](https://github.com/etna-team/etna/pull/522))
 -
 -
 -
diff --git a/etna/auto/auto.py b/etna/auto/auto.py
index 47101bc6d..525987f3e 100644
--- a/etna/auto/auto.py
+++ b/etna/auto/auto.py
@@ -484,7 +484,11 @@ def _objective(trial: Trial) -> float:
             for metric in aggregated_metrics:
                 trial.set_user_attr(metric, aggregated_metrics[metric])
 
-            return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            result_value = aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            if result_value is None:
+                raise ValueError("Metric value is None! It should be float for optimization.")
+
+            return result_value
 
         return _objective
 
@@ -809,7 +813,11 @@ def _objective(trial: Trial) -> float:
             for metric in aggregated_metrics:
                 trial.set_user_attr(metric, aggregated_metrics[metric])
 
-            return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            result_value = aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            if result_value is None:
+                raise ValueError("Metric value is None! It should be float for optimization.")
+
+            return result_value
 
         return _objective
 
diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py
index 5e31c5d78..e563010a7 100644
--- a/etna/metrics/utils.py
+++ b/etna/metrics/utils.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Callable
 from typing import Dict
 from typing import List
@@ -37,24 +38,89 @@ def compute_metrics(
     return metrics_values
 
 
+def mean_agg():
+    """Mean for pandas agg."""
+
+    def func(x: pd.Series):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="Mean of empty slice",
+                action="ignore",
+            )
+            return np.nanmean(a=x.values)
+
+    func.__name__ = "mean"
+    return func
+
+
+def median_agg():
+    """Median for pandas agg."""
+
+    def func(x: pd.Series):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="All-NaN slice encountered",
+                action="ignore",
+            )
+            return np.nanmedian(a=x.values)
+
+    func.__name__ = "median"
+    return func
+
+
+def std_agg():
+    """Std for pandas agg."""
+
+    def func(x: pd.Series):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="Degrees of freedom <= 0",
+                action="ignore",
+            )
+            return np.nanstd(a=x.values)
+
+    func.__name__ = "std"
+    return func
+
+
+def notna_size_agg():
+    """Size of not-na elements for pandas agg."""
+
+    def func(x: pd.Series):
+        return len(x) - pd.isna(x.values).sum()
+
+    func.__name__ = "notna_size"
+    return func
+
+
 def percentile(n: int):
     """Percentile for pandas agg."""
 
-    def percentile_(x):
-        return np.nanpercentile(a=x.values, q=n)
+    def func(x: pd.Series):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="All-NaN slice encountered",
+                action="ignore",
+            )
+            return np.nanpercentile(a=x.values, q=n)
 
-    percentile_.__name__ = f"percentile_{n}"
-    return percentile_
+    func.__name__ = f"percentile_{n}"
+    return func
 
 
 MetricAggregationStatistics = Literal[
-    "median", "mean", "std", "percentile_5", "percentile_25", "percentile_75", "percentile_95"
+    "median", "mean", "std", "notna_size", "percentile_5", "percentile_25", "percentile_75", "percentile_95"
 ]
 
 METRICS_AGGREGATION_MAP: Dict[MetricAggregationStatistics, Union[str, Callable]] = {
-    "median": "median",
-    "mean": "mean",
-    "std": "std",
+    "median": median_agg(),
+    "mean": mean_agg(),
+    "std": std_agg(),
+    "notna_size": notna_size_agg(),
     "percentile_5": percentile(5),
     "percentile_25": percentile(25),
     "percentile_75": percentile(75),
@@ -62,7 +128,7 @@ def percentile_(x):
 }
 
 
-def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, float]:
+def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, Optional[float]]:
     """Aggregate metrics in :py:meth:`log_backtest_metrics` method.
 
     Parameters
@@ -74,7 +140,7 @@ def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, float]:
     if "fold_number" in metrics_df.columns:
        metrics_dict = (
             metrics_df.groupby("segment")
-            .mean()
+            .mean(numeric_only=False)
             .reset_index()
             .drop(["segment", "fold_number"], axis=1)
             .apply(list(METRICS_AGGREGATION_MAP.values()))
@@ -85,10 +151,11 @@ def percentile_(x):
     else:
         metrics_dict = metrics_df.drop(["segment"], axis=1).apply(list(METRICS_AGGREGATION_MAP.values())).to_dict()
 
-    metrics_dict_wide = {
-        f"{metrics_key}_{statistics_key}": value
-        for metrics_key, values in metrics_dict.items()
-        for statistics_key, value in values.items()
-    }
+    cur_dict = {}
+    for metrics_key, values in metrics_dict.items():
+        for statistics_key, value in values.items():
+            new_key = f"{metrics_key}_{statistics_key}"
+            new_value = value if not pd.isna(value) else None
+            cur_dict[new_key] = new_value
 
-    return metrics_dict_wide
+    return cur_dict
diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py
index d6728ee6c..c5b5dba3f 100644
--- a/etna/pipeline/base.py
+++ b/etna/pipeline/base.py
@@ -856,7 +856,9 @@ def _get_backtest_metrics(self, aggregate_metrics: bool = False) -> pd.DataFrame
         metrics_df.sort_values(["segment", self._fold_column], inplace=True)
 
         if aggregate_metrics:
-            metrics_df = metrics_df.groupby("segment").mean().reset_index().drop(self._fold_column, axis=1)
+            metrics_df = (
+                metrics_df.groupby("segment").mean(numeric_only=False).reset_index().drop(self._fold_column, axis=1)
+            )
 
         return metrics_df
 
diff --git a/tests/test_auto/conftest.py b/tests/test_auto/conftest.py
index 18168e8c3..66584e926 100644
--- a/tests/test_auto/conftest.py
+++ b/tests/test_auto/conftest.py
@@ -1,11 +1,14 @@
 from os import unlink
 
+import numpy as np
+import pandas as pd
 import pytest
 from optuna.storages import RDBStorage
 from optuna.trial import TrialState
 from typing_extensions import NamedTuple
 
 from etna.auto.utils import config_hash
+from etna.datasets import TSDataset
 from etna.models import NaiveModel
 from etna.pipeline import Pipeline
 
@@ -35,3 +38,102 @@ class Trial(NamedTuple):
 
     fail_trials = [Trial(user_attrs={}, state=TrialState.FAIL)]
     return complete_trials + complete_trials[:3] + fail_trials
+
+
+@pytest.fixture
+def ts_with_fold_missing_tail(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-7:], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-7:], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_fold_missing_middle(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-14:-7], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-14:-7], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_all_folds_missing_one_segment(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-40:], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_all_folds_missing_all_segments(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-40:], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-40:], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_few_missing(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-4:-2], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-12:-10], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
diff --git a/tests/test_auto/test_auto.py b/tests/test_auto/test_auto.py
index e93b56581..76f8e44e4 100644
--- a/tests/test_auto/test_auto.py
+++ b/tests/test_auto/test_auto.py
@@ -16,6 +16,7 @@
 from etna.models import NaiveModel
 from etna.pipeline import Pipeline
 from etna.transforms import LagTransform
+from etna.transforms import TimeSeriesImputerTransform
 
 
 @pytest.fixture()
@@ -23,13 +24,15 @@ def pool_generator():
     pool = [
         {
             "_target_": "etna.pipeline.Pipeline",
-            "horizon": "${__aux__.horizon}",
             "model": {"_target_": "etna.models.MovingAverageModel", "window": "${mult:${horizon},1}"},
+            "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+            "horizon": "${__aux__.horizon}",
         },
         {
             "_target_": "etna.pipeline.Pipeline",
-            "horizon": "${__aux__.horizon}",
             "model": {"_target_": "etna.models.NaiveModel", "lag": 1},
+            "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+            "horizon": "${__aux__.horizon}",
         },
     ]
     pool_generator = PoolGenerator(pool)
@@ -38,26 +41,44 @@ def pool_generator():
 
 @pytest.fixture()
 def pool_list():
-    return [Pipeline(MovingAverageModel(7), horizon=7), Pipeline(NaiveModel(1), horizon=7)]
+    return [
+        Pipeline(MovingAverageModel(7), transforms=[TimeSeriesImputerTransform()], horizon=7),
+        Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7),
+    ]
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
+@pytest.mark.parametrize(
+    "ts_name",
+    [
+        "example_tsds",
+        "ts_with_few_missing",
+        "ts_with_fold_missing_tail",
+        "ts_with_fold_missing_middle",
+        "ts_with_all_folds_missing_one_segment",
+    ],
+)
 def test_objective(
-    example_tsds,
-    target_metric=MAE(),
+    validate_on_dataset_mock,
+    ts_name,
+    request,
+    target_metric=MAE(missing_mode="ignore"),
     metric_aggregation: Literal["mean"] = "mean",
-    metrics=[MAE()],
+    metrics=[MAE(missing_mode="ignore")],
     backtest_params={},
-    initializer=MagicMock(spec=_Initializer),
-    callback=MagicMock(spec=_Callback),
     relative_params={
         "_target_": "etna.pipeline.Pipeline",
        "horizon": 7,
         "model": {"_target_": "etna.models.NaiveModel", "lag": 1},
+        "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
     },
 ):
+    ts = request.getfixturevalue(ts_name)
+    initializer = MagicMock(spec=_Initializer)
+    callback = MagicMock(spec=_Callback)
     trial = MagicMock(relative_params=relative_params)
     _objective = Auto.objective(
-        ts=example_tsds,
+        ts=ts,
         target_metric=target_metric,
         metric_aggregation=metric_aggregation,
         metrics=metrics,
@@ -72,6 +93,41 @@ def test_objective(
     callback.assert_called_once()
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
+@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_all_segments"])
+def test_objective_fail_none(
+    validate_on_dataset_mock,
+    ts_name,
+    request,
+    target_metric=MAE(missing_mode="ignore"),
+    metric_aggregation: Literal["mean"] = "mean",
+    metrics=[MAE(missing_mode="ignore")],
+    backtest_params={},
+    initializer=MagicMock(spec=_Initializer),
+    callback=MagicMock(spec=_Callback),
+    relative_params={
+        "_target_": "etna.pipeline.Pipeline",
+        "horizon": 7,
+        "model": {"_target_": "etna.models.NaiveModel", "lag": 1},
+        "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+    },
+):
+    ts = request.getfixturevalue(ts_name)
+    trial = MagicMock(relative_params=relative_params)
+    _objective = Auto.objective(
+        ts=ts,
+        target_metric=target_metric,
+        metric_aggregation=metric_aggregation,
+        metrics=metrics,
+        backtest_params=backtest_params,
+        initializer=initializer,
+        callback=callback,
+    )
+
+    with pytest.raises(ValueError, match="Metric value is None"):
+        _ = _objective(trial)
+
+
 @pytest.mark.parametrize("tune_size", [0, 2])
 def test_fit_called_tuning_pool(
     tune_size,
@@ -142,17 +198,20 @@ def test_init_optuna(
     )
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize("pool", ["pool_list", "pool_generator"])
-def test_fit_without_tuning_list(example_tsds, optuna_storage, pool, request):
+def test_fit_without_tuning_list(ts_name, optuna_storage, pool, request):
+    ts = request.getfixturevalue(ts_name)
     pool = request.getfixturevalue(pool)
     auto = Auto(
-        MAE(),
+        MAE(missing_mode="ignore"),
+        metrics=[MAE(missing_mode="ignore")],
         pool=pool,
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    auto.fit(ts=example_tsds, n_trials=2)
+    auto.fit(ts=ts, n_trials=2)
 
     assert len(auto._pool_optuna.study.trials) == 2
     assert len(auto.summary()) == 2
@@ -163,27 +222,36 @@ def test_fit_without_tuning_list(example_tsds, optuna_storage, pool, request):
     assert auto.top_k(k=1)[0].to_dict() == pool[0].to_dict()
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize("tune_size", [1, 2])
 def test_fit_with_tuning(
+    ts_name,
     tune_size,
-    example_tsds,
+    request,
     optuna_storage,
     pool=(
-        Pipeline(MovingAverageModel(5), horizon=7),
-        Pipeline(NaiveModel(1), horizon=7),
+        Pipeline(MovingAverageModel(5), transforms=[TimeSeriesImputerTransform(strategy="forward_fill")], horizon=7),
+        Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform(strategy="forward_fill")], horizon=7),
         Pipeline(
-            LinearPerSegmentModel(), transforms=[LagTransform(in_column="target", lags=list(range(7, 21)))], horizon=7
+            LinearPerSegmentModel(),
+            transforms=[
+                TimeSeriesImputerTransform(strategy="forward_fill"),
+                LagTransform(in_column="target", lags=list(range(7, 21))),
+            ],
+            horizon=7,
         ),
     ),
 ):
+    ts = request.getfixturevalue(ts_name)
     auto = Auto(
-        MAE(),
+        MAE(missing_mode="ignore"),
+        metrics=[MAE(missing_mode="ignore")],
         pool=pool,
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    auto.fit(ts=example_tsds, n_trials=11, tune_size=tune_size)
+    auto.fit(ts=ts, n_trials=11, tune_size=tune_size)
 
     assert len(auto._pool_optuna.study.trials) == 3
     assert len(auto.summary()) == 11
diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py
index 0a1b972a0..41562994d 100644
--- a/tests/test_auto/test_tune.py
+++ b/tests/test_auto/test_tune.py
@@ -21,22 +21,37 @@
 from etna.reconciliation import BottomUpReconciliator
 from etna.transforms import AddConstTransform
 from etna.transforms import DateFlagsTransform
+from etna.transforms import TimeSeriesImputerTransform
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
+@pytest.mark.parametrize(
+    "ts_name",
+    [
+        "example_tsds",
+        "ts_with_few_missing",
+        "ts_with_fold_missing_tail",
+        "ts_with_fold_missing_middle",
+        "ts_with_all_folds_missing_one_segment",
+    ],
+)
 def test_objective(
-    example_tsds,
-    target_metric=MAE(),
+    validate_on_dataset_mock,
+    ts_name,
+    request,
+    target_metric=MAE(missing_mode="ignore"),
     metric_aggregation: Literal["mean"] = "mean",
-    metrics=[MAE()],
+    metrics=[MAE(missing_mode="ignore")],
     backtest_params={},
-    initializer=MagicMock(spec=_Initializer),
-    callback=MagicMock(spec=_Callback),
-    pipeline=Pipeline(NaiveModel()),
+    pipeline=Pipeline(model=NaiveModel(), transforms=[TimeSeriesImputerTransform()], horizon=7),
     params_to_tune={},
 ):
+    ts = request.getfixturevalue(ts_name)
+    initializer = MagicMock(spec=_Initializer)
+    callback = MagicMock(spec=_Callback)
     trial = MagicMock()
     _objective = Tune.objective(
-        ts=example_tsds,
+        ts=ts,
         pipeline=pipeline,
         params_to_tune=params_to_tune,
         target_metric=target_metric,
@@ -53,6 +68,39 @@ def test_objective(
     callback.assert_called_once()
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
+@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_all_segments"])
+def test_objective_fail_none(
+    validate_on_dataset_mock,
+    ts_name,
+    request,
+    target_metric=MAE(missing_mode="ignore"),
+    metric_aggregation: Literal["mean"] = "mean",
+    metrics=[MAE(missing_mode="ignore")],
+    backtest_params={},
+    initializer=MagicMock(spec=_Initializer),
+    callback=MagicMock(spec=_Callback),
+    pipeline=Pipeline(model=NaiveModel(), transforms=[TimeSeriesImputerTransform()], horizon=7),
+    params_to_tune={},
+):
+    ts = request.getfixturevalue(ts_name)
+    trial = MagicMock()
+    _objective = Tune.objective(
+        ts=ts,
+        pipeline=pipeline,
+        params_to_tune=params_to_tune,
+        target_metric=target_metric,
+        metric_aggregation=metric_aggregation,
+        metrics=metrics,
+        backtest_params=backtest_params,
+        initializer=initializer,
+        callback=callback,
+    )
+
+    with pytest.raises(ValueError, match="Metric value is None"):
+        _ = _objective(trial)
+
+
 def test_fit_called_tune(
     ts=MagicMock(),
     tune=MagicMock(),
@@ -92,6 +140,7 @@ def test_init_optuna(
     )
 
 
+@pytest.mark.filterwarnings("ignore: overflow encountered in multiply")
 @pytest.mark.parametrize(
     "params, model",
     [
@@ -165,23 +214,30 @@ def test_top_k(
     assert [pipeline.model.lag for pipeline in top_k] == [i for i in range(expected_k)]  # noqa C416
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize(
     "pipeline",
     [
-        (Pipeline(NaiveModel(1), horizon=7)),
-        (AutoRegressivePipeline(model=NaiveModel(1), horizon=7, transforms=[])),
-        (AutoRegressivePipeline(model=NaiveModel(1), horizon=7, transforms=[DateFlagsTransform()])),
+        (Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7)),
+        (AutoRegressivePipeline(model=NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7)),
+        (
+            AutoRegressivePipeline(
+                model=NaiveModel(1), transforms=[DateFlagsTransform(), TimeSeriesImputerTransform()], horizon=7
+            )
+        ),
     ],
 )
-def test_tune_run(example_tsds, optuna_storage, pipeline):
+def test_tune_run(ts_name, optuna_storage, pipeline, request):
+    ts = request.getfixturevalue(ts_name)
     tune = Tune(
         pipeline=pipeline,
-        target_metric=MAE(),
+        target_metric=MAE(missing_mode="ignore"),
+        metrics=[MAE(missing_mode="ignore")],
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    tune.fit(ts=example_tsds, n_trials=2)
+    tune.fit(ts=ts, n_trials=2)
 
     assert len(tune._optuna.study.trials) == 2
     assert len(tune.summary()) == 2
diff --git a/tests/test_loggers/test_file_logger.py b/tests/test_loggers/test_file_logger.py
index 8f3385435..1e120534f 100644
--- a/tests/test_loggers/test_file_logger.py
+++ b/tests/test_loggers/test_file_logger.py
@@ -154,6 +154,7 @@ def test_base_file_logger_log_backtest_run(example_tsds: TSDataset):
         "median",
         "mean",
         "std",
+        "notna_size",
         "percentile_5",
         "percentile_25",
         "percentile_75",
@@ -213,7 +214,16 @@ def test_base_file_logger_log_backtest_metrics(example_tsds: TSDataset, aggregat
     with open(crossval_results_folder.joinpath("metrics_summary.json"), "r") as inf:
         metrics_summary = json.load(inf)
 
-    statistic_keys = ["median", "mean", "std", "percentile_5", "percentile_25", "percentile_75", "percentile_95"]
+    statistic_keys = [
+        "median",
+        "mean",
+        "std",
+        "notna_size",
+        "percentile_5",
+        "percentile_25",
+        "percentile_75",
+        "percentile_95",
+    ]
     assert len(metrics_summary.keys()) == len(metrics) * len(statistic_keys)
 
     tslogger.remove(idx)
diff --git a/tests/test_metrics/test_metrics_utils.py b/tests/test_metrics/test_metrics_utils.py
index 512186c5f..2114d17e5 100644
--- a/tests/test_metrics/test_metrics_utils.py
+++ b/tests/test_metrics/test_metrics_utils.py
@@ -1,11 +1,16 @@
+from typing import Any
+from typing import Dict
 from typing import Tuple
 
 import numpy as np
+import pandas as pd
+import pytest
 
 from etna.datasets import TSDataset
 from etna.metrics import MAE
 from etna.metrics import MAPE
 from etna.metrics import MSE
+from etna.metrics.utils import aggregate_metrics_df
 from etna.metrics.utils import compute_metrics
 
 
@@ -21,3 +26,87 @@ def test_compute_metrics(train_test_dfs: Tuple[TSDataset, TSDataset]):
     ]
     result = compute_metrics(metrics=metrics, y_true=true_df, y_pred=forecast_df)
     np.testing.assert_array_equal(sorted(expected_keys), sorted(result.keys()))
+
+
+@pytest.fixture
+def metrics_df_with_folds() -> pd.DataFrame:
+    df = pd.DataFrame(
+        {
+            "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3,
+            "MAE": [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0],
+            "MSE": [None, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0],
+            "MAPE": [None, None, None, 20.0, 30.0, 40.0, 30.0, 40.0, 50.0],
+            "SMAPE": [None, None, None, None, None, None, 50.0, 60.0, 70.0],
+            "RMSE": [None, None, None, None, None, None, None, None, None],
+            "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2],
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def metrics_df_no_folds(metrics_df_with_folds) -> pd.DataFrame:
+    df = metrics_df_with_folds
+    df = df.groupby("segment").mean(numeric_only=False).reset_index().drop("fold_number", axis=1)
+    return df
+
+
+@pytest.fixture
+def aggregated_metrics_df() -> Dict[str, Any]:
+    result = {
+        "MAE_mean": 3.0,
+        "MAE_median": 3.0,
+        "MAE_std": 0.816496580927726,
+        "MAE_notna_size": 3.0,
+        "MAE_percentile_5": 2.1,
+        "MAE_percentile_25": 2.5,
+        "MAE_percentile_75": 3.5,
+        "MAE_percentile_95": 3.9,
+        "MSE_mean": 4.5,
+        "MSE_median": 4.0,
+        "MSE_std": 1.0801234497346435,
+        "MSE_notna_size": 3.0,
+        "MSE_percentile_5": 3.55,
+        "MSE_percentile_25": 3.75,
+        "MSE_percentile_75": 5.0,
+        "MSE_percentile_95": 5.8,
+        "MAPE_mean": 35.0,
+        "MAPE_median": 35.0,
+        "MAPE_std": 5.0,
+        "MAPE_notna_size": 2.0,
+        "MAPE_percentile_5": 30.5,
+        "MAPE_percentile_25": 32.5,
+        "MAPE_percentile_75": 37.5,
+        "MAPE_percentile_95": 39.5,
+        "SMAPE_mean": 60.0,
+        "SMAPE_median": 60.0,
+        "SMAPE_std": 0.0,
+        "SMAPE_notna_size": 1.0,
+        "SMAPE_percentile_5": 60.0,
+        "SMAPE_percentile_25": 60.0,
+        "SMAPE_percentile_75": 60.0,
+        "SMAPE_percentile_95": 60.0,
+        "RMSE_mean": None,
+        "RMSE_median": None,
+        "RMSE_std": None,
+        "RMSE_notna_size": 0.0,
+        "RMSE_percentile_5": None,
+        "RMSE_percentile_25": None,
+        "RMSE_percentile_75": None,
+        "RMSE_percentile_95": None,
+    }
+    return result
+
+
+@pytest.mark.parametrize(
+    "df_name, answer_name",
+    [
+        ("metrics_df_with_folds", "aggregated_metrics_df"),
+        ("metrics_df_no_folds", "aggregated_metrics_df"),
+    ],
+)
+def test_aggregate_metrics_df(df_name, answer_name, request):
+    metrics_df = request.getfixturevalue(df_name)
+    answer = request.getfixturevalue(answer_name)
+    result = aggregate_metrics_df(metrics_df)
+    assert result == answer
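
Illustrative note (not part of the patch): a minimal sketch of how the updated `aggregate_metrics_df` is expected to behave once the changes above are applied. The toy frame below is made up for the example; only the function and key names come from the diff.

    import pandas as pd

    from etna.metrics.utils import aggregate_metrics_df

    # segment_0 has no valid MSE at all, so its per-segment mean becomes NaN after the groupby
    metrics_df = pd.DataFrame(
        {
            "segment": ["segment_0", "segment_0", "segment_1", "segment_1"],
            "MAE": [1.0, 2.0, 3.0, 4.0],
            "MSE": [None, None, 5.0, 7.0],
            "fold_number": [0, 1, 0, 1],
        }
    )

    result = aggregate_metrics_df(metrics_df)
    # NaN-aware aggregations skip missing per-segment values instead of propagating NaN,
    # fully missing statistics come back as None, and "notna_size" counts segments with a valid value
    assert result["MAE_mean"] == 2.5
    assert result["MSE_mean"] == 6.0
    assert result["MSE_notna_size"] == 1.0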