Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update interval metrics to handle missing values #541

Merged
merged 6 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add parameter `missing_mode` into `MAE` metric ([#523](https://github.com/etna-team/etna/pull/523))
- Add parameter `missing_mode` into `MAPE` and `SMAPE` metrics ([#524](https://github.com/etna-team/etna/pull/524))
- Add parameter `missing_mode` into `Sign`, `WAPE` and `MaxDeviation` metrics ([#530](https://github.com/etna-team/etna/pull/530))
- Add parameter `missing_mode` into `Coverage` and `Width` metrics ([#541](https://github.com/etna-team/etna/pull/541))
-
- Update `aggregate_metrics_df` to work with `None` values ([#522](https://github.com/etna-team/etna/pull/522))
-
Expand Down
69 changes: 58 additions & 11 deletions etna/metrics/intervals_metrics.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings
from reprlib import repr
from typing import Dict
from typing import Optional
from typing import Sequence
Expand All @@ -8,7 +10,7 @@
import pandas as pd

from etna.datasets import TSDataset
from etna.metrics.base import Metric
from etna.metrics.base import MetricWithMissingHandling
from etna.metrics.functional_metrics import ArrayLike


Expand All @@ -34,17 +36,36 @@
if not borders_presented:
raise ValueError("Provided intervals borders names must be in dataset!")

else:
missing_per_segment = ts.loc[:, pd.IndexSlice[:, list(borders_set)]].isna().any()
if missing_per_segment.any():
raise ValueError(

Check warning on line 42 in etna/metrics/intervals_metrics.py

View check run for this annotation

Codecov / codecov/patch

etna/metrics/intervals_metrics.py#L42

Added line #L42 was not covered by tests
"Provided intervals borders contain missing values! "
f"Series with missing values {repr(missing_per_segment[missing_per_segment].index.tolist())}"
)

else:
if not quantiles_presented:
raise ValueError("All quantiles must be presented in the dataset!")

else:
missing_per_segment = ts.loc[:, pd.IndexSlice[:, list(quantiles_set)]].isna().any()
if missing_per_segment.any():
raise ValueError(
"Quantiles contain missing values! "
f"Series with missing values {repr(missing_per_segment[missing_per_segment].index.tolist())}"
)


class Coverage(Metric, _IntervalsMetricMixin):
class Coverage(MetricWithMissingHandling, _IntervalsMetricMixin):
"""Coverage metric for prediction intervals - percentage of samples in the interval ``[lower quantile, upper quantile]``.

.. math::
Coverage(y\_true, y\_pred) = \\frac{\\sum_{i=1}^{n}{[ y\_true_i \\ge y\_pred_i^{lower\_quantile}] * [y\_true_i \\le y\_pred_i^{upper\_quantile}] }}{n}

This metric can handle missing values with parameter ``missing_mode``.
If there are too many of them in ``ignore`` mode, the result will be ``None``.

Notes
-----
Works only if ``quantiles`` are present in ``y_pred``
Expand All @@ -58,6 +79,7 @@
mode: str = "per-segment",
upper_name: Optional[str] = None,
lower_name: Optional[str] = None,
missing_mode: str = "error",
**kwargs,
):
"""Init metric.
Expand All @@ -78,6 +100,8 @@
name of column with upper border of the interval
lower_name:
name of column with lower border of the interval
missing_mode:
mode of handling missing values (see :py:class:`~etna.metrics.base.MetricMissingMode`)
kwargs:
metric's computation arguments
"""
Expand All @@ -96,7 +120,7 @@
if quantiles is None and lower_name is None:
quantiles = (0.025, 0.975)

super().__init__(mode=mode, metric_fn=dummy, **kwargs)
super().__init__(mode=mode, metric_fn=dummy, missing_mode=missing_mode, **kwargs)
self.quantiles = sorted(quantiles if quantiles is not None else tuple())
self.upper_name = upper_name
self.lower_name = lower_name
Expand Down Expand Up @@ -136,7 +160,7 @@
lower_border = f"target_{self.quantiles[0]:.4g}"
upper_border = f"target_{self.quantiles[1]:.4g}"

df_true = y_true[:, :, "target"].sort_index(axis=1)
df_true = y_true.df.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)

intervals_df: pd.DataFrame = y_pred.get_prediction_intervals()
df_pred_lower = intervals_df.loc[:, pd.IndexSlice[:, lower_border]].sort_index(axis=1)
Expand All @@ -146,8 +170,19 @@

upper_quantile_flag = df_true.values <= df_pred_upper.values
lower_quantile_flag = df_true.values >= df_pred_lower.values
values = np.mean(upper_quantile_flag * lower_quantile_flag, axis=0)
metrics_per_segment = dict(zip(segments, values))

nan_mask = np.isnan(df_true.values) | np.isnan(df_pred_upper.values) | np.isnan(df_pred_lower.values)
in_bounds = (upper_quantile_flag * lower_quantile_flag).astype(float)
in_bounds[nan_mask] = np.NaN

with warnings.catch_warnings():
warnings.filterwarnings(
message="Mean of empty slice",
action="ignore",
)
values = np.nanmean(in_bounds, axis=0)

metrics_per_segment = dict(zip(segments, (None if np.isnan(x) else x for x in values)))

metrics = self._aggregate_metrics(metrics_per_segment)
return metrics
Expand All @@ -158,12 +193,15 @@
return None


class Width(Metric, _IntervalsMetricMixin):
class Width(MetricWithMissingHandling, _IntervalsMetricMixin):
"""Mean width of prediction intervals.

.. math::
Width(y\_true, y\_pred) = \\frac{\\sum_{i=1}^{n}\\mid y\_pred_i^{upper\_quantile} - y\_pred_i^{lower\_quantile} \\mid}{n}

This metric can handle missing values with parameter ``missing_mode``.
If there are too many of them in ``ignore`` mode, the result will be ``None``.

Notes
-----
Works only if quantiles are present in ``y_pred``.
Expand All @@ -177,6 +215,7 @@
mode: str = "per-segment",
upper_name: Optional[str] = None,
lower_name: Optional[str] = None,
missing_mode: str = "error",
**kwargs,
):
"""Init metric.
Expand All @@ -197,6 +236,8 @@
name of column with upper border of the interval
lower_name:
name of column with lower border of the interval
missing_mode:
mode of handling missing values (see :py:class:`~etna.metrics.base.MetricMissingMode`)
kwargs:
metric's computation arguments
"""
Expand All @@ -215,7 +256,7 @@
if quantiles is None and lower_name is None:
quantiles = (0.025, 0.975)

super().__init__(mode=mode, metric_fn=dummy, **kwargs)
super().__init__(mode=mode, metric_fn=dummy, missing_mode=missing_mode, **kwargs)
self.quantiles = sorted(quantiles if quantiles is not None else tuple())
self.upper_name = upper_name
self.lower_name = lower_name
Expand Down Expand Up @@ -255,16 +296,22 @@
lower_border = f"target_{self.quantiles[0]:.4g}"
upper_border = f"target_{self.quantiles[1]:.4g}"

df_true = y_true[:, :, "target"].sort_index(axis=1)
df_true = y_true.df.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)

intervals_df: pd.DataFrame = y_pred.get_prediction_intervals()
df_pred_lower = intervals_df.loc[:, pd.IndexSlice[:, lower_border]].sort_index(axis=1)
df_pred_upper = intervals_df.loc[:, pd.IndexSlice[:, upper_border]].sort_index(axis=1)

segments = df_true.columns.get_level_values("segment").unique()

values = np.mean(np.abs(df_pred_upper.values - df_pred_lower.values), axis=0)
metrics_per_segment = dict(zip(segments, values))
with warnings.catch_warnings():
warnings.filterwarnings(
message="Mean of empty slice",
action="ignore",
)
values = np.nanmean(np.abs(df_pred_upper.values - df_pred_lower.values), axis=0)

metrics_per_segment = dict(zip(segments, (None if np.isnan(x) else x for x in values)))

metrics = self._aggregate_metrics(metrics_per_segment)
return metrics
Expand Down
Loading
Loading