Merge branch 'master' into issue-535
brsnw250 authored Dec 23, 2024
2 parents b10ee3c + 20d7fe9 commit 059675b
Showing 12 changed files with 338 additions and 60 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -38,7 +38,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Update `aggregate_metrics_df` to work with `None` values ([#522](https://github.com/etna-team/etna/pull/522))
-
-
-
- Rework validation of `FoldMask` to not fail on tail nans ([#536](https://github.com/etna-team/etna/pull/536))
- Add parameter `missing_mode` into `R2` and `MedAE` metrics ([#537](https://github.com/etna-team/etna/pull/537))
-
-

@@ -50,7 +51,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
- Fix `target` leakage in `MeanSegmentEncoderTransform` ([#503](https://github.com/etna-team/etna/pull/503))
-
-
- Add handling scikit-learn version >= 1.4 in `OneHotEncoderTransform` and `HierarchicalClustering` ([#529](https://github.com/etna-team/etna/pull/529))
-
-
-
10 changes: 9 additions & 1 deletion etna/clustering/hierarchical/base.py
@@ -5,6 +5,7 @@
from typing import Union

import pandas as pd
from sklearn import __version__ as sklearn_version
from sklearn.cluster import AgglomerativeClustering

from etna.clustering.base import Clustering
@@ -81,9 +82,16 @@ def build_clustering_algo(
"""
self.n_clusters = n_clusters
self.linkage = ClusteringLinkageMode(linkage).name

sklearn_version_tuple = tuple(map(int, sklearn_version.split(".")))
if sklearn_version_tuple < (1, 2):
clustering_algo_params["affinity"] = "precomputed"
else:
clustering_algo_params["metric"] = "precomputed"
self.clustering_algo = AgglomerativeClustering(
n_clusters=self.n_clusters, affinity="precomputed", linkage=self.linkage, **clustering_algo_params
n_clusters=self.n_clusters, linkage=self.linkage, **clustering_algo_params
)

self.clusters = None
self.segment2cluster = None
self.centroids_df = None
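The version gate added above exists because scikit-learn 1.2 renamed the `affinity` argument of `AgglomerativeClustering` to `metric`. A minimal standalone sketch of the same pattern (the `n_clusters` and `linkage` values are illustrative, and a plain `MAJOR.MINOR.PATCH` version string is assumed, as in the diff):

```python
from sklearn import __version__ as sklearn_version
from sklearn.cluster import AgglomerativeClustering

# scikit-learn < 1.2 expects "affinity"; newer versions expect "metric".
params = {}
if tuple(map(int, sklearn_version.split("."))) < (1, 2):
    params["affinity"] = "precomputed"
else:
    params["metric"] = "precomputed"

# "precomputed" means fit() will be given a distance matrix rather than raw features.
clustering_algo = AgglomerativeClustering(n_clusters=2, linkage="average", **params)
```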
2 changes: 0 additions & 2 deletions etna/metrics/__init__.py
@@ -1,8 +1,6 @@
"""Module with metrics of forecasting quality."""

from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import median_absolute_error as medae
from sklearn.metrics import r2_score

from etna.metrics.base import Metric
from etna.metrics.base import MetricAggregationMode
134 changes: 127 additions & 7 deletions etna/metrics/functional_metrics.py
@@ -8,8 +8,6 @@
import numpy as np
from sklearn.metrics import mean_squared_error as mse_sklearn
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import median_absolute_error as medae
from sklearn.metrics import r2_score
from typing_extensions import assert_never

ArrayLike = Union[float, Sequence[float], Sequence[Sequence[float]]]
@@ -243,6 +241,128 @@ def smape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15, multioutput:
return result


def r2_score(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike:
"""Coefficient of determination metric.
.. math::
R^2(y\_true, y\_pred) = 1 - \\frac{\\sum_{i=1}^{n}{(y\_true_i - y\_pred_i)^2}}{\\sum_{i=1}^{n}{(y\_true_i - \\overline{y\_true})^2}}
The nans are ignored during computation. If all values are nans, the result is NaN.
Parameters
----------
y_true:
array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred:
array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
multioutput:
Defines aggregating of multiple output values
(see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`).
Returns
-------
:
A floating point value, or an array of floating point values,
one for each individual target.
"""
y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred)

if len(y_true_array.shape) != len(y_pred_array.shape):
raise ValueError("Shapes of the labels must be the same")

axis = _get_axis_by_multioutput(multioutput)
not_nan = ~np.isnan(y_true_array - y_pred_array)
with warnings.catch_warnings():
# this helps to prevent warning in case of all nans
warnings.filterwarnings(
message="invalid value encountered in scalar divide",
action="ignore",
)
warnings.filterwarnings(
message="invalid value encountered in divide",
action="ignore",
)
warnings.filterwarnings(
message="Degrees of freedom <= 0 for slice",
action="ignore",
)

numerator = np.asarray(mse(y_true=y_true, y_pred=y_pred, multioutput=multioutput))
y_true_array = y_true_array.astype(float) # otherwise we can't assign NaN to it
y_true_array[~not_nan] = np.NaN
denominator = np.asarray(np.nanvar(y_true_array, axis=axis))
nonzero_numerator = np.asarray(numerator != 0)
nonzero_denominator = np.asarray(denominator != 0)

result = np.ones_like(numerator, dtype=float)
valid_score = nonzero_denominator & nonzero_numerator
# if numerator and denominator aren't zero, then just compute r2_score
result[valid_score] = 1 - (numerator[valid_score] / denominator[valid_score])
# if numerator is non-zero, the answer is 0.0, otherwise (getting 0/0) the answer is 1.0
result[nonzero_numerator & ~nonzero_denominator] = 0.0

# if there are less than 2 values, result is NaN
num_not_nans = np.sum(not_nan, axis=axis)
result = np.where(num_not_nans < 2, np.NaN, result)

if multioutput is FunctionalMetricMultioutput.joint:
return result.item()
else:
return result # type: ignore


def medae(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike:
"""Median absolute error metric.
.. math::
MedAE(y\_true, y\_pred) = median(\\mid y\_true_1 - y\_pred_1 \\mid, \\cdots, \\mid y\_true_n - y\_pred_n \\mid)
The nans are ignored during computation. If all values are nans, the result is NaN.
Parameters
----------
y_true:
array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred:
array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
multioutput:
Defines aggregating of multiple output values
(see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`).
Returns
-------
:
A non-negative floating point value (the best value is 0.0), or an array of floating point values,
one for each individual target.
"""
y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred)

if len(y_true_array.shape) != len(y_pred_array.shape):
raise ValueError("Shapes of the labels must be the same")

axis = _get_axis_by_multioutput(multioutput)
with warnings.catch_warnings():
# this helps to prevent warning in case of all nans
warnings.filterwarnings(
message="All-NaN slice encountered",
action="ignore",
)
result = np.nanmedian(np.abs(y_true_array - y_pred_array), axis=axis)
return result


def sign(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike:
"""Sign error metric.
@@ -331,10 +451,10 @@ def max_deviation(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "join
isnan = np.all(np.isnan(diff), axis=axis)
result = np.max(np.abs(prefix_error_sum), axis=axis)
result = np.where(isnan, np.NaN, result)
try:
if multioutput is FunctionalMetricMultioutput.joint:
return result.item()
except ValueError as e:
return result # type: ignore
else:
return result


rmse = partial(mse_sklearn, squared=False)
@@ -401,9 +521,9 @@ def wape(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> Ar
isnan = np.all(isnan, axis=axis)
result = np.where(denominator == 0, np.NaN, numerator / denominator)
result = np.where(isnan, np.NaN, result)
try:
if multioutput is FunctionalMetricMultioutput.joint:
return result.item()
except ValueError as e:
else:
return result # type: ignore


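A rough usage sketch of the two new functional metrics (the numbers are illustrative; the behaviour follows the docstrings above):

```python
import numpy as np

from etna.metrics.functional_metrics import medae, r2_score

y_true = np.array([1.0, 2.0, np.nan, 4.0])
y_pred = np.array([1.1, 1.9, 3.0, 4.2])

# The NaN pair is skipped, so only the three valid points contribute.
r2_score(y_true, y_pred)  # close to 1.0
medae(y_true, y_pred)     # median of [0.1, 0.1, 0.2] -> 0.1

# Fewer than two valid points make the coefficient of determination undefined.
r2_score(np.array([np.nan, np.nan]), np.array([1.0, 2.0]))  # nan
```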
35 changes: 29 additions & 6 deletions etna/metrics/metrics.py
@@ -148,17 +148,21 @@ def greater_is_better(self) -> bool:
return False


class R2(Metric):
class R2(MetricWithMissingHandling):
"""Coefficient of determination metric with multi-segment computation support.
.. math::
R^2(y\_true, y\_pred) = 1 - \\frac{\\sum_{i=1}^{n}{(y\_true_i - y\_pred_i)^2}}{\\sum_{i=1}^{n}{(y\_true_i - \\overline{y\_true})^2}}
This metric can handle missing values with parameter ``missing_mode``.
If there are too many of them in ``ignore`` mode, the result will be ``None``.
Notes
-----
You can read more about logic of multi-segment metrics in Metric docs.
"""

def __init__(self, mode: str = "per-segment", **kwargs):
def __init__(self, mode: str = "per-segment", missing_mode: str = "error", **kwargs):
"""Init metric.
Parameters
@@ -171,11 +175,19 @@ def __init__(self, mode: str = "per-segment", **kwargs):
* if "per-segment" -- does not aggregate metrics
See :py:class:`~etna.metrics.base.MetricAggregationMode`.
missing_mode:
mode of handling missing values (see :py:class:`~etna.metrics.base.MetricMissingMode`)
kwargs:
metric's computation arguments
"""
r2_per_output = partial(r2_score, multioutput="raw_values")
super().__init__(mode=mode, metric_fn=r2_per_output, metric_fn_signature="matrix_to_array", **kwargs)
super().__init__(
mode=mode,
metric_fn=r2_per_output,
metric_fn_signature="matrix_to_array",
missing_mode=missing_mode,
**kwargs,
)

@property
def greater_is_better(self) -> bool:
@@ -277,18 +289,21 @@ def greater_is_better(self) -> bool:
return False


class MedAE(Metric):
class MedAE(MetricWithMissingHandling):
"""Median absolute error metric with multi-segment computation support.
.. math::
MedAE(y\_true, y\_pred) = median(\\mid y\_true_1 - y\_pred_1 \\mid, \\cdots, \\mid y\_true_n - y\_pred_n \\mid)
This metric can handle missing values with parameter ``missing_mode``.
If there are too many of them in ``ignore`` mode, the result will be ``None``.
Notes
-----
You can read more about logic of multi-segment metrics in Metric docs.
"""

def __init__(self, mode: str = "per-segment", **kwargs):
def __init__(self, mode: str = "per-segment", missing_mode: str = "error", **kwargs):
"""Init metric.
Parameters
@@ -301,11 +316,19 @@ def __init__(self, mode: str = "per-segment", **kwargs):
* if "per-segment" -- does not aggregate metrics
See :py:class:`~etna.metrics.base.MetricAggregationMode`.
missing_mode:
mode of handling missing values (see :py:class:`~etna.metrics.base.MetricMissingMode`)
kwargs:
metric's computation arguments
"""
medae_per_output = partial(medae, multioutput="raw_values")
super().__init__(mode=mode, metric_fn=medae_per_output, metric_fn_signature="matrix_to_array", **kwargs)
super().__init__(
mode=mode,
metric_fn=medae_per_output,
metric_fn_signature="matrix_to_array",
missing_mode=missing_mode,
**kwargs,
)

@property
def greater_is_better(self) -> bool:
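With `missing_mode` wired through, the class-level metrics can be configured the same way as the metrics that already supported it. A small sketch (the `ts_true`/`ts_forecast` datasets are assumed to be prepared elsewhere):

```python
from etna.metrics import R2, MedAE

# "ignore" skips NaN target values instead of raising; a segment with too many
# missing points gets None instead of a number (see the class docstrings).
r2 = R2(mode="per-segment", missing_mode="ignore")
medae_metric = MedAE(mode="per-segment", missing_mode="ignore")

# Applied as usual to a pair of TSDataset objects:
# r2(y_true=ts_true, y_pred=ts_forecast)
```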
8 changes: 0 additions & 8 deletions etna/pipeline/base.py
@@ -184,8 +184,6 @@ def validate_on_dataset(self, ts: TSDataset, horizon: int):
Some of target timestamps aren't present in a given dataset
ValueError:
First train timestamp should be later than minimal dataset timestamp
ValueError:
Last train timestamp should be not later than the ending of the shortest segment
ValueError:
Last target timestamp should be not later than horizon steps after last train timestamp
"""
@@ -201,12 +199,6 @@ def validate_on_dataset(self, ts: TSDataset, horizon: int):
diff = set(self.target_timestamps).difference(set(timestamps))
raise ValueError(f"Some target timestamps aren't present in a given dataset: {reprlib.repr(diff)}")

dataset_description = ts.describe()

dataset_min_last_timestamp = dataset_description["end_timestamp"].min()
if self.last_train_timestamp > dataset_min_last_timestamp:
raise ValueError(f"Last train timestamp should be not later than {dataset_min_last_timestamp}!")

dataset_horizon_border_timestamp = timestamps[timestamps.index(self.last_train_timestamp) + horizon]
mask_last_target_timestamp = self.target_timestamps[-1]
if dataset_horizon_border_timestamp < mask_last_target_timestamp:
11 changes: 10 additions & 1 deletion etna/transforms/encoders/categorical.py
@@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
from sklearn import __version__ as sklearn_version
from sklearn import preprocessing
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode
@@ -215,7 +216,15 @@ def __init__(self, in_column: str, out_column: Optional[str] = None, return_type
self.in_column = in_column
self.out_column = out_column
self.return_type = ReturnType(return_type)
self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=int)

sklearn_version_tuple = tuple(map(int, sklearn_version.split(".")))
encoder_params = {}
if sklearn_version_tuple < (1, 2):
encoder_params["sparse"] = False
else:
encoder_params["sparse_output"] = False
self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", dtype=int, **encoder_params)

self.in_column_regressor: Optional[bool] = None

def get_regressors_info(self) -> List[str]:
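The same kind of version split is used here: `sparse` was deprecated in scikit-learn 1.2 in favor of `sparse_output` and removed in 1.4, which is what the changelog entry above refers to. A small self-contained sketch with illustrative data:

```python
import numpy as np
from sklearn import __version__ as sklearn_version
from sklearn import preprocessing

# Pick the keyword supported by the installed scikit-learn version.
encoder_params = {}
if tuple(map(int, sklearn_version.split("."))) < (1, 2):
    encoder_params["sparse"] = False
else:
    encoder_params["sparse_output"] = False

ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", dtype=int, **encoder_params)
ohe.fit_transform(np.array([["a"], ["b"], ["a"]]))  # dense 3x2 array of 0/1 ints
```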
4 changes: 0 additions & 4 deletions tests/test_auto/test_auto.py
@@ -47,7 +47,6 @@ def pool_list():
]


@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock()) # TODO: remove after fix
@pytest.mark.parametrize(
"ts_name",
[
@@ -59,7 +58,6 @@
],
)
def test_objective(
validate_on_dataset_mock,
ts_name,
request,
target_metric=MAE(missing_mode="ignore"),
Expand Down Expand Up @@ -93,10 +91,8 @@ def test_objective(
callback.assert_called_once()


@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock()) # TODO: remove after fix
@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_all_segments"])
def test_objective_fail_none(
validate_on_dataset_mock,
ts_name,
request,
target_metric=MAE(missing_mode="ignore"),
