Merge cat-features-dl-models into master (#344)
* Allow RNN to work with categorical features (#334)

* add support for categorical features in RNN

* update changelog

* fix tests

* add test on MultiEmbedding, write comments

* remove numeric columns that are also categorical

* fix docs

---------

Co-authored-by: Egor Baturin <[email protected]>

* Allow DeepAR Native, MLP to work with categorical features (#336)

* add handling of categorical features in MLP and DeepAR Native

* update changelog

* fix

* lints

* fix mlp tests

* fix

---------

Co-authored-by: Egor Baturin <[email protected]>

* Allow DeepState to work with categorical features (#342)

* cat

* add embeddings to deepstate

* update changelog

* minor fixes for other models

---------

Co-authored-by: Egor Baturin <[email protected]>

* update notebook

* update notebook: final

* lints

---------

Co-authored-by: Egor Baturin <[email protected]>
egoriyaa and Egor Baturin authored May 28, 2024
1 parent 5ae3f7e commit 4ba2c0e
Showing 13 changed files with 1,758 additions and 763 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -25,8 +25,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-

### Changed
-
-
- Allow `RNNModel` to work with categorical features ([#334](https://github.com/etna-team/etna/pull/334))
- Allow `DeepARNativeModel` and `MLPModel` to work with categorical features ([#336](https://github.com/etna-team/etna/pull/336))
- Allow `DeepState` to work with categorical features ([#342](https://github.com/etna-team/etna/pull/342))
-
-
-
76 changes: 69 additions & 7 deletions etna/models/nn/deepar_native/deepar.py
@@ -2,6 +2,7 @@
from typing import Dict
from typing import Iterator
from typing import Optional
from typing import Tuple

import numpy as np
import pandas as pd
@@ -20,13 +21,16 @@
from etna.models.base import DeepBaseNet
from etna.models.nn.deepar_native.loss import DeepARLoss
from etna.models.nn.deepar_native.loss import GaussianLoss
from etna.models.nn.utils import MultiEmbedding


class DeepARNativeBatch(TypedDict):
"""Batch specification for DeepAR."""

encoder_real: "torch.Tensor"
decoder_real: "torch.Tensor"
encoder_categorical: Dict[str, "torch.Tensor"]
decoder_categorical: Dict[str, "torch.Tensor"]
encoder_target: "torch.Tensor"
decoder_target: "torch.Tensor"
segment: "torch.Tensor"
@@ -42,6 +46,7 @@ def __init__(
num_layers: int,
dropout: float,
hidden_size: int,
embedding_sizes: Dict[str, Tuple[int, int]],
lr: float,
scale: bool,
n_samples: int,
@@ -60,6 +65,8 @@ def __init__(
dropout rate in rnn layer
hidden_size:
size of the hidden state
embedding_sizes:
dictionary mapping a categorical feature name to a tuple of (number of categorical classes, embedding size)
lr:
learning rate
scale:
@@ -77,15 +84,22 @@
self.num_layers = num_layers
self.dropout = dropout
self.hidden_size = hidden_size
self.embedding_sizes = embedding_sizes
self.lr = lr
self.scale = scale
self.n_samples = n_samples
self.loss = loss
self.optimizer_params = {} if optimizer_params is None else optimizer_params
self.cat_size = sum([dim for (_, dim) in self.embedding_sizes.values()])
self.embedding: Optional[MultiEmbedding] = None
if self.embedding_sizes:
self.embedding = MultiEmbedding(
embedding_sizes=self.embedding_sizes,
)
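# Illustrative (assumed) sizes: with embedding_sizes={"cat_a": (10, 3), "cat_b": (4, 2)},
# cat_size = 3 + 2 = 5, so the LSTM below consumes input_size + 5 features per time step.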
self.rnn = nn.LSTM(
num_layers=self.num_layers,
hidden_size=self.hidden_size,
input_size=self.input_size,
input_size=self.input_size + self.cat_size,
batch_first=True,
dropout=self.dropout,
)
Expand Down Expand Up @@ -114,22 +128,31 @@ def forward(self, x: DeepARNativeBatch, *args, **kwargs): # type: ignore
"""
encoder_real = x["encoder_real"].float() # (batch_size, encoder_length-1, input_size)
decoder_real = x["decoder_real"].float() # (batch_size, decoder_length, input_size)
encoder_categorical = x["encoder_categorical"] # each (batch_size, encoder_length-1, 1)
decoder_categorical = x["decoder_categorical"] # each (batch_size, decoder_length, 1)
decoder_target = x["decoder_target"].float() # (batch_size, decoder_length, 1)
decoder_length = decoder_real.shape[1]

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()

encoder_values = torch.concat((encoder_real, encoder_embeddings), dim=2)
decoder_values = torch.concat((decoder_real, decoder_embeddings), dim=2)
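# Shapes (illustrative): each embedding tensor is (batch_size, length, cat_size), so after the
# concatenation encoder_values / decoder_values are (batch_size, length, input_size + cat_size);
# with no categorical features the empty placeholder tensor leaves the real values unchanged.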

weights = x["weight"]
forecasts = torch.zeros((decoder_target.shape[0], decoder_target.shape[1], self.n_samples))

for j in range(self.n_samples):
_, (h_n, c_n) = self.rnn(encoder_real)
_, (h_n, c_n) = self.rnn(encoder_values)
for i in range(decoder_length):
output, (h_n, c_n) = self.rnn(decoder_real[:, i, None], (h_n, c_n)) # (batch_size, 1, hidden_size)
output, (h_n, c_n) = self.rnn(decoder_values[:, i, None], (h_n, c_n)) # (batch_size, 1, hidden_size)
loc, scale = self.get_distribution_params(output)
forecast_point = self.loss.sample(
loc=loc, scale=scale, weights=weights, theoretical_mean=self.n_samples == 1
).flatten() # (batch_size)
forecasts[:, i, j] = forecast_point
if i < decoder_length - 1:
decoder_real[:, i + 1, 0] = forecast_point
decoder_values[:, i + 1, 0] = forecast_point
return torch.mean(forecasts, dim=2).unsqueeze(2)

def get_distribution_params(self, output):
Expand Down Expand Up @@ -164,12 +187,21 @@ def step(self, batch: DeepARNativeBatch, *args, **kwargs): # type: ignore
"""
encoder_real = batch["encoder_real"].float() # (batch_size, encoder_length-1, input_size)
decoder_real = batch["decoder_real"].float() # (batch_size, decoder_length, input_size)
encoder_categorical = batch["encoder_categorical"] # each (batch_size, encoder_length-1, 1)
decoder_categorical = batch["decoder_categorical"] # each (batch_size, decoder_length, 1)
encoder_target = batch["encoder_target"].float() # (batch_size, encoder_length-1, 1)
decoder_target = batch["decoder_target"].float() # (batch_size, decoder_length, 1)
weights = batch["weight"]

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()

encoder_values = torch.concat((encoder_real, encoder_embeddings), dim=2)
decoder_values = torch.concat((decoder_real, decoder_embeddings), dim=2)

target = torch.cat((encoder_target, decoder_target), dim=1) # (batch_size, encoder_length+decoder_length-1, 1)
output, (_, _) = self.rnn(
torch.cat((encoder_real, decoder_real), dim=1)
torch.cat((encoder_values, decoder_values), dim=1)
) # (batch_size, encoder_length+decoder_length-1, hidden_size)
loc, scale = self.get_distribution_params(output) # (batch_size, encoder_length+decoder_length-1, 1)
target_prediction = self.loss.sample(loc=loc, scale=scale, weights=weights, theoretical_mean=True)
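# Note: unlike forward(), which unrolls the decoder autoregressively and feeds sampled predictions
# back as inputs, step() pushes the whole encoder+decoder sequence through the LSTM in a single pass,
# using the shifted true target as input (teacher forcing).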
@@ -181,16 +213,24 @@ def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: in
segment = df["segment"].values[0]
values_target = df["target"].values
values_real = (
df.drop(["segment", "timestamp"], axis=1)
df.drop(["segment", "timestamp"] + list(self.embedding_sizes.keys()), axis=1)
.select_dtypes(include=[np.number])
.assign(target_shifted=df["target"].shift(1))
.drop(["target"], axis=1)
.pipe(lambda x: x[["target_shifted"] + [i for i in x.columns if i != "target_shifted"]])
.values
)

# Categories that were not seen during `fit` will be filled with a new category index
for feature in self.embedding_sizes:
df[feature] = df[feature].astype(float).fillna(self.embedding_sizes[feature][0])

# Columns in `values_categorical` are in the same order as in `embedding_sizes`
values_categorical = df[self.embedding_sizes.keys()].values.T
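# After the transpose, values_categorical has shape (n_categorical_features, n_timestamps);
# row i holds the full series of the i-th feature in `embedding_sizes` order.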

def _make(
values_real: np.ndarray,
values_categorical: np.ndarray,
values_target: np.ndarray,
segment: str,
start_idx: int,
@@ -201,6 +241,8 @@
sample: Dict[str, Any] = {
"encoder_real": list(),
"decoder_real": list(),
"encoder_categorical": dict(),
"decoder_categorical": dict(),
"encoder_target": list(),
"decoder_target": list(),
"segment": None,
@@ -219,6 +261,15 @@
sample["encoder_real"] = values_real[start_idx : start_idx + encoder_length].copy()
sample["encoder_real"] = sample["encoder_real"][1:]

for index, feature in enumerate(self.embedding_sizes.keys()):
sample["encoder_categorical"][feature] = values_categorical[index][
start_idx : start_idx + encoder_length
].reshape(-1, 1)[1:]

sample["decoder_categorical"][feature] = values_categorical[index][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(-1, 1)
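# The categorical windows mirror encoder_real / decoder_real: the first encoder step is dropped
# to stay aligned with the shifted target.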

target = values_target[start_idx : start_idx + total_sample_length].reshape(-1, 1)
sample["encoder_target"] = target[1:encoder_length]
sample["decoder_target"] = target[encoder_length:]
@@ -237,6 +288,7 @@ def _make(
batch = _make(
values_target=values_target,
values_real=values_real,
values_categorical=values_categorical,
segment=segment,
start_idx=start_idx,
encoder_length=encoder_length,
@@ -256,6 +308,11 @@ def configure_optimizers(self) -> "torch.optim.Optimizer":
class DeepARNativeModel(DeepBaseModel):
"""DeepAR based model on LSTM cell.
Model needs label encoded inputs for categorical features, for that purpose use :py:class:`~etna.transforms.LabelEncoderTransform`.
Feature values that weren't seen during ``fit`` should be set to NaN; to get this behaviour, use the encoder with *strategy="none"*.
If numeric columns are passed in the ``embedding_sizes`` parameter, they are treated as categorical features only.
Note
----
This model requires ``torch`` extension to be installed.
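A minimal usage sketch of the behaviour described above (a hedged illustration, not part of this diff): the import paths, the ``LabelEncoderTransform`` arguments, and the column names and sizes are assumptions.

from etna.models.nn import DeepARNativeModel
from etna.pipeline import Pipeline
from etna.transforms import LabelEncoderTransform

# Label-encode the raw categorical column; strategy="none" leaves categories unseen
# during fit as NaN, which the model maps to an extra "new category" embedding index.
encoder = LabelEncoderTransform(in_column="weekday", out_column="weekday_label", strategy="none")

model = DeepARNativeModel(
    input_size=1,  # target plus extra numeric features; categorical columns are not counted here
    encoder_length=14,
    decoder_length=7,
    embedding_sizes={"weekday_label": (7, 3)},  # 7 classes -> 3-dimensional embedding
)

pipeline = Pipeline(model=model, transforms=[encoder], horizon=7)
# pipeline.fit(ts); forecast = pipeline.forecast()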
@@ -270,6 +327,7 @@ def __init__(
num_layers: int = 2,
dropout: float = 0.0,
hidden_size: int = 16,
embedding_sizes: Optional[Dict[str, Tuple[int, int]]] = None,
lr: float = 1e-3,
scale: bool = True,
n_samples: int = 1,
@@ -288,7 +346,7 @@ def __init__(
Parameters
----------
input_size:
size of the input feature space: target plus extra features
size of the input numeric feature space: target plus extra numeric features
encoder_length:
encoder length
decoder_length:
@@ -299,6 +357,8 @@
dropout rate in rnn layer
hidden_size:
size of the hidden state
embedding_sizes:
dictionary mapping a categorical feature name to a tuple of (number of categorical classes, embedding size)
lr:
learning rate
scale:
@@ -336,6 +396,7 @@ def __init__(
self.num_layers = num_layers
self.dropout = dropout
self.hidden_size = hidden_size
self.embedding_sizes = embedding_sizes
self.lr = lr
self.scale = scale
self.n_samples = n_samples
@@ -347,6 +408,7 @@
num_layers=num_layers,
dropout=dropout,
hidden_size=hidden_size,
embedding_sizes=embedding_sizes if embedding_sizes is not None else {},
lr=lr,
scale=scale,
n_samples=n_samples,
