add stateful processing of recurrent layers
added initialisation of hidden states to layers; fixes #230
renamed GRU parameters to be consistent with all other layers
Sebastian Böck committed Dec 8, 2016
1 parent 421fa8f commit eb0310e
Showing 3 changed files with 190 additions and 50 deletions.
2 changes: 2 additions & 0 deletions CHANGES.rst
@@ -17,11 +17,13 @@ Bug fixes:
API relevant changes:

* Reorder `GRUCell` parameters, to be consistent with all other layers (#235)
* Rename `GRULayer` parameters, to be consistent with all other layers (#235)

Other changes:

* `num_threads` is passed to `ParallelProcessor` in single mode (#217)
* Use `install_requires` in `setup.py` to specify dependencies (#226)
* Allow initialisation of previous/hidden states in RNNs (#235)


Version 0.14.1 (release date: 2016-08-01)
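To illustrate the `GRULayer` rename listed above, a short sketch (here `reset_gate`, `update_gate`, `cell` and `h0` stand for previously constructed gates, cell and initial state, not shown in this commit):

# before: GRULayer(reset_gate, update_gate, cell, hid_init=h0)
layer = GRULayer(reset_gate, update_gate, cell, init=h0)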
169 changes: 128 additions & 41 deletions madmom/ml/nn/layers.py
@@ -98,43 +98,70 @@ class RecurrentLayer(FeedForwardLayer):
Recurrent weights.
activation_fn : numpy ufunc
Activation function.
init : numpy array, shape (num_hiddens,), optional
Initial state of hidden units.
"""

def __init__(self, weights, bias, recurrent_weights, activation_fn):
def __init__(self, weights, bias, recurrent_weights, activation_fn,
init=None):
super(RecurrentLayer, self).__init__(weights, bias, activation_fn)
self.recurrent_weights = recurrent_weights
if init is None:
init = np.zeros(self.bias.size, dtype=NN_DTYPE)
self.init = init
self._prev = init

def activate(self, data):
def activate(self, data, reset=True):
"""
Activate the layer.
Parameters
----------
data : numpy array, shape (num_frames, num_inputs)
Activate with this data.
reset : bool, optional
Reset the state of the layer before activating it.
Returns
-------
numpy array, shape (num_frames, num_hiddens)
Activations for this data.
"""
# if we don't have recurrent weights, we don't have to loop
if self.recurrent_weights is None:
return super(RecurrentLayer, self).activate(data)
# reset state
if reset:
self.reset()
# weight input and add bias
out = np.dot(data, self.weights) + self.bias
# loop through all time steps
for i in range(len(data)):
# add weighted previous step
if i >= 1:
out[i] += np.dot(out[i - 1], self.recurrent_weights)
out[i] += np.dot(self._prev, self.recurrent_weights)
# apply activation function
self.activation_fn(out[i], out=out[i])
# set reference to current output
self._prev = out[i]
# return
return out

def reset(self, init=None):
"""
Reset the layer to its initial state.
Parameters
----------
init : numpy array, shape (num_hiddens,), optional
Reset the hidden units to this initial state.
"""
# TODO: old models do not have the init attribute, thus create it
# remove this initialisation code after updating the models
if not hasattr(self, 'init'):
self.init = np.zeros(self.bias.size, dtype=NN_DTYPE)
# reset previous time step to initial value
self._prev = self.init if init is None else init
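As an illustration of the new stateful behaviour, a minimal sketch using toy random weights (real weights come from trained models; the dimensions, data and the `np.tanh` activation are made up for this example):

import numpy as np
from madmom.ml.nn.layers import RecurrentLayer

# toy dimensions and random weights, just for illustration
num_inputs, num_hiddens = 4, 3
rng = np.random.RandomState(0)
layer = RecurrentLayer(rng.randn(num_inputs, num_hiddens).astype(np.float32),
                       np.zeros(num_hiddens, dtype=np.float32),
                       rng.randn(num_hiddens, num_hiddens).astype(np.float32),
                       np.tanh)
data = rng.randn(10, num_inputs).astype(np.float32)
# processing the sequence at once ...
out_full = layer.activate(data)
# ... matches processing it in chunks, as long as the state is kept in between
out_a = layer.activate(data[:5])               # reset=True (default): start from `init`
out_b = layer.activate(data[5:], reset=False)  # continue from the previous state
assert np.allclose(out_full, np.vstack([out_a, out_b]))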


class BidirectionalLayer(Layer):
"""
@@ -267,7 +294,7 @@ def __init__(self, weights, bias, recurrent_weights, activation_fn=tanh):
activation_fn=activation_fn)


class LSTMLayer(Layer):
class LSTMLayer(RecurrentLayer):
"""
Recurrent network layer with Long Short-Term Memory units.
@@ -283,66 +310,103 @@ class LSTMLayer(Layer):
Output gate.
activation_fn : numpy ufunc, optional
Activation function.
init : numpy array, shape (num_hiddens, ), optional
Initial state of the layer.
cell_init : numpy array, shape (num_hiddens, ), optional
Initial state of the cell.
"""

def __init__(self, input_gate, forget_gate, cell, output_gate,
activation_fn=tanh):
activation_fn=tanh, init=None, cell_init=None):
self.input_gate = input_gate
self.forget_gate = forget_gate
self.cell = cell
self.output_gate = output_gate
self.activation_fn = activation_fn

def activate(self, data):
# keep the state of the layer and cell
if init is None:
init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
self.init = init
self._prev = self.init
if cell_init is None:
cell_init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
self.cell_init = cell_init
self._state = self.cell_init

def activate(self, data, reset=True):
"""
Activate the LSTM layer.
Parameters
----------
data : numpy array, shape (num_frames, num_inputs)
Activate with this data.
reset : bool, optional
Reset the state of the layer before activating it.
Returns
-------
numpy array, shape (num_frames, num_hiddens)
Activations for this data.
"""
# init / reset previous time step
if reset:
self.reset()
# init arrays
size = len(data)
# output matrix for the whole sequence
out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
# output (of the previous time step)
out_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
# state (of the previous time step)
state_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
# process the input data
for i in range(size):
# cache input data
data_ = data[i]
# input gate:
# operate on current data, previous state and previous output
ig = self.input_gate.activate(data_, out_, state_)
# operate on current data, previous output and state
ig = self.input_gate.activate(data_, self._prev, self._state)
# forget gate:
# operate on current data, previous state and previous output
fg = self.forget_gate.activate(data_, out_, state_)
# operate on current data, previous output and state
fg = self.forget_gate.activate(data_, self._prev, self._state)
# cell:
# operate on current data and previous output
cell = self.cell.activate(data_, out_)
cell = self.cell.activate(data_, self._prev)
# internal state:
# weight the cell with the input gate
# and add the previous state weighted by the forget gate
state_ = cell * ig + state_ * fg
self._state = cell * ig + self._state * fg
# output gate:
# operate on current data, current state and previous output
og = self.output_gate.activate(data_, out_, state_)
# operate on current data, previous output and current state
og = self.output_gate.activate(data_, self._prev, self._state)
# output:
# apply activation function to state and weight by output gate
out_ = self.activation_fn(state_) * og
out[i] = out_
out[i] = self.activation_fn(self._state) * og
# set reference to current output
self._prev = out[i]
return out

def reset(self, init=None, cell_init=None):
"""
Reset the layer to its initial state.
Parameters
----------
init : numpy array, shape (num_hiddens,), optional
Reset the hidden units to this initial state.
cell_init : numpy array, shape (num_hiddens,), optional
Reset the cells to this initial state.
"""
# TODO: old models do not have the init attribute, thus create it
# remove this initialisation code after updating the models
if not hasattr(self, 'init'):
self.init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
if not hasattr(self, 'cell_init'):
self.cell_init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
# reset previous time step and state to initial value
self._prev = self.init if init is None else init
self._state = self.cell_init if cell_init is None else cell_init
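A hedged usage sketch of the new LSTM state handling; `lstm_layer` stands for an existing `LSTMLayer` instance (e.g. from a pre-trained model), and `chunk_1`, `chunk_2`, `chunk_3`, `h0` and `c0` are hypothetical data chunks and initial state vectors:

# stream processing: carry hidden and cell state across chunks
out_1 = lstm_layer.activate(chunk_1)               # reset=True (default): start from `init`/`cell_init`
out_2 = lstm_layer.activate(chunk_2, reset=False)  # continue from the previous state
# start a new sequence from custom initial states; reset() only sets the
# current state, so pass reset=False to keep it when activating
lstm_layer.reset(init=h0, cell_init=c0)
out_3 = lstm_layer.activate(chunk_3, reset=False)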


class GRUCell(object):
"""
@@ -411,7 +475,7 @@ def activate(self, data, prev, reset_gate):
return self.activation_fn(out)


class GRULayer(Layer):
class GRULayer(RecurrentLayer):
"""
Recurrent network layer with Gated Recurrent Units (GRU) as proposed in
[1]_.
@@ -424,7 +488,7 @@ class GRULayer(Layer):
Update gate.
cell : :class:`GRUCell`
GRU cell.
hid_init : numpy array, shape (num_hiddens,), optional
init : numpy array, shape (num_hiddens,), optional
Initial state of hidden units.
References
@@ -443,54 +507,77 @@ class GRULayer(Layer):
"""

def __init__(self, reset_gate, update_gate, cell, hid_init=None):
def __init__(self, reset_gate, update_gate, cell, init=None):
# init the gates
self.reset_gate = reset_gate
self.update_gate = update_gate
self.cell = cell
if hid_init is None:
hid_init = np.zeros(cell.bias.size, dtype=NN_DTYPE)
self.hid_init = hid_init
# keep the state of the layer
if init is None:
init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
self.init = init
self._prev = init

def activate(self, data):
def activate(self, data, reset=True):
"""
Activate the GRU layer.
Parameters
----------
data : numpy array, shape (num_frames, num_inputs)
Activate with this data.
reset : bool, optional
Reset the state of the layer before activating it.
Returns
-------
numpy array, shape (num_frames, num_hiddens)
Activations for this data.
"""
# init / reset previous time step
if reset:
self.reset()
# init arrays
size = len(data)
# output matrix for the whole sequence
out = np.zeros((size, self.update_gate.bias.size), dtype=NN_DTYPE)
# output (of the previous time step)
out_ = self.hid_init
out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
# process the input data
for i in range(size):
# cache input data
data_ = data[i]
# reset gate:
# operate on current data and previous output
rg = self.reset_gate.activate(data_, out_)
rg = self.reset_gate.activate(data_, self._prev)
# update gate:
# operate on current data and previous output
ug = self.update_gate.activate(data_, out_)
ug = self.update_gate.activate(data_, self._prev)
# cell (implemented as in [1]):
# operate on current data, previous output and reset gate
cell = self.cell.activate(data_, out_, rg)
# output (activation)
out_ = ug * cell + (1 - ug) * out_
out[i] = out_
cell = self.cell.activate(data_, self._prev, rg)
# output:
out[i] = ug * cell + (1 - ug) * self._prev
# set reference to current output
self._prev = out[i]
return out

def reset(self, init=None):
"""
Reset the layer to its initial state.
Parameters
----------
init : scalar or numpy array, shape (num_hiddens,), optional
Reset the hidden units to this initial state.
"""
# TODO: old models do not have the init attribute, thus create it
# remove this initialisation code after updating the models
if not hasattr(self, 'init'):
self.init = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
# reset previous time step to initial value
self._prev = self.init if init is None else init
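The same pattern applies to GRU layers; a hedged sketch assuming `gru_layer` is an existing `GRULayer` instance and `frames` / `h0` are hypothetical input data and an initial hidden state:

gru_layer.reset(init=h0)                       # start from a custom hidden state
out = gru_layer.activate(frames, reset=False)  # reset=False keeps that state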


def _kernel_margins(kernel_shape, margin_shift):
"""
