Refactor RNNs to allow initialisation of the previous output and states.
The previous output and state of recurrent layers are saved. This makes
the layers more flexible and also compatible with streaming mode.
Sebastian Böck committed Dec 4, 2016
1 parent 0445769 commit dafabcb
Showing 3 changed files with 52 additions and 37 deletions.
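The point of keeping `prev_output` (and, for LSTM layers, `prev_state`) on the layer object is that a sequence can now be fed to a layer in pieces, with the recurrence continuing where the previous call left off. A minimal sketch of what this enables, assuming `layer` is a `RecurrentLayer` instance built as in `madmom.ml.nn.layers` and `data` is a 2-dimensional numpy array (both placeholders for illustration, not part of this commit):

    import numpy as np

    # process the whole sequence in one go, starting from a fresh state
    layer.prev_output = None
    out_full = layer.activate(data)

    # process the same sequence in chunks; the saved prev_output carries the
    # recurrence across calls, so the result should match the single call
    layer.prev_output = None
    out_chunks = np.vstack([layer.activate(chunk)
                            for chunk in np.array_split(data, 4)])

    assert np.allclose(out_full, out_chunks)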
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -16,6 +16,7 @@ Bug fixes:
 API relevant changes:
 
 * Reorder `GRUCell` parameters, to be consistent with all other layers (#235)
+* Rename `GRULayer` parameters, to be consistent with all other layers (#235)
 
 Other changes:
 
86 changes: 50 additions & 36 deletions madmom/ml/nn/layers.py
@@ -101,9 +101,12 @@ class RecurrentLayer(FeedForwardLayer):
     """
 
-    def __init__(self, weights, bias, recurrent_weights, activation_fn):
+    def __init__(self, weights, bias, recurrent_weights, activation_fn,
+                 prev_output=None):
         super(RecurrentLayer, self).__init__(weights, bias, activation_fn)
         self.recurrent_weights = recurrent_weights
+        # keep the output of the previous time step
+        self.prev_output = prev_output
 
     def activate(self, data):
         """
@@ -120,18 +123,19 @@ def activate(self, data):
             Activations for this data.
         """
         # if we don't have recurrent weights, we don't have to loop
         if self.recurrent_weights is None:
             return super(RecurrentLayer, self).activate(data)
+        # init previous output if needed
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.bias.size, dtype=NN_DTYPE)
         # weight input and add bias
         out = np.dot(data, self.weights) + self.bias
         # loop through all time steps
         for i in range(len(data)):
             # add weighted previous step
-            if i >= 1:
-                out[i] += np.dot(out[i - 1], self.recurrent_weights)
+            out[i] += np.dot(self.prev_output, self.recurrent_weights)
             # apply activation function
             self.activation_fn(out[i], out=out[i])
+            # update previous output
+            self.prev_output = out[i]
         # return
         return out
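For reference, a minimal numpy sketch of the recurrence the rewritten loop computes, using toy shapes and tanh in place of the layer's activation function (both assumptions for illustration only); `h` plays the role of `self.prev_output` and starts from zeros, exactly as the initialisation above does on the first call:

    import numpy as np

    weights = np.random.randn(2, 3)             # input weights (num_in, num_hid)
    recurrent_weights = np.random.randn(3, 3)   # recurrent weights (num_hid, num_hid)
    bias = np.zeros(3)
    data = np.random.randn(5, 2)                # 5 time steps, 2 features

    h = np.zeros(3)                             # prev_output, zeros on the first call
    out = np.dot(data, weights) + bias          # weight input and add bias
    for i in range(len(data)):
        out[i] += np.dot(h, recurrent_weights)  # add weighted previous output
        out[i] = np.tanh(out[i])                # apply activation function
        h = out[i]                              # kept for the next step / next call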

@@ -287,12 +291,15 @@ class LSTMLayer(Layer):
"""

def __init__(self, input_gate, forget_gate, cell, output_gate,
activation_fn=tanh):
activation_fn=tanh, prev_output=None, prev_state=None):
self.input_gate = input_gate
self.forget_gate = forget_gate
self.cell = cell
self.output_gate = output_gate
self.activation_fn = activation_fn
# keep the output and state of the previous time step
self.prev_output = prev_output
self.prev_state = prev_state

def activate(self, data):
"""
@@ -309,38 +316,46 @@ def activate(self, data):
             Activations for this data.
         """
+        # init previous output
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
+        # init previous state
+        if not hasattr(self, 'prev_state') or self.prev_state is None:
+            self.prev_state = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # init arrays
         size = len(data)
         # output matrix for the whole sequence
         out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
-        # output (of the previous time step)
-        out_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
-        # state (of the previous time step)
-        state_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # process the input data
         for i in range(size):
             # cache input data
             data_ = data[i]
             # input gate:
-            # operate on current data, previous state and previous output
-            ig = self.input_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and state
+            ig = self.input_gate.activate(data_, self.prev_output,
+                                          self.prev_state)
             # forget gate:
-            # operate on current data, previous state and previous output
-            fg = self.forget_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and state
+            fg = self.forget_gate.activate(data_, self.prev_output,
+                                           self.prev_state)
             # cell:
             # operate on current data and previous output
-            cell = self.cell.activate(data_, out_)
+            cell = self.cell.activate(data_, self.prev_output)
             # internal state:
             # weight the cell with the input gate
             # and add the previous state weighted by the forget gate
-            state_ = cell * ig + state_ * fg
+            # Note: we overwrite self.prev_state, since we don't need to access
+            # the previous state any more
+            self.prev_state = cell * ig + self.prev_state * fg
             # output gate:
-            # operate on current data, current state and previous output
-            og = self.output_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and current state
+            og = self.output_gate.activate(data_, self.prev_output,
+                                           self.prev_state)
             # output:
             # apply activation function to state and weight by output gate
-            out_ = self.activation_fn(state_) * og
-            out[i] = out_
+            out[i] = self.activation_fn(self.prev_state) * og
+            # save current output for next time step
+            self.prev_output = out[i]
         return out
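Since the layer now keeps both `prev_output` and `prev_state`, an LSTM layer can likewise be driven chunk by chunk. A hypothetical usage sketch (the gate objects and the `chunk_a`/`chunk_b` arrays are placeholders, not part of this commit):

    lstm = LSTMLayer(input_gate, forget_gate, cell, output_gate)

    out_a = lstm.activate(chunk_a)   # starts from zero output and state
    out_b = lstm.activate(chunk_b)   # continues from the state left by chunk_a

    # to start a new, independent sequence, clear the saved output and state
    lstm.prev_output = None
    lstm.prev_state = None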


@@ -423,13 +438,13 @@ class GRULayer(Layer):
     update_gate : :class:`Gate`
         Update gate.
     cell : :class:`GRUCell`
-        GRU cell
+        GRU cell.
-    hid_init : numpy array, shape (num_hiddens,), optional
-        Initial state of hidden units.
 
     References
     ----------
-    .. [1] Kyunghyun Cho, Bart Van Merrienboer, Dzmitry Bahdanau, and Yoshua
+    .. [1] Kyunghyun Cho, Bart van Merriënboer, Dzmitry Bahdanau, and Yoshua
            Bengio,
            "On the properties of neural machine translation: Encoder-decoder
            approaches",
@@ -443,14 +458,12 @@ class GRULayer(Layer):
"""

def __init__(self, reset_gate, update_gate, cell, hid_init=None):
def __init__(self, reset_gate, update_gate, cell, prev_output=None):
# init the gates
self.reset_gate = reset_gate
self.update_gate = update_gate
self.cell = cell
if hid_init is None:
hid_init = np.zeros(cell.bias.size, dtype=NN_DTYPE)
self.hid_init = hid_init
self.prev_output = prev_output

def activate(self, data):
"""
@@ -470,25 +483,26 @@ def activate(self, data):
         # init arrays
         size = len(data)
         # output matrix for the whole sequence
-        out = np.zeros((size, self.update_gate.bias.size), dtype=NN_DTYPE)
-        # output (of the previous time step)
-        out_ = self.hid_init
+        out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
+        # init previous output if needed
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # process the input data
         for i in range(size):
             # cache input data
             data_ = data[i]
             # reset gate:
             # operate on current data and previous output
-            rg = self.reset_gate.activate(data_, out_)
+            rg = self.reset_gate.activate(data_, self.prev_output)
             # update gate:
             # operate on current data and previous output
-            ug = self.update_gate.activate(data_, out_)
+            ug = self.update_gate.activate(data_, self.prev_output)
             # cell (implemented as in [1]):
             # operate on current data, previous output and reset gate
-            cell = self.cell.activate(data_, out_, rg)
-            # output (activation)
-            out_ = ug * cell + (1 - ug) * out_
-            out[i] = out_
+            cell = self.cell.activate(data_, self.prev_output, rg)
+            # output:
+            out[i] = ug * cell + (1 - ug) * self.prev_output
+            self.prev_output = out[i]
         return out
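Together with the CHANGES.rst entry above, this is the renamed `GRULayer` parameter: the former `hid_init` keyword is now spelled `prev_output`. A hypothetical before/after sketch (the gate/cell objects and `initial_state` are placeholders):

    # before this commit
    gru = GRULayer(reset_gate, update_gate, gru_cell, hid_init=initial_state)

    # after this commit
    gru = GRULayer(reset_gate, update_gate, gru_cell, prev_output=initial_state)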


2 changes: 1 addition & 1 deletion tests/test_ml_nn.py
@@ -113,7 +113,7 @@ def setUp(self):
         self.gru_1 = layers.GRULayer(self.reset_gate, self.update_gate,
                                      self.gru_cell)
         self.gru_2 = layers.GRULayer(self.reset_gate, self.update_gate,
-                                     self.gru_cell, hid_init=TestGRUClass.H)
+                                     self.gru_cell, prev_output=TestGRUClass.H)
 
     def test_process(self):
         self.assertTrue(