Refactor RNNs to allow initialisation of the previous output and states.
The previous output and state of recurrent layers are saved. This makes
the layers more flexible and also compatible with streaming mode.
Sebastian Böck committed Dec 4, 2016
1 parent 0445769 commit dafabcb
Showing 3 changed files with 52 additions and 37 deletions.
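The point of keeping `prev_output` (and, for LSTM layers, `prev_state`) on the layer object is that a sequence can now be fed to a layer in pieces, with the recurrence continuing where the previous call left off. A minimal sketch of what this enables, assuming `layer` is a `RecurrentLayer` instance built as in `madmom.ml.nn.layers` and `data` is a 2-dimensional numpy array (both placeholders for illustration, not part of this commit):

    import numpy as np

    # process the whole sequence in one go, starting from a fresh state
    layer.prev_output = None
    out_full = layer.activate(data)

    # process the same sequence in chunks; the saved prev_output carries the
    # recurrence across calls, so the result should match the single call
    layer.prev_output = None
    out_chunks = np.vstack([layer.activate(chunk)
                            for chunk in np.array_split(data, 4)])

    assert np.allclose(out_full, out_chunks)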
1 change: 1 addition & 0 deletions CHANGES.rst
@@ -16,6 +16,7 @@ Bug fixes:
 API relevant changes:
 
 * Reorder `GRUCell` parameters, to be consistent with all other layers (#235)
+* Rename `GRULayer` parameters, to be consistent with all other layers (#235)
 
 Other changes:
 
86 changes: 50 additions & 36 deletions madmom/ml/nn/layers.py
@@ -101,9 +101,12 @@ class RecurrentLayer(FeedForwardLayer):
     """
 
-    def __init__(self, weights, bias, recurrent_weights, activation_fn):
+    def __init__(self, weights, bias, recurrent_weights, activation_fn,
+                 prev_output=None):
         super(RecurrentLayer, self).__init__(weights, bias, activation_fn)
         self.recurrent_weights = recurrent_weights
+        # keep the output of the previous time step
+        self.prev_output = prev_output
 
     def activate(self, data):
         """
@@ -120,18 +123,19 @@ def activate(self, data):
             Activations for this data.
         """
         # if we don't have recurrent weights, we don't have to loop
         if self.recurrent_weights is None:
             return super(RecurrentLayer, self).activate(data)
+        # init previous output if needed
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.bias.size, dtype=NN_DTYPE)
         # weight input and add bias
         out = np.dot(data, self.weights) + self.bias
         # loop through all time steps
         for i in range(len(data)):
             # add weighted previous step
-            if i >= 1:
-                out[i] += np.dot(out[i - 1], self.recurrent_weights)
+            out[i] += np.dot(self.prev_output, self.recurrent_weights)
             # apply activation function
             self.activation_fn(out[i], out=out[i])
+            # update previous output
+            self.prev_output = out[i]
         # return
         return out
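For reference, a minimal numpy sketch of the recurrence the rewritten loop computes, using toy shapes and tanh in place of the layer's activation function (both assumptions for illustration only); `h` plays the role of `self.prev_output` and starts from zeros, exactly as the initialisation above does on the first call:

    import numpy as np

    weights = np.random.randn(2, 3)             # input weights (num_in, num_hid)
    recurrent_weights = np.random.randn(3, 3)   # recurrent weights (num_hid, num_hid)
    bias = np.zeros(3)
    data = np.random.randn(5, 2)                # 5 time steps, 2 features

    h = np.zeros(3)                             # prev_output, zeros on the first call
    out = np.dot(data, weights) + bias          # weight input and add bias
    for i in range(len(data)):
        out[i] += np.dot(h, recurrent_weights)  # add weighted previous output
        out[i] = np.tanh(out[i])                # apply activation function
        h = out[i]                              # kept for the next step / next call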

@@ -287,12 +291,15 @@ class LSTMLayer(Layer):
"""

def __init__(self, input_gate, forget_gate, cell, output_gate,
activation_fn=tanh):
activation_fn=tanh, prev_output=None, prev_state=None):
self.input_gate = input_gate
self.forget_gate = forget_gate
self.cell = cell
self.output_gate = output_gate
self.activation_fn = activation_fn
# keep the output and state of the previous time step
self.prev_output = prev_output
self.prev_state = prev_state

def activate(self, data):
"""
@@ -309,38 +316,46 @@ def activate(self, data):
             Activations for this data.
         """
+        # init previous output
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
+        # init previous state
+        if not hasattr(self, 'prev_state') or self.prev_state is None:
+            self.prev_state = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # init arrays
         size = len(data)
         # output matrix for the whole sequence
         out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
-        # output (of the previous time step)
-        out_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
-        # state (of the previous time step)
-        state_ = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # process the input data
         for i in range(size):
             # cache input data
             data_ = data[i]
             # input gate:
-            # operate on current data, previous state and previous output
-            ig = self.input_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and state
+            ig = self.input_gate.activate(data_, self.prev_output,
+                                          self.prev_state)
             # forget gate:
-            # operate on current data, previous state and previous output
-            fg = self.forget_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and state
+            fg = self.forget_gate.activate(data_, self.prev_output,
+                                           self.prev_state)
             # cell:
             # operate on current data and previous output
-            cell = self.cell.activate(data_, out_)
+            cell = self.cell.activate(data_, self.prev_output)
             # internal state:
             # weight the cell with the input gate
             # and add the previous state weighted by the forget gate
-            state_ = cell * ig + state_ * fg
+            # Note: we overwrite self.prev_state, since we don't need to access
+            # the previous state any more
+            self.prev_state = cell * ig + self.prev_state * fg
             # output gate:
-            # operate on current data, current state and previous output
-            og = self.output_gate.activate(data_, out_, state_)
+            # operate on current data, previous output and current state
+            og = self.output_gate.activate(data_, self.prev_output,
+                                           self.prev_state)
             # output:
             # apply activation function to state and weight by output gate
-            out_ = self.activation_fn(state_) * og
-            out[i] = out_
+            out[i] = self.activation_fn(self.prev_state) * og
+            # save current output for next time step
+            self.prev_output = out[i]
         return out
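Since the layer now keeps both `prev_output` and `prev_state`, an LSTM layer can likewise be driven chunk by chunk. A hypothetical usage sketch (the gate objects and the `chunk_a`/`chunk_b` arrays are placeholders, not part of this commit):

    lstm = LSTMLayer(input_gate, forget_gate, cell, output_gate)

    out_a = lstm.activate(chunk_a)   # starts from zero output and state
    out_b = lstm.activate(chunk_b)   # continues from the state left by chunk_a

    # to start a new, independent sequence, clear the saved output and state
    lstm.prev_output = None
    lstm.prev_state = None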


@@ -423,13 +438,13 @@ class GRULayer(Layer):
     update_gate : :class:`Gate`
         Update gate.
     cell : :class:`GRUCell`
-        GRU cell
+        GRU cell.
-    hid_init : numpy array, shape (num_hiddens,), optional
-        Initial state of hidden units.
 
     References
     ----------
-    .. [1] Kyunghyun Cho, Bart Van Merrienboer, Dzmitry Bahdanau, and Yoshua
+    .. [1] Kyunghyun Cho, Bart van Merriënboer, Dzmitry Bahdanau, and Yoshua
            Bengio,
            "On the properties of neural machine translation: Encoder-decoder
            approaches",
@@ -443,14 +458,12 @@ class GRULayer(Layer):
"""

def __init__(self, reset_gate, update_gate, cell, hid_init=None):
def __init__(self, reset_gate, update_gate, cell, prev_output=None):
# init the gates
self.reset_gate = reset_gate
self.update_gate = update_gate
self.cell = cell
if hid_init is None:
hid_init = np.zeros(cell.bias.size, dtype=NN_DTYPE)
self.hid_init = hid_init
self.prev_output = prev_output

def activate(self, data):
"""
@@ -470,25 +483,26 @@ def activate(self, data):
         # init arrays
         size = len(data)
         # output matrix for the whole sequence
-        out = np.zeros((size, self.update_gate.bias.size), dtype=NN_DTYPE)
-        # output (of the previous time step)
-        out_ = self.hid_init
+        out = np.zeros((size, self.cell.bias.size), dtype=NN_DTYPE)
+        # init previous output if needed
+        if not hasattr(self, 'prev_output') or self.prev_output is None:
+            self.prev_output = np.zeros(self.cell.bias.size, dtype=NN_DTYPE)
         # process the input data
         for i in range(size):
             # cache input data
             data_ = data[i]
             # reset gate:
             # operate on current data and previous output
-            rg = self.reset_gate.activate(data_, out_)
+            rg = self.reset_gate.activate(data_, self.prev_output)
             # update gate:
             # operate on current data and previous output
-            ug = self.update_gate.activate(data_, out_)
+            ug = self.update_gate.activate(data_, self.prev_output)
             # cell (implemented as in [1]):
             # operate on current data, previous output and reset gate
-            cell = self.cell.activate(data_, out_, rg)
-            # output (activation)
-            out_ = ug * cell + (1 - ug) * out_
-            out[i] = out_
+            cell = self.cell.activate(data_, self.prev_output, rg)
+            # output:
+            out[i] = ug * cell + (1 - ug) * self.prev_output
+            self.prev_output = out[i]
         return out
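Together with the CHANGES.rst entry above, this is the renamed `GRULayer` parameter: the former `hid_init` keyword is now spelled `prev_output`. A hypothetical before/after sketch (the gate/cell objects and `initial_state` are placeholders):

    # before this commit
    gru = GRULayer(reset_gate, update_gate, gru_cell, hid_init=initial_state)

    # after this commit
    gru = GRULayer(reset_gate, update_gate, gru_cell, prev_output=initial_state)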


2 changes: 1 addition & 1 deletion tests/test_ml_nn.py
@@ -113,7 +113,7 @@ def setUp(self):
         self.gru_1 = layers.GRULayer(self.reset_gate, self.update_gate,
                                      self.gru_cell)
         self.gru_2 = layers.GRULayer(self.reset_gate, self.update_gate,
-                                     self.gru_cell, hid_init=TestGRUClass.H)
+                                     self.gru_cell, prev_output=TestGRUClass.H)
 
     def test_process(self):
         self.assertTrue(