This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Merge pull request #456 from lukaszkaiser/push
1.3.2 (small changes for colab)
lukaszkaiser authored Dec 4, 2017
2 parents 970dac9 + f2fb96b commit d9f807c
Showing 5 changed files with 543 additions and 405 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

 setup(
     name='tensor2tensor',
-    version='1.3.1',
+    version='1.3.2',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
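
For reference, a quick way to confirm which release ends up installed once this bump is packaged (a minimal sketch; it assumes the package was installed from PyPI, e.g. with pip install tensor2tensor==1.3.2):

# Prints the installed tensor2tensor version; "1.3.2" is expected after this bump.
import pkg_resources

print(pkg_resources.get_distribution("tensor2tensor").version)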
8 changes: 8 additions & 0 deletions tensor2tensor/data_generators/translate_enfr.py
@@ -143,6 +143,14 @@ def use_small_dataset(self):
     return False


+@registry.register_problem
+class TranslateEnfrWmt32kPacked(TranslateEnfrWmt32k):
+
+  @property
+  def packed_length(self):
+    return 256
+
+
 @registry.register_problem
 class TranslateEnfrWmtSmallCharacters(translate.TranslateProblem):
   """Problem spec for WMT En-Fr translation."""
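
This registers a packed variant of the En-Fr WMT 32k problem with packed_length 256. A minimal sketch of looking it up through the registry; the snake_case name below assumes the registry's usual CamelCase-to-snake_case naming, and importing the module is what triggers registration:

# Importing the data generator module registers its problems with the registry.
from tensor2tensor.data_generators import translate_enfr  # noqa: F401
from tensor2tensor.utils import registry

problem = registry.problem("translate_enfr_wmt32k_packed")
print(problem.packed_length)  # 256, per the property added above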
19 changes: 15 additions & 4 deletions tensor2tensor/layers/common_attention.py
@@ -1182,7 +1182,8 @@ def dot_product_attention(q,
                           dropout_rate=0.0,
                           image_shapes=None,
                           name=None,
-                          make_image_summary=True):
+                          make_image_summary=True,
+                          save_weights_to=None):
   """dot-product attention.

   Args:
@@ -1195,17 +1196,22 @@
       see comments for attention_image_summary()
     name: an optional string
     make_image_summary: True if you want an image summary.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).

   Returns:
     A Tensor.
   """
   with tf.variable_scope(
-      name, default_name="dot_product_attention", values=[q, k, v]):
+      name, default_name="dot_product_attention", values=[q, k, v]) as scope:
     # [batch, num_heads, query_length, memory_length]
     logits = tf.matmul(q, k, transpose_b=True)
     if bias is not None:
       logits += bias
     weights = tf.nn.softmax(logits, name="attention_weights")
+    if save_weights_to is not None:
+      save_weights_to[scope.name] = weights
     # dropping out the attention links for each of the heads
     weights = tf.nn.dropout(weights, 1.0 - dropout_rate)
     if (not tf.get_variable_scope().reuse and
@@ -2245,6 +2251,7 @@ def multihead_attention(query_antecedent,
                         gap_size=0,
                         num_memory_blocks=2,
                         name=None,
+                        save_weights_to=None,
                         **kwargs):
   """Multihead scaled-dot-product attention with input/output transformations.

@@ -2284,7 +2291,10 @@
       memory blocks.
     num_memory_blocks: Integer option to indicate how many memory blocks to look
       at.
-    name: an optional string
+    name: an optional string.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).
     **kwargs (dict): Parameters for the attention function

   Caching:
@@ -2345,7 +2355,8 @@
       if isinstance(x, tuple):
         x, additional_returned_value = x  # Unpack
     elif attention_type == "dot_product":
-      x = dot_product_attention(q, k, v, bias, dropout_rate, image_shapes)
+      x = dot_product_attention(q, k, v, bias, dropout_rate, image_shapes,
+                                save_weights_to=save_weights_to)
     elif attention_type == "dot_product_relative":
       x = dot_product_attention_relative(q, k, v, bias, max_relative_position,
                                          dropout_rate, image_shapes)
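
Net effect in common_attention.py: dot_product_attention now records its post-softmax weights into a caller-supplied dict, keyed by the enclosing variable scope, and multihead_attention threads save_weights_to through to that call. A minimal TF1-style sketch of the low-level hook, using only the signature shown above; shapes are illustrative:

# Hedged sketch: capture attention weights from a single dot-product attention call.
import tensorflow as tf
from tensor2tensor.layers import common_attention

batch, heads, length, depth = 2, 4, 8, 16
q = tf.random_normal([batch, heads, length, depth])
k = tf.random_normal([batch, heads, length, depth])
v = tf.random_normal([batch, heads, length, depth])

attention_weights = {}  # filled in at graph-construction time, keyed by scope name
out = common_attention.dot_product_attention(
    q, k, v, bias=None, make_image_summary=False,
    save_weights_to=attention_weights)

print(list(attention_weights.keys()))  # e.g. ['dot_product_attention']
with tf.Session() as sess:
  for name, w in sess.run(attention_weights).items():
    # Each value has shape [batch, num_heads, query_length, memory_length].
    print(name, w.shape)  # e.g. dot_product_attention (2, 4, 8, 8)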
27 changes: 22 additions & 5 deletions tensor2tensor/models/transformer.py
@@ -45,6 +45,10 @@
 class Transformer(t2t_model.T2TModel):
   """Attention net. See file docstring."""

+  def __init__(self, *args, **kwargs):
+    super(Transformer, self).__init__(*args, **kwargs)
+    self.attention_weights = dict()  # For visualizing attention heads.
+
   def encode(self, inputs, target_space, hparams, features=None):
     """Encode transformer inputs.

@@ -73,7 +77,8 @@ def encode(self, inputs, target_space, hparams, features=None):

     encoder_output = transformer_encoder(
         encoder_input, self_attention_bias,
-        hparams, nonpadding=_features_to_nonpadding(features, "inputs"))
+        hparams, nonpadding=_features_to_nonpadding(features, "inputs"),
+        save_weights_to=self.attention_weights)

     return encoder_output, encoder_decoder_attention_bias

@@ -114,7 +119,8 @@ def decode(self,
         encoder_decoder_attention_bias,
         hparams,
         cache=cache,
-        nonpadding=nonpadding)
+        nonpadding=nonpadding,
+        save_weights_to=self.attention_weights)

     if hparams.use_tpu and hparams.mode == tf.estimator.ModeKeys.TRAIN:
       # TPU does not react kindly to extra dimensions.
@@ -507,7 +513,8 @@ def transformer_encoder(encoder_input,
                         encoder_self_attention_bias,
                         hparams,
                         name="encoder",
-                        nonpadding=None):
+                        nonpadding=None,
+                        save_weights_to=None):
   """A stack of transformer layers.

   Args:
@@ -522,6 +529,9 @@
       encoder_self_attention_bias. The knowledge about padding is used
       for pad_remover(efficiency) and to mask out padding in convolutional
       layers.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).

   Returns:
     y: a Tensor
@@ -551,6 +561,7 @@
               hparams.num_heads,
               hparams.attention_dropout,
               attention_type=hparams.self_attention_type,
+              save_weights_to=save_weights_to,
               max_relative_position=hparams.max_relative_position)
           x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
@@ -571,7 +582,8 @@ def transformer_decoder(decoder_input,
                         hparams,
                         cache=None,
                         name="decoder",
-                        nonpadding=None):
+                        nonpadding=None,
+                        save_weights_to=None):
   """A stack of transformer layers.

   Args:
@@ -590,6 +602,9 @@
       to mask out padding in convolutional layers. We generally only
       need this mask for "packed" datasets, because for ordinary datasets,
       no padding is ever followed by nonpadding.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).

   Returns:
     y: a Tensor
@@ -612,6 +627,7 @@
               hparams.num_heads,
               hparams.attention_dropout,
               attention_type=hparams.self_attention_type,
+              save_weights_to=save_weights_to,
               max_relative_position=hparams.max_relative_position,
               cache=layer_cache)
           x = common_layers.layer_postprocess(x, y, hparams)
@@ -624,7 +640,8 @@
                 hparams.attention_key_channels or hparams.hidden_size,
                 hparams.attention_value_channels or hparams.hidden_size,
                 hparams.hidden_size, hparams.num_heads,
-                hparams.attention_dropout)
+                hparams.attention_dropout,
+                save_weights_to=save_weights_to)
             x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
           y = transformer_ffn_layer(
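
In transformer.py, every attention call in the encoder and decoder now shares the model-level self.attention_weights dict, so after graph construction the dict maps full variable-scope names to the corresponding softmax weights; this is presumably what the colab mentioned in the commit message reads. A self-contained sketch of the resulting naming scheme, emulating the encoder's scoping with multihead_attention directly (scope names and shapes here are illustrative, not the model's exact ones):

# Hedged sketch: one shared dict collects weights from several attention calls,
# keyed by each call's full variable-scope name.
import tensorflow as tf
from tensor2tensor.layers import common_attention

attention_weights = {}  # plays the role of Transformer.attention_weights
x = tf.random_normal([2, 8, 64])  # [batch, length, hidden_size]

with tf.variable_scope("encoder"):
  for layer in range(2):
    with tf.variable_scope("layer_%d" % layer):
      with tf.variable_scope("self_attention"):
        x = common_attention.multihead_attention(
            query_antecedent=x,
            memory_antecedent=None,  # None means self-attention
            bias=None,
            total_key_depth=64,
            total_value_depth=64,
            output_depth=64,
            num_heads=4,
            dropout_rate=0.0,
            save_weights_to=attention_weights)

# Keys look like "encoder/layer_0/self_attention/multihead_attention/dot_product_attention";
# each value is a [batch, num_heads, query_length, memory_length] tensor.
for key, weights in sorted(attention_weights.items()):
  print(key, weights.shape)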
(The fifth changed file, which accounts for the remaining additions and deletions, is not shown here.)
