diff --git a/.gitignore b/.gitignore index e610f29ba..dd84837dd 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,6 @@ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info -# PyPI distribution artificats +# PyPI distribution artifacts build/ dist/ diff --git a/setup.py b/setup.py index 5b2d423f8..fbb81470e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.0.7', + version='1.0.8', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/make_tf_configs.py b/tensor2tensor/bin/make_tf_configs.py index 791589939..8b9367ca6 100644 --- a/tensor2tensor/bin/make_tf_configs.py +++ b/tensor2tensor/bin/make_tf_configs.py @@ -55,7 +55,7 @@ def main(_): for idx, job in enumerate(jobs): if task_type == "worker": cmd_line_flags = " ".join([ - "--master=%s" % job, + "--master=grpc://%s" % job, "--ps_replicas=%d" % len(ps), "--worker_replicas=%d" % len(workers), "--worker_gpu=1", @@ -66,6 +66,7 @@ ]) else: cmd_line_flags = " ".join([ + "--master=grpc://%s" % job, "--schedule=run_std_server", ]) diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index 452fc637a..f5c954036 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -570,16 +570,16 @@ def calculus_integrate(alphabet_size=26, functions = {"log": "L"} alg_cfg = math_dataset_init(alphabet_size, digits=5, functions=functions) - nbr_case=0 + nbr_case = 0 while nbr_case < nbr_cases: try: sample, target = generate_calculus_integrate_sample( - alg_cfg.vlist, - list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) + alg_cfg.vlist, + list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) yield { - "inputs": alg_cfg.int_encoder(sample), - "targets": alg_cfg.int_encoder(target) + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) } - except: + except: # pylint:disable=bare-except continue - nbr_case = nbr_case + 1 + nbr_case += 1 diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py old mode 100755 new mode 100644 index c50d19afa..fb85d99c3 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -27,7 +27,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin -import six.moves.urllib_request as urllib # Imports urllib on Python2, urllib.request on Python3 +import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3 from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder from tensor2tensor.data_generators.tokenizer import Tokenizer diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py old mode 100755 new mode 100644 index c525e4ec0..e88a90983 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,6 +28,7 @@ import tensorflow as tf + # End-of-sentence marker (should correspond to the position of EOS in the # RESERVED_TOKENS list in text_encoder.py) EOS = 1 @@ -44,6 +45,7 @@ def character_generator(source_path, target_path, character_vocab, eos=None): Args: source_path: path to the file with source sentences. target_path: path to the file with target sentences. + character_vocab: a TextEncoder to encode the characters.
eos: integer to append at the end of each sequence (default: None). Yields: diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py old mode 100755 new mode 100644 index 0366fdfb0..b6af3cf93 --- a/tensor2tensor/data_generators/wmt_test.py +++ b/tensor2tensor/data_generators/wmt_test.py @@ -25,8 +25,8 @@ # Dependency imports import six -from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import wmt import tensorflow as tf @@ -40,7 +40,7 @@ def testCharacterGenerator(self): if six.PY2: enc_f = lambda s: s else: - enc_f = lambda s: s.encode('utf-8') + enc_f = lambda s: s.encode("utf-8") with io.open(tmp_file_path + ".src", "wb") as src_file: src_file.write(enc_f("source1\n")) src_file.write(enc_f("source2\n")) @@ -51,16 +51,15 @@ def testCharacterGenerator(self): # Call character generator on the generated files. results_src, results_tgt = [], [] character_vocab = text_encoder.ByteTextEncoder() - for dictionary in wmt.character_generator(tmp_file_path + ".src", - tmp_file_path + ".tgt", - character_vocab): + for dictionary in wmt.character_generator( + tmp_file_path + ".src", tmp_file_path + ".tgt", character_vocab): self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) results_src.append(dictionary["inputs"]) results_tgt.append(dictionary["targets"]) # Check that the results match the files. # First check that the results match the encoded original strings; - # this is a comparison of integer arrays + # this is a comparison of integer arrays. self.assertEqual(len(results_src), 2) self.assertEqual(results_src[0], character_vocab.encode("source1")) diff --git a/tensor2tensor/docs/distributed_training.md b/tensor2tensor/docs/distributed_training.md index be3726f06..e7ddd7294 100644 --- a/tensor2tensor/docs/distributed_training.md +++ b/tensor2tensor/docs/distributed_training.md @@ -35,7 +35,7 @@ os.environ['TF_CONFIG'] = json.dumps({ The following T2T command-line flags must also be set on the workers for distributed training: -- `--master=$ADDRESS` +- `--master=grpc://$ADDRESS` - `--worker_replicas=$NUM_WORKERS` - `--worker_gpu=$NUM_GPUS_PER_WORKER` - `--worker_id=$WORKER_ID` @@ -55,6 +55,17 @@ Parameter servers only need `--schedule=run_std_server`. generates the `TF_CONFIG` json strings and the above-mentioned command-line flags for the workers and parameter servers. +Given a set of worker and parameter server addresses, the script outputs, for +each job, a line with the `TF_CONFIG` environment variable and the command-line +flags necessary for distributed training. For each job, you should invoke the +`t2t-trainer` with the `TF_CONFIG` value and flags that are output. + +For example: + +``` +TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ... +``` + ## Command-line flags for eval jobs Eval jobs should set the following flags and do not need the `TF_CONFIG` diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 30b871640..99fbd8232 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -24,8 +24,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,13 +41,9 @@ class AttentionLM(t2t_model.T2TModel): """Attention net. 
See file docstring.""" - def model_fn_body(self, features, train): + def model_fn_body(self, features): # Remove dropout if not training - hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. + hparams = self._hparams targets = features["targets"] targets = tf.squeeze(targets, 2) @@ -162,8 +156,10 @@ def attention_lm_base(): hparams.add_hparam("num_heads", 8) hparams.add_hparam("attention_key_channels", 0) hparams.add_hparam("attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 9cd0547f7..b4d27d400 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -24,8 +24,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,13 +41,9 @@ class AttentionLmMoe(t2t_model.T2TModel): """Attention net. See file docstring.""" - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): # Remove dropout if not training - hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. + hparams = self._hparams dp = self._data_parallelism targets = sharded_features["targets"] targets = dp(tf.squeeze, targets, 2) @@ -81,7 +75,9 @@ def residual_fn(x, y): with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, train, hparams.hidden_size, + dp, self._ps_devices, x, + hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss @@ -162,10 +158,12 @@ def attention_lm_moe_base(): hparams.add_hparam("num_heads", 8) hparams.add_hparam("attention_key_channels", 0) hparams.add_hparam("attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none return hparams diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index bb7119a15..bbcf392aa 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_module(x, hparams, train, n, sep): +def residual_module(x, hparams, n, sep): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((1, 1), k) for _ in xrange(n)] @@ -43,56 +43,55 @@ def residual_module(x, hparams, train, n, sep): separability=sep, name="block") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) -def residual_module1(x, hparams, train): - return residual_module(x, hparams, train, 1, 1) +def residual_module1(x, hparams): + return residual_module(x, hparams, 1, 1) -def residual_module1_sep(x, hparams, train): - return residual_module(x, hparams, train, 1, 0) +def residual_module1_sep(x, hparams): + return residual_module(x, hparams, 1, 0) -def residual_module2(x, hparams, train): - return residual_module(x, hparams, train, 2, 1) +def residual_module2(x, hparams): + return residual_module(x, hparams, 2, 1) -def residual_module2_sep(x, hparams, train): - return residual_module(x, hparams, train, 2, 0) +def residual_module2_sep(x, hparams): + return residual_module(x, hparams, 2, 0) -def residual_module3(x, hparams, train): - return residual_module(x, hparams, train, 3, 1) +def residual_module3(x, hparams): + return residual_module(x, hparams, 3, 1) -def residual_module3_sep(x, hparams, train): - return residual_module(x, hparams, train, 3, 0) +def residual_module3_sep(x, hparams): + return residual_module(x, hparams, 3, 0) -def norm_module(x, hparams, train): - del train # Unused. +def norm_module(x, hparams): return common_layers.layer_norm(x, hparams.hidden_size, name="norm_module") -def identity_module(x, hparams, train): - del hparams, train # Unused. +def identity_module(x, hparams): + del hparams # Unused. 
return x -def run_modules(blocks, cur, hparams, train, dp): +def run_modules(blocks, cur, hparams, dp): """Run blocks in parallel using dp as data_parallelism.""" assert len(blocks) % dp.n == 0 res = [] for i in xrange(len(blocks) // dp.n): - res.extend(dp(blocks[i * dp.n:(i + 1) * dp.n], cur, hparams, train)) + res.extend(dp(blocks[i * dp.n:(i + 1) * dp.n], cur, hparams)) return res @registry.register_model class BlueNet(t2t_model.T2TModel): - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): dp = self._data_parallelism dp._reuse = False # pylint:disable=protected-access hparams = self._hparams @@ -106,7 +105,7 @@ def model_fn_body_sharded(self, sharded_features, train): cur_shape = cur.get_shape() for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): - processed = run_modules(blocks, cur, hparams, train, dp) + processed = run_modules(blocks, cur, hparams, dp) cur = common_layers.shakeshake(processed) cur.set_shape(cur_shape) diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index 70996ab02..a325e5a55 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -42,8 +42,9 @@ def testBlueNet(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = bluenet.BlueNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = bluenet.BlueNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 42db05700..1a82144d6 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_dilated_conv(x, repeat, padding, name, hparams, train): +def residual_dilated_conv(x, repeat, padding, name, hparams): """A stack of convolution blocks with residual connections.""" with tf.variable_scope(name): k = (hparams.kernel_height, hparams.kernel_width) @@ -45,11 +45,11 @@ def residual_dilated_conv(x, repeat, padding, name, hparams, train): padding=padding, name="residual_conv") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - x = tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + x = tf.nn.dropout(x, 1.0 - hparams.dropout) return x -def bytenet_internal(inputs, targets, hparams, train): +def bytenet_internal(inputs, targets, hparams): """ByteNet, main step used for training.""" with tf.variable_scope("bytenet"): # Flatten inputs and extend length by 50%.
@@ -63,7 +63,7 @@ def bytenet_internal(inputs, targets, hparams, train): inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=50) final_encoder = residual_dilated_conv( - inputs, hparams.num_block_repeat, "SAME", "encoder", hparams, train) + inputs, hparams.num_block_repeat, "SAME", "encoder", hparams) shifted_targets = common_layers.shift_left(targets) kernel = (hparams.kernel_height, hparams.kernel_width) @@ -74,15 +74,15 @@ def bytenet_internal(inputs, targets, hparams, train): return residual_dilated_conv( decoder_start, hparams.num_block_repeat, - "LEFT", "decoder", hparams, train) + "LEFT", "decoder", hparams) @registry.register_model class ByteNet(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): return bytenet_internal(features["inputs"], features["targets"], - self._hparams, train) + self._hparams) @registry.register_hparams diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 676220cc8..8202d5b74 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -42,8 +42,9 @@ def testByteNet(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = bytenet.ByteNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = bytenet.ByteNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index 689f407f5..41ca6f4b0 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -45,6 +45,8 @@ def basic_params1(): kernel_width=1, hidden_size=64, compress_steps=0, + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. dropout=0.2, clip_grad_norm=2.0, initializer="orthogonal", diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index f9d63a464..078fcc5a3 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1211,7 +1211,6 @@ def conv_lstm(x, def diagonal_conv_gru(x, kernel_size, filters, - train, dropout=0.0, name=None, reuse=None): @@ -1234,8 +1233,7 @@ def do_conv(args, name, bias_start): gate, gate_cost = hard_sigmoid(do_conv(x, "gate", 0.7)) candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0)) - # Dropout if training. - if dropout > 0.0 and train: + if dropout > 0.0: candidate = tf.nn.dropout(candidate, 1.0 - dropout) # Diagonal shift. 
diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index 78f79eed0..992c42db4 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -67,6 +67,7 @@ def lstm_seq2seq_internal(inputs, targets, hparams, train): @registry.register_model("baseline_lstm_seq2seq") class LSTMSeq2Seq(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): + train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN return lstm_seq2seq_internal(features["inputs"], features["targets"], self._hparams, train) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 33347cb84..e5bdb184b 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -43,8 +43,9 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2Seq(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = lstm.LSTMSeq2Seq( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 7247b791e..66a8491f2 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -70,7 +70,8 @@ def add_and_normalize(x, y): @registry.register_model class MultiModel(t2t_model.T2TModel): - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): + train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN dp = self._data_parallelism hparams = self._hparams targets = sharded_features["targets"] @@ -86,7 +87,7 @@ def encode_half(inputs, inputs_mask, hparams): inputs = common_layers.add_timing_signal(inputs) return slicenet.multi_conv_res(inputs, "SAME", "encoder1", hparams.num_hidden_layers // 2, - hparams, train, mask=inputs_mask) + hparams, mask=inputs_mask) target_space_emb = dp(slicenet.embed_target_space, sharded_features["target_space_id"], @@ -101,7 +102,7 @@ def encode_half(inputs, inputs_mask, hparams): expert_loss *= hparams.moe_loss_coef inputs_encoded = dp( slicenet.multi_conv_res, inputs_encoded, "SAME", - "encoder2", hparams.num_hidden_layers, hparams, train, + "encoder2", hparams.num_hidden_layers, hparams, mask=inputs_mask) # If we're just predicing a class, there is no use for a decoder, return. @@ -112,7 +113,7 @@ def encode_half(inputs, inputs_mask, hparams): # Do the middle part. decoder_start, similarity_loss = dp( slicenet.slicenet_middle, inputs_encoded, targets, - target_space_emb, inputs_mask, hparams, train) + target_space_emb, inputs_mask, hparams) # Decode. 
decoder_half = dp( @@ -137,7 +138,6 @@ def encode_half(inputs, inputs_mask, hparams): "decoder2", hparams.num_hidden_layers // 2, hparams, - train, mask=inputs_mask, source=inputs_encoded) diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 8df682c5c..72fe4a326 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -43,8 +43,9 @@ def testMultiModel(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = multimodel.MultiModel(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = multimodel.MultiModel( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index dbae77f43..dce0dbc30 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -30,12 +30,11 @@ import tensorflow as tf -def neural_gpu(inputs, hparams, train, name=None): +def neural_gpu(inputs, hparams, name=None): """The core Neural GPU.""" with tf.variable_scope(name, "neural_gpu"): - def step(state, inp): # pylint: disable=missing-docstring - x = tf.nn.dropout(state, 1.0 - hparams.dropout * tf.to_float(train)) + x = tf.nn.dropout(state, 1.0 - hparams.dropout) for layer in xrange(hparams.num_hidden_layers): x = common_layers.conv_gru( x, (hparams.kernel_height, hparams.kernel_width), @@ -57,11 +56,11 @@ def step(state, inp): # pylint: disable=missing-docstring @registry.register_model class NeuralGPU(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return neural_gpu(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return neural_gpu(features["inputs"], self._hparams) -def diagonal_neural_gpu(inputs, hparams, train, name=None): +def diagonal_neural_gpu(inputs, hparams, name=None): """Improved Neural GPU as in https://arxiv.org/abs/1702.08727.""" with tf.variable_scope(name, "diagonal_neural_gpu"): @@ -73,7 +72,6 @@ def step(state_tup, inp): x, new_loss = common_layers.diagonal_conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, - train, dropout=hparams.dropout, name="dcgru_%d" % layer) # Padding input is zeroed-out in the modality, we check this by summing. 
@@ -93,8 +91,8 @@ def step(state_tup, inp): @registry.register_model class DiagonalNeuralGPU(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return diagonal_neural_gpu(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return diagonal_neural_gpu(features["inputs"], self._hparams) @registry.register_hparams("neuralgpu_1") diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 0d4937a5d..3065bb1c4 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -49,8 +49,9 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU(hparams, p_hparams) - shadred_logits, _, _ = model.model_fn(features, True) + model = neural_gpu.NeuralGPU( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + shadred_logits, _, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index eddf4cc96..0b9efc2c3 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -46,8 +46,7 @@ def get_norm(hparams): "'noam', 'none'.") -def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, - bias=None): +def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None): """Complete attention layer with preprocessing.""" separabilities = [hparams.separability, hparams.separability] if hparams.separability < 0: @@ -71,7 +70,6 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, tf.shape(inputs_encoded)[1] ]) - attention_dropout = hparams.attention_dropout * tf.to_float(train) qv = common_attention.multihead_attention( targets_timed, None, @@ -80,7 +78,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, hparams.hidden_size, hparams.hidden_size, hparams.num_heads, - attention_dropout, + hparams.attention_dropout, name="self_attention", summaries=False) qv = common_attention.multihead_attention( @@ -91,7 +89,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, hparams.hidden_size, hparams.hidden_size, hparams.num_heads, - attention_dropout, + hparams.attention_dropout, name="encdec_attention", summaries=False) return tf.expand_dims(qv, 2) @@ -101,7 +99,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") -def multi_conv_res(x, padding, name, layers, hparams, train, +def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): """A stack of separable convolution blocks with residual connections.""" with tf.variable_scope(name): @@ -152,10 +150,10 @@ def multi_conv_res(x, padding, name, layers, hparams, train, separabilities=separabilities2, name="residual2") + y if source is not None and hparams.attention_type != "none": - x += attention(x, source, norm_fn, hparams, train, bias=padding_bias) + x += attention(x, source, norm_fn, hparams, bias=padding_bias) if mask is not None: x *= mask - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) def rank_loss(sentence_emb, image_emb, margin=0.2): @@ -188,8 +186,7 @@ def similarity_cost(inputs_encoded, targets_encoded): return rank_loss(x, y) -def slicenet_middle(inputs_encoded, targets, 
target_space_emb, mask, - hparams, train): +def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): """Middle part of slicenet, connecting encoder and decoder.""" norm_fn = get_norm(hparams) @@ -204,7 +201,7 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, extra_layers = int(hparams.num_hidden_layers * 1.5) with tf.variable_scope(tf.get_variable_scope(), reuse=True): targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", - extra_layers, hparams, train) + extra_layers, hparams) with tf.variable_scope("similarity_loss"): similarity_loss = similarity_cost(inputs_encoded, targets_encoded) similarity_loss *= hparams.sim_loss_mult @@ -219,7 +216,7 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, else: inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. targets_with_attention = attention( - targets_shifted, inputs_encoded, norm_fn, hparams, train, + targets_shifted, inputs_encoded, norm_fn, hparams, bias=inputs_padding_bias) # Positional targets: merge attention and raw. @@ -247,8 +244,7 @@ def embedding_to_padding(emb): return tf.to_float(tf.equal(emb_sum, 0.0)) -def slicenet_internal(inputs, targets, target_space, - problem_idx, hparams, train): +def slicenet_internal(inputs, targets, target_space, problem_idx, hparams): """The slicenet model, main step used for training.""" with tf.variable_scope("slicenet"): # Flatten inputs and encode. @@ -258,14 +254,14 @@ def slicenet_internal(inputs, targets, target_space, target_space_emb = embed_target_space(target_space, hparams.hidden_size) extra_layers = int(hparams.num_hidden_layers * 1.5) inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, - hparams, train, mask=inputs_mask) + hparams, mask=inputs_mask) target_modality_name = hparams.problems[problem_idx].target_modality.name if "class_label_modality" in target_modality_name: # If we're just predicing a class, there is no use for a decoder. return inputs_encoded # Do the middle part. decoder_start, similarity_loss = slicenet_middle( - inputs_encoded, targets, target_space_emb, inputs_mask, hparams, train) + inputs_encoded, targets, target_space_emb, inputs_mask, hparams) # Decode. 
decoder_final = multi_conv_res( decoder_start, @@ -273,7 +269,6 @@ def slicenet_internal(inputs, targets, target_space, "decoder", hparams.num_hidden_layers, hparams, - train, mask=inputs_mask, source=inputs_encoded) return decoder_final, tf.reduce_mean(similarity_loss) @@ -282,10 +277,10 @@ def slicenet_internal(inputs, targets, target_space, @registry.register_model class SliceNet(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): return slicenet_internal(features["inputs"], features["targets"], features["target_space_id"], self._problem_idx, - self._hparams, train) + self._hparams) _KERNEL_SCHEMES = { "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index bbeb3a284..db563b481 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -42,8 +42,9 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = slicenet.SliceNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 94fb0776c..88d901df9 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -41,13 +41,9 @@ class Transformer(t2t_model.T2TModel): """Attention net. See file docstring.""" - def model_fn_body(self, features, train): + def model_fn_body(self, features): # Remove dropout if not training hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. targets = features["targets"] inputs = features.get("inputs") target_space = features.get("target_space_id") @@ -300,10 +296,12 @@ def transformer_base(): hparams.add_hparam("ffn_layer", "conv_hidden_relu") hparams.add_hparam("parameter_attention_key_channels", 0) hparams.add_hparam("parameter_attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) return hparams diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 1b43ce625..9535558a4 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -48,8 +48,8 @@ def _testTransformer(self, net): "targets": tf.constant(targets, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = net(hparams, p_hparams) - shadred_logits, _, _ = model.model_fn(features, True) + model = net(hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + shadred_logits, _, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index 01b5adb78..d28a1628e 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_block(x, hparams, train): +def residual_block(x, hparams): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] @@ -42,24 +42,24 @@ def residual_block(x, hparams, train): separability=0, name="residual_block") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) -def xception_internal(inputs, hparams, train): +def xception_internal(inputs, hparams): """Xception body.""" with tf.variable_scope("xception"): cur = inputs for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): - cur = residual_block(cur, hparams, train) + cur = residual_block(cur, hparams) return cur @registry.register_model class Xception(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return xception_internal(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return xception_internal(features["inputs"], self._hparams) @registry.register_hparams diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index 4eabb387a..cd158b852 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -42,8 +42,9 @@ def testXception(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = xception.Xception(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = xception.Xception( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 5ebb74280..4d7ccd771 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -17,6 +17,7 @@ from __future__ import division from __future__ import print_function +import copy import time # Dependency imports @@ -51,6 +52,7 @@ class T2TModel(object): def __init__(self, hparams, + mode, problem_hparams, problem_idx=0, data_parallelism=None, @@ -59,6 
+61,7 @@ def __init__(self, Args: hparams: a hyperparameters object. + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. problem_hparams: a hyperparameters object. problem_idx: an integer. data_parallelism: a expert_utils.parallelism @@ -72,6 +75,13 @@ def __init__(self, data_parallelism = eu.Parallelism([""]) if ps_devices is None: ps_devices = [""] + hparams = copy.copy(hparams) + hparams.add_hparam("mode", mode) + # when not in training mode, set all forms of dropout to zero. + if mode != tf.contrib.learn.ModeKeys.TRAIN: + for key in hparams.values(): + if key[-len("dropout"):] == "dropout": + setattr(hparams, key, 0.0) self._hparams = hparams self._data_parallelism = data_parallelism self._num_datashards = data_parallelism.n @@ -332,12 +342,11 @@ def _shard_features(self, features): # pylint: disable=missing-docstring 0)) return sharded_features - def model_fn(self, features, train, skip=False, last_position_only=False): + def model_fn(self, features, skip=False, last_position_only=False): """Computes the entire model and produces sharded logits and training loss. Args: features: A dictionary of feature name to tensor. - train: a boolean `Scalar` (whether we are in training mode). skip: a boolean, if we're just dummy-calling and actually skip this model (but we need to create variables to not confuse distributed training). last_position_only: a boolean, compute logits for only the last position. @@ -392,7 +401,7 @@ def model_fn(self, features, train, skip=False, last_position_only=False): body_outputs, extra_loss = transformed_features["targets"], 0.0 else: body_outputs, extra_loss = self.model_fn_body_sharded( - transformed_features, train) + transformed_features) with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: @@ -420,7 +429,7 @@ def model_fn(self, features, train, skip=False, last_position_only=False): tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) return sharded_logits, training_loss, extra_loss - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): """Mixture-of-experts models will override this function. Compute model body on all datashards. @@ -428,7 +437,6 @@ def model_fn_body_sharded(self, sharded_features, train): Args: sharded_features: map from string to list of Tensors each with shape [batch, ?, ?, body_input_size] - train: A boolean `Scalar` (whether we are in training mode). Returns: sharded_body_output: @@ -442,7 +450,7 @@ def model_fn_body_sharded(self, sharded_features, train): } for d in xrange(self._num_datashards)] output = self._data_parallelism( _with_timing(self.model_fn_body, "model_fn_body"), - datashard_to_features, train) + datashard_to_features) if isinstance(output, tuple): loss = tf.reduce_mean(output[1]) output = output[0] @@ -450,7 +458,7 @@ def model_fn_body_sharded(self, sharded_features, train): loss = 0.0 return output, loss - def model_fn_body(self, features, train): + def model_fn_body(self, features): """Most models will override this function. Compute label logits for one shard as a function of the transformed @@ -459,7 +467,6 @@ def model_fn_body(self, features, train): Args: features: A dictionary of key to Tensor. Each Tensor has shape `[batch_size, ?, ?, hidden_size]`. - train: A boolean `Scalar` (whether we are in training mode). Returns: a `Tensor` of logits with shape `[batch_size, O, P, body_output_size]`. 
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index d901b4241..940927638 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -384,7 +384,8 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, hparams.problems[n], n, dp, _ps_devices(all_workers=True)) + hparams, mode, hparams.problems[n], + n, dp, _ps_devices(all_workers=True)) if mode == tf.contrib.learn.ModeKeys.INFER: return model_class.infer( features, @@ -402,7 +403,7 @@ def nth_model(n): # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) sharded_logits, training_loss, extra_loss = model_class.model_fn( - features, train, skip=(skipping_is_on and skip_this_one)) + features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) @@ -643,8 +644,8 @@ def _save_until_eos(hyp): # pylint: disable=missing-docstring decodes = [] for _ in range(num_decode_batches): result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) + input_fn=input_fn.next if six.PY2 else input_fn.__next__, + as_iterable=True) for result in result_iter: def log_fn(inputs, outputs):
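Note on the API change in this patch: `T2TModel` is now constructed with an explicit `mode` (a `tf.contrib.learn.ModeKeys` value), hyperparameters whose names end in "dropout" are automatically zeroed outside training, and `model_fn` / `model_fn_body` no longer take a `train` argument. Below is a minimal sketch of the new calling convention, mirroring the updated tests; `p_hparams` and `features` are hypothetical placeholders for a problem's hyperparameters and its input/target tensors, and any other registered model could be substituted for `Transformer`.

```python
import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
# Placeholders for illustration only: p_hparams would be a problem's
# hyperparameters object and features a dict of int32 tensors such as
# {"inputs": ..., "targets": ..., "target_space_id": ...}.
model = transformer.Transformer(
    hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams)  # mode is now required
sharded_logits, training_loss, extra_loss = model.model_fn(features)  # no train flag
```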