diff --git a/.gitignore b/.gitignore index e610f29ba..dd84837dd 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,6 @@ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info -# PyPI distribution artificats +# PyPI distribution artifacts build/ dist/ diff --git a/setup.py b/setup.py index 5b2d423f8..fbb81470e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.0.7', + version='1.0.8', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/make_tf_configs.py b/tensor2tensor/bin/make_tf_configs.py index 791589939..8b9367ca6 100644 --- a/tensor2tensor/bin/make_tf_configs.py +++ b/tensor2tensor/bin/make_tf_configs.py @@ -55,7 +55,7 @@ def main(_): for idx, job in enumerate(jobs): if task_type == "worker": cmd_line_flags = " ".join([ - "--master=%s" % job, + "--master=grpc://%s" % job, "--ps_replicas=%d" % len(ps), "--worker_replicas=%d" % len(workers), "--worker_gpu=1", @@ -66,6 +66,7 @@ ]) else: cmd_line_flags = " ".join([ + "--master=grpc://%s" % job, "--schedule=run_std_server", ]) diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index 452fc637a..f5c954036 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -570,16 +570,16 @@ def calculus_integrate(alphabet_size=26, functions = {"log": "L"} alg_cfg = math_dataset_init(alphabet_size, digits=5, functions=functions) - nbr_case=0 + nbr_case = 0 while nbr_case < nbr_cases: try: sample, target = generate_calculus_integrate_sample( - alg_cfg.vlist, - list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) + alg_cfg.vlist, + list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions) yield { - "inputs": alg_cfg.int_encoder(sample), - "targets": alg_cfg.int_encoder(target) + "inputs": alg_cfg.int_encoder(sample), + "targets": alg_cfg.int_encoder(target) } - except: + except: # pylint:disable=bare-except continue - nbr_case = nbr_case + 1 + nbr_case += 1 diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py old mode 100755 new mode 100644 index c50d19afa..fb85d99c3 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -27,7 +27,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin -import six.moves.urllib_request as urllib # Imports urllib on Python2, urllib.request on Python3 +import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3 from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder from tensor2tensor.data_generators.tokenizer import Tokenizer diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py old mode 100755 new mode 100644 index c525e4ec0..e88a90983 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,6 +28,7 @@ import tensorflow as tf + # End-of-sentence marker (should correspond to the position of EOS in the # RESERVED_TOKENS list in text_encoder.py) EOS = 1 @@ -44,6 +45,7 @@ def character_generator(source_path, target_path, character_vocab, eos=None): Args: source_path: path to the file with source sentences. target_path: path to the file with target sentences. + character_vocab: a TextEncoder to encode the characters.
eos: integer to append at the end of each sequence (default: None). Yields: diff --git a/tensor2tensor/data_generators/wmt_test.py b/tensor2tensor/data_generators/wmt_test.py old mode 100755 new mode 100644 index 0366fdfb0..b6af3cf93 --- a/tensor2tensor/data_generators/wmt_test.py +++ b/tensor2tensor/data_generators/wmt_test.py @@ -25,8 +25,8 @@ # Dependency imports import six -from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import wmt import tensorflow as tf @@ -40,7 +40,7 @@ def testCharacterGenerator(self): if six.PY2: enc_f = lambda s: s else: - enc_f = lambda s: s.encode('utf-8') + enc_f = lambda s: s.encode("utf-8") with io.open(tmp_file_path + ".src", "wb") as src_file: src_file.write(enc_f("source1\n")) src_file.write(enc_f("source2\n")) @@ -51,16 +51,15 @@ def testCharacterGenerator(self): # Call character generator on the generated files. results_src, results_tgt = [], [] character_vocab = text_encoder.ByteTextEncoder() - for dictionary in wmt.character_generator(tmp_file_path + ".src", - tmp_file_path + ".tgt", - character_vocab): + for dictionary in wmt.character_generator( + tmp_file_path + ".src", tmp_file_path + ".tgt", character_vocab): self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"]) results_src.append(dictionary["inputs"]) results_tgt.append(dictionary["targets"]) # Check that the results match the files. # First check that the results match the encoded original strings; - # this is a comparison of integer arrays + # this is a comparison of integer arrays. self.assertEqual(len(results_src), 2) self.assertEqual(results_src[0], character_vocab.encode("source1")) diff --git a/tensor2tensor/docs/distributed_training.md b/tensor2tensor/docs/distributed_training.md index be3726f06..e7ddd7294 100644 --- a/tensor2tensor/docs/distributed_training.md +++ b/tensor2tensor/docs/distributed_training.md @@ -35,7 +35,7 @@ os.environ['TF_CONFIG'] = json.dumps({ The following T2T command-line flags must also be set on the workers for distributed training: -- `--master=$ADDRESS` +- `--master=grpc://$ADDRESS` - `--worker_replicas=$NUM_WORKERS` - `--worker_gpu=$NUM_GPUS_PER_WORKER` - `--worker_id=$WORKER_ID` @@ -55,6 +55,17 @@ Parameter servers only need `--schedule=run_std_server`. generates the `TF_CONFIG` json strings and the above-mentioned command-line flags for the workers and parameter servers. +Given a set of worker and parameter server addresses, the script outputs, for +each job, a line with the `TF_CONFIG` environment variable and the command-line +flags necessary for distributed training. For each job, you should invoke the +`t2t-trainer` with the `TF_CONFIG` value and flags that are output. + +For example: + +``` +TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ... +``` + ## Command-line flags for eval jobs Eval jobs should set the following flags and do not need the `TF_CONFIG` diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 30b871640..99fbd8232 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -24,8 +24,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,13 +41,9 @@ class AttentionLM(t2t_model.T2TModel): """Attention net. 
See file docstring.""" - def model_fn_body(self, features, train): + def model_fn_body(self, features): # Remove dropout if not training - hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. + hparams = self._hparams targets = features["targets"] targets = tf.squeeze(targets, 2) @@ -162,8 +156,10 @@ def attention_lm_base(): hparams.add_hparam("num_heads", 8) hparams.add_hparam("attention_key_channels", 0) hparams.add_hparam("attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 9cd0547f7..b4d27d400 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -24,8 +24,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,13 +41,9 @@ class AttentionLmMoe(t2t_model.T2TModel): """Attention net. See file docstring.""" - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): # Remove dropout if not training - hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. + hparams = self._hparams dp = self._data_parallelism targets = sharded_features["targets"] targets = dp(tf.squeeze, targets, 2) @@ -81,7 +75,9 @@ def residual_fn(x, y): with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, train, hparams.hidden_size, + dp, self._ps_devices, x, + hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss @@ -162,10 +158,12 @@ def attention_lm_moe_base(): hparams.add_hparam("num_heads", 8) hparams.add_hparam("attention_key_channels", 0) hparams.add_hparam("attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none return hparams diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index bb7119a15..bbcf392aa 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_module(x, hparams, train, n, sep): +def residual_module(x, hparams, n, sep): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((1, 1), k) for _ in xrange(n)] @@ -43,56 +43,55 @@ def residual_module(x, hparams, train, n, sep): separability=sep, name="block") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) -def residual_module1(x, hparams, train): - return residual_module(x, hparams, train, 1, 1) +def residual_module1(x, hparams): + return residual_module(x, hparams, 1, 1) -def residual_module1_sep(x, hparams, train): - return residual_module(x, hparams, train, 1, 0) +def residual_module1_sep(x, hparams): + return residual_module(x, hparams, 1, 0) -def residual_module2(x, hparams, train): - return residual_module(x, hparams, train, 2, 1) +def residual_module2(x, hparams): + return residual_module(x, hparams, 2, 1) -def residual_module2_sep(x, hparams, train): - return residual_module(x, hparams, train, 2, 0) +def residual_module2_sep(x, hparams): + return residual_module(x, hparams, 2, 0) -def residual_module3(x, hparams, train): - return residual_module(x, hparams, train, 3, 1) +def residual_module3(x, hparams): + return residual_module(x, hparams, 3, 1) -def residual_module3_sep(x, hparams, train): - return residual_module(x, hparams, train, 3, 0) +def residual_module3_sep(x, hparams): + return residual_module(x, hparams, 3, 0) -def norm_module(x, hparams, train): - del train # Unused. +def norm_module(x, hparams): return common_layers.layer_norm(x, hparams.hidden_size, name="norm_module") -def identity_module(x, hparams, train): - del hparams, train # Unused. +def identity_module(x, hparams): + del hparams # Unused. 
return x -def run_modules(blocks, cur, hparams, train, dp): +def run_modules(blocks, cur, hparams, dp): """Run blocks in parallel using dp as data_parallelism.""" assert len(blocks) % dp.n == 0 res = [] for i in xrange(len(blocks) // dp.n): - res.extend(dp(blocks[i * dp.n:(i + 1) * dp.n], cur, hparams, train)) + res.extend(dp(blocks[i * dp.n:(i + 1) * dp.n], cur, hparams)) return res @registry.register_model class BlueNet(t2t_model.T2TModel): - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): dp = self._data_parallelism dp._reuse = False # pylint:disable=protected-access hparams = self._hparams @@ -106,7 +105,7 @@ def model_fn_body_sharded(self, sharded_features, train): cur_shape = cur.get_shape() for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): - processed = run_modules(blocks, cur, hparams, train, dp) + processed = run_modules(blocks, cur, hparams, dp) cur = common_layers.shakeshake(processed) cur.set_shape(cur_shape) diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index 70996ab02..a325e5a55 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -42,8 +42,9 @@ def testBlueNet(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = bluenet.BlueNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = bluenet.BlueNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 42db05700..1a82144d6 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_dilated_conv(x, repeat, padding, name, hparams, train): +def residual_dilated_conv(x, repeat, padding, name, hparams): """A stack of convolution blocks with residual connections.""" with tf.variable_scope(name): k = (hparams.kernel_height, hparams.kernel_width) @@ -45,11 +45,11 @@ def residual_dilated_conv(x, repeat, padding, name, hparams, train): padding=padding, name="residual_conv") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - x = tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + x = tf.nn.dropout(x, 1.0 - hparams.dropout) return x -def bytenet_internal(inputs, targets, hparams, train): +def bytenet_internal(inputs, targets, hparams): """ByteNet, main step used for training.""" with tf.variable_scope("bytenet"): # Flatten inputs and extend length by 50%.
@@ -63,7 +63,7 @@ def bytenet_internal(inputs, targets, hparams, train): inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=50) final_encoder = residual_dilated_conv( - inputs, hparams.num_block_repeat, "SAME", "encoder", hparams, train) + inputs, hparams.num_block_repeat, "SAME", "encoder", hparams) shifted_targets = common_layers.shift_left(targets) kernel = (hparams.kernel_height, hparams.kernel_width) @@ -74,15 +74,15 @@ def bytenet_internal(inputs, targets, hparams, train): return residual_dilated_conv( decoder_start, hparams.num_block_repeat, - "LEFT", "decoder", hparams, train) + "LEFT", "decoder", hparams) @registry.register_model class ByteNet(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): return bytenet_internal(features["inputs"], features["targets"], - self._hparams, train) + self._hparams) @registry.register_hparams diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 676220cc8..8202d5b74 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -42,8 +42,9 @@ def testByteNet(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = bytenet.ByteNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = bytenet.ByteNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index 689f407f5..41ca6f4b0 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -45,6 +45,8 @@ def basic_params1(): kernel_width=1, hidden_size=64, compress_steps=0, + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. dropout=0.2, clip_grad_norm=2.0, initializer="orthogonal", diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index f9d63a464..078fcc5a3 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1211,7 +1211,6 @@ def conv_lstm(x, def diagonal_conv_gru(x, kernel_size, filters, - train, dropout=0.0, name=None, reuse=None): @@ -1234,8 +1233,7 @@ def do_conv(args, name, bias_start): gate, gate_cost = hard_sigmoid(do_conv(x, "gate", 0.7)) candidate = tf.tanh(do_conv(reset * x, "candidate", 0.0)) - # Dropout if training. - if dropout > 0.0 and train: + if dropout > 0.0: candidate = tf.nn.dropout(candidate, 1.0 - dropout) # Diagonal shift. 
diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index 78f79eed0..992c42db4 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -67,6 +67,7 @@ def lstm_seq2seq_internal(inputs, targets, hparams, train): @registry.register_model("baseline_lstm_seq2seq") class LSTMSeq2Seq(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): + train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN return lstm_seq2seq_internal(features["inputs"], features["targets"], self._hparams, train) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 33347cb84..e5bdb184b 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -43,8 +43,9 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2Seq(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = lstm.LSTMSeq2Seq( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 7247b791e..66a8491f2 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -70,7 +70,8 @@ def add_and_normalize(x, y): @registry.register_model class MultiModel(t2t_model.T2TModel): - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): + train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN dp = self._data_parallelism hparams = self._hparams targets = sharded_features["targets"] @@ -86,7 +87,7 @@ def encode_half(inputs, inputs_mask, hparams): inputs = common_layers.add_timing_signal(inputs) return slicenet.multi_conv_res(inputs, "SAME", "encoder1", hparams.num_hidden_layers // 2, - hparams, train, mask=inputs_mask) + hparams, mask=inputs_mask) target_space_emb = dp(slicenet.embed_target_space, sharded_features["target_space_id"], @@ -101,7 +102,7 @@ def encode_half(inputs, inputs_mask, hparams): expert_loss *= hparams.moe_loss_coef inputs_encoded = dp( slicenet.multi_conv_res, inputs_encoded, "SAME", - "encoder2", hparams.num_hidden_layers, hparams, train, + "encoder2", hparams.num_hidden_layers, hparams, mask=inputs_mask) # If we're just predicing a class, there is no use for a decoder, return. @@ -112,7 +113,7 @@ def encode_half(inputs, inputs_mask, hparams): # Do the middle part. decoder_start, similarity_loss = dp( slicenet.slicenet_middle, inputs_encoded, targets, - target_space_emb, inputs_mask, hparams, train) + target_space_emb, inputs_mask, hparams) # Decode. 
decoder_half = dp( @@ -137,7 +138,6 @@ def encode_half(inputs, inputs_mask, hparams): "decoder2", hparams.num_hidden_layers // 2, hparams, - train, mask=inputs_mask, source=inputs_encoded) diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 8df682c5c..72fe4a326 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -43,8 +43,9 @@ def testMultiModel(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = multimodel.MultiModel(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = multimodel.MultiModel( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index dbae77f43..dce0dbc30 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -30,12 +30,11 @@ import tensorflow as tf -def neural_gpu(inputs, hparams, train, name=None): +def neural_gpu(inputs, hparams, name=None): """The core Neural GPU.""" with tf.variable_scope(name, "neural_gpu"): - def step(state, inp): # pylint: disable=missing-docstring - x = tf.nn.dropout(state, 1.0 - hparams.dropout * tf.to_float(train)) + x = tf.nn.dropout(state, 1.0 - hparams.dropout) for layer in xrange(hparams.num_hidden_layers): x = common_layers.conv_gru( x, (hparams.kernel_height, hparams.kernel_width), @@ -57,11 +56,11 @@ def step(state, inp): # pylint: disable=missing-docstring @registry.register_model class NeuralGPU(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return neural_gpu(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return neural_gpu(features["inputs"], self._hparams) -def diagonal_neural_gpu(inputs, hparams, train, name=None): +def diagonal_neural_gpu(inputs, hparams, name=None): """Improved Neural GPU as in https://arxiv.org/abs/1702.08727.""" with tf.variable_scope(name, "diagonal_neural_gpu"): @@ -73,7 +72,6 @@ def step(state_tup, inp): x, new_loss = common_layers.diagonal_conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, - train, dropout=hparams.dropout, name="dcgru_%d" % layer) # Padding input is zeroed-out in the modality, we check this by summing. 
@@ -93,8 +91,8 @@ def step(state_tup, inp): @registry.register_model class DiagonalNeuralGPU(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return diagonal_neural_gpu(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return diagonal_neural_gpu(features["inputs"], self._hparams) @registry.register_hparams("neuralgpu_1") diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 0d4937a5d..3065bb1c4 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -49,8 +49,9 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU(hparams, p_hparams) - shadred_logits, _, _ = model.model_fn(features, True) + model = neural_gpu.NeuralGPU( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + shadred_logits, _, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index eddf4cc96..0b9efc2c3 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -46,8 +46,7 @@ def get_norm(hparams): "'noam', 'none'.") -def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, - bias=None): +def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None): """Complete attention layer with preprocessing.""" separabilities = [hparams.separability, hparams.separability] if hparams.separability < 0: @@ -71,7 +70,6 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, tf.shape(inputs_encoded)[1] ]) - attention_dropout = hparams.attention_dropout * tf.to_float(train) qv = common_attention.multihead_attention( targets_timed, None, @@ -80,7 +78,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, hparams.hidden_size, hparams.hidden_size, hparams.num_heads, - attention_dropout, + hparams.attention_dropout, name="self_attention", summaries=False) qv = common_attention.multihead_attention( @@ -91,7 +89,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, hparams.hidden_size, hparams.hidden_size, hparams.num_heads, - attention_dropout, + hparams.attention_dropout, name="encdec_attention", summaries=False) return tf.expand_dims(qv, 2) @@ -101,7 +99,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, train, return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") -def multi_conv_res(x, padding, name, layers, hparams, train, +def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): """A stack of separable convolution blocks with residual connections.""" with tf.variable_scope(name): @@ -152,10 +150,10 @@ def multi_conv_res(x, padding, name, layers, hparams, train, separabilities=separabilities2, name="residual2") + y if source is not None and hparams.attention_type != "none": - x += attention(x, source, norm_fn, hparams, train, bias=padding_bias) + x += attention(x, source, norm_fn, hparams, bias=padding_bias) if mask is not None: x *= mask - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) def rank_loss(sentence_emb, image_emb, margin=0.2): @@ -188,8 +186,7 @@ def similarity_cost(inputs_encoded, targets_encoded): return rank_loss(x, y) -def slicenet_middle(inputs_encoded, targets, 
target_space_emb, mask, - hparams, train): +def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): """Middle part of slicenet, connecting encoder and decoder.""" norm_fn = get_norm(hparams) @@ -204,7 +201,7 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, extra_layers = int(hparams.num_hidden_layers * 1.5) with tf.variable_scope(tf.get_variable_scope(), reuse=True): targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", - extra_layers, hparams, train) + extra_layers, hparams) with tf.variable_scope("similarity_loss"): similarity_loss = similarity_cost(inputs_encoded, targets_encoded) similarity_loss *= hparams.sim_loss_mult @@ -219,7 +216,7 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, else: inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. targets_with_attention = attention( - targets_shifted, inputs_encoded, norm_fn, hparams, train, + targets_shifted, inputs_encoded, norm_fn, hparams, bias=inputs_padding_bias) # Positional targets: merge attention and raw. @@ -247,8 +244,7 @@ def embedding_to_padding(emb): return tf.to_float(tf.equal(emb_sum, 0.0)) -def slicenet_internal(inputs, targets, target_space, - problem_idx, hparams, train): +def slicenet_internal(inputs, targets, target_space, problem_idx, hparams): """The slicenet model, main step used for training.""" with tf.variable_scope("slicenet"): # Flatten inputs and encode. @@ -258,14 +254,14 @@ def slicenet_internal(inputs, targets, target_space, target_space_emb = embed_target_space(target_space, hparams.hidden_size) extra_layers = int(hparams.num_hidden_layers * 1.5) inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, - hparams, train, mask=inputs_mask) + hparams, mask=inputs_mask) target_modality_name = hparams.problems[problem_idx].target_modality.name if "class_label_modality" in target_modality_name: # If we're just predicing a class, there is no use for a decoder. return inputs_encoded # Do the middle part. decoder_start, similarity_loss = slicenet_middle( - inputs_encoded, targets, target_space_emb, inputs_mask, hparams, train) + inputs_encoded, targets, target_space_emb, inputs_mask, hparams) # Decode. 
decoder_final = multi_conv_res( decoder_start, @@ -273,7 +269,6 @@ def slicenet_internal(inputs, targets, target_space, "decoder", hparams.num_hidden_layers, hparams, - train, mask=inputs_mask, source=inputs_encoded) return decoder_final, tf.reduce_mean(similarity_loss) @@ -282,10 +277,10 @@ def slicenet_internal(inputs, targets, target_space, @registry.register_model class SliceNet(t2t_model.T2TModel): - def model_fn_body(self, features, train): + def model_fn_body(self, features): return slicenet_internal(features["inputs"], features["targets"], features["target_space_id"], self._problem_idx, - self._hparams, train) + self._hparams) _KERNEL_SCHEMES = { "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index bbeb3a284..db563b481 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -42,8 +42,9 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = slicenet.SliceNet( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 94fb0776c..88d901df9 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -41,13 +41,9 @@ class Transformer(t2t_model.T2TModel): """Attention net. See file docstring.""" - def model_fn_body(self, features, train): + def model_fn_body(self, features): # Remove dropout if not training hparams = copy.copy(self._hparams) - if not train: - hparams.attention_dropout = 0. - hparams.relu_dropout = 0. - hparams.residual_dropout = 0. targets = features["targets"] inputs = features.get("inputs") target_space = features.get("target_space_id") @@ -300,10 +296,12 @@ def transformer_base(): hparams.add_hparam("ffn_layer", "conv_hidden_relu") hparams.add_hparam("parameter_attention_key_channels", 0) hparams.add_hparam("parameter_attention_value_channels", 0) + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("residual_dropout", 0.1) + hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) return hparams diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 1b43ce625..9535558a4 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -48,8 +48,8 @@ def _testTransformer(self, net): "targets": tf.constant(targets, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = net(hparams, p_hparams) - shadred_logits, _, _ = model.model_fn(features, True) + model = net(hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + shadred_logits, _, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index 01b5adb78..d28a1628e 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -30,7 +30,7 @@ import tensorflow as tf -def residual_block(x, hparams, train): +def residual_block(x, hparams): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] @@ -42,24 +42,24 @@ def residual_block(x, hparams, train): separability=0, name="residual_block") x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - return tf.nn.dropout(x, 1.0 - hparams.dropout * tf.to_float(train)) + return tf.nn.dropout(x, 1.0 - hparams.dropout) -def xception_internal(inputs, hparams, train): +def xception_internal(inputs, hparams): """Xception body.""" with tf.variable_scope("xception"): cur = inputs for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): - cur = residual_block(cur, hparams, train) + cur = residual_block(cur, hparams) return cur @registry.register_model class Xception(t2t_model.T2TModel): - def model_fn_body(self, features, train): - return xception_internal(features["inputs"], self._hparams, train) + def model_fn_body(self, features): + return xception_internal(features["inputs"], self._hparams) @registry.register_hparams diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index 4eabb387a..cd158b852 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -42,8 +42,9 @@ def testXception(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = xception.Xception(hparams, p_hparams) - sharded_logits, _, _ = model.model_fn(features, True) + model = xception.Xception( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 5ebb74280..4d7ccd771 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -17,6 +17,7 @@ from __future__ import division from __future__ import print_function +import copy import time # Dependency imports @@ -51,6 +52,7 @@ class T2TModel(object): def __init__(self, hparams, + mode, problem_hparams, problem_idx=0, data_parallelism=None, @@ -59,6 
+61,7 @@ def __init__(self, Args: hparams: a hyperparameters object. + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. problem_hparams: a hyperparameters object. problem_idx: an integer. data_parallelism: a expert_utils.parallelism @@ -72,6 +75,13 @@ def __init__(self, data_parallelism = eu.Parallelism([""]) if ps_devices is None: ps_devices = [""] + hparams = copy.copy(hparams) + hparams.add_hparam("mode", mode) + # when not in training mode, set all forms of dropout to zero. + if mode != tf.contrib.learn.ModeKeys.TRAIN: + for key in hparams.values(): + if key[-len("dropout"):] == "dropout": + setattr(hparams, key, 0.0) self._hparams = hparams self._data_parallelism = data_parallelism self._num_datashards = data_parallelism.n @@ -332,12 +342,11 @@ def _shard_features(self, features): # pylint: disable=missing-docstring 0)) return sharded_features - def model_fn(self, features, train, skip=False, last_position_only=False): + def model_fn(self, features, skip=False, last_position_only=False): """Computes the entire model and produces sharded logits and training loss. Args: features: A dictionary of feature name to tensor. - train: a boolean `Scalar` (whether we are in training mode). skip: a boolean, if we're just dummy-calling and actually skip this model (but we need to create variables to not confuse distributed training). last_position_only: a boolean, compute logits for only the last position. @@ -392,7 +401,7 @@ def model_fn(self, features, train, skip=False, last_position_only=False): body_outputs, extra_loss = transformed_features["targets"], 0.0 else: body_outputs, extra_loss = self.model_fn_body_sharded( - transformed_features, train) + transformed_features) with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: @@ -420,7 +429,7 @@ def model_fn(self, features, train, skip=False, last_position_only=False): tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) return sharded_logits, training_loss, extra_loss - def model_fn_body_sharded(self, sharded_features, train): + def model_fn_body_sharded(self, sharded_features): """Mixture-of-experts models will override this function. Compute model body on all datashards. @@ -428,7 +437,6 @@ def model_fn_body_sharded(self, sharded_features, train): Args: sharded_features: map from string to list of Tensors each with shape [batch, ?, ?, body_input_size] - train: A boolean `Scalar` (whether we are in training mode). Returns: sharded_body_output: @@ -442,7 +450,7 @@ def model_fn_body_sharded(self, sharded_features, train): } for d in xrange(self._num_datashards)] output = self._data_parallelism( _with_timing(self.model_fn_body, "model_fn_body"), - datashard_to_features, train) + datashard_to_features) if isinstance(output, tuple): loss = tf.reduce_mean(output[1]) output = output[0] @@ -450,7 +458,7 @@ def model_fn_body_sharded(self, sharded_features, train): loss = 0.0 return output, loss - def model_fn_body(self, features, train): + def model_fn_body(self, features): """Most models will override this function. Compute label logits for one shard as a function of the transformed @@ -459,7 +467,6 @@ def model_fn_body(self, features, train): Args: features: A dictionary of key to Tensor. Each Tensor has shape `[batch_size, ?, ?, hidden_size]`. - train: A boolean `Scalar` (whether we are in training mode). Returns: a `Tensor` of logits with shape `[batch_size, O, P, body_output_size]`. 
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index d901b4241..940927638 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -384,7 +384,8 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, hparams.problems[n], n, dp, _ps_devices(all_workers=True)) + hparams, mode, hparams.problems[n], + n, dp, _ps_devices(all_workers=True)) if mode == tf.contrib.learn.ModeKeys.INFER: return model_class.infer( features, @@ -402,7 +403,7 @@ def nth_model(n): # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) sharded_logits, training_loss, extra_loss = model_class.model_fn( - features, train, skip=(skipping_is_on and skip_this_one)) + features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) @@ -643,8 +644,8 @@ def _save_until_eos(hyp): # pylint: disable=missing-docstring decodes = [] for _ in range(num_decode_batches): result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) + input_fn=input_fn.next if six.PY2 else input_fn.__next__, + as_iterable=True) for result in result_iter: def log_fn(inputs, outputs):
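Note on the API change in this patch: `T2TModel` is now constructed with an explicit `mode` (a `tf.contrib.learn.ModeKeys` value), hyperparameters whose names end in "dropout" are automatically zeroed outside training, and `model_fn` / `model_fn_body` no longer take a `train` argument. Below is a minimal sketch of the new calling convention, mirroring the updated tests; `p_hparams` and `features` are hypothetical placeholders for a problem's hyperparameters and its input/target tensors, and any other registered model could be substituted for `Transformer`.

```python
import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
# Placeholders for illustration only: p_hparams would be a problem's
# hyperparameters object and features a dict of int32 tensors such as
# {"inputs": ..., "targets": ..., "target_space_id": ...}.
model = transformer.Transformer(
    hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams)  # mode is now required
sharded_logits, training_loss, extra_loss = model.model_fn(features)  # no train flag
```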