From b9900e5cc99bffe365a9fcab1050c6df4e3c9ce4 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 1 Feb 2018 18:01:11 -0800 Subject: [PATCH 01/31] Tune transformer ae hparams. PiperOrigin-RevId: 184224758 --- setup.py | 2 - tensor2tensor/bin/t2t-rl-trainer | 16 -- tensor2tensor/bin/t2t_rl_trainer.py | 92 -------- tensor2tensor/models/transformer_vae.py | 9 +- tensor2tensor/rl/README.md | 10 - tensor2tensor/rl/__init__.py | 0 tensor2tensor/rl/collect.py | 94 -------- tensor2tensor/rl/envs/__init__.py | 0 tensor2tensor/rl/envs/batch_env.py | 129 ----------- tensor2tensor/rl/envs/in_graph_batch_env.py | 163 -------------- tensor2tensor/rl/envs/utils.py | 225 -------------------- tensor2tensor/rl/networks.py | 65 ------ tensor2tensor/rl/ppo.py | 98 --------- tensor2tensor/rl/train_test.py | 36 ---- 14 files changed, 6 insertions(+), 933 deletions(-) delete mode 100644 tensor2tensor/bin/t2t-rl-trainer delete mode 100644 tensor2tensor/bin/t2t_rl_trainer.py delete mode 100644 tensor2tensor/rl/README.md delete mode 100644 tensor2tensor/rl/__init__.py delete mode 100644 tensor2tensor/rl/collect.py delete mode 100644 tensor2tensor/rl/envs/__init__.py delete mode 100644 tensor2tensor/rl/envs/batch_env.py delete mode 100644 tensor2tensor/rl/envs/in_graph_batch_env.py delete mode 100644 tensor2tensor/rl/envs/utils.py delete mode 100644 tensor2tensor/rl/networks.py delete mode 100644 tensor2tensor/rl/ppo.py delete mode 100644 tensor2tensor/rl/train_test.py diff --git a/setup.py b/setup.py index ee0eb0d09..1d3f14a94 100644 --- a/setup.py +++ b/setup.py @@ -36,8 +36,6 @@ 'future', 'gevent', 'gunicorn', - 'gym<=0.9.5', # gym in version 0.9.6 has some temporary issues. - 'munch', 'numpy', 'requests', 'scipy', diff --git a/tensor2tensor/bin/t2t-rl-trainer b/tensor2tensor/bin/t2t-rl-trainer deleted file mode 100644 index 06c97d2d5..000000000 --- a/tensor2tensor/bin/t2t-rl-trainer +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python -"""t2t-rl-trainer.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensor2tensor.bin import t2t_rl_trainer - -import tensorflow as tf - -def main(argv): - t2t_rl_trainer.main(argv) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tensor2tensor/bin/t2t_rl_trainer.py b/tensor2tensor/bin/t2t_rl_trainer.py deleted file mode 100644 index b53692ccc..000000000 --- a/tensor2tensor/bin/t2t_rl_trainer.py +++ /dev/null @@ -1,92 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Training of RL agent with PPO algorithm.""" - -from __future__ import absolute_import - -import functools -from munch import Munch -import tensorflow as tf - -from tensor2tensor.rl.collect import define_collect -from tensor2tensor.rl.envs.utils import define_batch_env -from tensor2tensor.rl.ppo import define_ppo_epoch - - -def define_train(policy_lambda, env_lambda, config): - env = env_lambda() - action_space = env.action_space - observation_space = env.observation_space - - batch_env = define_batch_env(env_lambda, config["num_agents"]) - - policy_factory = tf.make_template( - 'network', - functools.partial(policy_lambda, observation_space, - action_space, config)) - - (collect_op, memory) = define_collect(policy_factory, batch_env, config) - - with tf.control_dependencies([collect_op]): - ppo_op = define_ppo_epoch(memory, policy_factory, config) - - return ppo_op - - -def main(): - train(example_params()) - - -def train(params): - policy_lambda, env_lambda, config = params - ppo_op = define_train(policy_lambda, env_lambda, config) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for _ in range(config.epochs_num): - sess.run(ppo_op) - - -def example_params(): - from tensor2tensor.rl import networks - config = {} - config['init_mean_factor'] = 0.1 - config['init_logstd'] = 0.1 - config['policy_layers'] = 100, 100 - config['value_layers'] = 100, 100 - config['num_agents'] = 30 - config['clipping_coef'] = 0.2 - config['gae_gamma'] = 0.99 - config['gae_lambda'] = 0.95 - config['entropy_loss_coef'] = 0.01 - config['value_loss_coef'] = 1 - config['optimizer'] = tf.train.AdamOptimizer - config['learning_rate'] = 1e-4 - config['optimization_epochs'] = 15 - config['epoch_length'] = 200 - config['epochs_num'] = 2000 - - config = Munch(config) - return networks.feed_forward_gaussian_fun, pendulum_lambda, config - - -def pendulum_lambda(): - import gym - return gym.make("Pendulum-v0") - - -if __name__ == '__main__': - main() diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ac9a66b77..b1efb75ca 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -830,8 +830,11 @@ def transformer_ae_small(): hparams.hidden_size = 384 hparams.filter_size = 2048 hparams.label_smoothing = 0.0 - hparams.optimizer = "Adafactor" - hparams.add_hparam("z_size", 16) + hparams.optimizer = "Adafactor" # Can be unstable, maybe try Adam. + hparams.optimizer_adam_epsilon = 1e-9 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.998 # Needs tuning, try 0.98 to 0.999. + hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.0) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. @@ -840,7 +843,7 @@ def transformer_ae_small(): # Reshape method for hierarchical vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) - hparams.add_hparam("unmasked_percentage", 0.3) + hparams.add_hparam("unmasked_percentage", 0.1) hparams.add_hparam("do_ae", True) hparams.add_hparam("do_mask", True) hparams.add_hparam("do_refine", False) diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md deleted file mode 100644 index bf21ab1ad..000000000 --- a/tensor2tensor/rl/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Tensor2Tensor Reinforcement Learning starter. 
-
-The rl package provides the ability to run reinforcement learning
-algorithms within TensorFlow's computation graph.
-
-Currently the only supported algorithm is Proximal Policy Optimization (PPO).
-
-## Sample usage - training in the Pendulum-v0 environment.
-
-```t2t-rl-trainer```
diff --git a/tensor2tensor/rl/__init__.py b/tensor2tensor/rl/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py
deleted file mode 100644
index dadab4d92..000000000
--- a/tensor2tensor/rl/collect.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Collect trajectories from interactions of agent with environment."""
-
-import tensorflow as tf
-
-
-def define_collect(policy_factory, batch_env, config):
-
-  memory_shape = [config.epoch_length] + [batch_env.observ.shape.as_list()[0]]
-  memories_shapes_and_types = [
-      # observation
-      (memory_shape + [batch_env.observ.shape.as_list()[1]], tf.float32),
-      (memory_shape, tf.float32),  # reward
-      (memory_shape, tf.bool),  # done
-      (memory_shape + batch_env.action_shape, tf.float32),  # action
-      (memory_shape, tf.float32),  # pdf
-      (memory_shape, tf.float32),  # value function
-  ]
-  memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False)
-            for (shape, dtype) in memories_shapes_and_types]
-  cumulative_rewards = tf.Variable(
-      tf.zeros(config.num_agents, tf.float32), trainable=False)
-
-  should_reset_var = tf.Variable(True, trainable=False)
-  reset_op = tf.cond(should_reset_var,
-                     lambda: batch_env.reset(tf.range(config.num_agents)),
-                     lambda: 0.0)
-  with tf.control_dependencies([reset_op]):
-    reset_once_op = tf.assign(should_reset_var, False)
-
-  with tf.control_dependencies([reset_once_op]):
-
-    def step(index, scores_sum, scores_num):
-      # Note - the only way to ensure making a copy of a tensor is to run a
-      # simple operation.
We are waiting for tf.copy: - # https://github.com/tensorflow/tensorflow/issues/11186 - obs_copy = batch_env.observ + 0 - actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) - policy = actor_critic.policy - action = policy.sample() - postprocessed_action = actor_critic.action_postprocessing(action) - simulate_output = batch_env.simulate(postprocessed_action[0, ...]) - pdf = policy.prob(action)[0] - with tf.control_dependencies(simulate_output): - reward, done = simulate_output - done = tf.reshape(done, (config.num_agents,)) - to_save = [obs_copy, reward, done, action[0, ...], pdf, - actor_critic.value[0]] - save_ops = [tf.scatter_update(memory_slot, index, value) - for memory_slot, value in zip(memory, to_save)] - cumulate_rewards_op = cumulative_rewards.assign_add(reward) - agent_indicies_to_reset = tf.where(done)[:, 0] - with tf.control_dependencies([cumulate_rewards_op]): - scores_sum_delta = tf.reduce_sum( - tf.gather(cumulative_rewards, agent_indicies_to_reset)) - scores_num_delta = tf.count_nonzero(done, dtype=tf.int32) - with tf.control_dependencies(save_ops + [scores_sum_delta, - scores_num_delta]): - reset_env_op = batch_env.reset(agent_indicies_to_reset) - reset_cumulative_rewards_op = tf.scatter_update( - cumulative_rewards, agent_indicies_to_reset, - tf.zeros(tf.shape(agent_indicies_to_reset))) - with tf.control_dependencies([reset_env_op, - reset_cumulative_rewards_op]): - return [index + 1, scores_sum + scores_sum_delta, - scores_num + scores_num_delta] - - init = [tf.constant(0), tf.constant(0.0), tf.constant(0)] - index, scores_sum, scores_num = tf.while_loop( - lambda c, _1, _2: c < config.epoch_length, - step, - init, - parallel_iterations=1, - back_prop=False) - mean_score = tf.cond(tf.greater(scores_num, 0), - lambda: scores_sum / tf.cast(scores_num, tf.float32), - lambda: 0.) - printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ") - with tf.control_dependencies([printing]): - return tf.identity(index), memory diff --git a/tensor2tensor/rl/envs/__init__.py b/tensor2tensor/rl/envs/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tensor2tensor/rl/envs/batch_env.py b/tensor2tensor/rl/envs/batch_env.py deleted file mode 100644 index 30bfdce55..000000000 --- a/tensor2tensor/rl/envs/batch_env.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code was based on Danijar Hafner's code from tf.agents: -# https://github.com/tensorflow/agents/blob/master/agents/tools/batch_env.py - -"""Combine multiple environments to step them in batch.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - - -class BatchEnv(object): - """Combine multiple environments to step them in batch.""" - - def __init__(self, envs, blocking): - """Combine multiple environments to step them in batch. 
-
-    To step environments in parallel, environments must support a
-    `blocking=False` argument to their step and reset functions that makes them
-    return callables instead to receive the result at a later time.
-
-    Args:
-      envs: List of environments.
-      blocking: Step environments one after another rather than in parallel.
-
-    Raises:
-      ValueError: Environments have different observation or action spaces.
-    """
-    self._envs = envs
-    self._blocking = blocking
-    observ_space = self._envs[0].observation_space
-    if not all(env.observation_space == observ_space for env in self._envs):
-      raise ValueError('All environments must use the same observation space.')
-    action_space = self._envs[0].action_space
-    if not all(env.action_space == action_space for env in self._envs):
-      raise ValueError('All environments must use the same action space.')
-
-  def __len__(self):
-    """Number of combined environments."""
-    return len(self._envs)
-
-  def __getitem__(self, index):
-    """Access an underlying environment by index."""
-    return self._envs[index]
-
-  def __getattr__(self, name):
-    """Forward unimplemented attributes to one of the original environments.
-
-    Args:
-      name: Attribute that was accessed.
-
-    Returns:
-      Value behind the attribute name in one of the wrapped environments.
-    """
-    return getattr(self._envs[0], name)
-
-  def step(self, actions):
-    """Forward a batch of actions to the wrapped environments.
-
-    Args:
-      actions: Batched action to apply to the environment.
-
-    Raises:
-      ValueError: Invalid actions.
-
-    Returns:
-      Batch of observations, rewards, and done flags.
-    """
-    for index, (env, action) in enumerate(zip(self._envs, actions)):
-      if not env.action_space.contains(action):
-        message = 'Invalid action at index {}: {}'
-        raise ValueError(message.format(index, action))
-    if self._blocking:
-      transitions = [
-          env.step(action)
-          for env, action in zip(self._envs, actions)]
-    else:
-      transitions = [
-          env.step(action, blocking=False)
-          for env, action in zip(self._envs, actions)]
-      transitions = [transition() for transition in transitions]
-    observs, rewards, dones, infos = zip(*transitions)
-    observ = np.stack(observs).astype(np.float32)
-    reward = np.stack(rewards).astype(np.float32)
-    done = np.stack(dones)
-    info = tuple(infos)
-    return observ, reward, done, info
-
-  def reset(self, indices=None):
-    """Reset the environment and convert the resulting observation.
-
-    Args:
-      indices: The batch indices of environments to reset; defaults to all.
-
-    Returns:
-      Batch of observations.
-    """
-    if indices is None:
-      indices = np.arange(len(self._envs))
-    if self._blocking:
-      observs = [self._envs[index].reset() for index in indices]
-    else:
-      observs = [self._envs[index].reset(blocking=False) for index in indices]
-      observs = [observ() for observ in observs]
-    observ = np.stack(observs)
-    observ = observ.astype(np.float32)
-    return observ
-
-  def close(self):
-    """Send close messages to the external processes and join them."""
-    for env in self._envs:
-      if hasattr(env, 'close'):
-        env.close()
diff --git a/tensor2tensor/rl/envs/in_graph_batch_env.py b/tensor2tensor/rl/envs/in_graph_batch_env.py
deleted file mode 100644
index d0e1e4c26..000000000
--- a/tensor2tensor/rl/envs/in_graph_batch_env.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code was based on Danijar Hafner's code from tf.agents:
-# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py
-
-"""Batch of environments inside the TensorFlow graph."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gym
-import tensorflow as tf
-
-
-class InGraphBatchEnv(object):
-  """Batch of environments inside the TensorFlow graph.
-
-  The batch of environments will be stepped and reset inside of the graph using
-  a tf.py_func(). The current batch of observations, actions, rewards, and done
-  flags are held in corresponding variables.
-  """
-
-  def __init__(self, batch_env):
-    """Batch of environments inside the TensorFlow graph.
-
-    Args:
-      batch_env: Batch environment.
-    """
-    self._batch_env = batch_env
-    observ_shape = self._parse_shape(self._batch_env.observation_space)
-    observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-    self.action_shape = list(self._parse_shape(self._batch_env.action_space))
-    self.action_dtype = self._parse_dtype(self._batch_env.action_space)
-    with tf.variable_scope('env_temporary'):
-      self._observ = tf.Variable(
-          tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
-          name='observ', trainable=False)
-
-  def __getattr__(self, name):
-    """Forward unimplemented attributes to one of the original environments.
-
-    Args:
-      name: Attribute that was accessed.
-
-    Returns:
-      Value behind the attribute name in one of the original environments.
-    """
-    return getattr(self._batch_env, name)
-
-  def __len__(self):
-    """Number of combined environments."""
-    return len(self._batch_env)
-
-  def __getitem__(self, index):
-    """Access an underlying environment by index."""
-    return self._batch_env[index]
-
-  def simulate(self, action):
-    """Step the batch of environments.
-
-    The results of the step can be accessed from the variables defined below.
-
-    Args:
-      action: Tensor holding the batch of actions to apply.
-
-    Returns:
-      Operation.
-    """
-    with tf.name_scope('environment/simulate'):
-      if action.dtype in (tf.float16, tf.float32, tf.float64):
-        action = tf.check_numerics(action, 'action')
-      observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-      observ, reward, done = tf.py_func(
-          lambda a: self._batch_env.step(a)[:3], [action],
-          [observ_dtype, tf.float32, tf.bool], name='step')
-      observ = tf.check_numerics(observ, 'observ')
-      reward = tf.check_numerics(reward, 'reward')
-      with tf.control_dependencies([self._observ.assign(observ)]):
-        return tf.identity(reward), tf.identity(done)
-
-
-  def reset(self, indices=None):
-    """Reset the batch of environments.
-
-    Args:
-      indices: The batch indices of the environments to reset.
-
-    Returns:
-      Batch tensor of the new observations.
-    """
-    return tf.cond(
-        tf.cast(tf.shape(indices)[0], tf.bool),
-        lambda: self._reset_non_empty(indices), lambda: 0.0)
-
-  def _reset_non_empty(self, indices):
-    """Reset the batch of environments.
-
-    Args:
-      indices: The batch indices of the environments to reset; defaults to all.
-
-    Returns:
-      Batch tensor of the new observations.
-    """
-    observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-    observ = tf.py_func(
-        self._batch_env.reset, [indices], observ_dtype, name='reset')
-    observ = tf.check_numerics(observ, 'observ')
-    with tf.control_dependencies([
-        tf.scatter_update(self._observ, indices, observ)]):
-      return tf.identity(observ)
-
-  @property
-  def observ(self):
-    """Access the variable holding the current observation."""
-    return self._observ
-
-  def close(self):
-    """Send close messages to the external processes and join them."""
-    self._batch_env.close()
-
-  def _parse_shape(self, space):
-    """Get a tensor shape from an OpenAI Gym space.
-
-    Args:
-      space: Gym space.
-
-    Returns:
-      Shape tuple.
-    """
-    if isinstance(space, gym.spaces.Discrete):
-      return ()
-    if isinstance(space, gym.spaces.Box):
-      return space.shape
-    raise NotImplementedError()
-
-  def _parse_dtype(self, space):
-    """Get a tensor dtype from an OpenAI Gym space.
-
-    Args:
-      space: Gym space.
-
-    Returns:
-      TensorFlow data type.
-    """
-    if isinstance(space, gym.spaces.Discrete):
-      return tf.int32
-    if isinstance(space, gym.spaces.Box):
-      return tf.float32
-    raise NotImplementedError()
diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py
deleted file mode 100644
index 2b81af270..000000000
--- a/tensor2tensor/rl/envs/utils.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code was based on Danijar Hafner's code from tf.agents:
-# https://github.com/tensorflow/agents/blob/master/agents/tools/wrappers.py
-# https://github.com/tensorflow/agents/blob/master/agents/scripts/utility.py
-
-"""Utilities for using batched environments."""
-
-import atexit
-import multiprocessing
-import sys
-import traceback
-import tensorflow as tf
-
-from tensor2tensor.rl.envs import batch_env
-from tensor2tensor.rl.envs import in_graph_batch_env
-
-class ExternalProcessEnv(object):
-  """Step environment in a separate process for lock-free parallelism."""
-
-  # Message types for communication via the pipe.
-  _ACCESS = 1
-  _CALL = 2
-  _RESULT = 3
-  _EXCEPTION = 4
-  _CLOSE = 5
-
-  def __init__(self, constructor):
-    """Step environment in a separate process for lock-free parallelism.
-
-    The environment will be created in the external process by calling the
-    specified callable. This can be an environment class, or a function
-    creating the environment and potentially wrapping it. The returned
-    environment should not access global variables.
-
-    Args:
-      constructor: Callable that creates and returns an OpenAI gym environment.
-
-    Attributes:
-      observation_space: The cached observation space of the environment.
-      action_space: The cached action space of the environment.
-    """
-    self._conn, conn = multiprocessing.Pipe()
-    self._process = multiprocessing.Process(
-        target=self._worker, args=(constructor, conn))
-    atexit.register(self.close)
-    self._process.start()
-    self._observ_space = None
-    self._action_space = None
-
-  @property
-  def observation_space(self):
-    if not self._observ_space:
-      self._observ_space = self.__getattr__('observation_space')
-    return self._observ_space
-
-  @property
-  def action_space(self):
-    if not self._action_space:
-      self._action_space = self.__getattr__('action_space')
-    return self._action_space
-
-  def __getattr__(self, name):
-    """Request an attribute from the environment.
-
-    Note that this involves communication with the external process, so it can
-    be slow.
-
-    Args:
-      name: Attribute to access.
-
-    Returns:
-      Value of the attribute.
-    """
-    self._conn.send((self._ACCESS, name))
-    return self._receive()
-
-  def call(self, name, *args, **kwargs):
-    """Asynchronously call a method of the external environment.
-
-    Args:
-      name: Name of the method to call.
-      *args: Positional arguments to forward to the method.
-      **kwargs: Keyword arguments to forward to the method.
-
-    Returns:
-      Promise object that blocks and provides the return value when called.
-    """
-    payload = name, args, kwargs
-    self._conn.send((self._CALL, payload))
-    return self._receive
-
-  def close(self):
-    """Send a close message to the external process and join it."""
-    try:
-      self._conn.send((self._CLOSE, None))
-      self._conn.close()
-    except IOError:
-      # The connection was already closed.
-      pass
-    self._process.join()
-
-  def step(self, action, blocking=True):
-    """Step the environment.
-
-    Args:
-      action: The action to apply to the environment.
-      blocking: Whether to wait for the result.
-
-    Returns:
-      Transition tuple when blocking, otherwise callable that returns the
-      transition tuple.
-    """
-    promise = self.call('step', action)
-    if blocking:
-      return promise()
-    else:
-      return promise
-
-  def reset(self, blocking=True):
-    """Reset the environment.
-
-    Args:
-      blocking: Whether to wait for the result.
-
-    Returns:
-      New observation when blocking, otherwise callable that returns the new
-      observation.
-    """
-    promise = self.call('reset')
-    if blocking:
-      return promise()
-    else:
-      return promise
-
-  def _receive(self):
-    """Wait for a message from the worker process and return its payload.
-
-    Raises:
-      Exception: An exception was raised inside the worker process.
-      KeyError: The received message is of an unknown type.
-
-    Returns:
-      Payload object of the message.
-    """
-    message, payload = self._conn.recv()
-    # Re-raise exceptions in the main process.
-    if message == self._EXCEPTION:
-      stacktrace = payload
-      raise Exception(stacktrace)
-    if message == self._RESULT:
-      return payload
-    raise KeyError('Received message of unexpected type {}'.format(message))
-
-  def _worker(self, constructor, conn):
-    """The process waits for actions and sends back environment results.
-
-    Args:
-      constructor: Constructor for the OpenAI Gym environment.
-      conn: Connection for communication to the main process.
-    """
-    try:
-      env = constructor()
-      while True:
-        try:
-          # Only block for short times to have keyboard exceptions be raised.
- if not conn.poll(0.1): - continue - message, payload = conn.recv() - except (EOFError, KeyboardInterrupt): - break - if message == self._ACCESS: - name = payload - result = getattr(env, name) - conn.send((self._RESULT, result)) - continue - if message == self._CALL: - name, args, kwargs = payload - result = getattr(env, name)(*args, **kwargs) - conn.send((self._RESULT, result)) - continue - if message == self._CLOSE: - assert payload is None - break - raise KeyError('Received message of unknown type {}'.format(message)) - except Exception: # pylint: disable=broad-except - stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) - tf.logging.error('Error in environment process: {}'.format(stacktrace)) - conn.send((self._EXCEPTION, stacktrace)) - conn.close() - -def define_batch_env(constructor, num_agents, env_processes=True): - """Create environments and apply all desired wrappers. - - Args: - constructor: Constructor of an OpenAI gym environment. - num_agents: Number of environments to combine in the batch. - env_processes: Whether to step environment in external processes. - - Returns: - In-graph environments object. - """ - with tf.variable_scope('environments'): - if env_processes: - envs = [ - ExternalProcessEnv(constructor) - for _ in range(num_agents)] - else: - envs = [constructor() for _ in range(num_agents)] - env = batch_env.BatchEnv(envs, blocking=not env_processes) - env = in_graph_batch_env.InGraphBatchEnv(env) - return env diff --git a/tensor2tensor/rl/networks.py b/tensor2tensor/rl/networks.py deleted file mode 100644 index af8709191..000000000 --- a/tensor2tensor/rl/networks.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Neural networks for actor-critic algorithms.""" - -import operator -import functools -import collections -import tensorflow as tf -import gym - - -NetworkOutput = collections.namedtuple( - 'NetworkOutput', 'policy, value, action_postprocessing') - - -def feed_forward_gaussian_fun(observation_space, action_space, config, - observations): - assert isinstance(observation_space, gym.spaces.box.Box) - - mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer( - factor=config.init_mean_factor) - logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10) - - flat_observations = tf.reshape(observations, [ - tf.shape(observations)[0], tf.shape(observations)[1], - functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) - - with tf.variable_scope('policy'): - x = flat_observations - for size in config.policy_layers: - x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) - mean = tf.contrib.layers.fully_connected( - x, action_space.shape[0], tf.tanh, - weights_initializer=mean_weights_initializer) - logstd = tf.get_variable( - 'logstd', mean.shape[2:], tf.float32, logstd_initializer) - logstd = tf.tile( - logstd[None, None], - [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2)) - with tf.variable_scope('value'): - x = flat_observations - for size in config.value_layers: - x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) - value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0] - mean = tf.check_numerics(mean, 'mean') - logstd = tf.check_numerics(logstd, 'logstd') - value = tf.check_numerics(value, 'value') - - policy = tf.contrib.distributions.MultivariateNormalDiag(mean, - tf.exp(logstd)) - - return NetworkOutput(policy, value, lambda a: tf.clip_by_value(a, -2., 2)) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py deleted file mode 100644 index 1c9654608..000000000 --- a/tensor2tensor/rl/ppo.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PPO algorithm implementation. 
-
-Based on: https://arxiv.org/abs/1707.06347
-"""
-
-import tensorflow as tf
-
-def define_ppo_step(observation, action, reward, done, value, old_pdf,
                    policy_factory, config):
-
-  new_policy_dist, new_value, _ = policy_factory(observation)
-  new_pdf = new_policy_dist.prob(action)
-
-  ratio = new_pdf/old_pdf
-  clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef,
-                                   1 + config.clipping_coef)
-
-  advantage = calculate_discounted_return(
-      reward, value, done, config.gae_gamma, config.gae_lambda) - value
-
-  advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1],
-                                                     keep_dims=True)
-  advantage_normalized = tf.stop_gradient(
-      (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8))
-
-  surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized,
-                                   ratio * advantage_normalized)
-  policy_loss = -tf.reduce_mean(surrogate_objective)
-
-  value_error = calculate_discounted_return(
-      reward, new_value, done, config.gae_gamma, config.gae_lambda) - value
-  value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2)
-
-  entropy = new_policy_dist.entropy()
-  entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)
-
-  total_loss = policy_loss + value_loss + entropy_loss
-
-  optimization_op = config.optimizer(config.learning_rate).minimize(total_loss)
-
-  with tf.control_dependencies([optimization_op]):
-    return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)]
-
-
-def define_ppo_epoch(memory, policy_factory, config):
-  observation, reward, done, action, old_pdf, value = memory
-
-  # This is to avoid propagating gradients through the simulation.
-  observation = tf.stop_gradient(observation)
-  action = tf.stop_gradient(action)
-  reward = tf.stop_gradient(reward)
-  done = tf.stop_gradient(done)
-  value = tf.stop_gradient(value)
-  old_pdf = tf.stop_gradient(old_pdf)
-
-  policy_loss, value_loss, entropy_loss = tf.scan(
-      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
-                                     old_pdf, policy_factory, config),
-      tf.range(config.optimization_epochs),
-      [0., 0., 0.],
-      parallel_iterations=1)
-
-  print_losses = tf.group(
-      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '),
-      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
-      tf.Print(0, [tf.reduce_mean(entropy_loss)], 'entropy loss: '))
-
-  return print_losses
-
-
-def calculate_discounted_return(reward, value, done, discount, unused_lambda):
-  """Discounted Monte-Carlo returns."""
-  done = tf.cast(done, tf.float32)
-  reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :]
-  reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0)
-  return_ = tf.reverse(tf.scan(
-      lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg,  # fn
-      [tf.reverse(reward, [0]),  # elem
-       tf.reverse(done, [0])],
-      tf.zeros_like(reward[0, :]),  # initializer
-      1,
-      False), [0])
-  return tf.check_numerics(return_, 'return')
diff --git a/tensor2tensor/rl/train_test.py b/tensor2tensor/rl/train_test.py
deleted file mode 100644
index ac14c2083..000000000
--- a/tensor2tensor/rl/train_test.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests of basic flow of collecting trajectories and training PPO.""" - -import tensorflow as tf - -from tensor2tensor.bin import t2t_rl_trainer - - -FLAGS = tf.app.flags.FLAGS - - -class TrainTest(tf.test.TestCase): - - def test_no_crash_pendulum(self): - params = t2t_rl_trainer.example_params() - params[2].epochs_num = 10 - t2t_rl_trainer.train(params) - - -if __name__ == '__main__': - FLAGS.config = 'unused' - tf.test.main() From 12038533d25e979227d79cea88c74a198be263f7 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Fri, 2 Feb 2018 00:03:52 -0800 Subject: [PATCH 02/31] Implement fast decoding of transformer for language modeling problems (no inputs, partial targets). PiperOrigin-RevId: 184248180 --- tensor2tensor/models/transformer.py | 114 +++++++++++++++-------- tensor2tensor/models/transformer_test.py | 39 +++++++- 2 files changed, 115 insertions(+), 38 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index b241cc24a..3aeb7a790 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -146,12 +146,13 @@ def body(self, features): """ hparams = self._hparams - inputs = features.get("inputs") - encoder_output, encoder_decoder_attention_bias = (None, None) - if inputs is not None: + if self.has_input: + inputs = features["inputs"] target_space = features["target_space_id"] encoder_output, encoder_decoder_attention_bias = self.encode( inputs, target_space, hparams, features=features) + else: + encoder_output, encoder_decoder_attention_bias = (None, None) targets = features["targets"] targets = common_layers.flatten4d3d(targets) @@ -245,31 +246,44 @@ def _fast_decode(self, raise NotImplementedError("Fast decoding only supports a single shard.") dp = self._data_parallelism hparams = self._hparams - - inputs = features["inputs"] target_modality = self._problem_hparams.target_modality - if target_modality.is_class_modality: - decode_length = 1 + + if self.has_input: + inputs = features["inputs"] + if target_modality.is_class_modality: + decode_length = 1 + else: + decode_length = common_layers.shape_list(inputs)[1] + decode_length + + # TODO(llion): Clean up this reshaping logic. 
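+      # The net effect of the reshaping below is to normalize inputs to a
+      # 4-D [batch, length, 1, channels] tensor for the input modality,
+      # folding any extra leading dimension into the batch dimension;
+      # batch_size is recorded beforehand so the decoding loop can be sized.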
+ inputs = tf.expand_dims(inputs, axis=1) + if len(inputs.shape) < 5: + inputs = tf.expand_dims(inputs, axis=4) + s = common_layers.shape_list(inputs) + batch_size = s[0] + inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]]) + # _shard_features called to ensure that the variable names match + inputs = self._shard_features({"inputs": inputs})["inputs"] + input_modality = self._problem_hparams.input_modality["inputs"] + with tf.variable_scope(input_modality.name): + inputs = input_modality.bottom_sharded(inputs, dp) + with tf.variable_scope("body"): + encoder_output, encoder_decoder_attention_bias = dp( + self.encode, inputs, features["target_space_id"], hparams, + features=features) + encoder_output = encoder_output[0] + encoder_decoder_attention_bias = encoder_decoder_attention_bias[0] + partial_targets = None else: - decode_length = common_layers.shape_list(inputs)[1] + decode_length - - # TODO(llion): Clean up this reshaping logic. - inputs = tf.expand_dims(inputs, axis=1) - if len(inputs.shape) < 5: - inputs = tf.expand_dims(inputs, axis=4) - s = common_layers.shape_list(inputs) - inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]]) - # _shard_features called to ensure that the variable names match - inputs = self._shard_features({"inputs": inputs})["inputs"] - input_modality = self._problem_hparams.input_modality["inputs"] - with tf.variable_scope(input_modality.name): - inputs = input_modality.bottom_sharded(inputs, dp) - with tf.variable_scope("body"): - encoder_output, encoder_decoder_attention_bias = dp( - self.encode, inputs, features["target_space_id"], hparams, - features=features) - encoder_output = encoder_output[0] - encoder_decoder_attention_bias = encoder_decoder_attention_bias[0] + # The problem has no inputs. + # In this case, features["inputs"] contains partial targets. + # We force the outputs to begin with these sequences. + encoder_output = None + encoder_decoder_attention_bias = None + partial_targets = tf.squeeze(tf.to_int64(features["inputs"]), [2, 3]) + partial_targets_length = common_layers.shape_list(partial_targets)[1] + decode_length += partial_targets_length + batch_size = tf.shape(partial_targets)[0] if hparams.pos == "timing": timing_signal = common_attention.get_timing_signal_1d( @@ -320,16 +334,30 @@ def symbols_to_logits_fn(ids, i, cache): with tf.variable_scope("body"): body_outputs = dp( - self.decode, targets, cache["encoder_output"], - cache["encoder_decoder_attention_bias"], bias, hparams, cache, + self.decode, targets, cache.get("encoder_output"), + cache.get("encoder_decoder_attention_bias"), + bias, hparams, cache, nonpadding=features_to_nonpadding(features, "targets")) with tf.variable_scope(target_modality.name): logits = target_modality.top_sharded(body_outputs, None, dp)[0] - return tf.squeeze(logits, axis=[1, 2, 3]), cache - - return fast_decode( + ret = tf.squeeze(logits, axis=[1, 2, 3]) + if partial_targets is not None: + # If the position is within the given partial targets, we alter the + # logits to always return those values. + # A faster approach would be to process the partial targets in one + # iteration in order to fill the corresponding parts of the cache. + # This would require broader changes, though. 
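+        # For example, if partial_targets[:, i] is token id 7, the one-hot
+        # call below scores id 7 at 0.0 and every other id at -1e9, so both
+        # greedy argmax and beam search are forced to emit id 7 at step i,
+        # while the cache is still updated as if the model had chosen it.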
+ vocab_size = tf.shape(ret)[1] + def forced_logits(): + return tf.one_hot(tf.tile(partial_targets[:, i], [beam_size]), + vocab_size, 0.0, -1e9) + ret = tf.cond( + tf.less(i, partial_targets_length), forced_logits, lambda: ret) + return ret, cache + + ret = fast_decode( encoder_output=encoder_output, encoder_decoder_attention_bias=encoder_decoder_attention_bias, symbols_to_logits_fn=symbols_to_logits_fn, @@ -338,7 +366,11 @@ def symbols_to_logits_fn(ids, i, cache): vocab_size=target_modality.top_dimensionality, beam_size=beam_size, top_beams=top_beams, - alpha=alpha) + alpha=alpha, + batch_size=batch_size) + if partial_targets is not None: + ret["outputs"] = ret["outputs"][:, partial_targets_length:] + return ret def fast_decode(encoder_output, @@ -350,7 +382,8 @@ def fast_decode(encoder_output, beam_size=1, top_beams=1, alpha=1.0, - eos_id=beam_search.EOS_ID): + eos_id=beam_search.EOS_ID, + batch_size=None): """Given encoder output and a symbols to logits function, does fast decoding. Implements both greedy and beam search decoding, uses beam search iff @@ -370,6 +403,7 @@ def fast_decode(encoder_output, alpha: Float that controls the length penalty. larger the alpha, stronger the preference for slonger translations. eos_id: End-of-sequence symbol in beam search. + batch_size: an integer scalar - must be passed if there is no input Returns: A dict of decoding results { @@ -379,8 +413,12 @@ def fast_decode(encoder_output, "scores": decoding log probs from the beam search, None if using greedy decoding (beam_size=1) } + + Raises: + NotImplementedError: If beam size > 1 with partial targets. """ - batch_size = common_layers.shape_list(encoder_output)[0] + if encoder_output is not None: + batch_size = common_layers.shape_list(encoder_output)[0] key_channels = hparams.attention_key_channels or hparams.hidden_size value_channels = hparams.attention_value_channels or hparams.hidden_size @@ -394,8 +432,9 @@ def fast_decode(encoder_output, for layer in range(num_layers) } - cache["encoder_output"] = encoder_output - cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + if encoder_output is not None: + cache["encoder_output"] = encoder_output + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias if beam_size > 1: # Beam Search initial_ids = tf.zeros([batch_size], dtype=tf.int32) @@ -417,6 +456,7 @@ def fast_decode(encoder_output, else: # Greedy def inner_loop(i, finished, next_id, decoded_ids, cache): + """One step of greedy decoding.""" logits, cache = symbols_to_logits_fn(next_id, i, cache) temperature = (0.0 if hparams.sampling_method == "argmax" else hparams.sampling_temp) @@ -430,7 +470,7 @@ def is_not_finished(i, finished, *_): return (i < decode_length) & tf.logical_not(tf.reduce_all(finished)) decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64) - finished = tf.constant(False, shape=[batch_size]) + finished = tf.fill([batch_size], False) next_id = tf.zeros([batch_size, 1], dtype=tf.int64) _, _, _, decoded_ids, _ = tf.while_loop( is_not_finished, diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index f67476006..0c9b6f794 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -37,13 +37,15 @@ class TransformerTest(tf.test.TestCase): - def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN): + def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN, has_input=True): hparams.hidden_size = 8 hparams.filter_size = 32 hparams.num_heads = 1 
hparams.layer_prepostprocess_dropout = 0.0 p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) + if not has_input: + p_hparams.input_modality = {} hparams.problems = [p_hparams] inputs = -1 + np.random.random_integers( @@ -108,6 +110,41 @@ def testGreedyVsFast(self): self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length)) self.assertAllClose(greedy_res, fast_res) + def testSlowVsFastNoInput(self): + model, features = self.getModel( + transformer.transformer_small(), has_input=False) + + decode_length = 2 + + out_logits, _ = model(features) + out_logits = tf.squeeze(out_logits, axis=[2, 3]) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]), + labels=tf.reshape(features["targets"], [-1])) + loss = tf.reduce_mean(loss) + apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss) + + with self.test_session(): + tf.global_variables_initializer().run() + for _ in range(100): + apply_grad.run() + + model.set_mode(tf.estimator.ModeKeys.PREDICT) + + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + slow_result = model._slow_greedy_infer( + features, decode_length)["outputs"] + slow_result = tf.squeeze(slow_result, axis=[2, 3]) + + fast_result = model._greedy_infer(features, decode_length)["outputs"] + + with self.test_session(): + slow_res = slow_result.eval() + fast_res = fast_result.eval() + + self.assertEqual(fast_res.shape, (BATCH_SIZE, decode_length)) + self.assertAllClose(slow_res, fast_res) + def testBeamVsFast(self): model, features = self.getModel(transformer.transformer_small()) From 17ddd3e467ad6c356a1e035a38bb2069f0e8f9d8 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 2 Feb 2018 10:30:13 -0800 Subject: [PATCH 03/31] make config argument optional in problem.input_fn PiperOrigin-RevId: 184301328 --- tensor2tensor/data_generators/problem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 890271dbe..8fed895ca 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -670,7 +670,7 @@ def input_fn(self, partition_id, num_partitions = self._dataset_partition(mode, config) is_training = mode == tf.estimator.ModeKeys.TRAIN - if config.use_tpu: + if config and config.use_tpu: num_threads = 64 else: num_threads = 4 if is_training else 1 From 4bc354c1015a19a6e6e706e239d7d7350a650cdb Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Sat, 3 Feb 2018 15:24:51 -0800 Subject: [PATCH 04/31] Internal. 
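
Summary based on the diff below: adds an image generation task to
transformer_vae next to the existing translation task (decode_transformer
gains a "task" switch that uses common_image_attention decoder layers for
images), plus a "channel_embeddings_bottom" image modality that embeds each
RGB channel separately, and an imagetransformer_ae_cifar hparams set that
wires the two together:

    hparams.task = "image"
    hparams.target_modality = "image:channel_embeddings_bottom"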
PiperOrigin-RevId: 184412973 --- tensor2tensor/layers/modalities.py | 40 ++++++ tensor2tensor/models/transformer_vae.py | 174 ++++++++++++++++++++---- 2 files changed, 190 insertions(+), 24 deletions(-) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 26063388b..ea289103b 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -298,6 +298,46 @@ def top(self, body_output, _): return x +@registry.register_image_modality("channel_embeddings_bottom") +class ImageChannelEmbeddingsBottom(modality.Modality): + """Modality for images using channel compression for generation.""" + + def get_channel_embeddings(self, io_depth, targets, hidden_size, + name="channel"): + """Get separate embedding for each of the channels.""" + targets_split = tf.split(targets, io_depth, axis=3) + rgb_embedding_var = tf.get_variable("rgb_target_emb_%s" % name, + [256 * io_depth, hidden_size]) + rgb_embedding_var = tf.identity(rgb_embedding_var) + rgb_embedding_var *= float(hidden_size)**0.5 + channel_target_embs = [] + for i in xrange(io_depth): + # Adding the channel offsets to get the right embedding since the + # embedding tensor has shape 256 * io_depth, hidden_size + target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 + target_embs = common_layers.gather(rgb_embedding_var, target_ids) + channel_target_embs.append(target_embs) + + return tf.concat(channel_target_embs, axis=-1) + + def targets_bottom(self, inputs): + io_depth = self._model_hparams.num_channels + hidden_size = self._model_hparams.hidden_size + return self.get_channel_embeddings(io_depth, inputs, hidden_size, + "input_bottom") + + def top(self, body_output, _): + with tf.variable_scope(self.name): + img_len = self._model_hparams.img_len + channels = self._model_hparams.num_channels + x = tf.layers.dense(body_output, 256, + use_bias=True, activation=None, + name="output_conv") + x = tf.reshape(x, + [-1, img_len, img_len, channels, self.top_dimensionality]) + return x + + @registry.register_audio_modality("default") class AudioModality(modality.Modality): """Performs strided conv compressions for audio data.""" diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index b1efb75ca..bd0ce7d0a 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -24,7 +24,7 @@ # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin - +from tensor2tensor.google.models import common_image_attention as cia from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer @@ -32,6 +32,7 @@ from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model + import tensorflow as tf from tensorflow.python.training import moving_averages @@ -459,26 +460,57 @@ def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets, hparams, - name): + name, + task=None): """Original Transformer decoder.""" with tf.variable_scope(name): - targets = common_layers.flatten4d3d(targets) - - decoder_input, decoder_self_bias = transformer.transformer_prepare_decoder( - targets, hparams) - - decoder_input = tf.nn.dropout(decoder_input, - 1.0 - hparams.layer_prepostprocess_dropout) - - decoder_output = transformer.transformer_decoder( - decoder_input, - encoder_output, - decoder_self_bias, - encoder_decoder_attention_bias, - hparams) - + if task is None: + task = hparams.task + if task == 
"translate": + targets = common_layers.flatten4d3d(targets) + + decoder_input, decoder_self_bias = ( + transformer.transformer_prepare_decoder(targets, hparams)) + + decoder_input = tf.nn.dropout(decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + + decoder_output = transformer.transformer_decoder( + decoder_input, + encoder_output, + decoder_self_bias, + encoder_decoder_attention_bias, + hparams) + decoder_output = tf.expand_dims(decoder_output, axis=2) + else: + assert task == "image" + inputs = None + # have to reshape targets as b, 32, 32, 3 * hidden size] beacuse otherwise + # prepare_image will choke + targets = tf.reshape(targets, [tf.shape(targets)[0], hparams.img_len, + hparams.img_len, + hparams.num_channels*hparams.hidden_size]) + + # Prepare decoder inputs and bias. + decoder_input, _, _, bias = cia.prepare_decoder(targets, hparams) + # Add class label to decoder input. + if not hparams.drop_inputs: + decoder_input += tf.reshape( + inputs, + [common_layers.shape_list(targets)[0], 1, 1, hparams.hidden_size]) + decoder_output = cia.transformer_decoder_layers( + decoder_input, + None, + bias, + hparams.num_decoder_layers or hparams.num_hidden_layers, + hparams, + attention_type=hparams.dec_attention_type, + name="decoder") + decoder_output_shape = common_layers.shape_list(decoder_output) + decoder_output = tf.reshape(decoder_output, [decoder_output_shape[0], -1, 1, + hparams.hidden_size]) # Expand since t2t expects 4d tensors. - return tf.expand_dims(decoder_output, axis=2) + return decoder_output def multinomial_sample(x, vocab_size, temperature): @@ -566,7 +598,10 @@ def ae_transformer_internal(inputs, _DO_SUMMARIES = False # Prepare. - batch_size = common_layers.shape_list(inputs)[0] + if inputs is not None: + batch_size = common_layers.shape_list(inputs)[0] + else: + batch_size = common_layers.shape_list(targets)[0] targets = tf.reshape(targets, [batch_size, -1, 1, hparams.hidden_size]) # Encoder. @@ -579,7 +614,15 @@ def ae_transformer_internal(inputs, # Autoencoding. losses = {"extra": tf.constant(0.0), "latent_pred": tf.constant(0.0)} if hparams.do_ae: - max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) + # flatten here + original_targets_shape = tf.shape(targets) + if hparams.task == "image": + cia.maybe_reshape_4d_to_3d(targets, hparams) + if hparams.task == "translate": + max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) + else: + assert hparams.task == "image" + max_targets_len_from_inputs = targets targets, _ = common_layers.pad_to_same_length( targets, max_targets_len_from_inputs, final_length_divisible_by=2**hparams.num_compress_steps) @@ -599,8 +642,10 @@ def ae_transformer_internal(inputs, # Extra loss predicting latent code from input. Discrete only. 
if hparams.bottleneck_kind not in ["dense", "vae"]: latents_pred = decode_transformer( - tf.stop_gradient(inputs), tf.stop_gradient(ed), - tf.stop_gradient(latents_dense), hparams, "extra") + tf.stop_gradient(inputs) if inputs is not None else None, + tf.stop_gradient(ed) if inputs is not None else None, + tf.stop_gradient(latents_dense), hparams, "extra", + task="translate") _, latent_pred_loss = ae_latent_softmax( latents_pred, latents_discrete, hparams) losses["latent_pred"] = tf.reduce_mean( @@ -659,12 +704,18 @@ def bn_inputs(): if hparams.do_attend_decompress: d = attend(d, inputs, hparams, "decompress_attend_%d" % j) d = decompress_step(d, hparams, i > 0, False, "decompress_%d" % j) + # targets is always [batch, length, 1, depth] targets = mask * targets + (1.0 - mask) * d - targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) + # reshape back to 4d here + if hparams.task == "image": + targets = tf.reshape(targets, original_targets_shape) + else: + targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) res = decode_transformer(inputs, ed, targets, hparams, "decoder") if hparams.do_ae: - res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] + if not hparams.do_mask: + res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] if hparams.do_mask and hparams.do_refine: def refine_res(): # return residual_conv(res, 1, (5, 1), hparams, "refine") @@ -872,6 +923,9 @@ def transformer_ae_small(): hparams.add_hparam("random_top_k", 1) hparams.kl_warmup_steps = 150000 hparams.force_full_predict = True + + # task params + hparams.add_hparam("task", "translate") # translate or image tasks supported return hparams @@ -894,6 +948,78 @@ def transformer_ae_cifar(): return hparams +@registry.register_hparams +def imagetransformer_ae_cifar(): + """Hyperparameters for CIFAR-10 experiments.""" + hparams = transformer_ae_small() + hparams.filter_size = 512 + hparams.num_compress_steps = 3 + hparams.v_size = 1024 * 64 + hparams.startup_steps = 10000 + hparams.kmeans_lr_factor = 0.0 + hparams.is_2d = 0 + hparams.learning_rate_warmup_steps = 8000 + hparams.learning_rate = 0.2 + hparams.hidden_size = 512 + hparams.batch_size = 1 + hparams.max_length = 256 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.initializer_gain = 0.2 + hparams.num_hidden_layers = 6 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.label_smoothing = 0.0 + hparams.norm_type = "layer" + hparams.layer_prepostprocess_dropout = 0.0 + hparams.num_heads = 8 + hparams.task = "image" + hparams.ffn_layer = "conv_hidden_relu" + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. + hparams.attention_dropout = 0.0 + hparams.relu_dropout = 0. 
+  hparams.pos = "timing"  # timing, none
+  hparams.nbr_decoder_problems = 1
+  hparams.num_output_layers = 3
+  hparams.add_hparam("block_size", 1)
+
+  # dilated attention based flags
+  hparams.add_hparam("gap_sizes", [2, 4, 8, 16, 32, 64, 2, 4, 8, 16, 32, 64])
+  hparams.add_hparam("dilated_attention", False)
+
+  # image size related flags
+  # assuming that the image has the same height and width
+  hparams.add_hparam("img_len", 32)
+  hparams.add_hparam("num_channels", 3)
+  # Local attention params
+  hparams.add_hparam("local_and_global_att", False)
+  hparams.add_hparam("block_length", 256)
+  hparams.add_hparam("block_width", 128)
+  hparams.num_encoder_layers = 4
+  hparams.num_decoder_layers = 12
+  hparams.sep_rgb_embed = False
+  hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D)
+  hparams.add_hparam("block_rastor_scan", False)
+
+  # multipos attention params
+  hparams.add_hparam("q_filter_width", 1)
+  hparams.add_hparam("kv_filter_width", 1)
+
+  hparams.add_hparam("unconditional", False)  # unconditional generation
+
+  hparams.target_modality = "image:channel_embeddings_bottom"
+  hparams.drop_inputs = True
+  hparams.do_attend_compress = False
+  hparams.do_attend_decompress = False
+  return hparams
+
+
 @registry.register_hparams
 def transformer_ae_base():
   """Set of hyperparameters."""

From 522d222f1e1aff79210bd68c8250c751702906b2 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Sun, 4 Feb 2018 07:55:04 -0800
Subject: [PATCH 05/31] Omit newline characters from librispeech targets. Add
 option to preprocess waveforms in the SpeechModality.bottom

PiperOrigin-RevId: 184448336
---
 tensor2tensor/data_generators/librispeech.py |   3 +-
 .../data_generators/speech_recognition.py    | 128 ++++++++++++++----
 2 files changed, 100 insertions(+), 31 deletions(-)

diff --git a/tensor2tensor/data_generators/librispeech.py b/tensor2tensor/data_generators/librispeech.py
index ad8e931d8..0c59824c1 100644
--- a/tensor2tensor/data_generators/librispeech.py
+++ b/tensor2tensor/data_generators/librispeech.py
@@ -66,8 +66,7 @@ def _collect_data(directory, input_ext, transcription_ext):
       transcript_path = os.path.join(root, transcript)
       with open(transcript_path, "r") as transcript_file:
         for transcript_line in transcript_file:
-          line_contents = transcript_line.split(" ", 1)
-          assert len(line_contents) == 2
+          line_contents = transcript_line.strip().split(" ", 1)
           media_base, label = line_contents
           key = os.path.join(root, media_base)
           assert key not in data_files
diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py
index 01a3db564..4c037aeb3 100644
--- a/tensor2tensor/data_generators/speech_recognition.py
+++ b/tensor2tensor/data_generators/speech_recognition.py
@@ -32,7 +32,9 @@
 
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.layers import common_attention
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import expert_utils
 from tensor2tensor.utils import metrics
 from tensor2tensor.utils import modality
 from tensor2tensor.utils import registry
@@ -76,7 +78,7 @@ def compute_mel_filterbank_features(
     frame_length=25, frame_step=10, fft_length=None,
     window_fn=functools.partial(tf.contrib.signal.hann_window, periodic=True),
     lower_edge_hertz=80.0, upper_edge_hertz=7600.0, num_mel_bins=80,
-    log_noise_floor=1e-3):
+    log_noise_floor=1e-3, apply_mask=True):
   """Implement mel-filterbank extraction using tf ops.
Args: @@ -93,6 +95,7 @@ def compute_mel_filterbank_features( upper_edge_hertz: highest frequency of the filterbank num_mel_bins: filterbank size log_noise_floor: clip small values to prevent numeric overflow in log + apply_mask: When working on a batch of samples, set padding frames to zero Returns: filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1] """ @@ -100,14 +103,24 @@ def compute_mel_filterbank_features( # Transform of each signal in `signals`. Its shape is # [batch_size, ?, fft_unique_bins] # where fft_unique_bins = fft_length // 2 + 1 + + # Find the wave length: the largest index for which the value is !=0 + # note that waveforms samples that are exactly 0.0 are quite common, so + # simply doing sum(waveforms != 0, axis=-1) will not work correctly. + wav_lens = tf.reduce_max( + tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) * + tf.to_int32(tf.not_equal(waveforms, 0.0)), + axis=-1) + 1 if dither > 0: waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither) if preemphasis > 0: waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1] + wav_lens -= 1 frame_length = int(frame_length * sample_rate / 1e3) frame_step = int(frame_step * sample_rate / 1e3) if fft_length is None: fft_length = int(2**(np.ceil(np.log2(frame_length)))) + stfts = tf.contrib.signal.stft( waveforms, frame_length=frame_length, @@ -116,6 +129,11 @@ def compute_mel_filterbank_features( window_fn=window_fn, pad_end=True) + stft_lens = (wav_lens + (frame_step - 1)) // frame_step + masks = tf.to_float(tf.less_equal( + tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0), + tf.expand_dims(stft_lens, 1))) + # An energy spectrogram is the magnitude of the complex-valued STFT. # A float32 Tensor of shape [batch_size, ?, 257]. magnitude_spectrograms = tf.abs(stfts) @@ -134,7 +152,10 @@ def compute_mel_filterbank_features( log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms)) - return tf.expand_dims(log_mel_sgram, -1) + if apply_mask: + log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1) + + return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams") # @@ -207,12 +228,21 @@ def vocab_size(self): return 256 +class ByteTextEncoderWithEos(text_encoder.ByteTextEncoder): + """Encodes each byte to an id and appends the EOS token.""" + + def encode(self, s): + return super(ByteTextEncoderWithEos, self).encode(s) + [text_encoder.EOS_ID] + + class SpeechRecognitionProblem(problem.Problem): """Base class for speech recognition problems.""" def hparams(self, defaults, model_hparams): p = model_hparams # Filterbank extraction + # Filterbank extraction in bottom instead of preprocess_example is faster. + p.add_hparam("audio_preproc_in_bottom", True) # The trainer seems to reserve memory for all members of the input dict p.add_hparam("audio_keep_example_waveforms", False) p.add_hparam("audio_sample_rate", 16000) @@ -248,7 +278,7 @@ def feature_encoders(self, _): # decoding.py doesn't try to convert the floats # into text... 
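[Editorial sketch.] The wav_lens computation in compute_mel_filterbank_features above recovers each padded waveform's length as one plus the largest index holding a nonzero sample, which stays correct even when the signal contains interior zero samples. A numpy re-derivation of the same trick:

    import numpy as np

    waveforms = np.array([[0.1, 0.0, 0.3, 0.0, 0.0]])   # zero-padded batch of one
    idx = np.arange(waveforms.shape[1]) * (waveforms != 0.0)
    wav_lens = idx.max(axis=-1) + 1
    assert wav_lens[0] == 3  # index 2 holds the last nonzero sample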
"waveforms": AudioEncoder(), - "targets": text_encoder.ByteTextEncoder(), + "targets": ByteTextEncoderWithEos(), } def example_reading_spec(self): @@ -263,25 +293,30 @@ def example_reading_spec(self): def preprocess_example(self, example, mode, hparams): p = hparams - waveforms = tf.expand_dims(example["waveforms"], 0) - mel_fbanks = compute_mel_filterbank_features( - waveforms, - sample_rate=p.audio_sample_rate, - dither=p.audio_dither, - preemphasis=p.audio_preemphasis, - frame_length=p.audio_frame_length, - frame_step=p.audio_frame_step, - lower_edge_hertz=p.audio_lower_edge_hertz, - upper_edge_hertz=p.audio_upper_edge_hertz, - num_mel_bins=p.audio_num_mel_bins) - if p.audio_add_delta_deltas: - mel_fbanks = add_delta_deltas(mel_fbanks) - fbank_size = common_layers.shape_list(mel_fbanks) - assert fbank_size[0] == 1 - # Later models like to flatten the two spatial dims. Instead, we add a - # unit spatial dim and flatten the frequencies and channels. - example["inputs"] = tf.reshape( - mel_fbanks, [fbank_size[1], 1, fbank_size[2] * fbank_size[3]]) + if p.audio_preproc_in_bottom: + example["inputs"] = tf.expand_dims( + tf.expand_dims(example["waveforms"], -1), -1) + else: + waveforms = tf.expand_dims(example["waveforms"], 0) + mel_fbanks = compute_mel_filterbank_features( + waveforms, + sample_rate=p.audio_sample_rate, + dither=p.audio_dither, + preemphasis=p.audio_preemphasis, + frame_length=p.audio_frame_length, + frame_step=p.audio_frame_step, + lower_edge_hertz=p.audio_lower_edge_hertz, + upper_edge_hertz=p.audio_upper_edge_hertz, + num_mel_bins=p.audio_num_mel_bins, + apply_mask=False) + if p.audio_add_delta_deltas: + mel_fbanks = add_delta_deltas(mel_fbanks) + fbank_size = common_layers.shape_list(mel_fbanks) + assert fbank_size[0] == 1 + # Later models like to flatten the two spatial dims. Instead, we add a + # unit spatial dim and flatten the frequencies and channels. 
+ example["inputs"] = tf.reshape( + mel_fbanks, [fbank_size[1], 1, fbank_size[2] * fbank_size[3]]) if not p.audio_keep_example_waveforms: del example["waveforms"] return super(SpeechRecognitionProblem, self @@ -308,35 +343,70 @@ def bottom(self, inputs): p = self._model_hparams training = p.mode == tf.estimator.ModeKeys.TRAIN + num_mel_bins = p.audio_num_mel_bins + num_channels = 3 if p.audio_add_delta_deltas else 1 + with tf.variable_scope(self.name): - x = inputs - num_mel_bins = p.audio_num_mel_bins - num_channels = 3 if p.audio_add_delta_deltas else 1 + if p.audio_preproc_in_bottom: + # Compute filterbanks + with tf.variable_scope("fbanks"): + waveforms = tf.squeeze(inputs, [2, 3]) + mel_fbanks = compute_mel_filterbank_features( + waveforms, + sample_rate=p.audio_sample_rate, + dither=p.audio_dither, + preemphasis=p.audio_preemphasis, + frame_length=p.audio_frame_length, + frame_step=p.audio_frame_step, + lower_edge_hertz=p.audio_lower_edge_hertz, + upper_edge_hertz=p.audio_upper_edge_hertz, + num_mel_bins=p.audio_num_mel_bins, + apply_mask=True) + if p.audio_add_delta_deltas: + mel_fbanks = add_delta_deltas(mel_fbanks) + x = tf.reshape(mel_fbanks, + common_layers.shape_list(mel_fbanks)[:2] + + [1, num_mel_bins * num_channels]) + else: + x = inputs + # The convention is that the models are flattened along the spatial, # dimensions, thus the speech preprocessor treats frequencies and # channels as image colors (last axis) x.set_shape([None, None, 1, num_mel_bins * num_channels]) + xshape = common_layers.shape_list(x) + x = tf.reshape(x, [-1, 1, num_mel_bins * num_channels]) + + padding_mask = common_attention.embedding_to_padding(x) + pad_remover = expert_utils.PadRemover(padding_mask) + + x = pad_remover.remove(x) + # This replaces CMVN estimation on data x = tf.layers.batch_normalization( - x, axis=3, center=False, scale=False, training=training) + x, axis=2, center=False, scale=False, training=training) + + x = pad_remover.restore(x) - xshape = common_layers.shape_list(x) # restore batch_size x time x frequency x channel layout x = tf.reshape(x, [xshape[0], xshape[1], num_mel_bins, num_channels]) # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding? for _ in range(2): + x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d( x, 128, (3, 3), (2, 2), use_bias=False) - x = tf.layers.batch_normalization(x, axis=3, training=training) + x = common_layers.layer_norm(x) x = tf.nn.relu(x) xshape = common_layers.shape_list(x) # apply a conv that will remove all frequencies and at the same time # project the output into desired hidden_size + x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False) + assert common_layers.shape_list(x)[2] == 1 - x = tf.layers.batch_normalization(x, axis=3, training=training) + x = common_layers.layer_norm(x) x = tf.nn.relu(x) return x From 3843dd2236ca1bfa119ca1e1a75e525f2408da55 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Sun, 4 Feb 2018 21:09:44 -0800 Subject: [PATCH 06/31] Revert to appending latents in transformer ae as it works better. 
PiperOrigin-RevId: 184480516 --- tensor2tensor/models/transformer_vae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index bd0ce7d0a..a65f0a3df 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -709,12 +709,12 @@ def bn_inputs(): # reshape back to 4d here if hparams.task == "image": targets = tf.reshape(targets, original_targets_shape) - else: + if hparams.task == "translate": targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) res = decode_transformer(inputs, ed, targets, hparams, "decoder") if hparams.do_ae: - if not hparams.do_mask: + if hparams.task == "translate": res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] if hparams.do_mask and hparams.do_refine: def refine_res(): From 203161028325bfe98c3ca7a0f8dcbec228c19b53 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 5 Feb 2018 12:44:50 -0800 Subject: [PATCH 07/31] Reset TPU input partitioning when switching modes PiperOrigin-RevId: 184570928 --- tensor2tensor/data_generators/problem.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 8fed895ca..929f547ee 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -633,6 +633,8 @@ def _dataset_partition(self, mode, config): num_partitions: an integer """ if mode != tf.estimator.ModeKeys.TRAIN or not hasattr(config, "tpu_config"): + # Reset in the case when using TPU but alternating TRAIN and EVAL. + self._next_partition_id = 0 return 0, 1 if config.tpu_config.per_host_input_for_training: num_partitions = max(config.tpu_config.num_shards // 8, 1) From 8057abd2953d27092127b6d0ba73104144e4126b Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 5 Feb 2018 13:09:03 -0800 Subject: [PATCH 08/31] Use z_size instead of v_size for VQ-VAE PiperOrigin-RevId: 184574565 --- tensor2tensor/models/transformer_vae.py | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index a65f0a3df..ac5658f08 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -288,22 +288,23 @@ def embed(x): hot = tf.one_hot(x, hparams.v_size) h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") elif hparams.bottleneck_kind == "vq-vae": - means_embed = means shape_x = common_layers.shape_list(x) x_flat = tf.reshape(x, [-1, 1]) c = int_to_bit(x_flat, nbits=int(math.log(hparams.v_size, 2)), base=2) shape = common_layers.shape_list(c) new_shape = shape new_shape[-1] = hparams.num_blocks - new_shape.append(int(math.log(hparams.v_size, 2) // hparams.num_blocks)) + new_shape.append(int(math.log(hparams.v_size, 2) / hparams.num_blocks)) c = tf.to_int32(tf.reshape(c, shape=new_shape)) c = bit_to_int( c, - nbits=int(math.log(hparams.v_size, 2) // hparams.num_blocks), + nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), base=2) - h1 = tf.gather(tf.transpose(means_embed, [1, 0, 2]), c) - h1 = tf.stack( - [h1[:, :, i, i, :] for i in range(hparams.num_blocks)], axis=-2) + c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) + c_hot_flat = tf.reshape( + c_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) + h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) + h1 = tf.transpose(h1, perm=[1, 0, 2]) 
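[Editorial sketch.] The embed() changes above decompose a z_size-bit latent code into num_blocks sub-codes of z_size/num_blocks bits, each indexing its own table of block_v_size = 2**(z_size/num_blocks) entries. In plain Python (bit order here is illustrative, not necessarily the one int_to_bit uses):

    z_size, num_blocks = 14, 2
    bits_per_block = z_size // num_blocks          # 7
    block_v_size = 2 ** bits_per_block             # 128 entries per block table
    code = 9731                                    # any value < 2**14
    bits = [(code >> i) & 1 for i in range(z_size)]            # LSB first
    blocks = [bits[i * bits_per_block:(i + 1) * bits_per_block]
              for i in range(num_blocks)]
    sub_codes = [sum(b << j for j, b in enumerate(blk)) for blk in blocks]
    assert sub_codes == [9731 % 128, 9731 // 128]              # [3, 76]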
new_shape = shape_x new_shape.append(hparams.hidden_size) h1 = tf.reshape(h1, new_shape) @@ -355,10 +356,11 @@ def embed(x): # Get the discrete latent represenation x_means_idx = tf.argmax(x_means_hot, axis=-1) + # Get the binary representation x_means_bits = int_to_bit( x_means_idx, - nbits=int(math.log(hparams.v_size, 2) // hparams.num_blocks), + nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), base=2) shape = common_layers.shape_list(x_means_bits) new_shape = shape[:-1] @@ -528,7 +530,7 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): vocab_size = hparams.v_size if hparams.bottleneck_kind == "semhash": vocab_size = 2**hparams.z_size - if hparams.num_blocks < 2: + if hparams.num_decode_blocks < 2: latents_logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits") loss = None @@ -542,15 +544,17 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): # Multi-block case. vocab_bits = int(math.log(vocab_size, 2)) assert vocab_size == 2**vocab_bits - assert vocab_bits % hparams.num_blocks == 0 - block_vocab_size = 2**(vocab_bits // hparams.num_blocks) - latents_logits = [tf.layers.dense(latents_pred, block_vocab_size, - name="extra_logits_%d" % i) - for i in xrange(hparams.num_blocks)] + assert vocab_bits % hparams.num_decode_blocks == 0 + block_vocab_size = 2**(vocab_bits // hparams.num_decode_blocks) + latents_logits = [ + tf.layers.dense( + latents_pred, block_vocab_size, name="extra_logits_%d" % i) + for i in xrange(hparams.num_decode_blocks) + ] loss = None if latents_discrete is not None: losses = [] - for i in xrange(hparams.num_blocks): + for i in xrange(hparams.num_decode_blocks): d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i), block_vocab_size) losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits( @@ -629,7 +633,7 @@ def ae_transformer_internal(inputs, targets_c = compress(targets, inputs, False, hparams, "compress") if hparams.mode != tf.estimator.ModeKeys.PREDICT: # Compress and bottleneck. - latents_dense, latents_discrete, extra_loss, _ = bottleneck( + latents_dense, latents_discrete, extra_loss, embed = bottleneck( targets_c, hparams, 2 * 2048, "vc", means, ema_count, ema_means) if _DO_SUMMARIES: tf.summary.histogram("b0", tf.reshape(latents_discrete[:, 0, :], [-1])) @@ -752,7 +756,7 @@ def __init__(self, *args, **kwargs): self._hparams.block_dim = int( self._hparams.hidden_size // self._hparams.num_blocks) self._hparams.block_v_size = 2**( - math.log(self._hparams.v_size, 2) / self._hparams.num_blocks) + self._hparams.z_size / self._hparams.num_blocks) self._hparams.block_v_size = int(self._hparams.block_v_size) if self._hparams.reshape_method == "project": @@ -881,16 +885,17 @@ def transformer_ae_small(): hparams.hidden_size = 384 hparams.filter_size = 2048 hparams.label_smoothing = 0.0 - hparams.optimizer = "Adafactor" # Can be unstable, maybe try Adam. + hparams.optimizer = "Adam" # Can be unstable, maybe try Adam. hparams.optimizer_adam_epsilon = 1e-9 hparams.optimizer_adam_beta1 = 0.9 - hparams.optimizer_adam_beta2 = 0.998 # Needs tuning, try 0.98 to 0.999. + hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999. hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.0) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. 
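[Editorial sketch.] The multi-block loss above recovers block i's target with tf.floormod(tf.floordiv(...)), which is just base-block_vocab_size digit extraction:

    block_vocab_size = 128
    latents_discrete = 9731
    digits = [(latents_discrete // block_vocab_size ** i) % block_vocab_size
              for i in range(2)]
    assert digits == [3, 76]  # the per-block softmax targets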
hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) + hparams.add_hparam("num_decode_blocks", 1) # Reshape method for hierarchical vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) From 9e7a4d1c7c73b87d661c5102e57ae905eed28363 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 5 Feb 2018 15:59:44 -0800 Subject: [PATCH 09/31] Fix bug with instability in higher z_sizes caused by math.log PiperOrigin-RevId: 184600621 --- tensor2tensor/models/transformer_vae.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ac5658f08..a94e91473 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -290,15 +290,15 @@ def embed(x): elif hparams.bottleneck_kind == "vq-vae": shape_x = common_layers.shape_list(x) x_flat = tf.reshape(x, [-1, 1]) - c = int_to_bit(x_flat, nbits=int(math.log(hparams.v_size, 2)), base=2) + c = int_to_bit(x_flat, nbits=hparams.z_size, base=2) shape = common_layers.shape_list(c) new_shape = shape new_shape[-1] = hparams.num_blocks - new_shape.append(int(math.log(hparams.v_size, 2) / hparams.num_blocks)) + new_shape.append(int(hparams.z_size / hparams.num_blocks)) c = tf.to_int32(tf.reshape(c, shape=new_shape)) c = bit_to_int( c, - nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), + nbits=int(hparams.z_size / hparams.num_blocks), base=2) c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) c_hot_flat = tf.reshape( @@ -360,15 +360,15 @@ def embed(x): # Get the binary representation x_means_bits = int_to_bit( x_means_idx, - nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), + nbits=int(hparams.z_size / hparams.num_blocks), base=2) shape = common_layers.shape_list(x_means_bits) new_shape = shape[:-1] - new_shape[-1] = int(math.log(hparams.v_size, 2)) + new_shape[-1] = hparams.z_size x_means_bits = tf.reshape(x_means_bits, shape=new_shape) c = bit_to_int( tf.to_int32(x_means_bits), - nbits=int(math.log(hparams.v_size, 2)), + nbits=hparams.z_size, base=2) # Update the ema variables From 798c406cad5937e38b01d853fb5116c30f779bd4 Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Mon, 5 Feb 2018 17:22:47 -0800 Subject: [PATCH 10/31] Adding common_image_attention, a set of functions that support generating image. Jointly programmed by nikp and avaswani PiperOrigin-RevId: 184612638 --- .../layers/common_image_attention.py | 544 ++++++++++++++++++ tensor2tensor/models/transformer_vae.py | 2 +- 2 files changed, 545 insertions(+), 1 deletion(-) create mode 100644 tensor2tensor/layers/common_image_attention.py diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py new file mode 100644 index 000000000..88fc8ed93 --- /dev/null +++ b/tensor2tensor/layers/common_image_attention.py @@ -0,0 +1,544 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utils for attention mechanism for images.""" +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils + +import tensorflow as tf + + +class AttentionType(object): + LOCAL_1D = "local_1d" + LOCAL_2D = "local_2d" + GLOBAL = "global" + GLOCAL = "global_local" + MOE_LOCAL_1D = "moe_local1d" + + @staticmethod + def get_choices(): + return [ + AttentionType.GLOBAL, + AttentionType.GLOCAL, + AttentionType.MOE_LOCAL_1D, + AttentionType.LOCAL_1D, + AttentionType.LOCAL_2D, + ] + + +def maybe_reshape_4d_to_3d(x, hparams): + """Reshape input from 4D to 3D if necessary.""" + x_shape = common_layers.shape_list(x) + is_4d = False + if len(x_shape) == 4: + x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], x_shape[3]]) + is_4d = True + x.set_shape([None, None, hparams.hidden_size]) + return x, x_shape, is_4d + + +def local_attention_2d(x, hparams, attention_type="local_attention_2d"): + """Local 2d, self attention layer.""" + # self-attention + with tf.variable_scope("local_2d_self_att"): + y = common_attention.multihead_attention_2d( + x, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_type=attention_type, + query_shape=hparams.query_shape, + memory_flange=hparams.memory_flange, + name="self_attention") + return y + + +def local_attention_1d(x, + self_attention_bias, + hparams, + attention_type="local_unmasked", + q_padding="VALID", + kv_padding="VALID"): + """Local 1d self attention.""" + # self-attention + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + with tf.variable_scope("local_1d_self_att"): + y = common_attention.multihead_attention( + x, + None, + self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + attention_type=attention_type, + block_width=hparams.block_width, + block_length=hparams.block_length, + q_padding=q_padding, + kv_padding=kv_padding, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + name="self_attention") + if is_4d: + y = tf.reshape(y, x_shape) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def local_global_attention(x, + self_attention_bias, + hparams, + q_padding="LEFT", + kv_padding="LEFT"): + """Local and global 1d self attention.""" + with tf.variable_scope("self_local_global_att"): + [x_global, x_local] = tf.split(x, 2, axis=-1) + split_hidden_size = int(hparams.hidden_size / 2) + split_heads = int(hparams.num_heads / 2) + y_global = common_attention.multihead_attention( + x_global, + None, + self_attention_bias, + hparams.attention_key_channels or split_hidden_size, + hparams.attention_value_channels or split_hidden_size, + split_hidden_size, + split_heads, + hparams.attention_dropout, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="global_self_att") + y_local = common_attention.multihead_attention( + x_local, + None, + self_attention_bias, + hparams.attention_key_channels or split_hidden_size, + hparams.attention_value_channels or split_hidden_size, + split_hidden_size, + split_heads, + hparams.attention_dropout, + 
attention_type="local_masked", + block_length=hparams.block_length, + block_width=hparams.block_width, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="local_self_att") + y = tf.concat([y_global, y_local], axis=-1) + return y + + +def full_self_attention(x, + self_attention_bias, + hparams, + q_padding="LEFT", + kv_padding="LEFT"): + """Full self-attention layer.""" + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + with tf.variable_scope("self_att"): + y = common_attention.multihead_attention( + x, + None, + self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="self_att") + if is_4d: + y = tf.reshape(y, [x_shape[0], x_shape[1], x_shape[2], x_shape[3]]) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def encdec_attention_1d(x, + encoder_output, + hparams): + """Local 1d self attention.""" + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output, hparams) + with tf.variable_scope("encdec_attention"): + # Encoder Decoder attention + y = common_attention.multihead_attention( + x, + encoder_output, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="encdec_attention") + if is_4d: + y = tf.reshape(y, x_shape) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def transformer_decoder_layers(inputs, + encoder_output, + bias, + num_layers, + hparams, + attention_type=AttentionType.LOCAL_2D, + name="transformer"): + """Multi layer transformer.""" + x = inputs + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + for layer in xrange(num_layers): + with tf.variable_scope("%s_layer_%d" % (name, layer)): + # self-attention + skip connections + if attention_type == AttentionType.LOCAL_2D: + y = local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="masked_local_attention_2d") + elif attention_type == AttentionType.LOCAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + bias, hparams, + attention_type="local_mask_right", + q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.GLOCAL: + y = local_global_attention(common_layers.layer_preprocess(x, hparams), + bias, hparams, + q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.GLOBAL: + y = full_self_attention(common_layers.layer_preprocess(x, hparams), + bias, hparams, + q_padding="LEFT", kv_padding="LEFT") + # TODO(nikip): Add support for dilated attention. 
+ x = common_layers.layer_postprocess(x, y, hparams) + # enc-dec attention + skip connections + if encoder_output is not None: + y = encdec_attention_1d(common_layers.layer_preprocess(x, hparams), + encoder_output, hparams) + x = common_layers.layer_postprocess(x, y, hparams) + # feed-fwd layers + skip connections + y = ffn_layer(common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) + + +def transformer_encoder_layers(inputs, + num_layers, + hparams, + attention_type=AttentionType.GLOBAL, + self_attention_bias=None, + q_padding="VALID", + kv_padding="VALID", + name="transformer"): + """Multi layer transformer encoder.""" + x = inputs + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + + for layer in xrange(num_layers): + # attention layers + skip connections + with tf.variable_scope("%s_layer_%d" % (name, layer)): + if attention_type == AttentionType.LOCAL_2D: + y = local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="local_attention_2d") + elif attention_type == AttentionType.LOCAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + attention_type="local_unmasked", + q_padding=q_padding, kv_padding=kv_padding) + elif attention_type == AttentionType.GLOBAL: + y = full_self_attention(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + q_padding=q_padding, kv_padding=kv_padding) + x = common_layers.layer_postprocess(x, y, hparams) + # feed-fwd layer + skip connections + y = ffn_layer(common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) + + +def ffn_layer(x, hparams): + """ffn layer transformer.""" + with tf.variable_scope("ffn"): + if hparams.ffn_layer == "none": + return x + if hparams.ffn_layer == "conv_hidden_relu": + y = common_layers.dense_relu_dense( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) + elif hparams.ffn_layer == "normed_conv_hidden_relu": + y = common_layers.normed_conv_hidden_relu( + x, + hparams.norm_type, + hparams.layer_norm_epsilon, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout, + norm_name="convnorm") + elif hparams.ffn_layer == "self_attention_ffn": + x_shape = tf.shape(x) + x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size]) + y = common_attention.ffn_self_attention_layer( + x, hparams.filter_size, hparams.hidden_size, hparams.num_parts, + hparams.attention_dropout, hparams.share_kv) + y = tf.reshape(y, x_shape) + else: + assert hparams.ffn_layer == "glu_ffn" + y = common_layers.gated_linear_unit_layer(x) + return y + + +def transformer_layers_sharded(dp, + ps_devices, + inputs, + num_layers, + hparams, + self_attention_bias=None, + enc_output=None, + attention_type=AttentionType.GLOBAL, + name="transformer"): + """Multi layer transformer, sharded by the data parallelism dp.""" + x = inputs + extra_loss = tf.constant(0.0) + moe_hidden_sizes = [int(s) for s in hparams.moe_hidden_sizes.split(",")] + expert_fn = expert_utils.ffn_expert_fn( + hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) + x = dp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) + for layer in xrange(num_layers): + with tf.variable_scope("%s_layer_%d" % (name, layer)): + # self-attention + if attention_type == AttentionType.LOCAL_2D: + y = 
dp(local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="masked_local_attention_2d")) + elif attention_type == AttentionType.LOCAL_1D: + y = dp(local_attention_1d(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + attention_type="local_mask_right", + q_padding="LEFT", kv_padding="LEFT")) + elif attention_type == AttentionType.GLOCAL: + y = dp(local_global_attention( + common_layers.layer_preprocess(x, hparams), self_attention_bias, + hparams, q_padding="LEFT", kv_padding="LEFT")) + elif attention_type == AttentionType.GLOBAL: + y = dp(full_self_attention(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + q_padding="LEFT", kv_padding="LEFT")) + x = common_layers.layer_postprocess(x, y, hparams) + if enc_output is not None: + y = dp(encdec_attention_1d(common_layers.layer_preprocess(x, hparams), + enc_output, hparams)) + x = dp(common_layers.layer_postprocess, x, y, hparams) + with tf.variable_scope("ffn"): + if str(layer) in hparams.moe_layers_decoder.split(","): + y, loss = expert_utils.distributed_moe( + dp, + ps_devices, + common_layers.layer_preprocess(x, hparams), + hparams.mode == tf.estimator.ModeKeys.TRAIN, + input_size=hparams.hidden_size, + expert_fn=expert_fn, + num_experts=hparams.moe_num_experts, + k=hparams.moe_k, + loss_coef=hparams.moe_loss_coef) + extra_loss += loss + x = dp(common_layers.layer_postprocess, x, y, hparams) + else: + y = dp(ffn_layer, common_layers.layer_preprocess(x, hparams), hparams) + x = dp(common_layers.layer_postprocess, x, y, hparams) + return dp(common_layers.layer_preprocess, x, hparams), extra_loss + + +def postprocess_image(x, rows, cols, hparams): + """Postprocessing after decoding.""" + batch = common_layers.shape_list(x)[0] + channels = 256 + x = tf.reshape(x, [batch, rows, cols, hparams.hidden_size]) + # targets = common_layers.conv(x, 256, (1, 1), name="output_conv") + targets = tf.layers.dense(x, 256, use_bias=True, activation=None, + name="output_conv") + if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + y = targets + y = tf.reshape(y, [batch, -1, hparams.img_len*3, channels]) + yshape = common_layers.shape_list(y) + block_length = hparams.query_shape[0] + block_width = hparams.query_shape[1] + + # Break into block row wise. + y = tf.reshape(y, + [batch, yshape[1] // block_length, + block_length, + yshape[2], channels]) + yshape = common_layers.shape_list(y) + # Break into blocks width wise. + y_blocks = tf.reshape(y, + [batch, yshape[1], yshape[2], + yshape[3] // block_width, + block_width, channels]) + + # Reshape targets as [batch_size, num_blocks_rows, num_block_cols, + # block_length, block_width, channels] + targets = tf.transpose(y_blocks, [0, 1, 3, 2, 4, 5]) + + return targets + + +def prepare_encoder(inputs, hparams, attention_type="local_1d"): + """Prepare encoder for images.""" + x = prepare_image(inputs, hparams, name="enc_channels") + # Add position signals. 
+ x = add_pos_signals(x, hparams, "enc_pos") + x_shape = common_layers.shape_list(x) + if attention_type == "local_1d": + x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], hparams.hidden_size]) + x.set_shape([None, None, hparams.hidden_size]) + elif attention_type == "local_2d": + x.set_shape([None, None, None, hparams.hidden_size]) + return x + + +def prepare_decoder(targets, hparams): + """Prepare decoder for images.""" + targets_shape = common_layers.shape_list(targets) + channels = hparams.num_channels + curr_infer_length = None + + # during training, images are [batch, IMG_LEN, IMG_LEN, 3]. + # At inference, they are [batch, curr_infer_length, 1, 1] + if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + curr_infer_length = targets_shape[1] + if hparams.block_rastor_scan: + assert hparams.img_len*channels % hparams.query_shape[1] == 0 + assert hparams.img_len % hparams.query_shape[0] == 0 + total_block_width = hparams.img_len*channels + # Decoding is in block rastor scan order. We divide the image into + # hparams.query_shape blocks and then decode each block in rastor scan. + # To make that compatible with our inference pipeline, pad the target so + # that rows is a multiple of query_shape and columns is a multiple of + # hparams.img_len*channels + curr_infer_length = targets_shape[1] + block_padding_factor = total_block_width * hparams.query_shape[0] + targets = tf.pad(targets, [ + [0, 0], [0, -curr_infer_length % block_padding_factor], + [0, 0], [0, 0]]) + + num_blocks = total_block_width // hparams.query_shape[1] + # Reshape the image to represent blocks + target_blocks = tf.reshape( + targets, [targets_shape[0], -1, num_blocks, hparams.query_shape[0], + hparams.query_shape[1]]) + # Transpose to read the image in 2D fashion. + targets = tf.transpose(target_blocks, [0, 1, 3, 2, 4]) + else: + # add padding to make sure the size of targets is a multiple of img_height + # times number of channels. This is needed for positional encodings and + # for doing the RGB lookup. + padding_factor = channels * hparams.img_len + targets = tf.pad(targets, [ + [0, 0], [0, -curr_infer_length % padding_factor], [0, 0], [0, 0]]) + targets = tf.reshape(targets, + [targets_shape[0], -1, hparams.img_len, channels]) + # Preprocess image + x = prepare_image(targets, hparams, name="dec_channels") + x_shape = common_layers.shape_list(x) + # mask out upper triangle to avoid looking into the future. + bias = common_attention.attention_bias_lower_triangle(x_shape[1]*x_shape[2]) + if hparams.dec_attention_type == AttentionType.LOCAL_2D: + x = common_attention.right_shift_blockwise(x, hparams.query_shape) + x = add_pos_signals(x, hparams, "dec_pos") + else: + # Add position signals + x = tf.reshape(x, [-1, x_shape[1]*x_shape[2], hparams.hidden_size]) + x = common_layers.shift_right_3d(x) + x = tf.reshape(x, [-1, x_shape[1], x_shape[2], hparams.hidden_size]) + x = add_pos_signals(x, hparams, "dec_pos") + x.set_shape([None, None, None, hparams.hidden_size]) + return x, x_shape[1], x_shape[2], bias + + +def prepare_image(inputs, hparams, name=None): + """Prepare image.""" + inputs_shape = common_layers.shape_list(inputs) + batch = inputs_shape[0] + orig_rows = inputs_shape[1] + orig_cols = inputs_shape[2] + channels = hparams.num_channels + + hidden_size = hparams.hidden_size + # Only do lookup if the embeddings haven't been looked up already. + # if the last dimension is number of channels, then this is very likely the + # channel ids tensor. We have to make sure. 
+ if inputs_shape[-1] == hparams.num_channels: + inputs = tf.to_int32(inputs) + x = get_channel_embeddings(channels, inputs, hidden_size, name=name) + else: + x = inputs + x = tf.reshape(x, [batch, orig_rows, orig_cols * channels, hidden_size]) + + return x + + +def create_output(decoder_output, rows, cols, targets, hparams): + """Create output from decoder output and vars.""" + decoded_image = postprocess_image(decoder_output, rows, cols, hparams) + targets_shape = common_layers.shape_list(targets) + if hparams.mode == tf.estimator.ModeKeys.PREDICT: + # Hardcoding that the number of intensity values is 256. + y = tf.reshape(decoded_image, [targets_shape[0], -1, 1, 1, 256]) + output = y[:, :targets_shape[1], :, :, :] + else: + output = tf.reshape(decoded_image, [ + targets_shape[0], targets_shape[1], targets_shape[2], + targets_shape[3], 256 + ]) + return output + + +def get_channel_embeddings(io_depth, targets, hidden_size, name="channel"): + """Get separate embedding for each of the channels.""" + targets_split = tf.split(targets, io_depth, axis=3) + rgb_embedding_var = tf.get_variable("rgb_target_emb_%s" % name, + [256 * io_depth, hidden_size]) + rgb_embedding_var = tf.identity(rgb_embedding_var) + rgb_embedding_var *= float(hidden_size)**0.5 + channel_target_embs = [] + for i in xrange(io_depth): + # Adding the channel offsets to get the right embedding since the + # embedding tensor has shape 256 * io_depth, hidden_size + target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 + target_embs = common_layers.gather(rgb_embedding_var, target_ids) + channel_target_embs.append(target_embs) + + return tf.concat(channel_target_embs, axis=-1) + + +def add_pos_signals(x, hparams, name="pos_emb"): + with tf.variable_scope(name, reuse=False): + if hparams.pos == "timing": + x = common_attention.add_timing_signal_nd(x) + else: + assert hparams.pos == "emb" + x = common_attention.add_positional_embedding_nd( + x, hparams.max_length, name=name) + return x diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index a94e91473..ed0fa51bf 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -24,8 +24,8 @@ # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.google.models import common_image_attention as cia from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.utils import expert_utils From 887c4108abfccdc45bde793b7eb9b876329489bb Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 6 Feb 2018 12:35:56 -0800 Subject: [PATCH 11/31] Rename hierarchical vq-vae to decomposed vq-vae as in paper. 
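[Editorial sketch.] In get_channel_embeddings above, one [256 * io_depth, hidden] table serves all channels, and channel i is offset into rows [i*256, (i+1)*256). With a hypothetical pixel:

    import numpy as np

    io_depth, hidden = 3, 8
    table = np.random.randn(256 * io_depth, hidden)
    pixel = np.array([12, 200, 31])                  # invented (r, g, b)
    rows = pixel + np.arange(io_depth) * 256         # [12, 456, 543]
    emb = np.concatenate([table[r] for r in rows])   # shape [io_depth * hidden]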
PiperOrigin-RevId: 184722477 --- tensor2tensor/models/transformer_vae.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ed0fa51bf..08d2bdf2c 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -760,7 +760,7 @@ def __init__(self, *args, **kwargs): self._hparams.block_v_size = int(self._hparams.block_v_size) if self._hparams.reshape_method == "project": - tf.logging.info("Using random projections for hierarchical vq-vae") + tf.logging.info("Using projections for decomposed vq-vae") tf.logging.info("Trainable projections = {}".format( self._hparams.trainable_projections)) self._hparams.projection_tensors = tf.get_variable( @@ -773,7 +773,7 @@ def __init__(self, *args, **kwargs): trainable=self._hparams.trainable_projections) self._hparams.reshape_fn = project_hidden elif self._hparams.reshape_method == "slice": - tf.logging.info("Using slices for hierarchical vq-vae") + tf.logging.info("Using slices for decomposed vq-vae") self._hparams.reshape_fn = slice_hidden else: raise ValueError("Unknown reshape method") @@ -896,7 +896,7 @@ def transformer_ae_small(): hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) hparams.add_hparam("num_decode_blocks", 1) - # Reshape method for hierarchical vq-vae: slice, project + # Reshape method for decomposed vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) hparams.add_hparam("unmasked_percentage", 0.1) From 21c9fed2c2a54d3ce286842dd8879a02743d57a4 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 6 Feb 2018 14:09:08 -0800 Subject: [PATCH 12/31] Add unmasked version of memory reduced attention PiperOrigin-RevId: 184737000 --- tensor2tensor/layers/common_attention.py | 51 +++++++++++++++--------- tensor2tensor/models/transformer_moe.py | 4 +- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index e82e6d471..229521cc5 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -192,9 +192,9 @@ def memeff_attention_fn(*args, **kwargs): attention_type="local_mask_right", ) - # === Memory-compressed multihead self attention layer === + # === Masked memory-compressed multihead self attention layer === # Only works for self attention. Always mask the future. - compressed_attention_fn = register_layer( + compressed_attention_masked_fn = register_layer( multihead_self_attention_reduced, default_kwargs=dict( factor=hparams.attention_red_factor, @@ -209,6 +209,13 @@ def memeff_attention_fn(*args, **kwargs): ), ) + # === Unmasked memory-compressed multihead self attention layer === + # Only works for self attention. Never mask the future. 
Bias never added + compressed_attention_fn = partial( + compressed_attention_masked_fn, + add_mask=False, + ) + # Feed-forwards layers: # === Mixture of expert layer === @@ -259,14 +266,17 @@ def memeff_attention_fn(*args, **kwargs): # Define all available layers layers = dict( + # Attention layers: a=multihead_attention_fn, # Multihead full attention loc=local_attention_fn, # Local attention - locm=local_attention_masked_fn, # Local masked attention + locm=local_attention_masked_fn, # Local attention (masked) red=compressed_attention_fn, # Memory-compressed attention + redm=compressed_attention_masked_fn, # Memory-compressed att (masked) mem=memeff_attention_fn, # Memory efficient - fc=conv_hidden_relu, - sep=sep_conv_relu, # Fully connected - sepm=sep_conv_relu_masked, # masked separable convolution + # Feed-forward layers: + fc=conv_hidden_relu, # Fully connected + sep=sep_conv_relu, # Separable convolution (unmasked) + sepm=sep_conv_relu_masked, # Separable convolution (masked) moe=distributed_moe, # Mixture of expert layer ) return layers @@ -3415,6 +3425,7 @@ def multihead_self_attention_reduced( multihead_params=None, nonlinearity="none", reduction_type="conv", + add_mask=True, ): """Reduce the length dimension by compressing with conv. @@ -3426,6 +3437,7 @@ def multihead_self_attention_reduced( multihead_params (dict): parameters for multihead attention nonlinearity (str): Add some non-linearity after the memory block reduction_type (str): type of compression + add_mask (bool): If True, add the bias to prevent attention to the future Returns: (tf.Tensor): float32 of shape [batch, length, depth] @@ -3475,18 +3487,21 @@ def construct_bias_vectors(t, axis): # [1, length_k] or [length_q, 1] return length_coordinates - bias = tf.to_float( - tf.greater( - # Because we add the first elem to the memory block and it can be - # attended by anyone,we don't need to add +1 anymore to prevent self - # attention Use * factor to make sure the last tokens of a block - # cannot attend the block - construct_bias_vectors(memory_x, 0) * factor, - # +epsilon to avoid float equality - construct_bias_vectors(x, 1) + 1e-3, - )) * -1e9 - bias = tf.expand_dims(bias, axis=0) - bias = tf.expand_dims(bias, axis=0) # [1, 1, length_k, length_q] + if add_mask: # Create mask to prevent attention to the future + bias = tf.to_float( + tf.greater( + # Because we add the first elem to the memory block and it can be + # attended by anyone,we don't need to add +1 anymore to prevent self + # attention Use * factor to make sure the last tokens of a block + # cannot attend the block + construct_bias_vectors(memory_x, 0) * factor, + # +epsilon to avoid float equality + construct_bias_vectors(x, 1) + 1e-3, + )) * -1e9 + bias = tf.expand_dims(bias, axis=0) + bias = tf.expand_dims(bias, axis=0) # [1, 1, length_k, length_q] + else: + bias = None return multihead_attention( query_antecedent=x, diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index efa67bf27..2bf807b19 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -329,7 +329,7 @@ def transformer_moe_8k_lm(): # * Memory efficient multihead attention (slow): # hparams.layer_types = "#mem/mem/mem-moe/mem/mem" # * Alternate between local/compressed attention layers (faster): - # hparams.layer_types = "#locm/red/locm-moe/red/locm" + # hparams.layer_types = "#locm/redm/locm-moe/redm/locm" return hparams @@ -386,6 +386,6 @@ def transformer_moe_prepend_8k(): 
hparams.eval_drop_long_sequences = False hparams.max_input_seq_length = 7500, hparams.default_ff = "sepm" - hparams.layer_types = "locm/red/locm-moe/red/locm" + hparams.layer_types = "locm/redm/locm-moe/redm/locm" hparams.moe_num_experts = 256 return hparams From f04475680c47781fd3c34b6c37cb2309715eff56 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 6 Feb 2018 15:07:36 -0800 Subject: [PATCH 13/31] T2TModel.has_input checks for "inputs" PiperOrigin-RevId: 184747531 --- tensor2tensor/utils/t2t_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 0623a975e..225c4d19b 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -124,7 +124,7 @@ def hparams(self): @property def has_input(self): if self._problem_hparams: - return self._problem_hparams.input_modality + return "inputs" in self._problem_hparams.input_modality else: return True From 361f5d195d4590f34489d26543e8b38c3d7385a8 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 6 Feb 2018 18:25:12 -0800 Subject: [PATCH 14/31] Use beam search for latent prediction in transformer ae (only for num_blocks=1 for now). PiperOrigin-RevId: 184774591 --- tensor2tensor/models/transformer_vae.py | 40 +++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 08d2bdf2c..835470d57 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -28,6 +28,7 @@ from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer +from tensor2tensor.utils import beam_search from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -566,8 +567,42 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): return sample, loss +def ae_latent_sample_beam(latents_dense_in, inputs, ed, embed, hparams): + """Sample from the latent space in the autoencoder.""" + vocab_size = 2**hparams.z_size + beam_size = 1 # TODO(lukaszkaiser): larger beam sizes seem to work bad. + inputs = tf.tile(inputs, [beam_size, 1, 1]) + ed = tf.tile(ed, [beam_size, 1, 1, 1]) + + def symbols_to_logits_fn(ids): + """Go from ids to logits.""" + ids = tf.expand_dims(ids, axis=2) # Ids start with added all-zeros. + latents_discrete = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0]]) + + with tf.variable_scope(tf.get_variable_scope(), reuse=False): + latents_dense = embed(latents_discrete) + latents_pred = decode_transformer( + inputs, ed, latents_dense, hparams, "extra") + logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits") + current_output_position = common_layers.shape_list(ids)[1] - 1 + logits = logits[:, current_output_position, :, :] + return tf.squeeze(logits, axis=[1]) + + initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32) + length = tf.shape(latents_dense_in)[1] + ids, _ = beam_search.beam_search( + symbols_to_logits_fn, initial_ids, beam_size, length, + vocab_size, alpha=0.0, eos_id=-1, stop_early=False) + + res = tf.expand_dims(ids[:, 0, :], axis=2) # Pick first beam. + return res[:, 1:] # Remove the added all-zeros from ids. 
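[Editorial sketch.] A toy use of the same beam_search API that ae_latent_sample_beam above relies on; the logits function here is a dummy that always prefers id 0, and eos_id=-1 with stop_early=False forces decoding to run the full length:

    import tensorflow as tf
    from tensor2tensor.utils import beam_search

    def toy_symbols_to_logits_fn(ids):               # ids: [batch*beam, time]
      return tf.one_hot(tf.zeros_like(ids[:, -1]), 8) * 10.0

    initial_ids = tf.zeros([4], dtype=tf.int32)
    out_ids, _ = beam_search.beam_search(
        toy_symbols_to_logits_fn, initial_ids, 1, 5, 8,
        alpha=0.0, eos_id=-1, stop_early=False)
    # out_ids: [4, 1, 6]; position 0 is the seeded zero, hence res[:, 1:] above.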
+ + def ae_latent_sample(latents_dense, inputs, ed, embed, iters, hparams): """Sample from the latent space in the autoencoder.""" + if hparams.num_decode_blocks < 2: + # TODO(lukaszkaiser): beam-search only works in non-blocked mode for now. + return ae_latent_sample_beam(latents_dense, inputs, ed, embed, hparams) latents_pred = decode_transformer(inputs, ed, latents_dense, hparams, "extra") latents_discrete, _ = ae_latent_softmax(latents_pred, None, hparams) @@ -680,7 +715,8 @@ def bn_inputs(): ema_count, ema_means) latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :]) if cache is None: - cache = ae_latent_sample(latents_dense, inputs, ed, embed, 8, hparams) + cache = ae_latent_sample( + latents_dense, inputs, ed, embed, 16, hparams) latents_dense = embed(cache) # Postprocess. d = latents_dense @@ -890,7 +926,7 @@ def transformer_ae_small(): hparams.optimizer_adam_beta1 = 0.9 hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999. hparams.add_hparam("z_size", 14) - hparams.add_hparam("noise_dev", 0.0) + hparams.add_hparam("noise_dev", 0.5) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. hparams.add_hparam("bottleneck_kind", "semhash") From 416f78458118ea364da2abf3574d53d27d5c99c7 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 6 Feb 2018 20:15:50 -0800 Subject: [PATCH 15/31] Support launching T2T on --cloud_mlengine with CPU, GPU, and multi-GPU machines PiperOrigin-RevId: 184782692 --- tensor2tensor/bin/t2t_trainer.py | 23 ++- tensor2tensor/data_generators/problem.py | 22 ++- tensor2tensor/layers/common_attention.py | 2 +- tensor2tensor/layers/common_layers.py | 10 +- tensor2tensor/layers/modalities.py | 2 +- tensor2tensor/models/revnet.py | 2 +- tensor2tensor/models/transformer_vae.py | 12 +- tensor2tensor/utils/beam_search.py | 2 +- tensor2tensor/utils/cloud_mlengine.py | 137 ++++++++++++++++++ .../utils/{cloud.py => cloud_tpu.py} | 13 +- tensor2tensor/utils/diet.py | 4 +- 11 files changed, 203 insertions(+), 26 deletions(-) create mode 100644 tensor2tensor/utils/cloud_mlengine.py rename tensor2tensor/utils/{cloud.py => cloud_tpu.py} (97%) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 8f1f0dfdc..051c2e90d 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -26,7 +26,8 @@ from tensor2tensor import models # pylint: disable=unused-import from tensor2tensor import problems as problems_lib # pylint: disable=unused-import -from tensor2tensor.utils import cloud +from tensor2tensor.utils import cloud_mlengine +from tensor2tensor.utils import cloud_tpu from tensor2tensor.utils import decoding from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import from tensor2tensor.utils import registry @@ -81,6 +82,18 @@ flags.DEFINE_bool("cloud_delete_on_done", False, "Whether to delete the VM and TPU instance when done.") +# Google Cloud ML Engine +flags.DEFINE_bool("cloud_mlengine", False, + "Whether to launch on Cloud ML Engine.") +flags.DEFINE_string("cloud_mlengine_master_type", None, + "Machine type for master on Cloud ML Engine. " + "If provided, overrides default selections based on " + "--worker_gpu. User is responsible for ensuring " + "type is valid and that --worker_gpu matches number of " + "GPUs on machine type. 
See documentation: " + "https://cloud.google.com/ml-engine/reference/rest/v1/" + "projects.jobs#traininginput") + def get_problem_name(): problems = FLAGS.problems.split("-") @@ -244,7 +257,7 @@ def maybe_cloud_tpu(): "be gs:// paths, i.e. on Google Cloud Storage.") FLAGS.use_tpu = True - with cloud.cloud_tpu( + with cloud_tpu.cloud_tpu( FLAGS.cloud_vm_name, FLAGS.cloud_tpu_name, delete_on_done=FLAGS.cloud_delete_on_done) as tpu_master: @@ -258,6 +271,12 @@ def main(_): usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() + if FLAGS.cloud_mlengine: + assert not FLAGS.cloud_tpu + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") + return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + if FLAGS.generate_data: generate_data() diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 929f547ee..a2c330c2d 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -517,16 +517,26 @@ def _maybe_reverse_and_copy(example): if shuffle_files: random.shuffle(data_files) dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files)) - dataset = dataset.apply( - tf.contrib.data.parallel_interleave( - _load_records, sloppy=is_training, cycle_length=8)) + + if hasattr(tf.contrib.data, "parallel_interleave"): + dataset = dataset.apply( + tf.contrib.data.parallel_interleave( + _load_records, sloppy=is_training, cycle_length=8)) + else: + dataset = dataset.interleave(_load_records, cycle_length=8, + block_length=16) + if repeat: dataset = dataset.repeat() dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) if preprocess: - dataset = dataset.apply( - tf.contrib.data.parallel_interleave( - _preprocess, sloppy=is_training, cycle_length=8)) + if hasattr(tf.contrib.data, "parallel_interleave"): + dataset = dataset.apply( + tf.contrib.data.parallel_interleave( + _preprocess, sloppy=is_training, cycle_length=8)) + else: + dataset = dataset.interleave(_preprocess, cycle_length=8, + block_length=16) dataset = dataset.map( _maybe_reverse_and_copy, num_parallel_calls=num_threads) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 229521cc5..0f400c575 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -3402,7 +3402,7 @@ def pad_and_reshape(x): block_length, block_length, # Restore the block length dimension ]) - weights = tf.reduce_sum(weights, axis=3, keepdims=True) # Compress block + weights = tf.reduce_sum(weights, axis=3, keep_dims=True) # Compress block v_out = tf.matmul(weights, v) # [1, block_length] @ [block_length, depth] v_out = tf.squeeze(v_out, axis=3) return v_out diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index c8d54fb99..7b22dc44b 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -509,8 +509,8 @@ def layer_norm_vars(filters): def layer_norm_compute_python(x, epsilon, scale, bias): """Layer norm raw computation.""" - mean = tf.reduce_mean(x, axis=[-1], keepdims=True) - variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) + mean = tf.reduce_mean(x, axis=[-1], keep_dims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True) norm_x = (x - mean) * tf.rsqrt(variance + epsilon) return norm_x * scale + bias @@ -1171,7 +1171,7 @@ def mask_from_embedding(emb): Returns: a 0.0/1.0 Tensor with shape [batch, 
width, height, 1]. """ - return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keepdims=True)) + return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keep_dims=True)) def mask_leq(target_length, source_length): @@ -1703,7 +1703,7 @@ def smoothing_cross_entropy(logits, depth=vocab_size, on_value=confidence, off_value=low_confidence) - xentropy = tf.nn.softmax_cross_entropy_with_logits_v2( + xentropy = tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=soft_targets) return xentropy - normalizing @@ -1737,7 +1737,7 @@ def global_pool_1d(inputs, pooling_type="MAX", mask=None): if mask is not None: # Some elems are dummy elems so we can't just reduce the average. output = tf.reduce_sum(inputs, axis=1) - num_elems = tf.reduce_sum(mask, axis=1, keepdims=True) + num_elems = tf.reduce_sum(mask, axis=1, keep_dims=True) output = tf.div(output, tf.maximum(num_elems, 1)) else: output = tf.reduce_mean(inputs, axis=1) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index ea289103b..f5788701c 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -461,7 +461,7 @@ def top(self, body_output, _): """ with tf.variable_scope(self.name): x = body_output - x = tf.reduce_mean(x, axis=[1, 2], keepdims=True) + x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) res = tf.layers.dense(x, self._vocab_size) return tf.expand_dims(res, 3) diff --git a/tensor2tensor/models/revnet.py b/tensor2tensor/models/revnet.py index 28b4cf681..3a6c7b32b 100644 --- a/tensor2tensor/models/revnet.py +++ b/tensor2tensor/models/revnet.py @@ -277,7 +277,7 @@ def final_block(x1, x2, dim='2d', training=True, scope='final_block'): # Global average pooling net = tf.reduce_mean(y, CONFIG[dim]['reduction_dimensions'], - name='final_pool', keepdims=True) + name='final_pool', keep_dims=True) return net diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 835470d57..0ce43081b 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -92,9 +92,9 @@ def top_k_softmax(x, k): """Calculate softmax(x), select top-k and rescale to sum to 1.""" x = tf.nn.softmax(x) top_x, _ = tf.nn.top_k(x, k=k+1) - min_top = tf.reduce_min(top_x, axis=-1, keepdims=True) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) x = tf.nn.relu((x - min_top) + 1e-12) - x /= tf.reduce_sum(x, axis=-1, keepdims=True) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) return x, tf.reduce_max(top_x, axis=-1) @@ -142,7 +142,7 @@ def dae(x, hparams, name): maxvhot = tf.stop_gradient(tf.one_hot(maxvec, hparams.v_size)) # Add losses that prevent too few being used. 
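[Editorial note on the many keepdims -> keep_dims renames in this patch.] tf.reduce_* only grew the keepdims spelling in TF 1.5, so reverting to keep_dims presumably keeps the code runnable on the TF 1.4 runtime the new Cloud ML Engine launcher below pins (runtimeVersion '1.4'); the older spelling still works on later 1.x releases:

    x = tf.random_normal([2, 3, 4])
    mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)  # accepted on TF 1.4 and later 1.x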
distrib = tf.reshape(logsm, [-1, hparams.v_size]) * maxvhot - d_mean = tf.reduce_mean(distrib, axis=[0], keepdims=True) + d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) d_dev = - tf.reduce_mean(d_variance) ret = s @@ -202,8 +202,8 @@ def slice_hidden(x, hparams): def nearest(x, means, hparams): """Find the nearest means to elements in x.""" x_reshaped = hparams.reshape_fn(x, hparams) - x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keepdims=True) - means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keepdims=True) + x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keep_dims=True) + means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) scalar_prod = tf.matmul( tf.transpose(x_reshaped, perm=[1, 0, 2]), tf.transpose(means, perm=[0, 2, 1])) @@ -393,7 +393,7 @@ def embed(x): tf.transpose(x_reshaped, perm=[1, 0, 2])) updated_ema_means = moving_averages.assign_moving_average( ema_means, dw, hparams.decay, zero_debias=False) - n = tf.reduce_sum(updated_ema_count, axis=-1, keepdims=True) + n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) updated_ema_count = ((updated_ema_count + hparams.epsilon) / (n + hparams.v_size * hparams.epsilon) * n) updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 3841b5953..3c7b8c203 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -90,7 +90,7 @@ def get_state_shape_invariants(tensor): def log_prob_from_logits(logits): - return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True) def compute_batch_indices(batch_size, beam_size): diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py new file mode 100644 index 000000000..12d89f470 --- /dev/null +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Launch on GCP's ML Engine.""" + +import os +import sys +import tempfile + +from googleapiclient import discovery +from oauth2client.client import GoogleCredentials +from tensor2tensor.utils import cloud_tpu as cloud +import tensorflow as tf + +CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' + +# TODO(rsepassi): +# * Support t2t_usr_dir +# * Support --autotune +# * Add documentation clould_mlengine.md +# * Enable multi-machine sync/async training + + +def args_dict_as_args(args_dict): + del args_dict['cloud_mlengine'] + args = [] + for name, val in args_dict.items(): + if val is None: + continue + args.extend(['--%s' % name, str(val)]) + return args + + +def machine_config(num_gpus=1, use_tpu=False, master_type=None): + """Return dict specifying machine config for trainingInput.""" + scale_tier = 'BASIC_GPU' + if use_tpu: + scale_tier = 'BASIC_TPU' + elif num_gpus <= 0: + scale_tier = 'BASIC' + elif num_gpus > 1: + scale_tier = 'CUSTOM' + + config = {'scaleTier': scale_tier} + + if scale_tier == 'CUSTOM': + assert num_gpus > 1 + if num_gpus not in [4, 8]: + raise ValueError('Must use exactly 1, 4, or 8 GPUs.') + config['masterType'] = ('complex_model_m_gpu' + if num_gpus == 4 else 'complex_model_l_gpu') + + if master_type: + config['masterType'] = master_type + + return config + + +def configure_job(flags_dict): + """Construct jobSpec for ML Engine job.""" + train_dir = flags_dict['output_dir'] + assert train_dir.startswith('gs://') + job_name = os.path.basename(train_dir) + + # See documentation: + # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput + training_input = { + 'packageUris': [os.path.join(train_dir, 'tensor2tensor.tar.gz')], + 'pythonModule': 'tensor2tensor.bin.t2t_trainer', + 'args': args_dict_as_args(flags_dict), + 'region': cloud.default_region(), + 'runtimeVersion': '1.4', + 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', + } + training_input.update( + machine_config( + num_gpus=flags_dict['worker_gpu'], + use_tpu=flags_dict['use_tpu'], + master_type=flags_dict['cloud_mlengine_master_type'])) + + if training_input['scaleTier'] == 'CUSTOM': + assert 'masterType' in training_input + + job_spec = {'jobId': job_name, 'trainingInput': training_input} + return job_spec + + +def launch_job(job_spec): + """Launch job on ML Engine.""" + project_id = 'projects/{}'.format(cloud.default_project()) + credentials = GoogleCredentials.get_application_default() + cloudml = discovery.build('ml', 'v1', credentials=credentials) + request = cloudml.projects().jobs().create(body=job_spec, parent=project_id) + request.execute() + + +def tar_and_copy_t2t(train_dir, usr_dir): + """Tar Tensor2Tensor and cp to train_dir.""" + tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + location = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + tmp_dir = tempfile.gettempdir() + cloud.shell_run( + 'tar -zcf {tmp_dir}/tensor2tensor.tar.gz -C {location} .', + location=location, + tmp_dir=tmp_dir) + cloud.shell_run( + ('gsutil cp {tmp_dir}/tensor2tensor.tar.gz ' + '{train_dir}/tensor2tensor.tar.gz'), + tmp_dir=tmp_dir, + train_dir=train_dir.strip('/')) + if usr_dir: + raise ValueError('--t2t_usr_dir is not currently supported in conjunction ' + 'with auto-launching on Cloud ML Engine.') + + +def launch(flags_dict): + job_spec = configure_job(flags_dict) + job_name = job_spec['jobId'] + tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, + job_spec) + assert cloud.confirm() + 
tar_and_copy_t2t(flags_dict['output_dir'], flags_dict['t2t_usr_dir']) + launch_job(job_spec) + tf.logging.info('Launched %s. See console to track: %s.', job_name, + CONSOLE_URL) diff --git a/tensor2tensor/utils/cloud.py b/tensor2tensor/utils/cloud_tpu.py similarity index 97% rename from tensor2tensor/utils/cloud.py rename to tensor2tensor/utils/cloud_tpu.py index 937c6ee46..53dd36bd0 100644 --- a/tensor2tensor/utils/cloud.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Launch on GCP.""" +"""Launch on TPU on GCP.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -191,6 +191,9 @@ def create_tpu(cls): gcloud compute ssh {name} -- -N """ + DEFAULT_PROJECT = "gcloud config get-value project" + DEFAULT_REGION = "gcloud config get-value compute/region" + @contextlib.contextmanager def shell_background(cmd_, **kwargs): @@ -224,6 +227,14 @@ def format_cmd(cmd_, **kwargs): return cmd_.format(**kwargs).strip().split() +def default_region(): + return shell_output(Gcloud.DEFAULT_REGION).strip() + + +def default_project(): + return shell_output(Gcloud.DEFAULT_PROJECT).strip() + + def create_vm(vm_name): out = shell_output(Gcloud.create_vm(), name=vm_name) return out.split("\n")[1:-1][0].split()[4] diff --git a/tensor2tensor/utils/diet.py b/tensor2tensor/utils/diet.py index 19702338b..7ecfba693 100644 --- a/tensor2tensor/utils/diet.py +++ b/tensor2tensor/utils/diet.py @@ -193,10 +193,10 @@ def update_variable(self, var, grad_var): beta2_pow = tf.pow(params.beta2, global_step) if params.factored_second_moment_accumulator and len(var.shape) == 2: vr_update = tf.assign(slots["adam_vr"], slots["adam_vr"] * params.beta2 + - tf.reduce_mean(grad_squared, 1, keepdims=True) * + tf.reduce_mean(grad_squared, 1, keep_dims=True) * (1.0 - params.beta2)) vc_update = tf.assign(slots["adam_vc"], slots["adam_vc"] * params.beta2 + - tf.reduce_mean(grad_squared, 0, keepdims=True) * + tf.reduce_mean(grad_squared, 0, keep_dims=True) * (1.0 - params.beta2)) with tf.control_dependencies([vr_update, vc_update]): vr = tf.sqrt(slots["adam_vr"] / (1.0 - beta2_pow)) + params.epsilon From 0b2c8c2340065345cd2eba7578d4b2c173fdaa0e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 10:22:43 -0800 Subject: [PATCH 16/31] Add dep on google-api-python-client PiperOrigin-RevId: 184857261 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1d3f14a94..2ee6e74ee 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ 'flask', 'future', 'gevent', + 'google-api-python-client', 'gunicorn', 'numpy', 'requests', From 2cde9c2b74d016a9bffe5520ebe06d3cd671ad47 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 7 Feb 2018 11:02:02 -0800 Subject: [PATCH 17/31] Clean up obsolete ae cifar hparams, increase default batch size. 
PiperOrigin-RevId: 184863791 --- tensor2tensor/models/transformer_vae.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 0ce43081b..59f2d08ba 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -854,7 +854,7 @@ def body(self, features): return res, loss def prepare_features_for_infer(self, features): - if not self._hparams.do_ae: + if self._hparams.do_mask or not self._hparams.do_ae: return features beam_batch_size = self._decode_hparams.beam_size beam_batch_size *= self._decode_hparams.batch_size @@ -970,25 +970,6 @@ def transformer_ae_small(): return hparams -@registry.register_hparams -def transformer_ae_cifar(): - """Hyperparameters for CIFAR-10 experiments.""" - hparams = transformer_ae_small() - hparams.hidden_size = 256 - hparams.filter_size = 512 - hparams.batch_size = 1024 * 4 - hparams.num_compress_steps = 2 - hparams.v_size = 1024 * 64 - hparams.kl_warmup_steps = 150000 - hparams.startup_steps = 10000 - hparams.kmeans_lr_factor = 0.0 - hparams.is_2d = 1 - hparams.learning_rate_warmup_steps = 8000 - hparams.learning_rate = 0.2 - hparams.ffn_layer = "conv_hidden_relu_with_sepconv" - return hparams - - @registry.register_hparams def imagetransformer_ae_cifar(): """Hyperparameters for CIFAR-10 experiments.""" @@ -1065,7 +1046,7 @@ def imagetransformer_ae_cifar(): def transformer_ae_base(): """Set of hyperparameters.""" hparams = transformer_ae_small() - hparams.batch_size = 1024 + hparams.batch_size = 2048 hparams.hidden_size = 512 hparams.filter_size = 4096 hparams.num_hidden_layers = 6 From 963f44cfec617f56bac21f9ca8bf190c356a8797 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 11:44:48 -0800 Subject: [PATCH 18/31] Support --t2t_usr_dir on Cloud ML Engine PiperOrigin-RevId: 184871178 --- tensor2tensor/utils/cloud_mlengine.py | 95 ++++++++++++++++++++++----- tensor2tensor/utils/usr_dir.py | 13 +++- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 12d89f470..0bb5ddc40 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -16,25 +16,40 @@ """Launch on GCP's ML Engine.""" import os +import shutil import sys import tempfile from googleapiclient import discovery from oauth2client.client import GoogleCredentials from tensor2tensor.utils import cloud_tpu as cloud +from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): -# * Support t2t_usr_dir # * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training +SETUP_PY = """ +from setuptools import find_packages +from setuptools import setup +setup( + name='DummyUsrDirPackage', + version='0.1', + packages=find_packages(), +) +""" + def args_dict_as_args(args_dict): + """Convert dict to list of args suitable for passing on cmd line.""" + args_dict = dict(args_dict) del args_dict['cloud_mlengine'] + # Configured later + del args_dict['t2t_usr_dir'] args = [] for name, val in args_dict.items(): if val is None: @@ -77,7 +92,6 @@ def configure_job(flags_dict): # See documentation: # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { - 'packageUris': [os.path.join(train_dir, 'tensor2tensor.tar.gz')], 'pythonModule': 
'tensor2tensor.bin.t2t_trainer', 'args': args_dict_as_args(flags_dict), 'region': cloud.default_region(), @@ -106,32 +120,81 @@ def launch_job(job_spec): request.execute() -def tar_and_copy_t2t(train_dir, usr_dir): - """Tar Tensor2Tensor and cp to train_dir.""" - tf.logging.info('Tarring and pushing local Tensor2Tensor package.') - location = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - tmp_dir = tempfile.gettempdir() +def _tar_and_copy(src_dir, target_dir): + """Tar and gzip src_dir and copy to GCS target_dir.""" + src_dir = src_dir.rstrip('/') + target_dir = target_dir.rstrip('/') + tmp_dir = tempfile.gettempdir().rstrip('/') + src_base = os.path.basename(src_dir) cloud.shell_run( - 'tar -zcf {tmp_dir}/tensor2tensor.tar.gz -C {location} .', - location=location, + 'tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .', + src_dir=src_dir, + src_base=src_base, tmp_dir=tmp_dir) + final_destination = '%s/%s.tar.gz' % (target_dir, src_base) cloud.shell_run( - ('gsutil cp {tmp_dir}/tensor2tensor.tar.gz ' - '{train_dir}/tensor2tensor.tar.gz'), + ('gsutil cp {tmp_dir}/{src_base}.tar.gz ' + '{final_destination}'), tmp_dir=tmp_dir, - train_dir=train_dir.strip('/')) - if usr_dir: - raise ValueError('--t2t_usr_dir is not currently supported in conjunction ' - 'with auto-launching on Cloud ML Engine.') + src_base=src_base, + final_destination=final_destination) + return final_destination + + +def tar_and_copy_t2t(train_dir): + """Tar Tensor2Tensor and cp to train_dir.""" + tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + t2t_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + t2t_tar = _tar_and_copy(t2t_dir, train_dir) + return t2t_tar + + +def tar_and_copy_usr_dir(usr_dir, train_dir): + """Package, tar, and copy usr_dir to GCS train_dir.""" + tf.logging.info('Tarring and pushing t2t_usr_dir.') + usr_dir = os.path.abspath(os.path.expanduser(usr_dir)) + # Copy usr dir to a temp location + top_dir = os.path.join(tempfile.gettempdir(), 't2t_usr_container') + tmp_usr_dir = os.path.join(top_dir, usr_dir_lib.INTERNAL_USR_DIR_PACKAGE) + shutil.rmtree(top_dir, ignore_errors=True) + shutil.copytree(usr_dir, tmp_usr_dir) + # Insert setup.py if one does not exist + top_setup_fname = os.path.join(top_dir, 'setup.py') + usr_setup_fname = os.path.join(tmp_usr_dir, 'setup.py') + if tf.gfile.Exists(usr_setup_fname): + tf.gfile.Move(usr_setup_fname, top_setup_fname) + else: + with tf.gfile.Open(top_setup_fname, 'w') as f: + f.write(SETUP_PY) + usr_tar = _tar_and_copy(top_dir, train_dir) + return usr_tar + + +def configure_trainer_package(job_spec, t2t_tar): + assert t2t_tar.startswith('gs://') + job_spec['trainingInput']['packageUris'] = [t2t_tar] + + +def configure_usr_dir(job_spec, usr_tar): + assert usr_tar.startswith('gs://') + job_spec['trainingInput']['packageUris'].append(usr_tar) + usr_args = ['--t2t_usr_dir', usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] + job_spec['trainingInput']['args'].extend(usr_args) def launch(flags_dict): + """Launch t2t_trainer on Cloud ML Engine.""" job_spec = configure_job(flags_dict) job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - tar_and_copy_t2t(flags_dict['output_dir'], flags_dict['t2t_usr_dir']) + train_dir = flags_dict['output_dir'] + t2t_tar = tar_and_copy_t2t(train_dir) + configure_trainer_package(job_spec, t2t_tar) + if flags_dict['t2t_usr_dir']: + usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) + 
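+    # ML Engine pip-installs everything listed in packageUris on the
+    # workers, so the user code gets re-imported there under
+    # INTERNAL_USR_DIR_PACKAGE (see the usr_dir.py change below).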
configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. See console to track: %s.', job_name, CONSOLE_URL) diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py index d89745b98..5edd6f1a2 100644 --- a/tensor2tensor/utils/usr_dir.py +++ b/tensor2tensor/utils/usr_dir.py @@ -27,13 +27,20 @@ import tensorflow as tf +INTERNAL_USR_DIR_PACKAGE = "t2t_usr_dir_internal" + + def import_usr_dir(usr_dir): """Import module at usr_dir, if provided.""" if not usr_dir: return - dir_path = os.path.expanduser(usr_dir) - if dir_path[-1] == "/": - dir_path = dir_path[:-1] + if usr_dir == INTERNAL_USR_DIR_PACKAGE: + # The package has been installed with pip under this name for Cloud ML + # Engine so just import it. + importlib.import_module(INTERNAL_USR_DIR_PACKAGE) + return + + dir_path = os.path.abspath(os.path.expanduser(usr_dir).rstrip("/")) containing_dir, module_name = os.path.split(dir_path) tf.logging.info("Importing user module %s from path %s", module_name, containing_dir) From 62bc2e77efd3ccba424fa1f29d3035102f0fa9a8 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 12:23:29 -0800 Subject: [PATCH 19/31] Add a test for the t2t_trainer script PiperOrigin-RevId: 184877423 --- .travis.yml | 2 +- tensor2tensor/bin/t2t_trainer_test.py | 50 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tensor2tensor/bin/t2t_trainer_test.py diff --git a/.travis.yml b/.travis.yml index 2cdcd85bf..e703a2bc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ script: - python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)" # Run tests - - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py + - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py --ignore=tensor2tensor/bin/t2t_trainer_test.py - pytest tensor2tensor/utils/registry_test.py - pytest tensor2tensor/utils/trainer_lib_test.py diff --git a/tensor2tensor/bin/t2t_trainer_test.py b/tensor2tensor/bin/t2t_trainer_test.py new file mode 100644 index 000000000..b1f38cec5 --- /dev/null +++ b/tensor2tensor/bin/t2t_trainer_test.py @@ -0,0 +1,50 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for t2t_trainer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.bin import t2t_trainer +from tensor2tensor.utils import trainer_lib_test + +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +class TrainerTest(tf.test.TestCase): + + @classmethod + def setUpClass(cls): + trainer_lib_test.TrainerLibTest.setUpClass() + + def testTrain(self): + FLAGS.problems = "tiny_algo" + FLAGS.model = "transformer" + FLAGS.hparams_set = "transformer_tiny" + FLAGS.train_steps = 1 + FLAGS.eval_steps = 1 + FLAGS.output_dir = tf.test.get_temp_dir() + FLAGS.data_dir = tf.test.get_temp_dir() + t2t_trainer.main(None) + + +if __name__ == "__main__": + tf.test.main() From 0c3218dfc760930e0a549e6676a5eb2bdf7488a9 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 7 Feb 2018 15:27:28 -0800 Subject: [PATCH 20/31] The most basic version of the "supervised" attention experiment. PiperOrigin-RevId: 184906398 --- tensor2tensor/layers/common_attention.py | 28 ++++++++++++++++++++++++ tensor2tensor/models/transformer.py | 19 ++++++++++++---- tensor2tensor/models/transformer_test.py | 12 ++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 0f400c575..63bf8d6cd 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -327,6 +327,34 @@ def add_standard_attention_hparams(hparams): return hparams +def encoder_decoder_attention_loss(expected_attention, actual_attentions): + """Computes encdec attention loss between expected and actual attentions. + + Args: + expected_attention: Tensor storing the expected encoder-decoder attention + weights with shape [batch_size, target_length, input_length]. + actual_attentions: Dictionary with actual attention weights for different + attention types and hidden layers. + + Returns: + MSE loss between the actual and expected attention weights. + """ + # For each hidden layer, we have an attention weight tensor with shape + # [batch_size, num_heads, target_length, input_length]. + actual_encdec_attention_weights = [ + t for layer_key, t in actual_attentions.items() + if "encdec_attention" in layer_key + ] + # Stack all hidden layer attention weight tensors to get a tensor with shape + # [num_hidden_layers, batch_size, num_heads, target_length, input_length]. + actual_attention_weights = tf.stack(actual_encdec_attention_weights) + # Reduce mean across all layers (axis=0) and all heads (axis=2) to get a + # tensor with shape [batch_size, target_length, input_length]. 
+ actual_attention_weights = tf.reduce_mean(actual_attention_weights, [0, 2]) + return tf.losses.mean_squared_error(expected_attention, + actual_attention_weights) + + @expert_utils.add_name_scope() def get_timing_signal_1d(length, channels, diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 3aeb7a790..305a17379 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -160,10 +160,21 @@ def body(self, features): decoder_input, decoder_self_attention_bias = transformer_prepare_decoder( targets, hparams, features=features) - return self.decode(decoder_input, encoder_output, - encoder_decoder_attention_bias, - decoder_self_attention_bias, hparams, - nonpadding=features_to_nonpadding(features, "targets")) + decoder_output = self.decode( + decoder_input, + encoder_output, + encoder_decoder_attention_bias, + decoder_self_attention_bias, + hparams, + nonpadding=features_to_nonpadding(features, "targets")) + + expected_attention_weights = features.get("expected_attention_weights") + if expected_attention_weights is not None: + attention_loss = common_attention.encoder_decoder_attention_loss( + expected_attention_weights, self.attention_weights) + return decoder_output, {"attention_loss": attention_loss} + + return decoder_output def _greedy_infer(self, features, decode_length): """Fast version of greedy decoding. diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 0c9b6f794..1a6134b51 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -207,6 +207,18 @@ def testTransformerWithoutProblem(self): body_out.get_shape().as_list(), [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) + def testTransformerWithEncoderDecoderAttentionLoss(self): + model, features = self.getModel(transformer.transformer_small()) + expected_attention_weights = np.random.random_sample( + size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH)) + features["expected_attention_weights"] = tf.constant( + expected_attention_weights, dtype=tf.float32) + _, extra_loss = model(features) + with self.test_session() as session: + session.run(tf.global_variables_initializer()) + res = session.run(extra_loss["attention_loss"]) + self.assertEqual(res.shape, ()) + if __name__ == "__main__": tf.test.main() From b86ed582cf5c7834e8d8ab1c1b3efffce8776633 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:11:53 -0800 Subject: [PATCH 21/31] Support hyperparameter tuning on Cloud ML Engine PiperOrigin-RevId: 184921645 --- tensor2tensor/bin/t2t_trainer.py | 52 ++++++++++- tensor2tensor/layers/common_hparams.py | 104 +++++++++++---------- tensor2tensor/models/slicenet.py | 4 - tensor2tensor/models/transformer.py | 15 +-- tensor2tensor/models/transformer_sketch.py | 5 - tensor2tensor/utils/cloud_mlengine.py | 73 ++++++++++++--- tensor2tensor/utils/trainer_lib.py | 4 + 7 files changed, 164 insertions(+), 93 deletions(-) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 051c2e90d..a755e0d33 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -93,6 +93,20 @@ "GPUs on machine type. 
See documentation: " "https://cloud.google.com/ml-engine/reference/rest/v1/" "projects.jobs#traininginput") +# Hyperparameter tuning on Cloud ML Engine +# Pass an --hparams_range to enable +flags.DEFINE_string("autotune_objective", None, + "TensorBoard metric name to optimize.") +flags.DEFINE_bool("autotune_maximize", True, + "Whether to maximize (vs. minimize) autotune_objective.") +flags.DEFINE_integer("autotune_max_trials", 10, + "Maximum number of tuning experiments to run.") +flags.DEFINE_integer("autotune_parallel_trials", 1, + "How many trials to run in parallel (will spin up this " + "many jobs.") +flags.DEFINE_string("job-dir", None, + "DO NOT USE. Exists only for Cloud ML Engine to pass in " + "during hyperparameter tuning. Overrides --output_dir.") def get_problem_name(): @@ -101,6 +115,33 @@ def get_problem_name(): return problems[0] +def set_hparams_from_args(args): + """Set hparams overrides from unparsed args list.""" + if not args: + return + + hp_prefix = "--hp_" + tf.logging.info("Found unparsed command-line arguments. Checking if any " + "start with %s and interpreting those as hparams " + "settings.", hp_prefix) + + pairs = [] + i = 0 + while i < len(args): + arg = args[i] + if arg.startswith(hp_prefix): + pairs.append((arg.lstrip(hp_prefix), args[i+1])) + i += 2 + else: + tf.logging.warn("Found unknown flag: %s", arg) + i += 1 + + as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) + if FLAGS.hparams: + as_hparams = "," + as_hparams + FLAGS.hparams += as_hparams + + def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -265,21 +306,22 @@ def maybe_cloud_tpu(): yield -def main(_): +def main(argv): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - assert not FLAGS.cloud_tpu - assert FLAGS.output_dir.startswith("gs://") - assert FLAGS.data_dir.startswith("gs://") - return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + return cloud_mlengine.launch() if FLAGS.generate_data: generate_data() + if FLAGS.job_dir: + FLAGS.output_dir = FLAGS.job_dir + + set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 02a5df2f3..147757b47 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,7 +21,6 @@ # Dependency imports -import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -224,10 +223,15 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 + SCALES_STR = { + LINEAR_SCALE: "UNIT_LINEAR_SCALE", + LOG_SCALE: "UNIT_LOG_SCALE", + REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", + } + def __init__(self): self._categorical_params = {} self._discrete_params = {} - self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -237,10 +241,12 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [(self._categorical_params, - "categorical"), (self._discrete_params, "discrete"), - (self._float_params, "float"), (self._int_params, "int"), - (self._discrete_float_params, "discrete_float")] + ctr_names = [ + (self._categorical_params, "categorical"), + (self._discrete_params, 
"discrete"), + (self._float_params, "float"), + (self._int_params, "int"), + ] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -263,23 +269,17 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): - if name in self._discrete_float_params: - del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) - def set_discrete_float(self, name, val): - self._check_reset_and_type_change(name, self._discrete_float_params) - self._discrete_float_params[name] = (name, [val]) - def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - self._categorical_params, self._discrete_params, - self._discrete_float_params, self._float_params, self._int_params + self._categorical_params, self._discrete_params, self._float_params, + self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,52 +287,56 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) + def to_parameter_specs(self, name_prefix=""): + """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" + specs = [] + for name, categories, _ in self._categorical_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "CATEGORICAL", + "categoricalValues": categories, + } + specs.append(spec) -def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): - """Fill ranged_hparams with singleton values from hparams. + for name, feasible_points, scale, _ in self._discrete_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DISCRETE", + "discreteValues": feasible_points, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - HParams are placed in RangedHParams with the following functions, according to - type: - * int: set_discrete - * bool: set_discrete - * float: set_discrete_float - * str: set_categorical + for name, min_val, max_val, scale, _ in self._float_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DOUBLE", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Args: - hparams: tf.contrib.training.HParams; contains the hyperparameters to copy - over to ranged_hparams. - ranged_hparams: RangedHParams; will have hparams values copied to it. + for name, min_val, max_val, scale, _ in self._int_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "INTEGER", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Raises: - ValueError: if hparams contains a hyperparameter not of type - {int, float, str, bool}. - """ - for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access - - if is_multivalent: - raise ValueError("Multivalent hparams not supported in RangedHParams. " - "Hyperparameter %s is multivalent." 
% name) - val = getattr(hparams, name) - if hp_type == int: - ranged_hparams.set_discrete(name, [val]) - elif hp_type == bool: - ranged_hparams.set_discrete(name, [int(val)]) - elif hp_type == float: - ranged_hparams.set_discrete_float(name, val) - elif hp_type == str: - ranged_hparams.set_categorical(name, [val]) - else: - raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + return specs @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams - - hparams = basic_params1() - fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index e77412513..43cfa571e 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,10 +381,6 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams - - hparams = slicenet_params1() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 305a17379..df4b32277 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,11 +1228,9 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams("transformer_base") +@registry.register_ranged_hparams def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_base() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1337,8 +1335,6 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tiny_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1346,8 +1342,6 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1358,13 +1352,6 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) -@registry.register_ranged_hparams -def transformer_tpu_batch_range(rhp): - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [256, 512, 768, 1024]) - - @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py index 913243f00..e8fe796a8 100644 --- a/tensor2tensor/models/transformer_sketch.py +++ b/tensor2tensor/models/transformer_sketch.py @@ -22,7 +22,6 @@ # Dependency imports -from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.models import transformer_vae @@ -125,10 +124,6 @@ def transformer_sketch_6layer(): @registry.register_ranged_hparams("transformer_sketch_ranged") def transformer_sketch_ranged(rhp): """Range of hparams for vizier.""" - - hparams = transformer_sketch() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_categorical("ffn_layer", ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"]) rhp.set_discrete("batch_size", [1024, 2048, 4096]) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 0bb5ddc40..90c5c09e3 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -22,14 +22,17 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials +from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud +from tensor2tensor.utils import registry from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf +FLAGS = tf.flags.FLAGS + CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): -# * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training @@ -44,9 +47,9 @@ """ -def args_dict_as_args(args_dict): - """Convert dict to list of args suitable for passing on cmd line.""" - args_dict = dict(args_dict) +def flags_as_args(): + """Convert FLAGS to list of args suitable for passing on cmd line.""" + args_dict = dict(FLAGS.__dict__['__flags']) del args_dict['cloud_mlengine'] # Configured later del args_dict['t2t_usr_dir'] @@ -54,6 +57,8 @@ def args_dict_as_args(args_dict): for name, val in args_dict.items(): if val is None: continue + if name.startswith('autotune'): + continue args.extend(['--%s' % name, str(val)]) return args @@ -83,9 +88,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None): return config -def configure_job(flags_dict): +def configure_job(): """Construct jobSpec for ML Engine job.""" - train_dir = flags_dict['output_dir'] + train_dir = FLAGS.output_dir assert train_dir.startswith('gs://') job_name = os.path.basename(train_dir) @@ -93,16 +98,27 @@ def configure_job(flags_dict): # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': args_dict_as_args(flags_dict), + 'args': flags_as_args(), 'region': cloud.default_region(), 'runtimeVersion': '1.4', 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', + 'jobDir': train_dir, } training_input.update( machine_config( - num_gpus=flags_dict['worker_gpu'], - use_tpu=flags_dict['use_tpu'], - master_type=flags_dict['cloud_mlengine_master_type'])) + num_gpus=FLAGS.worker_gpu, + use_tpu=FLAGS.use_tpu, + master_type=FLAGS.cloud_mlengine_master_type)) + if FLAGS.hparams_range: + assert FLAGS.autotune_objective + tf.logging.info('Configuring hyperparameter tuning.') + training_input['hyperparameters'] = configure_autotune( + FLAGS.hparams_range, + FLAGS.autotune_objective, + FLAGS.autotune_maximize, + FLAGS.autotune_max_trials, + 
FLAGS.autotune_parallel_trials, + ) if training_input['scaleTier'] == 'CUSTOM': assert 'masterType' in training_input @@ -170,6 +186,26 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): return usr_tar +def autotune_paramspecs(hparams_range): + rhp = common_hparams.RangedHParams() + registry.ranged_hparams(hparams_range)(rhp) + return rhp.to_parameter_specs(name_prefix='hp_') + + +def configure_autotune(hparams_range, + objective, + maximize=True, + max_trials=10, + parallel_trials=1): + return { + 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', + 'params': autotune_paramspecs(hparams_range), + 'maxTrials': max_trials, + 'maxParallelTrials': parallel_trials, + 'hyperparameterMetricTag': objective, + } + + def configure_trainer_package(job_spec, t2t_tar): assert t2t_tar.startswith('gs://') job_spec['trainingInput']['packageUris'] = [t2t_tar] @@ -182,18 +218,25 @@ def configure_usr_dir(job_spec, usr_tar): job_spec['trainingInput']['args'].extend(usr_args) -def launch(flags_dict): +def launch(): """Launch t2t_trainer on Cloud ML Engine.""" - job_spec = configure_job(flags_dict) + assert not FLAGS.cloud_tpu + assert not FLAGS.job_dir + assert FLAGS.output_dir.startswith('gs://') + assert FLAGS.data_dir.startswith('gs://') + assert FLAGS.worker_replicas <= 1 + assert FLAGS.ps_replicas <= 0 + + job_spec = configure_job() job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - train_dir = flags_dict['output_dir'] + train_dir = FLAGS.output_dir t2t_tar = tar_and_copy_t2t(train_dir) configure_trainer_package(job_spec, t2t_tar) - if flags_dict['t2t_usr_dir']: - usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) + if FLAGS.t2t_usr_dir: + usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. 
See console to track: %s.', job_name, diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 039b06e68..0e64f2475 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -68,8 +68,12 @@ def create_hparams(hparams_set, hparams_overrides_str="", data_dir=None, problem_name=None): + """Create HParams with data_dir and problem hparams, if kwargs provided.""" hparams = registry.hparams(hparams_set)() if hparams_overrides_str: + tf.logging.info("Overriding hparams in %s with %s", + hparams_set, + hparams_overrides_str) hparams = hparams.parse(hparams_overrides_str) if data_dir: hparams.add_hparam("data_dir", data_dir) From dd3322c6392acf72e7ba64df55910fdc74ea624d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:21:04 -0800 Subject: [PATCH 22/31] Add Travis test for export and serve PiperOrigin-RevId: 184922812 --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index e703a2bc9..5e0aff001 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,11 @@ python: - "2.7" - "3.6" before_install: + - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list + - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add - - sudo apt-get update -qq - sudo apt-get install -qq libhdf5-dev + - sudo apt-get install -qq tensorflow-model-server install: - pip install -q .[tensorflow] - pip install -q .[tests] @@ -36,5 +39,14 @@ script: - t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR - t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10' + + # Export and query (on Python 2 only) + - t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + pip install tensorflow-serving-api; + tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo & + sleep 10; + t2t-query-server --problem=$T2T_PROBLEM --server=localhost:9000 --servable_name=my_model --data_dir=$T2T_DATA_DIR --inputs_once='1 0 1 0 1 0'; + fi git: depth: 3 From 3b1f70f92de247364e8aa0da56f36bbdff08399f Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:53:21 -0800 Subject: [PATCH 23/31] Fix hello_t2t.ipynb PiperOrigin-RevId: 184926720 --- tensor2tensor/notebooks/hello_t2t.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/notebooks/hello_t2t.ipynb b/tensor2tensor/notebooks/hello_t2t.ipynb index cc9f66a02..3b3573098 100644 --- a/tensor2tensor/notebooks/hello_t2t.ipynb +++ b/tensor2tensor/notebooks/hello_t2t.ipynb @@ -60,7 +60,7 @@ }, "source": [ "# Install deps\n", - "!pip install -q tensor2tensor" + "!pip install -q -U tensor2tensor tensorflow" ], "cell_type": "code", "execution_count": 0, @@ -87,7 +87,7 @@ "from tensor2tensor import models\n", "from tensor2tensor import problems\n", "from tensor2tensor.layers import common_layers\n", - "from tensor2tensor.tpu import 
tpu_trainer_lib\n", + "from tensor2tensor.utils import trainer_lib\n", "from tensor2tensor.utils import t2t_model\n", "from tensor2tensor.utils import registry\n", "from tensor2tensor.utils import metrics\n", @@ -597,7 +597,7 @@ "model_name = \"transformer\"\n", "hparams_set = \"transformer_base\"\n", "\n", - "hparams = tpu_trainer_lib.create_hparams(hparams_set, data_dir=data_dir, problem_name=\"translate_ende_wmt32k\")\n", + "hparams = trainer_lib.create_hparams(hparams_set, data_dir=data_dir, problem_name=\"translate_ende_wmt32k\")\n", "\n", "# NOTE: Only create the model once when restoring from a checkpoint; it's a\n", "# Layer and so subsequent instantiations will have different variable scopes\n", @@ -1407,7 +1407,7 @@ " return tf.layers.conv2d(tf.nn.relu(h2), filters,\n", " kernel_size=(3, 3))\n", "\n", - "hparams = tpu_trainer_lib.create_hparams(\"basic_1\", data_dir=data_dir, problem_name=\"image_mnist\")\n", + "hparams = trainer_lib.create_hparams(\"basic_1\", data_dir=data_dir, problem_name=\"image_mnist\")\n", "hparams.hidden_size = 64\n", "model = MySimpleModel(hparams, Modes.TRAIN)" ], @@ -1584,7 +1584,7 @@ " break\n", "\n", " # Make the inputs and targets 4D\n", - " example[\"inputs\"] = tf.reshape(example[\"inputs\"], [1, 28, 28, 3])\n", + " example[\"inputs\"] = tf.reshape(example[\"inputs\"], [1, 28, 28, 1])\n", " example[\"targets\"] = tf.reshape(example[\"targets\"], [1, 1, 1, 1])\n", "\n", " # Call the model\n", From 424d4ace667e49b23b1d4c9b35e727f6edb16453 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 18:44:53 -0800 Subject: [PATCH 24/31] Add documentation for Cloud ML Engine PiperOrigin-RevId: 184932319 --- docs/cloud_mlengine.md | 80 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docs/cloud_mlengine.md diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md new file mode 100644 index 000000000..b257fab25 --- /dev/null +++ b/docs/cloud_mlengine.md @@ -0,0 +1,80 @@ +# Running on Cloud ML Engine + +Google Cloud Platform offers a managed training environment for TensorFlow +models called [Cloud ML Engine](https://cloud.google.com/ml-engine/) and +you can easily launch Tensor2Tensor on it, including for hyperparameter tuning. + +# Launch + +It's the same `t2t-trainer` you know and love with the addition of the +`--cloud_mlengine` flag, which by default will launch on a 1-GPU machine. + +``` +# Note that both the data dir and output dir have to be on GCS +DATA_DIR=gs://my-bucket/data +OUTPUT_DIR=gs://my-bucket/train +t2t-trainer \ + --problems=translate_ende_wmt32k \ + --model=transformer \ + --hparams_set=transformer_base \ + --data_dir=$DATA_DIR \ + --output_dir=$OUTPUT_DIR \ + --cloud_mlengine +``` + +By passing `--worker_gpu=4` or `--worker_gpu=8` it will automatically launch on +machines with 4 or 8 GPUs. + +You can additionally pass the `--cloud_mlengine_master_type` to select another +kind of machine (see the [docs for +`masterType`](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput) +for your options). If you provide this flag yourself, make sure you pass the +correct value for `--worker_gpu`. + +**Note**: `t2t-trainer` only currently supports launching with single machines, +possibly with multiple GPUs. Multi-machine setups are not yet supported out of +the box with the `--cloud_mlengine` flag, though multi-machine should in +principle work just fine. Contributions/testers welcome. 
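+
+For example, with the same setup as above, an 8-GPU launch that also pins
+the machine type explicitly (matching what `--worker_gpu=8` would select
+on its own) could look like:
+
+```
+t2t-trainer \
+  --problems=translate_ende_wmt32k \
+  --model=transformer \
+  --hparams_set=transformer_base \
+  --data_dir=$DATA_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --worker_gpu=8 \
+  --cloud_mlengine_master_type=complex_model_l_gpu \
+  --cloud_mlengine
+```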
+ +## `--t2t_usr_dir` + +Launching on Cloud ML Engine works with `--t2t_usr_dir` as well as long as the +directory is fully self-contained (i.e. the imports only refer to other modules +in the directory). If there are additional PyPI dependencies that you need, you +can include a `setup.py` file in your directory (ensure that it uses +`setuptools.find_packages`). + +# Hyperparameter Tuning + +Hyperparameter tuning with `t2t-trainer` and Cloud ML Engine is also a breeze +with `--hparams_range` and the `--autotune_*` flags: + +``` +t2t-trainer \ + --problems=translate_ende_wmt32k \ + --model=transformer \ + --hparams_set=transformer_base \ + --data_dir=$DATA_DIR \ + --output_dir=$OUTPUT_DIR \ + --cloud_mlengine \ + --hparams_range=transformer_base_range \ + --autotune_objective='metrics-translate_ende_wmt32k/neg_log_perplexity' \ + --autotune_maximize \ + --autotune_max_trials=100 \ + --autotune_parallel_trials=3 +``` + +The `--hparams_range` specifies the search space and should be registered with +`@register_ranged_hparams`. It defines a `RangedHParams` object that sets +search ranges and scales for various parameters. See `transformer_base_range` +in +[`transformer.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) +for an example. + +The metric name passed as `--autotune_objective` should be exactly what you'd +see in TensorBoard. To minimize a metric, set `--autotune_maximize=False`. + +You control how many total trials to run with `--autotune_max_trials` and the +number of jobs to launch in parallel with `--autotune_parallel_trials`. + +Happy tuning! From 6ae03f76279710597a0731dcc2d4305ab74f741f Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 8 Feb 2018 08:54:31 -0800 Subject: [PATCH 25/31] Support hyperparameter tuning on Cloud ML Engine PiperOrigin-RevId: 185001360 --- tensor2tensor/bin/t2t_trainer.py | 52 +---------- tensor2tensor/layers/common_hparams.py | 104 ++++++++++----------- tensor2tensor/models/slicenet.py | 4 + tensor2tensor/models/transformer.py | 15 ++- tensor2tensor/models/transformer_sketch.py | 5 + tensor2tensor/utils/cloud_mlengine.py | 73 +++------------ tensor2tensor/utils/trainer_lib.py | 4 - 7 files changed, 93 insertions(+), 164 deletions(-) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index a755e0d33..051c2e90d 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -93,20 +93,6 @@ "GPUs on machine type. See documentation: " "https://cloud.google.com/ml-engine/reference/rest/v1/" "projects.jobs#traininginput") -# Hyperparameter tuning on Cloud ML Engine -# Pass an --hparams_range to enable -flags.DEFINE_string("autotune_objective", None, - "TensorBoard metric name to optimize.") -flags.DEFINE_bool("autotune_maximize", True, - "Whether to maximize (vs. minimize) autotune_objective.") -flags.DEFINE_integer("autotune_max_trials", 10, - "Maximum number of tuning experiments to run.") -flags.DEFINE_integer("autotune_parallel_trials", 1, - "How many trials to run in parallel (will spin up this " - "many jobs.") -flags.DEFINE_string("job-dir", None, - "DO NOT USE. Exists only for Cloud ML Engine to pass in " - "during hyperparameter tuning. Overrides --output_dir.") def get_problem_name(): @@ -115,33 +101,6 @@ def get_problem_name(): return problems[0] -def set_hparams_from_args(args): - """Set hparams overrides from unparsed args list.""" - if not args: - return - - hp_prefix = "--hp_" - tf.logging.info("Found unparsed command-line arguments. 
Checking if any " - "start with %s and interpreting those as hparams " - "settings.", hp_prefix) - - pairs = [] - i = 0 - while i < len(args): - arg = args[i] - if arg.startswith(hp_prefix): - pairs.append((arg.lstrip(hp_prefix), args[i+1])) - i += 2 - else: - tf.logging.warn("Found unknown flag: %s", arg) - i += 1 - - as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) - if FLAGS.hparams: - as_hparams = "," + as_hparams - FLAGS.hparams += as_hparams - - def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -306,22 +265,21 @@ def maybe_cloud_tpu(): yield -def main(argv): +def main(_): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - return cloud_mlengine.launch() + assert not FLAGS.cloud_tpu + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") + return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) if FLAGS.generate_data: generate_data() - if FLAGS.job_dir: - FLAGS.output_dir = FLAGS.job_dir - - set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 147757b47..02a5df2f3 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,6 +21,7 @@ # Dependency imports +import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -223,15 +224,10 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 - SCALES_STR = { - LINEAR_SCALE: "UNIT_LINEAR_SCALE", - LOG_SCALE: "UNIT_LOG_SCALE", - REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", - } - def __init__(self): self._categorical_params = {} self._discrete_params = {} + self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -241,12 +237,10 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [ - (self._categorical_params, "categorical"), - (self._discrete_params, "discrete"), - (self._float_params, "float"), - (self._int_params, "int"), - ] + ctr_names = [(self._categorical_params, + "categorical"), (self._discrete_params, "discrete"), + (self._float_params, "float"), (self._int_params, "int"), + (self._discrete_float_params, "discrete_float")] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -269,17 +263,23 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): + if name in self._discrete_float_params: + del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) + def set_discrete_float(self, name, val): + self._check_reset_and_type_change(name, self._discrete_float_params) + self._discrete_float_params[name] = (name, [val]) + def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - self._categorical_params, self._discrete_params, 
self._float_params, - self._int_params + self._categorical_params, self._discrete_params, + self._discrete_float_params, self._float_params, self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,56 +287,52 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) - def to_parameter_specs(self, name_prefix=""): - """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" - specs = [] - for name, categories, _ in self._categorical_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "CATEGORICAL", - "categoricalValues": categories, - } - specs.append(spec) - for name, feasible_points, scale, _ in self._discrete_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "DISCRETE", - "discreteValues": feasible_points, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) +def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): + """Fill ranged_hparams with singleton values from hparams. - for name, min_val, max_val, scale, _ in self._float_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "DOUBLE", - "minValue": min_val, - "maxValue": max_val, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) + HParams are placed in RangedHParams with the following functions, according to + type: + * int: set_discrete + * bool: set_discrete + * float: set_discrete_float + * str: set_categorical - for name, min_val, max_val, scale, _ in self._int_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "INTEGER", - "minValue": min_val, - "maxValue": max_val, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) + Args: + hparams: tf.contrib.training.HParams; contains the hyperparameters to copy + over to ranged_hparams. + ranged_hparams: RangedHParams; will have hparams values copied to it. - return specs + Raises: + ValueError: if hparams contains a hyperparameter not of type + {int, float, str, bool}. + """ + for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access + + if is_multivalent: + raise ValueError("Multivalent hparams not supported in RangedHParams. " + "Hyperparameter %s is multivalent." 
% name) + val = getattr(hparams, name) + if hp_type == int: + ranged_hparams.set_discrete(name, [val]) + elif hp_type == bool: + ranged_hparams.set_discrete(name, [int(val)]) + elif hp_type == float: + ranged_hparams.set_discrete_float(name, val) + elif hp_type == str: + ranged_hparams.set_categorical(name, [val]) + else: + raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams + + hparams = basic_params1() + fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 43cfa571e..e77412513 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,6 +381,10 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams + + hparams = slicenet_params1() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index df4b32277..305a17379 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,9 +1228,11 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams +@registry.register_ranged_hparams("transformer_base") def transformer_base_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_base() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1335,6 +1337,8 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_tiny_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1342,6 +1346,8 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1352,6 +1358,13 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) +@registry.register_ranged_hparams +def transformer_tpu_batch_range(rhp): + hparams = transformer_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_discrete("batch_size", [256, 512, 768, 1024]) + + @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
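
A sketch of the ranged-hparams pattern the transformer changes above rely on. The names `my_model_base` and `my_model_range` are hypothetical; the registry decorators, the `RangedHParams` methods, and `fill_ranged_hparams_from_hparams` are as introduced in this patch.

```python
from tensor2tensor.layers import common_hparams
from tensor2tensor.utils import registry


@registry.register_hparams
def my_model_base():
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  return hparams


@registry.register_ranged_hparams
def my_model_range(rhp):
  # First pin every hyperparameter to its base value as a singleton, so a
  # tuning study leaves it fixed...
  hparams = my_model_base()
  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
  # ...then widen only the parameters under study. set_float removes a
  # previously-set discrete float singleton, as in the patch above.
  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
  rhp.set_discrete("hidden_size", [256, 512, 1024], scale=rhp.LOG_SCALE)
```
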
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py index e8fe796a8..913243f00 100644 --- a/tensor2tensor/models/transformer_sketch.py +++ b/tensor2tensor/models/transformer_sketch.py @@ -22,6 +22,7 @@ # Dependency imports +from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.models import transformer_vae @@ -124,6 +125,10 @@ def transformer_sketch_6layer(): @registry.register_ranged_hparams("transformer_sketch_ranged") def transformer_sketch_ranged(rhp): """Range of hparams for vizier.""" + + hparams = transformer_sketch() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_categorical("ffn_layer", ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"]) rhp.set_discrete("batch_size", [1024, 2048, 4096]) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 90c5c09e3..0bb5ddc40 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -22,17 +22,14 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials -from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud -from tensor2tensor.utils import registry from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf -FLAGS = tf.flags.FLAGS - CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): +# * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training @@ -47,9 +44,9 @@ """ -def flags_as_args(): - """Convert FLAGS to list of args suitable for passing on cmd line.""" - args_dict = dict(FLAGS.__dict__['__flags']) +def args_dict_as_args(args_dict): + """Convert dict to list of args suitable for passing on cmd line.""" + args_dict = dict(args_dict) del args_dict['cloud_mlengine'] # Configured later del args_dict['t2t_usr_dir'] @@ -57,8 +54,6 @@ def flags_as_args(): for name, val in args_dict.items(): if val is None: continue - if name.startswith('autotune'): - continue args.extend(['--%s' % name, str(val)]) return args @@ -88,9 +83,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None): return config -def configure_job(): +def configure_job(flags_dict): """Construct jobSpec for ML Engine job.""" - train_dir = FLAGS.output_dir + train_dir = flags_dict['output_dir'] assert train_dir.startswith('gs://') job_name = os.path.basename(train_dir) @@ -98,27 +93,16 @@ def configure_job(): # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': flags_as_args(), + 'args': args_dict_as_args(flags_dict), 'region': cloud.default_region(), 'runtimeVersion': '1.4', 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', - 'jobDir': train_dir, } training_input.update( machine_config( - num_gpus=FLAGS.worker_gpu, - use_tpu=FLAGS.use_tpu, - master_type=FLAGS.cloud_mlengine_master_type)) - if FLAGS.hparams_range: - assert FLAGS.autotune_objective - tf.logging.info('Configuring hyperparameter tuning.') - training_input['hyperparameters'] = configure_autotune( - FLAGS.hparams_range, - FLAGS.autotune_objective, - FLAGS.autotune_maximize, - FLAGS.autotune_max_trials, - FLAGS.autotune_parallel_trials, - ) + num_gpus=flags_dict['worker_gpu'], + use_tpu=flags_dict['use_tpu'], + 
master_type=flags_dict['cloud_mlengine_master_type'])) if training_input['scaleTier'] == 'CUSTOM': assert 'masterType' in training_input @@ -186,26 +170,6 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): return usr_tar -def autotune_paramspecs(hparams_range): - rhp = common_hparams.RangedHParams() - registry.ranged_hparams(hparams_range)(rhp) - return rhp.to_parameter_specs(name_prefix='hp_') - - -def configure_autotune(hparams_range, - objective, - maximize=True, - max_trials=10, - parallel_trials=1): - return { - 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', - 'params': autotune_paramspecs(hparams_range), - 'maxTrials': max_trials, - 'maxParallelTrials': parallel_trials, - 'hyperparameterMetricTag': objective, - } - - def configure_trainer_package(job_spec, t2t_tar): assert t2t_tar.startswith('gs://') job_spec['trainingInput']['packageUris'] = [t2t_tar] @@ -218,25 +182,18 @@ def configure_usr_dir(job_spec, usr_tar): job_spec['trainingInput']['args'].extend(usr_args) -def launch(): +def launch(flags_dict): """Launch t2t_trainer on Cloud ML Engine.""" - assert not FLAGS.cloud_tpu - assert not FLAGS.job_dir - assert FLAGS.output_dir.startswith('gs://') - assert FLAGS.data_dir.startswith('gs://') - assert FLAGS.worker_replicas <= 1 - assert FLAGS.ps_replicas <= 0 - - job_spec = configure_job() + job_spec = configure_job(flags_dict) job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - train_dir = FLAGS.output_dir + train_dir = flags_dict['output_dir'] t2t_tar = tar_and_copy_t2t(train_dir) configure_trainer_package(job_spec, t2t_tar) - if FLAGS.t2t_usr_dir: - usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) + if flags_dict['t2t_usr_dir']: + usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. See console to track: %s.', job_name, diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 0e64f2475..039b06e68 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -68,12 +68,8 @@ def create_hparams(hparams_set, hparams_overrides_str="", data_dir=None, problem_name=None): - """Create HParams with data_dir and problem hparams, if kwargs provided.""" hparams = registry.hparams(hparams_set)() if hparams_overrides_str: - tf.logging.info("Overriding hparams in %s with %s", - hparams_set, - hparams_overrides_str) hparams = hparams.parse(hparams_overrides_str) if data_dir: hparams.add_hparam("data_dir", data_dir) From bbee177e9551bb1fa80ce6188cc277563cbe3377 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 8 Feb 2018 09:31:26 -0800 Subject: [PATCH 26/31] Changes to make Librispeech TPU friendly. 
PiperOrigin-RevId: 185006656 --- .../data_generators/speech_recognition.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 4c037aeb3..e17e4de85 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -34,7 +34,6 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers -from tensor2tensor.utils import expert_utils from tensor2tensor.utils import metrics from tensor2tensor.utils import modality from tensor2tensor.utils import registry @@ -242,7 +241,7 @@ def hparams(self, defaults, model_hparams): p = model_hparams # Filterbank extraction # Filterbank extraction in bottom instead of preprocess_example is faster. - p.add_hparam("audio_preproc_in_bottom", True) + p.add_hparam("audio_preproc_in_bottom", False) # The trainer seems to reserve memory for all members of the input dict p.add_hparam("audio_keep_example_waveforms", False) p.add_hparam("audio_sample_rate", 16000) @@ -341,7 +340,6 @@ def bottom(self, inputs): float32 tensor with shape [batch_size, shorter_len, 1, hidden_size] """ p = self._model_hparams - training = p.mode == tf.estimator.ModeKeys.TRAIN num_mel_bins = p.audio_num_mel_bins num_channels = 3 if p.audio_add_delta_deltas else 1 @@ -376,18 +374,19 @@ def bottom(self, inputs): x.set_shape([None, None, 1, num_mel_bins * num_channels]) xshape = common_layers.shape_list(x) - x = tf.reshape(x, [-1, 1, num_mel_bins * num_channels]) - padding_mask = common_attention.embedding_to_padding(x) - pad_remover = expert_utils.PadRemover(padding_mask) - - x = pad_remover.remove(x) + nonpadding_mask = 1. - common_attention.embedding_to_padding(x) + num_of_nonpadding_elements = tf.reduce_sum( + nonpadding_mask) * num_mel_bins * num_channels # This replaces CMVN estimation on data - x = tf.layers.batch_normalization( - x, axis=2, center=False, scale=False, training=training) - - x = pad_remover.restore(x) + mean = tf.reduce_sum( + x, axis=[1, 2], keepdims=True) / num_of_nonpadding_elements + variance = (num_of_nonpadding_elements * mean**2. - + 2. 
* mean * tf.reduce_sum(x, axis=[1, 2], keepdims=True)
+              + tf.reduce_sum(x**2, axis=[1, 2], keepdims=True)
+                ) / num_of_nonpadding_elements
+      x = (x - mean) / variance * tf.expand_dims(nonpadding_mask, -1)

       # restore batch_size x time x frequency x channel layout
       x = tf.reshape(x, [xshape[0], xshape[1], num_mel_bins, num_channels])

From ec364a158c8a45813e37a873c5b81d4dbb912fb6 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 8 Feb 2018 11:15:20 -0800
Subject: [PATCH 27/31] internal merge

PiperOrigin-RevId: 185022378
---
 setup.py                                    |   1 +
 tensor2tensor/rl/README.md                  |  11 +
 tensor2tensor/rl/__init__.py                |  15 ++
 tensor2tensor/rl/collect.py                 |  95 ++++++++
 tensor2tensor/rl/envs/__init__.py           |  15 ++
 tensor2tensor/rl/envs/batch_env.py          | 129 +++++++++++
 tensor2tensor/rl/envs/in_graph_batch_env.py | 165 ++++++++++++++
 tensor2tensor/rl/envs/utils.py              | 229 ++++++++++++++++++++
 tensor2tensor/rl/networks.py                |  69 ++++++
 tensor2tensor/rl/ppo.py                     | 102 +++++++++
 tensor2tensor/rl/rl_trainer_lib.py          |  87 ++++++++
 tensor2tensor/rl/rl_trainer_lib_test.py     |  34 +++
 tensor2tensor/rl/t2t_rl_trainer.py          |  30 +++
 13 files changed, 982 insertions(+)
 create mode 100644 tensor2tensor/rl/README.md
 create mode 100644 tensor2tensor/rl/__init__.py
 create mode 100644 tensor2tensor/rl/collect.py
 create mode 100644 tensor2tensor/rl/envs/__init__.py
 create mode 100644 tensor2tensor/rl/envs/batch_env.py
 create mode 100644 tensor2tensor/rl/envs/in_graph_batch_env.py
 create mode 100644 tensor2tensor/rl/envs/utils.py
 create mode 100644 tensor2tensor/rl/networks.py
 create mode 100644 tensor2tensor/rl/ppo.py
 create mode 100644 tensor2tensor/rl/rl_trainer_lib.py
 create mode 100644 tensor2tensor/rl/rl_trainer_lib_test.py
 create mode 100644 tensor2tensor/rl/t2t_rl_trainer.py

diff --git a/setup.py b/setup.py
index 2ee6e74ee..567d7a392 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@
     'gevent',
     'google-api-python-client',
     'gunicorn',
+    'gym<=0.9.5',  # gym in version 0.9.6 has some temporary issues.
     'numpy',
     'requests',
     'scipy',
diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md
new file mode 100644
index 000000000..053119f05
--- /dev/null
+++ b/tensor2tensor/rl/README.md
@@ -0,0 +1,11 @@
+# Tensor2Tensor experimental Reinforcement Learning.
+
+The rl package aims to make it possible to run reinforcement learning
+algorithms within TensorFlow's computation graph. It is still very
+experimental and under heavy development.
+
+Currently the only supported algorithm is Proximal Policy Optimization (PPO).
+
+## Sample usage - training in the Pendulum-v0 environment.
+
+```python rl/t2t_rl_trainer.py```
diff --git a/tensor2tensor/rl/__init__.py b/tensor2tensor/rl/__init__.py
new file mode 100644
index 000000000..3f714ce1f
--- /dev/null
+++ b/tensor2tensor/rl/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
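
A NumPy sketch of the padding-aware normalization from the speech_recognition.py hunk in the previous patch, built on the identity sum((x - m)^2) = n*m^2 - 2*m*sum(x) + sum(x^2). Unlike the hunk, this sketch keeps per-example counts and divides by the standard deviation; the hunk divides by the raw variance, which looks like an oversight worth checking.

```python
import numpy as np

x = np.random.randn(2, 100, 80).astype(np.float32)  # batch x time x mel_bins
mask = np.ones((2, 100), dtype=np.float32)
mask[1, 60:] = 0.  # frames 60+ of the second example are padding

masked = x * mask[..., None]
n = mask.sum(axis=1)[:, None, None] * x.shape[-1]  # nonpadding elements
total = masked.sum(axis=(1, 2), keepdims=True)
mean = total / n
var = (n * mean**2 - 2. * mean * total
       + (masked**2).sum(axis=(1, 2), keepdims=True)) / n
x_norm = (x - mean) / np.sqrt(var + 1e-9) * mask[..., None]
```
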
+ diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py new file mode 100644 index 000000000..2ea262143 --- /dev/null +++ b/tensor2tensor/rl/collect.py @@ -0,0 +1,95 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Collect trajectories from interactions of agent with environment.""" + +import tensorflow as tf + + +def define_collect(policy_factory, batch_env, config): + """Collect trajectories.""" + memory_shape = [config.epoch_length] + [batch_env.observ.shape.as_list()[0]] + memories_shapes_and_types = [ + # observation + (memory_shape + [batch_env.observ.shape.as_list()[1]], tf.float32), + (memory_shape, tf.float32), # reward + (memory_shape, tf.bool), # done + (memory_shape + batch_env.action_shape, tf.float32), # action + (memory_shape, tf.float32), # pdf + (memory_shape, tf.float32), # value function + ] + memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False) + for (shape, dtype) in memories_shapes_and_types] + cumulative_rewards = tf.Variable( + tf.zeros(config.num_agents, tf.float32), trainable=False) + + should_reset_var = tf.Variable(True, trainable=False) + reset_op = tf.cond(should_reset_var, + lambda: batch_env.reset(tf.range(config.num_agents)), + lambda: 0.0) + with tf.control_dependencies([reset_op]): + reset_once_op = tf.assign(should_reset_var, False) + + with tf.control_dependencies([reset_once_op]): + + def step(index, scores_sum, scores_num): + """Single step.""" + # Note - the only way to ensure making a copy of tensor is to run simple + # operation. 
We are waiting for tf.copy: + # https://github.com/tensorflow/tensorflow/issues/11186 + obs_copy = batch_env.observ + 0 + actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) + policy = actor_critic.policy + action = policy.sample() + postprocessed_action = actor_critic.action_postprocessing(action) + simulate_output = batch_env.simulate(postprocessed_action[0, ...]) + pdf = policy.prob(action)[0] + with tf.control_dependencies(simulate_output): + reward, done = simulate_output + done = tf.reshape(done, (config.num_agents,)) + to_save = [obs_copy, reward, done, action[0, ...], pdf, + actor_critic.value[0]] + save_ops = [tf.scatter_update(memory_slot, index, value) + for memory_slot, value in zip(memory, to_save)] + cumulate_rewards_op = cumulative_rewards.assign_add(reward) + agent_indices_to_reset = tf.where(done)[:, 0] + with tf.control_dependencies([cumulate_rewards_op]): + scores_sum_delta = tf.reduce_sum( + tf.gather(cumulative_rewards, agent_indices_to_reset)) + scores_num_delta = tf.count_nonzero(done, dtype=tf.int32) + with tf.control_dependencies(save_ops + [scores_sum_delta, + scores_num_delta]): + reset_env_op = batch_env.reset(agent_indices_to_reset) + reset_cumulative_rewards_op = tf.scatter_update( + cumulative_rewards, agent_indices_to_reset, + tf.zeros(tf.shape(agent_indices_to_reset))) + with tf.control_dependencies([reset_env_op, + reset_cumulative_rewards_op]): + return [index + 1, scores_sum + scores_sum_delta, + scores_num + scores_num_delta] + + init = [tf.constant(0), tf.constant(0.0), tf.constant(0)] + index, scores_sum, scores_num = tf.while_loop( + lambda c, _1, _2: c < config.epoch_length, + step, + init, + parallel_iterations=1, + back_prop=False) + mean_score = tf.cond(tf.greater(scores_num, 0), + lambda: scores_sum / tf.cast(scores_num, tf.float32), + lambda: 0.) + printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ") + with tf.control_dependencies([printing]): + return tf.identity(index), memory diff --git a/tensor2tensor/rl/envs/__init__.py b/tensor2tensor/rl/envs/__init__.py new file mode 100644 index 000000000..3f714ce1f --- /dev/null +++ b/tensor2tensor/rl/envs/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/rl/envs/batch_env.py b/tensor2tensor/rl/envs/batch_env.py new file mode 100644 index 000000000..453348976 --- /dev/null +++ b/tensor2tensor/rl/envs/batch_env.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Combine multiple environments to step them in batch."""
+
+# The code was based on Danijar Hafner's code from tf.agents:
+# https://github.com/tensorflow/agents/blob/master/agents/tools/batch_env.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+class BatchEnv(object):
+  """Combine multiple environments to step them in batch."""
+
+  def __init__(self, envs, blocking):
+    """Combine multiple environments to step them in batch.
+
+    To step environments in parallel, environments must support a
+    `blocking=False` argument to their step and reset functions that makes them
+    return callables instead to receive the result at a later time.
+
+    Args:
+      envs: List of environments.
+      blocking: Step environments one after another rather than in parallel.
+
+    Raises:
+      ValueError: Environments have different observation or action spaces.
+    """
+    self._envs = envs
+    self._blocking = blocking
+    observ_space = self._envs[0].observation_space
+    if not all(env.observation_space == observ_space for env in self._envs):
+      raise ValueError('All environments must use the same observation space.')
+    action_space = self._envs[0].action_space
+    if not all(env.action_space == action_space for env in self._envs):
+      raise ValueError('All environments must use the same action space.')
+
+  def __len__(self):
+    """Number of combined environments."""
+    return len(self._envs)
+
+  def __getitem__(self, index):
+    """Access an underlying environment by index."""
+    return self._envs[index]
+
+  def __getattr__(self, name):
+    """Forward unimplemented attributes to one of the original environments.
+
+    Args:
+      name: Attribute that was accessed.
+
+    Returns:
+      Value behind the attribute name in one of the wrapped environments.
+    """
+    return getattr(self._envs[0], name)
+
+  def step(self, actions):
+    """Forward a batch of actions to the wrapped environments.
+
+    Args:
+      actions: Batched action to apply to the environment.
+
+    Raises:
+      ValueError: Invalid actions.
+
+    Returns:
+      Batch of observations, rewards, and done flags.
+    """
+    for index, (env, action) in enumerate(zip(self._envs, actions)):
+      if not env.action_space.contains(action):
+        message = 'Invalid action at index {}: {}'
+        raise ValueError(message.format(index, action))
+    if self._blocking:
+      transitions = [
+          env.step(action)
+          for env, action in zip(self._envs, actions)]
+    else:
+      transitions = [
+          env.step(action, blocking=False)
+          for env, action in zip(self._envs, actions)]
+      transitions = [transition() for transition in transitions]
+    observs, rewards, dones, infos = zip(*transitions)
+    observ = np.stack(observs).astype(np.float32)
+    reward = np.stack(rewards).astype(np.float32)
+    done = np.stack(dones)
+    info = tuple(infos)
+    return observ, reward, done, info
+
+  def reset(self, indices=None):
+    """Reset the environment and convert the resulting observation.
+
+    Args:
+      indices: The batch indices of environments to reset; defaults to all.
+
+    Returns:
+      Batch of observations.
+ """ + if indices is None: + indices = np.arange(len(self._envs)) + if self._blocking: + observs = [self._envs[index].reset() for index in indices] + else: + observs = [self._envs[index].reset(blocking=False) for index in indices] + observs = [observ() for observ in observs] + observ = np.stack(observs) + observ = observ.astype(np.float32) + return observ + + def close(self): + """Send close messages to the external process and join them.""" + for env in self._envs: + if hasattr(env, 'close'): + env.close() diff --git a/tensor2tensor/rl/envs/in_graph_batch_env.py b/tensor2tensor/rl/envs/in_graph_batch_env.py new file mode 100644 index 000000000..eae0826a3 --- /dev/null +++ b/tensor2tensor/rl/envs/in_graph_batch_env.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch of environments inside the TensorFlow graph.""" + +# The code was based on Danijar Hafner's code from tf.agents: +# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import gym + +import tensorflow as tf + + +class InGraphBatchEnv(object): + """Batch of environments inside the TensorFlow graph. + + The batch of environments will be stepped and reset inside of the graph using + a tf.py_func(). The current batch of observations, actions, rewards, and done + flags are held in according variables. + """ + + def __init__(self, batch_env): + """Batch of environments inside the TensorFlow graph. + + Args: + batch_env: Batch environment. + """ + self._batch_env = batch_env + observ_shape = self._parse_shape(self._batch_env.observation_space) + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + self.action_shape = list(self._parse_shape(self._batch_env.action_space)) + self.action_dtype = self._parse_dtype(self._batch_env.action_space) + with tf.variable_scope('env_temporary'): + self._observ = tf.Variable( + tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype), + name='observ', trainable=False) + + def __getattr__(self, name): + """Forward unimplemented attributes to one of the original environments. + + Args: + name: Attribute that was accessed. + + Returns: + Value behind the attribute name in one of the original environments. + """ + return getattr(self._batch_env, name) + + def __len__(self): + """Number of combined environments.""" + return len(self._batch_env) + + def __getitem__(self, index): + """Access an underlying environment by index.""" + return self._batch_env[index] + + def simulate(self, action): + """Step the batch of environments. + + The results of the step can be accessed from the variables defined below. + + Args: + action: Tensor holding the batch of actions to apply. + + Returns: + Operation. 
+ """ + with tf.name_scope('environment/simulate'): + if action.dtype in (tf.float16, tf.float32, tf.float64): + action = tf.check_numerics(action, 'action') + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ, reward, done = tf.py_func( + lambda a: self._batch_env.step(a)[:3], [action], + [observ_dtype, tf.float32, tf.bool], name='step') + observ = tf.check_numerics(observ, 'observ') + reward = tf.check_numerics(reward, 'reward') + with tf.control_dependencies([self._observ.assign(observ)]): + return tf.identity(reward), tf.identity(done) + + def reset(self, indices=None): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset. + + Returns: + Batch tensor of the new observations. + """ + return tf.cond( + tf.cast(tf.shape(indices)[0], tf.bool), + lambda: self._reset_non_empty(indices), lambda: 0.0) + + def _reset_non_empty(self, indices): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset; defaults to all. + + Returns: + Batch tensor of the new observations. + """ + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ = tf.py_func( + self._batch_env.reset, [indices], observ_dtype, name='reset') + observ = tf.check_numerics(observ, 'observ') + with tf.control_dependencies([ + tf.scatter_update(self._observ, indices, observ)]): + return tf.identity(observ) + + @property + def observ(self): + """Access the variable holding the current observation.""" + return self._observ + + def close(self): + """Send close messages to the external process and join them.""" + self._batch_env.close() + + def _parse_shape(self, space): + """Get a tensor shape from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + Shape tuple. + """ + if isinstance(space, gym.spaces.Discrete): + return () + if isinstance(space, gym.spaces.Box): + return space.shape + raise NotImplementedError() + + def _parse_dtype(self, space): + """Get a tensor dtype from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + TensorFlow data type. + """ + if isinstance(space, gym.spaces.Discrete): + return tf.int32 + if isinstance(space, gym.spaces.Box): + return tf.float32 + raise NotImplementedError() diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py new file mode 100644 index 000000000..8171fbe17 --- /dev/null +++ b/tensor2tensor/rl/envs/utils.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Utilities for using batched environments."""
+
+# The code was based on Danijar Hafner's code from tf.agents:
+# https://github.com/tensorflow/agents/blob/master/agents/tools/wrappers.py
+# https://github.com/tensorflow/agents/blob/master/agents/scripts/utility.py
+
+import atexit
+import multiprocessing
+import sys
+import traceback
+
+# Dependency imports
+
+from tensor2tensor.rl.envs import batch_env
+from tensor2tensor.rl.envs import in_graph_batch_env
+import tensorflow as tf
+
+
+class ExternalProcessEnv(object):
+  """Step environment in a separate process for lock-free parallelism."""
+
+  # Message types for communication via the pipe.
+  _ACCESS = 1
+  _CALL = 2
+  _RESULT = 3
+  _EXCEPTION = 4
+  _CLOSE = 5
+
+  def __init__(self, constructor):
+    """Step environment in a separate process for lock-free parallelism.
+
+    The environment will be created in the external process by calling the
+    specified callable. This can be an environment class, or a function
+    creating the environment and potentially wrapping it. The returned
+    environment should not access global variables.
+
+    Args:
+      constructor: Callable that creates and returns an OpenAI gym environment.
+
+    Attributes:
+      observation_space: The cached observation space of the environment.
+      action_space: The cached action space of the environment.
+    """
+    self._conn, conn = multiprocessing.Pipe()
+    self._process = multiprocessing.Process(
+        target=self._worker, args=(constructor, conn))
+    atexit.register(self.close)
+    self._process.start()
+    self._observ_space = None
+    self._action_space = None
+
+  @property
+  def observation_space(self):
+    if not self._observ_space:
+      self._observ_space = self.__getattr__('observation_space')
+    return self._observ_space
+
+  @property
+  def action_space(self):
+    if not self._action_space:
+      self._action_space = self.__getattr__('action_space')
+    return self._action_space
+
+  def __getattr__(self, name):
+    """Request an attribute from the environment.
+
+    Note that this involves communication with the external process, so it can
+    be slow.
+
+    Args:
+      name: Attribute to access.
+
+    Returns:
+      Value of the attribute.
+    """
+    self._conn.send((self._ACCESS, name))
+    return self._receive()
+
+  def call(self, name, *args, **kwargs):
+    """Asynchronously call a method of the external environment.
+
+    Args:
+      name: Name of the method to call.
+      *args: Positional arguments to forward to the method.
+      **kwargs: Keyword arguments to forward to the method.
+
+    Returns:
+      Promise object that blocks and provides the return value when called.
+    """
+    payload = name, args, kwargs
+    self._conn.send((self._CALL, payload))
+    return self._receive
+
+  def close(self):
+    """Send a close message to the external process and join it."""
+    try:
+      self._conn.send((self._CLOSE, None))
+      self._conn.close()
+    except IOError:
+      # The connection was already closed.
+      pass
+    self._process.join()
+
+  def step(self, action, blocking=True):
+    """Step the environment.
+
+    Args:
+      action: The action to apply to the environment.
+      blocking: Whether to wait for the result.
+
+    Returns:
+      Transition tuple when blocking, otherwise callable that returns the
+      transition tuple.
+    """
+    promise = self.call('step', action)
+    if blocking:
+      return promise()
+    else:
+      return promise
+
+  def reset(self, blocking=True):
+    """Reset the environment.
+
+    Args:
+      blocking: Whether to wait for the result.
+
+    Returns:
+      New observation when blocking, otherwise callable that returns the new
+      observation.
+ """ + promise = self.call('reset') + if blocking: + return promise() + else: + return promise + + def _receive(self): + """Wait for a message from the worker process and return its payload. + + Raises: + Exception: An exception was raised inside the worker process. + KeyError: The reveived message is of an unknown type. + + Returns: + Payload object of the message. + """ + message, payload = self._conn.recv() + # Re-raise exceptions in the main process. + if message == self._EXCEPTION: + stacktrace = payload + raise Exception(stacktrace) + if message == self._RESULT: + return payload + raise KeyError('Received message of unexpected type {}'.format(message)) + + def _worker(self, constructor, conn): + """The process waits for actions and sends back environment results. + + Args: + constructor: Constructor for the OpenAI Gym environment. + conn: Connection for communication to the main process. + """ + try: + env = constructor() + while True: + try: + # Only block for short times to have keyboard exceptions be raised. + if not conn.poll(0.1): + continue + message, payload = conn.recv() + except (EOFError, KeyboardInterrupt): + break + if message == self._ACCESS: + name = payload + result = getattr(env, name) + conn.send((self._RESULT, result)) + continue + if message == self._CALL: + name, args, kwargs = payload + result = getattr(env, name)(*args, **kwargs) + conn.send((self._RESULT, result)) + continue + if message == self._CLOSE: + assert payload is None + break + raise KeyError('Received message of unknown type {}'.format(message)) + except Exception: # pylint: disable=broad-except + stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) + tf.logging.error('Error in environment process: {}'.format(stacktrace)) + conn.send((self._EXCEPTION, stacktrace)) + conn.close() + + +def define_batch_env(constructor, num_agents, env_processes=True): + """Create environments and apply all desired wrappers. + + Args: + constructor: Constructor of an OpenAI gym environment. + num_agents: Number of environments to combine in the batch. + env_processes: Whether to step environment in external processes. + + Returns: + In-graph environments object. + """ + with tf.variable_scope('environments'): + if env_processes: + envs = [ + ExternalProcessEnv(constructor) + for _ in range(num_agents)] + else: + envs = [constructor() for _ in range(num_agents)] + env = batch_env.BatchEnv(envs, blocking=not env_processes) + env = in_graph_batch_env.InGraphBatchEnv(env) + return env diff --git a/tensor2tensor/rl/networks.py b/tensor2tensor/rl/networks.py new file mode 100644 index 000000000..4ad7c5020 --- /dev/null +++ b/tensor2tensor/rl/networks.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Neural networks for actor-critic algorithms.""" + +import collections +import functools +import operator + +# Dependency imports + +import gym +import tensorflow as tf + + +NetworkOutput = collections.namedtuple( + 'NetworkOutput', 'policy, value, action_postprocessing') + + +def feed_forward_gaussian_fun(observation_space, action_space, config, + observations): + """Feed-forward gaussian.""" + assert isinstance(observation_space, gym.spaces.box.Box) + + mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer( + factor=config.init_mean_factor) + logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10) + + flat_observations = tf.reshape(observations, [ + tf.shape(observations)[0], tf.shape(observations)[1], + functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) + + with tf.variable_scope('policy'): + x = flat_observations + for size in config.policy_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + mean = tf.contrib.layers.fully_connected( + x, action_space.shape[0], tf.tanh, + weights_initializer=mean_weights_initializer) + logstd = tf.get_variable( + 'logstd', mean.shape[2:], tf.float32, logstd_initializer) + logstd = tf.tile( + logstd[None, None], + [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2)) + with tf.variable_scope('value'): + x = flat_observations + for size in config.value_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0] + mean = tf.check_numerics(mean, 'mean') + logstd = tf.check_numerics(logstd, 'logstd') + value = tf.check_numerics(value, 'value') + + policy = tf.contrib.distributions.MultivariateNormalDiag(mean, + tf.exp(logstd)) + + return NetworkOutput(policy, value, lambda a: tf.clip_by_value(a, -2., 2)) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py new file mode 100644 index 000000000..a2b34c797 --- /dev/null +++ b/tensor2tensor/rl/ppo.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PPO algorithm implementation. 
+ +Based on: https://arxiv.org/abs/1707.06347 +""" + + +import tensorflow as tf + + +def define_ppo_step(observation, action, reward, done, value, old_pdf, + policy_factory, config): + """A step of PPO.""" + new_policy_dist, new_value, _ = policy_factory(observation) + new_pdf = new_policy_dist.prob(action) + + ratio = new_pdf/old_pdf + clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef, + 1 + config.clipping_coef) + + advantage = calculate_discounted_return( + reward, value, done, config.gae_gamma, config.gae_lambda) - value + + advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], + keep_dims=True) + advantage_normalized = tf.stop_gradient( + (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) + + surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized, + ratio * advantage_normalized) + policy_loss = -tf.reduce_mean(surrogate_objective) + + value_error = calculate_discounted_return( + reward, new_value, done, config.gae_gamma, config.gae_lambda) - value + value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2) + + entropy = new_policy_dist.entropy() + entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy) + + total_loss = policy_loss + value_loss + entropy_loss + + optimization_op = config.optimizer(config.learning_rate).minimize(total_loss) + + with tf.control_dependencies([optimization_op]): + return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)] + + +def define_ppo_epoch(memory, policy_factory, config): + """An epoch of PPO.""" + observation, reward, done, action, old_pdf, value = memory + + # This is to avoid propagating gradients though simulation of simulation + observation = tf.stop_gradient(observation) + action = tf.stop_gradient(action) + reward = tf.stop_gradient(reward) + done = tf.stop_gradient(done) + value = tf.stop_gradient(value) + old_pdf = tf.stop_gradient(old_pdf) + + policy_loss, value_loss, entropy_loss = tf.scan( + lambda _1, _2: define_ppo_step( # pylint: disable=g-long-lambda + observation, action, reward, done, value, + old_pdf, policy_factory, config), + tf.range(config.optimization_epochs), + [0., 0., 0.], + parallel_iterations=1) + + print_losses = tf.group( + tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '), + tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '), + tf.Print(0, [tf.reduce_mean(entropy_loss)], 'entropy loss: ')) + + return print_losses + + +def calculate_discounted_return(reward, value, done, discount, unused_lambda): + """Discounted Monte-Carlo returns.""" + done = tf.cast(done, tf.float32) + reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :] + reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0) + return_ = tf.reverse(tf.scan( + lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg, # fn + [tf.reverse(reward, [0]), # elem + tf.reverse(done, [0])], + tf.zeros_like(reward[0, :]), # initializer + 1, + False), [0]) + return tf.check_numerics(return_, 'return') diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py new file mode 100644 index 000000000..ced6da342 --- /dev/null +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Training of RL agent with PPO algorithm.""" + +from __future__ import absolute_import + +import functools + +# Dependency imports + +import gym +from tensor2tensor.rl import collect +from tensor2tensor.rl import networks +from tensor2tensor.rl import ppo +from tensor2tensor.rl.envs import utils + +import tensorflow as tf + + +def define_train(policy_lambda, env_lambda, config): + """Define the training setup.""" + env = env_lambda() + action_space = env.action_space + observation_space = env.observation_space + + batch_env = utils.define_batch_env(env_lambda, config.num_agents) + + policy_factory = tf.make_template( + "network", + functools.partial(policy_lambda, observation_space, + action_space, config)) + + (collect_op, memory) = collect.define_collect( + policy_factory, batch_env, config) + + with tf.control_dependencies([collect_op]): + ppo_op = ppo.define_ppo_epoch(memory, policy_factory, config) + + return ppo_op + + +def train(params): + policy_lambda, env_lambda, config = params + ppo_op = define_train(policy_lambda, env_lambda, config) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for _ in range(config.epochs_num): + sess.run(ppo_op) + + +def example_params(): + """Example hyperparameters.""" + config = tf.contrib.training.HParams( + init_mean_factor=0.1, + init_logstd=0.1, + policy_layers=(100, 100), + value_layers=(100, 100), + num_agents=30, + clipping_coef=0.2, + gae_gamma=0.99, + gae_lambda=0.95, + entropy_loss_coef=0.01, + value_loss_coef=1, + optimizer=tf.train.AdamOptimizer, + learning_rate=1e-4, + optimization_epochs=15, + epoch_length=200, + epochs_num=2000) + return networks.feed_forward_gaussian_fun, pendulum_lambda, config + + +def pendulum_lambda(): + return gym.make("Pendulum-v0") diff --git a/tensor2tensor/rl/rl_trainer_lib_test.py b/tensor2tensor/rl/rl_trainer_lib_test.py new file mode 100644 index 000000000..1a276ef39 --- /dev/null +++ b/tensor2tensor/rl/rl_trainer_lib_test.py @@ -0,0 +1,34 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
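
rl_trainer_lib.py above packages the whole pipeline behind `train()` and `example_params()`; the test that follows exercises it the same way. A short smoke-run sketch (assumes gym's Pendulum-v0, as in `pendulum_lambda`):

```python
from tensor2tensor.rl import rl_trainer_lib

policy_fun, env_fun, config = rl_trainer_lib.example_params()
config.epochs_num = 10  # the default schedule runs 2000 epochs
rl_trainer_lib.train((policy_fun, env_fun, config))
```
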
+
+"""Tests of basic flow of collecting trajectories and training PPO."""
+
+# Dependency imports
+
+from tensor2tensor.rl import rl_trainer_lib
+
+import tensorflow as tf
+
+
+class TrainTest(tf.test.TestCase):
+
+  def test_no_crash_pendulum(self):
+    params = rl_trainer_lib.example_params()
+    params[2].epochs_num = 10
+    rl_trainer_lib.train(params)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py
new file mode 100644
index 000000000..875c28567
--- /dev/null
+++ b/tensor2tensor/rl/t2t_rl_trainer.py
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training of RL agent with PPO algorithm."""
+
+# Dependency imports
+
+from tensor2tensor.rl import rl_trainer_lib
+
+import tensorflow as tf
+
+
+def main(_):
+  rl_trainer_lib.train(rl_trainer_lib.example_params())
+
+
+if __name__ == "__main__":
+  tf.app.run()
From c03fd19df53eabe75c2d7aac86431cd4ba833918 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 8 Feb 2018 11:16:13 -0800
Subject: [PATCH 28/31] Support hyperparameter tuning on Cloud ML Engine
 (reapply)

PiperOrigin-RevId: 185022560
---
 tensor2tensor/bin/t2t_trainer.py           |  55 ++++++++++-
 tensor2tensor/layers/common_hparams.py     | 104 +++++++++++----------
 tensor2tensor/models/slicenet.py           |   4 -
 tensor2tensor/models/transformer.py        |  15 +--
 tensor2tensor/models/transformer_sketch.py |   5 -
 tensor2tensor/utils/cloud_mlengine.py      |  74 +++++++++++----
 tensor2tensor/utils/trainer_lib.py         |   4 +
 7 files changed, 167 insertions(+), 94 deletions(-)

diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
index 051c2e90d..469734883 100644
--- a/tensor2tensor/bin/t2t_trainer.py
+++ b/tensor2tensor/bin/t2t_trainer.py
@@ -93,6 +93,22 @@
                     "GPUs on machine type. See documentation: "
                     "https://cloud.google.com/ml-engine/reference/rest/v1/"
                     "projects.jobs#traininginput")
+# Hyperparameter tuning on Cloud ML Engine
+# Pass an --hparams_range to enable
+flags.DEFINE_string("autotune_objective", None,
+                    "TensorBoard metric name to optimize.")
+flags.DEFINE_bool("autotune_maximize", True,
+                  "Whether to maximize (vs. minimize) autotune_objective.")
+flags.DEFINE_integer("autotune_max_trials", 10,
+                     "Maximum number of tuning experiments to run.")
+flags.DEFINE_integer("autotune_parallel_trials", 1,
+                     "How many trials to run in parallel (will spin up this "
+                     "many jobs).")
+# Note that in open-source TensorFlow, the dash gets converted to an
+# underscore, so access is FLAGS.job_dir.
+flags.DEFINE_string("job-dir", None,
+                    "DO NOT USE. Exists only for Cloud ML Engine to pass in "
+                    "during hyperparameter tuning. Overrides --output_dir.")


 def get_problem_name():
@@ -101,6 +117,33 @@ def get_problem_name():
   return problems[0]


+def set_hparams_from_args(args):
+  """Set hparams overrides from unparsed args list."""
+  if not args:
+    return
+
+  hp_prefix = "--hp_"
+  tf.logging.info("Found unparsed command-line arguments.
Checking if any " + "start with %s and interpreting those as hparams " + "settings.", hp_prefix) + + pairs = [] + i = 0 + while i < len(args): + arg = args[i] + if arg.startswith(hp_prefix): + pairs.append((arg.lstrip(hp_prefix), args[i+1])) + i += 2 + else: + tf.logging.warn("Found unknown flag: %s", arg) + i += 1 + + as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) + if FLAGS.hparams: + as_hparams = "," + as_hparams + FLAGS.hparams += as_hparams + + def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -265,21 +308,23 @@ def maybe_cloud_tpu(): yield -def main(_): +def main(argv): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - assert not FLAGS.cloud_tpu - assert FLAGS.output_dir.startswith("gs://") - assert FLAGS.data_dir.startswith("gs://") - return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + return cloud_mlengine.launch() if FLAGS.generate_data: generate_data() + if hasattr(FLAGS, "job_dir") and FLAGS.job_dir: + FLAGS.output_dir = FLAGS.job_dir + + if argv: + set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 02a5df2f3..147757b47 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,7 +21,6 @@ # Dependency imports -import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -224,10 +223,15 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 + SCALES_STR = { + LINEAR_SCALE: "UNIT_LINEAR_SCALE", + LOG_SCALE: "UNIT_LOG_SCALE", + REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", + } + def __init__(self): self._categorical_params = {} self._discrete_params = {} - self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -237,10 +241,12 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [(self._categorical_params, - "categorical"), (self._discrete_params, "discrete"), - (self._float_params, "float"), (self._int_params, "int"), - (self._discrete_float_params, "discrete_float")] + ctr_names = [ + (self._categorical_params, "categorical"), + (self._discrete_params, "discrete"), + (self._float_params, "float"), + (self._int_params, "int"), + ] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -263,23 +269,17 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): - if name in self._discrete_float_params: - del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) - def set_discrete_float(self, name, val): - self._check_reset_and_type_change(name, self._discrete_float_params) - self._discrete_float_params[name] = (name, [val]) - def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - 
self._categorical_params, self._discrete_params, - self._discrete_float_params, self._float_params, self._int_params + self._categorical_params, self._discrete_params, self._float_params, + self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,52 +287,56 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) + def to_parameter_specs(self, name_prefix=""): + """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" + specs = [] + for name, categories, _ in self._categorical_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "CATEGORICAL", + "categoricalValues": categories, + } + specs.append(spec) -def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): - """Fill ranged_hparams with singleton values from hparams. + for name, feasible_points, scale, _ in self._discrete_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DISCRETE", + "discreteValues": feasible_points, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - HParams are placed in RangedHParams with the following functions, according to - type: - * int: set_discrete - * bool: set_discrete - * float: set_discrete_float - * str: set_categorical + for name, min_val, max_val, scale, _ in self._float_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DOUBLE", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Args: - hparams: tf.contrib.training.HParams; contains the hyperparameters to copy - over to ranged_hparams. - ranged_hparams: RangedHParams; will have hparams values copied to it. + for name, min_val, max_val, scale, _ in self._int_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "INTEGER", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Raises: - ValueError: if hparams contains a hyperparameter not of type - {int, float, str, bool}. - """ - for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access - - if is_multivalent: - raise ValueError("Multivalent hparams not supported in RangedHParams. " - "Hyperparameter %s is multivalent." 
% name) - val = getattr(hparams, name) - if hp_type == int: - ranged_hparams.set_discrete(name, [val]) - elif hp_type == bool: - ranged_hparams.set_discrete(name, [int(val)]) - elif hp_type == float: - ranged_hparams.set_discrete_float(name, val) - elif hp_type == str: - ranged_hparams.set_categorical(name, [val]) - else: - raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + return specs @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams - - hparams = basic_params1() - fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index e77412513..43cfa571e 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,10 +381,6 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams - - hparams = slicenet_params1() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 305a17379..df4b32277 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,11 +1228,9 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams("transformer_base") +@registry.register_ranged_hparams def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_base() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1337,8 +1335,6 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tiny_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1346,8 +1342,6 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1358,13 +1352,6 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) -@registry.register_ranged_hparams -def transformer_tpu_batch_range(rhp): - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [256, 512, 768, 1024]) - - @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
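
With `to_parameter_specs` restored in common_hparams.py above, a RangedHParams set translates directly into the Cloud ML Engine study format; cloud_mlengine.py (re-added below) calls it through `autotune_paramspecs` with the `hp_` prefix. A small sketch of the translation, with illustrative values:

```python
from tensor2tensor.layers import common_hparams

rhp = common_hparams.RangedHParams()
rhp.set_discrete("batch_size", [1024, 2048, 4096])
rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)

specs = rhp.to_parameter_specs(name_prefix="hp_")
# specs now holds (in some order):
# {"parameterName": "hp_batch_size", "type": "DISCRETE",
#  "discreteValues": [1024, 2048, 4096]}
# {"parameterName": "hp_learning_rate", "type": "DOUBLE",
#  "minValue": 0.3, "maxValue": 3.0, "scaleType": "UNIT_LOG_SCALE"}
```
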
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py
index 913243f00..e8fe796a8 100644
--- a/tensor2tensor/models/transformer_sketch.py
+++ b/tensor2tensor/models/transformer_sketch.py
@@ -22,7 +22,6 @@
 
 # Dependency imports
 
-from tensor2tensor.layers import common_hparams
 from tensor2tensor.layers import common_layers
 from tensor2tensor.models import transformer
 from tensor2tensor.models import transformer_vae
@@ -125,10 +124,6 @@ def transformer_sketch_6layer():
 @registry.register_ranged_hparams("transformer_sketch_ranged")
 def transformer_sketch_ranged(rhp):
   """Range of hparams for vizier."""
-
-  hparams = transformer_sketch()
-  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
-
   rhp.set_categorical("ffn_layer",
                       ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"])
   rhp.set_discrete("batch_size", [1024, 2048, 4096])
diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py
index 0bb5ddc40..82ac23a39 100644
--- a/tensor2tensor/utils/cloud_mlengine.py
+++ b/tensor2tensor/utils/cloud_mlengine.py
@@ -22,15 +22,17 @@
 from googleapiclient import discovery
 from oauth2client.client import GoogleCredentials
 
+from tensor2tensor.layers import common_hparams
 from tensor2tensor.utils import cloud_tpu as cloud
+from tensor2tensor.utils import registry
 from tensor2tensor.utils import usr_dir as usr_dir_lib
 
 import tensorflow as tf
 
+FLAGS = tf.flags.FLAGS
+
 CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/'
 
 # TODO(rsepassi):
-# * Support --autotune
-# * Add documentation clould_mlengine.md
 # * Enable multi-machine sync/async training
 
 SETUP_PY = """
@@ -44,9 +46,9 @@
 """
 
 
-def args_dict_as_args(args_dict):
-  """Convert dict to list of args suitable for passing on cmd line."""
-  args_dict = dict(args_dict)
+def flags_as_args():
+  """Convert FLAGS to list of args suitable for passing on cmd line."""
+  args_dict = dict(FLAGS.__dict__['__flags'])
   del args_dict['cloud_mlengine']
   # Configured later
   del args_dict['t2t_usr_dir']
@@ -54,6 +56,8 @@
   for name, val in args_dict.items():
     if val is None:
       continue
+    if name.startswith('autotune'):
+      continue
     args.extend(['--%s' % name, str(val)])
   return args
 
@@ -83,9 +87,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None):
   return config
 
 
-def configure_job(flags_dict):
+def configure_job():
   """Construct jobSpec for ML Engine job."""
-  train_dir = flags_dict['output_dir']
+  train_dir = FLAGS.output_dir
   assert train_dir.startswith('gs://')
   job_name = os.path.basename(train_dir)
 
@@ -93,16 +97,27 @@
   # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
   training_input = {
       'pythonModule': 'tensor2tensor.bin.t2t_trainer',
-      'args': args_dict_as_args(flags_dict),
+      'args': flags_as_args(),
       'region': cloud.default_region(),
       'runtimeVersion': '1.4',
      'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7',
+      'jobDir': train_dir,
   }
   training_input.update(
       machine_config(
-          num_gpus=flags_dict['worker_gpu'],
-          use_tpu=flags_dict['use_tpu'],
-          master_type=flags_dict['cloud_mlengine_master_type']))
+          num_gpus=FLAGS.worker_gpu,
+          use_tpu=FLAGS.use_tpu,
+          master_type=FLAGS.cloud_mlengine_master_type))
+  if FLAGS.hparams_range:
+    assert FLAGS.autotune_objective
+    tf.logging.info('Configuring hyperparameter tuning.')
+    training_input['hyperparameters'] = configure_autotune(
+        FLAGS.hparams_range,
+        FLAGS.autotune_objective,
+        FLAGS.autotune_maximize,
+        FLAGS.autotune_max_trials,
+        FLAGS.autotune_parallel_trials,
+    )
 
   if training_input['scaleTier'] == 'CUSTOM':
     assert 'masterType' in training_input
@@ -170,6 +185,26 @@ def tar_and_copy_usr_dir(usr_dir, train_dir):
   return usr_tar
 
 
+def autotune_paramspecs(hparams_range):
+  rhp = common_hparams.RangedHParams()
+  registry.ranged_hparams(hparams_range)(rhp)
+  return rhp.to_parameter_specs(name_prefix='hp_')
+
+
+def configure_autotune(hparams_range,
+                       objective,
+                       maximize=True,
+                       max_trials=10,
+                       parallel_trials=1):
+  return {
+      'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
+      'params': autotune_paramspecs(hparams_range),
+      'maxTrials': max_trials,
+      'maxParallelTrials': parallel_trials,
+      'hyperparameterMetricTag': objective,
+  }
+
+
 def configure_trainer_package(job_spec, t2t_tar):
   assert t2t_tar.startswith('gs://')
   job_spec['trainingInput']['packageUris'] = [t2t_tar]
@@ -182,18 +217,25 @@ def configure_usr_dir(job_spec, usr_tar):
   job_spec['trainingInput']['args'].extend(usr_args)
 
 
-def launch(flags_dict):
+def launch():
   """Launch t2t_trainer on Cloud ML Engine."""
-  job_spec = configure_job(flags_dict)
+  assert not FLAGS.cloud_tpu
+  assert not FLAGS.job_dir
+  assert FLAGS.output_dir.startswith('gs://')
+  assert FLAGS.data_dir.startswith('gs://')
+  assert FLAGS.worker_replicas <= 1
+  assert FLAGS.ps_replicas <= 0
+
+  job_spec = configure_job()
   job_name = job_spec['jobId']
   tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name,
                   job_spec)
   assert cloud.confirm()
-  train_dir = flags_dict['output_dir']
+  train_dir = FLAGS.output_dir
   t2t_tar = tar_and_copy_t2t(train_dir)
   configure_trainer_package(job_spec, t2t_tar)
-  if flags_dict['t2t_usr_dir']:
-    usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir)
+  if FLAGS.t2t_usr_dir:
+    usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir)
     configure_usr_dir(job_spec, usr_tar)
   launch_job(job_spec)
   tf.logging.info('Launched %s. See console to track: %s.', job_name,
diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py
index 039b06e68..0e64f2475 100644
--- a/tensor2tensor/utils/trainer_lib.py
+++ b/tensor2tensor/utils/trainer_lib.py
@@ -68,8 +68,12 @@ def create_hparams(hparams_set,
                    hparams_overrides_str="",
                    data_dir=None,
                    problem_name=None):
+  """Create HParams with data_dir and problem hparams, if kwargs provided."""
   hparams = registry.hparams(hparams_set)()
   if hparams_overrides_str:
+    tf.logging.info("Overriding hparams in %s with %s",
+                    hparams_set,
+                    hparams_overrides_str)
     hparams = hparams.parse(hparams_overrides_str)
   if data_dir:
     hparams.add_hparam("data_dir", data_dir)

From 5e8bc75510b2a98b3130f0b5965176bef860409d Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 8 Feb 2018 11:41:48 -0800
Subject: [PATCH 29/31] Shake-shake config for TPU.

PiperOrigin-RevId: 185026739
---
 tensor2tensor/models/shake_shake.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py
index 5e1680edb..31a576338 100644
--- a/tensor2tensor/models/shake_shake.py
+++ b/tensor2tensor/models/shake_shake.py
@@ -191,3 +191,11 @@ def shakeshake_big():
   hparams.layer_prepostprocess_dropout = 0.0
   hparams.hidden_size = 96
   return hparams
+
+
+@registry.register_hparams
+def shakeshake_tpu():
+  hparams = shakeshake_big()
+  hparams.learning_rate_cosine_cycle_steps = 180000
+  hparams.learning_rate = 0.6
+  return hparams

From 77781497f1ca51b0a067cd4394e44e347a11e547 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Thu, 8 Feb 2018 13:30:45 -0800
Subject: [PATCH 30/31] Revert "noam" learning-rate-scheme to use linear warmup.

Add learning_rate_schedule hparam to specify a schedule that does not have
separate warmup and decay phases.

PiperOrigin-RevId: 185042750
---
 tensor2tensor/layers/common_hparams.py |  5 +++++
 tensor2tensor/models/transformer.py    |  2 +-
 tensor2tensor/utils/optimize.py        | 21 +++++++++++++++++----
 tensor2tensor/utils/t2t_model.py       |  2 +-
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
index 147757b47..8fbd88bd2 100644
--- a/tensor2tensor/layers/common_hparams.py
+++ b/tensor2tensor/layers/common_hparams.py
@@ -63,6 +63,11 @@ def basic_params1():
       optimizer_momentum_nesterov=False,
       weight_decay=1e-6,
       weight_noise=0.0,
+      learning_rate_schedule="warmup_and_decay",
+      # If learning_rate_schedule=="warmup_and_decay", then this specifies
+      # the decay part of the schedule.
+      # The warmup is always exponential.
+      # TODO(noam): add a hyperparameter to control the warmup.
       learning_rate_decay_scheme="none",
       # decay_steps and decay_staircase for learning_rate_decay_scheme=="exp"
       learning_rate_decay_steps=5000,
diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index df4b32277..061b68ab7 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -877,7 +877,7 @@ def transformer_base_v1():
   hparams.max_length = 256
   hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
   hparams.optimizer_adam_epsilon = 1e-9
-  hparams.learning_rate_decay_scheme = "noam"
+  hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay"
   hparams.learning_rate = 0.1
   hparams.learning_rate_warmup_steps = 4000
   hparams.initializer_gain = 1.0
diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py
index a497d56bd..bc09c009d 100644
--- a/tensor2tensor/utils/optimize.py
+++ b/tensor2tensor/utils/optimize.py
@@ -168,10 +168,6 @@ def learning_rate_decay(hparams, warmup_steps=0):
         hparams.learning_rate_boundaries,
         hparams.learning_rate_multiples)
 
-  if scheme == "noam":
-    return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum(
-        (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5)
-
   if scheme == "cosine":
     cycle_steps = hparams.learning_rate_cosine_cycle_steps
     cycle_position = global_step % (2 * cycle_steps)
@@ -224,6 +220,23 @@ def learning_rate_decay_with_warmup(hparams, num_worker_replicas=1):
   return tf.where(global_step < warmup_steps, warmup, decay)
 
 
+def learning_rate_schedule(hparams, num_worker_replicas=1):
+  """Learning rate schedule based on hparams."""
+  schedule = hparams.learning_rate_schedule
+  warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps)
+  global_step = tf.to_float(tf.train.get_or_create_global_step())
+  if hparams.learning_rate_decay_scheme == "noam":
+    # backwards compatibility with previous behavior
+    schedule = "linear_warmup_rsqrt_decay"
+  if schedule == "warmup_and_decay":
+    return learning_rate_decay_with_warmup(hparams, num_worker_replicas)
+  elif schedule == "linear_warmup_rsqrt_decay":
+    return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum(
+        (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5)
+  else:
+    raise ValueError("Unrecognized learning rate schedule: %s" % schedule)
+
+
 def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None):
   """Apply weight decay and weight noise."""
   if var_list is None:
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 225c4d19b..aadf5e358 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -296,7 +296,7 @@ def optimize(self, loss, num_async_replicas=1):
     """Return a training op minimizing loss."""
     tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
     lr = self.hparams.learning_rate
-    decay_rate = optimize.learning_rate_decay_with_warmup(self.hparams)
+    decay_rate = optimize.learning_rate_schedule(self.hparams)
     lr *= decay_rate
     if self.hparams.learning_rate_minimum:
       lr_min = float(self.hparams.learning_rate_minimum)

From 290a12a8cd2c9ba94456d8112f453b473b5a0857 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 8 Feb 2018 13:57:00 -0800
Subject: [PATCH 31/31] v1.4.4

PiperOrigin-RevId: 185046879
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 567d7a392..1153dbba8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.4.3',
+    version='1.4.4',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',
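Note (illustrative, not part of the patch): the "linear_warmup_rsqrt_decay"
schedule added to optimize.py above can be sketched in plain Python without
TensorFlow. The constants below follow transformer_base_v1 (hidden_size=512,
learning_rate_warmup_steps=4000).

    # Sketch of linear_warmup_rsqrt_decay: a linear ramp during warmup, then
    # reciprocal-sqrt decay; the two branches meet at step + 1 == warmup_steps.
    def linear_warmup_rsqrt_decay(step, hidden_size=512, warmup_steps=4000.0):
        return 5000.0 * hidden_size ** -0.5 * min(
            (step + 1) * warmup_steps ** -1.5, (step + 1) ** -0.5)

    # The returned value is a multiplier, not the final learning rate:
    # t2t_model.optimize scales it by hparams.learning_rate (0.1 in
    # transformer_base_v1). For these constants it peaks at
    # 5000 / sqrt(512 * 4000) ~= 3.49 around step 4000, then decays
    # proportionally to 1 / sqrt(step).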