From b9900e5cc99bffe365a9fcab1050c6df4e3c9ce4 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 1 Feb 2018 18:01:11 -0800 Subject: [PATCH 01/31] Tune transformer ae hparams. PiperOrigin-RevId: 184224758 --- setup.py | 2 - tensor2tensor/bin/t2t-rl-trainer | 16 -- tensor2tensor/bin/t2t_rl_trainer.py | 92 -------- tensor2tensor/models/transformer_vae.py | 9 +- tensor2tensor/rl/README.md | 10 - tensor2tensor/rl/__init__.py | 0 tensor2tensor/rl/collect.py | 94 -------- tensor2tensor/rl/envs/__init__.py | 0 tensor2tensor/rl/envs/batch_env.py | 129 ----------- tensor2tensor/rl/envs/in_graph_batch_env.py | 163 -------------- tensor2tensor/rl/envs/utils.py | 225 -------------------- tensor2tensor/rl/networks.py | 65 ------ tensor2tensor/rl/ppo.py | 98 --------- tensor2tensor/rl/train_test.py | 36 ---- 14 files changed, 6 insertions(+), 933 deletions(-) delete mode 100644 tensor2tensor/bin/t2t-rl-trainer delete mode 100644 tensor2tensor/bin/t2t_rl_trainer.py delete mode 100644 tensor2tensor/rl/README.md delete mode 100644 tensor2tensor/rl/__init__.py delete mode 100644 tensor2tensor/rl/collect.py delete mode 100644 tensor2tensor/rl/envs/__init__.py delete mode 100644 tensor2tensor/rl/envs/batch_env.py delete mode 100644 tensor2tensor/rl/envs/in_graph_batch_env.py delete mode 100644 tensor2tensor/rl/envs/utils.py delete mode 100644 tensor2tensor/rl/networks.py delete mode 100644 tensor2tensor/rl/ppo.py delete mode 100644 tensor2tensor/rl/train_test.py diff --git a/setup.py b/setup.py index ee0eb0d09..1d3f14a94 100644 --- a/setup.py +++ b/setup.py @@ -36,8 +36,6 @@ 'future', 'gevent', 'gunicorn', - 'gym<=0.9.5', # gym in version 0.9.6 has some temporary issues. - 'munch', 'numpy', 'requests', 'scipy', diff --git a/tensor2tensor/bin/t2t-rl-trainer b/tensor2tensor/bin/t2t-rl-trainer deleted file mode 100644 index 06c97d2d5..000000000 --- a/tensor2tensor/bin/t2t-rl-trainer +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python -"""t2t-rl-trainer.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensor2tensor.bin import t2t_rl_trainer - -import tensorflow as tf - -def main(argv): - t2t_rl_trainer.main(argv) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tensor2tensor/bin/t2t_rl_trainer.py b/tensor2tensor/bin/t2t_rl_trainer.py deleted file mode 100644 index b53692ccc..000000000 --- a/tensor2tensor/bin/t2t_rl_trainer.py +++ /dev/null @@ -1,92 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Training of RL agent with PPO algorithm.""" - -from __future__ import absolute_import - -import functools -from munch import Munch -import tensorflow as tf - -from tensor2tensor.rl.collect import define_collect -from tensor2tensor.rl.envs.utils import define_batch_env -from tensor2tensor.rl.ppo import define_ppo_epoch - - -def define_train(policy_lambda, env_lambda, config): - env = env_lambda() - action_space = env.action_space - observation_space = env.observation_space - - batch_env = define_batch_env(env_lambda, config["num_agents"]) - - policy_factory = tf.make_template( - 'network', - functools.partial(policy_lambda, observation_space, - action_space, config)) - - (collect_op, memory) = define_collect(policy_factory, batch_env, config) - - with tf.control_dependencies([collect_op]): - ppo_op = define_ppo_epoch(memory, policy_factory, config) - - return ppo_op - - -def main(): - train(example_params()) - - -def train(params): - policy_lambda, env_lambda, config = params - ppo_op = define_train(policy_lambda, env_lambda, config) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for _ in range(config.epochs_num): - sess.run(ppo_op) - - -def example_params(): - from tensor2tensor.rl import networks - config = {} - config['init_mean_factor'] = 0.1 - config['init_logstd'] = 0.1 - config['policy_layers'] = 100, 100 - config['value_layers'] = 100, 100 - config['num_agents'] = 30 - config['clipping_coef'] = 0.2 - config['gae_gamma'] = 0.99 - config['gae_lambda'] = 0.95 - config['entropy_loss_coef'] = 0.01 - config['value_loss_coef'] = 1 - config['optimizer'] = tf.train.AdamOptimizer - config['learning_rate'] = 1e-4 - config['optimization_epochs'] = 15 - config['epoch_length'] = 200 - config['epochs_num'] = 2000 - - config = Munch(config) - return networks.feed_forward_gaussian_fun, pendulum_lambda, config - - -def pendulum_lambda(): - import gym - return gym.make("Pendulum-v0") - - -if __name__ == '__main__': - main() diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ac9a66b77..b1efb75ca 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -830,8 +830,11 @@ def transformer_ae_small(): hparams.hidden_size = 384 hparams.filter_size = 2048 hparams.label_smoothing = 0.0 - hparams.optimizer = "Adafactor" - hparams.add_hparam("z_size", 16) + hparams.optimizer = "Adafactor" # Can be unstable, maybe try Adam. + hparams.optimizer_adam_epsilon = 1e-9 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.998 # Needs tuning, try 0.98 to 0.999. + hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.0) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. @@ -840,7 +843,7 @@ def transformer_ae_small(): # Reshape method for hierarchical vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) - hparams.add_hparam("unmasked_percentage", 0.3) + hparams.add_hparam("unmasked_percentage", 0.1) hparams.add_hparam("do_ae", True) hparams.add_hparam("do_mask", True) hparams.add_hparam("do_refine", False) diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md deleted file mode 100644 index bf21ab1ad..000000000 --- a/tensor2tensor/rl/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Tensor2Tensor Reinforcement Learning starter. 
-
-The rl package provides the ability to run reinforcement learning
-algorithms within TensorFlow's computation graph.
-
-Currently the only supported algorithm is Proximal Policy Optimization (PPO).
-
-## Sample usage - training in the Pendulum-v0 environment.
-
-```t2t-rl-trainer```
diff --git a/tensor2tensor/rl/__init__.py b/tensor2tensor/rl/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py
deleted file mode 100644
index dadab4d92..000000000
--- a/tensor2tensor/rl/collect.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Collect trajectories from interactions of agent with environment."""
-
-import tensorflow as tf
-
-
-def define_collect(policy_factory, batch_env, config):
-
-  memory_shape = [config.epoch_length] + [batch_env.observ.shape.as_list()[0]]
-  memories_shapes_and_types = [
-      # observation
-      (memory_shape + [batch_env.observ.shape.as_list()[1]], tf.float32),
-      (memory_shape, tf.float32),  # reward
-      (memory_shape, tf.bool),  # done
-      (memory_shape + batch_env.action_shape, tf.float32),  # action
-      (memory_shape, tf.float32),  # pdf
-      (memory_shape, tf.float32),  # value function
-  ]
-  memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False)
-            for (shape, dtype) in memories_shapes_and_types]
-  cumulative_rewards = tf.Variable(
-      tf.zeros(config.num_agents, tf.float32), trainable=False)
-
-  should_reset_var = tf.Variable(True, trainable=False)
-  reset_op = tf.cond(should_reset_var,
-                     lambda: batch_env.reset(tf.range(config.num_agents)),
-                     lambda: 0.0)
-  with tf.control_dependencies([reset_op]):
-    reset_once_op = tf.assign(should_reset_var, False)
-
-  with tf.control_dependencies([reset_once_op]):
-
-    def step(index, scores_sum, scores_num):
-      # Note - the only way to ensure making a copy of a tensor is to run a
-      # simple operation.
We are waiting for tf.copy: - # https://github.com/tensorflow/tensorflow/issues/11186 - obs_copy = batch_env.observ + 0 - actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) - policy = actor_critic.policy - action = policy.sample() - postprocessed_action = actor_critic.action_postprocessing(action) - simulate_output = batch_env.simulate(postprocessed_action[0, ...]) - pdf = policy.prob(action)[0] - with tf.control_dependencies(simulate_output): - reward, done = simulate_output - done = tf.reshape(done, (config.num_agents,)) - to_save = [obs_copy, reward, done, action[0, ...], pdf, - actor_critic.value[0]] - save_ops = [tf.scatter_update(memory_slot, index, value) - for memory_slot, value in zip(memory, to_save)] - cumulate_rewards_op = cumulative_rewards.assign_add(reward) - agent_indicies_to_reset = tf.where(done)[:, 0] - with tf.control_dependencies([cumulate_rewards_op]): - scores_sum_delta = tf.reduce_sum( - tf.gather(cumulative_rewards, agent_indicies_to_reset)) - scores_num_delta = tf.count_nonzero(done, dtype=tf.int32) - with tf.control_dependencies(save_ops + [scores_sum_delta, - scores_num_delta]): - reset_env_op = batch_env.reset(agent_indicies_to_reset) - reset_cumulative_rewards_op = tf.scatter_update( - cumulative_rewards, agent_indicies_to_reset, - tf.zeros(tf.shape(agent_indicies_to_reset))) - with tf.control_dependencies([reset_env_op, - reset_cumulative_rewards_op]): - return [index + 1, scores_sum + scores_sum_delta, - scores_num + scores_num_delta] - - init = [tf.constant(0), tf.constant(0.0), tf.constant(0)] - index, scores_sum, scores_num = tf.while_loop( - lambda c, _1, _2: c < config.epoch_length, - step, - init, - parallel_iterations=1, - back_prop=False) - mean_score = tf.cond(tf.greater(scores_num, 0), - lambda: scores_sum / tf.cast(scores_num, tf.float32), - lambda: 0.) - printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ") - with tf.control_dependencies([printing]): - return tf.identity(index), memory diff --git a/tensor2tensor/rl/envs/__init__.py b/tensor2tensor/rl/envs/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tensor2tensor/rl/envs/batch_env.py b/tensor2tensor/rl/envs/batch_env.py deleted file mode 100644 index 30bfdce55..000000000 --- a/tensor2tensor/rl/envs/batch_env.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code was based on Danijar Hafner's code from tf.agents: -# https://github.com/tensorflow/agents/blob/master/agents/tools/batch_env.py - -"""Combine multiple environments to step them in batch.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - - -class BatchEnv(object): - """Combine multiple environments to step them in batch.""" - - def __init__(self, envs, blocking): - """Combine multiple environments to step them in batch. 
-
-    To step environments in parallel, environments must support a
-    `blocking=False` argument to their step and reset functions that makes them
-    return callables instead to receive the result at a later time.
-
-    Args:
-      envs: List of environments.
-      blocking: Step environments one after another rather than in parallel.
-
-    Raises:
-      ValueError: Environments have different observation or action spaces.
-    """
-    self._envs = envs
-    self._blocking = blocking
-    observ_space = self._envs[0].observation_space
-    if not all(env.observation_space == observ_space for env in self._envs):
-      raise ValueError('All environments must use the same observation space.')
-    action_space = self._envs[0].action_space
-    if not all(env.action_space == action_space for env in self._envs):
-      raise ValueError('All environments must use the same action space.')
-
-  def __len__(self):
-    """Number of combined environments."""
-    return len(self._envs)
-
-  def __getitem__(self, index):
-    """Access an underlying environment by index."""
-    return self._envs[index]
-
-  def __getattr__(self, name):
-    """Forward unimplemented attributes to one of the original environments.
-
-    Args:
-      name: Attribute that was accessed.
-
-    Returns:
-      Value behind the attribute name in one of the wrapped environments.
-    """
-    return getattr(self._envs[0], name)
-
-  def step(self, actions):
-    """Forward a batch of actions to the wrapped environments.
-
-    Args:
-      actions: Batched action to apply to the environment.
-
-    Raises:
-      ValueError: Invalid actions.
-
-    Returns:
-      Batch of observations, rewards, and done flags.
-    """
-    for index, (env, action) in enumerate(zip(self._envs, actions)):
-      if not env.action_space.contains(action):
-        message = 'Invalid action at index {}: {}'
-        raise ValueError(message.format(index, action))
-    if self._blocking:
-      transitions = [
-          env.step(action)
-          for env, action in zip(self._envs, actions)]
-    else:
-      transitions = [
-          env.step(action, blocking=False)
-          for env, action in zip(self._envs, actions)]
-      transitions = [transition() for transition in transitions]
-    observs, rewards, dones, infos = zip(*transitions)
-    observ = np.stack(observs).astype(np.float32)
-    reward = np.stack(rewards).astype(np.float32)
-    done = np.stack(dones)
-    info = tuple(infos)
-    return observ, reward, done, info
-
-  def reset(self, indices=None):
-    """Reset the environment and convert the resulting observation.
-
-    Args:
-      indices: The batch indices of environments to reset; defaults to all.
-
-    Returns:
-      Batch of observations.
-    """
-    if indices is None:
-      indices = np.arange(len(self._envs))
-    if self._blocking:
-      observs = [self._envs[index].reset() for index in indices]
-    else:
-      observs = [self._envs[index].reset(blocking=False) for index in indices]
-      observs = [observ() for observ in observs]
-    observ = np.stack(observs)
-    observ = observ.astype(np.float32)
-    return observ
-
-  def close(self):
-    """Send close messages to the external processes and join them."""
-    for env in self._envs:
-      if hasattr(env, 'close'):
-        env.close()
diff --git a/tensor2tensor/rl/envs/in_graph_batch_env.py b/tensor2tensor/rl/envs/in_graph_batch_env.py
deleted file mode 100644
index d0e1e4c26..000000000
--- a/tensor2tensor/rl/envs/in_graph_batch_env.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code was based on Danijar Hafner's code from tf.agents:
-# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py
-
-"""Batch of environments inside the TensorFlow graph."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gym
-import tensorflow as tf
-
-
-class InGraphBatchEnv(object):
-  """Batch of environments inside the TensorFlow graph.
-
-  The batch of environments will be stepped and reset inside of the graph using
-  a tf.py_func(). The current batch of observations, actions, rewards, and done
-  flags are held in corresponding variables.
-  """
-
-  def __init__(self, batch_env):
-    """Batch of environments inside the TensorFlow graph.
-
-    Args:
-      batch_env: Batch environment.
-    """
-    self._batch_env = batch_env
-    observ_shape = self._parse_shape(self._batch_env.observation_space)
-    observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-    self.action_shape = list(self._parse_shape(self._batch_env.action_space))
-    self.action_dtype = self._parse_dtype(self._batch_env.action_space)
-    with tf.variable_scope('env_temporary'):
-      self._observ = tf.Variable(
-          tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype),
-          name='observ', trainable=False)
-
-  def __getattr__(self, name):
-    """Forward unimplemented attributes to one of the original environments.
-
-    Args:
-      name: Attribute that was accessed.
-
-    Returns:
-      Value behind the attribute name in one of the original environments.
-    """
-    return getattr(self._batch_env, name)
-
-  def __len__(self):
-    """Number of combined environments."""
-    return len(self._batch_env)
-
-  def __getitem__(self, index):
-    """Access an underlying environment by index."""
-    return self._batch_env[index]
-
-  def simulate(self, action):
-    """Step the batch of environments.
-
-    The results of the step can be accessed from the variables defined below.
-
-    Args:
-      action: Tensor holding the batch of actions to apply.
-
-    Returns:
-      Operation.
-    """
-    with tf.name_scope('environment/simulate'):
-      if action.dtype in (tf.float16, tf.float32, tf.float64):
-        action = tf.check_numerics(action, 'action')
-      observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-      observ, reward, done = tf.py_func(
-          lambda a: self._batch_env.step(a)[:3], [action],
-          [observ_dtype, tf.float32, tf.bool], name='step')
-      observ = tf.check_numerics(observ, 'observ')
-      reward = tf.check_numerics(reward, 'reward')
-      with tf.control_dependencies([self._observ.assign(observ)]):
-        return tf.identity(reward), tf.identity(done)
-
-
-  def reset(self, indices=None):
-    """Reset the batch of environments.
-
-    Args:
-      indices: The batch indices of the environments to reset.
-
-    Returns:
-      Batch tensor of the new observations.
-    """
-    return tf.cond(
-        tf.cast(tf.shape(indices)[0], tf.bool),
-        lambda: self._reset_non_empty(indices), lambda: 0.0)
-
-  def _reset_non_empty(self, indices):
-    """Reset the batch of environments.
-
-    Args:
-      indices: The batch indices of the environments to reset; defaults to all.
-
-    Returns:
-      Batch tensor of the new observations.
-    """
-    observ_dtype = self._parse_dtype(self._batch_env.observation_space)
-    observ = tf.py_func(
-        self._batch_env.reset, [indices], observ_dtype, name='reset')
-    observ = tf.check_numerics(observ, 'observ')
-    with tf.control_dependencies([
-        tf.scatter_update(self._observ, indices, observ)]):
-      return tf.identity(observ)
-
-  @property
-  def observ(self):
-    """Access the variable holding the current observation."""
-    return self._observ
-
-  def close(self):
-    """Send close messages to the external processes and join them."""
-    self._batch_env.close()
-
-  def _parse_shape(self, space):
-    """Get a tensor shape from an OpenAI Gym space.
-
-    Args:
-      space: Gym space.
-
-    Returns:
-      Shape tuple.
-    """
-    if isinstance(space, gym.spaces.Discrete):
-      return ()
-    if isinstance(space, gym.spaces.Box):
-      return space.shape
-    raise NotImplementedError()
-
-  def _parse_dtype(self, space):
-    """Get a tensor dtype from an OpenAI Gym space.
-
-    Args:
-      space: Gym space.
-
-    Returns:
-      TensorFlow data type.
-    """
-    if isinstance(space, gym.spaces.Discrete):
-      return tf.int32
-    if isinstance(space, gym.spaces.Box):
-      return tf.float32
-    raise NotImplementedError()
diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py
deleted file mode 100644
index 2b81af270..000000000
--- a/tensor2tensor/rl/envs/utils.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code was based on Danijar Hafner's code from tf.agents:
-# https://github.com/tensorflow/agents/blob/master/agents/tools/wrappers.py
-# https://github.com/tensorflow/agents/blob/master/agents/scripts/utility.py
-
-"""Utilities for using batched environments."""
-
-import atexit
-import multiprocessing
-import sys
-import traceback
-import tensorflow as tf
-
-from tensor2tensor.rl.envs import batch_env
-from tensor2tensor.rl.envs import in_graph_batch_env
-
-class ExternalProcessEnv(object):
-  """Step environment in a separate process for lock-free parallelism."""
-
-  # Message types for communication via the pipe.
-  _ACCESS = 1
-  _CALL = 2
-  _RESULT = 3
-  _EXCEPTION = 4
-  _CLOSE = 5
-
-  def __init__(self, constructor):
-    """Step environment in a separate process for lock-free parallelism.
-
-    The environment will be created in the external process by calling the
-    specified callable. This can be an environment class, or a function
-    creating the environment and potentially wrapping it. The returned
-    environment should not access global variables.
-
-    Args:
-      constructor: Callable that creates and returns an OpenAI gym environment.
-
-    Attributes:
-      observation_space: The cached observation space of the environment.
-      action_space: The cached action space of the environment.
-    """
-    self._conn, conn = multiprocessing.Pipe()
-    self._process = multiprocessing.Process(
-        target=self._worker, args=(constructor, conn))
-    atexit.register(self.close)
-    self._process.start()
-    self._observ_space = None
-    self._action_space = None
-
-  @property
-  def observation_space(self):
-    if not self._observ_space:
-      self._observ_space = self.__getattr__('observation_space')
-    return self._observ_space
-
-  @property
-  def action_space(self):
-    if not self._action_space:
-      self._action_space = self.__getattr__('action_space')
-    return self._action_space
-
-  def __getattr__(self, name):
-    """Request an attribute from the environment.
-
-    Note that this involves communication with the external process, so it can
-    be slow.
-
-    Args:
-      name: Attribute to access.
-
-    Returns:
-      Value of the attribute.
-    """
-    self._conn.send((self._ACCESS, name))
-    return self._receive()
-
-  def call(self, name, *args, **kwargs):
-    """Asynchronously call a method of the external environment.
-
-    Args:
-      name: Name of the method to call.
-      *args: Positional arguments to forward to the method.
-      **kwargs: Keyword arguments to forward to the method.
-
-    Returns:
-      Promise object that blocks and provides the return value when called.
-    """
-    payload = name, args, kwargs
-    self._conn.send((self._CALL, payload))
-    return self._receive
-
-  def close(self):
-    """Send a close message to the external process and join it."""
-    try:
-      self._conn.send((self._CLOSE, None))
-      self._conn.close()
-    except IOError:
-      # The connection was already closed.
-      pass
-    self._process.join()
-
-  def step(self, action, blocking=True):
-    """Step the environment.
-
-    Args:
-      action: The action to apply to the environment.
-      blocking: Whether to wait for the result.
-
-    Returns:
-      Transition tuple when blocking, otherwise callable that returns the
-      transition tuple.
-    """
-    promise = self.call('step', action)
-    if blocking:
-      return promise()
-    else:
-      return promise
-
-  def reset(self, blocking=True):
-    """Reset the environment.
-
-    Args:
-      blocking: Whether to wait for the result.
-
-    Returns:
-      New observation when blocking, otherwise callable that returns the new
-      observation.
-    """
-    promise = self.call('reset')
-    if blocking:
-      return promise()
-    else:
-      return promise
-
-  def _receive(self):
-    """Wait for a message from the worker process and return its payload.
-
-    Raises:
-      Exception: An exception was raised inside the worker process.
-      KeyError: The received message is of an unknown type.
-
-    Returns:
-      Payload object of the message.
-    """
-    message, payload = self._conn.recv()
-    # Re-raise exceptions in the main process.
-    if message == self._EXCEPTION:
-      stacktrace = payload
-      raise Exception(stacktrace)
-    if message == self._RESULT:
-      return payload
-    raise KeyError('Received message of unexpected type {}'.format(message))
-
-  def _worker(self, constructor, conn):
-    """The process waits for actions and sends back environment results.
-
-    Args:
-      constructor: Constructor for the OpenAI Gym environment.
-      conn: Connection for communication to the main process.
-    """
-    try:
-      env = constructor()
-      while True:
-        try:
-          # Only block for short times to have keyboard exceptions be raised.
- if not conn.poll(0.1): - continue - message, payload = conn.recv() - except (EOFError, KeyboardInterrupt): - break - if message == self._ACCESS: - name = payload - result = getattr(env, name) - conn.send((self._RESULT, result)) - continue - if message == self._CALL: - name, args, kwargs = payload - result = getattr(env, name)(*args, **kwargs) - conn.send((self._RESULT, result)) - continue - if message == self._CLOSE: - assert payload is None - break - raise KeyError('Received message of unknown type {}'.format(message)) - except Exception: # pylint: disable=broad-except - stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) - tf.logging.error('Error in environment process: {}'.format(stacktrace)) - conn.send((self._EXCEPTION, stacktrace)) - conn.close() - -def define_batch_env(constructor, num_agents, env_processes=True): - """Create environments and apply all desired wrappers. - - Args: - constructor: Constructor of an OpenAI gym environment. - num_agents: Number of environments to combine in the batch. - env_processes: Whether to step environment in external processes. - - Returns: - In-graph environments object. - """ - with tf.variable_scope('environments'): - if env_processes: - envs = [ - ExternalProcessEnv(constructor) - for _ in range(num_agents)] - else: - envs = [constructor() for _ in range(num_agents)] - env = batch_env.BatchEnv(envs, blocking=not env_processes) - env = in_graph_batch_env.InGraphBatchEnv(env) - return env diff --git a/tensor2tensor/rl/networks.py b/tensor2tensor/rl/networks.py deleted file mode 100644 index af8709191..000000000 --- a/tensor2tensor/rl/networks.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Neural networks for actor-critic algorithms.""" - -import operator -import functools -import collections -import tensorflow as tf -import gym - - -NetworkOutput = collections.namedtuple( - 'NetworkOutput', 'policy, value, action_postprocessing') - - -def feed_forward_gaussian_fun(observation_space, action_space, config, - observations): - assert isinstance(observation_space, gym.spaces.box.Box) - - mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer( - factor=config.init_mean_factor) - logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10) - - flat_observations = tf.reshape(observations, [ - tf.shape(observations)[0], tf.shape(observations)[1], - functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) - - with tf.variable_scope('policy'): - x = flat_observations - for size in config.policy_layers: - x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) - mean = tf.contrib.layers.fully_connected( - x, action_space.shape[0], tf.tanh, - weights_initializer=mean_weights_initializer) - logstd = tf.get_variable( - 'logstd', mean.shape[2:], tf.float32, logstd_initializer) - logstd = tf.tile( - logstd[None, None], - [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2)) - with tf.variable_scope('value'): - x = flat_observations - for size in config.value_layers: - x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) - value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0] - mean = tf.check_numerics(mean, 'mean') - logstd = tf.check_numerics(logstd, 'logstd') - value = tf.check_numerics(value, 'value') - - policy = tf.contrib.distributions.MultivariateNormalDiag(mean, - tf.exp(logstd)) - - return NetworkOutput(policy, value, lambda a: tf.clip_by_value(a, -2., 2)) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py deleted file mode 100644 index 1c9654608..000000000 --- a/tensor2tensor/rl/ppo.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PPO algorithm implementation. 
-
-Based on: https://arxiv.org/abs/1707.06347
-"""
-
-import tensorflow as tf
-
-def define_ppo_step(observation, action, reward, done, value, old_pdf,
                    policy_factory, config):
-
-  new_policy_dist, new_value, _ = policy_factory(observation)
-  new_pdf = new_policy_dist.prob(action)
-
-  ratio = new_pdf/old_pdf
-  clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef,
-                                   1 + config.clipping_coef)
-
-  advantage = calculate_discounted_return(
-      reward, value, done, config.gae_gamma, config.gae_lambda) - value
-
-  advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1],
-                                                     keep_dims=True)
-  advantage_normalized = tf.stop_gradient(
-      (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8))
-
-  surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized,
-                                   ratio * advantage_normalized)
-  policy_loss = -tf.reduce_mean(surrogate_objective)
-
-  value_error = calculate_discounted_return(
-      reward, new_value, done, config.gae_gamma, config.gae_lambda) - value
-  value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2)
-
-  entropy = new_policy_dist.entropy()
-  entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)
-
-  total_loss = policy_loss + value_loss + entropy_loss
-
-  optimization_op = config.optimizer(config.learning_rate).minimize(total_loss)
-
-  with tf.control_dependencies([optimization_op]):
-    return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)]
-
-
-def define_ppo_epoch(memory, policy_factory, config):
-  observation, reward, done, action, old_pdf, value = memory
-
-  # This is to avoid propagating gradients through the simulation.
-  observation = tf.stop_gradient(observation)
-  action = tf.stop_gradient(action)
-  reward = tf.stop_gradient(reward)
-  done = tf.stop_gradient(done)
-  value = tf.stop_gradient(value)
-  old_pdf = tf.stop_gradient(old_pdf)
-
-  policy_loss, value_loss, entropy_loss = tf.scan(
-      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
-                                     old_pdf, policy_factory, config),
-      tf.range(config.optimization_epochs),
-      [0., 0., 0.],
-      parallel_iterations=1)
-
-  print_losses = tf.group(
-      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '),
-      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
-      tf.Print(0, [tf.reduce_mean(entropy_loss)], 'entropy loss: '))
-
-  return print_losses
-
-
-def calculate_discounted_return(reward, value, done, discount, unused_lambda):
-  """Discounted Monte-Carlo returns."""
-  done = tf.cast(done, tf.float32)
-  reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :]
-  reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0)
-  return_ = tf.reverse(tf.scan(
-      lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg,  # fn
-      [tf.reverse(reward, [0]),  # elem
-       tf.reverse(done, [0])],
-      tf.zeros_like(reward[0, :]),  # initializer
-      1,
-      False), [0])
-  return tf.check_numerics(return_, 'return')
diff --git a/tensor2tensor/rl/train_test.py b/tensor2tensor/rl/train_test.py
deleted file mode 100644
index ac14c2083..000000000
--- a/tensor2tensor/rl/train_test.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Tensor2Tensor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests of basic flow of collecting trajectories and training PPO.""" - -import tensorflow as tf - -from tensor2tensor.bin import t2t_rl_trainer - - -FLAGS = tf.app.flags.FLAGS - - -class TrainTest(tf.test.TestCase): - - def test_no_crash_pendulum(self): - params = t2t_rl_trainer.example_params() - params[2].epochs_num = 10 - t2t_rl_trainer.train(params) - - -if __name__ == '__main__': - FLAGS.config = 'unused' - tf.test.main() From 12038533d25e979227d79cea88c74a198be263f7 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Fri, 2 Feb 2018 00:03:52 -0800 Subject: [PATCH 02/31] Implement fast decoding of transformer for language modeling problems (no inputs, partial targets). PiperOrigin-RevId: 184248180 --- tensor2tensor/models/transformer.py | 114 +++++++++++++++-------- tensor2tensor/models/transformer_test.py | 39 +++++++- 2 files changed, 115 insertions(+), 38 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index b241cc24a..3aeb7a790 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -146,12 +146,13 @@ def body(self, features): """ hparams = self._hparams - inputs = features.get("inputs") - encoder_output, encoder_decoder_attention_bias = (None, None) - if inputs is not None: + if self.has_input: + inputs = features["inputs"] target_space = features["target_space_id"] encoder_output, encoder_decoder_attention_bias = self.encode( inputs, target_space, hparams, features=features) + else: + encoder_output, encoder_decoder_attention_bias = (None, None) targets = features["targets"] targets = common_layers.flatten4d3d(targets) @@ -245,31 +246,44 @@ def _fast_decode(self, raise NotImplementedError("Fast decoding only supports a single shard.") dp = self._data_parallelism hparams = self._hparams - - inputs = features["inputs"] target_modality = self._problem_hparams.target_modality - if target_modality.is_class_modality: - decode_length = 1 + + if self.has_input: + inputs = features["inputs"] + if target_modality.is_class_modality: + decode_length = 1 + else: + decode_length = common_layers.shape_list(inputs)[1] + decode_length + + # TODO(llion): Clean up this reshaping logic. 
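+      # The net effect of the reshaping below is to normalize inputs to a
+      # 4-D [batch, length, 1, channels] tensor for the input modality,
+      # folding any extra leading dimension into the batch dimension;
+      # batch_size is recorded beforehand so the decoding loop can be sized.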
+ inputs = tf.expand_dims(inputs, axis=1) + if len(inputs.shape) < 5: + inputs = tf.expand_dims(inputs, axis=4) + s = common_layers.shape_list(inputs) + batch_size = s[0] + inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]]) + # _shard_features called to ensure that the variable names match + inputs = self._shard_features({"inputs": inputs})["inputs"] + input_modality = self._problem_hparams.input_modality["inputs"] + with tf.variable_scope(input_modality.name): + inputs = input_modality.bottom_sharded(inputs, dp) + with tf.variable_scope("body"): + encoder_output, encoder_decoder_attention_bias = dp( + self.encode, inputs, features["target_space_id"], hparams, + features=features) + encoder_output = encoder_output[0] + encoder_decoder_attention_bias = encoder_decoder_attention_bias[0] + partial_targets = None else: - decode_length = common_layers.shape_list(inputs)[1] + decode_length - - # TODO(llion): Clean up this reshaping logic. - inputs = tf.expand_dims(inputs, axis=1) - if len(inputs.shape) < 5: - inputs = tf.expand_dims(inputs, axis=4) - s = common_layers.shape_list(inputs) - inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]]) - # _shard_features called to ensure that the variable names match - inputs = self._shard_features({"inputs": inputs})["inputs"] - input_modality = self._problem_hparams.input_modality["inputs"] - with tf.variable_scope(input_modality.name): - inputs = input_modality.bottom_sharded(inputs, dp) - with tf.variable_scope("body"): - encoder_output, encoder_decoder_attention_bias = dp( - self.encode, inputs, features["target_space_id"], hparams, - features=features) - encoder_output = encoder_output[0] - encoder_decoder_attention_bias = encoder_decoder_attention_bias[0] + # The problem has no inputs. + # In this case, features["inputs"] contains partial targets. + # We force the outputs to begin with these sequences. + encoder_output = None + encoder_decoder_attention_bias = None + partial_targets = tf.squeeze(tf.to_int64(features["inputs"]), [2, 3]) + partial_targets_length = common_layers.shape_list(partial_targets)[1] + decode_length += partial_targets_length + batch_size = tf.shape(partial_targets)[0] if hparams.pos == "timing": timing_signal = common_attention.get_timing_signal_1d( @@ -320,16 +334,30 @@ def symbols_to_logits_fn(ids, i, cache): with tf.variable_scope("body"): body_outputs = dp( - self.decode, targets, cache["encoder_output"], - cache["encoder_decoder_attention_bias"], bias, hparams, cache, + self.decode, targets, cache.get("encoder_output"), + cache.get("encoder_decoder_attention_bias"), + bias, hparams, cache, nonpadding=features_to_nonpadding(features, "targets")) with tf.variable_scope(target_modality.name): logits = target_modality.top_sharded(body_outputs, None, dp)[0] - return tf.squeeze(logits, axis=[1, 2, 3]), cache - - return fast_decode( + ret = tf.squeeze(logits, axis=[1, 2, 3]) + if partial_targets is not None: + # If the position is within the given partial targets, we alter the + # logits to always return those values. + # A faster approach would be to process the partial targets in one + # iteration in order to fill the corresponding parts of the cache. + # This would require broader changes, though. 
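+        # For example, if partial_targets[:, i] is token id 7, the one-hot
+        # call below scores id 7 at 0.0 and every other id at -1e9, so both
+        # greedy argmax and beam search are forced to emit id 7 at step i,
+        # while the cache is still updated as if the model had chosen it.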
+ vocab_size = tf.shape(ret)[1] + def forced_logits(): + return tf.one_hot(tf.tile(partial_targets[:, i], [beam_size]), + vocab_size, 0.0, -1e9) + ret = tf.cond( + tf.less(i, partial_targets_length), forced_logits, lambda: ret) + return ret, cache + + ret = fast_decode( encoder_output=encoder_output, encoder_decoder_attention_bias=encoder_decoder_attention_bias, symbols_to_logits_fn=symbols_to_logits_fn, @@ -338,7 +366,11 @@ def symbols_to_logits_fn(ids, i, cache): vocab_size=target_modality.top_dimensionality, beam_size=beam_size, top_beams=top_beams, - alpha=alpha) + alpha=alpha, + batch_size=batch_size) + if partial_targets is not None: + ret["outputs"] = ret["outputs"][:, partial_targets_length:] + return ret def fast_decode(encoder_output, @@ -350,7 +382,8 @@ def fast_decode(encoder_output, beam_size=1, top_beams=1, alpha=1.0, - eos_id=beam_search.EOS_ID): + eos_id=beam_search.EOS_ID, + batch_size=None): """Given encoder output and a symbols to logits function, does fast decoding. Implements both greedy and beam search decoding, uses beam search iff @@ -370,6 +403,7 @@ def fast_decode(encoder_output, alpha: Float that controls the length penalty. larger the alpha, stronger the preference for slonger translations. eos_id: End-of-sequence symbol in beam search. + batch_size: an integer scalar - must be passed if there is no input Returns: A dict of decoding results { @@ -379,8 +413,12 @@ def fast_decode(encoder_output, "scores": decoding log probs from the beam search, None if using greedy decoding (beam_size=1) } + + Raises: + NotImplementedError: If beam size > 1 with partial targets. """ - batch_size = common_layers.shape_list(encoder_output)[0] + if encoder_output is not None: + batch_size = common_layers.shape_list(encoder_output)[0] key_channels = hparams.attention_key_channels or hparams.hidden_size value_channels = hparams.attention_value_channels or hparams.hidden_size @@ -394,8 +432,9 @@ def fast_decode(encoder_output, for layer in range(num_layers) } - cache["encoder_output"] = encoder_output - cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias + if encoder_output is not None: + cache["encoder_output"] = encoder_output + cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias if beam_size > 1: # Beam Search initial_ids = tf.zeros([batch_size], dtype=tf.int32) @@ -417,6 +456,7 @@ def fast_decode(encoder_output, else: # Greedy def inner_loop(i, finished, next_id, decoded_ids, cache): + """One step of greedy decoding.""" logits, cache = symbols_to_logits_fn(next_id, i, cache) temperature = (0.0 if hparams.sampling_method == "argmax" else hparams.sampling_temp) @@ -430,7 +470,7 @@ def is_not_finished(i, finished, *_): return (i < decode_length) & tf.logical_not(tf.reduce_all(finished)) decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64) - finished = tf.constant(False, shape=[batch_size]) + finished = tf.fill([batch_size], False) next_id = tf.zeros([batch_size, 1], dtype=tf.int64) _, _, _, decoded_ids, _ = tf.while_loop( is_not_finished, diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index f67476006..0c9b6f794 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -37,13 +37,15 @@ class TransformerTest(tf.test.TestCase): - def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN): + def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN, has_input=True): hparams.hidden_size = 8 hparams.filter_size = 32 hparams.num_heads = 1 
hparams.layer_prepostprocess_dropout = 0.0 p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) + if not has_input: + p_hparams.input_modality = {} hparams.problems = [p_hparams] inputs = -1 + np.random.random_integers( @@ -108,6 +110,41 @@ def testGreedyVsFast(self): self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length)) self.assertAllClose(greedy_res, fast_res) + def testSlowVsFastNoInput(self): + model, features = self.getModel( + transformer.transformer_small(), has_input=False) + + decode_length = 2 + + out_logits, _ = model(features) + out_logits = tf.squeeze(out_logits, axis=[2, 3]) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]), + labels=tf.reshape(features["targets"], [-1])) + loss = tf.reduce_mean(loss) + apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss) + + with self.test_session(): + tf.global_variables_initializer().run() + for _ in range(100): + apply_grad.run() + + model.set_mode(tf.estimator.ModeKeys.PREDICT) + + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + slow_result = model._slow_greedy_infer( + features, decode_length)["outputs"] + slow_result = tf.squeeze(slow_result, axis=[2, 3]) + + fast_result = model._greedy_infer(features, decode_length)["outputs"] + + with self.test_session(): + slow_res = slow_result.eval() + fast_res = fast_result.eval() + + self.assertEqual(fast_res.shape, (BATCH_SIZE, decode_length)) + self.assertAllClose(slow_res, fast_res) + def testBeamVsFast(self): model, features = self.getModel(transformer.transformer_small()) From 17ddd3e467ad6c356a1e035a38bb2069f0e8f9d8 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 2 Feb 2018 10:30:13 -0800 Subject: [PATCH 03/31] make config argument optional in problem.input_fn PiperOrigin-RevId: 184301328 --- tensor2tensor/data_generators/problem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 890271dbe..8fed895ca 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -670,7 +670,7 @@ def input_fn(self, partition_id, num_partitions = self._dataset_partition(mode, config) is_training = mode == tf.estimator.ModeKeys.TRAIN - if config.use_tpu: + if config and config.use_tpu: num_threads = 64 else: num_threads = 4 if is_training else 1 From 4bc354c1015a19a6e6e706e239d7d7350a650cdb Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Sat, 3 Feb 2018 15:24:51 -0800 Subject: [PATCH 04/31] Internal. 
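
Summary based on the diff below: adds an image generation task to
transformer_vae next to the existing translation task (decode_transformer
gains a "task" switch that uses common_image_attention decoder layers for
images), plus a "channel_embeddings_bottom" image modality that embeds each
RGB channel separately, and an imagetransformer_ae_cifar hparams set that
wires the two together:

    hparams.task = "image"
    hparams.target_modality = "image:channel_embeddings_bottom"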
PiperOrigin-RevId: 184412973 --- tensor2tensor/layers/modalities.py | 40 ++++++ tensor2tensor/models/transformer_vae.py | 174 ++++++++++++++++++++---- 2 files changed, 190 insertions(+), 24 deletions(-) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 26063388b..ea289103b 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -298,6 +298,46 @@ def top(self, body_output, _): return x +@registry.register_image_modality("channel_embeddings_bottom") +class ImageChannelEmbeddingsBottom(modality.Modality): + """Modality for images using channel compression for generation.""" + + def get_channel_embeddings(self, io_depth, targets, hidden_size, + name="channel"): + """Get separate embedding for each of the channels.""" + targets_split = tf.split(targets, io_depth, axis=3) + rgb_embedding_var = tf.get_variable("rgb_target_emb_%s" % name, + [256 * io_depth, hidden_size]) + rgb_embedding_var = tf.identity(rgb_embedding_var) + rgb_embedding_var *= float(hidden_size)**0.5 + channel_target_embs = [] + for i in xrange(io_depth): + # Adding the channel offsets to get the right embedding since the + # embedding tensor has shape 256 * io_depth, hidden_size + target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 + target_embs = common_layers.gather(rgb_embedding_var, target_ids) + channel_target_embs.append(target_embs) + + return tf.concat(channel_target_embs, axis=-1) + + def targets_bottom(self, inputs): + io_depth = self._model_hparams.num_channels + hidden_size = self._model_hparams.hidden_size + return self.get_channel_embeddings(io_depth, inputs, hidden_size, + "input_bottom") + + def top(self, body_output, _): + with tf.variable_scope(self.name): + img_len = self._model_hparams.img_len + channels = self._model_hparams.num_channels + x = tf.layers.dense(body_output, 256, + use_bias=True, activation=None, + name="output_conv") + x = tf.reshape(x, + [-1, img_len, img_len, channels, self.top_dimensionality]) + return x + + @registry.register_audio_modality("default") class AudioModality(modality.Modality): """Performs strided conv compressions for audio data.""" diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index b1efb75ca..bd0ce7d0a 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -24,7 +24,7 @@ # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin - +from tensor2tensor.google.models import common_image_attention as cia from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer @@ -32,6 +32,7 @@ from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model + import tensorflow as tf from tensorflow.python.training import moving_averages @@ -459,26 +460,57 @@ def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets, hparams, - name): + name, + task=None): """Original Transformer decoder.""" with tf.variable_scope(name): - targets = common_layers.flatten4d3d(targets) - - decoder_input, decoder_self_bias = transformer.transformer_prepare_decoder( - targets, hparams) - - decoder_input = tf.nn.dropout(decoder_input, - 1.0 - hparams.layer_prepostprocess_dropout) - - decoder_output = transformer.transformer_decoder( - decoder_input, - encoder_output, - decoder_self_bias, - encoder_decoder_attention_bias, - hparams) - + if task is None: + task = hparams.task + if task == 
"translate": + targets = common_layers.flatten4d3d(targets) + + decoder_input, decoder_self_bias = ( + transformer.transformer_prepare_decoder(targets, hparams)) + + decoder_input = tf.nn.dropout(decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + + decoder_output = transformer.transformer_decoder( + decoder_input, + encoder_output, + decoder_self_bias, + encoder_decoder_attention_bias, + hparams) + decoder_output = tf.expand_dims(decoder_output, axis=2) + else: + assert task == "image" + inputs = None + # have to reshape targets as b, 32, 32, 3 * hidden size] beacuse otherwise + # prepare_image will choke + targets = tf.reshape(targets, [tf.shape(targets)[0], hparams.img_len, + hparams.img_len, + hparams.num_channels*hparams.hidden_size]) + + # Prepare decoder inputs and bias. + decoder_input, _, _, bias = cia.prepare_decoder(targets, hparams) + # Add class label to decoder input. + if not hparams.drop_inputs: + decoder_input += tf.reshape( + inputs, + [common_layers.shape_list(targets)[0], 1, 1, hparams.hidden_size]) + decoder_output = cia.transformer_decoder_layers( + decoder_input, + None, + bias, + hparams.num_decoder_layers or hparams.num_hidden_layers, + hparams, + attention_type=hparams.dec_attention_type, + name="decoder") + decoder_output_shape = common_layers.shape_list(decoder_output) + decoder_output = tf.reshape(decoder_output, [decoder_output_shape[0], -1, 1, + hparams.hidden_size]) # Expand since t2t expects 4d tensors. - return tf.expand_dims(decoder_output, axis=2) + return decoder_output def multinomial_sample(x, vocab_size, temperature): @@ -566,7 +598,10 @@ def ae_transformer_internal(inputs, _DO_SUMMARIES = False # Prepare. - batch_size = common_layers.shape_list(inputs)[0] + if inputs is not None: + batch_size = common_layers.shape_list(inputs)[0] + else: + batch_size = common_layers.shape_list(targets)[0] targets = tf.reshape(targets, [batch_size, -1, 1, hparams.hidden_size]) # Encoder. @@ -579,7 +614,15 @@ def ae_transformer_internal(inputs, # Autoencoding. losses = {"extra": tf.constant(0.0), "latent_pred": tf.constant(0.0)} if hparams.do_ae: - max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) + # flatten here + original_targets_shape = tf.shape(targets) + if hparams.task == "image": + cia.maybe_reshape_4d_to_3d(targets, hparams) + if hparams.task == "translate": + max_targets_len_from_inputs = tf.concat([inputs, inputs], axis=1) + else: + assert hparams.task == "image" + max_targets_len_from_inputs = targets targets, _ = common_layers.pad_to_same_length( targets, max_targets_len_from_inputs, final_length_divisible_by=2**hparams.num_compress_steps) @@ -599,8 +642,10 @@ def ae_transformer_internal(inputs, # Extra loss predicting latent code from input. Discrete only. 
if hparams.bottleneck_kind not in ["dense", "vae"]: latents_pred = decode_transformer( - tf.stop_gradient(inputs), tf.stop_gradient(ed), - tf.stop_gradient(latents_dense), hparams, "extra") + tf.stop_gradient(inputs) if inputs is not None else None, + tf.stop_gradient(ed) if inputs is not None else None, + tf.stop_gradient(latents_dense), hparams, "extra", + task="translate") _, latent_pred_loss = ae_latent_softmax( latents_pred, latents_discrete, hparams) losses["latent_pred"] = tf.reduce_mean( @@ -659,12 +704,18 @@ def bn_inputs(): if hparams.do_attend_decompress: d = attend(d, inputs, hparams, "decompress_attend_%d" % j) d = decompress_step(d, hparams, i > 0, False, "decompress_%d" % j) + # targets is always [batch, length, 1, depth] targets = mask * targets + (1.0 - mask) * d - targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) + # reshape back to 4d here + if hparams.task == "image": + targets = tf.reshape(targets, original_targets_shape) + else: + targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) res = decode_transformer(inputs, ed, targets, hparams, "decoder") if hparams.do_ae: - res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] + if not hparams.do_mask: + res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] if hparams.do_mask and hparams.do_refine: def refine_res(): # return residual_conv(res, 1, (5, 1), hparams, "refine") @@ -872,6 +923,9 @@ def transformer_ae_small(): hparams.add_hparam("random_top_k", 1) hparams.kl_warmup_steps = 150000 hparams.force_full_predict = True + + # task params + hparams.add_hparam("task", "translate") # translate or image tasks supported return hparams @@ -894,6 +948,78 @@ def transformer_ae_cifar(): return hparams +@registry.register_hparams +def imagetransformer_ae_cifar(): + """Hyperparameters for CIFAR-10 experiments.""" + hparams = transformer_ae_small() + hparams.filter_size = 512 + hparams.num_compress_steps = 3 + hparams.v_size = 1024 * 64 + hparams.startup_steps = 10000 + hparams.kmeans_lr_factor = 0.0 + hparams.is_2d = 0 + hparams.learning_rate_warmup_steps = 8000 + hparams.learning_rate = 0.2 + hparams.hidden_size = 512 + hparams.batch_size = 1 + hparams.max_length = 256 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.initializer_gain = 0.2 + hparams.num_hidden_layers = 6 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.label_smoothing = 0.0 + hparams.norm_type = "layer" + hparams.layer_prepostprocess_dropout = 0.0 + hparams.num_heads = 8 + hparams.task = "image" + hparams.ffn_layer = "conv_hidden_relu" + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. + hparams.attention_dropout = 0.0 + hparams.relu_dropout = 0. 
+  hparams.pos = "timing"  # timing, none
+  hparams.nbr_decoder_problems = 1
+  hparams.num_output_layers = 3
+  hparams.add_hparam("block_size", 1)
+
+  # dilated attention based flags
+  hparams.add_hparam("gap_sizes", [2, 4, 8, 16, 32, 64, 2, 4, 8, 16, 32, 64])
+  hparams.add_hparam("dilated_attention", False)
+
+  # image size related flags
+  # assuming that the image has the same height and width
+  hparams.add_hparam("img_len", 32)
+  hparams.add_hparam("num_channels", 3)
+  # Local attention params
+  hparams.add_hparam("local_and_global_att", False)
+  hparams.add_hparam("block_length", 256)
+  hparams.add_hparam("block_width", 128)
+  hparams.num_encoder_layers = 4
+  hparams.num_decoder_layers = 12
+  hparams.sep_rgb_embed = False
+  hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D)
+  hparams.add_hparam("block_rastor_scan", False)
+
+  # multipos attention params
+  hparams.add_hparam("q_filter_width", 1)
+  hparams.add_hparam("kv_filter_width", 1)
+
+  hparams.add_hparam("unconditional", False)  # unconditional generation
+
+  hparams.target_modality = "image:channel_embeddings_bottom"
+  hparams.drop_inputs = True
+  hparams.do_attend_compress = False
+  hparams.do_attend_decompress = False
+  return hparams
+
+
 @registry.register_hparams
 def transformer_ae_base():
   """Set of hyperparameters."""

From 522d222f1e1aff79210bd68c8250c751702906b2 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Sun, 4 Feb 2018 07:55:04 -0800
Subject: [PATCH 05/31] Omit newline characters from librispeech targets. Add
 option to preprocess waveforms in the SpeechModality.bottom

PiperOrigin-RevId: 184448336
---
 tensor2tensor/data_generators/librispeech.py |   3 +-
 .../data_generators/speech_recognition.py    | 128 ++++++++++++++----
 2 files changed, 100 insertions(+), 31 deletions(-)

diff --git a/tensor2tensor/data_generators/librispeech.py b/tensor2tensor/data_generators/librispeech.py
index ad8e931d8..0c59824c1 100644
--- a/tensor2tensor/data_generators/librispeech.py
+++ b/tensor2tensor/data_generators/librispeech.py
@@ -66,8 +66,7 @@ def _collect_data(directory, input_ext, transcription_ext):
       transcript_path = os.path.join(root, transcript)
       with open(transcript_path, "r") as transcript_file:
         for transcript_line in transcript_file:
-          line_contents = transcript_line.split(" ", 1)
-          assert len(line_contents) == 2
+          line_contents = transcript_line.strip().split(" ", 1)
           media_base, label = line_contents
           key = os.path.join(root, media_base)
           assert key not in data_files
diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py
index 01a3db564..4c037aeb3 100644
--- a/tensor2tensor/data_generators/speech_recognition.py
+++ b/tensor2tensor/data_generators/speech_recognition.py
@@ -32,7 +32,9 @@
 
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.layers import common_attention
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import expert_utils
 from tensor2tensor.utils import metrics
 from tensor2tensor.utils import modality
 from tensor2tensor.utils import registry
@@ -76,7 +78,7 @@ def compute_mel_filterbank_features(
     frame_length=25, frame_step=10, fft_length=None,
     window_fn=functools.partial(tf.contrib.signal.hann_window, periodic=True),
     lower_edge_hertz=80.0, upper_edge_hertz=7600.0, num_mel_bins=80,
-    log_noise_floor=1e-3):
+    log_noise_floor=1e-3, apply_mask=True):
   """Implement mel-filterbank extraction using tf ops.
Args: @@ -93,6 +95,7 @@ def compute_mel_filterbank_features( upper_edge_hertz: highest frequency of the filterbank num_mel_bins: filterbank size log_noise_floor: clip small values to prevent numeric overflow in log + apply_mask: When working on a batch of samples, set padding frames to zero Returns: filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1] """ @@ -100,14 +103,24 @@ def compute_mel_filterbank_features( # Transform of each signal in `signals`. Its shape is # [batch_size, ?, fft_unique_bins] # where fft_unique_bins = fft_length // 2 + 1 + + # Find the wave length: the largest index for which the value is !=0 + # note that waveforms samples that are exactly 0.0 are quite common, so + # simply doing sum(waveforms != 0, axis=-1) will not work correctly. + wav_lens = tf.reduce_max( + tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) * + tf.to_int32(tf.not_equal(waveforms, 0.0)), + axis=-1) + 1 if dither > 0: waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither) if preemphasis > 0: waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1] + wav_lens -= 1 frame_length = int(frame_length * sample_rate / 1e3) frame_step = int(frame_step * sample_rate / 1e3) if fft_length is None: fft_length = int(2**(np.ceil(np.log2(frame_length)))) + stfts = tf.contrib.signal.stft( waveforms, frame_length=frame_length, @@ -116,6 +129,11 @@ def compute_mel_filterbank_features( window_fn=window_fn, pad_end=True) + stft_lens = (wav_lens + (frame_step - 1)) // frame_step + masks = tf.to_float(tf.less_equal( + tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0), + tf.expand_dims(stft_lens, 1))) + # An energy spectrogram is the magnitude of the complex-valued STFT. # A float32 Tensor of shape [batch_size, ?, 257]. magnitude_spectrograms = tf.abs(stfts) @@ -134,7 +152,10 @@ def compute_mel_filterbank_features( log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms)) - return tf.expand_dims(log_mel_sgram, -1) + if apply_mask: + log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1) + + return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams") # @@ -207,12 +228,21 @@ def vocab_size(self): return 256 +class ByteTextEncoderWithEos(text_encoder.ByteTextEncoder): + """Encodes each byte to an id and appends the EOS token.""" + + def encode(self, s): + return super(ByteTextEncoderWithEos, self).encode(s) + [text_encoder.EOS_ID] + + class SpeechRecognitionProblem(problem.Problem): """Base class for speech recognition problems.""" def hparams(self, defaults, model_hparams): p = model_hparams # Filterbank extraction + # Filterbank extraction in bottom instead of preprocess_example is faster. + p.add_hparam("audio_preproc_in_bottom", True) # The trainer seems to reserve memory for all members of the input dict p.add_hparam("audio_keep_example_waveforms", False) p.add_hparam("audio_sample_rate", 16000) @@ -248,7 +278,7 @@ def feature_encoders(self, _): # decoding.py doesn't try to convert the floats # into text... 
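[Editorial sketch.] The wav_lens computation in compute_mel_filterbank_features above recovers each padded waveform's length as one plus the largest index holding a nonzero sample, which stays correct even when the signal contains interior zero samples. A numpy re-derivation of the same trick:

    import numpy as np

    waveforms = np.array([[0.1, 0.0, 0.3, 0.0, 0.0]])   # zero-padded batch of one
    idx = np.arange(waveforms.shape[1]) * (waveforms != 0.0)
    wav_lens = idx.max(axis=-1) + 1
    assert wav_lens[0] == 3  # index 2 holds the last nonzero sample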
"waveforms": AudioEncoder(), - "targets": text_encoder.ByteTextEncoder(), + "targets": ByteTextEncoderWithEos(), } def example_reading_spec(self): @@ -263,25 +293,30 @@ def example_reading_spec(self): def preprocess_example(self, example, mode, hparams): p = hparams - waveforms = tf.expand_dims(example["waveforms"], 0) - mel_fbanks = compute_mel_filterbank_features( - waveforms, - sample_rate=p.audio_sample_rate, - dither=p.audio_dither, - preemphasis=p.audio_preemphasis, - frame_length=p.audio_frame_length, - frame_step=p.audio_frame_step, - lower_edge_hertz=p.audio_lower_edge_hertz, - upper_edge_hertz=p.audio_upper_edge_hertz, - num_mel_bins=p.audio_num_mel_bins) - if p.audio_add_delta_deltas: - mel_fbanks = add_delta_deltas(mel_fbanks) - fbank_size = common_layers.shape_list(mel_fbanks) - assert fbank_size[0] == 1 - # Later models like to flatten the two spatial dims. Instead, we add a - # unit spatial dim and flatten the frequencies and channels. - example["inputs"] = tf.reshape( - mel_fbanks, [fbank_size[1], 1, fbank_size[2] * fbank_size[3]]) + if p.audio_preproc_in_bottom: + example["inputs"] = tf.expand_dims( + tf.expand_dims(example["waveforms"], -1), -1) + else: + waveforms = tf.expand_dims(example["waveforms"], 0) + mel_fbanks = compute_mel_filterbank_features( + waveforms, + sample_rate=p.audio_sample_rate, + dither=p.audio_dither, + preemphasis=p.audio_preemphasis, + frame_length=p.audio_frame_length, + frame_step=p.audio_frame_step, + lower_edge_hertz=p.audio_lower_edge_hertz, + upper_edge_hertz=p.audio_upper_edge_hertz, + num_mel_bins=p.audio_num_mel_bins, + apply_mask=False) + if p.audio_add_delta_deltas: + mel_fbanks = add_delta_deltas(mel_fbanks) + fbank_size = common_layers.shape_list(mel_fbanks) + assert fbank_size[0] == 1 + # Later models like to flatten the two spatial dims. Instead, we add a + # unit spatial dim and flatten the frequencies and channels. 
+ example["inputs"] = tf.reshape( + mel_fbanks, [fbank_size[1], 1, fbank_size[2] * fbank_size[3]]) if not p.audio_keep_example_waveforms: del example["waveforms"] return super(SpeechRecognitionProblem, self @@ -308,35 +343,70 @@ def bottom(self, inputs): p = self._model_hparams training = p.mode == tf.estimator.ModeKeys.TRAIN + num_mel_bins = p.audio_num_mel_bins + num_channels = 3 if p.audio_add_delta_deltas else 1 + with tf.variable_scope(self.name): - x = inputs - num_mel_bins = p.audio_num_mel_bins - num_channels = 3 if p.audio_add_delta_deltas else 1 + if p.audio_preproc_in_bottom: + # Compute filterbanks + with tf.variable_scope("fbanks"): + waveforms = tf.squeeze(inputs, [2, 3]) + mel_fbanks = compute_mel_filterbank_features( + waveforms, + sample_rate=p.audio_sample_rate, + dither=p.audio_dither, + preemphasis=p.audio_preemphasis, + frame_length=p.audio_frame_length, + frame_step=p.audio_frame_step, + lower_edge_hertz=p.audio_lower_edge_hertz, + upper_edge_hertz=p.audio_upper_edge_hertz, + num_mel_bins=p.audio_num_mel_bins, + apply_mask=True) + if p.audio_add_delta_deltas: + mel_fbanks = add_delta_deltas(mel_fbanks) + x = tf.reshape(mel_fbanks, + common_layers.shape_list(mel_fbanks)[:2] + + [1, num_mel_bins * num_channels]) + else: + x = inputs + # The convention is that the models are flattened along the spatial, # dimensions, thus the speech preprocessor treats frequencies and # channels as image colors (last axis) x.set_shape([None, None, 1, num_mel_bins * num_channels]) + xshape = common_layers.shape_list(x) + x = tf.reshape(x, [-1, 1, num_mel_bins * num_channels]) + + padding_mask = common_attention.embedding_to_padding(x) + pad_remover = expert_utils.PadRemover(padding_mask) + + x = pad_remover.remove(x) + # This replaces CMVN estimation on data x = tf.layers.batch_normalization( - x, axis=3, center=False, scale=False, training=training) + x, axis=2, center=False, scale=False, training=training) + + x = pad_remover.restore(x) - xshape = common_layers.shape_list(x) # restore batch_size x time x frequency x channel layout x = tf.reshape(x, [xshape[0], xshape[1], num_mel_bins, num_channels]) # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding? for _ in range(2): + x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d( x, 128, (3, 3), (2, 2), use_bias=False) - x = tf.layers.batch_normalization(x, axis=3, training=training) + x = common_layers.layer_norm(x) x = tf.nn.relu(x) xshape = common_layers.shape_list(x) # apply a conv that will remove all frequencies and at the same time # project the output into desired hidden_size + x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False) + assert common_layers.shape_list(x)[2] == 1 - x = tf.layers.batch_normalization(x, axis=3, training=training) + x = common_layers.layer_norm(x) x = tf.nn.relu(x) return x From 3843dd2236ca1bfa119ca1e1a75e525f2408da55 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Sun, 4 Feb 2018 21:09:44 -0800 Subject: [PATCH 06/31] Revert to appending latents in transformer ae as it works better. 
PiperOrigin-RevId: 184480516 --- tensor2tensor/models/transformer_vae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index bd0ce7d0a..a65f0a3df 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -709,12 +709,12 @@ def bn_inputs(): # reshape back to 4d here if hparams.task == "image": targets = tf.reshape(targets, original_targets_shape) - else: + if hparams.task == "translate": targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) res = decode_transformer(inputs, ed, targets, hparams, "decoder") if hparams.do_ae: - if not hparams.do_mask: + if hparams.task == "translate": res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] if hparams.do_mask and hparams.do_refine: def refine_res(): From 203161028325bfe98c3ca7a0f8dcbec228c19b53 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 5 Feb 2018 12:44:50 -0800 Subject: [PATCH 07/31] Reset TPU input partitioning when switching modes PiperOrigin-RevId: 184570928 --- tensor2tensor/data_generators/problem.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 8fed895ca..929f547ee 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -633,6 +633,8 @@ def _dataset_partition(self, mode, config): num_partitions: an integer """ if mode != tf.estimator.ModeKeys.TRAIN or not hasattr(config, "tpu_config"): + # Reset in the case when using TPU but alternating TRAIN and EVAL. + self._next_partition_id = 0 return 0, 1 if config.tpu_config.per_host_input_for_training: num_partitions = max(config.tpu_config.num_shards // 8, 1) From 8057abd2953d27092127b6d0ba73104144e4126b Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 5 Feb 2018 13:09:03 -0800 Subject: [PATCH 08/31] Use z_size instead of v_size for VQ-VAE PiperOrigin-RevId: 184574565 --- tensor2tensor/models/transformer_vae.py | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index a65f0a3df..ac5658f08 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -288,22 +288,23 @@ def embed(x): hot = tf.one_hot(x, hparams.v_size) h1 = tf.layers.dense(hot, hparams.hidden_size, name="dae_dense") elif hparams.bottleneck_kind == "vq-vae": - means_embed = means shape_x = common_layers.shape_list(x) x_flat = tf.reshape(x, [-1, 1]) c = int_to_bit(x_flat, nbits=int(math.log(hparams.v_size, 2)), base=2) shape = common_layers.shape_list(c) new_shape = shape new_shape[-1] = hparams.num_blocks - new_shape.append(int(math.log(hparams.v_size, 2) // hparams.num_blocks)) + new_shape.append(int(math.log(hparams.v_size, 2) / hparams.num_blocks)) c = tf.to_int32(tf.reshape(c, shape=new_shape)) c = bit_to_int( c, - nbits=int(math.log(hparams.v_size, 2) // hparams.num_blocks), + nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), base=2) - h1 = tf.gather(tf.transpose(means_embed, [1, 0, 2]), c) - h1 = tf.stack( - [h1[:, :, i, i, :] for i in range(hparams.num_blocks)], axis=-2) + c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) + c_hot_flat = tf.reshape( + c_hot, shape=[-1, hparams.num_blocks, hparams.block_v_size]) + h1 = tf.matmul(tf.transpose(c_hot_flat, perm=[1, 0, 2]), means) + h1 = tf.transpose(h1, perm=[1, 0, 2]) 
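[Editorial sketch.] The embed() changes above decompose a z_size-bit latent code into num_blocks sub-codes of z_size/num_blocks bits, each indexing its own table of block_v_size = 2**(z_size/num_blocks) entries. In plain Python (bit order here is illustrative, not necessarily the one int_to_bit uses):

    z_size, num_blocks = 14, 2
    bits_per_block = z_size // num_blocks          # 7
    block_v_size = 2 ** bits_per_block             # 128 entries per block table
    code = 9731                                    # any value < 2**14
    bits = [(code >> i) & 1 for i in range(z_size)]            # LSB first
    blocks = [bits[i * bits_per_block:(i + 1) * bits_per_block]
              for i in range(num_blocks)]
    sub_codes = [sum(b << j for j, b in enumerate(blk)) for blk in blocks]
    assert sub_codes == [9731 % 128, 9731 // 128]              # [3, 76]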
new_shape = shape_x new_shape.append(hparams.hidden_size) h1 = tf.reshape(h1, new_shape) @@ -355,10 +356,11 @@ def embed(x): # Get the discrete latent represenation x_means_idx = tf.argmax(x_means_hot, axis=-1) + # Get the binary representation x_means_bits = int_to_bit( x_means_idx, - nbits=int(math.log(hparams.v_size, 2) // hparams.num_blocks), + nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), base=2) shape = common_layers.shape_list(x_means_bits) new_shape = shape[:-1] @@ -528,7 +530,7 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): vocab_size = hparams.v_size if hparams.bottleneck_kind == "semhash": vocab_size = 2**hparams.z_size - if hparams.num_blocks < 2: + if hparams.num_decode_blocks < 2: latents_logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits") loss = None @@ -542,15 +544,17 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): # Multi-block case. vocab_bits = int(math.log(vocab_size, 2)) assert vocab_size == 2**vocab_bits - assert vocab_bits % hparams.num_blocks == 0 - block_vocab_size = 2**(vocab_bits // hparams.num_blocks) - latents_logits = [tf.layers.dense(latents_pred, block_vocab_size, - name="extra_logits_%d" % i) - for i in xrange(hparams.num_blocks)] + assert vocab_bits % hparams.num_decode_blocks == 0 + block_vocab_size = 2**(vocab_bits // hparams.num_decode_blocks) + latents_logits = [ + tf.layers.dense( + latents_pred, block_vocab_size, name="extra_logits_%d" % i) + for i in xrange(hparams.num_decode_blocks) + ] loss = None if latents_discrete is not None: losses = [] - for i in xrange(hparams.num_blocks): + for i in xrange(hparams.num_decode_blocks): d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i), block_vocab_size) losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits( @@ -629,7 +633,7 @@ def ae_transformer_internal(inputs, targets_c = compress(targets, inputs, False, hparams, "compress") if hparams.mode != tf.estimator.ModeKeys.PREDICT: # Compress and bottleneck. - latents_dense, latents_discrete, extra_loss, _ = bottleneck( + latents_dense, latents_discrete, extra_loss, embed = bottleneck( targets_c, hparams, 2 * 2048, "vc", means, ema_count, ema_means) if _DO_SUMMARIES: tf.summary.histogram("b0", tf.reshape(latents_discrete[:, 0, :], [-1])) @@ -752,7 +756,7 @@ def __init__(self, *args, **kwargs): self._hparams.block_dim = int( self._hparams.hidden_size // self._hparams.num_blocks) self._hparams.block_v_size = 2**( - math.log(self._hparams.v_size, 2) / self._hparams.num_blocks) + self._hparams.z_size / self._hparams.num_blocks) self._hparams.block_v_size = int(self._hparams.block_v_size) if self._hparams.reshape_method == "project": @@ -881,16 +885,17 @@ def transformer_ae_small(): hparams.hidden_size = 384 hparams.filter_size = 2048 hparams.label_smoothing = 0.0 - hparams.optimizer = "Adafactor" # Can be unstable, maybe try Adam. + hparams.optimizer = "Adam" # Can be unstable, maybe try Adam. hparams.optimizer_adam_epsilon = 1e-9 hparams.optimizer_adam_beta1 = 0.9 - hparams.optimizer_adam_beta2 = 0.998 # Needs tuning, try 0.98 to 0.999. + hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999. hparams.add_hparam("z_size", 14) hparams.add_hparam("noise_dev", 0.0) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. 
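[Editorial sketch.] The multi-block loss above recovers block i's target with tf.floormod(tf.floordiv(...)), which is just base-block_vocab_size digit extraction:

    block_vocab_size = 128
    latents_discrete = 9731
    digits = [(latents_discrete // block_vocab_size ** i) % block_vocab_size
              for i in range(2)]
    assert digits == [3, 76]  # the per-block softmax targets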
hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) + hparams.add_hparam("num_decode_blocks", 1) # Reshape method for hierarchical vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) From 9e7a4d1c7c73b87d661c5102e57ae905eed28363 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 5 Feb 2018 15:59:44 -0800 Subject: [PATCH 09/31] Fix bug with instability in higher z_sizes caused by math.log PiperOrigin-RevId: 184600621 --- tensor2tensor/models/transformer_vae.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ac5658f08..a94e91473 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -290,15 +290,15 @@ def embed(x): elif hparams.bottleneck_kind == "vq-vae": shape_x = common_layers.shape_list(x) x_flat = tf.reshape(x, [-1, 1]) - c = int_to_bit(x_flat, nbits=int(math.log(hparams.v_size, 2)), base=2) + c = int_to_bit(x_flat, nbits=hparams.z_size, base=2) shape = common_layers.shape_list(c) new_shape = shape new_shape[-1] = hparams.num_blocks - new_shape.append(int(math.log(hparams.v_size, 2) / hparams.num_blocks)) + new_shape.append(int(hparams.z_size / hparams.num_blocks)) c = tf.to_int32(tf.reshape(c, shape=new_shape)) c = bit_to_int( c, - nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), + nbits=int(hparams.z_size / hparams.num_blocks), base=2) c_hot = tf.one_hot(c, depth=hparams.block_v_size, axis=-1) c_hot_flat = tf.reshape( @@ -360,15 +360,15 @@ def embed(x): # Get the binary representation x_means_bits = int_to_bit( x_means_idx, - nbits=int(math.log(hparams.v_size, 2) / hparams.num_blocks), + nbits=int(hparams.z_size / hparams.num_blocks), base=2) shape = common_layers.shape_list(x_means_bits) new_shape = shape[:-1] - new_shape[-1] = int(math.log(hparams.v_size, 2)) + new_shape[-1] = hparams.z_size x_means_bits = tf.reshape(x_means_bits, shape=new_shape) c = bit_to_int( tf.to_int32(x_means_bits), - nbits=int(math.log(hparams.v_size, 2)), + nbits=hparams.z_size, base=2) # Update the ema variables From 798c406cad5937e38b01d853fb5116c30f779bd4 Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Mon, 5 Feb 2018 17:22:47 -0800 Subject: [PATCH 10/31] Adding common_image_attention, a set of functions that support generating image. Jointly programmed by nikp and avaswani PiperOrigin-RevId: 184612638 --- .../layers/common_image_attention.py | 544 ++++++++++++++++++ tensor2tensor/models/transformer_vae.py | 2 +- 2 files changed, 545 insertions(+), 1 deletion(-) create mode 100644 tensor2tensor/layers/common_image_attention.py diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py new file mode 100644 index 000000000..88fc8ed93 --- /dev/null +++ b/tensor2tensor/layers/common_image_attention.py @@ -0,0 +1,544 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utils for attention mechanism for images.""" +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils + +import tensorflow as tf + + +class AttentionType(object): + LOCAL_1D = "local_1d" + LOCAL_2D = "local_2d" + GLOBAL = "global" + GLOCAL = "global_local" + MOE_LOCAL_1D = "moe_local1d" + + @staticmethod + def get_choices(): + return [ + AttentionType.GLOBAL, + AttentionType.GLOCAL, + AttentionType.MOE_LOCAL_1D, + AttentionType.LOCAL_1D, + AttentionType.LOCAL_2D, + ] + + +def maybe_reshape_4d_to_3d(x, hparams): + """Reshape input from 4D to 3D if necessary.""" + x_shape = common_layers.shape_list(x) + is_4d = False + if len(x_shape) == 4: + x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], x_shape[3]]) + is_4d = True + x.set_shape([None, None, hparams.hidden_size]) + return x, x_shape, is_4d + + +def local_attention_2d(x, hparams, attention_type="local_attention_2d"): + """Local 2d, self attention layer.""" + # self-attention + with tf.variable_scope("local_2d_self_att"): + y = common_attention.multihead_attention_2d( + x, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + attention_type=attention_type, + query_shape=hparams.query_shape, + memory_flange=hparams.memory_flange, + name="self_attention") + return y + + +def local_attention_1d(x, + self_attention_bias, + hparams, + attention_type="local_unmasked", + q_padding="VALID", + kv_padding="VALID"): + """Local 1d self attention.""" + # self-attention + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + with tf.variable_scope("local_1d_self_att"): + y = common_attention.multihead_attention( + x, + None, + self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + attention_type=attention_type, + block_width=hparams.block_width, + block_length=hparams.block_length, + q_padding=q_padding, + kv_padding=kv_padding, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + name="self_attention") + if is_4d: + y = tf.reshape(y, x_shape) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def local_global_attention(x, + self_attention_bias, + hparams, + q_padding="LEFT", + kv_padding="LEFT"): + """Local and global 1d self attention.""" + with tf.variable_scope("self_local_global_att"): + [x_global, x_local] = tf.split(x, 2, axis=-1) + split_hidden_size = int(hparams.hidden_size / 2) + split_heads = int(hparams.num_heads / 2) + y_global = common_attention.multihead_attention( + x_global, + None, + self_attention_bias, + hparams.attention_key_channels or split_hidden_size, + hparams.attention_value_channels or split_hidden_size, + split_hidden_size, + split_heads, + hparams.attention_dropout, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="global_self_att") + y_local = common_attention.multihead_attention( + x_local, + None, + self_attention_bias, + hparams.attention_key_channels or split_hidden_size, + hparams.attention_value_channels or split_hidden_size, + split_hidden_size, + split_heads, + hparams.attention_dropout, + 
attention_type="local_masked", + block_length=hparams.block_length, + block_width=hparams.block_width, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="local_self_att") + y = tf.concat([y_global, y_local], axis=-1) + return y + + +def full_self_attention(x, + self_attention_bias, + hparams, + q_padding="LEFT", + kv_padding="LEFT"): + """Full self-attention layer.""" + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + with tf.variable_scope("self_att"): + y = common_attention.multihead_attention( + x, + None, + self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + q_padding=q_padding, + kv_padding=kv_padding, + name="self_att") + if is_4d: + y = tf.reshape(y, [x_shape[0], x_shape[1], x_shape[2], x_shape[3]]) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def encdec_attention_1d(x, + encoder_output, + hparams): + """Local 1d self attention.""" + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + encoder_output, _, _ = maybe_reshape_4d_to_3d(encoder_output, hparams) + with tf.variable_scope("encdec_attention"): + # Encoder Decoder attention + y = common_attention.multihead_attention( + x, + encoder_output, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="encdec_attention") + if is_4d: + y = tf.reshape(y, x_shape) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def transformer_decoder_layers(inputs, + encoder_output, + bias, + num_layers, + hparams, + attention_type=AttentionType.LOCAL_2D, + name="transformer"): + """Multi layer transformer.""" + x = inputs + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + for layer in xrange(num_layers): + with tf.variable_scope("%s_layer_%d" % (name, layer)): + # self-attention + skip connections + if attention_type == AttentionType.LOCAL_2D: + y = local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="masked_local_attention_2d") + elif attention_type == AttentionType.LOCAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + bias, hparams, + attention_type="local_mask_right", + q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.GLOCAL: + y = local_global_attention(common_layers.layer_preprocess(x, hparams), + bias, hparams, + q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.GLOBAL: + y = full_self_attention(common_layers.layer_preprocess(x, hparams), + bias, hparams, + q_padding="LEFT", kv_padding="LEFT") + # TODO(nikip): Add support for dilated attention. 
+ x = common_layers.layer_postprocess(x, y, hparams) + # enc-dec attention + skip connections + if encoder_output is not None: + y = encdec_attention_1d(common_layers.layer_preprocess(x, hparams), + encoder_output, hparams) + x = common_layers.layer_postprocess(x, y, hparams) + # feed-fwd layers + skip connections + y = ffn_layer(common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) + + +def transformer_encoder_layers(inputs, + num_layers, + hparams, + attention_type=AttentionType.GLOBAL, + self_attention_bias=None, + q_padding="VALID", + kv_padding="VALID", + name="transformer"): + """Multi layer transformer encoder.""" + x = inputs + x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + + for layer in xrange(num_layers): + # attention layers + skip connections + with tf.variable_scope("%s_layer_%d" % (name, layer)): + if attention_type == AttentionType.LOCAL_2D: + y = local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="local_attention_2d") + elif attention_type == AttentionType.LOCAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + attention_type="local_unmasked", + q_padding=q_padding, kv_padding=kv_padding) + elif attention_type == AttentionType.GLOBAL: + y = full_self_attention(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + q_padding=q_padding, kv_padding=kv_padding) + x = common_layers.layer_postprocess(x, y, hparams) + # feed-fwd layer + skip connections + y = ffn_layer(common_layers.layer_preprocess(x, hparams), hparams) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) + + +def ffn_layer(x, hparams): + """ffn layer transformer.""" + with tf.variable_scope("ffn"): + if hparams.ffn_layer == "none": + return x + if hparams.ffn_layer == "conv_hidden_relu": + y = common_layers.dense_relu_dense( + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) + elif hparams.ffn_layer == "normed_conv_hidden_relu": + y = common_layers.normed_conv_hidden_relu( + x, + hparams.norm_type, + hparams.layer_norm_epsilon, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout, + norm_name="convnorm") + elif hparams.ffn_layer == "self_attention_ffn": + x_shape = tf.shape(x) + x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size]) + y = common_attention.ffn_self_attention_layer( + x, hparams.filter_size, hparams.hidden_size, hparams.num_parts, + hparams.attention_dropout, hparams.share_kv) + y = tf.reshape(y, x_shape) + else: + assert hparams.ffn_layer == "glu_ffn" + y = common_layers.gated_linear_unit_layer(x) + return y + + +def transformer_layers_sharded(dp, + ps_devices, + inputs, + num_layers, + hparams, + self_attention_bias=None, + enc_output=None, + attention_type=AttentionType.GLOBAL, + name="transformer"): + """Multi layer transformer, sharded by the data parallelism dp.""" + x = inputs + extra_loss = tf.constant(0.0) + moe_hidden_sizes = [int(s) for s in hparams.moe_hidden_sizes.split(",")] + expert_fn = expert_utils.ffn_expert_fn( + hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) + x = dp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) + for layer in xrange(num_layers): + with tf.variable_scope("%s_layer_%d" % (name, layer)): + # self-attention + if attention_type == AttentionType.LOCAL_2D: + y = 
dp(local_attention_2d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="masked_local_attention_2d")) + elif attention_type == AttentionType.LOCAL_1D: + y = dp(local_attention_1d(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + attention_type="local_mask_right", + q_padding="LEFT", kv_padding="LEFT")) + elif attention_type == AttentionType.GLOCAL: + y = dp(local_global_attention( + common_layers.layer_preprocess(x, hparams), self_attention_bias, + hparams, q_padding="LEFT", kv_padding="LEFT")) + elif attention_type == AttentionType.GLOBAL: + y = dp(full_self_attention(common_layers.layer_preprocess(x, hparams), + self_attention_bias, hparams, + q_padding="LEFT", kv_padding="LEFT")) + x = common_layers.layer_postprocess(x, y, hparams) + if enc_output is not None: + y = dp(encdec_attention_1d(common_layers.layer_preprocess(x, hparams), + enc_output, hparams)) + x = dp(common_layers.layer_postprocess, x, y, hparams) + with tf.variable_scope("ffn"): + if str(layer) in hparams.moe_layers_decoder.split(","): + y, loss = expert_utils.distributed_moe( + dp, + ps_devices, + common_layers.layer_preprocess(x, hparams), + hparams.mode == tf.estimator.ModeKeys.TRAIN, + input_size=hparams.hidden_size, + expert_fn=expert_fn, + num_experts=hparams.moe_num_experts, + k=hparams.moe_k, + loss_coef=hparams.moe_loss_coef) + extra_loss += loss + x = dp(common_layers.layer_postprocess, x, y, hparams) + else: + y = dp(ffn_layer, common_layers.layer_preprocess(x, hparams), hparams) + x = dp(common_layers.layer_postprocess, x, y, hparams) + return dp(common_layers.layer_preprocess, x, hparams), extra_loss + + +def postprocess_image(x, rows, cols, hparams): + """Postprocessing after decoding.""" + batch = common_layers.shape_list(x)[0] + channels = 256 + x = tf.reshape(x, [batch, rows, cols, hparams.hidden_size]) + # targets = common_layers.conv(x, 256, (1, 1), name="output_conv") + targets = tf.layers.dense(x, 256, use_bias=True, activation=None, + name="output_conv") + if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + y = targets + y = tf.reshape(y, [batch, -1, hparams.img_len*3, channels]) + yshape = common_layers.shape_list(y) + block_length = hparams.query_shape[0] + block_width = hparams.query_shape[1] + + # Break into block row wise. + y = tf.reshape(y, + [batch, yshape[1] // block_length, + block_length, + yshape[2], channels]) + yshape = common_layers.shape_list(y) + # Break into blocks width wise. + y_blocks = tf.reshape(y, + [batch, yshape[1], yshape[2], + yshape[3] // block_width, + block_width, channels]) + + # Reshape targets as [batch_size, num_blocks_rows, num_block_cols, + # block_length, block_width, channels] + targets = tf.transpose(y_blocks, [0, 1, 3, 2, 4, 5]) + + return targets + + +def prepare_encoder(inputs, hparams, attention_type="local_1d"): + """Prepare encoder for images.""" + x = prepare_image(inputs, hparams, name="enc_channels") + # Add position signals. 
+ x = add_pos_signals(x, hparams, "enc_pos") + x_shape = common_layers.shape_list(x) + if attention_type == "local_1d": + x = tf.reshape(x, [x_shape[0], x_shape[1]*x_shape[2], hparams.hidden_size]) + x.set_shape([None, None, hparams.hidden_size]) + elif attention_type == "local_2d": + x.set_shape([None, None, None, hparams.hidden_size]) + return x + + +def prepare_decoder(targets, hparams): + """Prepare decoder for images.""" + targets_shape = common_layers.shape_list(targets) + channels = hparams.num_channels + curr_infer_length = None + + # during training, images are [batch, IMG_LEN, IMG_LEN, 3]. + # At inference, they are [batch, curr_infer_length, 1, 1] + if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + curr_infer_length = targets_shape[1] + if hparams.block_rastor_scan: + assert hparams.img_len*channels % hparams.query_shape[1] == 0 + assert hparams.img_len % hparams.query_shape[0] == 0 + total_block_width = hparams.img_len*channels + # Decoding is in block rastor scan order. We divide the image into + # hparams.query_shape blocks and then decode each block in rastor scan. + # To make that compatible with our inference pipeline, pad the target so + # that rows is a multiple of query_shape and columns is a multiple of + # hparams.img_len*channels + curr_infer_length = targets_shape[1] + block_padding_factor = total_block_width * hparams.query_shape[0] + targets = tf.pad(targets, [ + [0, 0], [0, -curr_infer_length % block_padding_factor], + [0, 0], [0, 0]]) + + num_blocks = total_block_width // hparams.query_shape[1] + # Reshape the image to represent blocks + target_blocks = tf.reshape( + targets, [targets_shape[0], -1, num_blocks, hparams.query_shape[0], + hparams.query_shape[1]]) + # Transpose to read the image in 2D fashion. + targets = tf.transpose(target_blocks, [0, 1, 3, 2, 4]) + else: + # add padding to make sure the size of targets is a multiple of img_height + # times number of channels. This is needed for positional encodings and + # for doing the RGB lookup. + padding_factor = channels * hparams.img_len + targets = tf.pad(targets, [ + [0, 0], [0, -curr_infer_length % padding_factor], [0, 0], [0, 0]]) + targets = tf.reshape(targets, + [targets_shape[0], -1, hparams.img_len, channels]) + # Preprocess image + x = prepare_image(targets, hparams, name="dec_channels") + x_shape = common_layers.shape_list(x) + # mask out upper triangle to avoid looking into the future. + bias = common_attention.attention_bias_lower_triangle(x_shape[1]*x_shape[2]) + if hparams.dec_attention_type == AttentionType.LOCAL_2D: + x = common_attention.right_shift_blockwise(x, hparams.query_shape) + x = add_pos_signals(x, hparams, "dec_pos") + else: + # Add position signals + x = tf.reshape(x, [-1, x_shape[1]*x_shape[2], hparams.hidden_size]) + x = common_layers.shift_right_3d(x) + x = tf.reshape(x, [-1, x_shape[1], x_shape[2], hparams.hidden_size]) + x = add_pos_signals(x, hparams, "dec_pos") + x.set_shape([None, None, None, hparams.hidden_size]) + return x, x_shape[1], x_shape[2], bias + + +def prepare_image(inputs, hparams, name=None): + """Prepare image.""" + inputs_shape = common_layers.shape_list(inputs) + batch = inputs_shape[0] + orig_rows = inputs_shape[1] + orig_cols = inputs_shape[2] + channels = hparams.num_channels + + hidden_size = hparams.hidden_size + # Only do lookup if the embeddings haven't been looked up already. + # if the last dimension is number of channels, then this is very likely the + # channel ids tensor. We have to make sure. 
+ if inputs_shape[-1] == hparams.num_channels: + inputs = tf.to_int32(inputs) + x = get_channel_embeddings(channels, inputs, hidden_size, name=name) + else: + x = inputs + x = tf.reshape(x, [batch, orig_rows, orig_cols * channels, hidden_size]) + + return x + + +def create_output(decoder_output, rows, cols, targets, hparams): + """Create output from decoder output and vars.""" + decoded_image = postprocess_image(decoder_output, rows, cols, hparams) + targets_shape = common_layers.shape_list(targets) + if hparams.mode == tf.estimator.ModeKeys.PREDICT: + # Hardcoding that the number of intensity values is 256. + y = tf.reshape(decoded_image, [targets_shape[0], -1, 1, 1, 256]) + output = y[:, :targets_shape[1], :, :, :] + else: + output = tf.reshape(decoded_image, [ + targets_shape[0], targets_shape[1], targets_shape[2], + targets_shape[3], 256 + ]) + return output + + +def get_channel_embeddings(io_depth, targets, hidden_size, name="channel"): + """Get separate embedding for each of the channels.""" + targets_split = tf.split(targets, io_depth, axis=3) + rgb_embedding_var = tf.get_variable("rgb_target_emb_%s" % name, + [256 * io_depth, hidden_size]) + rgb_embedding_var = tf.identity(rgb_embedding_var) + rgb_embedding_var *= float(hidden_size)**0.5 + channel_target_embs = [] + for i in xrange(io_depth): + # Adding the channel offsets to get the right embedding since the + # embedding tensor has shape 256 * io_depth, hidden_size + target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 + target_embs = common_layers.gather(rgb_embedding_var, target_ids) + channel_target_embs.append(target_embs) + + return tf.concat(channel_target_embs, axis=-1) + + +def add_pos_signals(x, hparams, name="pos_emb"): + with tf.variable_scope(name, reuse=False): + if hparams.pos == "timing": + x = common_attention.add_timing_signal_nd(x) + else: + assert hparams.pos == "emb" + x = common_attention.add_positional_embedding_nd( + x, hparams.max_length, name=name) + return x diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index a94e91473..ed0fa51bf 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -24,8 +24,8 @@ # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.google.models import common_image_attention as cia from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.utils import expert_utils From 887c4108abfccdc45bde793b7eb9b876329489bb Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 6 Feb 2018 12:35:56 -0800 Subject: [PATCH 11/31] Rename hierarchical vq-vae to decomposed vq-vae as in paper. 
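[Editorial sketch.] In get_channel_embeddings above, one [256 * io_depth, hidden] table serves all channels, and channel i is offset into rows [i*256, (i+1)*256). With a hypothetical pixel:

    import numpy as np

    io_depth, hidden = 3, 8
    table = np.random.randn(256 * io_depth, hidden)
    pixel = np.array([12, 200, 31])                  # invented (r, g, b)
    rows = pixel + np.arange(io_depth) * 256         # [12, 456, 543]
    emb = np.concatenate([table[r] for r in rows])   # shape [io_depth * hidden]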
PiperOrigin-RevId: 184722477 --- tensor2tensor/models/transformer_vae.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ed0fa51bf..08d2bdf2c 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -760,7 +760,7 @@ def __init__(self, *args, **kwargs): self._hparams.block_v_size = int(self._hparams.block_v_size) if self._hparams.reshape_method == "project": - tf.logging.info("Using random projections for hierarchical vq-vae") + tf.logging.info("Using projections for decomposed vq-vae") tf.logging.info("Trainable projections = {}".format( self._hparams.trainable_projections)) self._hparams.projection_tensors = tf.get_variable( @@ -773,7 +773,7 @@ def __init__(self, *args, **kwargs): trainable=self._hparams.trainable_projections) self._hparams.reshape_fn = project_hidden elif self._hparams.reshape_method == "slice": - tf.logging.info("Using slices for hierarchical vq-vae") + tf.logging.info("Using slices for decomposed vq-vae") self._hparams.reshape_fn = slice_hidden else: raise ValueError("Unknown reshape method") @@ -896,7 +896,7 @@ def transformer_ae_small(): hparams.add_hparam("bottleneck_kind", "semhash") hparams.add_hparam("num_blocks", 1) hparams.add_hparam("num_decode_blocks", 1) - # Reshape method for hierarchical vq-vae: slice, project + # Reshape method for decomposed vq-vae: slice, project hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) hparams.add_hparam("unmasked_percentage", 0.1) From 21c9fed2c2a54d3ce286842dd8879a02743d57a4 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 6 Feb 2018 14:09:08 -0800 Subject: [PATCH 12/31] Add unmasked version of memory reduced attention PiperOrigin-RevId: 184737000 --- tensor2tensor/layers/common_attention.py | 51 +++++++++++++++--------- tensor2tensor/models/transformer_moe.py | 4 +- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index e82e6d471..229521cc5 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -192,9 +192,9 @@ def memeff_attention_fn(*args, **kwargs): attention_type="local_mask_right", ) - # === Memory-compressed multihead self attention layer === + # === Masked memory-compressed multihead self attention layer === # Only works for self attention. Always mask the future. - compressed_attention_fn = register_layer( + compressed_attention_masked_fn = register_layer( multihead_self_attention_reduced, default_kwargs=dict( factor=hparams.attention_red_factor, @@ -209,6 +209,13 @@ def memeff_attention_fn(*args, **kwargs): ), ) + # === Unmasked memory-compressed multihead self attention layer === + # Only works for self attention. Never mask the future. 
Bias never added + compressed_attention_fn = partial( + compressed_attention_masked_fn, + add_mask=False, + ) + # Feed-forwards layers: # === Mixture of expert layer === @@ -259,14 +266,17 @@ def memeff_attention_fn(*args, **kwargs): # Define all available layers layers = dict( + # Attention layers: a=multihead_attention_fn, # Multihead full attention loc=local_attention_fn, # Local attention - locm=local_attention_masked_fn, # Local masked attention + locm=local_attention_masked_fn, # Local attention (masked) red=compressed_attention_fn, # Memory-compressed attention + redm=compressed_attention_masked_fn, # Memory-compressed att (masked) mem=memeff_attention_fn, # Memory efficient - fc=conv_hidden_relu, - sep=sep_conv_relu, # Fully connected - sepm=sep_conv_relu_masked, # masked separable convolution + # Feed-forward layers: + fc=conv_hidden_relu, # Fully connected + sep=sep_conv_relu, # Separable convolution (unmasked) + sepm=sep_conv_relu_masked, # Separable convolution (masked) moe=distributed_moe, # Mixture of expert layer ) return layers @@ -3415,6 +3425,7 @@ def multihead_self_attention_reduced( multihead_params=None, nonlinearity="none", reduction_type="conv", + add_mask=True, ): """Reduce the length dimension by compressing with conv. @@ -3426,6 +3437,7 @@ def multihead_self_attention_reduced( multihead_params (dict): parameters for multihead attention nonlinearity (str): Add some non-linearity after the memory block reduction_type (str): type of compression + add_mask (bool): If True, add the bias to prevent attention to the future Returns: (tf.Tensor): float32 of shape [batch, length, depth] @@ -3475,18 +3487,21 @@ def construct_bias_vectors(t, axis): # [1, length_k] or [length_q, 1] return length_coordinates - bias = tf.to_float( - tf.greater( - # Because we add the first elem to the memory block and it can be - # attended by anyone,we don't need to add +1 anymore to prevent self - # attention Use * factor to make sure the last tokens of a block - # cannot attend the block - construct_bias_vectors(memory_x, 0) * factor, - # +epsilon to avoid float equality - construct_bias_vectors(x, 1) + 1e-3, - )) * -1e9 - bias = tf.expand_dims(bias, axis=0) - bias = tf.expand_dims(bias, axis=0) # [1, 1, length_k, length_q] + if add_mask: # Create mask to prevent attention to the future + bias = tf.to_float( + tf.greater( + # Because we add the first elem to the memory block and it can be + # attended by anyone,we don't need to add +1 anymore to prevent self + # attention Use * factor to make sure the last tokens of a block + # cannot attend the block + construct_bias_vectors(memory_x, 0) * factor, + # +epsilon to avoid float equality + construct_bias_vectors(x, 1) + 1e-3, + )) * -1e9 + bias = tf.expand_dims(bias, axis=0) + bias = tf.expand_dims(bias, axis=0) # [1, 1, length_k, length_q] + else: + bias = None return multihead_attention( query_antecedent=x, diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index efa67bf27..2bf807b19 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -329,7 +329,7 @@ def transformer_moe_8k_lm(): # * Memory efficient multihead attention (slow): # hparams.layer_types = "#mem/mem/mem-moe/mem/mem" # * Alternate between local/compressed attention layers (faster): - # hparams.layer_types = "#locm/red/locm-moe/red/locm" + # hparams.layer_types = "#locm/redm/locm-moe/redm/locm" return hparams @@ -386,6 +386,6 @@ def transformer_moe_prepend_8k(): 
hparams.eval_drop_long_sequences = False hparams.max_input_seq_length = 7500, hparams.default_ff = "sepm" - hparams.layer_types = "locm/red/locm-moe/red/locm" + hparams.layer_types = "locm/redm/locm-moe/redm/locm" hparams.moe_num_experts = 256 return hparams From f04475680c47781fd3c34b6c37cb2309715eff56 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 6 Feb 2018 15:07:36 -0800 Subject: [PATCH 13/31] T2TModel.has_input checks for "inputs" PiperOrigin-RevId: 184747531 --- tensor2tensor/utils/t2t_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 0623a975e..225c4d19b 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -124,7 +124,7 @@ def hparams(self): @property def has_input(self): if self._problem_hparams: - return self._problem_hparams.input_modality + return "inputs" in self._problem_hparams.input_modality else: return True From 361f5d195d4590f34489d26543e8b38c3d7385a8 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 6 Feb 2018 18:25:12 -0800 Subject: [PATCH 14/31] Use beam search for latent prediction in transformer ae (only for num_blocks=1 for now). PiperOrigin-RevId: 184774591 --- tensor2tensor/models/transformer_vae.py | 40 +++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 08d2bdf2c..835470d57 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -28,6 +28,7 @@ from tensor2tensor.layers import common_image_attention as cia from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer +from tensor2tensor.utils import beam_search from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -566,8 +567,42 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): return sample, loss +def ae_latent_sample_beam(latents_dense_in, inputs, ed, embed, hparams): + """Sample from the latent space in the autoencoder.""" + vocab_size = 2**hparams.z_size + beam_size = 1 # TODO(lukaszkaiser): larger beam sizes seem to work bad. + inputs = tf.tile(inputs, [beam_size, 1, 1]) + ed = tf.tile(ed, [beam_size, 1, 1, 1]) + + def symbols_to_logits_fn(ids): + """Go from ids to logits.""" + ids = tf.expand_dims(ids, axis=2) # Ids start with added all-zeros. + latents_discrete = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0]]) + + with tf.variable_scope(tf.get_variable_scope(), reuse=False): + latents_dense = embed(latents_discrete) + latents_pred = decode_transformer( + inputs, ed, latents_dense, hparams, "extra") + logits = tf.layers.dense(latents_pred, vocab_size, name="extra_logits") + current_output_position = common_layers.shape_list(ids)[1] - 1 + logits = logits[:, current_output_position, :, :] + return tf.squeeze(logits, axis=[1]) + + initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32) + length = tf.shape(latents_dense_in)[1] + ids, _ = beam_search.beam_search( + symbols_to_logits_fn, initial_ids, beam_size, length, + vocab_size, alpha=0.0, eos_id=-1, stop_early=False) + + res = tf.expand_dims(ids[:, 0, :], axis=2) # Pick first beam. + return res[:, 1:] # Remove the added all-zeros from ids. 
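[Editorial sketch.] A toy use of the same beam_search API that ae_latent_sample_beam above relies on; the logits function here is a dummy that always prefers id 0, and eos_id=-1 with stop_early=False forces decoding to run the full length:

    import tensorflow as tf
    from tensor2tensor.utils import beam_search

    def toy_symbols_to_logits_fn(ids):               # ids: [batch*beam, time]
      return tf.one_hot(tf.zeros_like(ids[:, -1]), 8) * 10.0

    initial_ids = tf.zeros([4], dtype=tf.int32)
    out_ids, _ = beam_search.beam_search(
        toy_symbols_to_logits_fn, initial_ids, 1, 5, 8,
        alpha=0.0, eos_id=-1, stop_early=False)
    # out_ids: [4, 1, 6]; position 0 is the seeded zero, hence res[:, 1:] above.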
+ + def ae_latent_sample(latents_dense, inputs, ed, embed, iters, hparams): """Sample from the latent space in the autoencoder.""" + if hparams.num_decode_blocks < 2: + # TODO(lukaszkaiser): beam-search only works in non-blocked mode for now. + return ae_latent_sample_beam(latents_dense, inputs, ed, embed, hparams) latents_pred = decode_transformer(inputs, ed, latents_dense, hparams, "extra") latents_discrete, _ = ae_latent_softmax(latents_pred, None, hparams) @@ -680,7 +715,8 @@ def bn_inputs(): ema_count, ema_means) latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :]) if cache is None: - cache = ae_latent_sample(latents_dense, inputs, ed, embed, 8, hparams) + cache = ae_latent_sample( + latents_dense, inputs, ed, embed, 16, hparams) latents_dense = embed(cache) # Postprocess. d = latents_dense @@ -890,7 +926,7 @@ def transformer_ae_small(): hparams.optimizer_adam_beta1 = 0.9 hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999. hparams.add_hparam("z_size", 14) - hparams.add_hparam("noise_dev", 0.0) + hparams.add_hparam("noise_dev", 0.5) hparams.add_hparam("d_mix", 0.5) # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae. hparams.add_hparam("bottleneck_kind", "semhash") From 416f78458118ea364da2abf3574d53d27d5c99c7 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 6 Feb 2018 20:15:50 -0800 Subject: [PATCH 15/31] Support launching T2T on --cloud_mlengine with CPU, GPU, and multi-GPU machines PiperOrigin-RevId: 184782692 --- tensor2tensor/bin/t2t_trainer.py | 23 ++- tensor2tensor/data_generators/problem.py | 22 ++- tensor2tensor/layers/common_attention.py | 2 +- tensor2tensor/layers/common_layers.py | 10 +- tensor2tensor/layers/modalities.py | 2 +- tensor2tensor/models/revnet.py | 2 +- tensor2tensor/models/transformer_vae.py | 12 +- tensor2tensor/utils/beam_search.py | 2 +- tensor2tensor/utils/cloud_mlengine.py | 137 ++++++++++++++++++ .../utils/{cloud.py => cloud_tpu.py} | 13 +- tensor2tensor/utils/diet.py | 4 +- 11 files changed, 203 insertions(+), 26 deletions(-) create mode 100644 tensor2tensor/utils/cloud_mlengine.py rename tensor2tensor/utils/{cloud.py => cloud_tpu.py} (97%) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 8f1f0dfdc..051c2e90d 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -26,7 +26,8 @@ from tensor2tensor import models # pylint: disable=unused-import from tensor2tensor import problems as problems_lib # pylint: disable=unused-import -from tensor2tensor.utils import cloud +from tensor2tensor.utils import cloud_mlengine +from tensor2tensor.utils import cloud_tpu from tensor2tensor.utils import decoding from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import from tensor2tensor.utils import registry @@ -81,6 +82,18 @@ flags.DEFINE_bool("cloud_delete_on_done", False, "Whether to delete the VM and TPU instance when done.") +# Google Cloud ML Engine +flags.DEFINE_bool("cloud_mlengine", False, + "Whether to launch on Cloud ML Engine.") +flags.DEFINE_string("cloud_mlengine_master_type", None, + "Machine type for master on Cloud ML Engine. " + "If provided, overrides default selections based on " + "--worker_gpu. User is responsible for ensuring " + "type is valid and that --worker_gpu matches number of " + "GPUs on machine type. 
See documentation: " + "https://cloud.google.com/ml-engine/reference/rest/v1/" + "projects.jobs#traininginput") + def get_problem_name(): problems = FLAGS.problems.split("-") @@ -244,7 +257,7 @@ def maybe_cloud_tpu(): "be gs:// paths, i.e. on Google Cloud Storage.") FLAGS.use_tpu = True - with cloud.cloud_tpu( + with cloud_tpu.cloud_tpu( FLAGS.cloud_vm_name, FLAGS.cloud_tpu_name, delete_on_done=FLAGS.cloud_delete_on_done) as tpu_master: @@ -258,6 +271,12 @@ def main(_): usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() + if FLAGS.cloud_mlengine: + assert not FLAGS.cloud_tpu + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") + return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + if FLAGS.generate_data: generate_data() diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 929f547ee..a2c330c2d 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -517,16 +517,26 @@ def _maybe_reverse_and_copy(example): if shuffle_files: random.shuffle(data_files) dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files)) - dataset = dataset.apply( - tf.contrib.data.parallel_interleave( - _load_records, sloppy=is_training, cycle_length=8)) + + if hasattr(tf.contrib.data, "parallel_interleave"): + dataset = dataset.apply( + tf.contrib.data.parallel_interleave( + _load_records, sloppy=is_training, cycle_length=8)) + else: + dataset = dataset.interleave(_load_records, cycle_length=8, + block_length=16) + if repeat: dataset = dataset.repeat() dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) if preprocess: - dataset = dataset.apply( - tf.contrib.data.parallel_interleave( - _preprocess, sloppy=is_training, cycle_length=8)) + if hasattr(tf.contrib.data, "parallel_interleave"): + dataset = dataset.apply( + tf.contrib.data.parallel_interleave( + _preprocess, sloppy=is_training, cycle_length=8)) + else: + dataset = dataset.interleave(_preprocess, cycle_length=8, + block_length=16) dataset = dataset.map( _maybe_reverse_and_copy, num_parallel_calls=num_threads) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 229521cc5..0f400c575 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -3402,7 +3402,7 @@ def pad_and_reshape(x): block_length, block_length, # Restore the block length dimension ]) - weights = tf.reduce_sum(weights, axis=3, keepdims=True) # Compress block + weights = tf.reduce_sum(weights, axis=3, keep_dims=True) # Compress block v_out = tf.matmul(weights, v) # [1, block_length] @ [block_length, depth] v_out = tf.squeeze(v_out, axis=3) return v_out diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index c8d54fb99..7b22dc44b 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -509,8 +509,8 @@ def layer_norm_vars(filters): def layer_norm_compute_python(x, epsilon, scale, bias): """Layer norm raw computation.""" - mean = tf.reduce_mean(x, axis=[-1], keepdims=True) - variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) + mean = tf.reduce_mean(x, axis=[-1], keep_dims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True) norm_x = (x - mean) * tf.rsqrt(variance + epsilon) return norm_x * scale + bias @@ -1171,7 +1171,7 @@ def mask_from_embedding(emb): Returns: a 0.0/1.0 Tensor with shape [batch, 
width, height, 1]. """ - return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keepdims=True)) + return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keep_dims=True)) def mask_leq(target_length, source_length): @@ -1703,7 +1703,7 @@ def smoothing_cross_entropy(logits, depth=vocab_size, on_value=confidence, off_value=low_confidence) - xentropy = tf.nn.softmax_cross_entropy_with_logits_v2( + xentropy = tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=soft_targets) return xentropy - normalizing @@ -1737,7 +1737,7 @@ def global_pool_1d(inputs, pooling_type="MAX", mask=None): if mask is not None: # Some elems are dummy elems so we can't just reduce the average. output = tf.reduce_sum(inputs, axis=1) - num_elems = tf.reduce_sum(mask, axis=1, keepdims=True) + num_elems = tf.reduce_sum(mask, axis=1, keep_dims=True) output = tf.div(output, tf.maximum(num_elems, 1)) else: output = tf.reduce_mean(inputs, axis=1) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index ea289103b..f5788701c 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -461,7 +461,7 @@ def top(self, body_output, _): """ with tf.variable_scope(self.name): x = body_output - x = tf.reduce_mean(x, axis=[1, 2], keepdims=True) + x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) res = tf.layers.dense(x, self._vocab_size) return tf.expand_dims(res, 3) diff --git a/tensor2tensor/models/revnet.py b/tensor2tensor/models/revnet.py index 28b4cf681..3a6c7b32b 100644 --- a/tensor2tensor/models/revnet.py +++ b/tensor2tensor/models/revnet.py @@ -277,7 +277,7 @@ def final_block(x1, x2, dim='2d', training=True, scope='final_block'): # Global average pooling net = tf.reduce_mean(y, CONFIG[dim]['reduction_dimensions'], - name='final_pool', keepdims=True) + name='final_pool', keep_dims=True) return net diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 835470d57..0ce43081b 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -92,9 +92,9 @@ def top_k_softmax(x, k): """Calculate softmax(x), select top-k and rescale to sum to 1.""" x = tf.nn.softmax(x) top_x, _ = tf.nn.top_k(x, k=k+1) - min_top = tf.reduce_min(top_x, axis=-1, keepdims=True) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) x = tf.nn.relu((x - min_top) + 1e-12) - x /= tf.reduce_sum(x, axis=-1, keepdims=True) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) return x, tf.reduce_max(top_x, axis=-1) @@ -142,7 +142,7 @@ def dae(x, hparams, name): maxvhot = tf.stop_gradient(tf.one_hot(maxvec, hparams.v_size)) # Add losses that prevent too few being used. 
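[Editorial note on the many keepdims -> keep_dims renames in this patch.] tf.reduce_* only grew the keepdims spelling in TF 1.5, so reverting to keep_dims presumably keeps the code runnable on the TF 1.4 runtime the new Cloud ML Engine launcher below pins (runtimeVersion '1.4'); the older spelling still works on later 1.x releases:

    x = tf.random_normal([2, 3, 4])
    mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)  # accepted on TF 1.4 and later 1.x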
distrib = tf.reshape(logsm, [-1, hparams.v_size]) * maxvhot - d_mean = tf.reduce_mean(distrib, axis=[0], keepdims=True) + d_mean = tf.reduce_mean(distrib, axis=[0], keep_dims=True) d_variance = tf.reduce_mean(tf.square(distrib - d_mean), axis=[0]) d_dev = - tf.reduce_mean(d_variance) ret = s @@ -202,8 +202,8 @@ def slice_hidden(x, hparams): def nearest(x, means, hparams): """Find the nearest means to elements in x.""" x_reshaped = hparams.reshape_fn(x, hparams) - x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keepdims=True) - means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keepdims=True) + x_norm_sq = tf.reduce_sum(tf.square(x_reshaped), axis=-1, keep_dims=True) + means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keep_dims=True) scalar_prod = tf.matmul( tf.transpose(x_reshaped, perm=[1, 0, 2]), tf.transpose(means, perm=[0, 2, 1])) @@ -393,7 +393,7 @@ def embed(x): tf.transpose(x_reshaped, perm=[1, 0, 2])) updated_ema_means = moving_averages.assign_moving_average( ema_means, dw, hparams.decay, zero_debias=False) - n = tf.reduce_sum(updated_ema_count, axis=-1, keepdims=True) + n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True) updated_ema_count = ((updated_ema_count + hparams.epsilon) / (n + hparams.v_size * hparams.epsilon) * n) updated_ema_means /= tf.expand_dims(updated_ema_count, axis=-1) diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 3841b5953..3c7b8c203 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -90,7 +90,7 @@ def get_state_shape_invariants(tensor): def log_prob_from_logits(logits): - return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True) def compute_batch_indices(batch_size, beam_size): diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py new file mode 100644 index 000000000..12d89f470 --- /dev/null +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Launch on GCP's ML Engine.""" + +import os +import sys +import tempfile + +from googleapiclient import discovery +from oauth2client.client import GoogleCredentials +from tensor2tensor.utils import cloud_tpu as cloud +import tensorflow as tf + +CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' + +# TODO(rsepassi): +# * Support t2t_usr_dir +# * Support --autotune +# * Add documentation clould_mlengine.md +# * Enable multi-machine sync/async training + + +def args_dict_as_args(args_dict): + del args_dict['cloud_mlengine'] + args = [] + for name, val in args_dict.items(): + if val is None: + continue + args.extend(['--%s' % name, str(val)]) + return args + + +def machine_config(num_gpus=1, use_tpu=False, master_type=None): + """Return dict specifying machine config for trainingInput.""" + scale_tier = 'BASIC_GPU' + if use_tpu: + scale_tier = 'BASIC_TPU' + elif num_gpus <= 0: + scale_tier = 'BASIC' + elif num_gpus > 1: + scale_tier = 'CUSTOM' + + config = {'scaleTier': scale_tier} + + if scale_tier == 'CUSTOM': + assert num_gpus > 1 + if num_gpus not in [4, 8]: + raise ValueError('Must use exactly 1, 4, or 8 GPUs.') + config['masterType'] = ('complex_model_m_gpu' + if num_gpus == 4 else 'complex_model_l_gpu') + + if master_type: + config['masterType'] = master_type + + return config + + +def configure_job(flags_dict): + """Construct jobSpec for ML Engine job.""" + train_dir = flags_dict['output_dir'] + assert train_dir.startswith('gs://') + job_name = os.path.basename(train_dir) + + # See documentation: + # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput + training_input = { + 'packageUris': [os.path.join(train_dir, 'tensor2tensor.tar.gz')], + 'pythonModule': 'tensor2tensor.bin.t2t_trainer', + 'args': args_dict_as_args(flags_dict), + 'region': cloud.default_region(), + 'runtimeVersion': '1.4', + 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', + } + training_input.update( + machine_config( + num_gpus=flags_dict['worker_gpu'], + use_tpu=flags_dict['use_tpu'], + master_type=flags_dict['cloud_mlengine_master_type'])) + + if training_input['scaleTier'] == 'CUSTOM': + assert 'masterType' in training_input + + job_spec = {'jobId': job_name, 'trainingInput': training_input} + return job_spec + + +def launch_job(job_spec): + """Launch job on ML Engine.""" + project_id = 'projects/{}'.format(cloud.default_project()) + credentials = GoogleCredentials.get_application_default() + cloudml = discovery.build('ml', 'v1', credentials=credentials) + request = cloudml.projects().jobs().create(body=job_spec, parent=project_id) + request.execute() + + +def tar_and_copy_t2t(train_dir, usr_dir): + """Tar Tensor2Tensor and cp to train_dir.""" + tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + location = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + tmp_dir = tempfile.gettempdir() + cloud.shell_run( + 'tar -zcf {tmp_dir}/tensor2tensor.tar.gz -C {location} .', + location=location, + tmp_dir=tmp_dir) + cloud.shell_run( + ('gsutil cp {tmp_dir}/tensor2tensor.tar.gz ' + '{train_dir}/tensor2tensor.tar.gz'), + tmp_dir=tmp_dir, + train_dir=train_dir.strip('/')) + if usr_dir: + raise ValueError('--t2t_usr_dir is not currently supported in conjunction ' + 'with auto-launching on Cloud ML Engine.') + + +def launch(flags_dict): + job_spec = configure_job(flags_dict) + job_name = job_spec['jobId'] + tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, + job_spec) + assert cloud.confirm() + 
tar_and_copy_t2t(flags_dict['output_dir'], flags_dict['t2t_usr_dir']) + launch_job(job_spec) + tf.logging.info('Launched %s. See console to track: %s.', job_name, + CONSOLE_URL) diff --git a/tensor2tensor/utils/cloud.py b/tensor2tensor/utils/cloud_tpu.py similarity index 97% rename from tensor2tensor/utils/cloud.py rename to tensor2tensor/utils/cloud_tpu.py index 937c6ee46..53dd36bd0 100644 --- a/tensor2tensor/utils/cloud.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Launch on GCP.""" +"""Launch on TPU on GCP.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -191,6 +191,9 @@ def create_tpu(cls): gcloud compute ssh {name} -- -N """ + DEFAULT_PROJECT = "gcloud config get-value project" + DEFAULT_REGION = "gcloud config get-value compute/region" + @contextlib.contextmanager def shell_background(cmd_, **kwargs): @@ -224,6 +227,14 @@ def format_cmd(cmd_, **kwargs): return cmd_.format(**kwargs).strip().split() +def default_region(): + return shell_output(Gcloud.DEFAULT_REGION).strip() + + +def default_project(): + return shell_output(Gcloud.DEFAULT_PROJECT).strip() + + def create_vm(vm_name): out = shell_output(Gcloud.create_vm(), name=vm_name) return out.split("\n")[1:-1][0].split()[4] diff --git a/tensor2tensor/utils/diet.py b/tensor2tensor/utils/diet.py index 19702338b..7ecfba693 100644 --- a/tensor2tensor/utils/diet.py +++ b/tensor2tensor/utils/diet.py @@ -193,10 +193,10 @@ def update_variable(self, var, grad_var): beta2_pow = tf.pow(params.beta2, global_step) if params.factored_second_moment_accumulator and len(var.shape) == 2: vr_update = tf.assign(slots["adam_vr"], slots["adam_vr"] * params.beta2 + - tf.reduce_mean(grad_squared, 1, keepdims=True) * + tf.reduce_mean(grad_squared, 1, keep_dims=True) * (1.0 - params.beta2)) vc_update = tf.assign(slots["adam_vc"], slots["adam_vc"] * params.beta2 + - tf.reduce_mean(grad_squared, 0, keepdims=True) * + tf.reduce_mean(grad_squared, 0, keep_dims=True) * (1.0 - params.beta2)) with tf.control_dependencies([vr_update, vc_update]): vr = tf.sqrt(slots["adam_vr"] / (1.0 - beta2_pow)) + params.epsilon From 0b2c8c2340065345cd2eba7578d4b2c173fdaa0e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 10:22:43 -0800 Subject: [PATCH 16/31] Add dep on google-api-python-client PiperOrigin-RevId: 184857261 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1d3f14a94..2ee6e74ee 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ 'flask', 'future', 'gevent', + 'google-api-python-client', 'gunicorn', 'numpy', 'requests', From 2cde9c2b74d016a9bffe5520ebe06d3cd671ad47 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 7 Feb 2018 11:02:02 -0800 Subject: [PATCH 17/31] Clean up obsolete ae cifar hparams, increase default batch size. 
PiperOrigin-RevId: 184863791 --- tensor2tensor/models/transformer_vae.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 0ce43081b..59f2d08ba 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -854,7 +854,7 @@ def body(self, features): return res, loss def prepare_features_for_infer(self, features): - if not self._hparams.do_ae: + if self._hparams.do_mask or not self._hparams.do_ae: return features beam_batch_size = self._decode_hparams.beam_size beam_batch_size *= self._decode_hparams.batch_size @@ -970,25 +970,6 @@ def transformer_ae_small(): return hparams -@registry.register_hparams -def transformer_ae_cifar(): - """Hyperparameters for CIFAR-10 experiments.""" - hparams = transformer_ae_small() - hparams.hidden_size = 256 - hparams.filter_size = 512 - hparams.batch_size = 1024 * 4 - hparams.num_compress_steps = 2 - hparams.v_size = 1024 * 64 - hparams.kl_warmup_steps = 150000 - hparams.startup_steps = 10000 - hparams.kmeans_lr_factor = 0.0 - hparams.is_2d = 1 - hparams.learning_rate_warmup_steps = 8000 - hparams.learning_rate = 0.2 - hparams.ffn_layer = "conv_hidden_relu_with_sepconv" - return hparams - - @registry.register_hparams def imagetransformer_ae_cifar(): """Hyperparameters for CIFAR-10 experiments.""" @@ -1065,7 +1046,7 @@ def imagetransformer_ae_cifar(): def transformer_ae_base(): """Set of hyperparameters.""" hparams = transformer_ae_small() - hparams.batch_size = 1024 + hparams.batch_size = 2048 hparams.hidden_size = 512 hparams.filter_size = 4096 hparams.num_hidden_layers = 6 From 963f44cfec617f56bac21f9ca8bf190c356a8797 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 11:44:48 -0800 Subject: [PATCH 18/31] Support --t2t_usr_dir on Cloud ML Engine PiperOrigin-RevId: 184871178 --- tensor2tensor/utils/cloud_mlengine.py | 95 ++++++++++++++++++++++----- tensor2tensor/utils/usr_dir.py | 13 +++- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 12d89f470..0bb5ddc40 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -16,25 +16,40 @@ """Launch on GCP's ML Engine.""" import os +import shutil import sys import tempfile from googleapiclient import discovery from oauth2client.client import GoogleCredentials from tensor2tensor.utils import cloud_tpu as cloud +from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): -# * Support t2t_usr_dir # * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training +SETUP_PY = """ +from setuptools import find_packages +from setuptools import setup +setup( + name='DummyUsrDirPackage', + version='0.1', + packages=find_packages(), +) +""" + def args_dict_as_args(args_dict): + """Convert dict to list of args suitable for passing on cmd line.""" + args_dict = dict(args_dict) del args_dict['cloud_mlengine'] + # Configured later + del args_dict['t2t_usr_dir'] args = [] for name, val in args_dict.items(): if val is None: @@ -77,7 +92,6 @@ def configure_job(flags_dict): # See documentation: # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { - 'packageUris': [os.path.join(train_dir, 'tensor2tensor.tar.gz')], 'pythonModule': 
'tensor2tensor.bin.t2t_trainer', 'args': args_dict_as_args(flags_dict), 'region': cloud.default_region(), @@ -106,32 +120,81 @@ def launch_job(job_spec): request.execute() -def tar_and_copy_t2t(train_dir, usr_dir): - """Tar Tensor2Tensor and cp to train_dir.""" - tf.logging.info('Tarring and pushing local Tensor2Tensor package.') - location = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - tmp_dir = tempfile.gettempdir() +def _tar_and_copy(src_dir, target_dir): + """Tar and gzip src_dir and copy to GCS target_dir.""" + src_dir = src_dir.rstrip('/') + target_dir = target_dir.rstrip('/') + tmp_dir = tempfile.gettempdir().rstrip('/') + src_base = os.path.basename(src_dir) cloud.shell_run( - 'tar -zcf {tmp_dir}/tensor2tensor.tar.gz -C {location} .', - location=location, + 'tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .', + src_dir=src_dir, + src_base=src_base, tmp_dir=tmp_dir) + final_destination = '%s/%s.tar.gz' % (target_dir, src_base) cloud.shell_run( - ('gsutil cp {tmp_dir}/tensor2tensor.tar.gz ' - '{train_dir}/tensor2tensor.tar.gz'), + ('gsutil cp {tmp_dir}/{src_base}.tar.gz ' + '{final_destination}'), tmp_dir=tmp_dir, - train_dir=train_dir.strip('/')) - if usr_dir: - raise ValueError('--t2t_usr_dir is not currently supported in conjunction ' - 'with auto-launching on Cloud ML Engine.') + src_base=src_base, + final_destination=final_destination) + return final_destination + + +def tar_and_copy_t2t(train_dir): + """Tar Tensor2Tensor and cp to train_dir.""" + tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + t2t_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + t2t_tar = _tar_and_copy(t2t_dir, train_dir) + return t2t_tar + + +def tar_and_copy_usr_dir(usr_dir, train_dir): + """Package, tar, and copy usr_dir to GCS train_dir.""" + tf.logging.info('Tarring and pushing t2t_usr_dir.') + usr_dir = os.path.abspath(os.path.expanduser(usr_dir)) + # Copy usr dir to a temp location + top_dir = os.path.join(tempfile.gettempdir(), 't2t_usr_container') + tmp_usr_dir = os.path.join(top_dir, usr_dir_lib.INTERNAL_USR_DIR_PACKAGE) + shutil.rmtree(top_dir, ignore_errors=True) + shutil.copytree(usr_dir, tmp_usr_dir) + # Insert setup.py if one does not exist + top_setup_fname = os.path.join(top_dir, 'setup.py') + usr_setup_fname = os.path.join(tmp_usr_dir, 'setup.py') + if tf.gfile.Exists(usr_setup_fname): + tf.gfile.Move(usr_setup_fname, top_setup_fname) + else: + with tf.gfile.Open(top_setup_fname, 'w') as f: + f.write(SETUP_PY) + usr_tar = _tar_and_copy(top_dir, train_dir) + return usr_tar + + +def configure_trainer_package(job_spec, t2t_tar): + assert t2t_tar.startswith('gs://') + job_spec['trainingInput']['packageUris'] = [t2t_tar] + + +def configure_usr_dir(job_spec, usr_tar): + assert usr_tar.startswith('gs://') + job_spec['trainingInput']['packageUris'].append(usr_tar) + usr_args = ['--t2t_usr_dir', usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] + job_spec['trainingInput']['args'].extend(usr_args) def launch(flags_dict): + """Launch t2t_trainer on Cloud ML Engine.""" job_spec = configure_job(flags_dict) job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - tar_and_copy_t2t(flags_dict['output_dir'], flags_dict['t2t_usr_dir']) + train_dir = flags_dict['output_dir'] + t2t_tar = tar_and_copy_t2t(train_dir) + configure_trainer_package(job_spec, t2t_tar) + if flags_dict['t2t_usr_dir']: + usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) + 
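+    # ML Engine pip-installs everything listed in packageUris on the
+    # workers, so the user code gets re-imported there under
+    # INTERNAL_USR_DIR_PACKAGE (see the usr_dir.py change below).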
configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. See console to track: %s.', job_name, CONSOLE_URL) diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py index d89745b98..5edd6f1a2 100644 --- a/tensor2tensor/utils/usr_dir.py +++ b/tensor2tensor/utils/usr_dir.py @@ -27,13 +27,20 @@ import tensorflow as tf +INTERNAL_USR_DIR_PACKAGE = "t2t_usr_dir_internal" + + def import_usr_dir(usr_dir): """Import module at usr_dir, if provided.""" if not usr_dir: return - dir_path = os.path.expanduser(usr_dir) - if dir_path[-1] == "/": - dir_path = dir_path[:-1] + if usr_dir == INTERNAL_USR_DIR_PACKAGE: + # The package has been installed with pip under this name for Cloud ML + # Engine so just import it. + importlib.import_module(INTERNAL_USR_DIR_PACKAGE) + return + + dir_path = os.path.abspath(os.path.expanduser(usr_dir).rstrip("/")) containing_dir, module_name = os.path.split(dir_path) tf.logging.info("Importing user module %s from path %s", module_name, containing_dir) From 62bc2e77efd3ccba424fa1f29d3035102f0fa9a8 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 12:23:29 -0800 Subject: [PATCH 19/31] Add a test for the t2t_trainer script PiperOrigin-RevId: 184877423 --- .travis.yml | 2 +- tensor2tensor/bin/t2t_trainer_test.py | 50 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tensor2tensor/bin/t2t_trainer_test.py diff --git a/.travis.yml b/.travis.yml index 2cdcd85bf..e703a2bc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ script: - python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)" # Run tests - - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py + - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py --ignore=tensor2tensor/bin/t2t_trainer_test.py - pytest tensor2tensor/utils/registry_test.py - pytest tensor2tensor/utils/trainer_lib_test.py diff --git a/tensor2tensor/bin/t2t_trainer_test.py b/tensor2tensor/bin/t2t_trainer_test.py new file mode 100644 index 000000000..b1f38cec5 --- /dev/null +++ b/tensor2tensor/bin/t2t_trainer_test.py @@ -0,0 +1,50 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for t2t_trainer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.bin import t2t_trainer +from tensor2tensor.utils import trainer_lib_test + +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +class TrainerTest(tf.test.TestCase): + + @classmethod + def setUpClass(cls): + trainer_lib_test.TrainerLibTest.setUpClass() + + def testTrain(self): + FLAGS.problems = "tiny_algo" + FLAGS.model = "transformer" + FLAGS.hparams_set = "transformer_tiny" + FLAGS.train_steps = 1 + FLAGS.eval_steps = 1 + FLAGS.output_dir = tf.test.get_temp_dir() + FLAGS.data_dir = tf.test.get_temp_dir() + t2t_trainer.main(None) + + +if __name__ == "__main__": + tf.test.main() From 0c3218dfc760930e0a549e6676a5eb2bdf7488a9 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 7 Feb 2018 15:27:28 -0800 Subject: [PATCH 20/31] The most basic version of the "supervised" attention experiment. PiperOrigin-RevId: 184906398 --- tensor2tensor/layers/common_attention.py | 28 ++++++++++++++++++++++++ tensor2tensor/models/transformer.py | 19 ++++++++++++---- tensor2tensor/models/transformer_test.py | 12 ++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 0f400c575..63bf8d6cd 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -327,6 +327,34 @@ def add_standard_attention_hparams(hparams): return hparams +def encoder_decoder_attention_loss(expected_attention, actual_attentions): + """Computes encdec attention loss between expected and actual attentions. + + Args: + expected_attention: Tensor storing the expected encoder-decoder attention + weights with shape [batch_size, target_length, input_length]. + actual_attentions: Dictionary with actual attention weights for different + attention types and hidden layers. + + Returns: + MSE loss between the actual and expected attention weights. + """ + # For each hidden layer, we have an attention weight tensor with shape + # [batch_size, num_heads, target_length, input_length]. + actual_encdec_attention_weights = [ + t for layer_key, t in actual_attentions.items() + if "encdec_attention" in layer_key + ] + # Stack all hidden layer attention weight tensors to get a tensor with shape + # [num_hidden_layers, batch_size, num_heads, target_length, input_length]. + actual_attention_weights = tf.stack(actual_encdec_attention_weights) + # Reduce mean across all layers (axis=0) and all heads (axis=2) to get a + # tensor with shape [batch_size, target_length, input_length]. 
+ actual_attention_weights = tf.reduce_mean(actual_attention_weights, [0, 2]) + return tf.losses.mean_squared_error(expected_attention, + actual_attention_weights) + + @expert_utils.add_name_scope() def get_timing_signal_1d(length, channels, diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 3aeb7a790..305a17379 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -160,10 +160,21 @@ def body(self, features): decoder_input, decoder_self_attention_bias = transformer_prepare_decoder( targets, hparams, features=features) - return self.decode(decoder_input, encoder_output, - encoder_decoder_attention_bias, - decoder_self_attention_bias, hparams, - nonpadding=features_to_nonpadding(features, "targets")) + decoder_output = self.decode( + decoder_input, + encoder_output, + encoder_decoder_attention_bias, + decoder_self_attention_bias, + hparams, + nonpadding=features_to_nonpadding(features, "targets")) + + expected_attention_weights = features.get("expected_attention_weights") + if expected_attention_weights is not None: + attention_loss = common_attention.encoder_decoder_attention_loss( + expected_attention_weights, self.attention_weights) + return decoder_output, {"attention_loss": attention_loss} + + return decoder_output def _greedy_infer(self, features, decode_length): """Fast version of greedy decoding. diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 0c9b6f794..1a6134b51 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -207,6 +207,18 @@ def testTransformerWithoutProblem(self): body_out.get_shape().as_list(), [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) + def testTransformerWithEncoderDecoderAttentionLoss(self): + model, features = self.getModel(transformer.transformer_small()) + expected_attention_weights = np.random.random_sample( + size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH)) + features["expected_attention_weights"] = tf.constant( + expected_attention_weights, dtype=tf.float32) + _, extra_loss = model(features) + with self.test_session() as session: + session.run(tf.global_variables_initializer()) + res = session.run(extra_loss["attention_loss"]) + self.assertEqual(res.shape, ()) + if __name__ == "__main__": tf.test.main() From b86ed582cf5c7834e8d8ab1c1b3efffce8776633 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:11:53 -0800 Subject: [PATCH 21/31] Support hyperparameter tuning on Cloud ML Engine PiperOrigin-RevId: 184921645 --- tensor2tensor/bin/t2t_trainer.py | 52 ++++++++++- tensor2tensor/layers/common_hparams.py | 104 +++++++++++---------- tensor2tensor/models/slicenet.py | 4 - tensor2tensor/models/transformer.py | 15 +-- tensor2tensor/models/transformer_sketch.py | 5 - tensor2tensor/utils/cloud_mlengine.py | 73 ++++++++++++--- tensor2tensor/utils/trainer_lib.py | 4 + 7 files changed, 164 insertions(+), 93 deletions(-) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 051c2e90d..a755e0d33 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -93,6 +93,20 @@ "GPUs on machine type. 
See documentation: " "https://cloud.google.com/ml-engine/reference/rest/v1/" "projects.jobs#traininginput") +# Hyperparameter tuning on Cloud ML Engine +# Pass an --hparams_range to enable +flags.DEFINE_string("autotune_objective", None, + "TensorBoard metric name to optimize.") +flags.DEFINE_bool("autotune_maximize", True, + "Whether to maximize (vs. minimize) autotune_objective.") +flags.DEFINE_integer("autotune_max_trials", 10, + "Maximum number of tuning experiments to run.") +flags.DEFINE_integer("autotune_parallel_trials", 1, + "How many trials to run in parallel (will spin up this " + "many jobs.") +flags.DEFINE_string("job-dir", None, + "DO NOT USE. Exists only for Cloud ML Engine to pass in " + "during hyperparameter tuning. Overrides --output_dir.") def get_problem_name(): @@ -101,6 +115,33 @@ def get_problem_name(): return problems[0] +def set_hparams_from_args(args): + """Set hparams overrides from unparsed args list.""" + if not args: + return + + hp_prefix = "--hp_" + tf.logging.info("Found unparsed command-line arguments. Checking if any " + "start with %s and interpreting those as hparams " + "settings.", hp_prefix) + + pairs = [] + i = 0 + while i < len(args): + arg = args[i] + if arg.startswith(hp_prefix): + pairs.append((arg.lstrip(hp_prefix), args[i+1])) + i += 2 + else: + tf.logging.warn("Found unknown flag: %s", arg) + i += 1 + + as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) + if FLAGS.hparams: + as_hparams = "," + as_hparams + FLAGS.hparams += as_hparams + + def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -265,21 +306,22 @@ def maybe_cloud_tpu(): yield -def main(_): +def main(argv): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - assert not FLAGS.cloud_tpu - assert FLAGS.output_dir.startswith("gs://") - assert FLAGS.data_dir.startswith("gs://") - return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + return cloud_mlengine.launch() if FLAGS.generate_data: generate_data() + if FLAGS.job_dir: + FLAGS.output_dir = FLAGS.job_dir + + set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 02a5df2f3..147757b47 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,7 +21,6 @@ # Dependency imports -import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -224,10 +223,15 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 + SCALES_STR = { + LINEAR_SCALE: "UNIT_LINEAR_SCALE", + LOG_SCALE: "UNIT_LOG_SCALE", + REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", + } + def __init__(self): self._categorical_params = {} self._discrete_params = {} - self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -237,10 +241,12 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [(self._categorical_params, - "categorical"), (self._discrete_params, "discrete"), - (self._float_params, "float"), (self._int_params, "int"), - (self._discrete_float_params, "discrete_float")] + ctr_names = [ + (self._categorical_params, "categorical"), + (self._discrete_params, 
"discrete"), + (self._float_params, "float"), + (self._int_params, "int"), + ] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -263,23 +269,17 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): - if name in self._discrete_float_params: - del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) - def set_discrete_float(self, name, val): - self._check_reset_and_type_change(name, self._discrete_float_params) - self._discrete_float_params[name] = (name, [val]) - def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - self._categorical_params, self._discrete_params, - self._discrete_float_params, self._float_params, self._int_params + self._categorical_params, self._discrete_params, self._float_params, + self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,52 +287,56 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) + def to_parameter_specs(self, name_prefix=""): + """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" + specs = [] + for name, categories, _ in self._categorical_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "CATEGORICAL", + "categoricalValues": categories, + } + specs.append(spec) -def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): - """Fill ranged_hparams with singleton values from hparams. + for name, feasible_points, scale, _ in self._discrete_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DISCRETE", + "discreteValues": feasible_points, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - HParams are placed in RangedHParams with the following functions, according to - type: - * int: set_discrete - * bool: set_discrete - * float: set_discrete_float - * str: set_categorical + for name, min_val, max_val, scale, _ in self._float_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DOUBLE", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Args: - hparams: tf.contrib.training.HParams; contains the hyperparameters to copy - over to ranged_hparams. - ranged_hparams: RangedHParams; will have hparams values copied to it. + for name, min_val, max_val, scale, _ in self._int_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "INTEGER", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Raises: - ValueError: if hparams contains a hyperparameter not of type - {int, float, str, bool}. - """ - for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access - - if is_multivalent: - raise ValueError("Multivalent hparams not supported in RangedHParams. " - "Hyperparameter %s is multivalent." 
% name) - val = getattr(hparams, name) - if hp_type == int: - ranged_hparams.set_discrete(name, [val]) - elif hp_type == bool: - ranged_hparams.set_discrete(name, [int(val)]) - elif hp_type == float: - ranged_hparams.set_discrete_float(name, val) - elif hp_type == str: - ranged_hparams.set_categorical(name, [val]) - else: - raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + return specs @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams - - hparams = basic_params1() - fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index e77412513..43cfa571e 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,10 +381,6 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams - - hparams = slicenet_params1() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 305a17379..df4b32277 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,11 +1228,9 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams("transformer_base") +@registry.register_ranged_hparams def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_base() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1337,8 +1335,6 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tiny_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1346,8 +1342,6 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1358,13 +1352,6 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) -@registry.register_ranged_hparams -def transformer_tpu_batch_range(rhp): - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [256, 512, 768, 1024]) - - @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py index 913243f00..e8fe796a8 100644 --- a/tensor2tensor/models/transformer_sketch.py +++ b/tensor2tensor/models/transformer_sketch.py @@ -22,7 +22,6 @@ # Dependency imports -from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.models import transformer_vae @@ -125,10 +124,6 @@ def transformer_sketch_6layer(): @registry.register_ranged_hparams("transformer_sketch_ranged") def transformer_sketch_ranged(rhp): """Range of hparams for vizier.""" - - hparams = transformer_sketch() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_categorical("ffn_layer", ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"]) rhp.set_discrete("batch_size", [1024, 2048, 4096]) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 0bb5ddc40..90c5c09e3 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -22,14 +22,17 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials +from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud +from tensor2tensor.utils import registry from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf +FLAGS = tf.flags.FLAGS + CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): -# * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training @@ -44,9 +47,9 @@ """ -def args_dict_as_args(args_dict): - """Convert dict to list of args suitable for passing on cmd line.""" - args_dict = dict(args_dict) +def flags_as_args(): + """Convert FLAGS to list of args suitable for passing on cmd line.""" + args_dict = dict(FLAGS.__dict__['__flags']) del args_dict['cloud_mlengine'] # Configured later del args_dict['t2t_usr_dir'] @@ -54,6 +57,8 @@ def args_dict_as_args(args_dict): for name, val in args_dict.items(): if val is None: continue + if name.startswith('autotune'): + continue args.extend(['--%s' % name, str(val)]) return args @@ -83,9 +88,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None): return config -def configure_job(flags_dict): +def configure_job(): """Construct jobSpec for ML Engine job.""" - train_dir = flags_dict['output_dir'] + train_dir = FLAGS.output_dir assert train_dir.startswith('gs://') job_name = os.path.basename(train_dir) @@ -93,16 +98,27 @@ def configure_job(flags_dict): # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': args_dict_as_args(flags_dict), + 'args': flags_as_args(), 'region': cloud.default_region(), 'runtimeVersion': '1.4', 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', + 'jobDir': train_dir, } training_input.update( machine_config( - num_gpus=flags_dict['worker_gpu'], - use_tpu=flags_dict['use_tpu'], - master_type=flags_dict['cloud_mlengine_master_type'])) + num_gpus=FLAGS.worker_gpu, + use_tpu=FLAGS.use_tpu, + master_type=FLAGS.cloud_mlengine_master_type)) + if FLAGS.hparams_range: + assert FLAGS.autotune_objective + tf.logging.info('Configuring hyperparameter tuning.') + training_input['hyperparameters'] = configure_autotune( + FLAGS.hparams_range, + FLAGS.autotune_objective, + FLAGS.autotune_maximize, + FLAGS.autotune_max_trials, + 
FLAGS.autotune_parallel_trials, + ) if training_input['scaleTier'] == 'CUSTOM': assert 'masterType' in training_input @@ -170,6 +186,26 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): return usr_tar +def autotune_paramspecs(hparams_range): + rhp = common_hparams.RangedHParams() + registry.ranged_hparams(hparams_range)(rhp) + return rhp.to_parameter_specs(name_prefix='hp_') + + +def configure_autotune(hparams_range, + objective, + maximize=True, + max_trials=10, + parallel_trials=1): + return { + 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', + 'params': autotune_paramspecs(hparams_range), + 'maxTrials': max_trials, + 'maxParallelTrials': parallel_trials, + 'hyperparameterMetricTag': objective, + } + + def configure_trainer_package(job_spec, t2t_tar): assert t2t_tar.startswith('gs://') job_spec['trainingInput']['packageUris'] = [t2t_tar] @@ -182,18 +218,25 @@ def configure_usr_dir(job_spec, usr_tar): job_spec['trainingInput']['args'].extend(usr_args) -def launch(flags_dict): +def launch(): """Launch t2t_trainer on Cloud ML Engine.""" - job_spec = configure_job(flags_dict) + assert not FLAGS.cloud_tpu + assert not FLAGS.job_dir + assert FLAGS.output_dir.startswith('gs://') + assert FLAGS.data_dir.startswith('gs://') + assert FLAGS.worker_replicas <= 1 + assert FLAGS.ps_replicas <= 0 + + job_spec = configure_job() job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - train_dir = flags_dict['output_dir'] + train_dir = FLAGS.output_dir t2t_tar = tar_and_copy_t2t(train_dir) configure_trainer_package(job_spec, t2t_tar) - if flags_dict['t2t_usr_dir']: - usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) + if FLAGS.t2t_usr_dir: + usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. 
See console to track: %s.', job_name, diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 039b06e68..0e64f2475 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -68,8 +68,12 @@ def create_hparams(hparams_set, hparams_overrides_str="", data_dir=None, problem_name=None): + """Create HParams with data_dir and problem hparams, if kwargs provided.""" hparams = registry.hparams(hparams_set)() if hparams_overrides_str: + tf.logging.info("Overriding hparams in %s with %s", + hparams_set, + hparams_overrides_str) hparams = hparams.parse(hparams_overrides_str) if data_dir: hparams.add_hparam("data_dir", data_dir) From dd3322c6392acf72e7ba64df55910fdc74ea624d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:21:04 -0800 Subject: [PATCH 22/31] Add Travis test for export and serve PiperOrigin-RevId: 184922812 --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index e703a2bc9..5e0aff001 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,11 @@ python: - "2.7" - "3.6" before_install: + - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list + - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add - - sudo apt-get update -qq - sudo apt-get install -qq libhdf5-dev + - sudo apt-get install -qq tensorflow-model-server install: - pip install -q .[tensorflow] - pip install -q .[tests] @@ -36,5 +39,14 @@ script: - t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR - t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10' + + # Export and query (on Python 2 only) + - t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + pip install tensorflow-serving-api; + tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo & + sleep 10; + t2t-query-server --problem=$T2T_PROBLEM --server=localhost:9000 --servable_name=my_model --data_dir=$T2T_DATA_DIR --inputs_once='1 0 1 0 1 0'; + fi git: depth: 3 From 3b1f70f92de247364e8aa0da56f36bbdff08399f Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 17:53:21 -0800 Subject: [PATCH 23/31] Fix hello_t2t.ipynb PiperOrigin-RevId: 184926720 --- tensor2tensor/notebooks/hello_t2t.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/notebooks/hello_t2t.ipynb b/tensor2tensor/notebooks/hello_t2t.ipynb index cc9f66a02..3b3573098 100644 --- a/tensor2tensor/notebooks/hello_t2t.ipynb +++ b/tensor2tensor/notebooks/hello_t2t.ipynb @@ -60,7 +60,7 @@ }, "source": [ "# Install deps\n", - "!pip install -q tensor2tensor" + "!pip install -q -U tensor2tensor tensorflow" ], "cell_type": "code", "execution_count": 0, @@ -87,7 +87,7 @@ "from tensor2tensor import models\n", "from tensor2tensor import problems\n", "from tensor2tensor.layers import common_layers\n", - "from tensor2tensor.tpu import 
tpu_trainer_lib\n", + "from tensor2tensor.utils import trainer_lib\n", "from tensor2tensor.utils import t2t_model\n", "from tensor2tensor.utils import registry\n", "from tensor2tensor.utils import metrics\n", @@ -597,7 +597,7 @@ "model_name = \"transformer\"\n", "hparams_set = \"transformer_base\"\n", "\n", - "hparams = tpu_trainer_lib.create_hparams(hparams_set, data_dir=data_dir, problem_name=\"translate_ende_wmt32k\")\n", + "hparams = trainer_lib.create_hparams(hparams_set, data_dir=data_dir, problem_name=\"translate_ende_wmt32k\")\n", "\n", "# NOTE: Only create the model once when restoring from a checkpoint; it's a\n", "# Layer and so subsequent instantiations will have different variable scopes\n", @@ -1407,7 +1407,7 @@ " return tf.layers.conv2d(tf.nn.relu(h2), filters,\n", " kernel_size=(3, 3))\n", "\n", - "hparams = tpu_trainer_lib.create_hparams(\"basic_1\", data_dir=data_dir, problem_name=\"image_mnist\")\n", + "hparams = trainer_lib.create_hparams(\"basic_1\", data_dir=data_dir, problem_name=\"image_mnist\")\n", "hparams.hidden_size = 64\n", "model = MySimpleModel(hparams, Modes.TRAIN)" ], @@ -1584,7 +1584,7 @@ " break\n", "\n", " # Make the inputs and targets 4D\n", - " example[\"inputs\"] = tf.reshape(example[\"inputs\"], [1, 28, 28, 3])\n", + " example[\"inputs\"] = tf.reshape(example[\"inputs\"], [1, 28, 28, 1])\n", " example[\"targets\"] = tf.reshape(example[\"targets\"], [1, 1, 1, 1])\n", "\n", " # Call the model\n", From 424d4ace667e49b23b1d4c9b35e727f6edb16453 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 7 Feb 2018 18:44:53 -0800 Subject: [PATCH 24/31] Add documentation for Cloud ML Engine PiperOrigin-RevId: 184932319 --- docs/cloud_mlengine.md | 80 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docs/cloud_mlengine.md diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md new file mode 100644 index 000000000..b257fab25 --- /dev/null +++ b/docs/cloud_mlengine.md @@ -0,0 +1,80 @@ +# Running on Cloud ML Engine + +Google Cloud Platform offers a managed training environment for TensorFlow +models called [Cloud ML Engine](https://cloud.google.com/ml-engine/) and +you can easily launch Tensor2Tensor on it, including for hyperparameter tuning. + +# Launch + +It's the same `t2t-trainer` you know and love with the addition of the +`--cloud_mlengine` flag, which by default will launch on a 1-GPU machine. + +``` +# Note that both the data dir and output dir have to be on GCS +DATA_DIR=gs://my-bucket/data +OUTPUT_DIR=gs://my-bucket/train +t2t-trainer \ + --problems=translate_ende_wmt32k \ + --model=transformer \ + --hparams_set=transformer_base \ + --data_dir=$DATA_DIR \ + --output_dir=$OUTPUT_DIR \ + --cloud_mlengine +``` + +By passing `--worker_gpu=4` or `--worker_gpu=8` it will automatically launch on +machines with 4 or 8 GPUs. + +You can additionally pass the `--cloud_mlengine_master_type` to select another +kind of machine (see the [docs for +`masterType`](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput) +for your options). If you provide this flag yourself, make sure you pass the +correct value for `--worker_gpu`. + +**Note**: `t2t-trainer` only currently supports launching with single machines, +possibly with multiple GPUs. Multi-machine setups are not yet supported out of +the box with the `--cloud_mlengine` flag, though multi-machine should in +principle work just fine. Contributions/testers welcome. 
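+
+For example, with the same setup as above, an 8-GPU launch that also pins
+the machine type explicitly (matching what `--worker_gpu=8` would select
+on its own) could look like:
+
+```
+t2t-trainer \
+  --problems=translate_ende_wmt32k \
+  --model=transformer \
+  --hparams_set=transformer_base \
+  --data_dir=$DATA_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --worker_gpu=8 \
+  --cloud_mlengine_master_type=complex_model_l_gpu \
+  --cloud_mlengine
+```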
+ +## `--t2t_usr_dir` + +Launching on Cloud ML Engine works with `--t2t_usr_dir` as well as long as the +directory is fully self-contained (i.e. the imports only refer to other modules +in the directory). If there are additional PyPI dependencies that you need, you +can include a `setup.py` file in your directory (ensure that it uses +`setuptools.find_packages`). + +# Hyperparameter Tuning + +Hyperparameter tuning with `t2t-trainer` and Cloud ML Engine is also a breeze +with `--hparams_range` and the `--autotune_*` flags: + +``` +t2t-trainer \ + --problems=translate_ende_wmt32k \ + --model=transformer \ + --hparams_set=transformer_base \ + --data_dir=$DATA_DIR \ + --output_dir=$OUTPUT_DIR \ + --cloud_mlengine \ + --hparams_range=transformer_base_range \ + --autotune_objective='metrics-translate_ende_wmt32k/neg_log_perplexity' \ + --autotune_maximize \ + --autotune_max_trials=100 \ + --autotune_parallel_trials=3 +``` + +The `--hparams_range` specifies the search space and should be registered with +`@register_ranged_hparams`. It defines a `RangedHParams` object that sets +search ranges and scales for various parameters. See `transformer_base_range` +in +[`transformer.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) +for an example. + +The metric name passed as `--autotune_objective` should be exactly what you'd +see in TensorBoard. To minimize a metric, set `--autotune_maximize=False`. + +You control how many total trials to run with `--autotune_max_trials` and the +number of jobs to launch in parallel with `--autotune_parallel_trials`. + +Happy tuning! From 6ae03f76279710597a0731dcc2d4305ab74f741f Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 8 Feb 2018 08:54:31 -0800 Subject: [PATCH 25/31] Support hyperparameter tuning on Cloud ML Engine PiperOrigin-RevId: 185001360 --- tensor2tensor/bin/t2t_trainer.py | 52 +---------- tensor2tensor/layers/common_hparams.py | 104 ++++++++++----------- tensor2tensor/models/slicenet.py | 4 + tensor2tensor/models/transformer.py | 15 ++- tensor2tensor/models/transformer_sketch.py | 5 + tensor2tensor/utils/cloud_mlengine.py | 73 +++------------ tensor2tensor/utils/trainer_lib.py | 4 - 7 files changed, 93 insertions(+), 164 deletions(-) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index a755e0d33..051c2e90d 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -93,20 +93,6 @@ "GPUs on machine type. See documentation: " "https://cloud.google.com/ml-engine/reference/rest/v1/" "projects.jobs#traininginput") -# Hyperparameter tuning on Cloud ML Engine -# Pass an --hparams_range to enable -flags.DEFINE_string("autotune_objective", None, - "TensorBoard metric name to optimize.") -flags.DEFINE_bool("autotune_maximize", True, - "Whether to maximize (vs. minimize) autotune_objective.") -flags.DEFINE_integer("autotune_max_trials", 10, - "Maximum number of tuning experiments to run.") -flags.DEFINE_integer("autotune_parallel_trials", 1, - "How many trials to run in parallel (will spin up this " - "many jobs.") -flags.DEFINE_string("job-dir", None, - "DO NOT USE. Exists only for Cloud ML Engine to pass in " - "during hyperparameter tuning. Overrides --output_dir.") def get_problem_name(): @@ -115,33 +101,6 @@ def get_problem_name(): return problems[0] -def set_hparams_from_args(args): - """Set hparams overrides from unparsed args list.""" - if not args: - return - - hp_prefix = "--hp_" - tf.logging.info("Found unparsed command-line arguments. 
Checking if any " - "start with %s and interpreting those as hparams " - "settings.", hp_prefix) - - pairs = [] - i = 0 - while i < len(args): - arg = args[i] - if arg.startswith(hp_prefix): - pairs.append((arg.lstrip(hp_prefix), args[i+1])) - i += 2 - else: - tf.logging.warn("Found unknown flag: %s", arg) - i += 1 - - as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) - if FLAGS.hparams: - as_hparams = "," + as_hparams - FLAGS.hparams += as_hparams - - def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -306,22 +265,21 @@ def maybe_cloud_tpu(): yield -def main(argv): +def main(_): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - return cloud_mlengine.launch() + assert not FLAGS.cloud_tpu + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") + return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) if FLAGS.generate_data: generate_data() - if FLAGS.job_dir: - FLAGS.output_dir = FLAGS.job_dir - - set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 147757b47..02a5df2f3 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,6 +21,7 @@ # Dependency imports +import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -223,15 +224,10 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 - SCALES_STR = { - LINEAR_SCALE: "UNIT_LINEAR_SCALE", - LOG_SCALE: "UNIT_LOG_SCALE", - REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", - } - def __init__(self): self._categorical_params = {} self._discrete_params = {} + self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -241,12 +237,10 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [ - (self._categorical_params, "categorical"), - (self._discrete_params, "discrete"), - (self._float_params, "float"), - (self._int_params, "int"), - ] + ctr_names = [(self._categorical_params, + "categorical"), (self._discrete_params, "discrete"), + (self._float_params, "float"), (self._int_params, "int"), + (self._discrete_float_params, "discrete_float")] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -269,17 +263,23 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): + if name in self._discrete_float_params: + del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) + def set_discrete_float(self, name, val): + self._check_reset_and_type_change(name, self._discrete_float_params) + self._discrete_float_params[name] = (name, [val]) + def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - self._categorical_params, self._discrete_params, 
self._float_params, - self._int_params + self._categorical_params, self._discrete_params, + self._discrete_float_params, self._float_params, self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,56 +287,52 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) - def to_parameter_specs(self, name_prefix=""): - """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" - specs = [] - for name, categories, _ in self._categorical_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "CATEGORICAL", - "categoricalValues": categories, - } - specs.append(spec) - for name, feasible_points, scale, _ in self._discrete_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "DISCRETE", - "discreteValues": feasible_points, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) +def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): + """Fill ranged_hparams with singleton values from hparams. - for name, min_val, max_val, scale, _ in self._float_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "DOUBLE", - "minValue": min_val, - "maxValue": max_val, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) + HParams are placed in RangedHParams with the following functions, according to + type: + * int: set_discrete + * bool: set_discrete + * float: set_discrete_float + * str: set_categorical - for name, min_val, max_val, scale, _ in self._int_params.values(): - spec = { - "parameterName": name_prefix + name, - "type": "INTEGER", - "minValue": min_val, - "maxValue": max_val, - } - if scale: - spec["scaleType"] = self.SCALES_STR[scale] - specs.append(spec) + Args: + hparams: tf.contrib.training.HParams; contains the hyperparameters to copy + over to ranged_hparams. + ranged_hparams: RangedHParams; will have hparams values copied to it. - return specs + Raises: + ValueError: if hparams contains a hyperparameter not of type + {int, float, str, bool}. + """ + for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access + + if is_multivalent: + raise ValueError("Multivalent hparams not supported in RangedHParams. " + "Hyperparameter %s is multivalent." 
% name) + val = getattr(hparams, name) + if hp_type == int: + ranged_hparams.set_discrete(name, [val]) + elif hp_type == bool: + ranged_hparams.set_discrete(name, [int(val)]) + elif hp_type == float: + ranged_hparams.set_discrete_float(name, val) + elif hp_type == str: + ranged_hparams.set_categorical(name, [val]) + else: + raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams + + hparams = basic_params1() + fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 43cfa571e..e77412513 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,6 +381,10 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams + + hparams = slicenet_params1() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index df4b32277..305a17379 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,9 +1228,11 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams +@registry.register_ranged_hparams("transformer_base") def transformer_base_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_base() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1335,6 +1337,8 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_tiny_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1342,6 +1346,8 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" + hparams = transformer_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1352,6 +1358,13 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) +@registry.register_ranged_hparams +def transformer_tpu_batch_range(rhp): + hparams = transformer_tpu() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_discrete("batch_size", [256, 512, 768, 1024]) + + @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
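
A sketch of the ranged-hparams pattern the transformer changes above rely on. The names `my_model_base` and `my_model_range` are hypothetical; the registry decorators, the `RangedHParams` methods, and `fill_ranged_hparams_from_hparams` are as introduced in this patch.

```python
from tensor2tensor.layers import common_hparams
from tensor2tensor.utils import registry


@registry.register_hparams
def my_model_base():
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  return hparams


@registry.register_ranged_hparams
def my_model_range(rhp):
  # First pin every hyperparameter to its base value as a singleton, so a
  # tuning study leaves it fixed...
  hparams = my_model_base()
  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
  # ...then widen only the parameters under study. set_float removes a
  # previously-set discrete float singleton, as in the patch above.
  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
  rhp.set_discrete("hidden_size", [256, 512, 1024], scale=rhp.LOG_SCALE)
```
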
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py index e8fe796a8..913243f00 100644 --- a/tensor2tensor/models/transformer_sketch.py +++ b/tensor2tensor/models/transformer_sketch.py @@ -22,6 +22,7 @@ # Dependency imports +from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.models import transformer_vae @@ -124,6 +125,10 @@ def transformer_sketch_6layer(): @registry.register_ranged_hparams("transformer_sketch_ranged") def transformer_sketch_ranged(rhp): """Range of hparams for vizier.""" + + hparams = transformer_sketch() + common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) + rhp.set_categorical("ffn_layer", ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"]) rhp.set_discrete("batch_size", [1024, 2048, 4096]) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 90c5c09e3..0bb5ddc40 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -22,17 +22,14 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials -from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud -from tensor2tensor.utils import registry from tensor2tensor.utils import usr_dir as usr_dir_lib import tensorflow as tf -FLAGS = tf.flags.FLAGS - CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' # TODO(rsepassi): +# * Support --autotune # * Add documentation clould_mlengine.md # * Enable multi-machine sync/async training @@ -47,9 +44,9 @@ """ -def flags_as_args(): - """Convert FLAGS to list of args suitable for passing on cmd line.""" - args_dict = dict(FLAGS.__dict__['__flags']) +def args_dict_as_args(args_dict): + """Convert dict to list of args suitable for passing on cmd line.""" + args_dict = dict(args_dict) del args_dict['cloud_mlengine'] # Configured later del args_dict['t2t_usr_dir'] @@ -57,8 +54,6 @@ def flags_as_args(): for name, val in args_dict.items(): if val is None: continue - if name.startswith('autotune'): - continue args.extend(['--%s' % name, str(val)]) return args @@ -88,9 +83,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None): return config -def configure_job(): +def configure_job(flags_dict): """Construct jobSpec for ML Engine job.""" - train_dir = FLAGS.output_dir + train_dir = flags_dict['output_dir'] assert train_dir.startswith('gs://') job_name = os.path.basename(train_dir) @@ -98,27 +93,16 @@ def configure_job(): # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': flags_as_args(), + 'args': args_dict_as_args(flags_dict), 'region': cloud.default_region(), 'runtimeVersion': '1.4', 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', - 'jobDir': train_dir, } training_input.update( machine_config( - num_gpus=FLAGS.worker_gpu, - use_tpu=FLAGS.use_tpu, - master_type=FLAGS.cloud_mlengine_master_type)) - if FLAGS.hparams_range: - assert FLAGS.autotune_objective - tf.logging.info('Configuring hyperparameter tuning.') - training_input['hyperparameters'] = configure_autotune( - FLAGS.hparams_range, - FLAGS.autotune_objective, - FLAGS.autotune_maximize, - FLAGS.autotune_max_trials, - FLAGS.autotune_parallel_trials, - ) + num_gpus=flags_dict['worker_gpu'], + use_tpu=flags_dict['use_tpu'], + 
master_type=flags_dict['cloud_mlengine_master_type'])) if training_input['scaleTier'] == 'CUSTOM': assert 'masterType' in training_input @@ -186,26 +170,6 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): return usr_tar -def autotune_paramspecs(hparams_range): - rhp = common_hparams.RangedHParams() - registry.ranged_hparams(hparams_range)(rhp) - return rhp.to_parameter_specs(name_prefix='hp_') - - -def configure_autotune(hparams_range, - objective, - maximize=True, - max_trials=10, - parallel_trials=1): - return { - 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', - 'params': autotune_paramspecs(hparams_range), - 'maxTrials': max_trials, - 'maxParallelTrials': parallel_trials, - 'hyperparameterMetricTag': objective, - } - - def configure_trainer_package(job_spec, t2t_tar): assert t2t_tar.startswith('gs://') job_spec['trainingInput']['packageUris'] = [t2t_tar] @@ -218,25 +182,18 @@ def configure_usr_dir(job_spec, usr_tar): job_spec['trainingInput']['args'].extend(usr_args) -def launch(): +def launch(flags_dict): """Launch t2t_trainer on Cloud ML Engine.""" - assert not FLAGS.cloud_tpu - assert not FLAGS.job_dir - assert FLAGS.output_dir.startswith('gs://') - assert FLAGS.data_dir.startswith('gs://') - assert FLAGS.worker_replicas <= 1 - assert FLAGS.ps_replicas <= 0 - - job_spec = configure_job() + job_spec = configure_job(flags_dict) job_name = job_spec['jobId'] tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, job_spec) assert cloud.confirm() - train_dir = FLAGS.output_dir + train_dir = flags_dict['output_dir'] t2t_tar = tar_and_copy_t2t(train_dir) configure_trainer_package(job_spec, t2t_tar) - if FLAGS.t2t_usr_dir: - usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) + if flags_dict['t2t_usr_dir']: + usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) tf.logging.info('Launched %s. See console to track: %s.', job_name, diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 0e64f2475..039b06e68 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -68,12 +68,8 @@ def create_hparams(hparams_set, hparams_overrides_str="", data_dir=None, problem_name=None): - """Create HParams with data_dir and problem hparams, if kwargs provided.""" hparams = registry.hparams(hparams_set)() if hparams_overrides_str: - tf.logging.info("Overriding hparams in %s with %s", - hparams_set, - hparams_overrides_str) hparams = hparams.parse(hparams_overrides_str) if data_dir: hparams.add_hparam("data_dir", data_dir) From bbee177e9551bb1fa80ce6188cc277563cbe3377 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 8 Feb 2018 09:31:26 -0800 Subject: [PATCH 26/31] Changes to make Librispeech TPU friendly. 
PiperOrigin-RevId: 185006656 --- .../data_generators/speech_recognition.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 4c037aeb3..e17e4de85 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -34,7 +34,6 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers -from tensor2tensor.utils import expert_utils from tensor2tensor.utils import metrics from tensor2tensor.utils import modality from tensor2tensor.utils import registry @@ -242,7 +241,7 @@ def hparams(self, defaults, model_hparams): p = model_hparams # Filterbank extraction # Filterbank extraction in bottom instead of preprocess_example is faster. - p.add_hparam("audio_preproc_in_bottom", True) + p.add_hparam("audio_preproc_in_bottom", False) # The trainer seems to reserve memory for all members of the input dict p.add_hparam("audio_keep_example_waveforms", False) p.add_hparam("audio_sample_rate", 16000) @@ -341,7 +340,6 @@ def bottom(self, inputs): float32 tensor with shape [batch_size, shorter_len, 1, hidden_size] """ p = self._model_hparams - training = p.mode == tf.estimator.ModeKeys.TRAIN num_mel_bins = p.audio_num_mel_bins num_channels = 3 if p.audio_add_delta_deltas else 1 @@ -376,18 +374,19 @@ def bottom(self, inputs): x.set_shape([None, None, 1, num_mel_bins * num_channels]) xshape = common_layers.shape_list(x) - x = tf.reshape(x, [-1, 1, num_mel_bins * num_channels]) - padding_mask = common_attention.embedding_to_padding(x) - pad_remover = expert_utils.PadRemover(padding_mask) - - x = pad_remover.remove(x) + nonpadding_mask = 1. - common_attention.embedding_to_padding(x) + num_of_nonpadding_elements = tf.reduce_sum( + nonpadding_mask) * num_mel_bins * num_channels # This replaces CMVN estimation on data - x = tf.layers.batch_normalization( - x, axis=2, center=False, scale=False, training=training) - - x = pad_remover.restore(x) + mean = tf.reduce_sum( + x, axis=[1, 2], keepdims=True) / num_of_nonpadding_elements + variance = (num_of_nonpadding_elements * mean**2. - + 2. 
* mean * tf.reduce_sum(x, axis=[1, 2], keepdims=True)
+              + tf.reduce_sum(x**2, axis=[1, 2], keepdims=True)
+                ) / num_of_nonpadding_elements
+      x = (x - mean) / variance * tf.expand_dims(nonpadding_mask, -1)

       # restore batch_size x time x frequency x channel layout
       x = tf.reshape(x, [xshape[0], xshape[1], num_mel_bins, num_channels])

From ec364a158c8a45813e37a873c5b81d4dbb912fb6 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 8 Feb 2018 11:15:20 -0800
Subject: [PATCH 27/31] internal merge

PiperOrigin-RevId: 185022378
---
 setup.py                                    |   1 +
 tensor2tensor/rl/README.md                  |  11 +
 tensor2tensor/rl/__init__.py                |  15 ++
 tensor2tensor/rl/collect.py                 |  95 ++++++++
 tensor2tensor/rl/envs/__init__.py           |  15 ++
 tensor2tensor/rl/envs/batch_env.py          | 129 +++++++++++
 tensor2tensor/rl/envs/in_graph_batch_env.py | 165 ++++++++++++++
 tensor2tensor/rl/envs/utils.py              | 229 ++++++++++++++++++++
 tensor2tensor/rl/networks.py                |  69 ++++++
 tensor2tensor/rl/ppo.py                     | 102 +++++++++
 tensor2tensor/rl/rl_trainer_lib.py          |  87 ++++++++
 tensor2tensor/rl/rl_trainer_lib_test.py     |  34 +++
 tensor2tensor/rl/t2t_rl_trainer.py          |  30 +++
 13 files changed, 982 insertions(+)
 create mode 100644 tensor2tensor/rl/README.md
 create mode 100644 tensor2tensor/rl/__init__.py
 create mode 100644 tensor2tensor/rl/collect.py
 create mode 100644 tensor2tensor/rl/envs/__init__.py
 create mode 100644 tensor2tensor/rl/envs/batch_env.py
 create mode 100644 tensor2tensor/rl/envs/in_graph_batch_env.py
 create mode 100644 tensor2tensor/rl/envs/utils.py
 create mode 100644 tensor2tensor/rl/networks.py
 create mode 100644 tensor2tensor/rl/ppo.py
 create mode 100644 tensor2tensor/rl/rl_trainer_lib.py
 create mode 100644 tensor2tensor/rl/rl_trainer_lib_test.py
 create mode 100644 tensor2tensor/rl/t2t_rl_trainer.py

diff --git a/setup.py b/setup.py
index 2ee6e74ee..567d7a392 100644
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@
     'gevent',
     'google-api-python-client',
     'gunicorn',
+    'gym<=0.9.5',  # gym in version 0.9.6 has some temporary issues.
     'numpy',
     'requests',
     'scipy',
diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md
new file mode 100644
index 000000000..053119f05
--- /dev/null
+++ b/tensor2tensor/rl/README.md
@@ -0,0 +1,11 @@
+# Tensor2Tensor experimental Reinforcement Learning.
+
+The rl package aims to make it possible to run reinforcement learning
+algorithms within TensorFlow's computation graph. It is still very
+experimental and under heavy development.
+
+Currently the only supported algorithm is Proximal Policy Optimization (PPO).
+
+## Sample usage - training in the Pendulum-v0 environment.
+
+```python rl/t2t_rl_trainer.py```
diff --git a/tensor2tensor/rl/__init__.py b/tensor2tensor/rl/__init__.py
new file mode 100644
index 000000000..3f714ce1f
--- /dev/null
+++ b/tensor2tensor/rl/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
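
A NumPy sketch of the padding-aware normalization from the speech_recognition.py hunk in the previous patch, built on the identity sum((x - m)^2) = n*m^2 - 2*m*sum(x) + sum(x^2). Unlike the hunk, this sketch keeps per-example counts and divides by the standard deviation; the hunk divides by the raw variance, which looks like an oversight worth checking.

```python
import numpy as np

x = np.random.randn(2, 100, 80).astype(np.float32)  # batch x time x mel_bins
mask = np.ones((2, 100), dtype=np.float32)
mask[1, 60:] = 0.  # frames 60+ of the second example are padding

masked = x * mask[..., None]
n = mask.sum(axis=1)[:, None, None] * x.shape[-1]  # nonpadding elements
total = masked.sum(axis=(1, 2), keepdims=True)
mean = total / n
var = (n * mean**2 - 2. * mean * total
       + (masked**2).sum(axis=(1, 2), keepdims=True)) / n
x_norm = (x - mean) / np.sqrt(var + 1e-9) * mask[..., None]
```
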
+ diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py new file mode 100644 index 000000000..2ea262143 --- /dev/null +++ b/tensor2tensor/rl/collect.py @@ -0,0 +1,95 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Collect trajectories from interactions of agent with environment.""" + +import tensorflow as tf + + +def define_collect(policy_factory, batch_env, config): + """Collect trajectories.""" + memory_shape = [config.epoch_length] + [batch_env.observ.shape.as_list()[0]] + memories_shapes_and_types = [ + # observation + (memory_shape + [batch_env.observ.shape.as_list()[1]], tf.float32), + (memory_shape, tf.float32), # reward + (memory_shape, tf.bool), # done + (memory_shape + batch_env.action_shape, tf.float32), # action + (memory_shape, tf.float32), # pdf + (memory_shape, tf.float32), # value function + ] + memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False) + for (shape, dtype) in memories_shapes_and_types] + cumulative_rewards = tf.Variable( + tf.zeros(config.num_agents, tf.float32), trainable=False) + + should_reset_var = tf.Variable(True, trainable=False) + reset_op = tf.cond(should_reset_var, + lambda: batch_env.reset(tf.range(config.num_agents)), + lambda: 0.0) + with tf.control_dependencies([reset_op]): + reset_once_op = tf.assign(should_reset_var, False) + + with tf.control_dependencies([reset_once_op]): + + def step(index, scores_sum, scores_num): + """Single step.""" + # Note - the only way to ensure making a copy of tensor is to run simple + # operation. 
We are waiting for tf.copy: + # https://github.com/tensorflow/tensorflow/issues/11186 + obs_copy = batch_env.observ + 0 + actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) + policy = actor_critic.policy + action = policy.sample() + postprocessed_action = actor_critic.action_postprocessing(action) + simulate_output = batch_env.simulate(postprocessed_action[0, ...]) + pdf = policy.prob(action)[0] + with tf.control_dependencies(simulate_output): + reward, done = simulate_output + done = tf.reshape(done, (config.num_agents,)) + to_save = [obs_copy, reward, done, action[0, ...], pdf, + actor_critic.value[0]] + save_ops = [tf.scatter_update(memory_slot, index, value) + for memory_slot, value in zip(memory, to_save)] + cumulate_rewards_op = cumulative_rewards.assign_add(reward) + agent_indices_to_reset = tf.where(done)[:, 0] + with tf.control_dependencies([cumulate_rewards_op]): + scores_sum_delta = tf.reduce_sum( + tf.gather(cumulative_rewards, agent_indices_to_reset)) + scores_num_delta = tf.count_nonzero(done, dtype=tf.int32) + with tf.control_dependencies(save_ops + [scores_sum_delta, + scores_num_delta]): + reset_env_op = batch_env.reset(agent_indices_to_reset) + reset_cumulative_rewards_op = tf.scatter_update( + cumulative_rewards, agent_indices_to_reset, + tf.zeros(tf.shape(agent_indices_to_reset))) + with tf.control_dependencies([reset_env_op, + reset_cumulative_rewards_op]): + return [index + 1, scores_sum + scores_sum_delta, + scores_num + scores_num_delta] + + init = [tf.constant(0), tf.constant(0.0), tf.constant(0)] + index, scores_sum, scores_num = tf.while_loop( + lambda c, _1, _2: c < config.epoch_length, + step, + init, + parallel_iterations=1, + back_prop=False) + mean_score = tf.cond(tf.greater(scores_num, 0), + lambda: scores_sum / tf.cast(scores_num, tf.float32), + lambda: 0.) + printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ") + with tf.control_dependencies([printing]): + return tf.identity(index), memory diff --git a/tensor2tensor/rl/envs/__init__.py b/tensor2tensor/rl/envs/__init__.py new file mode 100644 index 000000000..3f714ce1f --- /dev/null +++ b/tensor2tensor/rl/envs/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/tensor2tensor/rl/envs/batch_env.py b/tensor2tensor/rl/envs/batch_env.py new file mode 100644 index 000000000..453348976 --- /dev/null +++ b/tensor2tensor/rl/envs/batch_env.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Combine multiple environments to step them in batch."""
+
+# The code was based on Danijar Hafner's code from tf.agents:
+# https://github.com/tensorflow/agents/blob/master/agents/tools/batch_env.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+class BatchEnv(object):
+  """Combine multiple environments to step them in batch."""
+
+  def __init__(self, envs, blocking):
+    """Combine multiple environments to step them in batch.
+
+    To step environments in parallel, environments must support a
+    `blocking=False` argument to their step and reset functions that makes them
+    return callables instead to receive the result at a later time.
+
+    Args:
+      envs: List of environments.
+      blocking: Step environments one after another rather than in parallel.
+
+    Raises:
+      ValueError: Environments have different observation or action spaces.
+    """
+    self._envs = envs
+    self._blocking = blocking
+    observ_space = self._envs[0].observation_space
+    if not all(env.observation_space == observ_space for env in self._envs):
+      raise ValueError('All environments must use the same observation space.')
+    action_space = self._envs[0].action_space
+    if not all(env.action_space == action_space for env in self._envs):
+      raise ValueError('All environments must use the same action space.')
+
+  def __len__(self):
+    """Number of combined environments."""
+    return len(self._envs)
+
+  def __getitem__(self, index):
+    """Access an underlying environment by index."""
+    return self._envs[index]
+
+  def __getattr__(self, name):
+    """Forward unimplemented attributes to one of the original environments.
+
+    Args:
+      name: Attribute that was accessed.
+
+    Returns:
+      Value behind the attribute name in one of the wrapped environments.
+    """
+    return getattr(self._envs[0], name)
+
+  def step(self, actions):
+    """Forward a batch of actions to the wrapped environments.
+
+    Args:
+      actions: Batched action to apply to the environment.
+
+    Raises:
+      ValueError: Invalid actions.
+
+    Returns:
+      Batch of observations, rewards, and done flags.
+    """
+    for index, (env, action) in enumerate(zip(self._envs, actions)):
+      if not env.action_space.contains(action):
+        message = 'Invalid action at index {}: {}'
+        raise ValueError(message.format(index, action))
+    if self._blocking:
+      transitions = [
+          env.step(action)
+          for env, action in zip(self._envs, actions)]
+    else:
+      transitions = [
+          env.step(action, blocking=False)
+          for env, action in zip(self._envs, actions)]
+      transitions = [transition() for transition in transitions]
+    observs, rewards, dones, infos = zip(*transitions)
+    observ = np.stack(observs).astype(np.float32)
+    reward = np.stack(rewards).astype(np.float32)
+    done = np.stack(dones)
+    info = tuple(infos)
+    return observ, reward, done, info
+
+  def reset(self, indices=None):
+    """Reset the environment and convert the resulting observation.
+
+    Args:
+      indices: The batch indices of environments to reset; defaults to all.
+
+    Returns:
+      Batch of observations.
+ """ + if indices is None: + indices = np.arange(len(self._envs)) + if self._blocking: + observs = [self._envs[index].reset() for index in indices] + else: + observs = [self._envs[index].reset(blocking=False) for index in indices] + observs = [observ() for observ in observs] + observ = np.stack(observs) + observ = observ.astype(np.float32) + return observ + + def close(self): + """Send close messages to the external process and join them.""" + for env in self._envs: + if hasattr(env, 'close'): + env.close() diff --git a/tensor2tensor/rl/envs/in_graph_batch_env.py b/tensor2tensor/rl/envs/in_graph_batch_env.py new file mode 100644 index 000000000..eae0826a3 --- /dev/null +++ b/tensor2tensor/rl/envs/in_graph_batch_env.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch of environments inside the TensorFlow graph.""" + +# The code was based on Danijar Hafner's code from tf.agents: +# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import gym + +import tensorflow as tf + + +class InGraphBatchEnv(object): + """Batch of environments inside the TensorFlow graph. + + The batch of environments will be stepped and reset inside of the graph using + a tf.py_func(). The current batch of observations, actions, rewards, and done + flags are held in according variables. + """ + + def __init__(self, batch_env): + """Batch of environments inside the TensorFlow graph. + + Args: + batch_env: Batch environment. + """ + self._batch_env = batch_env + observ_shape = self._parse_shape(self._batch_env.observation_space) + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + self.action_shape = list(self._parse_shape(self._batch_env.action_space)) + self.action_dtype = self._parse_dtype(self._batch_env.action_space) + with tf.variable_scope('env_temporary'): + self._observ = tf.Variable( + tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype), + name='observ', trainable=False) + + def __getattr__(self, name): + """Forward unimplemented attributes to one of the original environments. + + Args: + name: Attribute that was accessed. + + Returns: + Value behind the attribute name in one of the original environments. + """ + return getattr(self._batch_env, name) + + def __len__(self): + """Number of combined environments.""" + return len(self._batch_env) + + def __getitem__(self, index): + """Access an underlying environment by index.""" + return self._batch_env[index] + + def simulate(self, action): + """Step the batch of environments. + + The results of the step can be accessed from the variables defined below. + + Args: + action: Tensor holding the batch of actions to apply. + + Returns: + Operation. 
+ """ + with tf.name_scope('environment/simulate'): + if action.dtype in (tf.float16, tf.float32, tf.float64): + action = tf.check_numerics(action, 'action') + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ, reward, done = tf.py_func( + lambda a: self._batch_env.step(a)[:3], [action], + [observ_dtype, tf.float32, tf.bool], name='step') + observ = tf.check_numerics(observ, 'observ') + reward = tf.check_numerics(reward, 'reward') + with tf.control_dependencies([self._observ.assign(observ)]): + return tf.identity(reward), tf.identity(done) + + def reset(self, indices=None): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset. + + Returns: + Batch tensor of the new observations. + """ + return tf.cond( + tf.cast(tf.shape(indices)[0], tf.bool), + lambda: self._reset_non_empty(indices), lambda: 0.0) + + def _reset_non_empty(self, indices): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset; defaults to all. + + Returns: + Batch tensor of the new observations. + """ + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ = tf.py_func( + self._batch_env.reset, [indices], observ_dtype, name='reset') + observ = tf.check_numerics(observ, 'observ') + with tf.control_dependencies([ + tf.scatter_update(self._observ, indices, observ)]): + return tf.identity(observ) + + @property + def observ(self): + """Access the variable holding the current observation.""" + return self._observ + + def close(self): + """Send close messages to the external process and join them.""" + self._batch_env.close() + + def _parse_shape(self, space): + """Get a tensor shape from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + Shape tuple. + """ + if isinstance(space, gym.spaces.Discrete): + return () + if isinstance(space, gym.spaces.Box): + return space.shape + raise NotImplementedError() + + def _parse_dtype(self, space): + """Get a tensor dtype from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + TensorFlow data type. + """ + if isinstance(space, gym.spaces.Discrete): + return tf.int32 + if isinstance(space, gym.spaces.Box): + return tf.float32 + raise NotImplementedError() diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py new file mode 100644 index 000000000..8171fbe17 --- /dev/null +++ b/tensor2tensor/rl/envs/utils.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Utilities for using batched environments."""
+
+# The code was based on Danijar Hafner's code from tf.agents:
+# https://github.com/tensorflow/agents/blob/master/agents/tools/wrappers.py
+# https://github.com/tensorflow/agents/blob/master/agents/scripts/utility.py
+
+import atexit
+import multiprocessing
+import sys
+import traceback
+
+# Dependency imports
+
+from tensor2tensor.rl.envs import batch_env
+from tensor2tensor.rl.envs import in_graph_batch_env
+import tensorflow as tf
+
+
+class ExternalProcessEnv(object):
+  """Step environment in a separate process for lock-free parallelism."""
+
+  # Message types for communication via the pipe.
+  _ACCESS = 1
+  _CALL = 2
+  _RESULT = 3
+  _EXCEPTION = 4
+  _CLOSE = 5
+
+  def __init__(self, constructor):
+    """Step environment in a separate process for lock-free parallelism.
+
+    The environment will be created in the external process by calling the
+    specified callable. This can be an environment class, or a function
+    creating the environment and potentially wrapping it. The returned
+    environment should not access global variables.
+
+    Args:
+      constructor: Callable that creates and returns an OpenAI gym environment.
+
+    Attributes:
+      observation_space: The cached observation space of the environment.
+      action_space: The cached action space of the environment.
+    """
+    self._conn, conn = multiprocessing.Pipe()
+    self._process = multiprocessing.Process(
+        target=self._worker, args=(constructor, conn))
+    atexit.register(self.close)
+    self._process.start()
+    self._observ_space = None
+    self._action_space = None
+
+  @property
+  def observation_space(self):
+    if not self._observ_space:
+      self._observ_space = self.__getattr__('observation_space')
+    return self._observ_space
+
+  @property
+  def action_space(self):
+    if not self._action_space:
+      self._action_space = self.__getattr__('action_space')
+    return self._action_space
+
+  def __getattr__(self, name):
+    """Request an attribute from the environment.
+
+    Note that this involves communication with the external process, so it can
+    be slow.
+
+    Args:
+      name: Attribute to access.
+
+    Returns:
+      Value of the attribute.
+    """
+    self._conn.send((self._ACCESS, name))
+    return self._receive()
+
+  def call(self, name, *args, **kwargs):
+    """Asynchronously call a method of the external environment.
+
+    Args:
+      name: Name of the method to call.
+      *args: Positional arguments to forward to the method.
+      **kwargs: Keyword arguments to forward to the method.
+
+    Returns:
+      Promise object that blocks and provides the return value when called.
+    """
+    payload = name, args, kwargs
+    self._conn.send((self._CALL, payload))
+    return self._receive
+
+  def close(self):
+    """Send a close message to the external process and join it."""
+    try:
+      self._conn.send((self._CLOSE, None))
+      self._conn.close()
+    except IOError:
+      # The connection was already closed.
+      pass
+    self._process.join()
+
+  def step(self, action, blocking=True):
+    """Step the environment.
+
+    Args:
+      action: The action to apply to the environment.
+      blocking: Whether to wait for the result.
+
+    Returns:
+      Transition tuple when blocking, otherwise callable that returns the
+      transition tuple.
+    """
+    promise = self.call('step', action)
+    if blocking:
+      return promise()
+    else:
+      return promise
+
+  def reset(self, blocking=True):
+    """Reset the environment.
+
+    Args:
+      blocking: Whether to wait for the result.
+
+    Returns:
+      New observation when blocking, otherwise callable that returns the new
+      observation.
+ """ + promise = self.call('reset') + if blocking: + return promise() + else: + return promise + + def _receive(self): + """Wait for a message from the worker process and return its payload. + + Raises: + Exception: An exception was raised inside the worker process. + KeyError: The reveived message is of an unknown type. + + Returns: + Payload object of the message. + """ + message, payload = self._conn.recv() + # Re-raise exceptions in the main process. + if message == self._EXCEPTION: + stacktrace = payload + raise Exception(stacktrace) + if message == self._RESULT: + return payload + raise KeyError('Received message of unexpected type {}'.format(message)) + + def _worker(self, constructor, conn): + """The process waits for actions and sends back environment results. + + Args: + constructor: Constructor for the OpenAI Gym environment. + conn: Connection for communication to the main process. + """ + try: + env = constructor() + while True: + try: + # Only block for short times to have keyboard exceptions be raised. + if not conn.poll(0.1): + continue + message, payload = conn.recv() + except (EOFError, KeyboardInterrupt): + break + if message == self._ACCESS: + name = payload + result = getattr(env, name) + conn.send((self._RESULT, result)) + continue + if message == self._CALL: + name, args, kwargs = payload + result = getattr(env, name)(*args, **kwargs) + conn.send((self._RESULT, result)) + continue + if message == self._CLOSE: + assert payload is None + break + raise KeyError('Received message of unknown type {}'.format(message)) + except Exception: # pylint: disable=broad-except + stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) + tf.logging.error('Error in environment process: {}'.format(stacktrace)) + conn.send((self._EXCEPTION, stacktrace)) + conn.close() + + +def define_batch_env(constructor, num_agents, env_processes=True): + """Create environments and apply all desired wrappers. + + Args: + constructor: Constructor of an OpenAI gym environment. + num_agents: Number of environments to combine in the batch. + env_processes: Whether to step environment in external processes. + + Returns: + In-graph environments object. + """ + with tf.variable_scope('environments'): + if env_processes: + envs = [ + ExternalProcessEnv(constructor) + for _ in range(num_agents)] + else: + envs = [constructor() for _ in range(num_agents)] + env = batch_env.BatchEnv(envs, blocking=not env_processes) + env = in_graph_batch_env.InGraphBatchEnv(env) + return env diff --git a/tensor2tensor/rl/networks.py b/tensor2tensor/rl/networks.py new file mode 100644 index 000000000..4ad7c5020 --- /dev/null +++ b/tensor2tensor/rl/networks.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Neural networks for actor-critic algorithms.""" + +import collections +import functools +import operator + +# Dependency imports + +import gym +import tensorflow as tf + + +NetworkOutput = collections.namedtuple( + 'NetworkOutput', 'policy, value, action_postprocessing') + + +def feed_forward_gaussian_fun(observation_space, action_space, config, + observations): + """Feed-forward gaussian.""" + assert isinstance(observation_space, gym.spaces.box.Box) + + mean_weights_initializer = tf.contrib.layers.variance_scaling_initializer( + factor=config.init_mean_factor) + logstd_initializer = tf.random_normal_initializer(config.init_logstd, 1e-10) + + flat_observations = tf.reshape(observations, [ + tf.shape(observations)[0], tf.shape(observations)[1], + functools.reduce(operator.mul, observations.shape.as_list()[2:], 1)]) + + with tf.variable_scope('policy'): + x = flat_observations + for size in config.policy_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + mean = tf.contrib.layers.fully_connected( + x, action_space.shape[0], tf.tanh, + weights_initializer=mean_weights_initializer) + logstd = tf.get_variable( + 'logstd', mean.shape[2:], tf.float32, logstd_initializer) + logstd = tf.tile( + logstd[None, None], + [tf.shape(mean)[0], tf.shape(mean)[1]] + [1] * (mean.shape.ndims - 2)) + with tf.variable_scope('value'): + x = flat_observations + for size in config.value_layers: + x = tf.contrib.layers.fully_connected(x, size, tf.nn.relu) + value = tf.contrib.layers.fully_connected(x, 1, None)[..., 0] + mean = tf.check_numerics(mean, 'mean') + logstd = tf.check_numerics(logstd, 'logstd') + value = tf.check_numerics(value, 'value') + + policy = tf.contrib.distributions.MultivariateNormalDiag(mean, + tf.exp(logstd)) + + return NetworkOutput(policy, value, lambda a: tf.clip_by_value(a, -2., 2)) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py new file mode 100644 index 000000000..a2b34c797 --- /dev/null +++ b/tensor2tensor/rl/ppo.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PPO algorithm implementation. 
+ +Based on: https://arxiv.org/abs/1707.06347 +""" + + +import tensorflow as tf + + +def define_ppo_step(observation, action, reward, done, value, old_pdf, + policy_factory, config): + """A step of PPO.""" + new_policy_dist, new_value, _ = policy_factory(observation) + new_pdf = new_policy_dist.prob(action) + + ratio = new_pdf/old_pdf + clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef, + 1 + config.clipping_coef) + + advantage = calculate_discounted_return( + reward, value, done, config.gae_gamma, config.gae_lambda) - value + + advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], + keep_dims=True) + advantage_normalized = tf.stop_gradient( + (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) + + surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized, + ratio * advantage_normalized) + policy_loss = -tf.reduce_mean(surrogate_objective) + + value_error = calculate_discounted_return( + reward, new_value, done, config.gae_gamma, config.gae_lambda) - value + value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2) + + entropy = new_policy_dist.entropy() + entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy) + + total_loss = policy_loss + value_loss + entropy_loss + + optimization_op = config.optimizer(config.learning_rate).minimize(total_loss) + + with tf.control_dependencies([optimization_op]): + return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)] + + +def define_ppo_epoch(memory, policy_factory, config): + """An epoch of PPO.""" + observation, reward, done, action, old_pdf, value = memory + + # This is to avoid propagating gradients though simulation of simulation + observation = tf.stop_gradient(observation) + action = tf.stop_gradient(action) + reward = tf.stop_gradient(reward) + done = tf.stop_gradient(done) + value = tf.stop_gradient(value) + old_pdf = tf.stop_gradient(old_pdf) + + policy_loss, value_loss, entropy_loss = tf.scan( + lambda _1, _2: define_ppo_step( # pylint: disable=g-long-lambda + observation, action, reward, done, value, + old_pdf, policy_factory, config), + tf.range(config.optimization_epochs), + [0., 0., 0.], + parallel_iterations=1) + + print_losses = tf.group( + tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '), + tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '), + tf.Print(0, [tf.reduce_mean(entropy_loss)], 'entropy loss: ')) + + return print_losses + + +def calculate_discounted_return(reward, value, done, discount, unused_lambda): + """Discounted Monte-Carlo returns.""" + done = tf.cast(done, tf.float32) + reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :] + reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0) + return_ = tf.reverse(tf.scan( + lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg, # fn + [tf.reverse(reward, [0]), # elem + tf.reverse(done, [0])], + tf.zeros_like(reward[0, :]), # initializer + 1, + False), [0]) + return tf.check_numerics(return_, 'return') diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py new file mode 100644 index 000000000..ced6da342 --- /dev/null +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Training of RL agent with PPO algorithm.""" + +from __future__ import absolute_import + +import functools + +# Dependency imports + +import gym +from tensor2tensor.rl import collect +from tensor2tensor.rl import networks +from tensor2tensor.rl import ppo +from tensor2tensor.rl.envs import utils + +import tensorflow as tf + + +def define_train(policy_lambda, env_lambda, config): + """Define the training setup.""" + env = env_lambda() + action_space = env.action_space + observation_space = env.observation_space + + batch_env = utils.define_batch_env(env_lambda, config.num_agents) + + policy_factory = tf.make_template( + "network", + functools.partial(policy_lambda, observation_space, + action_space, config)) + + (collect_op, memory) = collect.define_collect( + policy_factory, batch_env, config) + + with tf.control_dependencies([collect_op]): + ppo_op = ppo.define_ppo_epoch(memory, policy_factory, config) + + return ppo_op + + +def train(params): + policy_lambda, env_lambda, config = params + ppo_op = define_train(policy_lambda, env_lambda, config) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for _ in range(config.epochs_num): + sess.run(ppo_op) + + +def example_params(): + """Example hyperparameters.""" + config = tf.contrib.training.HParams( + init_mean_factor=0.1, + init_logstd=0.1, + policy_layers=(100, 100), + value_layers=(100, 100), + num_agents=30, + clipping_coef=0.2, + gae_gamma=0.99, + gae_lambda=0.95, + entropy_loss_coef=0.01, + value_loss_coef=1, + optimizer=tf.train.AdamOptimizer, + learning_rate=1e-4, + optimization_epochs=15, + epoch_length=200, + epochs_num=2000) + return networks.feed_forward_gaussian_fun, pendulum_lambda, config + + +def pendulum_lambda(): + return gym.make("Pendulum-v0") diff --git a/tensor2tensor/rl/rl_trainer_lib_test.py b/tensor2tensor/rl/rl_trainer_lib_test.py new file mode 100644 index 000000000..1a276ef39 --- /dev/null +++ b/tensor2tensor/rl/rl_trainer_lib_test.py @@ -0,0 +1,34 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
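
rl_trainer_lib.py above packages the whole pipeline behind `train()` and `example_params()`; the test that follows exercises it the same way. A short smoke-run sketch (assumes gym's Pendulum-v0, as in `pendulum_lambda`):

```python
from tensor2tensor.rl import rl_trainer_lib

policy_fun, env_fun, config = rl_trainer_lib.example_params()
config.epochs_num = 10  # the default schedule runs 2000 epochs
rl_trainer_lib.train((policy_fun, env_fun, config))
```
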
+
+"""Tests of basic flow of collecting trajectories and training PPO."""
+
+# Dependency imports
+
+from tensor2tensor.rl import rl_trainer_lib
+
+import tensorflow as tf
+
+
+class TrainTest(tf.test.TestCase):
+
+  def test_no_crash_pendulum(self):
+    params = rl_trainer_lib.example_params()
+    params[2].epochs_num = 10
+    rl_trainer_lib.train(params)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py
new file mode 100644
index 000000000..875c28567
--- /dev/null
+++ b/tensor2tensor/rl/t2t_rl_trainer.py
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training of RL agent with PPO algorithm."""
+
+# Dependency imports
+
+from tensor2tensor.rl import rl_trainer_lib
+
+import tensorflow as tf
+
+
+def main(_):
+  rl_trainer_lib.train(rl_trainer_lib.example_params())
+
+
+if __name__ == "__main__":
+  tf.app.run()
From c03fd19df53eabe75c2d7aac86431cd4ba833918 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 8 Feb 2018 11:16:13 -0800
Subject: [PATCH 28/31] Support hyperparameter tuning on Cloud ML Engine
 (reapply)

PiperOrigin-RevId: 185022560
---
 tensor2tensor/bin/t2t_trainer.py           |  55 ++++++++++-
 tensor2tensor/layers/common_hparams.py     | 104 +++++++++++----------
 tensor2tensor/models/slicenet.py           |   4 -
 tensor2tensor/models/transformer.py        |  15 +--
 tensor2tensor/models/transformer_sketch.py |   5 -
 tensor2tensor/utils/cloud_mlengine.py      |  74 +++++++++++----
 tensor2tensor/utils/trainer_lib.py         |   4 +
 7 files changed, 167 insertions(+), 94 deletions(-)

diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
index 051c2e90d..469734883 100644
--- a/tensor2tensor/bin/t2t_trainer.py
+++ b/tensor2tensor/bin/t2t_trainer.py
@@ -93,6 +93,22 @@
                     "GPUs on machine type. See documentation: "
                     "https://cloud.google.com/ml-engine/reference/rest/v1/"
                     "projects.jobs#traininginput")
+# Hyperparameter tuning on Cloud ML Engine
+# Pass an --hparams_range to enable
+flags.DEFINE_string("autotune_objective", None,
+                    "TensorBoard metric name to optimize.")
+flags.DEFINE_bool("autotune_maximize", True,
+                  "Whether to maximize (vs. minimize) autotune_objective.")
+flags.DEFINE_integer("autotune_max_trials", 10,
+                     "Maximum number of tuning experiments to run.")
+flags.DEFINE_integer("autotune_parallel_trials", 1,
+                     "How many trials to run in parallel (will spin up this "
+                     "many jobs).")
+# Note that in open-source TensorFlow, the dash gets converted to an
+# underscore, so access is FLAGS.job_dir.
+flags.DEFINE_string("job-dir", None,
+                    "DO NOT USE. Exists only for Cloud ML Engine to pass in "
+                    "during hyperparameter tuning. Overrides --output_dir.")


 def get_problem_name():
@@ -101,6 +117,33 @@ def get_problem_name():
   return problems[0]


+def set_hparams_from_args(args):
+  """Set hparams overrides from unparsed args list."""
+  if not args:
+    return
+
+  hp_prefix = "--hp_"
+  tf.logging.info("Found unparsed command-line arguments.
Checking if any " + "start with %s and interpreting those as hparams " + "settings.", hp_prefix) + + pairs = [] + i = 0 + while i < len(args): + arg = args[i] + if arg.startswith(hp_prefix): + pairs.append((arg.lstrip(hp_prefix), args[i+1])) + i += 2 + else: + tf.logging.warn("Found unknown flag: %s", arg) + i += 1 + + as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs]) + if FLAGS.hparams: + as_hparams = "," + as_hparams + FLAGS.hparams += as_hparams + + def create_hparams(): if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set: tf.logging.warn("Not all hyperparameter sets work on TPU. " @@ -265,21 +308,23 @@ def maybe_cloud_tpu(): yield -def main(_): +def main(argv): tf.logging.set_verbosity(tf.logging.INFO) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() if FLAGS.cloud_mlengine: - assert not FLAGS.cloud_tpu - assert FLAGS.output_dir.startswith("gs://") - assert FLAGS.data_dir.startswith("gs://") - return cloud_mlengine.launch(dict(FLAGS.__dict__["__flags"])) + return cloud_mlengine.launch() if FLAGS.generate_data: generate_data() + if hasattr(FLAGS, "job_dir") and FLAGS.job_dir: + FLAGS.output_dir = FLAGS.job_dir + + if argv: + set_hparams_from_args(argv[1:]) hparams = create_hparams() if is_chief(): save_metadata(hparams) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 02a5df2f3..147757b47 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -21,7 +21,6 @@ # Dependency imports -import six from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.utils import registry @@ -224,10 +223,15 @@ class RangedHParams(object): LOG_SCALE = 2 REVERSE_LOG_SCALE = 3 + SCALES_STR = { + LINEAR_SCALE: "UNIT_LINEAR_SCALE", + LOG_SCALE: "UNIT_LOG_SCALE", + REVERSE_LOG_SCALE: "UNIT_REVERSE_LOG_SCALE", + } + def __init__(self): self._categorical_params = {} self._discrete_params = {} - self._discrete_float_params = {} self._float_params = {} self._int_params = {} @@ -237,10 +241,12 @@ def _check_reset_and_type_change(self, name, orig_ctr): if name in orig_ctr: tf.logging.warning("Overwriting hparam %s", name) - ctr_names = [(self._categorical_params, - "categorical"), (self._discrete_params, "discrete"), - (self._float_params, "float"), (self._int_params, "int"), - (self._discrete_float_params, "discrete_float")] + ctr_names = [ + (self._categorical_params, "categorical"), + (self._discrete_params, "discrete"), + (self._float_params, "float"), + (self._int_params, "int"), + ] ctrs, names = list(zip(*ctr_names)) orig_name = names[ctrs.index(orig_ctr)] @@ -263,23 +269,17 @@ def set_discrete(self, name, feasible_points, scale=None, length=None): self._discrete_params[name] = (name, feasible_points, scale, length) def set_float(self, name, min_val, max_val, scale=None, length=None): - if name in self._discrete_float_params: - del self._discrete_float_params[name] self._check_reset_and_type_change(name, self._float_params) self._float_params[name] = (name, min_val, max_val, scale, length) - def set_discrete_float(self, name, val): - self._check_reset_and_type_change(name, self._discrete_float_params) - self._discrete_float_params[name] = (name, [val]) - def set_int(self, name, min_val, max_val, scale=None, length=None): self._check_reset_and_type_change(name, self._int_params) self._int_params[name] = (name, min_val, max_val, scale, length) def fix_select_params(self, hp): ctrs = [ - 
self._categorical_params, self._discrete_params, - self._discrete_float_params, self._float_params, self._int_params + self._categorical_params, self._discrete_params, self._float_params, + self._int_params ] for key, val in hp.values().iteritems(): for ctr in ctrs: @@ -287,52 +287,56 @@ def fix_select_params(self, hp): del ctr[key] self.set_discrete(key, [val]) + def to_parameter_specs(self, name_prefix=""): + """To list of dicts suitable for Cloud ML Engine hyperparameter tuning.""" + specs = [] + for name, categories, _ in self._categorical_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "CATEGORICAL", + "categoricalValues": categories, + } + specs.append(spec) -def fill_ranged_hparams_from_hparams(hparams, ranged_hparams): - """Fill ranged_hparams with singleton values from hparams. + for name, feasible_points, scale, _ in self._discrete_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DISCRETE", + "discreteValues": feasible_points, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - HParams are placed in RangedHParams with the following functions, according to - type: - * int: set_discrete - * bool: set_discrete - * float: set_discrete_float - * str: set_categorical + for name, min_val, max_val, scale, _ in self._float_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "DOUBLE", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Args: - hparams: tf.contrib.training.HParams; contains the hyperparameters to copy - over to ranged_hparams. - ranged_hparams: RangedHParams; will have hparams values copied to it. + for name, min_val, max_val, scale, _ in self._int_params.values(): + spec = { + "parameterName": name_prefix + name, + "type": "INTEGER", + "minValue": min_val, + "maxValue": max_val, + } + if scale: + spec["scaleType"] = self.SCALES_STR[scale] + specs.append(spec) - Raises: - ValueError: if hparams contains a hyperparameter not of type - {int, float, str, bool}. - """ - for name, (hp_type, is_multivalent) in six.iteritems(hparams._hparam_types): # pylint: disable=protected-access - - if is_multivalent: - raise ValueError("Multivalent hparams not supported in RangedHParams. " - "Hyperparameter %s is multivalent." 
% name) - val = getattr(hparams, name) - if hp_type == int: - ranged_hparams.set_discrete(name, [val]) - elif hp_type == bool: - ranged_hparams.set_discrete(name, [int(val)]) - elif hp_type == float: - ranged_hparams.set_discrete_float(name, val) - elif hp_type == str: - ranged_hparams.set_categorical(name, [val]) - else: - raise ValueError("Unsupported type %s for param %s" % (hp_type, name)) + return specs @registry.register_ranged_hparams("basic1") def basic_range1(ranged_hparams): """A basic range of hyperparameters.""" rhp = ranged_hparams - - hparams = basic_params1() - fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [1024, 2048, 4096]) rhp.set_discrete("num_hidden_layers", [1, 2, 3, 4, 5, 6]) rhp.set_discrete("hidden_size", [32, 64, 128, 256, 512], scale=rhp.LOG_SCALE) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index e77412513..43cfa571e 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -381,10 +381,6 @@ def slicenet_params1_tiny(): def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams - - hparams = slicenet_params1() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 305a17379..df4b32277 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1228,11 +1228,9 @@ def transformer_prepend(): return transformer_prepend_v2() -@registry.register_ranged_hparams("transformer_base") +@registry.register_ranged_hparams def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_base() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1337,8 +1335,6 @@ def transformer_tiny_tpu(): @registry.register_ranged_hparams def transformer_tiny_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tiny_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_float("weight_decay", 0.0, 2.0) @@ -1346,8 +1342,6 @@ def transformer_tiny_tpu_range(rhp): @registry.register_ranged_hparams def transformer_tpu_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) # After starting from base, set intervals for some parameters. rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) rhp.set_discrete("learning_rate_warmup_steps", @@ -1358,13 +1352,6 @@ def transformer_tpu_range(rhp): rhp.set_float("weight_decay", 0.0, 2.0) -@registry.register_ranged_hparams -def transformer_tpu_batch_range(rhp): - hparams = transformer_tpu() - common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - rhp.set_discrete("batch_size", [256, 512, 768, 1024]) - - @registry.register_hparams def transformer_small_tpu(): """TPU-friendly version of transformer_small. 
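
With `to_parameter_specs` restored in common_hparams.py above, a RangedHParams set translates directly into the Cloud ML Engine study format; cloud_mlengine.py (re-added below) calls it through `autotune_paramspecs` with the `hp_` prefix. A small sketch of the translation, with illustrative values:

```python
from tensor2tensor.layers import common_hparams

rhp = common_hparams.RangedHParams()
rhp.set_discrete("batch_size", [1024, 2048, 4096])
rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)

specs = rhp.to_parameter_specs(name_prefix="hp_")
# specs now holds (in some order):
# {"parameterName": "hp_batch_size", "type": "DISCRETE",
#  "discreteValues": [1024, 2048, 4096]}
# {"parameterName": "hp_learning_rate", "type": "DOUBLE",
#  "minValue": 0.3, "maxValue": 3.0, "scaleType": "UNIT_LOG_SCALE"}
```
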
diff --git a/tensor2tensor/models/transformer_sketch.py b/tensor2tensor/models/transformer_sketch.py
index 913243f00..e8fe796a8 100644
--- a/tensor2tensor/models/transformer_sketch.py
+++ b/tensor2tensor/models/transformer_sketch.py
@@ -22,7 +22,6 @@
 
 # Dependency imports
 
-from tensor2tensor.layers import common_hparams
 from tensor2tensor.layers import common_layers
 from tensor2tensor.models import transformer
 from tensor2tensor.models import transformer_vae
@@ -125,10 +124,6 @@ def transformer_sketch_6layer():
 @registry.register_ranged_hparams("transformer_sketch_ranged")
 def transformer_sketch_ranged(rhp):
   """Range of hparams for vizier."""
-
-  hparams = transformer_sketch()
-  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)
-
   rhp.set_categorical("ffn_layer",
                       ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"])
   rhp.set_discrete("batch_size", [1024, 2048, 4096])
diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py
index 0bb5ddc40..82ac23a39 100644
--- a/tensor2tensor/utils/cloud_mlengine.py
+++ b/tensor2tensor/utils/cloud_mlengine.py
@@ -22,15 +22,17 @@
 from googleapiclient import discovery
 from oauth2client.client import GoogleCredentials
 
+from tensor2tensor.layers import common_hparams
 from tensor2tensor.utils import cloud_tpu as cloud
+from tensor2tensor.utils import registry
 from tensor2tensor.utils import usr_dir as usr_dir_lib
 
 import tensorflow as tf
 
+FLAGS = tf.flags.FLAGS
+
 CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/'
 
 # TODO(rsepassi):
-# * Support --autotune
-# * Add documentation clould_mlengine.md
 # * Enable multi-machine sync/async training
 
 SETUP_PY = """
@@ -44,9 +46,9 @@
 """
 
 
-def args_dict_as_args(args_dict):
-  """Convert dict to list of args suitable for passing on cmd line."""
-  args_dict = dict(args_dict)
+def flags_as_args():
+  """Convert FLAGS to list of args suitable for passing on cmd line."""
+  args_dict = dict(FLAGS.__dict__['__flags'])
   del args_dict['cloud_mlengine']
   # Configured later
   del args_dict['t2t_usr_dir']
@@ -54,6 +56,8 @@
   for name, val in args_dict.items():
     if val is None:
       continue
+    if name.startswith('autotune'):
+      continue
     args.extend(['--%s' % name, str(val)])
   return args
 
@@ -83,9 +87,9 @@ def machine_config(num_gpus=1, use_tpu=False, master_type=None):
   return config
 
 
-def configure_job(flags_dict):
+def configure_job():
   """Construct jobSpec for ML Engine job."""
-  train_dir = flags_dict['output_dir']
+  train_dir = FLAGS.output_dir
   assert train_dir.startswith('gs://')
   job_name = os.path.basename(train_dir)
 
@@ -93,16 +97,27 @@
   # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
   training_input = {
       'pythonModule': 'tensor2tensor.bin.t2t_trainer',
-      'args': args_dict_as_args(flags_dict),
+      'args': flags_as_args(),
       'region': cloud.default_region(),
       'runtimeVersion': '1.4',
      'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7',
+      'jobDir': train_dir,
   }
   training_input.update(
       machine_config(
-          num_gpus=flags_dict['worker_gpu'],
-          use_tpu=flags_dict['use_tpu'],
-          master_type=flags_dict['cloud_mlengine_master_type']))
+          num_gpus=FLAGS.worker_gpu,
+          use_tpu=FLAGS.use_tpu,
+          master_type=FLAGS.cloud_mlengine_master_type))
+  if FLAGS.hparams_range:
+    assert FLAGS.autotune_objective
+    tf.logging.info('Configuring hyperparameter tuning.')
+    training_input['hyperparameters'] = configure_autotune(
+        FLAGS.hparams_range,
+        FLAGS.autotune_objective,
+        FLAGS.autotune_maximize,
+        FLAGS.autotune_max_trials,
+        FLAGS.autotune_parallel_trials,
+    )
 
   if training_input['scaleTier'] == 'CUSTOM':
     assert 'masterType' in training_input
@@ -170,6 +185,26 @@ def tar_and_copy_usr_dir(usr_dir, train_dir):
   return usr_tar
 
 
+def autotune_paramspecs(hparams_range):
+  rhp = common_hparams.RangedHParams()
+  registry.ranged_hparams(hparams_range)(rhp)
+  return rhp.to_parameter_specs(name_prefix='hp_')
+
+
+def configure_autotune(hparams_range,
+                       objective,
+                       maximize=True,
+                       max_trials=10,
+                       parallel_trials=1):
+  return {
+      'goal': 'MAXIMIZE' if maximize else 'MINIMIZE',
+      'params': autotune_paramspecs(hparams_range),
+      'maxTrials': max_trials,
+      'maxParallelTrials': parallel_trials,
+      'hyperparameterMetricTag': objective,
+  }
+
+
 def configure_trainer_package(job_spec, t2t_tar):
   assert t2t_tar.startswith('gs://')
   job_spec['trainingInput']['packageUris'] = [t2t_tar]
@@ -182,18 +217,25 @@ def configure_usr_dir(job_spec, usr_tar):
   job_spec['trainingInput']['args'].extend(usr_args)
 
 
-def launch(flags_dict):
+def launch():
   """Launch t2t_trainer on Cloud ML Engine."""
-  job_spec = configure_job(flags_dict)
+  assert not FLAGS.cloud_tpu
+  assert not FLAGS.job_dir
+  assert FLAGS.output_dir.startswith('gs://')
+  assert FLAGS.data_dir.startswith('gs://')
+  assert FLAGS.worker_replicas <= 1
+  assert FLAGS.ps_replicas <= 0
+
+  job_spec = configure_job()
   job_name = job_spec['jobId']
   tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name,
                   job_spec)
   assert cloud.confirm()
-  train_dir = flags_dict['output_dir']
+  train_dir = FLAGS.output_dir
   t2t_tar = tar_and_copy_t2t(train_dir)
   configure_trainer_package(job_spec, t2t_tar)
-  if flags_dict['t2t_usr_dir']:
-    usr_tar = tar_and_copy_usr_dir(flags_dict['t2t_usr_dir'], train_dir)
+  if FLAGS.t2t_usr_dir:
+    usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir)
     configure_usr_dir(job_spec, usr_tar)
   launch_job(job_spec)
   tf.logging.info('Launched %s. See console to track: %s.', job_name,
diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py
index 039b06e68..0e64f2475 100644
--- a/tensor2tensor/utils/trainer_lib.py
+++ b/tensor2tensor/utils/trainer_lib.py
@@ -68,8 +68,12 @@ def create_hparams(hparams_set,
                    hparams_overrides_str="",
                    data_dir=None,
                    problem_name=None):
+  """Create HParams with data_dir and problem hparams, if kwargs provided."""
   hparams = registry.hparams(hparams_set)()
   if hparams_overrides_str:
+    tf.logging.info("Overriding hparams in %s with %s",
+                    hparams_set,
+                    hparams_overrides_str)
     hparams = hparams.parse(hparams_overrides_str)
   if data_dir:
     hparams.add_hparam("data_dir", data_dir)

From 5e8bc75510b2a98b3130f0b5965176bef860409d Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 8 Feb 2018 11:41:48 -0800
Subject: [PATCH 29/31] Shake-shake config for TPU.

PiperOrigin-RevId: 185026739
---
 tensor2tensor/models/shake_shake.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py
index 5e1680edb..31a576338 100644
--- a/tensor2tensor/models/shake_shake.py
+++ b/tensor2tensor/models/shake_shake.py
@@ -191,3 +191,11 @@ def shakeshake_big():
   hparams.layer_prepostprocess_dropout = 0.0
   hparams.hidden_size = 96
   return hparams
+
+
+@registry.register_hparams
+def shakeshake_tpu():
+  hparams = shakeshake_big()
+  hparams.learning_rate_cosine_cycle_steps = 180000
+  hparams.learning_rate = 0.6
+  return hparams

From 77781497f1ca51b0a067cd4394e44e347a11e547 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Thu, 8 Feb 2018 13:30:45 -0800
Subject: [PATCH 30/31] Revert "noam" learning-rate-scheme to use linear warmup.

Add learning_rate_schedule hparam to specify a schedule that does not have
separate warmup and decay phases.

PiperOrigin-RevId: 185042750
---
 tensor2tensor/layers/common_hparams.py |  5 +++++
 tensor2tensor/models/transformer.py    |  2 +-
 tensor2tensor/utils/optimize.py        | 21 +++++++++++++++++----
 tensor2tensor/utils/t2t_model.py       |  2 +-
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
index 147757b47..8fbd88bd2 100644
--- a/tensor2tensor/layers/common_hparams.py
+++ b/tensor2tensor/layers/common_hparams.py
@@ -63,6 +63,11 @@ def basic_params1():
       optimizer_momentum_nesterov=False,
       weight_decay=1e-6,
       weight_noise=0.0,
+      learning_rate_schedule="warmup_and_decay",
+      # If learning_rate_schedule=="warmup_and_decay", then this specifies
+      # the decay part of the schedule.
+      # The warmup is always exponential.
+      # TODO(noam): add a hyperparameter to control the warmup.
       learning_rate_decay_scheme="none",
       # decay_steps and decay_staircase for learning_rate_decay_scheme=="exp"
       learning_rate_decay_steps=5000,
diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index df4b32277..061b68ab7 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -877,7 +877,7 @@ def transformer_base_v1():
   hparams.max_length = 256
   hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
   hparams.optimizer_adam_epsilon = 1e-9
-  hparams.learning_rate_decay_scheme = "noam"
+  hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay"
   hparams.learning_rate = 0.1
   hparams.learning_rate_warmup_steps = 4000
   hparams.initializer_gain = 1.0
diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py
index a497d56bd..bc09c009d 100644
--- a/tensor2tensor/utils/optimize.py
+++ b/tensor2tensor/utils/optimize.py
@@ -168,10 +168,6 @@ def learning_rate_decay(hparams, warmup_steps=0):
         hparams.learning_rate_boundaries,
         hparams.learning_rate_multiples)
 
-  if scheme == "noam":
-    return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum(
-        (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5)
-
   if scheme == "cosine":
     cycle_steps = hparams.learning_rate_cosine_cycle_steps
     cycle_position = global_step % (2 * cycle_steps)
@@ -224,6 +220,23 @@ def learning_rate_decay_with_warmup(hparams, num_worker_replicas=1):
   return tf.where(global_step < warmup_steps, warmup, decay)
 
 
+def learning_rate_schedule(hparams, num_worker_replicas=1):
+  """Learning rate schedule based on hparams."""
+  schedule = hparams.learning_rate_schedule
+  warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps)
+  global_step = tf.to_float(tf.train.get_or_create_global_step())
+  if hparams.learning_rate_decay_scheme == "noam":
+    # backwards compatibility with previous behavior
+    schedule = "linear_warmup_rsqrt_decay"
+  if schedule == "warmup_and_decay":
+    return learning_rate_decay_with_warmup(hparams, num_worker_replicas)
+  elif schedule == "linear_warmup_rsqrt_decay":
+    return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum(
+        (global_step + 1) * warmup_steps**-1.5, (global_step + 1)**-0.5)
+  else:
+    raise ValueError("Unrecognized learning rate schedule: %s" % schedule)
+
+
 def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None):
   """Apply weight decay and weight noise."""
   if var_list is None:
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 225c4d19b..aadf5e358 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -296,7 +296,7 @@ def optimize(self, loss, num_async_replicas=1):
     """Return a training op minimizing loss."""
     tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
     lr = self.hparams.learning_rate
-    decay_rate = optimize.learning_rate_decay_with_warmup(self.hparams)
+    decay_rate = optimize.learning_rate_schedule(self.hparams)
     lr *= decay_rate
     if self.hparams.learning_rate_minimum:
       lr_min = float(self.hparams.learning_rate_minimum)

From 290a12a8cd2c9ba94456d8112f453b473b5a0857 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 8 Feb 2018 13:57:00 -0800
Subject: [PATCH 31/31] v1.4.4

PiperOrigin-RevId: 185046879
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 567d7a392..1153dbba8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.4.3',
+    version='1.4.4',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',
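Note (illustrative, not part of the patch): the "linear_warmup_rsqrt_decay"
schedule added to optimize.py above can be sketched in plain Python without
TensorFlow. The constants below follow transformer_base_v1 (hidden_size=512,
learning_rate_warmup_steps=4000).

    # Sketch of linear_warmup_rsqrt_decay: a linear ramp during warmup, then
    # reciprocal-sqrt decay; the two branches meet at step + 1 == warmup_steps.
    def linear_warmup_rsqrt_decay(step, hidden_size=512, warmup_steps=4000.0):
        return 5000.0 * hidden_size ** -0.5 * min(
            (step + 1) * warmup_steps ** -1.5, (step + 1) ** -0.5)

    # The returned value is a multiplier, not the final learning rate:
    # t2t_model.optimize scales it by hparams.learning_rate (0.1 in
    # transformer_base_v1). For these constants it peaks at
    # 5000 / sqrt(512 * 4000) ~= 3.49 around step 4000, then decays
    # proportionally to 1 / sqrt(step).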