From 0bdfcbb57fb0f22e44d3f852889a94716009fffc Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 18:49:03 -0700 Subject: [PATCH 1/6] Use get_residual_fn to get the residual_fn in the transformer. PiperOrigin-RevId: 163919630 --- README.md | 2 +- .../generator.py} | 1 - tensor2tensor/models/transformer.py | 18 ++++++++++++------ tensor2tensor/{bin/t2t-trainer => trainer.py} | 1 - 4 files changed, 13 insertions(+), 9 deletions(-) rename tensor2tensor/{bin/t2t-datagen => data_generators/generator.py} (99%) rename tensor2tensor/{bin/t2t-trainer => trainer.py} (99%) diff --git a/README.md b/README.md index bb0f6f534..5bb1c31a3 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ python -c "from tensor2tensor.models.transformer import Transformer" **Datasets** are all standardized on `TFRecord` files with `tensorflow.Example` protocol buffers. All datasets are registered and generated with the [data -generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-datagen) +generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/generator.py) and many common sequence datasets are already available for generation and use. ### Problems and Modalities diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/data_generators/generator.py similarity index 99% rename from tensor2tensor/bin/t2t-datagen rename to tensor2tensor/data_generators/generator.py index 837d6d203..bc79f2384 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/data_generators/generator.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 2320a57f1..a2b55febf 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -56,12 +56,7 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) - def residual_fn(x, y): - return common_layers.residual_fn(x, y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) + residual_fn = get_residual_fn(hparams) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) @@ -76,6 +71,17 @@ def residual_fn(x, y): return decoder_output +def get_residual_fn(hparams): + """Get residual_fn.""" + def residual_fn(x, y): + return common_layers.residual_fn(x, y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) + return residual_fn + + def transformer_prepare_encoder(inputs, target_space, hparams): """Prepare one shard of the model for the encoder. diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/trainer.py similarity index 99% rename from tensor2tensor/bin/t2t-trainer rename to tensor2tensor/trainer.py index 13dd7d355..41c9cd33b 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/trainer.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# From 4390618e692f790871019aadc0371efcd76a89f4 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 10:49:20 -0700 Subject: [PATCH 2/6] Add requests dependency PiperOrigin-RevId: 164005758 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 6f509d03e..fd8e77a46 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ ], install_requires=[ 'numpy', + 'requests', 'sympy', 'six', ], From 9394d0e3f2ecc0f7fa14d59dec17b0da3cff9a21 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 2 Aug 2017 11:05:05 -0700 Subject: [PATCH 3/6] Use ModeKeys enum consistently in trainer_utils instead of string literals. PiperOrigin-RevId: 164008619 --- tensor2tensor/utils/trainer_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 33053806d..5c0240e16 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -181,8 +181,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns["train"], - eval_input_fn=input_fns["eval"], + train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, @@ -220,7 +220,9 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): keep_checkpoint_max=FLAGS.keep_checkpoint_max)) # Store the hparams in the estimator as well estimator.hparams = hparams - return estimator, {"train": train_input_fn, "eval": eval_input_fn} + return estimator, { + tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn} def log_registry(): From f6799b9515e0e214d2d4295f4e4cf94cf27cf333 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 16:45:57 -0700 Subject: [PATCH 4/6] File/code moves PiperOrigin-RevId: 164058229 --- README.md | 2 +- .../generator.py => bin/t2t-datagen} | 26 +- tensor2tensor/{trainer.py => bin/t2t-trainer} | 1 + tensor2tensor/data_generators/all_problems.py | 1 + tensor2tensor/data_generators/image.py | 45 +- .../data_generators/problem_hparams.py | 88 +- .../{models => layers}/common_attention.py | 131 +- .../common_attention_test.py | 32 +- .../{models => layers}/common_hparams.py | 10 +- .../{models => layers}/common_layers.py | 15 +- .../{models => layers}/common_layers_test.py | 9 +- .../{models => layers}/modalities.py | 2 +- .../{models => layers}/modalities_test.py | 2 +- tensor2tensor/models/attention_lm.py | 10 +- tensor2tensor/models/attention_lm_moe.py | 43 +- tensor2tensor/models/bluenet.py | 128 +- tensor2tensor/models/bytenet.py | 13 +- tensor2tensor/models/gene_expression.py | 4 +- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/long_answer.py | 53 +- tensor2tensor/models/lstm.py | 48 +- tensor2tensor/models/lstm_test.py | 6 +- tensor2tensor/models/models.py | 2 +- tensor2tensor/models/multimodel.py | 99 +- tensor2tensor/models/neural_gpu.py | 5 +- tensor2tensor/models/neural_gpu_test.py | 6 +- tensor2tensor/models/shake_shake.py | 4 +- tensor2tensor/models/slicenet.py | 19 +- tensor2tensor/models/slicenet_test.py | 6 +- tensor2tensor/models/transformer.py | 37 +- .../models/transformer_alternative.py | 39 +- tensor2tensor/models/xception.py | 4 +- tensor2tensor/utils/decoding.py | 371 ++++++ tensor2tensor/utils/devices.py | 147 
+++ tensor2tensor/utils/input_fn_builder.py | 200 +++ tensor2tensor/utils/metrics.py | 2 +- tensor2tensor/utils/modality.py | 2 +- tensor2tensor/utils/model_builder.py | 451 +++++++ tensor2tensor/utils/registry.py | 6 +- tensor2tensor/utils/trainer_utils.py | 1085 +---------------- tensor2tensor/utils/trainer_utils_test.py | 4 +- 41 files changed, 1648 insertions(+), 1512 deletions(-) rename tensor2tensor/{data_generators/generator.py => bin/t2t-datagen} (93%) rename tensor2tensor/{trainer.py => bin/t2t-trainer} (99%) rename tensor2tensor/{models => layers}/common_attention.py (89%) rename tensor2tensor/{models => layers}/common_attention_test.py (77%) rename tensor2tensor/{models => layers}/common_hparams.py (97%) rename tensor2tensor/{models => layers}/common_layers.py (99%) rename tensor2tensor/{models => layers}/common_layers_test.py (98%) rename tensor2tensor/{models => layers}/modalities.py (99%) rename tensor2tensor/{models => layers}/modalities_test.py (98%) create mode 100644 tensor2tensor/utils/decoding.py create mode 100644 tensor2tensor/utils/devices.py create mode 100644 tensor2tensor/utils/input_fn_builder.py create mode 100644 tensor2tensor/utils/model_builder.py diff --git a/README.md b/README.md index 5bb1c31a3..bb0f6f534 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ python -c "from tensor2tensor.models.transformer import Transformer" **Datasets** are all standardized on `TFRecord` files with `tensorflow.Example` protocol buffers. All datasets are registered and generated with the [data -generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/generator.py) +generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-datagen) and many common sequence datasets are already available for generation and use. ### Problems and Modalities diff --git a/tensor2tensor/data_generators/generator.py b/tensor2tensor/bin/t2t-datagen similarity index 93% rename from tensor2tensor/data_generators/generator.py rename to tensor2tensor/bin/t2t-datagen index bc79f2384..39453dbee 100644 --- a/tensor2tensor/data_generators/generator.py +++ b/tensor2tensor/bin/t2t-datagen @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # @@ -15,14 +16,15 @@ """Produces the training and dev data for --problem into --data_dir. -generator.py produces sharded and shuffled TFRecord files of tensorflow.Example -protocol buffers for a variety of datasets registered in this file. +Produces sharded and shuffled TFRecord files of tensorflow.Example protocol +buffers for a variety of registered datasets. -All datasets are registered in _SUPPORTED_PROBLEM_GENERATORS. Each entry maps a -string name (selectable on the command-line with --problem) to a function that -takes 2 arguments - input_directory and mode (one of "train" or "dev") - and -yields for each training example a dictionary mapping string feature names to -lists of {string, int, float}. The generator will be run once for each mode. +All Problems are registered with @registry.register_problem or are in +_SUPPORTED_PROBLEM_GENERATORS in this file. Each entry maps a string name +(selectable on the command-line with --problem) to a function that takes 2 +arguments - input_directory and mode (one of "train" or "dev") - and yields for +each training example a dictionary mapping string feature names to lists of +{string, int, float}. The generator will be run once for each mode. 
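As an illustrative sketch of the generator contract described above (a hypothetical `my_copy_problem`, not taken from this patch): a registered generator takes an input directory and a mode ("train" or "dev") and yields, per example, a dict mapping feature names to lists of ints, floats, or strings.

def my_copy_problem(input_directory, mode):
  """Sketch only: yields one dict per example, feature name -> list of ints."""
  assert mode in ("train", "dev")
  del input_directory  # A real generator would read raw data files from here.
  for i in range(3):
    tokens = [i + 1, i + 2, i + 3]
    # Each yielded dict is later serialized into a tensorflow.Example proto and
    # written to sharded, shuffled TFRecord files (see generate_files below).
    yield {"inputs": tokens, "targets": tokens}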
""" from __future__ import absolute_import from __future__ import division @@ -228,8 +230,7 @@ def generate_data_for_problem(problem): num_shards = FLAGS.num_shards or 10 tf.logging.info("Generating training data for %s.", problem) train_output_files = generator_utils.train_data_filenames( - problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, - num_shards) + problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards) generator_utils.generate_files(training_gen(), train_output_files, FLAGS.max_cases) tf.logging.info("Generating development data for %s.", problem) @@ -249,9 +250,10 @@ def generate_data_for_registered_problem(problem_name): raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id - problem.generate_data(os.path.expanduser(FLAGS.data_dir), - os.path.expanduser(FLAGS.tmp_dir), - task_id=task_id) + problem.generate_data( + os.path.expanduser(FLAGS.data_dir), + os.path.expanduser(FLAGS.tmp_dir), + task_id=task_id) if __name__ == "__main__": diff --git a/tensor2tensor/trainer.py b/tensor2tensor/bin/t2t-trainer similarity index 99% rename from tensor2tensor/trainer.py rename to tensor2tensor/bin/t2t-trainer index 41c9cd33b..13dd7d355 100644 --- a/tensor2tensor/trainer.py +++ b/tensor2tensor/bin/t2t-trainer @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 6830cf0bf..9be133a61 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -30,6 +30,7 @@ from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import wsj_parsing + # Problem modules that require optional dependencies # pylint: disable=g-import-not-at-top try: diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index a2e328f00..d70d9339e 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -36,7 +36,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry import tensorflow as tf @@ -76,10 +76,11 @@ class ImageFSNS(ImageProblem): def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") - fsns_urls = generator_utils.maybe_download( - tmp_dir, "fsns_urls.txt", list_url) - fsns_files = [f.strip() for f in open(fsns_urls, "r") - if f.startswith("http://")] + fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt", + list_url) + fsns_files = [ + f.strip() for f in open(fsns_urls, "r") if f.startswith("http://") + ] for url in fsns_files: if "/train/train" in url: generator_utils.maybe_download( @@ -88,8 +89,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.maybe_download( data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) elif "charset" in url: - generator_utils.maybe_download( - data_dir, "charset_size134.txt", url) + generator_utils.maybe_download(data_dir, "charset_size134.txt", url) def feature_encoders(self, data_dir): # This vocab file must be present within the data 
directory. @@ -111,8 +111,8 @@ def hparams(self, defaults, model_hparams): def example_reading_spec(self): label_key = "image/unpadded_label" - return super(ImageFSNS, self).example_reading_spec(self, - label_key=label_key) + return super(ImageFSNS, self).example_reading_spec( + self, label_key=label_key) class Image2ClassProblem(ImageProblem): @@ -161,6 +161,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): def imagenet_preprocess_examples(examples, mode): """Preprocessing used for Imagenet and similar problems.""" + def preprocess(img): img = tf.image.resize_images(img, [360, 360]) img = common_layers.image_augmentation(tf.to_float(img) / 255.) @@ -215,8 +216,8 @@ def is_small(self): def preprocess_examples(self, examples, mode): examples = imagenet_preprocess_examples(examples, mode) - examples["inputs"] = tf.to_int64(tf.image.resize_images( - examples["inputs"], [32, 32])) + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32])) def image_generator(images, labels): @@ -665,12 +666,20 @@ def generator(self, data_dir, tmp_dir, is_training): vocab_filename = "vocab.endefr.%d" % self.targeted_vocab_size if is_training: return mscoco_generator( - data_dir, tmp_dir, True, 80000, - vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + data_dir, + tmp_dir, + True, + 80000, + vocab_filename=vocab_filename, + vocab_size=self.targeted_vocab_size) else: return mscoco_generator( - data_dir, tmp_dir, False, 40000, - vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + data_dir, + tmp_dir, + False, + 40000, + vocab_filename=vocab_filename, + vocab_size=self.targeted_vocab_size) @registry.register_problem @@ -690,8 +699,8 @@ def targeted_vocab_size(self): def _get_celeba(directory): """Download and extract CELEBA to directory unless it is there.""" # path = os.path.join(directory, _CELEBA_NAME) - path = generator_utils.maybe_download_from_drive(directory, - _CELEBA_NAME, _CELEBA_URL) + path = generator_utils.maybe_download_from_drive(directory, _CELEBA_NAME, + _CELEBA_URL) if not tf.gfile.Exists(path): zipfile.ZipFile(path + ".zip", "r").extractall(directory) @@ -711,7 +720,7 @@ def celeba_generator(tmp_dir, how_many, start_from=0): """ _get_celeba(tmp_dir) image_files = tf.gfile.Glob(os.path.join(tmp_dir, _CELEBA_NAME) + "/*.jpg") - for filename in image_files[start_from:start_from+how_many]: + for filename in image_files[start_from:start_from + how_many]: with tf.gfile.Open(filename, "r") as f: encoded_image_data = f.read() yield { diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 607078d2f..d0577db52 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -25,7 +25,7 @@ # Dependency imports from tensor2tensor.data_generators import text_encoder -from tensor2tensor.models import modalities # pylint: disable=unused-import +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.utils import registry import tensorflow as tf @@ -202,8 +202,7 @@ def default_problem_hparams(): # the targets. For instance `problem_copy` will copy the inputs, but # `problem_rev_copy` will copy the targets. 
was_reversed=False, - was_copy=False, - ) + was_copy=False,) def test_problem_hparams(unused_model_hparams, input_vocab_size, @@ -327,9 +326,7 @@ def lm1b_32k(model_hparams): encoder = text_encoder.SubwordTextEncoder( os.path.join(model_hparams.data_dir, "lm1b_32k.subword_text_encoder")) p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) - p.vocabulary = { - "targets": encoder - } + p.vocabulary = {"targets": encoder} p.target_space_id = 3 return p @@ -343,9 +340,7 @@ def lm1b_characters(unused_model_hparams): p.input_modality = {} encoder = text_encoder.ByteTextEncoder() p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) - p.vocabulary = { - "targets": encoder - } + p.vocabulary = {"targets": encoder} p.target_space_id = 2 return p @@ -358,10 +353,7 @@ def wiki_32k(model_hparams): modality_spec = (registry.Modalities.SYMBOL, encoder.vocab_size) p.input_modality = {"inputs": modality_spec} p.target_modality = modality_spec - p.vocabulary = { - "inputs": encoder, - "targets": encoder - } + p.vocabulary = {"inputs": encoder, "targets": encoder} p.target_space_id = 3 return p @@ -430,9 +422,7 @@ def wmt_parsing_tokens(model_hparams, wrong_vocab_size): return p -def wsj_parsing_tokens(model_hparams, - prefix, - wrong_source_vocab_size, +def wsj_parsing_tokens(model_hparams, prefix, wrong_source_vocab_size, wrong_target_vocab_size): """English to parse tree translation benchmark. @@ -487,11 +477,9 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): p = default_problem_hparams() # This vocab file must be present within the data directory. source_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_source.vocab.%d" % wrong_source_vocab_size) - target_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_target.vocab.256") + model_hparams.data_dir, "ice_source.vocab.%d" % wrong_source_vocab_size) + target_vocab_filename = os.path.join(model_hparams.data_dir, + "ice_target.vocab.256") source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) p.input_modality = { @@ -502,7 +490,7 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): "inputs": source_subtokenizer, "targets": target_subtokenizer, } - p.input_space_id = 18 # Icelandic tokens + p.input_space_id = 18 # Icelandic tokens p.target_space_id = 19 # Icelandic parse tokens return p @@ -534,23 +522,41 @@ def image_celeba(unused_model_hparams): # Dictionary of named hyperparameter settings for various problems. # This is only accessed through the problem_hparams function below. 
PROBLEM_HPARAMS_MAP = { - "audio_timit_characters_tune": audio_timit_characters, - "audio_timit_characters_test": audio_timit_characters, - "audio_timit_tokens_8k_tune": lambda p: audio_timit_tokens(p, 2**13), - "audio_timit_tokens_8k_test": lambda p: audio_timit_tokens(p, 2**13), - "audio_wsj_characters_tune": audio_wsj_characters, - "audio_wsj_characters_test": audio_wsj_characters, - "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), - "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), - "lm1b_characters": lm1b_characters, - "lm1b_32k": lm1b_32k, - "wiki_32k": wiki_32k, - "ice_parsing_characters": wmt_parsing_characters, - "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), - "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), - "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda - p, "wsj", 2**14, 2**9), - "wmt_ende_bpe32k": wmt_ende_bpe32k, - "image_celeba_tune": image_celeba, - "img2img_imagenet": img2img_imagenet, + "audio_timit_characters_tune": + audio_timit_characters, + "audio_timit_characters_test": + audio_timit_characters, + "audio_timit_tokens_8k_tune": + lambda p: audio_timit_tokens(p, 2**13), + "audio_timit_tokens_8k_test": + lambda p: audio_timit_tokens(p, 2**13), + "audio_wsj_characters_tune": + audio_wsj_characters, + "audio_wsj_characters_test": + audio_wsj_characters, + "audio_wsj_tokens_8k_tune": + lambda p: audio_wsj_tokens(p, 2**13), + "audio_wsj_tokens_8k_test": + lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_characters": + lm1b_characters, + "lm1b_32k": + lm1b_32k, + "wiki_32k": + wiki_32k, + "ice_parsing_characters": + wmt_parsing_characters, + "ice_parsing_tokens": + lambda p: ice_parsing_tokens(p, 2**13), + "wmt_parsing_tokens_8k": + lambda p: wmt_parsing_tokens(p, 2**13), + "wsj_parsing_tokens_16k": + lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda + p, "wsj", 2**14, 2**9), + "wmt_ende_bpe32k": + wmt_ende_bpe32k, + "image_celeba_tune": + image_celeba, + "img2img_imagenet": + img2img_imagenet, } diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/layers/common_attention.py similarity index 89% rename from tensor2tensor/models/common_attention.py rename to tensor2tensor/layers/common_attention.py index b52fb8aea..e343dba0a 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -22,7 +22,7 @@ # Dependency imports -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf @@ -157,9 +157,10 @@ def add_positional_embedding_nd(x, max_length, name): shape[i + 1] = max_length size[i + 1] = dynamic_shape[i + 1] var = (tf.get_variable( - name + "_%d" % i, shape, - initializer=tf.random_normal_initializer(0, depth ** -0.5)) - * (depth ** 0.5)) + name + "_%d" % i, + shape, + initializer=tf.random_normal_initializer(0, depth**-0.5)) * + (depth**0.5)) x += tf.slice(var, start, size) return x @@ -314,11 +315,13 @@ def attention_image_summary(attn, image_shapes=None): assert len(image_shapes) == 6 q_rows, q_cols, q_channnels, m_rows, m_cols, m_channels = list( image_shapes) - image = tf.reshape(image, [-1, q_rows, q_cols, q_channnels, - m_rows, m_cols, m_channels, 3]) + image = tf.reshape(image, [ + -1, q_rows, q_cols, q_channnels, m_rows, m_cols, m_channels, 3 + ]) image = tf.transpose(image, [0, 1, 4, 3, 2, 5, 6, 7]) - image = tf.reshape(image, [-1, q_rows * m_rows * q_channnels, - q_cols * m_cols * m_channels, 3]) + image = tf.reshape(image, [ + 
-1, q_rows * m_rows * q_channnels, q_cols * m_cols * m_channels, 3 + ]) tf.summary.image("attention", image, max_outputs=1) @@ -358,9 +361,13 @@ def dot_product_attention(q, return tf.matmul(weights, v) -def masked_local_attention_1d(q, k, v, - block_length=128, look_right=True, - use_whole_block=False, name=None): +def masked_local_attention_1d(q, + k, + v, + block_length=128, + look_right=True, + use_whole_block=False, + name=None): """Attention to the source position and a neigborhood around it. The sequence is divided into blocks of length block_size. Attention for a @@ -390,8 +397,8 @@ def masked_local_attention_1d(q, k, v, Returns: a Tensor of shape [batch, heads, length, depth_v] """ - with tf.variable_scope(name, default_name="local_attention_1d", - values=[q, k, v]): + with tf.variable_scope( + name, default_name="local_attention_1d", values=[q, k, v]): v_shape = v.get_shape() batch = tf.shape(q)[0] heads = tf.shape(q)[1] @@ -401,8 +408,7 @@ def masked_local_attention_1d(q, k, v, original_length = length # If (length < block_length), then we use only one block. - block_length = tf.where(tf.less(length, block_length), - length, block_length) + block_length = tf.where(tf.less(length, block_length), length, block_length) # Pad to desired length. padding_size = tf.mod(-length, block_length) length += padding_size @@ -417,24 +423,23 @@ def masked_local_attention_1d(q, k, v, # We shift everything over by half a block so query is in center. pad_right = block_length // 2 pad_left = block_length - pad_right - extra_padding = [[0, 0], [0, 0], - [pad_left, padding_size+pad_right], [0, 0]] + extra_padding = [[0, 0], [0, 0], [pad_left, padding_size + pad_right], + [0, 0]] k = tf.pad(k, extra_padding) v = tf.pad(v, extra_padding) # Reshape into blocks. q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k]) - k = tf.reshape(k, [batch, heads, num_blocks+1, block_length, depth_k]) - v = tf.reshape(v, [batch, heads, num_blocks+1, block_length, depth_v]) + k = tf.reshape(k, [batch, heads, num_blocks + 1, block_length, depth_k]) + v = tf.reshape(v, [batch, heads, num_blocks + 1, block_length, depth_v]) # Get local blocks by slicing. def local(x): """Create a local version of the keys or values.""" - prev_block = tf.slice( - x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1]) - cur_block = tf.slice( - x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) + prev_block = tf.slice(x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1]) + cur_block = tf.slice(x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) return tf.concat([prev_block, cur_block], 3) + local_k = local(k) local_v = local(v) local_length = tf.shape(local_k)[3] @@ -466,7 +471,11 @@ def local(x): return output -def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, +def unmasked_local_attention_1d(q, + k, + v, + block_length=128, + filter_width=100, name=None): """strided block local self-attention. 
@@ -481,19 +490,22 @@ def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, Returns: a Tensor of shape [batch, heads, length, depth_v] """ - with tf.variable_scope(name, default_name="local_self_attention_1d", - values=[q, k, v]): + with tf.variable_scope( + name, default_name="local_self_attention_1d", values=[q, k, v]): v_shape = v.get_shape() depth_v = tf.shape(v)[3] batch_size = tf.shape(q)[0] num_heads = tf.shape(q)[1] original_length = tf.shape(q)[2] + # making sure q is a multiple of d def pad_to_multiple(x, pad_length): x_length = tf.shape(x)[2] return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) k = pad_to_multiple(k, block_length) v = pad_to_multiple(v, block_length) @@ -501,16 +513,17 @@ def pad_l_and_r(x, pad_length): # Setting up q blocks new_q_shape = tf.shape(q) # Setting up q blocks - q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], - new_q_shape[2]//block_length, - block_length, new_q_shape[3]]) + q = tf.reshape(q, [ + new_q_shape[0], new_q_shape[1], new_q_shape[2] // block_length, + block_length, new_q_shape[3] + ]) # Setting up k and v values k = pad_l_and_r(k, filter_width) v = pad_l_and_r(v, filter_width) length = tf.shape(k)[2] - full_filter_width = block_length + 2*filter_width + full_filter_width = block_length + 2 * filter_width # getting gather indices indices = tf.range(0, length, delta=1, name="index_range") # making indices [1, length, 1] to appy convs @@ -541,7 +554,7 @@ def pad_l_and_r(x, pad_length): logits = tf.matmul(q, k_new, transpose_b=True) - attention = tf.nn.softmax(logits+attention_bias) + attention = tf.nn.softmax(logits + attention_bias) output = tf.matmul(attention, v_new) output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) @@ -626,14 +639,13 @@ def multihead_attention(query_antecedent, key_depth_per_head = total_key_depth // num_heads q *= key_depth_per_head**-0.5 if attention_type == "dot_product": - x = dot_product_attention( - q, k, v, bias, dropout_rate, image_shapes) + x = dot_product_attention(q, k, v, bias, dropout_rate, image_shapes) elif attention_type == "local_mask_right": x = masked_local_attention_1d(q, k, v, block_length=block_length) else: assert attention_type == "local_unmasked" - x = unmasked_local_attention_1d(q, k, v, block_length=block_length, - filter_width=block_width) + x = unmasked_local_attention_1d( + q, k, v, block_length=block_length, filter_width=block_width) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -669,29 +681,22 @@ def ffn_self_attention_layer(x, A Tensor. 
""" - with tf.variable_scope(name, default_name="feedforward_self_attention", - values=[x]): + with tf.variable_scope( + name, default_name="feedforward_self_attention", values=[x]): x_shape = tf.shape(x) part_depth = filter_depth // num_parts if not share_kv: combined = common_layers.conv1d( - x, - filter_depth * 3, - 1, - name="qkv_transform") + x, filter_depth * 3, 1, name="qkv_transform") combined = tf.expand_dims(combined, axis=2) q, k, v = tf.split(combined, 3, axis=3) else: - q = tf.expand_dims(common_layers.conv1d( - x, - filter_depth, - 1, - name="q_transform"), axis=2) - kv_combined = tf.expand_dims(common_layers.conv1d( - tf.concat([x, x], axis=1), - filter_depth, - 1, - name="kv_transform"), axis=2) + q = tf.expand_dims( + common_layers.conv1d(x, filter_depth, 1, name="q_transform"), axis=2) + kv_combined = tf.expand_dims( + common_layers.conv1d( + tf.concat([x, x], axis=1), filter_depth, 1, name="kv_transform"), + axis=2) k, v = tf.split(kv_combined, [x_shape[1], x_shape[1]], axis=1) batch_q = tf.reshape(q, [-1, 1, num_parts, part_depth]) @@ -701,8 +706,7 @@ def ffn_self_attention_layer(x, batch_q *= part_depth**-0.5 # non-masked bias bias = None - x = dot_product_attention( - batch_q, batch_k, batch_v, bias, dropout_rate) + x = dot_product_attention(batch_q, batch_k, batch_v, bias, dropout_rate) x = tf.reshape(x, [x_shape[0], x_shape[1], filter_depth]) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -738,20 +742,21 @@ def parameter_attention(x, Returns: A Tensor. """ - with tf.variable_scope(name, default_name="parameter_attention", - values=[x]): + with tf.variable_scope(name, default_name="parameter_attention", values=[x]): head_size_k = total_key_depth // num_heads head_size_v = total_value_depth // num_heads var_shape_k = [num_heads, memory_rows, head_size_k] var_shape_v = [num_heads, memory_rows, head_size_v] k = tf.get_variable( - "k", var_shape_k, - initializer=tf.random_normal_initializer( - 0, output_depth ** -0.5)) * (num_heads ** 0.5) + "k", + var_shape_k, + initializer=tf.random_normal_initializer(0, output_depth**-0.5)) * ( + num_heads**0.5) v = tf.get_variable( - "v", var_shape_v, - initializer=tf.random_normal_initializer( - 0, output_depth ** -0.5)) * (output_depth ** 0.5) + "v", + var_shape_v, + initializer=tf.random_normal_initializer(0, output_depth**-0.5)) * ( + output_depth**0.5) batch_size = tf.shape(x)[0] length = tf.shape(x)[1] q = common_layers.conv1d(x, total_key_depth, 1, name="q_transform") @@ -759,8 +764,8 @@ def parameter_attention(x, # This is a cheaper form of attention dropout where we use to use # the same dropout decisions across batch elemets and query positions, # but different decisions across heads and memory positions. 
- v = tf.nn.dropout(v, 1.0 - dropout_rate, - noise_shape=[num_heads, memory_rows, 1]) + v = tf.nn.dropout( + v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1]) # query is [batch, length, hidden_size] # reshape and transpose it to [heads, batch * length, head_size] q = tf.reshape(q, [batch_size, length, num_heads, head_size_k]) diff --git a/tensor2tensor/models/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py similarity index 77% rename from tensor2tensor/models/common_attention_test.py rename to tensor2tensor/layers/common_attention_test.py index a09da74e1..61855b876 100644 --- a/tensor2tensor/models/common_attention_test.py +++ b/tensor2tensor/layers/common_attention_test.py @@ -22,7 +22,7 @@ # Dependency imports import numpy as np -from tensor2tensor.models import common_attention +from tensor2tensor.layers import common_attention import tensorflow as tf @@ -42,22 +42,14 @@ def testDotProductAttention(self): self.assertEqual(res.shape, (5, 7, 12, 32)) def testMaskedLocalAttention(self): - q = np.array([[[[1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0]]]]) - k = np.array([[[[1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0]]]]) + q = np.array([[[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [ + 1.0, 0.0, 0.0, 0.0 + ], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]]]) + k = np.array([[[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [ + 1.0, 0.0, 0.0, 0.0 + ], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]]]) v = np.ones((1, 1, 8, 1)) with self.test_session() as session: q_ = tf.constant(q, dtype=tf.float32) @@ -77,7 +69,8 @@ def testLocalUnmaskedAttention(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=4, filter_width=3) + block_length=4, + filter_width=3) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 16)) @@ -90,7 +83,8 @@ def testLocalUnmaskedAttentionMatchingBlockLength(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=5, filter_width=3) + block_length=5, + filter_width=3) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 16)) diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/layers/common_hparams.py similarity index 97% rename from tensor2tensor/models/common_hparams.py rename to tensor2tensor/layers/common_hparams.py index 353586393..6ecb06fb4 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -220,10 +220,6 @@ def basic_range1(ranged_hparams): rhp.set_float("optimizer_adam_epsilon", 1e-7, 1e-2, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta1", 0.8, 0.9) rhp.set_float("optimizer_adam_beta2", 0.995, 0.999) - rhp.set_categorical("optimizer", - ["Adam", - "Adagrad", - "Momentum", - "RMSProp", - "SGD", - "YellowFin"]) + rhp.set_categorical("optimizer", [ + "Adam", "Adagrad", "Momentum", "RMSProp", "SGD", "YellowFin" + ]) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/layers/common_layers.py 
similarity index 99% rename from tensor2tensor/models/common_layers.py rename to tensor2tensor/layers/common_layers.py index 5449a8bef..8a58cd065 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -475,7 +475,8 @@ def residual_fn(x, residual_dropout, filters=None, epsilon=1e-16, - name=None, reuse=None): + name=None, + reuse=None): """Returns a function for combining layer input and layer output. The returned function on x (layer input) and y (layer output) computes: @@ -494,8 +495,8 @@ def residual_fn(x, Returns: residual layer output with applied norm_fn. """ - with tf.variable_scope(name, default_name="residual", - values=[x, y], reuse=reuse): + with tf.variable_scope( + name, default_name="residual", values=[x, y], reuse=reuse): norm_fn = get_norm(norm_type) res = x + tf.nn.dropout(y, 1.0 - residual_dropout) if norm_type == "layer": @@ -1517,8 +1518,8 @@ def linear_set_layer(layer_size, output: A tensor of dimensions batch_size x sequence_length x output_dims dimension containing the sequences of transformed vectors. """ - with tf.variable_scope(name, default_name="linear_set_layer", - values=[inputs]): + with tf.variable_scope( + name, default_name="linear_set_layer", values=[inputs]): # Apply 1D convolution to apply linear filter to each element # along the 2nd dimension. outputs = conv1d(inputs, layer_size, 1, activation=None, name="set_conv") @@ -1529,8 +1530,8 @@ def linear_set_layer(layer_size, # simply add the transformed context to get the same effect. if len(context.get_shape().as_list()) == 2: context = tf.expand_dims(context, axis=1) - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/layers/common_layers_test.py similarity index 98% rename from tensor2tensor/models/common_layers_test.py rename to tensor2tensor/layers/common_layers_test.py index 8e724587b..df3ccc68f 100644 --- a/tensor2tensor/models/common_layers_test.py +++ b/tensor2tensor/layers/common_layers_test.py @@ -22,7 +22,7 @@ # Dependency imports import numpy as np -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf @@ -351,8 +351,7 @@ def testResidualFn(self): x2 = np.random.rand(5, 2, 1, 11) x3 = common_layers.residual_fn( tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), - norm_type, 0.1) + tf.constant(x2, dtype=tf.float32), norm_type, 0.1) session.run(tf.global_variables_initializer()) actual = session.run(x3) self.assertEqual(actual.shape, (5, 2, 1, 11)) @@ -365,7 +364,9 @@ def testResidualFnWithLayerNorm(self): x3 = common_layers.residual_fn( tf.constant(x1, dtype=tf.float32), tf.constant(x2, dtype=tf.float32), - norm_type, 0.1, epsilon=0.1) + norm_type, + 0.1, + epsilon=0.1) session.run(tf.global_variables_initializer()) actual = session.run(x3) self.assertEqual(actual.shape, (5, 2, 1, 11)) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/layers/modalities.py similarity index 99% rename from tensor2tensor/models/modalities.py rename to tensor2tensor/layers/modalities.py index 912c54f8c..523c52fa8 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -22,7 +22,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_layers +from 
tensor2tensor.layers import common_layers from tensor2tensor.utils import expert_utils as eu from tensor2tensor.utils import modality from tensor2tensor.utils import registry diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/layers/modalities_test.py similarity index 98% rename from tensor2tensor/models/modalities_test.py rename to tensor2tensor/layers/modalities_test.py index 9130613b9..0ccd13777 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/layers/modalities_test.py @@ -22,7 +22,7 @@ import numpy as np -from tensor2tensor.models import modalities +from tensor2tensor.layers import modalities from tensor2tensor.utils import expert_utils import tensorflow as tf diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 3b874555f..664bc9e21 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -29,9 +29,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -56,8 +56,8 @@ def residual_fn(x, y): y, 1.0 - hparams.residual_dropout)) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - decoder_output = attention_lm_decoder( - decoder_input, residual_fn, decoder_self_attention_bias, hparams) + decoder_output = attention_lm_decoder(decoder_input, residual_fn, + decoder_self_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 4b37050bb..780478fec 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -29,9 +29,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -61,32 +61,33 @@ def residual_fn(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") x = dp(residual_fn, x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( dp, self._ps_devices, x, hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, - hparams.hidden_size, 
- hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, - hparams.moe_loss_coef) + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, + hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) x = dp(residual_fn, x, y) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 87ad70e41..96cb60615 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -27,14 +27,13 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model import tensorflow as tf - # var: 1d tensor, raw weights for each choice # tempered_var: raw weights with temperature applied # inv_t: inverse of the temperature to use when normalizing `var` @@ -86,7 +85,7 @@ def create_selection_weights(name, assert len(shape) == 1 # TODO(rshin): Change this to select without replacement? selection = tf.multinomial(tf.expand_dims(var, axis=0), 4) - selection = tf.squeeze(selection, axis=0) # [k] selected classes. + selection = tf.squeeze(selection, axis=0) # [k] selected classes. to_run = tf.one_hot(selection, shape[0]) # [k x nmodules] one-hot. # [nmodules], 0=not run, 1=run. to_run = tf.minimum(tf.reduce_sum(to_run, axis=0), 1) @@ -101,16 +100,12 @@ def create_selection_weights(name, if names is not None: tf.get_collection_ref("selection_weight_names/" + var.name).extend( - names.flatten() - if isinstance(names, np.ndarray) else names) + names.flatten() if isinstance(names, np.ndarray) else names) tf.add_to_collection("selection_weight_names_tensor/" + var.name, tf.constant(names)) return SelectionWeights( - var=var, - tempered_var=tempered_var, - inv_t=inv_t, - normalized=weights) + var=var, tempered_var=tempered_var, inv_t=inv_t, normalized=weights) def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels, @@ -155,18 +150,13 @@ def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels, channel_weights.append(channel_weight) channel_weight = tf.add_n(channel_weights) - multiplier = (tf.reshape(kernel_weight, max_kernel_size + (1, 1)) * - tf.reshape(channel_weight, (1, 1, -1, 1))) + multiplier = (tf.reshape(kernel_weight, max_kernel_size + + (1, 1)) * tf.reshape(channel_weight, (1, 1, -1, 1))) return multiplier -def make_subseparable_kernel( - kernel_size, - input_channels, - filters, - separability, - kernel_initializer, - kernel_regularizer): +def make_subseparable_kernel(kernel_size, input_channels, filters, separability, + kernel_initializer, kernel_regularizer): """Make a kernel to do subseparable convolution wiht `tf.nn.conv2d`. 
Args: @@ -198,16 +188,14 @@ def make_subseparable_kernel( regularizer=kernel_regularizer) pointwise_kernel = tf.get_variable( - "pointwise_kernel", - (input_channels, filters), + "pointwise_kernel", (input_channels, filters), initializer=kernel_initializer, regularizer=kernel_regularizer) expanded_depthwise_kernel = tf.transpose( tf.scatter_nd( indices=tf.tile( - tf.expand_dims( - tf.range(0, input_channels), axis=1), [1, 2]), + tf.expand_dims(tf.range(0, input_channels), axis=1), [1, 2]), updates=tf.transpose(depthwise_kernel, (2, 0, 1)), shape=(input_channels, input_channels) + kernel_size), (2, 3, 0, 1)) @@ -230,21 +218,20 @@ def make_subseparable_kernel( raise NotImplementedError -def multi_subseparable_conv( - inputs, - filters, - kernel_sizes, - input_channels, - separabilities, - kernel_selection_weights=None, - channel_selection_weights=None, - separability_selection_weights=None, - kernel_selection_weights_params=None, - channel_selection_weights_params=None, - separability_selection_weights_params=None, - kernel_initializer=None, - kernel_regularizer=None, - scope=None): +def multi_subseparable_conv(inputs, + filters, + kernel_sizes, + input_channels, + separabilities, + kernel_selection_weights=None, + channel_selection_weights=None, + separability_selection_weights=None, + kernel_selection_weights_params=None, + channel_selection_weights_params=None, + separability_selection_weights_params=None, + kernel_initializer=None, + kernel_regularizer=None, + scope=None): """Simultaneously compute different kinds of convolutions on subsets of input. Args: @@ -299,44 +286,33 @@ def multi_subseparable_conv( kernel_selection_weights = create_selection_weights( "kernels", "softmax", (len(kernel_sizes),), - names=[ - "kernel_h{}_w{}".format(h, w) for h, w in kernel_sizes - ], + names=["kernel_h{}_w{}".format(h, w) for h, w in kernel_sizes], **kernel_selection_weights_params) if channel_selection_weights is None: channel_selection_weights = create_selection_weights( "channels", "softmax", (len(input_channels),), - names=[ - "channels_{}_{}".format(c1, c2) for c1, c2 in input_channels - ], + names=["channels_{}_{}".format(c1, c2) for c1, c2 in input_channels], **channel_selection_weights_params) if separability_selection_weights is None: separability_selection_weights = create_selection_weights( "separability", "softmax", (len(separabilities),), - names=[ - "separability_{}".format(s) for s in separabilities - ], + names=["separability_{}".format(s) for s in separabilities], **separability_selection_weights_params) kernels = [] for separability in separabilities: with tf.variable_scope("separablity_{}".format(separability)): - kernel = make_subseparable_kernel( - max_kernel_size, - max_num_channels, - filters, - separability, - kernel_initializer, - kernel_regularizer) + kernel = make_subseparable_kernel(max_kernel_size, max_num_channels, + filters, separability, + kernel_initializer, kernel_regularizer) premultiplier = kernel_premultiplier( max_kernel_size, kernel_sizes, input_channels, - kernel_selection_weights, - channel_selection_weights) + kernel_selection_weights, channel_selection_weights) kernels.append(kernel * premultiplier) @@ -358,18 +334,24 @@ def multi_subseparable_conv( def conv_module(kw, kh, sep, div): + def convfn(x, hparams): return common_layers.subseparable_conv( - x, hparams.hidden_size // div, (kw, kh), - padding="SAME", separability=sep, + x, + hparams.hidden_size // div, (kw, kh), + padding="SAME", + separability=sep, name="conv_%d%d_sep%d_div%d" % (kw, kh, sep, div)) 
+ return convfn def multi_conv_module(kernel_sizes, seps): + def convfn(x, hparams): return multi_subseparable_conv(x, hparams.hidden_size, kernel_sizes, [(0, hparams.hidden_size)], seps) + return convfn @@ -438,15 +420,16 @@ def run_unary_modules_basic(modules, cur, hparams): def run_unary_modules_sample(modules, cur, hparams, k): """Run modules, sampling k.""" selection_weights = create_selection_weights( - "selection", - ("softmax_topk", k), + "selection", ("softmax_topk", k), shape=[len(modules)], inv_t=100.0 * common_layers.inverse_exp_decay( hparams.anneal_until, min_value=0.01)) - all_res = [tf.cond(tf.less(selection_weights.normalized[n], 1e-6), - lambda: tf.zeros_like(cur), - lambda i=n: modules[i](cur, hparams)) - for n in xrange(len(modules))] + all_res = [ + tf.cond( + tf.less(selection_weights.normalized[n], 1e-6), + lambda: tf.zeros_like(cur), + lambda i=n: modules[i](cur, hparams)) for n in xrange(len(modules)) + ] all_res = tf.concat([tf.expand_dims(r, axis=0) for r in all_res], axis=0) res = all_res * tf.reshape(selection_weights.normalized, [-1, 1, 1, 1, 1]) return tf.reduce_sum(res, axis=0) @@ -461,8 +444,7 @@ def run_unary_modules(modules, cur, hparams): def batch_deviation(x): """Average deviation of the batch.""" x_mean = tf.reduce_mean(x, axis=[0], keep_dims=True) - x_variance = tf.reduce_mean( - tf.square(x - x_mean), axis=[0], keep_dims=True) + x_variance = tf.reduce_mean(tf.square(x - x_mean), axis=[0], keep_dims=True) return tf.reduce_mean(tf.sqrt(x_variance)) @@ -475,13 +457,15 @@ def model_fn_body(self, features): multi_conv = multi_conv_module( kernel_sizes=[(3, 3), (5, 5), (7, 7)], seps=[0, 1]) conv_modules = [multi_conv, identity_module] - activation_modules = [identity_module, - lambda x, _: tf.nn.relu(x), - lambda x, _: tf.nn.elu(x), - lambda x, _: tf.tanh(x)] + activation_modules = [ + identity_module, lambda x, _: tf.nn.relu(x), lambda x, _: tf.nn.elu(x), + lambda x, _: tf.tanh(x) + ] norm_modules = [identity_module, layernorm_module, noamnorm_module] - binary_modules = [first_binary_module, second_binary_module, - sum_binary_module, shakeshake_binary_module] + binary_modules = [ + first_binary_module, second_binary_module, sum_binary_module, + shakeshake_binary_module + ] inputs = features["inputs"] def run_unary(x, name): diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 28862e594..d9c4e29a9 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -63,8 +63,8 @@ def bytenet_internal(inputs, targets, hparams): # Pad inputs and targets to be the same length, divisible by 50. 
inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=50) - final_encoder = residual_dilated_conv( - inputs, hparams.num_block_repeat, "SAME", "encoder", hparams) + final_encoder = residual_dilated_conv(inputs, hparams.num_block_repeat, + "SAME", "encoder", hparams) shifted_targets = common_layers.shift_left(targets) kernel = (hparams.kernel_height, hparams.kernel_width) @@ -73,9 +73,8 @@ def bytenet_internal(inputs, targets, hparams): hparams.hidden_size, [((1, 1), kernel)], padding="LEFT") - return residual_dilated_conv( - decoder_start, hparams.num_block_repeat, - "LEFT", "decoder", hparams) + return residual_dilated_conv(decoder_start, hparams.num_block_repeat, + "LEFT", "decoder", hparams) @registry.register_model diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index bdb93509b..af2d83158 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -22,8 +22,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index a43eda97a..3b1dc6873 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -23,8 +23,8 @@ import numpy as np from tensor2tensor.data_generators import gene_expression as gene_data +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.models import gene_expression -from tensor2tensor.models import modalities # pylint: disable=unused-import import tensorflow as tf diff --git a/tensor2tensor/models/long_answer.py b/tensor2tensor/models/long_answer.py index be8024f63..a9fb45e4a 100644 --- a/tensor2tensor/models/long_answer.py +++ b/tensor2tensor/models/long_answer.py @@ -34,9 +34,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -67,34 +67,35 @@ def residual_fn(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - None, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - attention_type="local_mask_right", - block_length=hparams.block_length, - name="decoder_self_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + attention_type="local_mask_right", + block_length=hparams.block_length, + name="decoder_self_attention") x = dp(residual_fn, x, y) with tf.variable_scope("ffn"): 
if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( dp, self._ps_devices, x, hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, - hparams.hidden_size, - hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, - hparams.moe_loss_coef) + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, + hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) x = dp(residual_fn, x, y) x = dp(long_answer_output, x, inputs) return x, extra_loss @@ -113,7 +114,8 @@ def long_answer_prepare_decoder(inputs, targets, hparams): """ decoder_input = tf.concat([ length_embedding(targets, hparams), inputs, - common_layers.shift_left_3d(targets)], 1) + common_layers.shift_left_3d(targets) + ], 1) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) return decoder_input @@ -140,8 +142,7 @@ def length_embedding(targets, hparams): padded_target_length = tf.shape(targets)[1] if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN: lengths = padded_target_length * tf.to_int32( - tf.less(tf.random_uniform([batch]), - hparams.answer_length_prob_train)) + tf.less(tf.random_uniform([batch]), hparams.answer_length_prob_train)) elif hparams.mode == tf.contrib.learn.ModeKeys.EVAL: lengths = 0 else: diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index 195879d78..d79b04494 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -23,25 +23,29 @@ # Dependency imports -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model import tensorflow as tf from tensorflow.python.util import nest - # Track Tuple of state and attention values -AttentionTuple = collections.namedtuple("AttentionTuple", - ("state", "attention")) +AttentionTuple = collections.namedtuple("AttentionTuple", ("state", + "attention")) class ExternalAttentionCellWrapper(tf.contrib.rnn.RNNCell): """Wrapper for external attention states for an encoder-decoder setup.""" - def __init__(self, cell, attn_states, attn_vec_size=None, - input_size=None, state_is_tuple=True, reuse=None): + def __init__(self, + cell, + attn_states, + attn_vec_size=None, + input_size=None, + state_is_tuple=True, + reuse=None): """Create a cell with attention. 
Args: @@ -137,8 +141,8 @@ def call(self, inputs, state): new_attns = self._attention(new_state_cat, attn_states, attn_length) with tf.variable_scope("attn_output_projection"): - output = tf.layers.dense(tf.concat([lstm_output, new_attns], axis=1), - self._attn_size) + output = tf.layers.dense( + tf.concat([lstm_output, new_attns], axis=1), self._attn_size) new_state = AttentionTuple(new_state, new_attns) @@ -151,18 +155,16 @@ def _attention(self, query, attn_states, attn_length): tanh = tf.tanh with tf.variable_scope("attention"): - k = tf.get_variable( - "attn_w", [1, 1, self._attn_size, self._attn_vec_size]) + k = tf.get_variable("attn_w", + [1, 1, self._attn_size, self._attn_vec_size]) v = tf.get_variable("attn_v", [self._attn_vec_size, 1]) - hidden = tf.reshape(attn_states, - [-1, attn_length, 1, self._attn_size]) + hidden = tf.reshape(attn_states, [-1, attn_length, 1, self._attn_size]) hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME") y = tf.layers.dense(query, self._attn_vec_size) y = tf.reshape(y, [-1, 1, 1, self._attn_vec_size]) s = reduce_sum(v * tanh(hidden_features + y), [2, 3]) a = softmax(s) - d = reduce_sum( - tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) + d = reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) new_attns = tf.reshape(d, [-1, self._attn_size]) return new_attns @@ -186,8 +188,8 @@ def dropout_lstm_cell(): time_major=False) -def lstm_attention_decoder(inputs, hparams, train, name, - initial_state, attn_states): +def lstm_attention_decoder(inputs, hparams, train, name, initial_state, + attn_states): """Run LSTM cell with attention on inputs of shape [batch x time x size].""" def dropout_lstm_cell(): @@ -196,9 +198,10 @@ def dropout_lstm_cell(): input_keep_prob=1.0 - hparams.dropout * tf.to_float(train)) layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)] - cell = ExternalAttentionCellWrapper(tf.nn.rnn_cell.MultiRNNCell(layers), - attn_states, - attn_vec_size=hparams.attn_vec_size) + cell = ExternalAttentionCellWrapper( + tf.nn.rnn_cell.MultiRNNCell(layers), + attn_states, + attn_vec_size=hparams.attn_vec_size) initial_state = cell.combine_state(initial_state) with tf.variable_scope(name): return tf.nn.dynamic_rnn( @@ -239,10 +242,7 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): # LSTM decoder with attention shifted_targets = common_layers.shift_left(targets) decoder_outputs, _ = lstm_attention_decoder( - common_layers.flatten4d3d(shifted_targets), - hparams, - train, - "decoder", + common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder", final_encoder_state, encoder_outputs) return tf.expand_dims(decoder_outputs, axis=2) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 6ac792f48..7da3d2380 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_hparams +from tensor2tensor.layers import common_hparams from tensor2tensor.models import lstm import tensorflow as tf @@ -44,8 +44,8 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2seq( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = lstm.LSTMSeq2seq(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) 
session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 907a801cf..c2a904888 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -23,6 +23,7 @@ # pylint: disable=unused-import +from tensor2tensor.layers import modalities from tensor2tensor.models import attention_lm from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet @@ -30,7 +31,6 @@ from tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm -from tensor2tensor.models import modalities from tensor2tensor.models import multimodel from tensor2tensor.models import neural_gpu from tensor2tensor.models import shake_shake diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 6f60dbfbf..290c78732 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -22,10 +22,10 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers -from tensor2tensor.models import modalities +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers +from tensor2tensor.layers import modalities from tensor2tensor.models import slicenet from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -41,12 +41,22 @@ def conv_res_step(x, hparams, padding, mask): dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)] with tf.variable_scope("conv_res_step"): y = common_layers.subseparable_conv_block( - x, hparams.filter_size, dilations_and_kernels1, - padding=padding, mask=mask, separabilities=0, name="residual1") + x, + hparams.filter_size, + dilations_and_kernels1, + padding=padding, + mask=mask, + separabilities=0, + name="residual1") y = tf.nn.dropout(y, 1.0 - hparams.dropout) return common_layers.subseparable_conv_block( - y, hparams.hidden_size, dilations_and_kernels2, - padding=padding, mask=mask, separabilities=0, name="residual2") + y, + hparams.hidden_size, + dilations_and_kernels2, + padding=padding, + mask=mask, + separabilities=0, + name="residual2") def residual_fn2(x, y, hparams): @@ -102,9 +112,9 @@ def flatten(inputs): expert_loss = 0.0 for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("enc_layer_%d" % i): - inputs_encoded, moe_loss = conv_experts( - inputs_encoded, hparams, dp, self._ps_devices, "SAME", - inputs_mask, i) + inputs_encoded, moe_loss = conv_experts(inputs_encoded, hparams, dp, + self._ps_devices, "SAME", + inputs_mask, i) expert_loss += tf.reduce_mean(moe_loss) * hparams.moe_loss_coef # If we're just predicing a class, there is no use for a decoder, return. 
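The dp(...) calls in the hunks above and below all follow the same data-parallelism convention: a callable Parallelism object maps a function over per-shard arguments and returns one output per shard. A toy sketch of that call convention (illustrative only, not the actual expert_utils implementation):

def make_parallelism(n_shards):
  """Toy stand-in for the dp objects used in these hunks (illustrative)."""
  def dp(fn, *args, **kwargs):
    def pick(arg, i):
      # Lists are treated as per-shard values; anything else is broadcast.
      return arg[i] if isinstance(arg, list) else arg
    return [fn(*[pick(a, i) for a in args], **kwargs)
            for i in range(n_shards)]
  return dp

dp = make_parallelism(2)
assert dp(lambda v, s: v * s, [1, 2], 10) == [10, 20]  # one result per shard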
@@ -116,54 +126,57 @@ def flatten(inputs): inputs3d = dp(tf.squeeze, inputs, 2) inputs_encoded3d = dp(tf.squeeze, inputs_encoded, 2) encoder_padding = dp(common_attention.embedding_to_padding, inputs3d) - encoder_attention_bias = dp( - common_attention.attention_bias_ignore_padding, encoder_padding) + encoder_attention_bias = dp(common_attention.attention_bias_ignore_padding, + encoder_padding) targets = dp(common_layers.flatten4d3d, sharded_features["targets"]) target_space_emb = dp(slicenet.embed_target_space, sharded_features["target_space_id"], hparams.hidden_size) - (decoder_input, decoder_self_attention_bias) = dp( - prepare_decoder, targets, target_space_emb) + (decoder_input, decoder_self_attention_bias) = dp(prepare_decoder, targets, + target_space_emb) x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.dropout) for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("dec_layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - decoder_self_attention_bias, - hparams.hidden_size, - hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") - z = dp(common_attention.multihead_attention, - y, - inputs_encoded3d, - encoder_attention_bias, - hparams.hidden_size, - hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="encdec_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") + z = dp( + common_attention.multihead_attention, + y, + inputs_encoded3d, + encoder_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="encdec_attention") x = dp(residual_fn3, x, y, z, hparams) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, moe_loss = common_layers.moe_layer( - dp, self._ps_devices, x, train, - hparams.hidden_size, hparams.filter_size, - hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) + dp, self._ps_devices, x, train, hparams.hidden_size, + hparams.filter_size, hparams.moe_n1, hparams.moe_n2, + hparams.moe_loss_coef) expert_loss += tf.reduce_mean(moe_loss) else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.dropout) x = dp(residual_fn2, x, y, hparams) x = dp(tf.expand_dims, x, 2) diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index fc9d75639..4037aa8d4 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -34,6 +34,7 @@ def neural_gpu(inputs, hparams, name=None): """The core Neural GPU.""" with tf.variable_scope(name, "neural_gpu"): + def step(state, inp): # pylint: disable=missing-docstring x = tf.nn.dropout(state, 1.0 - hparams.dropout) for layer in 
xrange(hparams.num_hidden_layers): diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 46c01f403..b7a1e98f7 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_hparams +from tensor2tensor.layers import common_hparams from tensor2tensor.models import neural_gpu import tensorflow as tf @@ -50,8 +50,8 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = neural_gpu.NeuralGPU(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index 7fa40783a..aa91654a3 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index f1534137c..8900e6d11 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -23,9 +23,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -83,8 +83,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None): return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") -def multi_conv_res(x, padding, name, layers, hparams, - mask=None, source=None): +def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): """A stack of separable convolution blocks with residual connections.""" with tf.variable_scope(name): padding_bias = None @@ -200,7 +199,10 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): else: inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. targets_with_attention = attention( - targets_shifted, inputs_encoded, norm_fn, hparams, + targets_shifted, + inputs_encoded, + norm_fn, + hparams, bias=inputs_padding_bias) # Positional targets: merge attention and raw. @@ -237,8 +239,8 @@ def slicenet_internal(inputs, targets, target_space, problem_idx, hparams): inputs = common_layers.add_timing_signal(inputs) # Add position info. 
target_space_emb = embed_target_space(target_space, hparams.hidden_size) extra_layers = int(hparams.num_hidden_layers * 1.5) - inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, - hparams, mask=inputs_mask) + inputs_encoded = multi_conv_res( + inputs, "SAME", "encoder", extra_layers, hparams, mask=inputs_mask) target_modality_name = hparams.problems[problem_idx].target_modality.name if "class_label_modality" in target_modality_name: # If we're just predicing a class, there is no use for a decoder. @@ -266,6 +268,7 @@ def model_fn_body(self, features): features["target_space_id"], self._problem_idx, self._hparams) + _KERNEL_SCHEMES = { "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], "3.7.7.7": [(3, 1), (7, 1), (7, 1), (7, 1)], diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index c357448e4..388acde1b 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import image # pylint: disable=unused-import -from tensor2tensor.models import modalities # pylint: disable=unused-import +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.models import slicenet from tensor2tensor.utils import registry @@ -47,8 +47,8 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = slicenet.SliceNet(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index a2b55febf..1add44115 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -27,9 +27,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -49,10 +49,9 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, - encoder_self_attention_bias, - encoder_decoder_attention_bias) = ( - transformer_prepare_encoder(inputs, target_space, hparams)) + (encoder_input, encoder_self_attention_bias, + encoder_decoder_attention_bias) = (transformer_prepare_encoder( + inputs, target_space, hparams)) (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) @@ -73,12 +72,16 @@ def model_fn_body(self, features): def get_residual_fn(hparams): """Get residual_fn.""" + def residual_fn(x, y): - return common_layers.residual_fn(x, y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) + return common_layers.residual_fn( + x, + y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) + return residual_fn @@ -113,8 +116,7 @@ def transformer_prepare_encoder(inputs, target_space, hparams): encoder_input += emb_target_space if hparams.pos == 
"timing": encoder_input = common_attention.add_timing_signal_1d(encoder_input) - return (encoder_input, - encoder_self_attention_bias, + return (encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias) @@ -251,12 +253,9 @@ def transformer_ffn_layer(x, hparams): dropout=hparams.relu_dropout) elif hparams.ffn_layer == "parameter_attention": return common_attention.parameter_attention( - x, - hparams.parameter_attention_key_channels or hparams.hidden_size, + x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.filter_size, - hparams.num_heads, + hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 1f20bfb51..2604748be 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -20,7 +20,6 @@ Code is mostly copied from original Transformer source. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -29,8 +28,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -50,10 +49,11 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, encoder_attention_bias, _) = ( - transformer.transformer_prepare_encoder(inputs, target_space, hparams)) - (decoder_input, _) = ( - transformer.transformer_prepare_decoder(targets, hparams)) + (encoder_input, + encoder_attention_bias, _) = (transformer.transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, _) = (transformer.transformer_prepare_decoder( + targets, hparams)) encoder_mask = bias_to_mask(encoder_attention_bias) @@ -64,12 +64,12 @@ def residual_fn(x, y): encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - encoder_output = alt_transformer_encoder( - encoder_input, residual_fn, encoder_mask, hparams) + encoder_output = alt_transformer_encoder(encoder_input, residual_fn, + encoder_mask, hparams) - decoder_output = alt_transformer_decoder( - decoder_input, encoder_output, residual_fn, - encoder_attention_bias, hparams) + decoder_output = alt_transformer_decoder(decoder_input, encoder_output, + residual_fn, + encoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) @@ -97,19 +97,14 @@ def composite_layer(inputs, mask, hparams, for_output=False): for layer in xrange(hparams.layers_per_layer): with tf.variable_scope("sub_layer_%d" % layer): x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - dropout=hparams.relu_dropout) + hparams.hidden_size, x, dropout=hparams.relu_dropout) if for_output: context = common_layers.running_global_pool_1d(x) else: context = common_layers.global_pool_1d(x, mask=mask) # Final layer. 
x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - context=context, - dropout=hparams.relu_dropout) + hparams.hidden_size, x, context=context, dropout=hparams.relu_dropout) return x @@ -150,8 +145,8 @@ def alt_transformer_decoder(decoder_input, hparams.attention_dropout, name="encdec_attention") - x_ = residual_fn(x_, composite_layer(x_, None, hparams, - for_output=True)) + x_ = residual_fn(x_, composite_layer( + x_, None, hparams, for_output=True)) x = residual_fn(x, x_) return x @@ -162,7 +157,7 @@ def bias_to_mask(bias): # output sequences. Squeeze out dim one, and get the first element of # each vector. bias = tf.squeeze(bias, [1])[:, :, 0] - bias = - tf.clip_by_value(bias, -1.0, 1.0) + bias = -tf.clip_by_value(bias, -1.0, 1.0) mask = 1 - bias return mask diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index f2e69da21..a61687f48 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py new file mode 100644 index 000000000..12057d8e6 --- /dev/null +++ b/tensor2tensor/utils/decoding.py @@ -0,0 +1,371 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Decoding utilities.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import operator +import os + +# Dependency imports + +import numpy as np +import six + +from six.moves import input # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import data_reader +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +def decode_from_dataset(estimator): + hparams = estimator.hparams + for i, problem in enumerate(FLAGS.problems.split("-")): + inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) + targets_vocab = hparams.problems[i].vocabulary["targets"] + tf.logging.info("Performing local inference.") + infer_problems_data = data_reader.get_data_filepatterns( + FLAGS.problems, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + infer_input_fn = input_fn_builder.build_input_fn( + mode=tf.contrib.learn.ModeKeys.INFER, + hparams=hparams, + data_file_patterns=infer_problems_data, + num_datashards=devices.data_parallelism().n, + fixed_problem=i) + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) + + def log_fn(inputs, + targets, + outputs, + problem, + j, + inputs_vocab=inputs_vocab, + targets_vocab=targets_vocab): + """Log inference results.""" + if "image" in problem and FLAGS.decode_save_images: + save_path = os.path.join(estimator.model_dir, + "%s_prediction_%d.jpg" % (problem, j)) + show_and_save_image(inputs / 255., save_path) + elif inputs_vocab: + decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + + decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) + decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) + tf.logging.info("Inference results TARGET: %s" % decoded_targets) + + if FLAGS.decode_to_file: + output_filepath = FLAGS.decode_to_file + ".outputs." + problem + output_file = tf.gfile.Open(output_filepath, "a") + output_file.write(decoded_outputs + "\n") + target_filepath = FLAGS.decode_to_file + ".targets." + problem + target_file = tf.gfile.Open(target_filepath, "a") + target_file.write(decoded_targets + "\n") + + # The function predict() returns an iterable over the network's + # predictions from the test input. We use it to log inputs and decodes. 
+ inputs_iter = result_iter["inputs"] + targets_iter = result_iter["targets"] + outputs_iter = result_iter["outputs"] + for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): + inputs, targets, outputs = result + if FLAGS.decode_return_beams: + output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + log_fn(inputs, targets, beam, problem, j) + else: + log_fn(inputs, targets, outputs, problem, j) + + +def decode_from_file(estimator, filename): + """Compute predictions on entries in filename and write them out.""" + hparams = estimator.hparams + problem_id = FLAGS.decode_problem_id + inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] + targets_vocab = hparams.problems[problem_id].vocabulary["targets"] + tf.logging.info("Performing decoding from a file.") + sorted_inputs, sorted_keys = _get_sorted_inputs(filename) + num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 + input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, + sorted_inputs, inputs_vocab) + + decodes = [] + for _ in range(num_decode_batches): + result_iter = estimator.predict( + input_fn=input_fn.next if six.PY2 else input_fn.__next__, + as_iterable=True) + for result in result_iter: + + def log_fn(inputs, outputs): + decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + + decoded_outputs = targets_vocab.decode( + _save_until_eos(outputs.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) + return decoded_outputs + + if FLAGS.decode_return_beams: + beam_decodes = [] + output_beams = np.split( + result["outputs"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + beam_decodes.append(log_fn(result["inputs"], beam)) + decodes.append("\t".join(beam_decodes)) + + else: + decodes.append(log_fn(result["inputs"], result["outputs"])) + + # Reversing the decoded inputs and outputs because they were reversed in + # _decode_batch_input_fn + sorted_inputs.reverse() + decodes.reverse() + # Dumping inputs and outputs to file filename.decodes in + # format result\tinput in the same order as original inputs + if FLAGS.decode_to_file: + output_filename = FLAGS.decode_to_file + else: + output_filename = filename + if FLAGS.decode_shards > 1: + base_filename = output_filename + ("%.2d" % FLAGS.worker_id) + else: + base_filename = output_filename + decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set + + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + + str(FLAGS.decode_alpha) + ".decodes") + tf.logging.info("Writing decodes into %s" % decode_filename) + outfile = tf.gfile.Open(decode_filename, "w") + for index in range(len(sorted_inputs)): + outfile.write("%s\n" % (decodes[sorted_keys[index]])) + + +def decode_interactively(estimator): + hparams = estimator.hparams + + infer_input_fn = _interactive_input_fn(hparams) + for problem_idx, example in infer_input_fn: + targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] + result_iter = estimator.predict(input_fn=lambda e=example: e) + for result in result_iter: + if FLAGS.decode_return_beams: + beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + scores = None + if "scores" in result: + scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(beams): + tf.logging.info("BEAM %d:" % k) + beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) + if scores is not None: + tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) + else: + tf.logging.info(beam_string) + else: + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info( + targets_vocab.decode( + _save_until_eos(result["outputs"].flatten()))) + + +def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, + vocabulary): + tf.logging.info(" batch %d" % num_decode_batches) + # First reverse all the input sentences so that if you're going to get OOMs, + # you'll see it in the first batch + sorted_inputs.reverse() + for b in range(num_decode_batches): + tf.logging.info("Decoding batch %d" % b) + batch_length = 0 + batch_inputs = [] + for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( + b + 1) * FLAGS.decode_batch_size]: + input_ids = vocabulary.encode(inputs) + if FLAGS.decode_max_input_size > 0: + # Subtract 1 for the EOS_ID. + input_ids = input_ids[:FLAGS.decode_max_input_size - 1] + input_ids.append(text_encoder.EOS_ID) + batch_inputs.append(input_ids) + if len(input_ids) > batch_length: + batch_length = len(input_ids) + final_batch_inputs = [] + for input_ids in batch_inputs: + assert len(input_ids) <= batch_length + x = input_ids + [0] * (batch_length - len(input_ids)) + final_batch_inputs.append(x) + yield { + "inputs": np.array(final_batch_inputs), + "problem_choice": np.array(problem_id) + } + + +def _interactive_input_fn(hparams): + """Generator that reads from the terminal and yields "interactive inputs". + + Due to temporary limitations in tf.learn, if we don't want to reload the + whole graph, then we are stuck encoding all of the input as one fixed-size + numpy array. + + We yield int64 arrays with shape [const_array_size]. The format is: + [num_samples, decode_length, len(input ids), , ] + + Args: + hparams: model hparams + Yields: + numpy arrays + + Raises: + Exception: when `input_type` is invalid. + """ + num_samples = 3 + decode_length = 100 + input_type = "text" + problem_id = 0 + p_hparams = hparams.problems[problem_id] + has_input = "inputs" in p_hparams.input_modality + vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] + # This should be longer than the longest input. 
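As a concrete illustration of the packed query format assembled further down in this generator (token ids and sizes are made up for the example), the metadata is placed in front of the encoded ids and the vector is zero-padded out to the fixed size:

num_samples, decode_length = 3, 100
input_ids = [17, 5, 2, 1]          # hypothetical encoded ids, EOS appended
const_array_size = 10000
packed = [num_samples, decode_length, len(input_ids)] + input_ids
packed += [0] * (const_array_size - len(packed))
assert len(packed) == const_array_size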
+ const_array_size = 10000 + while True: + prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" + " it= ('text' or 'image' or 'label')\n" + " pr= (set the problem number)\n" + " in= (set the input problem number)\n" + " ou= (set the output problem number)\n" + " ns= (changes number of samples)\n" + " dl= (changes decode legnth)\n" + " <%s> (decode)\n" + " q (quit)\n" + ">" % (num_samples, decode_length, "source_string" + if has_input else "target_prefix")) + input_string = input(prompt) + if input_string == "q": + return + elif input_string[:3] == "pr=": + problem_id = int(input_string[3:]) + p_hparams = hparams.problems[problem_id] + has_input = "inputs" in p_hparams.input_modality + vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] + elif input_string[:3] == "in=": + problem = int(input_string[3:]) + p_hparams.input_modality = hparams.problems[problem].input_modality + p_hparams.input_space_id = hparams.problems[problem].input_space_id + elif input_string[:3] == "ou=": + problem = int(input_string[3:]) + p_hparams.target_modality = hparams.problems[problem].target_modality + p_hparams.target_space_id = hparams.problems[problem].target_space_id + elif input_string[:3] == "ns=": + num_samples = int(input_string[3:]) + elif input_string[:3] == "dl=": + decode_length = int(input_string[3:]) + elif input_string[:3] == "it=": + input_type = input_string[3:] + else: + if input_type == "text": + input_ids = vocabulary.encode(input_string) + if has_input: + input_ids.append(text_encoder.EOS_ID) + x = [num_samples, decode_length, len(input_ids)] + input_ids + assert len(x) < const_array_size + x += [0] * (const_array_size - len(x)) + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } + elif input_type == "image": + input_path = input_string + img = read_image(input_path) + yield problem_id, { + "inputs": img, + "problem_choice": np.array(problem_id) + } + elif input_type == "label": + input_ids = [int(input_string)] + x = [num_samples, decode_length, len(input_ids)] + input_ids + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } + else: + raise Exception("Unsupported input type.") + + +def read_image(path): + try: + import matplotlib.image as im # pylint: disable=g-import-not-at-top + except ImportError as e: + tf.logging.warning( + "Reading an image requires matplotlib to be installed: %s", e) + raise NotImplementedError("Image reading not implemented.") + return im.imread(path) + + +def show_and_save_image(img, save_path): + try: + import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top + except ImportError as e: + tf.logging.warning("Showing and saving an image requires matplotlib to be " + "installed: %s", e) + raise NotImplementedError("Image display and save not implemented.") + plt.imshow(img) + plt.savefig(save_path) + + +def _get_sorted_inputs(filename): + """Returning inputs sorted according to length. + + Args: + filename: path to file with inputs, 1 per line. + + Returns: + a sorted list of inputs + + """ + tf.logging.info("Getting sorted inputs") + # read file and sort inputs according them according to input length. 
+ if FLAGS.decode_shards > 1: + decode_filename = filename + ("%.2d" % FLAGS.worker_id) + else: + decode_filename = filename + inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] + input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] + sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) + # We'll need the keys to rearrange the inputs back into their original order + sorted_keys = {} + sorted_inputs = [] + for i, (index, _) in enumerate(sorted_input_lens): + sorted_inputs.append(inputs[index]) + sorted_keys[index] = i + return sorted_inputs, sorted_keys + + +def _save_until_eos(hyp): + """Strips everything after the first token, which is normally 1.""" + try: + index = list(hyp).index(text_encoder.EOS_ID) + return hyp[0:index] + except ValueError: + # No EOS_ID: return the array as-is. + return hyp diff --git a/tensor2tensor/utils/devices.py b/tensor2tensor/utils/devices.py new file mode 100644 index 000000000..4f76367e9 --- /dev/null +++ b/tensor2tensor/utils/devices.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Device placement and data parallelism.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +# pylint: disable=redefined-builtin +from six.moves import xrange +# pylint: enable=redefined-builtin + +from tensor2tensor.utils import expert_utils as eu +import tensorflow as tf + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + + +def _ps_replicas(all_workers=False): + if all_workers: + return list(range(FLAGS.ps_replicas)) + # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. + num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas + return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] + + +def _gpu_order(num_gpus): + if FLAGS.gpu_order: + ret = [int(s) for s in FLAGS.gpu_order.split(" ")] + if len(ret) == num_gpus: + return ret + return list(range(num_gpus)) + + +def _ps_gpus(all_workers=False): + ps_gpus = [] + for d in _ps_replicas(all_workers=all_workers): + ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) + return ps_gpus + + +def ps_devices(all_workers=False): + """List of ps devices (where to put the experts). + + Args: + all_workers: whether the list is for all async workers or just this one. + + Returns: + a list of device names + """ + if FLAGS.ps_replicas > 0: + if FLAGS.ps_gpu > 0: + return [ + FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) + for (d, gpu) in _ps_gpus(all_workers=all_workers) + ] + else: + return [ + FLAGS.ps_job + "/task:%d" % d + for d in _ps_replicas(all_workers=all_workers) + ] + else: + if FLAGS.worker_gpu > 0: + return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + else: + return [""] + + +def data_parallelism(all_workers=False): + """Over which devices do we split each training batch. 
+ + In old-fashioned async mode, we split the batch over all GPUs on the + current worker. + + In sync mode, we split the batch over all the parameter server GPUs. + + This function returns an expert_utils.Parallelism object, which can be used + to build the model. It is configured in a way that any variables created + by `tf.get_variable` will be assigned to the parameter servers and shared + between datashards. + + Args: + all_workers: whether the devices are all async workers or just this one. + + Returns: + a expert_utils.Parallelism. + """ + + def _replica_device_setter(worker_device): + if FLAGS.ps_replicas == 0: + return worker_device + return tf.train.replica_device_setter( + worker_device=worker_device, + ps_tasks=FLAGS.ps_replicas, + ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) + + if FLAGS.schedule == "local_run": + assert not FLAGS.sync + datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + if FLAGS.locally_shard_to_cpu: + datashard_devices += ["cpu:0"] + caching_devices = None + elif FLAGS.sync: + assert FLAGS.ps_replicas > 0 + datashard_devices = [ + _replica_device_setter(d) for d in ps_devices(all_workers=all_workers) + ] + if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: + caching_devices = [ + FLAGS.ps_job + "/task:%d/cpu:0" % d + for (d, _) in _ps_gpus(all_workers=all_workers) + ] + else: + caching_devices = None + else: + # old fashioned async - compute on worker + if FLAGS.worker_gpu > 1: + datashard_devices = [ + _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) + for d in _gpu_order(FLAGS.worker_gpu) + ] + caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu + else: + datashard_devices = [_replica_device_setter(FLAGS.worker_job)] + caching_devices = None + tf.logging.info("datashard_devices: %s", datashard_devices) + tf.logging.info("caching_devices: %s", caching_devices) + return eu.Parallelism( + datashard_devices, + reuse=True, + caching_devices=caching_devices, + daisy_chain_variables=FLAGS.daisy_chain_variables) diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py new file mode 100644 index 000000000..1fac64c8b --- /dev/null +++ b/tensor2tensor/utils/input_fn_builder.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Input function building.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.utils import data_reader + +import tensorflow as tf + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + + +def build_input_fn(mode, + hparams, + data_file_patterns=None, + num_datashards=None, + fixed_problem=None): + """Provides input to the graph, either from disk or via a placeholder. + + This function produces an input function that will feed data into + the network. 
There are two modes of operation: + + 1. If data_file_pattern and all subsequent arguments are None, then + it creates a placeholder for a serialized tf.Example proto. + 2. If data_file_pattern is defined, it will read the data from the + files at the given location. Use this mode for training, + evaluation, and testing prediction. + + Args: + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + hparams: HParams object. + data_file_patterns: The list of file patterns to use to read in data. Set to + `None` if you want to create a placeholder for the input data. The + `problems` flag is a list of problem names joined by the `-` character. + The flag's string is then split along the `-` and each problem gets its + own example queue. + num_datashards: An integer. + fixed_problem: An integer indicating the problem to fetch data for, or None + if the input is to be randomly selected. + + Returns: + A function that returns a dictionary of features and the target labels. + """ + + def input_fn(): + """Supplies input to our model. + + This function supplies input to our model, where this input is a + function of the mode. For example, we supply different data if + we're performing training versus evaluation. + + Returns: + A tuple consisting of 1) a dictionary of tensors whose keys are + the feature names, and 2) a tensor of target labels if the mode + is not INFER (and None, otherwise). + + Raises: + ValueError: if one of the parameters has an unsupported value. + """ + problem_count, batches = len(data_file_patterns), [] + with tf.name_scope("input_reader"): + for n in xrange(problem_count): + if fixed_problem is not None and n != fixed_problem: + continue + problem_instance = hparams.problem_instances[n] + p_hparams = hparams.problems[n] + with tf.name_scope("problem_%d" % n): + with tf.device("/cpu:0"): # Input reading on CPU + capacity = p_hparams.max_expected_batch_size_per_shard + capacity *= num_datashards + examples = data_reader.input_pipeline(problem_instance, + data_file_patterns[n], + capacity, mode, hparams) + feature_map = data_reader.batch_examples( + examples, + data_reader.hparams_to_batching_scheme( + hparams, + shard_multiplier=num_datashards, + drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + or hparams.eval_drop_long_sequences), + length_multiplier=(p_hparams.batch_size_multiplier))) + + # Reverse inputs and targets features if the problem was reversed. + if problem_instance is not None: + problem_instance.maybe_reverse_features(feature_map) + problem_instance.maybe_copy_features(feature_map) + else: + if p_hparams.was_reversed: + inputs = feature_map["inputs"] + targets = feature_map["targets"] + feature_map["inputs"] = targets + feature_map["targets"] = inputs + # Use the inputs as the targets if the problem is a copy problem. + if p_hparams.was_copy: + feature_map["targets"] = feature_map["inputs"] + + # Ensure inputs and targets are proper rank. + while len(feature_map["inputs"].get_shape()) != 4: + feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) + while len(feature_map["targets"].get_shape()) != 4: + feature_map["targets"] = tf.expand_dims( + feature_map["targets"], axis=-1) + + batches.append((feature_map["inputs"], feature_map["targets"], + tf.constant(n), tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id))) + + # We choose which problem to process. + loss_moving_avgs = [] # Need loss moving averages for that. 
+ for n in xrange(problem_count): + with tf.variable_scope("losses_avg"): + loss_moving_avgs.append( + tf.get_variable( + "problem_%d/total_loss" % n, initializer=100.0, + trainable=False)) + tf.get_variable( + "problem_%d/training_loss" % n, initializer=100.0, trainable=False) + tf.get_variable( + "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) + if fixed_problem is None: + if (hparams.problem_choice == "uniform" or + mode != tf.contrib.learn.ModeKeys.TRAIN): + problem_choice = tf.random_uniform( + [], maxval=problem_count, dtype=tf.int32) + elif hparams.problem_choice == "adaptive": + loss_moving_avgs = tf.stack(loss_moving_avgs) + problem_choice = tf.multinomial( + tf.reshape(loss_moving_avgs, [1, -1]), 1) + problem_choice = tf.to_int32(tf.squeeze(problem_choice)) + elif hparams.problem_choice == "distributed": + assert FLAGS.worker_replicas >= problem_count + assert FLAGS.worker_replicas % problem_count == 0 + problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) + else: + raise ValueError( + "Value of hparams.problem_choice is %s and must be " + "one of [uniform, adaptive, distributed]" % hparams.problem_choice) + + # Inputs and targets conditional on problem_choice. + rand_inputs, rand_target, choice, inp_id, tgt_id = cond_on_index( + lambda n: batches[n], problem_choice, 0, problem_count - 1) + else: + problem_choice = tf.constant(fixed_problem) + # Take the only constructed batch, which is the fixed_problem. + rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] + + # Set shapes so the ranks are clear. + rand_inputs.set_shape([None, None, None, None]) + rand_target.set_shape([None, None, None, None]) + choice.set_shape([]) + inp_id.set_shape([]) + tgt_id.set_shape([]) + # Forced shape obfuscation is necessary for inference. + if mode == tf.contrib.learn.ModeKeys.INFER: + rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access + rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access + + # Final feature map. 
+ rand_feature_map = { + "inputs": rand_inputs, + "problem_choice": choice, + "input_space_id": inp_id, + "target_space_id": tgt_id + } + if mode == tf.contrib.learn.ModeKeys.INFER: + rand_feature_map["infer_targets"] = rand_target + rand_target = None + return rand_feature_map, rand_target + + return input_fn + + +def cond_on_index(fn, index_tensor, cur_idx, max_idx): + """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" + if cur_idx == max_idx: + return fn(cur_idx) + return tf.cond( + tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), + lambda: cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 4435707cd..db60e07c8 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -22,7 +22,7 @@ import six -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook import tensorflow as tf diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 5c596e10f..4bcf21f4d 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -22,7 +22,7 @@ # Dependency imports -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py new file mode 100644 index 000000000..a12aa1122 --- /dev/null +++ b/tensor2tensor/utils/model_builder.py @@ -0,0 +1,451 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Model building.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +# Dependency imports + +import numpy as np +import six +# pylint: disable=redefined-builtin +from six.moves import xrange +# pylint: enable=redefined-builtin + +from tensor2tensor.models import models # pylint: disable=unused-import +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder +from tensor2tensor.utils import registry +from tensor2tensor.utils import yellowfin + +import tensorflow as tf +from tensorflow.python.ops import init_ops + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + + +def build_model_fn(model, hparams): + """Returns a function to build the model. + + Args: + model: The name of the model to use. + hparams: The hyperparameters. + + Returns: + A function to build the model's graph. This function is called by + the Estimator object to construct the graph. 
+ """ + + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + elif hparams.learning_rate_decay_scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) + + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + def model_fn(features, targets, mode): + """Creates the prediction, loss, and train ops. + + Args: + features: A dictionary of tensors keyed by the feature name. + targets: A tensor representing the labels (targets). + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + + Returns: + A tuple consisting of the prediction, loss, and train_op. + """ + # Deep-copy the model hparams between modes to eliminate + # side-effects caused by abuse of the linked problem_hparams + # objects which are used to share modality objects between + # problems. We do not want to share the modality objects between + # modes, since the modality objects may decide to do something + # mode-specific. A better fix would be to stop abusing the + # hparams in this way and instead use a separate dictionary to + # share the modality objects between problems. This dictionary + # could be created once per mode and passed to the constructor of + # t2t_model. 
+ my_hp = copy.deepcopy(hparams) + if mode == tf.contrib.learn.ModeKeys.INFER: + if FLAGS.decode_interactive: + features = _interactive_input_tensor_to_features_dict(features, my_hp) + elif FLAGS.decode_from_file: + features = _decode_input_tensor_to_features_dict(features, my_hp) + # A dictionary containing: + # - problem_choice: A Tensor containing an integer indicating which problem + # was selected for this run. + # - predictions: A Tensor containing the model's output predictions. + run_info = dict() + run_info["problem_choice"] = features["problem_choice"] + + if targets is not None: + features["targets"] = targets + + dp = devices.data_parallelism() + + # Add input statistics for incoming features. + with tf.name_scope("input_stats"): + for (k, v) in six.iteritems(features): + if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: + tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) + tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) + nonpadding = tf.to_float(tf.not_equal(v, 0)) + tf.summary.scalar("%s_nonpadding_tokens" % k, + tf.reduce_sum(nonpadding)) + tf.summary.scalar("%s_nonpadding_fraction" % k, + tf.reduce_mean(nonpadding)) + + tf.get_variable_scope().set_initializer(initializer()) + train = mode == tf.contrib.learn.ModeKeys.TRAIN + + # Get multi-problem logits and loss based on features["problem_choice"]. + def nth_model(n): + """Build the model for the n-th problem, plus some added variables.""" + model_class = registry.model(model)( + my_hp, + mode, + my_hp.problems[n], + n, + dp, + devices.ps_devices(all_workers=True)) + if mode == tf.contrib.learn.ModeKeys.INFER: + return model_class.infer( + features, + beam_size=FLAGS.decode_beam_size, + top_beams=(FLAGS.decode_beam_size + if FLAGS.decode_return_beams else 1), + last_position_only=FLAGS.decode_use_last_position_only, + alpha=FLAGS.decode_alpha, + decode_length=FLAGS.decode_extra_length) + # In distributed mode, we build graph for problem=0 and problem=worker_id. + skipping_is_on = my_hp.problem_choice == "distributed" and train + problem_worker_id = FLAGS.worker_id % len(my_hp.problems) + skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id + # On worker 0 also build graph for problems <= 1. + # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. + skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) + sharded_logits, losses_dict = model_class.model_fn( + features, skip=(skipping_is_on and skip_this_one)) + with tf.variable_scope("losses_avg", reuse=True): + total_loss, ops = 0.0, [] + for loss_key, loss_value in losses_dict.iteritems(): + loss_moving_avg = tf.get_variable("problem_%d/%s_loss" % (n, + loss_key)) + ops.append( + loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1)) + total_loss += loss_value + loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) + ops.append( + loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)) + with tf.variable_scope("train_stats"): # Count steps for this problem. + problem_steps = tf.get_variable( + "problem_%d_steps" % n, initializer=0, trainable=False) + ops.append(problem_steps.assign_add(1)) + with tf.control_dependencies(ops): # Make sure the ops run. + # Ensure the loss is a scalar here. + total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") + return [total_loss] + sharded_logits # Need to flatten for cond later. 
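The cond_on_index call just below (the helper is defined in input_fn_builder earlier in this patch) expands into a chain of tf.cond branches, so all problem subgraphs are traced at construction time while only the selected one executes; conceptually it is just fn(index). A plain-Python analogue of that recursion (illustrative only):

def cond_on_index_py(fn, index, cur_idx, max_idx):
  # Mirrors input_fn_builder.cond_on_index without TensorFlow: try cur_idx,
  # otherwise recurse on cur_idx + 1 until max_idx is reached.
  if cur_idx == max_idx:
    return fn(cur_idx)
  return fn(cur_idx) if index == cur_idx else cond_on_index_py(
      fn, index, cur_idx + 1, max_idx)

assert cond_on_index_py(lambda n: n * n, 2, 0, 3) == 4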
+ + result_list = input_fn_builder.cond_on_index(nth_model, + features["problem_choice"], 0, + len(my_hp.problems) - 1) + + if mode == tf.contrib.learn.ModeKeys.INFER: + # Beam search in sequence model returns both decodes withe key "outputs" + # and scores with they key "scores". If return list is a dict, we expect + # that it will have keys "outputs", a tensor of int32 and scores, a + # tensor of floats. This is useful if we want to return scores from + # estimator.predict + if not isinstance(result_list, dict): + ret = {"outputs": result_list}, None, None + else: + ret = { + "outputs": result_list["outputs"], + "scores": result_list["scores"] + }, None, None + if "inputs" in features: + ret[0]["inputs"] = features["inputs"] + if "infer_targets" in features: + ret[0]["targets"] = features["infer_targets"] + return ret + + sharded_logits, total_loss = result_list[1:], result_list[0] + if mode == tf.contrib.learn.ModeKeys.EVAL: + logits = tf.concat(sharded_logits, 0) + if FLAGS.eval_print: + logits = tf.Print( + logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) + # For evaluation, return the logits layer as our predictions. + run_info["predictions"] = logits + train_op = None + return run_info, total_loss, None + + assert mode == tf.contrib.learn.ModeKeys.TRAIN + + # Some training statistics. + with tf.name_scope("training_stats"): + learning_rate = my_hp.learning_rate * learning_rate_decay() + learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) + tf.summary.scalar("learning_rate", learning_rate) + global_step = tf.to_float(tf.contrib.framework.get_global_step()) + for n in xrange(len(my_hp.problems)): + with tf.variable_scope("losses_avg", reuse=True): + total_loss_var = tf.get_variable("problem_%d/total_loss" % n) + training_loss_var = tf.get_variable("problem_%d/training_loss" % n) + extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) + tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) + tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) + tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + with tf.variable_scope("train_stats", reuse=True): + nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) + tf.summary.scalar("problem_%d_frequency" % n, + tf.to_float(nth_steps) / (global_step + 1.0)) + + # Log trainable weights and add decay. + total_size, weight_decay_loss = 0, 0.0 + all_weights = {v.name: v for v in tf.trainable_variables()} + for v_name in sorted(list(all_weights)): + v = all_weights[v_name] + v_size = int(np.prod(np.array(v.shape.as_list()))) + tf.logging.info("Weight %s\tshape %s\tsize %d", + v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) + total_size += v_size + if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + # Add weight regularization if set and the weight is not a bias (dim>1). + with tf.device(v._ref().device): # pylint: disable=protected-access + v_loss = tf.nn.l2_loss(v) / v_size + weight_decay_loss += v_loss + is_body = len(v_name) > 5 and v_name[:5] == "body/" + if my_hp.weight_noise > 0.0 and is_body: + # Add weight noise if set in my_hp. 
+ with tf.device(v._ref().device): # pylint: disable=protected-access + scale = learning_rate * 0.001 + noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale + noise_op = v.assign_add(noise) + with tf.control_dependencies([noise_op]): + total_loss = tf.identity(total_loss) + tf.logging.info("Total trainable variables size: %d", total_size) + if my_hp.weight_decay > 0.0: + total_loss += weight_decay_loss * my_hp.weight_decay + total_loss = tf.identity(total_loss, name="total_loss") + + # Define the train_op for the TRAIN mode. + opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) + tf.logging.info("Computing gradients for global model_fn.") + opt_summaries = ["learning_rate", "loss"] + if hparams.summarize_grads: + opt_summaries.extend(["gradients", "gradient_norm"]) + train_op = tf.contrib.layers.optimize_loss( + name="training", + loss=total_loss, + global_step=tf.contrib.framework.get_global_step(), + learning_rate=learning_rate, + clip_gradients=my_hp.clip_grad_norm or None, + gradient_noise_scale=hparams.grad_noise_scale or None, + optimizer=opt, + summaries=opt_summaries, + colocate_gradients_with_ops=True) + + # Remove summaries that will fail to run because they are in conditionals. + # TODO(cwhipkey): Test with this code removed, later in 2017. + summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) + for i in range(len(summaries) - 1, -1, -1): + if summaries[i].name.startswith("cond_"): + del summaries[i] + + tf.logging.info("Global model_fn finished.") + return run_info, total_loss, train_op + + return model_fn + + +class _ConditionalOptimizer(tf.train.Optimizer): + """Conditional optimizer.""" + + def __init__(self, optimizer_name, lr, hparams): + if optimizer_name == "Adam": + # We change the default epsilon for Adam and re-scale lr. + # Using LazyAdam as it's much faster for large vocabulary embeddings. + self._opt = tf.contrib.opt.LazyAdamOptimizer( + lr / 500.0, + beta1=hparams.optimizer_adam_beta1, + beta2=hparams.optimizer_adam_beta2, + epsilon=hparams.optimizer_adam_epsilon) + elif optimizer_name == "Momentum": + self._opt = tf.train.MomentumOptimizer( + lr, momentum=hparams.optimizer_momentum_momentum) + elif optimizer_name == "YellowFin": + tf.logging.info("Init YellowFin Optimizer.") + self._opt = yellowfin.YellowFinOptimizer( + learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) + else: + self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) + + def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): + return self._opt.compute_gradients( + loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) + + def apply_gradients(self, gradients, global_step=None, name=None): + return self._opt.apply_gradients( + gradients, global_step=global_step, name=name) + + +def _sqrt_decay(step): + """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" + return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) + + +def _exp_decay_after(step, rate, from_which_step): + """Decay exponentially by rate (per step) starting at from_which_step.""" + return tf.cond( + step < from_which_step, + lambda: tf.constant(1.0), + lambda: rate**(step - from_which_step), + name="exponential_decay_step_cond") + + +def _interactive_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. 
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False if len(inputs.shape) < 3 else True
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    if not input_is_image:
+      # Remove the batch dimension.
+      num_samples = x[0]
+      length = x[2]
+      x = tf.slice(x, [3], tf.to_int32([length]))
+      x = tf.reshape(x, [1, -1, 1, 1])
+      # Transform into a batch of size num_samples to get that many random
+      # decodes.
+      x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1]))
+    else:
+      x = tf.image.resize_images(x, [299, 299])
+      x = tf.reshape(x, [1, 299, 299, -1])
+      x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = input_fn_builder.cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = tf.constant(feature_map["problem_choice"])
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else inputs[1])
+  features["inputs"] = x
+  return features
+
+
+def _decode_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the decode-from-file input format to a features dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs` containing
+      Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    # Add a third, empty dimension.
+    x = tf.expand_dims(x, axis=[2])
+    x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = input_fn_builder.cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = feature_map["problem_choice"]
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else tf.shape(x)[1] + 50)
+  features["inputs"] = x
+  return features
diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py
index 9d5e1e0a6..5402e5bde 100644
--- a/tensor2tensor/utils/registry.py
+++ b/tensor2tensor/utils/registry.py
@@ -24,7 +24,7 @@ class MyModel(T2TModel):
 ```
 
 Access by snake-cased name: `registry.model("my_model")`. If you're using
-`trainer.py`, you can pass on the command-line: `--model=my_model`.
+`t2t_trainer.py`, you can pass on the command-line: `--model=my_model`.
 
 See all the models registered: `registry.list_models()`.
@@ -32,13 +32,13 @@ class MyModel(T2TModel): * Register: `registry.register_hparams` * List: `registry.list_hparams` * Retrieve by name: `registry.hparams` - * Command-line flag in `trainer.py`: `--hparams_set=name` + * Command-line flag in `t2t_trainer.py`: `--hparams_set=name` For hyperparameter ranges: * Register: `registry.register_ranged_hparams` * List: `registry.list_ranged_hparams` * Retrieve by name: `registry.ranged_hparams` - * Command-line flag in `trainer.py`: `--hparams_range=name` + * Command-line flag in `t2t_trainer.py`: `--hparams_range=name` """ from __future__ import absolute_import from __future__ import division diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 5c0240e16..c5f3296ee 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,38 +19,24 @@ from __future__ import division from __future__ import print_function -import copy -import math -import operator -import os import sys # Dependency imports -import numpy as np -import six -# pylint: disable=redefined-builtin -from six.moves import input -from six.moves import xrange -# pylint: enable=redefined-builtin - from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.data_generators import text_encoder from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import data_reader -from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import decoding +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder from tensor2tensor.utils import metrics +from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry -from tensor2tensor.utils import yellowfin import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner from tensorflow.python import debug -from tensorflow.python.ops import init_ops - -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 flags = tf.flags FLAGS = flags.FLAGS @@ -134,16 +120,6 @@ flags.DEFINE_bool("identity_output", False, "To print the output as identity") -def _save_until_eos(hyp): - """Strips everything after the first token, which is normally 1.""" - try: - index = list(hyp).index(text_encoder.EOS_ID) - return hyp[0:index] - except ValueError: - # No EOS_ID: return the array as-is. - return hyp - - def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): """Returns experiment_fn for learn_runner. 
Wraps create_experiment.""" @@ -195,22 +171,22 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): """Constructs and returns Estimator and train/eval input functions.""" tf.logging.info("Creating experiment, storing model files in %s", output_dir) - num_datashards = data_parallelism().n - train_input_fn = get_input_fn( + num_datashards = devices.data_parallelism().n + train_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), num_datashards=num_datashards) - eval_input_fn = get_input_fn( + eval_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), num_datashards=num_datashards) estimator = tf.contrib.learn.Estimator( - model_fn=model_builder(model_name, hparams=hparams), + model_fn=model_builder.build_model_fn(model_name, hparams=hparams), model_dir=output_dir, config=tf.contrib.learn.RunConfig( master=FLAGS.master, @@ -222,7 +198,8 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): estimator.hparams = hparams return estimator, { tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, - tf.contrib.learn.ModeKeys.EVAL: eval_input_fn} + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + } def log_registry(): @@ -297,7 +274,11 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if schedule == "local_run": # Run the local demo. - run_locally(exp_fn(output_dir)) + exp = exp_fn(output_dir) + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") + exp.train_and_evaluate() + decode(exp.estimator) else: # Perform distributed training/evaluation. learn_runner.run( @@ -342,1040 +323,14 @@ def session_config(): return config -def model_builder(model, hparams): - """Returns a function to build the model. - - Args: - model: The name of the model to use. - hparams: The hyperparameters. - - Returns: - A function to build the model's graph. This function is called by - the Estimator object to construct the graph. 
- """ - - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): - """Creates the prediction, loss, and train ops. - - Args: - features: A dictionary of tensors keyed by the feature name. - targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - - Returns: - A tuple consisting of the prediction, loss, and train_op. - """ - # Deep-copy the model hparams between modes to eliminate - # side-effects caused by abuse of the linked problem_hparams - # objects which are used to share modality objects between - # problems. We do not want to share the modality objects between - # modes, since the modality objects may decide to do something - # mode-specific. A better fix would be to stop abusing the - # hparams in this way and instead use a separate dictionary to - # share the modality objects between problems. This dictionary - # could be created once per mode and passed to the constructor of - # t2t_model. 
- my_hp = copy.deepcopy(hparams) - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, my_hp) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, my_hp) - # A dictionary containing: - # - problem_choice: A Tensor containing an integer indicating which problem - # was selected for this run. - # - predictions: A Tensor containing the model's output predictions. - run_info = dict() - run_info["problem_choice"] = features["problem_choice"] - - if targets is not None: - features["targets"] = targets - - dp = data_parallelism() - - # Add input statistics for incoming features. - with tf.name_scope("input_stats"): - for (k, v) in six.iteritems(features): - if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: - tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) - tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) - nonpadding = tf.to_float(tf.not_equal(v, 0)) - tf.summary.scalar("%s_nonpadding_tokens" % k, - tf.reduce_sum(nonpadding)) - tf.summary.scalar("%s_nonpadding_fraction" % k, - tf.reduce_mean(nonpadding)) - - tf.get_variable_scope().set_initializer(initializer()) - train = mode == tf.contrib.learn.ModeKeys.TRAIN - - # Get multi-problem logits and loss based on features["problem_choice"]. - def nth_model(n): - """Build the model for the n-th problem, plus some added variables.""" - model_class = registry.model(model)( - my_hp, - mode, - my_hp.problems[n], - n, - dp, - _ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: - return model_class.infer( - features, - beam_size=FLAGS.decode_beam_size, - top_beams=(FLAGS.decode_beam_size - if FLAGS.decode_return_beams else 1), - last_position_only=FLAGS.decode_use_last_position_only, - alpha=FLAGS.decode_alpha, - decode_length=FLAGS.decode_extra_length) - # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = my_hp.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(my_hp.problems) - skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id - # On worker 0 also build graph for problems <= 1. - # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. - skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, losses_dict = model_class.model_fn( - features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): - total_loss, ops = 0.0, [] - for loss_key, loss_value in losses_dict.iteritems(): - loss_moving_avg = tf.get_variable("problem_%d/%s_loss" - % (n, loss_key)) - ops.append(loss_moving_avg.assign( - loss_moving_avg * 0.9 + loss_value * 0.1)) - total_loss += loss_value - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - ops.append(loss_moving_avg.assign( - loss_moving_avg * 0.9 + total_loss * 0.1)) - with tf.variable_scope("train_stats"): # Count steps for this problem. - problem_steps = tf.get_variable( - "problem_%d_steps" % n, initializer=0, trainable=False) - ops.append(problem_steps.assign_add(1)) - with tf.control_dependencies(ops): # Make sure the ops run. - # Ensure the loss is a scalar here. - total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") - return [total_loss] + sharded_logits # Need to flatten for cond later. 
- - result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(my_hp.problems) - 1) - - if mode == tf.contrib.learn.ModeKeys.INFER: - # Beam search in sequence model returns both decodes withe key "outputs" - # and scores with they key "scores". If return list is a dict, we expect - # that it will have keys "outputs", a tensor of int32 and scores, a - # tensor of floats. This is useful if we want to return scores from - # estimator.predict - if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None - else: - ret = { - "outputs": result_list["outputs"], - "scores": result_list["scores"] - }, None, None - if "inputs" in features: - ret[0]["inputs"] = features["inputs"] - if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret - - sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: - logits = tf.concat(sharded_logits, 0) - if FLAGS.eval_print: - logits = tf.Print( - logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) - # For evaluation, return the logits layer as our predictions. - run_info["predictions"] = logits - train_op = None - return run_info, total_loss, None - - assert mode == tf.contrib.learn.ModeKeys.TRAIN - - # Some training statistics. - with tf.name_scope("training_stats"): - learning_rate = my_hp.learning_rate * learning_rate_decay() - learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) - tf.summary.scalar("learning_rate", learning_rate) - global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(my_hp.problems)): - with tf.variable_scope("losses_avg", reuse=True): - total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) - with tf.variable_scope("train_stats", reuse=True): - nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) - tf.summary.scalar("problem_%d_frequency" % n, - tf.to_float(nth_steps) / (global_step + 1.0)) - - # Log trainable weights and add decay. - total_size, weight_decay_loss = 0, 0.0 - all_weights = {v.name: v for v in tf.trainable_variables()} - for v_name in sorted(list(all_weights)): - v = all_weights[v_name] - v_size = int(np.prod(np.array(v.shape.as_list()))) - tf.logging.info("Weight %s\tshape %s\tsize %d", - v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) - total_size += v_size - if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: - # Add weight regularization if set and the weight is not a bias (dim>1). - with tf.device(v._ref().device): # pylint: disable=protected-access - v_loss = tf.nn.l2_loss(v) / v_size - weight_decay_loss += v_loss - is_body = len(v_name) > 5 and v_name[:5] == "body/" - if my_hp.weight_noise > 0.0 and is_body: - # Add weight noise if set in my_hp. 
- with tf.device(v._ref().device): # pylint: disable=protected-access - scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale - noise_op = v.assign_add(noise) - with tf.control_dependencies([noise_op]): - total_loss = tf.identity(total_loss) - tf.logging.info("Total trainable variables size: %d", total_size) - if my_hp.weight_decay > 0.0: - total_loss += weight_decay_loss * my_hp.weight_decay - total_loss = tf.identity(total_loss, name="total_loss") - - # Define the train_op for the TRAIN mode. - opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) - tf.logging.info("Computing gradients for global model_fn.") - opt_summaries = ["learning_rate", "loss"] - if hparams.summarize_grads: - opt_summaries.extend(["gradients", "gradient_norm"]) - train_op = tf.contrib.layers.optimize_loss( - name="training", - loss=total_loss, - global_step=tf.contrib.framework.get_global_step(), - learning_rate=learning_rate, - clip_gradients=my_hp.clip_grad_norm or None, - gradient_noise_scale=hparams.grad_noise_scale or None, - optimizer=opt, - summaries=opt_summaries, - colocate_gradients_with_ops=True) - - # Remove summaries that will fail to run because they are in conditionals. - # TODO(cwhipkey): Test with this code removed, later in 2017. - summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries) - 1, -1, -1): - if summaries[i].name.startswith("cond_"): - del summaries[i] - - tf.logging.info("Global model_fn finished.") - return run_info, total_loss, train_op - - return model_fn - - -def run_locally(exp): - """Runs an Experiment locally - trains, evaluates, and decodes. - - Args: - exp: Experiment. - """ - if exp.train_steps > 0 or exp.eval_steps > 0: - tf.logging.info("Performing local training and evaluation.") - exp.train_and_evaluate() - decode(exp.estimator) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def decode(estimator): if FLAGS.decode_interactive: - decode_interactively(estimator) + decoding.decode_interactively(estimator) elif FLAGS.decode_from_file is not None: - decode_from_file(estimator, FLAGS.decode_from_file) + decoding.decode_from_file(estimator, FLAGS.decode_from_file) elif FLAGS.decode_from_dataset: - decode_from_dataset(estimator) - - -def decode_from_dataset(estimator): - hparams = estimator.hparams - for i, problem in enumerate(FLAGS.problems.split("-")): - inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) - targets_vocab = hparams.problems[i].vocabulary["targets"] - tf.logging.info("Performing local inference.") - infer_problems_data = get_data_filepatterns(hparams.data_dir, - tf.contrib.learn.ModeKeys.INFER) - - infer_input_fn = get_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, - hparams=hparams, - data_file_patterns=infer_problems_data, - num_datashards=data_parallelism().n, - fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) - - def log_fn(inputs, - targets, - outputs, - problem, - j, - inputs_vocab=inputs_vocab, - targets_vocab=targets_vocab): - """Log inference results.""" - if "image" in problem and FLAGS.decode_save_images: - save_path = os.path.join(estimator.model_dir, - "%s_prediction_%d.jpg" % (problem, j)) - show_and_save_image(inputs / 255., save_path) - elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = 
targets_vocab.decode(_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) - - if FLAGS.decode_to_file: - output_filepath = FLAGS.decode_to_file + ".outputs." + problem - output_file = tf.gfile.Open(output_filepath, "a") - output_file.write(decoded_outputs + "\n") - target_filepath = FLAGS.decode_to_file + ".targets." + problem - target_file = tf.gfile.Open(target_filepath, "a") - target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. - inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result - if FLAGS.decode_return_beams: - output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) - else: - log_fn(inputs, targets, outputs, problem, j) - - -def decode_from_file(estimator, filename): - """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams - problem_id = FLAGS.decode_problem_id - inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - tf.logging.info("Performing decoding from a file.") - sorted_inputs, sorted_keys = _get_sorted_inputs(filename) - num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - _save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) - - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) - - # Reversing the decoded inputs and outputs because they were reversed in - # _decode_batch_input_fn - sorted_inputs.reverse() - decodes.reverse() - # Dumping inputs and outputs to file filename.decodes in - # format result\tinput in the same order as original inputs - if FLAGS.decode_to_file: - output_filename = FLAGS.decode_to_file - else: - output_filename = filename - if FLAGS.decode_shards > 1: - base_filename = output_filename + ("%.2d" % FLAGS.worker_id) - else: - base_filename = output_filename - decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set - + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + - str(FLAGS.decode_alpha) + ".decodes") - tf.logging.info("Writing decodes into %s" % decode_filename) - outfile = tf.gfile.Open(decode_filename, "w") - for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) - - -def decode_interactively(estimator): - hparams = estimator.hparams - - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - tf.logging.info(beam_string) - else: - if FLAGS.identity_output: - tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) - else: - tf.logging.info(targets_vocab.decode(_save_until_eos( - result["outputs"].flatten()))) - - -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, - vocabulary): - tf.logging.info(" batch %d" % num_decode_batches) - # First reverse all the input sentences so that if you're going to get OOMs, - # you'll see it in the first batch - sorted_inputs.reverse() - for b in range(num_decode_batches): - tf.logging.info("Decoding batch %d" % b) - batch_length = 0 - batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( - b + 1) * FLAGS.decode_batch_size]: - input_ids = vocabulary.encode(inputs) - if FLAGS.decode_max_input_size > 0: - # Subtract 1 for the EOS_ID. - input_ids = input_ids[:FLAGS.decode_max_input_size - 1] - input_ids.append(text_encoder.EOS_ID) - batch_inputs.append(input_ids) - if len(input_ids) > batch_length: - batch_length = len(input_ids) - final_batch_inputs = [] - for input_ids in batch_inputs: - assert len(input_ids) <= batch_length - x = input_ids + [0] * (batch_length - len(input_ids)) - final_batch_inputs.append(x) - yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) - } - - -def get_data_filepatterns(data_dir, mode): - return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) - - -def _cond_on_index(fn, index_tensor, cur_idx, max_idx): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - return tf.cond( - tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), - lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) - - -def _interactive_input_fn(hparams): - """Generator that reads from the terminal and yields "interactive inputs". - - Due to temporary limitations in tf.learn, if we don't want to reload the - whole graph, then we are stuck encoding all of the input as one fixed-size - numpy array. - - We yield int64 arrays with shape [const_array_size]. The format is: - [num_samples, decode_length, len(input ids), , ] - - Args: - hparams: model hparams - Yields: - numpy arrays - - Raises: - Exception: when `input_type` is invalid. 
- """ - num_samples = 3 - decode_length = 100 - input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - # This should be longer than the longest input. - const_array_size = 10000 - while True: - prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image' or 'label')\n" - " pr= (set the problem number)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" - " <%s> (decode)\n" - " q (quit)\n" - ">" % (num_samples, decode_length, "source_string" - if has_input else "target_prefix")) - input_string = input(prompt) - if input_string == "q": - return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id - elif input_string[:3] == "ns=": - num_samples = int(input_string[3:]) - elif input_string[:3] == "dl=": - decode_length = int(input_string[3:]) - elif input_string[:3] == "it=": - input_type = input_string[3:] - else: - if input_type == "text": - input_ids = vocabulary.encode(input_string) - if has_input: - input_ids.append(text_encoder.EOS_ID) - x = [num_samples, decode_length, len(input_ids)] + input_ids - assert len(x) < const_array_size - x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - elif input_type == "image": - input_path = input_string - img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) - } - elif input_type == "label": - input_ids = [int(input_string)] - x = [num_samples, decode_length, len(input_ids)] + input_ids - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - else: - raise Exception("Unsupported input type.") - - -def read_image(path): - try: - import matplotlib.image as im # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning( - "Reading an image requires matplotlib to be installed: %s", e) - raise NotImplementedError("Image reading not implemented.") - return im.imread(path) - - -def show_and_save_image(img, save_path): - try: - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning("Showing and saving an image requires matplotlib to be " - "installed: %s", e) - raise NotImplementedError("Image display and save not implemented.") - plt.imshow(img) - plt.savefig(save_path) - - -def _get_sorted_inputs(filename): - """Returning inputs sorted according to length. - - Args: - filename: path to file with inputs, 1 per line. - - Returns: - a sorted list of inputs - - """ - tf.logging.info("Getting sorted inputs") - # read file and sort inputs according them according to input length. 
- if FLAGS.decode_shards > 1: - decode_filename = filename + ("%.2d" % FLAGS.worker_id) - else: - decode_filename = filename - inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] - input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] - sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) - # We'll need the keys to rearrange the inputs back into their original order - sorted_keys = {} - sorted_inputs = [] - for i, (index, _) in enumerate(sorted_input_lens): - sorted_inputs.append(inputs[index]) - sorted_keys[index] = i - return sorted_inputs, sorted_keys - - -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features - - -def get_input_fn(mode, - hparams, - data_file_patterns=None, - num_datashards=None, - fixed_problem=None): - """Provides input to the graph, either from disk or via a placeholder. 
- - This function produces an input function that will feed data into - the network. There are two modes of operation: - - 1. If data_file_pattern and all subsequent arguments are None, then - it creates a placeholder for a serialized tf.Example proto. - 2. If data_file_pattern is defined, it will read the data from the - files at the given location. Use this mode for training, - evaluation, and testing prediction. - - Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - hparams: HParams object. - data_file_patterns: The list of file patterns to use to read in data. Set to - `None` if you want to create a placeholder for the input data. The - `problems` flag is a list of problem names joined by the `-` character. - The flag's string is then split along the `-` and each problem gets its - own example queue. - num_datashards: An integer. - fixed_problem: An integer indicating the problem to fetch data for, or None - if the input is to be randomly selected. - - Returns: - A function that returns a dictionary of features and the target labels. - """ - - def input_fn(): - """Supplies input to our model. - - This function supplies input to our model, where this input is a - function of the mode. For example, we supply different data if - we're performing training versus evaluation. - - Returns: - A tuple consisting of 1) a dictionary of tensors whose keys are - the feature names, and 2) a tensor of target labels if the mode - is not INFER (and None, otherwise). - - Raises: - ValueError: if one of the parameters has an unsupported value. - """ - problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_reader"): - for n in xrange(problem_count): - if fixed_problem is not None and n != fixed_problem: - continue - problem_instance = hparams.problem_instances[n] - p_hparams = hparams.problems[n] - with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input reading on CPU - capacity = p_hparams.max_expected_batch_size_per_shard - capacity *= num_datashards - examples = data_reader.input_pipeline(problem_instance, - data_file_patterns[n], - capacity, mode, hparams) - feature_map = data_reader.batch_examples( - examples, - data_reader.hparams_to_batching_scheme( - hparams, - shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN - or hparams.eval_drop_long_sequences), - length_multiplier=(p_hparams.batch_size_multiplier))) - - # Reverse inputs and targets features if the problem was reversed. - if problem_instance is not None: - problem_instance.maybe_reverse_features(feature_map) - problem_instance.maybe_copy_features(feature_map) - else: - if p_hparams.was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - # Use the inputs as the targets if the problem is a copy problem. - if p_hparams.was_copy: - feature_map["targets"] = feature_map["inputs"] - - # Ensure inputs and targets are proper rank. - while len(feature_map["inputs"].get_shape()) != 4: - feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) - while len(feature_map["targets"].get_shape()) != 4: - feature_map["targets"] = tf.expand_dims( - feature_map["targets"], axis=-1) - - batches.append( - (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id))) - - # We choose which problem to process. 
- loss_moving_avgs = [] # Need loss moving averages for that. - for n in xrange(problem_count): - with tf.variable_scope("losses_avg"): - loss_moving_avgs.append( - tf.get_variable( - "problem_%d/total_loss" % n, initializer=100.0, - trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) - if fixed_problem is None: - if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): - problem_choice = tf.random_uniform( - [], maxval=problem_count, dtype=tf.int32) - elif hparams.problem_choice == "adaptive": - loss_moving_avgs = tf.stack(loss_moving_avgs) - problem_choice = tf.multinomial( - tf.reshape(loss_moving_avgs, [1, -1]), 1) - problem_choice = tf.to_int32(tf.squeeze(problem_choice)) - elif hparams.problem_choice == "distributed": - assert FLAGS.worker_replicas >= problem_count - assert FLAGS.worker_replicas % problem_count == 0 - problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) - else: - raise ValueError( - "Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % hparams.problem_choice) - - # Inputs and targets conditional on problem_choice. - rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( - lambda n: batches[n], problem_choice, 0, problem_count - 1) - else: - problem_choice = tf.constant(fixed_problem) - # Take the only constructed batch, which is the fixed_problem. - rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] - - # Set shapes so the ranks are clear. - rand_inputs.set_shape([None, None, None, None]) - rand_target.set_shape([None, None, None, None]) - choice.set_shape([]) - inp_id.set_shape([]) - tgt_id.set_shape([]) - # Forced shape obfuscation is necessary for inference. - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - - # Final feature map. - rand_feature_map = { - "inputs": rand_inputs, - "problem_choice": choice, - "input_space_id": inp_id, - "target_space_id": tgt_id - } - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_feature_map["infer_targets"] = rand_target - rand_target = None - return rand_feature_map, rand_target - - return input_fn - - -class _ConditionalOptimizer(tf.train.Optimizer): - """Conditional optimizer.""" - - def __init__(self, optimizer_name, lr, hparams): - if optimizer_name == "Adam": - # We change the default epsilon for Adam and re-scale lr. - # Using LazyAdam as it's much faster for large vocabulary embeddings. 
- self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, - beta1=hparams.optimizer_adam_beta1, - beta2=hparams.optimizer_adam_beta2, - epsilon=hparams.optimizer_adam_epsilon) - elif optimizer_name == "Momentum": - self._opt = tf.train.MomentumOptimizer( - lr, momentum=hparams.optimizer_momentum_momentum) - elif optimizer_name == "YellowFin": - tf.logging.info("Init YellowFin Optimizer.") - self._opt = yellowfin.YellowFinOptimizer( - learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) - else: - self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) - - def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): - return self._opt.compute_gradients( - loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) - - def apply_gradients(self, gradients, global_step=None, name=None): - return self._opt.apply_gradients( - gradients, global_step=global_step, name=name) - - -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def _ps_replicas(all_workers=False): - if all_workers: - return list(range(FLAGS.ps_replicas)) - # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. - num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas - return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] - - -def _gpu_order(num_gpus): - if FLAGS.gpu_order: - ret = [int(s) for s in FLAGS.gpu_order.split(" ")] - if len(ret) == num_gpus: - return ret - return list(range(num_gpus)) - - -def _ps_gpus(all_workers=False): - ps_gpus = [] - for d in _ps_replicas(all_workers=all_workers): - ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) - return ps_gpus - - -def _ps_devices(all_workers=False): - """List of ps devices (where to put the experts). - - Args: - all_workers: whether the list is for all async workers or just this one. - - Returns: - a list of device names - """ - if FLAGS.ps_replicas > 0: - if FLAGS.ps_gpu > 0: - return [ - FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) - for (d, gpu) in _ps_gpus(all_workers=all_workers) - ] - else: - return [ - FLAGS.ps_job + "/task:%d" % d - for d in _ps_replicas(all_workers=all_workers) - ] - else: - if FLAGS.worker_gpu > 0: - return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - else: - return [""] - - -def data_parallelism(all_workers=False): - """Over which devices do we split each training batch. - - In old-fashioned async mode, we split the batch over all GPUs on the - current worker. - - In sync mode, we split the batch over all the parameter server GPUs. - - This function returns an expert_utils.Parallelism object, which can be used - to build the model. It is configured in a way that any variables created - by `tf.get_variable` will be assigned to the parameter servers and shared - between datashards. - - Args: - all_workers: whether the devices are all async workers or just this one. - - Returns: - a expert_utils.Parallelism. 
- """ - - def _replica_device_setter(worker_device): - if FLAGS.ps_replicas == 0: - return worker_device - return tf.train.replica_device_setter( - worker_device=worker_device, - ps_tasks=FLAGS.ps_replicas, - ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) - - if FLAGS.schedule == "local_run": - assert not FLAGS.sync - datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: - datashard_devices += ["cpu:0"] - caching_devices = None - elif FLAGS.sync: - assert FLAGS.ps_replicas > 0 - datashard_devices = [ - _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) - ] - if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: - caching_devices = [ - FLAGS.ps_job + "/task:%d/cpu:0" % d - for (d, _) in _ps_gpus(all_workers=all_workers) - ] - else: - caching_devices = None - else: - # old fashioned async - compute on worker - if FLAGS.worker_gpu > 1: - datashard_devices = [ - _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) - for d in _gpu_order(FLAGS.worker_gpu) - ] - caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu - else: - datashard_devices = [_replica_device_setter(FLAGS.worker_job)] - caching_devices = None - tf.logging.info("datashard_devices: %s", datashard_devices) - tf.logging.info("caching_devices: %s", caching_devices) - return eu.Parallelism( - datashard_devices, - reuse=True, - caching_devices=caching_devices, - daisy_chain_variables=FLAGS.daisy_chain_variables) + decoding.decode_from_dataset(estimator) diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 562279623..8a71afe68 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -25,7 +25,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.models import transformer from tensor2tensor.utils import registry -from tensor2tensor.utils import trainer_utils as utils # pylint: disable=unused-import +from tensor2tensor.utils import trainer_utils import tensorflow as tf @@ -76,7 +76,7 @@ def testHParamsImported(self): def testSingleStep(self): model_name = "transformer" FLAGS.hparams_set = "transformer_test" - exp = utils.create_experiment( + exp = trainer_utils.create_experiment( output_dir=tf.test.get_temp_dir(), data_dir=TrainerUtilsTest.data_dir, model_name=model_name, From fbe8c61a1aaea95b9b32fb56d49c1e790660ea09 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 17:13:21 -0700 Subject: [PATCH 5/6] v1.1.5 PiperOrigin-RevId: 164061568 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd8e77a46..38b2fcc48 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.4', + version='1.1.5', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From eee190b3b770d917931b3ccb3972109b27b48f6d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 17:25:40 -0700 Subject: [PATCH 6/6] Add layers init and update gitignore for nose --- .gitignore | 1 + tensor2tensor/layers/__init__.py | 0 2 files changed, 1 insertion(+) create mode 100644 tensor2tensor/layers/__init__.py diff --git a/.gitignore b/.gitignore index c9dd3db88..362753caa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ _pycache__/ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info +/*.egg # PyPI distribution artifacts. 
build/ diff --git a/tensor2tensor/layers/__init__.py b/tensor2tensor/layers/__init__.py new file mode 100644 index 000000000..e69de29bb