From 0bdfcbb57fb0f22e44d3f852889a94716009fffc Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 18:49:03 -0700 Subject: [PATCH 1/6] Use get_residual_fn to get the residual_fn in the transformer. PiperOrigin-RevId: 163919630 --- README.md | 2 +- .../generator.py} | 1 - tensor2tensor/models/transformer.py | 18 ++++++++++++------ tensor2tensor/{bin/t2t-trainer => trainer.py} | 1 - 4 files changed, 13 insertions(+), 9 deletions(-) rename tensor2tensor/{bin/t2t-datagen => data_generators/generator.py} (99%) rename tensor2tensor/{bin/t2t-trainer => trainer.py} (99%) diff --git a/README.md b/README.md index bb0f6f534..5bb1c31a3 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ python -c "from tensor2tensor.models.transformer import Transformer" **Datasets** are all standardized on `TFRecord` files with `tensorflow.Example` protocol buffers. All datasets are registered and generated with the [data -generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-datagen) +generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/generator.py) and many common sequence datasets are already available for generation and use. ### Problems and Modalities diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/data_generators/generator.py similarity index 99% rename from tensor2tensor/bin/t2t-datagen rename to tensor2tensor/data_generators/generator.py index 837d6d203..bc79f2384 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/data_generators/generator.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 2320a57f1..a2b55febf 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -56,12 +56,7 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) - def residual_fn(x, y): - return common_layers.residual_fn(x, y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) + residual_fn = get_residual_fn(hparams) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) @@ -76,6 +71,17 @@ def residual_fn(x, y): return decoder_output +def get_residual_fn(hparams): + """Get residual_fn.""" + def residual_fn(x, y): + return common_layers.residual_fn(x, y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) + return residual_fn + + def transformer_prepare_encoder(inputs, target_space, hparams): """Prepare one shard of the model for the encoder. diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/trainer.py similarity index 99% rename from tensor2tensor/bin/t2t-trainer rename to tensor2tensor/trainer.py index 13dd7d355..41c9cd33b 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/trainer.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. 
# From 4390618e692f790871019aadc0371efcd76a89f4 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 10:49:20 -0700 Subject: [PATCH 2/6] Add requests dependency PiperOrigin-RevId: 164005758 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 6f509d03e..fd8e77a46 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ ], install_requires=[ 'numpy', + 'requests', 'sympy', 'six', ], From 9394d0e3f2ecc0f7fa14d59dec17b0da3cff9a21 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 2 Aug 2017 11:05:05 -0700 Subject: [PATCH 3/6] Use ModeKeys enum consistently in trainer_utils instead of string literals. PiperOrigin-RevId: 164008619 --- tensor2tensor/utils/trainer_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 33053806d..5c0240e16 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -181,8 +181,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns["train"], - eval_input_fn=input_fns["eval"], + train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, @@ -220,7 +220,9 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): keep_checkpoint_max=FLAGS.keep_checkpoint_max)) # Store the hparams in the estimator as well estimator.hparams = hparams - return estimator, {"train": train_input_fn, "eval": eval_input_fn} + return estimator, { + tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn} def log_registry(): From f6799b9515e0e214d2d4295f4e4cf94cf27cf333 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 16:45:57 -0700 Subject: [PATCH 4/6] File/code moves PiperOrigin-RevId: 164058229 --- README.md | 2 +- .../generator.py => bin/t2t-datagen} | 26 +- tensor2tensor/{trainer.py => bin/t2t-trainer} | 1 + tensor2tensor/data_generators/all_problems.py | 1 + tensor2tensor/data_generators/image.py | 45 +- .../data_generators/problem_hparams.py | 88 +- .../{models => layers}/common_attention.py | 131 +- .../common_attention_test.py | 32 +- .../{models => layers}/common_hparams.py | 10 +- .../{models => layers}/common_layers.py | 15 +- .../{models => layers}/common_layers_test.py | 9 +- .../{models => layers}/modalities.py | 2 +- .../{models => layers}/modalities_test.py | 2 +- tensor2tensor/models/attention_lm.py | 10 +- tensor2tensor/models/attention_lm_moe.py | 43 +- tensor2tensor/models/bluenet.py | 128 +- tensor2tensor/models/bytenet.py | 13 +- tensor2tensor/models/gene_expression.py | 4 +- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/long_answer.py | 53 +- tensor2tensor/models/lstm.py | 48 +- tensor2tensor/models/lstm_test.py | 6 +- tensor2tensor/models/models.py | 2 +- tensor2tensor/models/multimodel.py | 99 +- tensor2tensor/models/neural_gpu.py | 5 +- tensor2tensor/models/neural_gpu_test.py | 6 +- tensor2tensor/models/shake_shake.py | 4 +- tensor2tensor/models/slicenet.py | 19 +- tensor2tensor/models/slicenet_test.py | 6 +- tensor2tensor/models/transformer.py | 37 +- .../models/transformer_alternative.py | 39 +- tensor2tensor/models/xception.py | 4 +- tensor2tensor/utils/decoding.py | 371 ++++++ tensor2tensor/utils/devices.py | 147 
+++ tensor2tensor/utils/input_fn_builder.py | 200 +++ tensor2tensor/utils/metrics.py | 2 +- tensor2tensor/utils/modality.py | 2 +- tensor2tensor/utils/model_builder.py | 451 +++++++ tensor2tensor/utils/registry.py | 6 +- tensor2tensor/utils/trainer_utils.py | 1085 +---------------- tensor2tensor/utils/trainer_utils_test.py | 4 +- 41 files changed, 1648 insertions(+), 1512 deletions(-) rename tensor2tensor/{data_generators/generator.py => bin/t2t-datagen} (93%) rename tensor2tensor/{trainer.py => bin/t2t-trainer} (99%) rename tensor2tensor/{models => layers}/common_attention.py (89%) rename tensor2tensor/{models => layers}/common_attention_test.py (77%) rename tensor2tensor/{models => layers}/common_hparams.py (97%) rename tensor2tensor/{models => layers}/common_layers.py (99%) rename tensor2tensor/{models => layers}/common_layers_test.py (98%) rename tensor2tensor/{models => layers}/modalities.py (99%) rename tensor2tensor/{models => layers}/modalities_test.py (98%) create mode 100644 tensor2tensor/utils/decoding.py create mode 100644 tensor2tensor/utils/devices.py create mode 100644 tensor2tensor/utils/input_fn_builder.py create mode 100644 tensor2tensor/utils/model_builder.py diff --git a/README.md b/README.md index 5bb1c31a3..bb0f6f534 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ python -c "from tensor2tensor.models.transformer import Transformer" **Datasets** are all standardized on `TFRecord` files with `tensorflow.Example` protocol buffers. All datasets are registered and generated with the [data -generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/generator.py) +generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-datagen) and many common sequence datasets are already available for generation and use. ### Problems and Modalities diff --git a/tensor2tensor/data_generators/generator.py b/tensor2tensor/bin/t2t-datagen similarity index 93% rename from tensor2tensor/data_generators/generator.py rename to tensor2tensor/bin/t2t-datagen index bc79f2384..39453dbee 100644 --- a/tensor2tensor/data_generators/generator.py +++ b/tensor2tensor/bin/t2t-datagen @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # @@ -15,14 +16,15 @@ """Produces the training and dev data for --problem into --data_dir. -generator.py produces sharded and shuffled TFRecord files of tensorflow.Example -protocol buffers for a variety of datasets registered in this file. +Produces sharded and shuffled TFRecord files of tensorflow.Example protocol +buffers for a variety of registered datasets. -All datasets are registered in _SUPPORTED_PROBLEM_GENERATORS. Each entry maps a -string name (selectable on the command-line with --problem) to a function that -takes 2 arguments - input_directory and mode (one of "train" or "dev") - and -yields for each training example a dictionary mapping string feature names to -lists of {string, int, float}. The generator will be run once for each mode. +All Problems are registered with @registry.register_problem or are in +_SUPPORTED_PROBLEM_GENERATORS in this file. Each entry maps a string name +(selectable on the command-line with --problem) to a function that takes 2 +arguments - input_directory and mode (one of "train" or "dev") - and yields for +each training example a dictionary mapping string feature names to lists of +{string, int, float}. The generator will be run once for each mode. 
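As an illustrative sketch of the generator contract described above (a hypothetical `my_copy_problem`, not taken from this patch): a registered generator takes an input directory and a mode ("train" or "dev") and yields, per example, a dict mapping feature names to lists of ints, floats, or strings.

def my_copy_problem(input_directory, mode):
  """Sketch only: yields one dict per example, feature name -> list of ints."""
  assert mode in ("train", "dev")
  del input_directory  # A real generator would read raw data files from here.
  for i in range(3):
    tokens = [i + 1, i + 2, i + 3]
    # Each yielded dict is later serialized into a tensorflow.Example proto and
    # written to sharded, shuffled TFRecord files (see generate_files below).
    yield {"inputs": tokens, "targets": tokens}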
""" from __future__ import absolute_import from __future__ import division @@ -228,8 +230,7 @@ def generate_data_for_problem(problem): num_shards = FLAGS.num_shards or 10 tf.logging.info("Generating training data for %s.", problem) train_output_files = generator_utils.train_data_filenames( - problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, - num_shards) + problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards) generator_utils.generate_files(training_gen(), train_output_files, FLAGS.max_cases) tf.logging.info("Generating development data for %s.", problem) @@ -249,9 +250,10 @@ def generate_data_for_registered_problem(problem_name): raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id - problem.generate_data(os.path.expanduser(FLAGS.data_dir), - os.path.expanduser(FLAGS.tmp_dir), - task_id=task_id) + problem.generate_data( + os.path.expanduser(FLAGS.data_dir), + os.path.expanduser(FLAGS.tmp_dir), + task_id=task_id) if __name__ == "__main__": diff --git a/tensor2tensor/trainer.py b/tensor2tensor/bin/t2t-trainer similarity index 99% rename from tensor2tensor/trainer.py rename to tensor2tensor/bin/t2t-trainer index 41c9cd33b..13dd7d355 100644 --- a/tensor2tensor/trainer.py +++ b/tensor2tensor/bin/t2t-trainer @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 6830cf0bf..9be133a61 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -30,6 +30,7 @@ from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import wsj_parsing + # Problem modules that require optional dependencies # pylint: disable=g-import-not-at-top try: diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index a2e328f00..d70d9339e 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -36,7 +36,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry import tensorflow as tf @@ -76,10 +76,11 @@ class ImageFSNS(ImageProblem): def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") - fsns_urls = generator_utils.maybe_download( - tmp_dir, "fsns_urls.txt", list_url) - fsns_files = [f.strip() for f in open(fsns_urls, "r") - if f.startswith("http://")] + fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt", + list_url) + fsns_files = [ + f.strip() for f in open(fsns_urls, "r") if f.startswith("http://") + ] for url in fsns_files: if "/train/train" in url: generator_utils.maybe_download( @@ -88,8 +89,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.maybe_download( data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) elif "charset" in url: - generator_utils.maybe_download( - data_dir, "charset_size134.txt", url) + generator_utils.maybe_download(data_dir, "charset_size134.txt", url) def feature_encoders(self, data_dir): # This vocab file must be present within the data 
directory. @@ -111,8 +111,8 @@ def hparams(self, defaults, model_hparams): def example_reading_spec(self): label_key = "image/unpadded_label" - return super(ImageFSNS, self).example_reading_spec(self, - label_key=label_key) + return super(ImageFSNS, self).example_reading_spec( + self, label_key=label_key) class Image2ClassProblem(ImageProblem): @@ -161,6 +161,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): def imagenet_preprocess_examples(examples, mode): """Preprocessing used for Imagenet and similar problems.""" + def preprocess(img): img = tf.image.resize_images(img, [360, 360]) img = common_layers.image_augmentation(tf.to_float(img) / 255.) @@ -215,8 +216,8 @@ def is_small(self): def preprocess_examples(self, examples, mode): examples = imagenet_preprocess_examples(examples, mode) - examples["inputs"] = tf.to_int64(tf.image.resize_images( - examples["inputs"], [32, 32])) + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32])) def image_generator(images, labels): @@ -665,12 +666,20 @@ def generator(self, data_dir, tmp_dir, is_training): vocab_filename = "vocab.endefr.%d" % self.targeted_vocab_size if is_training: return mscoco_generator( - data_dir, tmp_dir, True, 80000, - vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + data_dir, + tmp_dir, + True, + 80000, + vocab_filename=vocab_filename, + vocab_size=self.targeted_vocab_size) else: return mscoco_generator( - data_dir, tmp_dir, False, 40000, - vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + data_dir, + tmp_dir, + False, + 40000, + vocab_filename=vocab_filename, + vocab_size=self.targeted_vocab_size) @registry.register_problem @@ -690,8 +699,8 @@ def targeted_vocab_size(self): def _get_celeba(directory): """Download and extract CELEBA to directory unless it is there.""" # path = os.path.join(directory, _CELEBA_NAME) - path = generator_utils.maybe_download_from_drive(directory, - _CELEBA_NAME, _CELEBA_URL) + path = generator_utils.maybe_download_from_drive(directory, _CELEBA_NAME, + _CELEBA_URL) if not tf.gfile.Exists(path): zipfile.ZipFile(path + ".zip", "r").extractall(directory) @@ -711,7 +720,7 @@ def celeba_generator(tmp_dir, how_many, start_from=0): """ _get_celeba(tmp_dir) image_files = tf.gfile.Glob(os.path.join(tmp_dir, _CELEBA_NAME) + "/*.jpg") - for filename in image_files[start_from:start_from+how_many]: + for filename in image_files[start_from:start_from + how_many]: with tf.gfile.Open(filename, "r") as f: encoded_image_data = f.read() yield { diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 607078d2f..d0577db52 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -25,7 +25,7 @@ # Dependency imports from tensor2tensor.data_generators import text_encoder -from tensor2tensor.models import modalities # pylint: disable=unused-import +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.utils import registry import tensorflow as tf @@ -202,8 +202,7 @@ def default_problem_hparams(): # the targets. For instance `problem_copy` will copy the inputs, but # `problem_rev_copy` will copy the targets. 
was_reversed=False, - was_copy=False, - ) + was_copy=False,) def test_problem_hparams(unused_model_hparams, input_vocab_size, @@ -327,9 +326,7 @@ def lm1b_32k(model_hparams): encoder = text_encoder.SubwordTextEncoder( os.path.join(model_hparams.data_dir, "lm1b_32k.subword_text_encoder")) p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) - p.vocabulary = { - "targets": encoder - } + p.vocabulary = {"targets": encoder} p.target_space_id = 3 return p @@ -343,9 +340,7 @@ def lm1b_characters(unused_model_hparams): p.input_modality = {} encoder = text_encoder.ByteTextEncoder() p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) - p.vocabulary = { - "targets": encoder - } + p.vocabulary = {"targets": encoder} p.target_space_id = 2 return p @@ -358,10 +353,7 @@ def wiki_32k(model_hparams): modality_spec = (registry.Modalities.SYMBOL, encoder.vocab_size) p.input_modality = {"inputs": modality_spec} p.target_modality = modality_spec - p.vocabulary = { - "inputs": encoder, - "targets": encoder - } + p.vocabulary = {"inputs": encoder, "targets": encoder} p.target_space_id = 3 return p @@ -430,9 +422,7 @@ def wmt_parsing_tokens(model_hparams, wrong_vocab_size): return p -def wsj_parsing_tokens(model_hparams, - prefix, - wrong_source_vocab_size, +def wsj_parsing_tokens(model_hparams, prefix, wrong_source_vocab_size, wrong_target_vocab_size): """English to parse tree translation benchmark. @@ -487,11 +477,9 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): p = default_problem_hparams() # This vocab file must be present within the data directory. source_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_source.vocab.%d" % wrong_source_vocab_size) - target_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_target.vocab.256") + model_hparams.data_dir, "ice_source.vocab.%d" % wrong_source_vocab_size) + target_vocab_filename = os.path.join(model_hparams.data_dir, + "ice_target.vocab.256") source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) p.input_modality = { @@ -502,7 +490,7 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): "inputs": source_subtokenizer, "targets": target_subtokenizer, } - p.input_space_id = 18 # Icelandic tokens + p.input_space_id = 18 # Icelandic tokens p.target_space_id = 19 # Icelandic parse tokens return p @@ -534,23 +522,41 @@ def image_celeba(unused_model_hparams): # Dictionary of named hyperparameter settings for various problems. # This is only accessed through the problem_hparams function below. 
PROBLEM_HPARAMS_MAP = { - "audio_timit_characters_tune": audio_timit_characters, - "audio_timit_characters_test": audio_timit_characters, - "audio_timit_tokens_8k_tune": lambda p: audio_timit_tokens(p, 2**13), - "audio_timit_tokens_8k_test": lambda p: audio_timit_tokens(p, 2**13), - "audio_wsj_characters_tune": audio_wsj_characters, - "audio_wsj_characters_test": audio_wsj_characters, - "audio_wsj_tokens_8k_tune": lambda p: audio_wsj_tokens(p, 2**13), - "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), - "lm1b_characters": lm1b_characters, - "lm1b_32k": lm1b_32k, - "wiki_32k": wiki_32k, - "ice_parsing_characters": wmt_parsing_characters, - "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), - "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), - "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda - p, "wsj", 2**14, 2**9), - "wmt_ende_bpe32k": wmt_ende_bpe32k, - "image_celeba_tune": image_celeba, - "img2img_imagenet": img2img_imagenet, + "audio_timit_characters_tune": + audio_timit_characters, + "audio_timit_characters_test": + audio_timit_characters, + "audio_timit_tokens_8k_tune": + lambda p: audio_timit_tokens(p, 2**13), + "audio_timit_tokens_8k_test": + lambda p: audio_timit_tokens(p, 2**13), + "audio_wsj_characters_tune": + audio_wsj_characters, + "audio_wsj_characters_test": + audio_wsj_characters, + "audio_wsj_tokens_8k_tune": + lambda p: audio_wsj_tokens(p, 2**13), + "audio_wsj_tokens_8k_test": + lambda p: audio_wsj_tokens(p, 2**13), + "lm1b_characters": + lm1b_characters, + "lm1b_32k": + lm1b_32k, + "wiki_32k": + wiki_32k, + "ice_parsing_characters": + wmt_parsing_characters, + "ice_parsing_tokens": + lambda p: ice_parsing_tokens(p, 2**13), + "wmt_parsing_tokens_8k": + lambda p: wmt_parsing_tokens(p, 2**13), + "wsj_parsing_tokens_16k": + lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda + p, "wsj", 2**14, 2**9), + "wmt_ende_bpe32k": + wmt_ende_bpe32k, + "image_celeba_tune": + image_celeba, + "img2img_imagenet": + img2img_imagenet, } diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/layers/common_attention.py similarity index 89% rename from tensor2tensor/models/common_attention.py rename to tensor2tensor/layers/common_attention.py index b52fb8aea..e343dba0a 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -22,7 +22,7 @@ # Dependency imports -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf @@ -157,9 +157,10 @@ def add_positional_embedding_nd(x, max_length, name): shape[i + 1] = max_length size[i + 1] = dynamic_shape[i + 1] var = (tf.get_variable( - name + "_%d" % i, shape, - initializer=tf.random_normal_initializer(0, depth ** -0.5)) - * (depth ** 0.5)) + name + "_%d" % i, + shape, + initializer=tf.random_normal_initializer(0, depth**-0.5)) * + (depth**0.5)) x += tf.slice(var, start, size) return x @@ -314,11 +315,13 @@ def attention_image_summary(attn, image_shapes=None): assert len(image_shapes) == 6 q_rows, q_cols, q_channnels, m_rows, m_cols, m_channels = list( image_shapes) - image = tf.reshape(image, [-1, q_rows, q_cols, q_channnels, - m_rows, m_cols, m_channels, 3]) + image = tf.reshape(image, [ + -1, q_rows, q_cols, q_channnels, m_rows, m_cols, m_channels, 3 + ]) image = tf.transpose(image, [0, 1, 4, 3, 2, 5, 6, 7]) - image = tf.reshape(image, [-1, q_rows * m_rows * q_channnels, - q_cols * m_cols * m_channels, 3]) + image = tf.reshape(image, [ + 
-1, q_rows * m_rows * q_channnels, q_cols * m_cols * m_channels, 3 + ]) tf.summary.image("attention", image, max_outputs=1) @@ -358,9 +361,13 @@ def dot_product_attention(q, return tf.matmul(weights, v) -def masked_local_attention_1d(q, k, v, - block_length=128, look_right=True, - use_whole_block=False, name=None): +def masked_local_attention_1d(q, + k, + v, + block_length=128, + look_right=True, + use_whole_block=False, + name=None): """Attention to the source position and a neigborhood around it. The sequence is divided into blocks of length block_size. Attention for a @@ -390,8 +397,8 @@ def masked_local_attention_1d(q, k, v, Returns: a Tensor of shape [batch, heads, length, depth_v] """ - with tf.variable_scope(name, default_name="local_attention_1d", - values=[q, k, v]): + with tf.variable_scope( + name, default_name="local_attention_1d", values=[q, k, v]): v_shape = v.get_shape() batch = tf.shape(q)[0] heads = tf.shape(q)[1] @@ -401,8 +408,7 @@ def masked_local_attention_1d(q, k, v, original_length = length # If (length < block_length), then we use only one block. - block_length = tf.where(tf.less(length, block_length), - length, block_length) + block_length = tf.where(tf.less(length, block_length), length, block_length) # Pad to desired length. padding_size = tf.mod(-length, block_length) length += padding_size @@ -417,24 +423,23 @@ def masked_local_attention_1d(q, k, v, # We shift everything over by half a block so query is in center. pad_right = block_length // 2 pad_left = block_length - pad_right - extra_padding = [[0, 0], [0, 0], - [pad_left, padding_size+pad_right], [0, 0]] + extra_padding = [[0, 0], [0, 0], [pad_left, padding_size + pad_right], + [0, 0]] k = tf.pad(k, extra_padding) v = tf.pad(v, extra_padding) # Reshape into blocks. q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k]) - k = tf.reshape(k, [batch, heads, num_blocks+1, block_length, depth_k]) - v = tf.reshape(v, [batch, heads, num_blocks+1, block_length, depth_v]) + k = tf.reshape(k, [batch, heads, num_blocks + 1, block_length, depth_k]) + v = tf.reshape(v, [batch, heads, num_blocks + 1, block_length, depth_v]) # Get local blocks by slicing. def local(x): """Create a local version of the keys or values.""" - prev_block = tf.slice( - x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1]) - cur_block = tf.slice( - x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) + prev_block = tf.slice(x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1]) + cur_block = tf.slice(x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) return tf.concat([prev_block, cur_block], 3) + local_k = local(k) local_v = local(v) local_length = tf.shape(local_k)[3] @@ -466,7 +471,11 @@ def local(x): return output -def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, +def unmasked_local_attention_1d(q, + k, + v, + block_length=128, + filter_width=100, name=None): """strided block local self-attention. 
@@ -481,19 +490,22 @@ def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, Returns: a Tensor of shape [batch, heads, length, depth_v] """ - with tf.variable_scope(name, default_name="local_self_attention_1d", - values=[q, k, v]): + with tf.variable_scope( + name, default_name="local_self_attention_1d", values=[q, k, v]): v_shape = v.get_shape() depth_v = tf.shape(v)[3] batch_size = tf.shape(q)[0] num_heads = tf.shape(q)[1] original_length = tf.shape(q)[2] + # making sure q is a multiple of d def pad_to_multiple(x, pad_length): x_length = tf.shape(x)[2] return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) k = pad_to_multiple(k, block_length) v = pad_to_multiple(v, block_length) @@ -501,16 +513,17 @@ def pad_l_and_r(x, pad_length): # Setting up q blocks new_q_shape = tf.shape(q) # Setting up q blocks - q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], - new_q_shape[2]//block_length, - block_length, new_q_shape[3]]) + q = tf.reshape(q, [ + new_q_shape[0], new_q_shape[1], new_q_shape[2] // block_length, + block_length, new_q_shape[3] + ]) # Setting up k and v values k = pad_l_and_r(k, filter_width) v = pad_l_and_r(v, filter_width) length = tf.shape(k)[2] - full_filter_width = block_length + 2*filter_width + full_filter_width = block_length + 2 * filter_width # getting gather indices indices = tf.range(0, length, delta=1, name="index_range") # making indices [1, length, 1] to appy convs @@ -541,7 +554,7 @@ def pad_l_and_r(x, pad_length): logits = tf.matmul(q, k_new, transpose_b=True) - attention = tf.nn.softmax(logits+attention_bias) + attention = tf.nn.softmax(logits + attention_bias) output = tf.matmul(attention, v_new) output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) @@ -626,14 +639,13 @@ def multihead_attention(query_antecedent, key_depth_per_head = total_key_depth // num_heads q *= key_depth_per_head**-0.5 if attention_type == "dot_product": - x = dot_product_attention( - q, k, v, bias, dropout_rate, image_shapes) + x = dot_product_attention(q, k, v, bias, dropout_rate, image_shapes) elif attention_type == "local_mask_right": x = masked_local_attention_1d(q, k, v, block_length=block_length) else: assert attention_type == "local_unmasked" - x = unmasked_local_attention_1d(q, k, v, block_length=block_length, - filter_width=block_width) + x = unmasked_local_attention_1d( + q, k, v, block_length=block_length, filter_width=block_width) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -669,29 +681,22 @@ def ffn_self_attention_layer(x, A Tensor. 
""" - with tf.variable_scope(name, default_name="feedforward_self_attention", - values=[x]): + with tf.variable_scope( + name, default_name="feedforward_self_attention", values=[x]): x_shape = tf.shape(x) part_depth = filter_depth // num_parts if not share_kv: combined = common_layers.conv1d( - x, - filter_depth * 3, - 1, - name="qkv_transform") + x, filter_depth * 3, 1, name="qkv_transform") combined = tf.expand_dims(combined, axis=2) q, k, v = tf.split(combined, 3, axis=3) else: - q = tf.expand_dims(common_layers.conv1d( - x, - filter_depth, - 1, - name="q_transform"), axis=2) - kv_combined = tf.expand_dims(common_layers.conv1d( - tf.concat([x, x], axis=1), - filter_depth, - 1, - name="kv_transform"), axis=2) + q = tf.expand_dims( + common_layers.conv1d(x, filter_depth, 1, name="q_transform"), axis=2) + kv_combined = tf.expand_dims( + common_layers.conv1d( + tf.concat([x, x], axis=1), filter_depth, 1, name="kv_transform"), + axis=2) k, v = tf.split(kv_combined, [x_shape[1], x_shape[1]], axis=1) batch_q = tf.reshape(q, [-1, 1, num_parts, part_depth]) @@ -701,8 +706,7 @@ def ffn_self_attention_layer(x, batch_q *= part_depth**-0.5 # non-masked bias bias = None - x = dot_product_attention( - batch_q, batch_k, batch_v, bias, dropout_rate) + x = dot_product_attention(batch_q, batch_k, batch_v, bias, dropout_rate) x = tf.reshape(x, [x_shape[0], x_shape[1], filter_depth]) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x @@ -738,20 +742,21 @@ def parameter_attention(x, Returns: A Tensor. """ - with tf.variable_scope(name, default_name="parameter_attention", - values=[x]): + with tf.variable_scope(name, default_name="parameter_attention", values=[x]): head_size_k = total_key_depth // num_heads head_size_v = total_value_depth // num_heads var_shape_k = [num_heads, memory_rows, head_size_k] var_shape_v = [num_heads, memory_rows, head_size_v] k = tf.get_variable( - "k", var_shape_k, - initializer=tf.random_normal_initializer( - 0, output_depth ** -0.5)) * (num_heads ** 0.5) + "k", + var_shape_k, + initializer=tf.random_normal_initializer(0, output_depth**-0.5)) * ( + num_heads**0.5) v = tf.get_variable( - "v", var_shape_v, - initializer=tf.random_normal_initializer( - 0, output_depth ** -0.5)) * (output_depth ** 0.5) + "v", + var_shape_v, + initializer=tf.random_normal_initializer(0, output_depth**-0.5)) * ( + output_depth**0.5) batch_size = tf.shape(x)[0] length = tf.shape(x)[1] q = common_layers.conv1d(x, total_key_depth, 1, name="q_transform") @@ -759,8 +764,8 @@ def parameter_attention(x, # This is a cheaper form of attention dropout where we use to use # the same dropout decisions across batch elemets and query positions, # but different decisions across heads and memory positions. 
- v = tf.nn.dropout(v, 1.0 - dropout_rate, - noise_shape=[num_heads, memory_rows, 1]) + v = tf.nn.dropout( + v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1]) # query is [batch, length, hidden_size] # reshape and transpose it to [heads, batch * length, head_size] q = tf.reshape(q, [batch_size, length, num_heads, head_size_k]) diff --git a/tensor2tensor/models/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py similarity index 77% rename from tensor2tensor/models/common_attention_test.py rename to tensor2tensor/layers/common_attention_test.py index a09da74e1..61855b876 100644 --- a/tensor2tensor/models/common_attention_test.py +++ b/tensor2tensor/layers/common_attention_test.py @@ -22,7 +22,7 @@ # Dependency imports import numpy as np -from tensor2tensor.models import common_attention +from tensor2tensor.layers import common_attention import tensorflow as tf @@ -42,22 +42,14 @@ def testDotProductAttention(self): self.assertEqual(res.shape, (5, 7, 12, 32)) def testMaskedLocalAttention(self): - q = np.array([[[[1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0]]]]) - k = np.array([[[[1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0]]]]) + q = np.array([[[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [ + 1.0, 0.0, 0.0, 0.0 + ], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]]]) + k = np.array([[[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [ + 1.0, 0.0, 0.0, 0.0 + ], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]]]) v = np.ones((1, 1, 8, 1)) with self.test_session() as session: q_ = tf.constant(q, dtype=tf.float32) @@ -77,7 +69,8 @@ def testLocalUnmaskedAttention(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=4, filter_width=3) + block_length=4, + filter_width=3) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 16)) @@ -90,7 +83,8 @@ def testLocalUnmaskedAttentionMatchingBlockLength(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=5, filter_width=3) + block_length=5, + filter_width=3) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 16)) diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/layers/common_hparams.py similarity index 97% rename from tensor2tensor/models/common_hparams.py rename to tensor2tensor/layers/common_hparams.py index 353586393..6ecb06fb4 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -220,10 +220,6 @@ def basic_range1(ranged_hparams): rhp.set_float("optimizer_adam_epsilon", 1e-7, 1e-2, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta1", 0.8, 0.9) rhp.set_float("optimizer_adam_beta2", 0.995, 0.999) - rhp.set_categorical("optimizer", - ["Adam", - "Adagrad", - "Momentum", - "RMSProp", - "SGD", - "YellowFin"]) + rhp.set_categorical("optimizer", [ + "Adam", "Adagrad", "Momentum", "RMSProp", "SGD", "YellowFin" + ]) diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/layers/common_layers.py 
similarity index 99% rename from tensor2tensor/models/common_layers.py rename to tensor2tensor/layers/common_layers.py index 5449a8bef..8a58cd065 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -475,7 +475,8 @@ def residual_fn(x, residual_dropout, filters=None, epsilon=1e-16, - name=None, reuse=None): + name=None, + reuse=None): """Returns a function for combining layer input and layer output. The returned function on x (layer input) and y (layer output) computes: @@ -494,8 +495,8 @@ def residual_fn(x, Returns: residual layer output with applied norm_fn. """ - with tf.variable_scope(name, default_name="residual", - values=[x, y], reuse=reuse): + with tf.variable_scope( + name, default_name="residual", values=[x, y], reuse=reuse): norm_fn = get_norm(norm_type) res = x + tf.nn.dropout(y, 1.0 - residual_dropout) if norm_type == "layer": @@ -1517,8 +1518,8 @@ def linear_set_layer(layer_size, output: A tensor of dimensions batch_size x sequence_length x output_dims dimension containing the sequences of transformed vectors. """ - with tf.variable_scope(name, default_name="linear_set_layer", - values=[inputs]): + with tf.variable_scope( + name, default_name="linear_set_layer", values=[inputs]): # Apply 1D convolution to apply linear filter to each element # along the 2nd dimension. outputs = conv1d(inputs, layer_size, 1, activation=None, name="set_conv") @@ -1529,8 +1530,8 @@ def linear_set_layer(layer_size, # simply add the transformed context to get the same effect. if len(context.get_shape().as_list()) == 2: context = tf.expand_dims(context, axis=1) - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: diff --git a/tensor2tensor/models/common_layers_test.py b/tensor2tensor/layers/common_layers_test.py similarity index 98% rename from tensor2tensor/models/common_layers_test.py rename to tensor2tensor/layers/common_layers_test.py index 8e724587b..df3ccc68f 100644 --- a/tensor2tensor/models/common_layers_test.py +++ b/tensor2tensor/layers/common_layers_test.py @@ -22,7 +22,7 @@ # Dependency imports import numpy as np -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf @@ -351,8 +351,7 @@ def testResidualFn(self): x2 = np.random.rand(5, 2, 1, 11) x3 = common_layers.residual_fn( tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), - norm_type, 0.1) + tf.constant(x2, dtype=tf.float32), norm_type, 0.1) session.run(tf.global_variables_initializer()) actual = session.run(x3) self.assertEqual(actual.shape, (5, 2, 1, 11)) @@ -365,7 +364,9 @@ def testResidualFnWithLayerNorm(self): x3 = common_layers.residual_fn( tf.constant(x1, dtype=tf.float32), tf.constant(x2, dtype=tf.float32), - norm_type, 0.1, epsilon=0.1) + norm_type, + 0.1, + epsilon=0.1) session.run(tf.global_variables_initializer()) actual = session.run(x3) self.assertEqual(actual.shape, (5, 2, 1, 11)) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/layers/modalities.py similarity index 99% rename from tensor2tensor/models/modalities.py rename to tensor2tensor/layers/modalities.py index 912c54f8c..523c52fa8 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -22,7 +22,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_layers +from 
tensor2tensor.layers import common_layers from tensor2tensor.utils import expert_utils as eu from tensor2tensor.utils import modality from tensor2tensor.utils import registry diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/layers/modalities_test.py similarity index 98% rename from tensor2tensor/models/modalities_test.py rename to tensor2tensor/layers/modalities_test.py index 9130613b9..0ccd13777 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/layers/modalities_test.py @@ -22,7 +22,7 @@ import numpy as np -from tensor2tensor.models import modalities +from tensor2tensor.layers import modalities from tensor2tensor.utils import expert_utils import tensorflow as tf diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 3b874555f..664bc9e21 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -29,9 +29,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -56,8 +56,8 @@ def residual_fn(x, y): y, 1.0 - hparams.residual_dropout)) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - decoder_output = attention_lm_decoder( - decoder_input, residual_fn, decoder_self_attention_bias, hparams) + decoder_output = attention_lm_decoder(decoder_input, residual_fn, + decoder_self_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 4b37050bb..780478fec 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -29,9 +29,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -61,32 +61,33 @@ def residual_fn(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") x = dp(residual_fn, x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( dp, self._ps_devices, x, hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, - hparams.hidden_size, 
- hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, - hparams.moe_loss_coef) + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, + hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) x = dp(residual_fn, x, y) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 87ad70e41..96cb60615 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -27,14 +27,13 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model import tensorflow as tf - # var: 1d tensor, raw weights for each choice # tempered_var: raw weights with temperature applied # inv_t: inverse of the temperature to use when normalizing `var` @@ -86,7 +85,7 @@ def create_selection_weights(name, assert len(shape) == 1 # TODO(rshin): Change this to select without replacement? selection = tf.multinomial(tf.expand_dims(var, axis=0), 4) - selection = tf.squeeze(selection, axis=0) # [k] selected classes. + selection = tf.squeeze(selection, axis=0) # [k] selected classes. to_run = tf.one_hot(selection, shape[0]) # [k x nmodules] one-hot. # [nmodules], 0=not run, 1=run. to_run = tf.minimum(tf.reduce_sum(to_run, axis=0), 1) @@ -101,16 +100,12 @@ def create_selection_weights(name, if names is not None: tf.get_collection_ref("selection_weight_names/" + var.name).extend( - names.flatten() - if isinstance(names, np.ndarray) else names) + names.flatten() if isinstance(names, np.ndarray) else names) tf.add_to_collection("selection_weight_names_tensor/" + var.name, tf.constant(names)) return SelectionWeights( - var=var, - tempered_var=tempered_var, - inv_t=inv_t, - normalized=weights) + var=var, tempered_var=tempered_var, inv_t=inv_t, normalized=weights) def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels, @@ -155,18 +150,13 @@ def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels, channel_weights.append(channel_weight) channel_weight = tf.add_n(channel_weights) - multiplier = (tf.reshape(kernel_weight, max_kernel_size + (1, 1)) * - tf.reshape(channel_weight, (1, 1, -1, 1))) + multiplier = (tf.reshape(kernel_weight, max_kernel_size + + (1, 1)) * tf.reshape(channel_weight, (1, 1, -1, 1))) return multiplier -def make_subseparable_kernel( - kernel_size, - input_channels, - filters, - separability, - kernel_initializer, - kernel_regularizer): +def make_subseparable_kernel(kernel_size, input_channels, filters, separability, + kernel_initializer, kernel_regularizer): """Make a kernel to do subseparable convolution wiht `tf.nn.conv2d`. 
Args: @@ -198,16 +188,14 @@ def make_subseparable_kernel( regularizer=kernel_regularizer) pointwise_kernel = tf.get_variable( - "pointwise_kernel", - (input_channels, filters), + "pointwise_kernel", (input_channels, filters), initializer=kernel_initializer, regularizer=kernel_regularizer) expanded_depthwise_kernel = tf.transpose( tf.scatter_nd( indices=tf.tile( - tf.expand_dims( - tf.range(0, input_channels), axis=1), [1, 2]), + tf.expand_dims(tf.range(0, input_channels), axis=1), [1, 2]), updates=tf.transpose(depthwise_kernel, (2, 0, 1)), shape=(input_channels, input_channels) + kernel_size), (2, 3, 0, 1)) @@ -230,21 +218,20 @@ def make_subseparable_kernel( raise NotImplementedError -def multi_subseparable_conv( - inputs, - filters, - kernel_sizes, - input_channels, - separabilities, - kernel_selection_weights=None, - channel_selection_weights=None, - separability_selection_weights=None, - kernel_selection_weights_params=None, - channel_selection_weights_params=None, - separability_selection_weights_params=None, - kernel_initializer=None, - kernel_regularizer=None, - scope=None): +def multi_subseparable_conv(inputs, + filters, + kernel_sizes, + input_channels, + separabilities, + kernel_selection_weights=None, + channel_selection_weights=None, + separability_selection_weights=None, + kernel_selection_weights_params=None, + channel_selection_weights_params=None, + separability_selection_weights_params=None, + kernel_initializer=None, + kernel_regularizer=None, + scope=None): """Simultaneously compute different kinds of convolutions on subsets of input. Args: @@ -299,44 +286,33 @@ def multi_subseparable_conv( kernel_selection_weights = create_selection_weights( "kernels", "softmax", (len(kernel_sizes),), - names=[ - "kernel_h{}_w{}".format(h, w) for h, w in kernel_sizes - ], + names=["kernel_h{}_w{}".format(h, w) for h, w in kernel_sizes], **kernel_selection_weights_params) if channel_selection_weights is None: channel_selection_weights = create_selection_weights( "channels", "softmax", (len(input_channels),), - names=[ - "channels_{}_{}".format(c1, c2) for c1, c2 in input_channels - ], + names=["channels_{}_{}".format(c1, c2) for c1, c2 in input_channels], **channel_selection_weights_params) if separability_selection_weights is None: separability_selection_weights = create_selection_weights( "separability", "softmax", (len(separabilities),), - names=[ - "separability_{}".format(s) for s in separabilities - ], + names=["separability_{}".format(s) for s in separabilities], **separability_selection_weights_params) kernels = [] for separability in separabilities: with tf.variable_scope("separablity_{}".format(separability)): - kernel = make_subseparable_kernel( - max_kernel_size, - max_num_channels, - filters, - separability, - kernel_initializer, - kernel_regularizer) + kernel = make_subseparable_kernel(max_kernel_size, max_num_channels, + filters, separability, + kernel_initializer, kernel_regularizer) premultiplier = kernel_premultiplier( max_kernel_size, kernel_sizes, input_channels, - kernel_selection_weights, - channel_selection_weights) + kernel_selection_weights, channel_selection_weights) kernels.append(kernel * premultiplier) @@ -358,18 +334,24 @@ def multi_subseparable_conv( def conv_module(kw, kh, sep, div): + def convfn(x, hparams): return common_layers.subseparable_conv( - x, hparams.hidden_size // div, (kw, kh), - padding="SAME", separability=sep, + x, + hparams.hidden_size // div, (kw, kh), + padding="SAME", + separability=sep, name="conv_%d%d_sep%d_div%d" % (kw, kh, sep, div)) 
+ return convfn def multi_conv_module(kernel_sizes, seps): + def convfn(x, hparams): return multi_subseparable_conv(x, hparams.hidden_size, kernel_sizes, [(0, hparams.hidden_size)], seps) + return convfn @@ -438,15 +420,16 @@ def run_unary_modules_basic(modules, cur, hparams): def run_unary_modules_sample(modules, cur, hparams, k): """Run modules, sampling k.""" selection_weights = create_selection_weights( - "selection", - ("softmax_topk", k), + "selection", ("softmax_topk", k), shape=[len(modules)], inv_t=100.0 * common_layers.inverse_exp_decay( hparams.anneal_until, min_value=0.01)) - all_res = [tf.cond(tf.less(selection_weights.normalized[n], 1e-6), - lambda: tf.zeros_like(cur), - lambda i=n: modules[i](cur, hparams)) - for n in xrange(len(modules))] + all_res = [ + tf.cond( + tf.less(selection_weights.normalized[n], 1e-6), + lambda: tf.zeros_like(cur), + lambda i=n: modules[i](cur, hparams)) for n in xrange(len(modules)) + ] all_res = tf.concat([tf.expand_dims(r, axis=0) for r in all_res], axis=0) res = all_res * tf.reshape(selection_weights.normalized, [-1, 1, 1, 1, 1]) return tf.reduce_sum(res, axis=0) @@ -461,8 +444,7 @@ def run_unary_modules(modules, cur, hparams): def batch_deviation(x): """Average deviation of the batch.""" x_mean = tf.reduce_mean(x, axis=[0], keep_dims=True) - x_variance = tf.reduce_mean( - tf.square(x - x_mean), axis=[0], keep_dims=True) + x_variance = tf.reduce_mean(tf.square(x - x_mean), axis=[0], keep_dims=True) return tf.reduce_mean(tf.sqrt(x_variance)) @@ -475,13 +457,15 @@ def model_fn_body(self, features): multi_conv = multi_conv_module( kernel_sizes=[(3, 3), (5, 5), (7, 7)], seps=[0, 1]) conv_modules = [multi_conv, identity_module] - activation_modules = [identity_module, - lambda x, _: tf.nn.relu(x), - lambda x, _: tf.nn.elu(x), - lambda x, _: tf.tanh(x)] + activation_modules = [ + identity_module, lambda x, _: tf.nn.relu(x), lambda x, _: tf.nn.elu(x), + lambda x, _: tf.tanh(x) + ] norm_modules = [identity_module, layernorm_module, noamnorm_module] - binary_modules = [first_binary_module, second_binary_module, - sum_binary_module, shakeshake_binary_module] + binary_modules = [ + first_binary_module, second_binary_module, sum_binary_module, + shakeshake_binary_module + ] inputs = features["inputs"] def run_unary(x, name): diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index 28862e594..d9c4e29a9 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -63,8 +63,8 @@ def bytenet_internal(inputs, targets, hparams): # Pad inputs and targets to be the same length, divisible by 50. 
inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=50) - final_encoder = residual_dilated_conv( - inputs, hparams.num_block_repeat, "SAME", "encoder", hparams) + final_encoder = residual_dilated_conv(inputs, hparams.num_block_repeat, + "SAME", "encoder", hparams) shifted_targets = common_layers.shift_left(targets) kernel = (hparams.kernel_height, hparams.kernel_width) @@ -73,9 +73,8 @@ def bytenet_internal(inputs, targets, hparams): hparams.hidden_size, [((1, 1), kernel)], padding="LEFT") - return residual_dilated_conv( - decoder_start, hparams.num_block_repeat, - "LEFT", "decoder", hparams) + return residual_dilated_conv(decoder_start, hparams.num_block_repeat, + "LEFT", "decoder", hparams) @registry.register_model diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index bdb93509b..af2d83158 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -22,8 +22,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index a43eda97a..3b1dc6873 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -23,8 +23,8 @@ import numpy as np from tensor2tensor.data_generators import gene_expression as gene_data +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.models import gene_expression -from tensor2tensor.models import modalities # pylint: disable=unused-import import tensorflow as tf diff --git a/tensor2tensor/models/long_answer.py b/tensor2tensor/models/long_answer.py index be8024f63..a9fb45e4a 100644 --- a/tensor2tensor/models/long_answer.py +++ b/tensor2tensor/models/long_answer.py @@ -34,9 +34,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -67,34 +67,35 @@ def residual_fn(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - None, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - attention_type="local_mask_right", - block_length=hparams.block_length, - name="decoder_self_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + None, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + attention_type="local_mask_right", + block_length=hparams.block_length, + name="decoder_self_attention") x = dp(residual_fn, x, y) with tf.variable_scope("ffn"): 
if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( dp, self._ps_devices, x, hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, - hparams.hidden_size, - hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, - hparams.moe_loss_coef) + hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, + hparams.moe_n2, hparams.moe_loss_coef) extra_loss += loss else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) x = dp(residual_fn, x, y) x = dp(long_answer_output, x, inputs) return x, extra_loss @@ -113,7 +114,8 @@ def long_answer_prepare_decoder(inputs, targets, hparams): """ decoder_input = tf.concat([ length_embedding(targets, hparams), inputs, - common_layers.shift_left_3d(targets)], 1) + common_layers.shift_left_3d(targets) + ], 1) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) return decoder_input @@ -140,8 +142,7 @@ def length_embedding(targets, hparams): padded_target_length = tf.shape(targets)[1] if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN: lengths = padded_target_length * tf.to_int32( - tf.less(tf.random_uniform([batch]), - hparams.answer_length_prob_train)) + tf.less(tf.random_uniform([batch]), hparams.answer_length_prob_train)) elif hparams.mode == tf.contrib.learn.ModeKeys.EVAL: lengths = 0 else: diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index 195879d78..d79b04494 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -23,25 +23,29 @@ # Dependency imports -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model import tensorflow as tf from tensorflow.python.util import nest - # Track Tuple of state and attention values -AttentionTuple = collections.namedtuple("AttentionTuple", - ("state", "attention")) +AttentionTuple = collections.namedtuple("AttentionTuple", ("state", + "attention")) class ExternalAttentionCellWrapper(tf.contrib.rnn.RNNCell): """Wrapper for external attention states for an encoder-decoder setup.""" - def __init__(self, cell, attn_states, attn_vec_size=None, - input_size=None, state_is_tuple=True, reuse=None): + def __init__(self, + cell, + attn_states, + attn_vec_size=None, + input_size=None, + state_is_tuple=True, + reuse=None): """Create a cell with attention. 
Args: @@ -137,8 +141,8 @@ def call(self, inputs, state): new_attns = self._attention(new_state_cat, attn_states, attn_length) with tf.variable_scope("attn_output_projection"): - output = tf.layers.dense(tf.concat([lstm_output, new_attns], axis=1), - self._attn_size) + output = tf.layers.dense( + tf.concat([lstm_output, new_attns], axis=1), self._attn_size) new_state = AttentionTuple(new_state, new_attns) @@ -151,18 +155,16 @@ def _attention(self, query, attn_states, attn_length): tanh = tf.tanh with tf.variable_scope("attention"): - k = tf.get_variable( - "attn_w", [1, 1, self._attn_size, self._attn_vec_size]) + k = tf.get_variable("attn_w", + [1, 1, self._attn_size, self._attn_vec_size]) v = tf.get_variable("attn_v", [self._attn_vec_size, 1]) - hidden = tf.reshape(attn_states, - [-1, attn_length, 1, self._attn_size]) + hidden = tf.reshape(attn_states, [-1, attn_length, 1, self._attn_size]) hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME") y = tf.layers.dense(query, self._attn_vec_size) y = tf.reshape(y, [-1, 1, 1, self._attn_vec_size]) s = reduce_sum(v * tanh(hidden_features + y), [2, 3]) a = softmax(s) - d = reduce_sum( - tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) + d = reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) new_attns = tf.reshape(d, [-1, self._attn_size]) return new_attns @@ -186,8 +188,8 @@ def dropout_lstm_cell(): time_major=False) -def lstm_attention_decoder(inputs, hparams, train, name, - initial_state, attn_states): +def lstm_attention_decoder(inputs, hparams, train, name, initial_state, + attn_states): """Run LSTM cell with attention on inputs of shape [batch x time x size].""" def dropout_lstm_cell(): @@ -196,9 +198,10 @@ def dropout_lstm_cell(): input_keep_prob=1.0 - hparams.dropout * tf.to_float(train)) layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)] - cell = ExternalAttentionCellWrapper(tf.nn.rnn_cell.MultiRNNCell(layers), - attn_states, - attn_vec_size=hparams.attn_vec_size) + cell = ExternalAttentionCellWrapper( + tf.nn.rnn_cell.MultiRNNCell(layers), + attn_states, + attn_vec_size=hparams.attn_vec_size) initial_state = cell.combine_state(initial_state) with tf.variable_scope(name): return tf.nn.dynamic_rnn( @@ -239,10 +242,7 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): # LSTM decoder with attention shifted_targets = common_layers.shift_left(targets) decoder_outputs, _ = lstm_attention_decoder( - common_layers.flatten4d3d(shifted_targets), - hparams, - train, - "decoder", + common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder", final_encoder_state, encoder_outputs) return tf.expand_dims(decoder_outputs, axis=2) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 6ac792f48..7da3d2380 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_hparams +from tensor2tensor.layers import common_hparams from tensor2tensor.models import lstm import tensorflow as tf @@ -44,8 +44,8 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2seq( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = lstm.LSTMSeq2seq(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) 
session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 907a801cf..c2a904888 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -23,6 +23,7 @@ # pylint: disable=unused-import +from tensor2tensor.layers import modalities from tensor2tensor.models import attention_lm from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet @@ -30,7 +31,6 @@ from tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm -from tensor2tensor.models import modalities from tensor2tensor.models import multimodel from tensor2tensor.models import neural_gpu from tensor2tensor.models import shake_shake diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 6f60dbfbf..290c78732 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -22,10 +22,10 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers -from tensor2tensor.models import modalities +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers +from tensor2tensor.layers import modalities from tensor2tensor.models import slicenet from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -41,12 +41,22 @@ def conv_res_step(x, hparams, padding, mask): dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)] with tf.variable_scope("conv_res_step"): y = common_layers.subseparable_conv_block( - x, hparams.filter_size, dilations_and_kernels1, - padding=padding, mask=mask, separabilities=0, name="residual1") + x, + hparams.filter_size, + dilations_and_kernels1, + padding=padding, + mask=mask, + separabilities=0, + name="residual1") y = tf.nn.dropout(y, 1.0 - hparams.dropout) return common_layers.subseparable_conv_block( - y, hparams.hidden_size, dilations_and_kernels2, - padding=padding, mask=mask, separabilities=0, name="residual2") + y, + hparams.hidden_size, + dilations_and_kernels2, + padding=padding, + mask=mask, + separabilities=0, + name="residual2") def residual_fn2(x, y, hparams): @@ -102,9 +112,9 @@ def flatten(inputs): expert_loss = 0.0 for i in xrange(hparams.num_hidden_layers): with tf.variable_scope("enc_layer_%d" % i): - inputs_encoded, moe_loss = conv_experts( - inputs_encoded, hparams, dp, self._ps_devices, "SAME", - inputs_mask, i) + inputs_encoded, moe_loss = conv_experts(inputs_encoded, hparams, dp, + self._ps_devices, "SAME", + inputs_mask, i) expert_loss += tf.reduce_mean(moe_loss) * hparams.moe_loss_coef # If we're just predicing a class, there is no use for a decoder, return. 
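The dp(...) calls in the hunks above and below all follow the same data-parallelism convention: a callable Parallelism object maps a function over per-shard arguments and returns one output per shard. A toy sketch of that call convention (illustrative only, not the actual expert_utils implementation):

def make_parallelism(n_shards):
  """Toy stand-in for the dp objects used in these hunks (illustrative)."""
  def dp(fn, *args, **kwargs):
    def pick(arg, i):
      # Lists are treated as per-shard values; anything else is broadcast.
      return arg[i] if isinstance(arg, list) else arg
    return [fn(*[pick(a, i) for a in args], **kwargs)
            for i in range(n_shards)]
  return dp

dp = make_parallelism(2)
assert dp(lambda v, s: v * s, [1, 2], 10) == [10, 20]  # one result per shard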
@@ -116,54 +126,57 @@ def flatten(inputs): inputs3d = dp(tf.squeeze, inputs, 2) inputs_encoded3d = dp(tf.squeeze, inputs_encoded, 2) encoder_padding = dp(common_attention.embedding_to_padding, inputs3d) - encoder_attention_bias = dp( - common_attention.attention_bias_ignore_padding, encoder_padding) + encoder_attention_bias = dp(common_attention.attention_bias_ignore_padding, + encoder_padding) targets = dp(common_layers.flatten4d3d, sharded_features["targets"]) target_space_emb = dp(slicenet.embed_target_space, sharded_features["target_space_id"], hparams.hidden_size) - (decoder_input, decoder_self_attention_bias) = dp( - prepare_decoder, targets, target_space_emb) + (decoder_input, decoder_self_attention_bias) = dp(prepare_decoder, targets, + target_space_emb) x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.dropout) for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("dec_layer_%d" % layer): with tf.variable_scope("attention"): - y = dp(common_attention.multihead_attention, - x, - None, - decoder_self_attention_bias, - hparams.hidden_size, - hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") - z = dp(common_attention.multihead_attention, - y, - inputs_encoded3d, - encoder_attention_bias, - hparams.hidden_size, - hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="encdec_attention") + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") + z = dp( + common_attention.multihead_attention, + y, + inputs_encoded3d, + encoder_attention_bias, + hparams.hidden_size, + hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="encdec_attention") x = dp(residual_fn3, x, y, z, hparams) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, moe_loss = common_layers.moe_layer( - dp, self._ps_devices, x, train, - hparams.hidden_size, hparams.filter_size, - hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) + dp, self._ps_devices, x, train, hparams.hidden_size, + hparams.filter_size, hparams.moe_n1, hparams.moe_n2, + hparams.moe_loss_coef) expert_loss += tf.reduce_mean(moe_loss) else: - y = dp(common_layers.conv_hidden_relu, - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.dropout) + y = dp( + common_layers.conv_hidden_relu, + x, + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.dropout) x = dp(residual_fn2, x, y, hparams) x = dp(tf.expand_dims, x, 2) diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index fc9d75639..4037aa8d4 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -34,6 +34,7 @@ def neural_gpu(inputs, hparams, name=None): """The core Neural GPU.""" with tf.variable_scope(name, "neural_gpu"): + def step(state, inp): # pylint: disable=missing-docstring x = tf.nn.dropout(state, 1.0 - hparams.dropout) for layer in 
xrange(hparams.num_hidden_layers): diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 46c01f403..b7a1e98f7 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_hparams +from tensor2tensor.layers import common_hparams from tensor2tensor.models import neural_gpu import tensorflow as tf @@ -50,8 +50,8 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = neural_gpu.NeuralGPU(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index 7fa40783a..aa91654a3 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index f1534137c..8900e6d11 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -23,9 +23,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -83,8 +83,7 @@ def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None): return norm_fn(targets_shifted + targets_with_attention, name="attn_norm") -def multi_conv_res(x, padding, name, layers, hparams, - mask=None, source=None): +def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): """A stack of separable convolution blocks with residual connections.""" with tf.variable_scope(name): padding_bias = None @@ -200,7 +199,10 @@ def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): else: inputs_padding_bias = (1.0 - mask) * -1e9 # Bias to not attend to padding. targets_with_attention = attention( - targets_shifted, inputs_encoded, norm_fn, hparams, + targets_shifted, + inputs_encoded, + norm_fn, + hparams, bias=inputs_padding_bias) # Positional targets: merge attention and raw. @@ -237,8 +239,8 @@ def slicenet_internal(inputs, targets, target_space, problem_idx, hparams): inputs = common_layers.add_timing_signal(inputs) # Add position info. 
target_space_emb = embed_target_space(target_space, hparams.hidden_size) extra_layers = int(hparams.num_hidden_layers * 1.5) - inputs_encoded = multi_conv_res(inputs, "SAME", "encoder", extra_layers, - hparams, mask=inputs_mask) + inputs_encoded = multi_conv_res( + inputs, "SAME", "encoder", extra_layers, hparams, mask=inputs_mask) target_modality_name = hparams.problems[problem_idx].target_modality.name if "class_label_modality" in target_modality_name: # If we're just predicing a class, there is no use for a decoder. @@ -266,6 +268,7 @@ def model_fn_body(self, features): features["target_space_id"], self._problem_idx, self._hparams) + _KERNEL_SCHEMES = { "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)], "3.7.7.7": [(3, 1), (7, 1), (7, 1), (7, 1)], diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index c357448e4..388acde1b 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -24,7 +24,7 @@ import numpy as np from tensor2tensor.data_generators import image # pylint: disable=unused-import -from tensor2tensor.models import modalities # pylint: disable=unused-import +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.models import slicenet from tensor2tensor.utils import registry @@ -47,8 +47,8 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + model = slicenet.SliceNet(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index a2b55febf..1add44115 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -27,9 +27,9 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -49,10 +49,9 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, - encoder_self_attention_bias, - encoder_decoder_attention_bias) = ( - transformer_prepare_encoder(inputs, target_space, hparams)) + (encoder_input, encoder_self_attention_bias, + encoder_decoder_attention_bias) = (transformer_prepare_encoder( + inputs, target_space, hparams)) (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) @@ -73,12 +72,16 @@ def model_fn_body(self, features): def get_residual_fn(hparams): """Get residual_fn.""" + def residual_fn(x, y): - return common_layers.residual_fn(x, y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) + return common_layers.residual_fn( + x, + y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) + return residual_fn @@ -113,8 +116,7 @@ def transformer_prepare_encoder(inputs, target_space, hparams): encoder_input += emb_target_space if hparams.pos == 
"timing": encoder_input = common_attention.add_timing_signal_1d(encoder_input) - return (encoder_input, - encoder_self_attention_bias, + return (encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias) @@ -251,12 +253,9 @@ def transformer_ffn_layer(x, hparams): dropout=hparams.relu_dropout) elif hparams.ffn_layer == "parameter_attention": return common_attention.parameter_attention( - x, - hparams.parameter_attention_key_channels or hparams.hidden_size, + x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.filter_size, - hparams.num_heads, + hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 1f20bfb51..2604748be 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -20,7 +20,6 @@ Code is mostly copied from original Transformer source. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -29,8 +28,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_attention -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -50,10 +49,11 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, encoder_attention_bias, _) = ( - transformer.transformer_prepare_encoder(inputs, target_space, hparams)) - (decoder_input, _) = ( - transformer.transformer_prepare_decoder(targets, hparams)) + (encoder_input, + encoder_attention_bias, _) = (transformer.transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, _) = (transformer.transformer_prepare_decoder( + targets, hparams)) encoder_mask = bias_to_mask(encoder_attention_bias) @@ -64,12 +64,12 @@ def residual_fn(x, y): encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - encoder_output = alt_transformer_encoder( - encoder_input, residual_fn, encoder_mask, hparams) + encoder_output = alt_transformer_encoder(encoder_input, residual_fn, + encoder_mask, hparams) - decoder_output = alt_transformer_decoder( - decoder_input, encoder_output, residual_fn, - encoder_attention_bias, hparams) + decoder_output = alt_transformer_decoder(decoder_input, encoder_output, + residual_fn, + encoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) @@ -97,19 +97,14 @@ def composite_layer(inputs, mask, hparams, for_output=False): for layer in xrange(hparams.layers_per_layer): with tf.variable_scope("sub_layer_%d" % layer): x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - dropout=hparams.relu_dropout) + hparams.hidden_size, x, dropout=hparams.relu_dropout) if for_output: context = common_layers.running_global_pool_1d(x) else: context = common_layers.global_pool_1d(x, mask=mask) # Final layer. 
x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - context=context, - dropout=hparams.relu_dropout) + hparams.hidden_size, x, context=context, dropout=hparams.relu_dropout) return x @@ -150,8 +145,8 @@ def alt_transformer_decoder(decoder_input, hparams.attention_dropout, name="encdec_attention") - x_ = residual_fn(x_, composite_layer(x_, None, hparams, - for_output=True)) + x_ = residual_fn(x_, composite_layer( + x_, None, hparams, for_output=True)) x = residual_fn(x, x_) return x @@ -162,7 +157,7 @@ def bias_to_mask(bias): # output sequences. Squeeze out dim one, and get the first element of # each vector. bias = tf.squeeze(bias, [1])[:, :, 0] - bias = - tf.clip_by_value(bias, -1.0, 1.0) + bias = -tf.clip_by_value(bias, -1.0, 1.0) mask = 1 - bias return mask diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index f2e69da21..a61687f48 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -23,8 +23,8 @@ from six.moves import xrange # pylint: disable=redefined-builtin -from tensor2tensor.models import common_hparams -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py new file mode 100644 index 000000000..12057d8e6 --- /dev/null +++ b/tensor2tensor/utils/decoding.py @@ -0,0 +1,371 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Decoding utilities.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import operator +import os + +# Dependency imports + +import numpy as np +import six + +from six.moves import input # pylint: disable=redefined-builtin + +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import data_reader +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder +import tensorflow as tf + +FLAGS = tf.flags.FLAGS + + +def decode_from_dataset(estimator): + hparams = estimator.hparams + for i, problem in enumerate(FLAGS.problems.split("-")): + inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) + targets_vocab = hparams.problems[i].vocabulary["targets"] + tf.logging.info("Performing local inference.") + infer_problems_data = data_reader.get_data_filepatterns( + FLAGS.problems, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + infer_input_fn = input_fn_builder.build_input_fn( + mode=tf.contrib.learn.ModeKeys.INFER, + hparams=hparams, + data_file_patterns=infer_problems_data, + num_datashards=devices.data_parallelism().n, + fixed_problem=i) + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) + + def log_fn(inputs, + targets, + outputs, + problem, + j, + inputs_vocab=inputs_vocab, + targets_vocab=targets_vocab): + """Log inference results.""" + if "image" in problem and FLAGS.decode_save_images: + save_path = os.path.join(estimator.model_dir, + "%s_prediction_%d.jpg" % (problem, j)) + show_and_save_image(inputs / 255., save_path) + elif inputs_vocab: + decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + + decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) + decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) + tf.logging.info("Inference results TARGET: %s" % decoded_targets) + + if FLAGS.decode_to_file: + output_filepath = FLAGS.decode_to_file + ".outputs." + problem + output_file = tf.gfile.Open(output_filepath, "a") + output_file.write(decoded_outputs + "\n") + target_filepath = FLAGS.decode_to_file + ".targets." + problem + target_file = tf.gfile.Open(target_filepath, "a") + target_file.write(decoded_targets + "\n") + + # The function predict() returns an iterable over the network's + # predictions from the test input. We use it to log inputs and decodes. 
+ inputs_iter = result_iter["inputs"] + targets_iter = result_iter["targets"] + outputs_iter = result_iter["outputs"] + for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): + inputs, targets, outputs = result + if FLAGS.decode_return_beams: + output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + log_fn(inputs, targets, beam, problem, j) + else: + log_fn(inputs, targets, outputs, problem, j) + + +def decode_from_file(estimator, filename): + """Compute predictions on entries in filename and write them out.""" + hparams = estimator.hparams + problem_id = FLAGS.decode_problem_id + inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] + targets_vocab = hparams.problems[problem_id].vocabulary["targets"] + tf.logging.info("Performing decoding from a file.") + sorted_inputs, sorted_keys = _get_sorted_inputs(filename) + num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 + input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, + sorted_inputs, inputs_vocab) + + decodes = [] + for _ in range(num_decode_batches): + result_iter = estimator.predict( + input_fn=input_fn.next if six.PY2 else input_fn.__next__, + as_iterable=True) + for result in result_iter: + + def log_fn(inputs, outputs): + decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + + decoded_outputs = targets_vocab.decode( + _save_until_eos(outputs.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) + return decoded_outputs + + if FLAGS.decode_return_beams: + beam_decodes = [] + output_beams = np.split( + result["outputs"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + beam_decodes.append(log_fn(result["inputs"], beam)) + decodes.append("\t".join(beam_decodes)) + + else: + decodes.append(log_fn(result["inputs"], result["outputs"])) + + # Reversing the decoded inputs and outputs because they were reversed in + # _decode_batch_input_fn + sorted_inputs.reverse() + decodes.reverse() + # Dumping inputs and outputs to file filename.decodes in + # format result\tinput in the same order as original inputs + if FLAGS.decode_to_file: + output_filename = FLAGS.decode_to_file + else: + output_filename = filename + if FLAGS.decode_shards > 1: + base_filename = output_filename + ("%.2d" % FLAGS.worker_id) + else: + base_filename = output_filename + decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set + + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + + str(FLAGS.decode_alpha) + ".decodes") + tf.logging.info("Writing decodes into %s" % decode_filename) + outfile = tf.gfile.Open(decode_filename, "w") + for index in range(len(sorted_inputs)): + outfile.write("%s\n" % (decodes[sorted_keys[index]])) + + +def decode_interactively(estimator): + hparams = estimator.hparams + + infer_input_fn = _interactive_input_fn(hparams) + for problem_idx, example in infer_input_fn: + targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] + result_iter = estimator.predict(input_fn=lambda e=example: e) + for result in result_iter: + if FLAGS.decode_return_beams: + beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + scores = None + if "scores" in result: + scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(beams): + tf.logging.info("BEAM %d:" % k) + beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) + if scores is not None: + tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) + else: + tf.logging.info(beam_string) + else: + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info( + targets_vocab.decode( + _save_until_eos(result["outputs"].flatten()))) + + +def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, + vocabulary): + tf.logging.info(" batch %d" % num_decode_batches) + # First reverse all the input sentences so that if you're going to get OOMs, + # you'll see it in the first batch + sorted_inputs.reverse() + for b in range(num_decode_batches): + tf.logging.info("Decoding batch %d" % b) + batch_length = 0 + batch_inputs = [] + for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( + b + 1) * FLAGS.decode_batch_size]: + input_ids = vocabulary.encode(inputs) + if FLAGS.decode_max_input_size > 0: + # Subtract 1 for the EOS_ID. + input_ids = input_ids[:FLAGS.decode_max_input_size - 1] + input_ids.append(text_encoder.EOS_ID) + batch_inputs.append(input_ids) + if len(input_ids) > batch_length: + batch_length = len(input_ids) + final_batch_inputs = [] + for input_ids in batch_inputs: + assert len(input_ids) <= batch_length + x = input_ids + [0] * (batch_length - len(input_ids)) + final_batch_inputs.append(x) + yield { + "inputs": np.array(final_batch_inputs), + "problem_choice": np.array(problem_id) + } + + +def _interactive_input_fn(hparams): + """Generator that reads from the terminal and yields "interactive inputs". + + Due to temporary limitations in tf.learn, if we don't want to reload the + whole graph, then we are stuck encoding all of the input as one fixed-size + numpy array. + + We yield int64 arrays with shape [const_array_size]. The format is: + [num_samples, decode_length, len(input ids), , ] + + Args: + hparams: model hparams + Yields: + numpy arrays + + Raises: + Exception: when `input_type` is invalid. + """ + num_samples = 3 + decode_length = 100 + input_type = "text" + problem_id = 0 + p_hparams = hparams.problems[problem_id] + has_input = "inputs" in p_hparams.input_modality + vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] + # This should be longer than the longest input. 
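As a concrete illustration of the packed query format assembled further down in this generator (token ids and sizes are made up for the example), the metadata is placed in front of the encoded ids and the vector is zero-padded out to the fixed size:

num_samples, decode_length = 3, 100
input_ids = [17, 5, 2, 1]          # hypothetical encoded ids, EOS appended
const_array_size = 10000
packed = [num_samples, decode_length, len(input_ids)] + input_ids
packed += [0] * (const_array_size - len(packed))
assert len(packed) == const_array_size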
+ const_array_size = 10000 + while True: + prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" + " it= ('text' or 'image' or 'label')\n" + " pr= (set the problem number)\n" + " in= (set the input problem number)\n" + " ou= (set the output problem number)\n" + " ns= (changes number of samples)\n" + " dl= (changes decode legnth)\n" + " <%s> (decode)\n" + " q (quit)\n" + ">" % (num_samples, decode_length, "source_string" + if has_input else "target_prefix")) + input_string = input(prompt) + if input_string == "q": + return + elif input_string[:3] == "pr=": + problem_id = int(input_string[3:]) + p_hparams = hparams.problems[problem_id] + has_input = "inputs" in p_hparams.input_modality + vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] + elif input_string[:3] == "in=": + problem = int(input_string[3:]) + p_hparams.input_modality = hparams.problems[problem].input_modality + p_hparams.input_space_id = hparams.problems[problem].input_space_id + elif input_string[:3] == "ou=": + problem = int(input_string[3:]) + p_hparams.target_modality = hparams.problems[problem].target_modality + p_hparams.target_space_id = hparams.problems[problem].target_space_id + elif input_string[:3] == "ns=": + num_samples = int(input_string[3:]) + elif input_string[:3] == "dl=": + decode_length = int(input_string[3:]) + elif input_string[:3] == "it=": + input_type = input_string[3:] + else: + if input_type == "text": + input_ids = vocabulary.encode(input_string) + if has_input: + input_ids.append(text_encoder.EOS_ID) + x = [num_samples, decode_length, len(input_ids)] + input_ids + assert len(x) < const_array_size + x += [0] * (const_array_size - len(x)) + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } + elif input_type == "image": + input_path = input_string + img = read_image(input_path) + yield problem_id, { + "inputs": img, + "problem_choice": np.array(problem_id) + } + elif input_type == "label": + input_ids = [int(input_string)] + x = [num_samples, decode_length, len(input_ids)] + input_ids + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } + else: + raise Exception("Unsupported input type.") + + +def read_image(path): + try: + import matplotlib.image as im # pylint: disable=g-import-not-at-top + except ImportError as e: + tf.logging.warning( + "Reading an image requires matplotlib to be installed: %s", e) + raise NotImplementedError("Image reading not implemented.") + return im.imread(path) + + +def show_and_save_image(img, save_path): + try: + import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top + except ImportError as e: + tf.logging.warning("Showing and saving an image requires matplotlib to be " + "installed: %s", e) + raise NotImplementedError("Image display and save not implemented.") + plt.imshow(img) + plt.savefig(save_path) + + +def _get_sorted_inputs(filename): + """Returning inputs sorted according to length. + + Args: + filename: path to file with inputs, 1 per line. + + Returns: + a sorted list of inputs + + """ + tf.logging.info("Getting sorted inputs") + # read file and sort inputs according them according to input length. 
+ if FLAGS.decode_shards > 1: + decode_filename = filename + ("%.2d" % FLAGS.worker_id) + else: + decode_filename = filename + inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] + input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] + sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) + # We'll need the keys to rearrange the inputs back into their original order + sorted_keys = {} + sorted_inputs = [] + for i, (index, _) in enumerate(sorted_input_lens): + sorted_inputs.append(inputs[index]) + sorted_keys[index] = i + return sorted_inputs, sorted_keys + + +def _save_until_eos(hyp): + """Strips everything after the first token, which is normally 1.""" + try: + index = list(hyp).index(text_encoder.EOS_ID) + return hyp[0:index] + except ValueError: + # No EOS_ID: return the array as-is. + return hyp diff --git a/tensor2tensor/utils/devices.py b/tensor2tensor/utils/devices.py new file mode 100644 index 000000000..4f76367e9 --- /dev/null +++ b/tensor2tensor/utils/devices.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Device placement and data parallelism.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +# pylint: disable=redefined-builtin +from six.moves import xrange +# pylint: enable=redefined-builtin + +from tensor2tensor.utils import expert_utils as eu +import tensorflow as tf + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + + +def _ps_replicas(all_workers=False): + if all_workers: + return list(range(FLAGS.ps_replicas)) + # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. + num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas + return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] + + +def _gpu_order(num_gpus): + if FLAGS.gpu_order: + ret = [int(s) for s in FLAGS.gpu_order.split(" ")] + if len(ret) == num_gpus: + return ret + return list(range(num_gpus)) + + +def _ps_gpus(all_workers=False): + ps_gpus = [] + for d in _ps_replicas(all_workers=all_workers): + ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) + return ps_gpus + + +def ps_devices(all_workers=False): + """List of ps devices (where to put the experts). + + Args: + all_workers: whether the list is for all async workers or just this one. + + Returns: + a list of device names + """ + if FLAGS.ps_replicas > 0: + if FLAGS.ps_gpu > 0: + return [ + FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) + for (d, gpu) in _ps_gpus(all_workers=all_workers) + ] + else: + return [ + FLAGS.ps_job + "/task:%d" % d + for d in _ps_replicas(all_workers=all_workers) + ] + else: + if FLAGS.worker_gpu > 0: + return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + else: + return [""] + + +def data_parallelism(all_workers=False): + """Over which devices do we split each training batch. 
+ + In old-fashioned async mode, we split the batch over all GPUs on the + current worker. + + In sync mode, we split the batch over all the parameter server GPUs. + + This function returns an expert_utils.Parallelism object, which can be used + to build the model. It is configured in a way that any variables created + by `tf.get_variable` will be assigned to the parameter servers and shared + between datashards. + + Args: + all_workers: whether the devices are all async workers or just this one. + + Returns: + a expert_utils.Parallelism. + """ + + def _replica_device_setter(worker_device): + if FLAGS.ps_replicas == 0: + return worker_device + return tf.train.replica_device_setter( + worker_device=worker_device, + ps_tasks=FLAGS.ps_replicas, + ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) + + if FLAGS.schedule == "local_run": + assert not FLAGS.sync + datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] + if FLAGS.locally_shard_to_cpu: + datashard_devices += ["cpu:0"] + caching_devices = None + elif FLAGS.sync: + assert FLAGS.ps_replicas > 0 + datashard_devices = [ + _replica_device_setter(d) for d in ps_devices(all_workers=all_workers) + ] + if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: + caching_devices = [ + FLAGS.ps_job + "/task:%d/cpu:0" % d + for (d, _) in _ps_gpus(all_workers=all_workers) + ] + else: + caching_devices = None + else: + # old fashioned async - compute on worker + if FLAGS.worker_gpu > 1: + datashard_devices = [ + _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) + for d in _gpu_order(FLAGS.worker_gpu) + ] + caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu + else: + datashard_devices = [_replica_device_setter(FLAGS.worker_job)] + caching_devices = None + tf.logging.info("datashard_devices: %s", datashard_devices) + tf.logging.info("caching_devices: %s", caching_devices) + return eu.Parallelism( + datashard_devices, + reuse=True, + caching_devices=caching_devices, + daisy_chain_variables=FLAGS.daisy_chain_variables) diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py new file mode 100644 index 000000000..1fac64c8b --- /dev/null +++ b/tensor2tensor/utils/input_fn_builder.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Input function building.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensor2tensor.utils import data_reader + +import tensorflow as tf + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + + +def build_input_fn(mode, + hparams, + data_file_patterns=None, + num_datashards=None, + fixed_problem=None): + """Provides input to the graph, either from disk or via a placeholder. + + This function produces an input function that will feed data into + the network. 
There are two modes of operation: + + 1. If data_file_pattern and all subsequent arguments are None, then + it creates a placeholder for a serialized tf.Example proto. + 2. If data_file_pattern is defined, it will read the data from the + files at the given location. Use this mode for training, + evaluation, and testing prediction. + + Args: + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + hparams: HParams object. + data_file_patterns: The list of file patterns to use to read in data. Set to + `None` if you want to create a placeholder for the input data. The + `problems` flag is a list of problem names joined by the `-` character. + The flag's string is then split along the `-` and each problem gets its + own example queue. + num_datashards: An integer. + fixed_problem: An integer indicating the problem to fetch data for, or None + if the input is to be randomly selected. + + Returns: + A function that returns a dictionary of features and the target labels. + """ + + def input_fn(): + """Supplies input to our model. + + This function supplies input to our model, where this input is a + function of the mode. For example, we supply different data if + we're performing training versus evaluation. + + Returns: + A tuple consisting of 1) a dictionary of tensors whose keys are + the feature names, and 2) a tensor of target labels if the mode + is not INFER (and None, otherwise). + + Raises: + ValueError: if one of the parameters has an unsupported value. + """ + problem_count, batches = len(data_file_patterns), [] + with tf.name_scope("input_reader"): + for n in xrange(problem_count): + if fixed_problem is not None and n != fixed_problem: + continue + problem_instance = hparams.problem_instances[n] + p_hparams = hparams.problems[n] + with tf.name_scope("problem_%d" % n): + with tf.device("/cpu:0"): # Input reading on CPU + capacity = p_hparams.max_expected_batch_size_per_shard + capacity *= num_datashards + examples = data_reader.input_pipeline(problem_instance, + data_file_patterns[n], + capacity, mode, hparams) + feature_map = data_reader.batch_examples( + examples, + data_reader.hparams_to_batching_scheme( + hparams, + shard_multiplier=num_datashards, + drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + or hparams.eval_drop_long_sequences), + length_multiplier=(p_hparams.batch_size_multiplier))) + + # Reverse inputs and targets features if the problem was reversed. + if problem_instance is not None: + problem_instance.maybe_reverse_features(feature_map) + problem_instance.maybe_copy_features(feature_map) + else: + if p_hparams.was_reversed: + inputs = feature_map["inputs"] + targets = feature_map["targets"] + feature_map["inputs"] = targets + feature_map["targets"] = inputs + # Use the inputs as the targets if the problem is a copy problem. + if p_hparams.was_copy: + feature_map["targets"] = feature_map["inputs"] + + # Ensure inputs and targets are proper rank. + while len(feature_map["inputs"].get_shape()) != 4: + feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) + while len(feature_map["targets"].get_shape()) != 4: + feature_map["targets"] = tf.expand_dims( + feature_map["targets"], axis=-1) + + batches.append((feature_map["inputs"], feature_map["targets"], + tf.constant(n), tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id))) + + # We choose which problem to process. + loss_moving_avgs = [] # Need loss moving averages for that. 
+ for n in xrange(problem_count): + with tf.variable_scope("losses_avg"): + loss_moving_avgs.append( + tf.get_variable( + "problem_%d/total_loss" % n, initializer=100.0, + trainable=False)) + tf.get_variable( + "problem_%d/training_loss" % n, initializer=100.0, trainable=False) + tf.get_variable( + "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) + if fixed_problem is None: + if (hparams.problem_choice == "uniform" or + mode != tf.contrib.learn.ModeKeys.TRAIN): + problem_choice = tf.random_uniform( + [], maxval=problem_count, dtype=tf.int32) + elif hparams.problem_choice == "adaptive": + loss_moving_avgs = tf.stack(loss_moving_avgs) + problem_choice = tf.multinomial( + tf.reshape(loss_moving_avgs, [1, -1]), 1) + problem_choice = tf.to_int32(tf.squeeze(problem_choice)) + elif hparams.problem_choice == "distributed": + assert FLAGS.worker_replicas >= problem_count + assert FLAGS.worker_replicas % problem_count == 0 + problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) + else: + raise ValueError( + "Value of hparams.problem_choice is %s and must be " + "one of [uniform, adaptive, distributed]" % hparams.problem_choice) + + # Inputs and targets conditional on problem_choice. + rand_inputs, rand_target, choice, inp_id, tgt_id = cond_on_index( + lambda n: batches[n], problem_choice, 0, problem_count - 1) + else: + problem_choice = tf.constant(fixed_problem) + # Take the only constructed batch, which is the fixed_problem. + rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] + + # Set shapes so the ranks are clear. + rand_inputs.set_shape([None, None, None, None]) + rand_target.set_shape([None, None, None, None]) + choice.set_shape([]) + inp_id.set_shape([]) + tgt_id.set_shape([]) + # Forced shape obfuscation is necessary for inference. + if mode == tf.contrib.learn.ModeKeys.INFER: + rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access + rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access + + # Final feature map. 
+ rand_feature_map = { + "inputs": rand_inputs, + "problem_choice": choice, + "input_space_id": inp_id, + "target_space_id": tgt_id + } + if mode == tf.contrib.learn.ModeKeys.INFER: + rand_feature_map["infer_targets"] = rand_target + rand_target = None + return rand_feature_map, rand_target + + return input_fn + + +def cond_on_index(fn, index_tensor, cur_idx, max_idx): + """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" + if cur_idx == max_idx: + return fn(cur_idx) + return tf.cond( + tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), + lambda: cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 4435707cd..db60e07c8 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -22,7 +22,7 @@ import six -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook import tensorflow as tf diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 5c596e10f..4bcf21f4d 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -22,7 +22,7 @@ # Dependency imports -from tensor2tensor.models import common_layers +from tensor2tensor.layers import common_layers import tensorflow as tf diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py new file mode 100644 index 000000000..a12aa1122 --- /dev/null +++ b/tensor2tensor/utils/model_builder.py @@ -0,0 +1,451 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Model building.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +# Dependency imports + +import numpy as np +import six +# pylint: disable=redefined-builtin +from six.moves import xrange +# pylint: enable=redefined-builtin + +from tensor2tensor.models import models # pylint: disable=unused-import +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder +from tensor2tensor.utils import registry +from tensor2tensor.utils import yellowfin + +import tensorflow as tf +from tensorflow.python.ops import init_ops + +# TODO(rsepassi): Rm dep on FLAGS here +FLAGS = tf.flags.FLAGS + +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + + +def build_model_fn(model, hparams): + """Returns a function to build the model. + + Args: + model: The name of the model to use. + hparams: The hyperparameters. + + Returns: + A function to build the model's graph. This function is called by + the Estimator object to construct the graph. 
+ """ + + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + elif hparams.learning_rate_decay_scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) + + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + def model_fn(features, targets, mode): + """Creates the prediction, loss, and train ops. + + Args: + features: A dictionary of tensors keyed by the feature name. + targets: A tensor representing the labels (targets). + mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + + Returns: + A tuple consisting of the prediction, loss, and train_op. + """ + # Deep-copy the model hparams between modes to eliminate + # side-effects caused by abuse of the linked problem_hparams + # objects which are used to share modality objects between + # problems. We do not want to share the modality objects between + # modes, since the modality objects may decide to do something + # mode-specific. A better fix would be to stop abusing the + # hparams in this way and instead use a separate dictionary to + # share the modality objects between problems. This dictionary + # could be created once per mode and passed to the constructor of + # t2t_model. 
+ my_hp = copy.deepcopy(hparams) + if mode == tf.contrib.learn.ModeKeys.INFER: + if FLAGS.decode_interactive: + features = _interactive_input_tensor_to_features_dict(features, my_hp) + elif FLAGS.decode_from_file: + features = _decode_input_tensor_to_features_dict(features, my_hp) + # A dictionary containing: + # - problem_choice: A Tensor containing an integer indicating which problem + # was selected for this run. + # - predictions: A Tensor containing the model's output predictions. + run_info = dict() + run_info["problem_choice"] = features["problem_choice"] + + if targets is not None: + features["targets"] = targets + + dp = devices.data_parallelism() + + # Add input statistics for incoming features. + with tf.name_scope("input_stats"): + for (k, v) in six.iteritems(features): + if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: + tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) + tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) + nonpadding = tf.to_float(tf.not_equal(v, 0)) + tf.summary.scalar("%s_nonpadding_tokens" % k, + tf.reduce_sum(nonpadding)) + tf.summary.scalar("%s_nonpadding_fraction" % k, + tf.reduce_mean(nonpadding)) + + tf.get_variable_scope().set_initializer(initializer()) + train = mode == tf.contrib.learn.ModeKeys.TRAIN + + # Get multi-problem logits and loss based on features["problem_choice"]. + def nth_model(n): + """Build the model for the n-th problem, plus some added variables.""" + model_class = registry.model(model)( + my_hp, + mode, + my_hp.problems[n], + n, + dp, + devices.ps_devices(all_workers=True)) + if mode == tf.contrib.learn.ModeKeys.INFER: + return model_class.infer( + features, + beam_size=FLAGS.decode_beam_size, + top_beams=(FLAGS.decode_beam_size + if FLAGS.decode_return_beams else 1), + last_position_only=FLAGS.decode_use_last_position_only, + alpha=FLAGS.decode_alpha, + decode_length=FLAGS.decode_extra_length) + # In distributed mode, we build graph for problem=0 and problem=worker_id. + skipping_is_on = my_hp.problem_choice == "distributed" and train + problem_worker_id = FLAGS.worker_id % len(my_hp.problems) + skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id + # On worker 0 also build graph for problems <= 1. + # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. + skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) + sharded_logits, losses_dict = model_class.model_fn( + features, skip=(skipping_is_on and skip_this_one)) + with tf.variable_scope("losses_avg", reuse=True): + total_loss, ops = 0.0, [] + for loss_key, loss_value in losses_dict.iteritems(): + loss_moving_avg = tf.get_variable("problem_%d/%s_loss" % (n, + loss_key)) + ops.append( + loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1)) + total_loss += loss_value + loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) + ops.append( + loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)) + with tf.variable_scope("train_stats"): # Count steps for this problem. + problem_steps = tf.get_variable( + "problem_%d_steps" % n, initializer=0, trainable=False) + ops.append(problem_steps.assign_add(1)) + with tf.control_dependencies(ops): # Make sure the ops run. + # Ensure the loss is a scalar here. + total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") + return [total_loss] + sharded_logits # Need to flatten for cond later. 
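The cond_on_index call just below (the helper is defined in input_fn_builder earlier in this patch) expands into a chain of tf.cond branches, so all problem subgraphs are traced at construction time while only the selected one executes; conceptually it is just fn(index). A plain-Python analogue of that recursion (illustrative only):

def cond_on_index_py(fn, index, cur_idx, max_idx):
  # Mirrors input_fn_builder.cond_on_index without TensorFlow: try cur_idx,
  # otherwise recurse on cur_idx + 1 until max_idx is reached.
  if cur_idx == max_idx:
    return fn(cur_idx)
  return fn(cur_idx) if index == cur_idx else cond_on_index_py(
      fn, index, cur_idx + 1, max_idx)

assert cond_on_index_py(lambda n: n * n, 2, 0, 3) == 4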
+ + result_list = input_fn_builder.cond_on_index(nth_model, + features["problem_choice"], 0, + len(my_hp.problems) - 1) + + if mode == tf.contrib.learn.ModeKeys.INFER: + # Beam search in sequence model returns both decodes withe key "outputs" + # and scores with they key "scores". If return list is a dict, we expect + # that it will have keys "outputs", a tensor of int32 and scores, a + # tensor of floats. This is useful if we want to return scores from + # estimator.predict + if not isinstance(result_list, dict): + ret = {"outputs": result_list}, None, None + else: + ret = { + "outputs": result_list["outputs"], + "scores": result_list["scores"] + }, None, None + if "inputs" in features: + ret[0]["inputs"] = features["inputs"] + if "infer_targets" in features: + ret[0]["targets"] = features["infer_targets"] + return ret + + sharded_logits, total_loss = result_list[1:], result_list[0] + if mode == tf.contrib.learn.ModeKeys.EVAL: + logits = tf.concat(sharded_logits, 0) + if FLAGS.eval_print: + logits = tf.Print( + logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) + # For evaluation, return the logits layer as our predictions. + run_info["predictions"] = logits + train_op = None + return run_info, total_loss, None + + assert mode == tf.contrib.learn.ModeKeys.TRAIN + + # Some training statistics. + with tf.name_scope("training_stats"): + learning_rate = my_hp.learning_rate * learning_rate_decay() + learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) + tf.summary.scalar("learning_rate", learning_rate) + global_step = tf.to_float(tf.contrib.framework.get_global_step()) + for n in xrange(len(my_hp.problems)): + with tf.variable_scope("losses_avg", reuse=True): + total_loss_var = tf.get_variable("problem_%d/total_loss" % n) + training_loss_var = tf.get_variable("problem_%d/training_loss" % n) + extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) + tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) + tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) + tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + with tf.variable_scope("train_stats", reuse=True): + nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) + tf.summary.scalar("problem_%d_frequency" % n, + tf.to_float(nth_steps) / (global_step + 1.0)) + + # Log trainable weights and add decay. + total_size, weight_decay_loss = 0, 0.0 + all_weights = {v.name: v for v in tf.trainable_variables()} + for v_name in sorted(list(all_weights)): + v = all_weights[v_name] + v_size = int(np.prod(np.array(v.shape.as_list()))) + tf.logging.info("Weight %s\tshape %s\tsize %d", + v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) + total_size += v_size + if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + # Add weight regularization if set and the weight is not a bias (dim>1). + with tf.device(v._ref().device): # pylint: disable=protected-access + v_loss = tf.nn.l2_loss(v) / v_size + weight_decay_loss += v_loss + is_body = len(v_name) > 5 and v_name[:5] == "body/" + if my_hp.weight_noise > 0.0 and is_body: + # Add weight noise if set in my_hp. 
+ with tf.device(v._ref().device): # pylint: disable=protected-access + scale = learning_rate * 0.001 + noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale + noise_op = v.assign_add(noise) + with tf.control_dependencies([noise_op]): + total_loss = tf.identity(total_loss) + tf.logging.info("Total trainable variables size: %d", total_size) + if my_hp.weight_decay > 0.0: + total_loss += weight_decay_loss * my_hp.weight_decay + total_loss = tf.identity(total_loss, name="total_loss") + + # Define the train_op for the TRAIN mode. + opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) + tf.logging.info("Computing gradients for global model_fn.") + opt_summaries = ["learning_rate", "loss"] + if hparams.summarize_grads: + opt_summaries.extend(["gradients", "gradient_norm"]) + train_op = tf.contrib.layers.optimize_loss( + name="training", + loss=total_loss, + global_step=tf.contrib.framework.get_global_step(), + learning_rate=learning_rate, + clip_gradients=my_hp.clip_grad_norm or None, + gradient_noise_scale=hparams.grad_noise_scale or None, + optimizer=opt, + summaries=opt_summaries, + colocate_gradients_with_ops=True) + + # Remove summaries that will fail to run because they are in conditionals. + # TODO(cwhipkey): Test with this code removed, later in 2017. + summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) + for i in range(len(summaries) - 1, -1, -1): + if summaries[i].name.startswith("cond_"): + del summaries[i] + + tf.logging.info("Global model_fn finished.") + return run_info, total_loss, train_op + + return model_fn + + +class _ConditionalOptimizer(tf.train.Optimizer): + """Conditional optimizer.""" + + def __init__(self, optimizer_name, lr, hparams): + if optimizer_name == "Adam": + # We change the default epsilon for Adam and re-scale lr. + # Using LazyAdam as it's much faster for large vocabulary embeddings. + self._opt = tf.contrib.opt.LazyAdamOptimizer( + lr / 500.0, + beta1=hparams.optimizer_adam_beta1, + beta2=hparams.optimizer_adam_beta2, + epsilon=hparams.optimizer_adam_epsilon) + elif optimizer_name == "Momentum": + self._opt = tf.train.MomentumOptimizer( + lr, momentum=hparams.optimizer_momentum_momentum) + elif optimizer_name == "YellowFin": + tf.logging.info("Init YellowFin Optimizer.") + self._opt = yellowfin.YellowFinOptimizer( + learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) + else: + self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) + + def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): + return self._opt.compute_gradients( + loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) + + def apply_gradients(self, gradients, global_step=None, name=None): + return self._opt.apply_gradients( + gradients, global_step=global_step, name=name) + + +def _sqrt_decay(step): + """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" + return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) + + +def _exp_decay_after(step, rate, from_which_step): + """Decay exponentially by rate (per step) starting at from_which_step.""" + return tf.cond( + step < from_which_step, + lambda: tf.constant(1.0), + lambda: rate**(step - from_which_step), + name="exponential_decay_step_cond") + + +def _interactive_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. 
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False if len(inputs.shape) < 3 else True
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    if not input_is_image:
+      # Remove the batch dimension.
+      num_samples = x[0]
+      length = x[2]
+      x = tf.slice(x, [3], tf.to_int32([length]))
+      x = tf.reshape(x, [1, -1, 1, 1])
+      # Transform into a batch of size num_samples to get that many random
+      # decodes.
+      x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1]))
+    else:
+      x = tf.image.resize_images(x, [299, 299])
+      x = tf.reshape(x, [1, 299, 299, -1])
+      x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = input_fn_builder.cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = tf.constant(feature_map["problem_choice"])
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else inputs[1])
+  features["inputs"] = x
+  return features
+
+
+def _decode_input_tensor_to_features_dict(feature_map, hparams):
+  """Convert the decode-from-file input format to a features dictionary.
+
+  Args:
+    feature_map: a dictionary with keys `problem_choice` and `inputs` containing
+      Tensors.
+    hparams: model hyperparameters
+
+  Returns:
+    a features dictionary, as expected by the decoder.
+  """
+  inputs = tf.constant(feature_map["inputs"])
+  input_is_image = False
+
+  def input_fn(problem_choice, x=inputs):  # pylint: disable=missing-docstring
+    p_hparams = hparams.problems[problem_choice]
+    # Add a third, empty dimension.
+    x = tf.expand_dims(x, axis=[2])
+    x = tf.to_int32(x)
+    return (tf.constant(p_hparams.input_space_id),
+            tf.constant(p_hparams.target_space_id), x)
+
+  input_space_id, target_space_id, x = input_fn_builder.cond_on_index(
+      input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1)
+
+  features = {}
+  features["problem_choice"] = feature_map["problem_choice"]
+  features["input_space_id"] = input_space_id
+  features["target_space_id"] = target_space_id
+  features["decode_length"] = (IMAGE_DECODE_LENGTH
+                               if input_is_image else tf.shape(x)[1] + 50)
+  features["inputs"] = x
+  return features
diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py
index 9d5e1e0a6..5402e5bde 100644
--- a/tensor2tensor/utils/registry.py
+++ b/tensor2tensor/utils/registry.py
@@ -24,7 +24,7 @@ class MyModel(T2TModel):
 ```
 
 Access by snake-cased name: `registry.model("my_model")`. If you're using
-`trainer.py`, you can pass on the command-line: `--model=my_model`.
+`t2t_trainer.py`, you can pass on the command-line: `--model=my_model`.
 
 See all the models registered: `registry.list_models()`.
@@ -32,13 +32,13 @@ class MyModel(T2TModel): * Register: `registry.register_hparams` * List: `registry.list_hparams` * Retrieve by name: `registry.hparams` - * Command-line flag in `trainer.py`: `--hparams_set=name` + * Command-line flag in `t2t_trainer.py`: `--hparams_set=name` For hyperparameter ranges: * Register: `registry.register_ranged_hparams` * List: `registry.list_ranged_hparams` * Retrieve by name: `registry.ranged_hparams` - * Command-line flag in `trainer.py`: `--hparams_range=name` + * Command-line flag in `t2t_trainer.py`: `--hparams_range=name` """ from __future__ import absolute_import from __future__ import division diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 5c0240e16..c5f3296ee 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,38 +19,24 @@ from __future__ import division from __future__ import print_function -import copy -import math -import operator -import os import sys # Dependency imports -import numpy as np -import six -# pylint: disable=redefined-builtin -from six.moves import input -from six.moves import xrange -# pylint: enable=redefined-builtin - from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.data_generators import text_encoder from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import data_reader -from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import decoding +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder from tensor2tensor.utils import metrics +from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry -from tensor2tensor.utils import yellowfin import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner from tensorflow.python import debug -from tensorflow.python.ops import init_ops - -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 flags = tf.flags FLAGS = flags.FLAGS @@ -134,16 +120,6 @@ flags.DEFINE_bool("identity_output", False, "To print the output as identity") -def _save_until_eos(hyp): - """Strips everything after the first token, which is normally 1.""" - try: - index = list(hyp).index(text_encoder.EOS_ID) - return hyp[0:index] - except ValueError: - # No EOS_ID: return the array as-is. - return hyp - - def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): """Returns experiment_fn for learn_runner. 
Wraps create_experiment.""" @@ -195,22 +171,22 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): """Constructs and returns Estimator and train/eval input functions.""" tf.logging.info("Creating experiment, storing model files in %s", output_dir) - num_datashards = data_parallelism().n - train_input_fn = get_input_fn( + num_datashards = devices.data_parallelism().n + train_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), num_datashards=num_datashards) - eval_input_fn = get_input_fn( + eval_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), num_datashards=num_datashards) estimator = tf.contrib.learn.Estimator( - model_fn=model_builder(model_name, hparams=hparams), + model_fn=model_builder.build_model_fn(model_name, hparams=hparams), model_dir=output_dir, config=tf.contrib.learn.RunConfig( master=FLAGS.master, @@ -222,7 +198,8 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): estimator.hparams = hparams return estimator, { tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, - tf.contrib.learn.ModeKeys.EVAL: eval_input_fn} + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + } def log_registry(): @@ -297,7 +274,11 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if schedule == "local_run": # Run the local demo. - run_locally(exp_fn(output_dir)) + exp = exp_fn(output_dir) + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") + exp.train_and_evaluate() + decode(exp.estimator) else: # Perform distributed training/evaluation. learn_runner.run( @@ -342,1040 +323,14 @@ def session_config(): return config -def model_builder(model, hparams): - """Returns a function to build the model. - - Args: - model: The name of the model to use. - hparams: The hyperparameters. - - Returns: - A function to build the model's graph. This function is called by - the Estimator object to construct the graph. 
- """ - - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): - """Creates the prediction, loss, and train ops. - - Args: - features: A dictionary of tensors keyed by the feature name. - targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - - Returns: - A tuple consisting of the prediction, loss, and train_op. - """ - # Deep-copy the model hparams between modes to eliminate - # side-effects caused by abuse of the linked problem_hparams - # objects which are used to share modality objects between - # problems. We do not want to share the modality objects between - # modes, since the modality objects may decide to do something - # mode-specific. A better fix would be to stop abusing the - # hparams in this way and instead use a separate dictionary to - # share the modality objects between problems. This dictionary - # could be created once per mode and passed to the constructor of - # t2t_model. 
- my_hp = copy.deepcopy(hparams) - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, my_hp) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, my_hp) - # A dictionary containing: - # - problem_choice: A Tensor containing an integer indicating which problem - # was selected for this run. - # - predictions: A Tensor containing the model's output predictions. - run_info = dict() - run_info["problem_choice"] = features["problem_choice"] - - if targets is not None: - features["targets"] = targets - - dp = data_parallelism() - - # Add input statistics for incoming features. - with tf.name_scope("input_stats"): - for (k, v) in six.iteritems(features): - if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: - tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) - tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) - nonpadding = tf.to_float(tf.not_equal(v, 0)) - tf.summary.scalar("%s_nonpadding_tokens" % k, - tf.reduce_sum(nonpadding)) - tf.summary.scalar("%s_nonpadding_fraction" % k, - tf.reduce_mean(nonpadding)) - - tf.get_variable_scope().set_initializer(initializer()) - train = mode == tf.contrib.learn.ModeKeys.TRAIN - - # Get multi-problem logits and loss based on features["problem_choice"]. - def nth_model(n): - """Build the model for the n-th problem, plus some added variables.""" - model_class = registry.model(model)( - my_hp, - mode, - my_hp.problems[n], - n, - dp, - _ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: - return model_class.infer( - features, - beam_size=FLAGS.decode_beam_size, - top_beams=(FLAGS.decode_beam_size - if FLAGS.decode_return_beams else 1), - last_position_only=FLAGS.decode_use_last_position_only, - alpha=FLAGS.decode_alpha, - decode_length=FLAGS.decode_extra_length) - # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = my_hp.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(my_hp.problems) - skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id - # On worker 0 also build graph for problems <= 1. - # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. - skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, losses_dict = model_class.model_fn( - features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): - total_loss, ops = 0.0, [] - for loss_key, loss_value in losses_dict.iteritems(): - loss_moving_avg = tf.get_variable("problem_%d/%s_loss" - % (n, loss_key)) - ops.append(loss_moving_avg.assign( - loss_moving_avg * 0.9 + loss_value * 0.1)) - total_loss += loss_value - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - ops.append(loss_moving_avg.assign( - loss_moving_avg * 0.9 + total_loss * 0.1)) - with tf.variable_scope("train_stats"): # Count steps for this problem. - problem_steps = tf.get_variable( - "problem_%d_steps" % n, initializer=0, trainable=False) - ops.append(problem_steps.assign_add(1)) - with tf.control_dependencies(ops): # Make sure the ops run. - # Ensure the loss is a scalar here. - total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") - return [total_loss] + sharded_logits # Need to flatten for cond later. 
- - result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(my_hp.problems) - 1) - - if mode == tf.contrib.learn.ModeKeys.INFER: - # Beam search in sequence model returns both decodes withe key "outputs" - # and scores with they key "scores". If return list is a dict, we expect - # that it will have keys "outputs", a tensor of int32 and scores, a - # tensor of floats. This is useful if we want to return scores from - # estimator.predict - if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None - else: - ret = { - "outputs": result_list["outputs"], - "scores": result_list["scores"] - }, None, None - if "inputs" in features: - ret[0]["inputs"] = features["inputs"] - if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret - - sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: - logits = tf.concat(sharded_logits, 0) - if FLAGS.eval_print: - logits = tf.Print( - logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) - # For evaluation, return the logits layer as our predictions. - run_info["predictions"] = logits - train_op = None - return run_info, total_loss, None - - assert mode == tf.contrib.learn.ModeKeys.TRAIN - - # Some training statistics. - with tf.name_scope("training_stats"): - learning_rate = my_hp.learning_rate * learning_rate_decay() - learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) - tf.summary.scalar("learning_rate", learning_rate) - global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(my_hp.problems)): - with tf.variable_scope("losses_avg", reuse=True): - total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) - with tf.variable_scope("train_stats", reuse=True): - nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) - tf.summary.scalar("problem_%d_frequency" % n, - tf.to_float(nth_steps) / (global_step + 1.0)) - - # Log trainable weights and add decay. - total_size, weight_decay_loss = 0, 0.0 - all_weights = {v.name: v for v in tf.trainable_variables()} - for v_name in sorted(list(all_weights)): - v = all_weights[v_name] - v_size = int(np.prod(np.array(v.shape.as_list()))) - tf.logging.info("Weight %s\tshape %s\tsize %d", - v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) - total_size += v_size - if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: - # Add weight regularization if set and the weight is not a bias (dim>1). - with tf.device(v._ref().device): # pylint: disable=protected-access - v_loss = tf.nn.l2_loss(v) / v_size - weight_decay_loss += v_loss - is_body = len(v_name) > 5 and v_name[:5] == "body/" - if my_hp.weight_noise > 0.0 and is_body: - # Add weight noise if set in my_hp. 
- with tf.device(v._ref().device): # pylint: disable=protected-access - scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale - noise_op = v.assign_add(noise) - with tf.control_dependencies([noise_op]): - total_loss = tf.identity(total_loss) - tf.logging.info("Total trainable variables size: %d", total_size) - if my_hp.weight_decay > 0.0: - total_loss += weight_decay_loss * my_hp.weight_decay - total_loss = tf.identity(total_loss, name="total_loss") - - # Define the train_op for the TRAIN mode. - opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) - tf.logging.info("Computing gradients for global model_fn.") - opt_summaries = ["learning_rate", "loss"] - if hparams.summarize_grads: - opt_summaries.extend(["gradients", "gradient_norm"]) - train_op = tf.contrib.layers.optimize_loss( - name="training", - loss=total_loss, - global_step=tf.contrib.framework.get_global_step(), - learning_rate=learning_rate, - clip_gradients=my_hp.clip_grad_norm or None, - gradient_noise_scale=hparams.grad_noise_scale or None, - optimizer=opt, - summaries=opt_summaries, - colocate_gradients_with_ops=True) - - # Remove summaries that will fail to run because they are in conditionals. - # TODO(cwhipkey): Test with this code removed, later in 2017. - summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries) - 1, -1, -1): - if summaries[i].name.startswith("cond_"): - del summaries[i] - - tf.logging.info("Global model_fn finished.") - return run_info, total_loss, train_op - - return model_fn - - -def run_locally(exp): - """Runs an Experiment locally - trains, evaluates, and decodes. - - Args: - exp: Experiment. - """ - if exp.train_steps > 0 or exp.eval_steps > 0: - tf.logging.info("Performing local training and evaluation.") - exp.train_and_evaluate() - decode(exp.estimator) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def decode(estimator): if FLAGS.decode_interactive: - decode_interactively(estimator) + decoding.decode_interactively(estimator) elif FLAGS.decode_from_file is not None: - decode_from_file(estimator, FLAGS.decode_from_file) + decoding.decode_from_file(estimator, FLAGS.decode_from_file) elif FLAGS.decode_from_dataset: - decode_from_dataset(estimator) - - -def decode_from_dataset(estimator): - hparams = estimator.hparams - for i, problem in enumerate(FLAGS.problems.split("-")): - inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) - targets_vocab = hparams.problems[i].vocabulary["targets"] - tf.logging.info("Performing local inference.") - infer_problems_data = get_data_filepatterns(hparams.data_dir, - tf.contrib.learn.ModeKeys.INFER) - - infer_input_fn = get_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, - hparams=hparams, - data_file_patterns=infer_problems_data, - num_datashards=data_parallelism().n, - fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) - - def log_fn(inputs, - targets, - outputs, - problem, - j, - inputs_vocab=inputs_vocab, - targets_vocab=targets_vocab): - """Log inference results.""" - if "image" in problem and FLAGS.decode_save_images: - save_path = os.path.join(estimator.model_dir, - "%s_prediction_%d.jpg" % (problem, j)) - show_and_save_image(inputs / 255., save_path) - elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = 
targets_vocab.decode(_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) - - if FLAGS.decode_to_file: - output_filepath = FLAGS.decode_to_file + ".outputs." + problem - output_file = tf.gfile.Open(output_filepath, "a") - output_file.write(decoded_outputs + "\n") - target_filepath = FLAGS.decode_to_file + ".targets." + problem - target_file = tf.gfile.Open(target_filepath, "a") - target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. - inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result - if FLAGS.decode_return_beams: - output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) - else: - log_fn(inputs, targets, outputs, problem, j) - - -def decode_from_file(estimator, filename): - """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams - problem_id = FLAGS.decode_problem_id - inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - tf.logging.info("Performing decoding from a file.") - sorted_inputs, sorted_keys = _get_sorted_inputs(filename) - num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - _save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) - - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) - - # Reversing the decoded inputs and outputs because they were reversed in - # _decode_batch_input_fn - sorted_inputs.reverse() - decodes.reverse() - # Dumping inputs and outputs to file filename.decodes in - # format result\tinput in the same order as original inputs - if FLAGS.decode_to_file: - output_filename = FLAGS.decode_to_file - else: - output_filename = filename - if FLAGS.decode_shards > 1: - base_filename = output_filename + ("%.2d" % FLAGS.worker_id) - else: - base_filename = output_filename - decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set - + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + - str(FLAGS.decode_alpha) + ".decodes") - tf.logging.info("Writing decodes into %s" % decode_filename) - outfile = tf.gfile.Open(decode_filename, "w") - for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) - - -def decode_interactively(estimator): - hparams = estimator.hparams - - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - tf.logging.info(beam_string) - else: - if FLAGS.identity_output: - tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) - else: - tf.logging.info(targets_vocab.decode(_save_until_eos( - result["outputs"].flatten()))) - - -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, - vocabulary): - tf.logging.info(" batch %d" % num_decode_batches) - # First reverse all the input sentences so that if you're going to get OOMs, - # you'll see it in the first batch - sorted_inputs.reverse() - for b in range(num_decode_batches): - tf.logging.info("Decoding batch %d" % b) - batch_length = 0 - batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( - b + 1) * FLAGS.decode_batch_size]: - input_ids = vocabulary.encode(inputs) - if FLAGS.decode_max_input_size > 0: - # Subtract 1 for the EOS_ID. - input_ids = input_ids[:FLAGS.decode_max_input_size - 1] - input_ids.append(text_encoder.EOS_ID) - batch_inputs.append(input_ids) - if len(input_ids) > batch_length: - batch_length = len(input_ids) - final_batch_inputs = [] - for input_ids in batch_inputs: - assert len(input_ids) <= batch_length - x = input_ids + [0] * (batch_length - len(input_ids)) - final_batch_inputs.append(x) - yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) - } - - -def get_data_filepatterns(data_dir, mode): - return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) - - -def _cond_on_index(fn, index_tensor, cur_idx, max_idx): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - return tf.cond( - tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), - lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) - - -def _interactive_input_fn(hparams): - """Generator that reads from the terminal and yields "interactive inputs". - - Due to temporary limitations in tf.learn, if we don't want to reload the - whole graph, then we are stuck encoding all of the input as one fixed-size - numpy array. - - We yield int64 arrays with shape [const_array_size]. The format is: - [num_samples, decode_length, len(input ids), , ] - - Args: - hparams: model hparams - Yields: - numpy arrays - - Raises: - Exception: when `input_type` is invalid. 
- """ - num_samples = 3 - decode_length = 100 - input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - # This should be longer than the longest input. - const_array_size = 10000 - while True: - prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image' or 'label')\n" - " pr= (set the problem number)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" - " <%s> (decode)\n" - " q (quit)\n" - ">" % (num_samples, decode_length, "source_string" - if has_input else "target_prefix")) - input_string = input(prompt) - if input_string == "q": - return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id - elif input_string[:3] == "ns=": - num_samples = int(input_string[3:]) - elif input_string[:3] == "dl=": - decode_length = int(input_string[3:]) - elif input_string[:3] == "it=": - input_type = input_string[3:] - else: - if input_type == "text": - input_ids = vocabulary.encode(input_string) - if has_input: - input_ids.append(text_encoder.EOS_ID) - x = [num_samples, decode_length, len(input_ids)] + input_ids - assert len(x) < const_array_size - x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - elif input_type == "image": - input_path = input_string - img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) - } - elif input_type == "label": - input_ids = [int(input_string)] - x = [num_samples, decode_length, len(input_ids)] + input_ids - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - else: - raise Exception("Unsupported input type.") - - -def read_image(path): - try: - import matplotlib.image as im # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning( - "Reading an image requires matplotlib to be installed: %s", e) - raise NotImplementedError("Image reading not implemented.") - return im.imread(path) - - -def show_and_save_image(img, save_path): - try: - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning("Showing and saving an image requires matplotlib to be " - "installed: %s", e) - raise NotImplementedError("Image display and save not implemented.") - plt.imshow(img) - plt.savefig(save_path) - - -def _get_sorted_inputs(filename): - """Returning inputs sorted according to length. - - Args: - filename: path to file with inputs, 1 per line. - - Returns: - a sorted list of inputs - - """ - tf.logging.info("Getting sorted inputs") - # read file and sort inputs according them according to input length. 
- if FLAGS.decode_shards > 1: - decode_filename = filename + ("%.2d" % FLAGS.worker_id) - else: - decode_filename = filename - inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] - input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] - sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) - # We'll need the keys to rearrange the inputs back into their original order - sorted_keys = {} - sorted_inputs = [] - for i, (index, _) in enumerate(sorted_input_lens): - sorted_inputs.append(inputs[index]) - sorted_keys[index] = i - return sorted_inputs, sorted_keys - - -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features - - -def get_input_fn(mode, - hparams, - data_file_patterns=None, - num_datashards=None, - fixed_problem=None): - """Provides input to the graph, either from disk or via a placeholder. 
- - This function produces an input function that will feed data into - the network. There are two modes of operation: - - 1. If data_file_pattern and all subsequent arguments are None, then - it creates a placeholder for a serialized tf.Example proto. - 2. If data_file_pattern is defined, it will read the data from the - files at the given location. Use this mode for training, - evaluation, and testing prediction. - - Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - hparams: HParams object. - data_file_patterns: The list of file patterns to use to read in data. Set to - `None` if you want to create a placeholder for the input data. The - `problems` flag is a list of problem names joined by the `-` character. - The flag's string is then split along the `-` and each problem gets its - own example queue. - num_datashards: An integer. - fixed_problem: An integer indicating the problem to fetch data for, or None - if the input is to be randomly selected. - - Returns: - A function that returns a dictionary of features and the target labels. - """ - - def input_fn(): - """Supplies input to our model. - - This function supplies input to our model, where this input is a - function of the mode. For example, we supply different data if - we're performing training versus evaluation. - - Returns: - A tuple consisting of 1) a dictionary of tensors whose keys are - the feature names, and 2) a tensor of target labels if the mode - is not INFER (and None, otherwise). - - Raises: - ValueError: if one of the parameters has an unsupported value. - """ - problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_reader"): - for n in xrange(problem_count): - if fixed_problem is not None and n != fixed_problem: - continue - problem_instance = hparams.problem_instances[n] - p_hparams = hparams.problems[n] - with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input reading on CPU - capacity = p_hparams.max_expected_batch_size_per_shard - capacity *= num_datashards - examples = data_reader.input_pipeline(problem_instance, - data_file_patterns[n], - capacity, mode, hparams) - feature_map = data_reader.batch_examples( - examples, - data_reader.hparams_to_batching_scheme( - hparams, - shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN - or hparams.eval_drop_long_sequences), - length_multiplier=(p_hparams.batch_size_multiplier))) - - # Reverse inputs and targets features if the problem was reversed. - if problem_instance is not None: - problem_instance.maybe_reverse_features(feature_map) - problem_instance.maybe_copy_features(feature_map) - else: - if p_hparams.was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - # Use the inputs as the targets if the problem is a copy problem. - if p_hparams.was_copy: - feature_map["targets"] = feature_map["inputs"] - - # Ensure inputs and targets are proper rank. - while len(feature_map["inputs"].get_shape()) != 4: - feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) - while len(feature_map["targets"].get_shape()) != 4: - feature_map["targets"] = tf.expand_dims( - feature_map["targets"], axis=-1) - - batches.append( - (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id))) - - # We choose which problem to process. 
- loss_moving_avgs = [] # Need loss moving averages for that. - for n in xrange(problem_count): - with tf.variable_scope("losses_avg"): - loss_moving_avgs.append( - tf.get_variable( - "problem_%d/total_loss" % n, initializer=100.0, - trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) - if fixed_problem is None: - if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): - problem_choice = tf.random_uniform( - [], maxval=problem_count, dtype=tf.int32) - elif hparams.problem_choice == "adaptive": - loss_moving_avgs = tf.stack(loss_moving_avgs) - problem_choice = tf.multinomial( - tf.reshape(loss_moving_avgs, [1, -1]), 1) - problem_choice = tf.to_int32(tf.squeeze(problem_choice)) - elif hparams.problem_choice == "distributed": - assert FLAGS.worker_replicas >= problem_count - assert FLAGS.worker_replicas % problem_count == 0 - problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) - else: - raise ValueError( - "Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % hparams.problem_choice) - - # Inputs and targets conditional on problem_choice. - rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( - lambda n: batches[n], problem_choice, 0, problem_count - 1) - else: - problem_choice = tf.constant(fixed_problem) - # Take the only constructed batch, which is the fixed_problem. - rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] - - # Set shapes so the ranks are clear. - rand_inputs.set_shape([None, None, None, None]) - rand_target.set_shape([None, None, None, None]) - choice.set_shape([]) - inp_id.set_shape([]) - tgt_id.set_shape([]) - # Forced shape obfuscation is necessary for inference. - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - - # Final feature map. - rand_feature_map = { - "inputs": rand_inputs, - "problem_choice": choice, - "input_space_id": inp_id, - "target_space_id": tgt_id - } - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_feature_map["infer_targets"] = rand_target - rand_target = None - return rand_feature_map, rand_target - - return input_fn - - -class _ConditionalOptimizer(tf.train.Optimizer): - """Conditional optimizer.""" - - def __init__(self, optimizer_name, lr, hparams): - if optimizer_name == "Adam": - # We change the default epsilon for Adam and re-scale lr. - # Using LazyAdam as it's much faster for large vocabulary embeddings. 
- self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, - beta1=hparams.optimizer_adam_beta1, - beta2=hparams.optimizer_adam_beta2, - epsilon=hparams.optimizer_adam_epsilon) - elif optimizer_name == "Momentum": - self._opt = tf.train.MomentumOptimizer( - lr, momentum=hparams.optimizer_momentum_momentum) - elif optimizer_name == "YellowFin": - tf.logging.info("Init YellowFin Optimizer.") - self._opt = yellowfin.YellowFinOptimizer( - learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) - else: - self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) - - def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): - return self._opt.compute_gradients( - loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) - - def apply_gradients(self, gradients, global_step=None, name=None): - return self._opt.apply_gradients( - gradients, global_step=global_step, name=name) - - -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def _ps_replicas(all_workers=False): - if all_workers: - return list(range(FLAGS.ps_replicas)) - # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. - num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas - return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] - - -def _gpu_order(num_gpus): - if FLAGS.gpu_order: - ret = [int(s) for s in FLAGS.gpu_order.split(" ")] - if len(ret) == num_gpus: - return ret - return list(range(num_gpus)) - - -def _ps_gpus(all_workers=False): - ps_gpus = [] - for d in _ps_replicas(all_workers=all_workers): - ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) - return ps_gpus - - -def _ps_devices(all_workers=False): - """List of ps devices (where to put the experts). - - Args: - all_workers: whether the list is for all async workers or just this one. - - Returns: - a list of device names - """ - if FLAGS.ps_replicas > 0: - if FLAGS.ps_gpu > 0: - return [ - FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) - for (d, gpu) in _ps_gpus(all_workers=all_workers) - ] - else: - return [ - FLAGS.ps_job + "/task:%d" % d - for d in _ps_replicas(all_workers=all_workers) - ] - else: - if FLAGS.worker_gpu > 0: - return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - else: - return [""] - - -def data_parallelism(all_workers=False): - """Over which devices do we split each training batch. - - In old-fashioned async mode, we split the batch over all GPUs on the - current worker. - - In sync mode, we split the batch over all the parameter server GPUs. - - This function returns an expert_utils.Parallelism object, which can be used - to build the model. It is configured in a way that any variables created - by `tf.get_variable` will be assigned to the parameter servers and shared - between datashards. - - Args: - all_workers: whether the devices are all async workers or just this one. - - Returns: - a expert_utils.Parallelism. 
- """ - - def _replica_device_setter(worker_device): - if FLAGS.ps_replicas == 0: - return worker_device - return tf.train.replica_device_setter( - worker_device=worker_device, - ps_tasks=FLAGS.ps_replicas, - ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) - - if FLAGS.schedule == "local_run": - assert not FLAGS.sync - datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: - datashard_devices += ["cpu:0"] - caching_devices = None - elif FLAGS.sync: - assert FLAGS.ps_replicas > 0 - datashard_devices = [ - _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) - ] - if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: - caching_devices = [ - FLAGS.ps_job + "/task:%d/cpu:0" % d - for (d, _) in _ps_gpus(all_workers=all_workers) - ] - else: - caching_devices = None - else: - # old fashioned async - compute on worker - if FLAGS.worker_gpu > 1: - datashard_devices = [ - _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) - for d in _gpu_order(FLAGS.worker_gpu) - ] - caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu - else: - datashard_devices = [_replica_device_setter(FLAGS.worker_job)] - caching_devices = None - tf.logging.info("datashard_devices: %s", datashard_devices) - tf.logging.info("caching_devices: %s", caching_devices) - return eu.Parallelism( - datashard_devices, - reuse=True, - caching_devices=caching_devices, - daisy_chain_variables=FLAGS.daisy_chain_variables) + decoding.decode_from_dataset(estimator) diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 562279623..8a71afe68 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -25,7 +25,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.models import transformer from tensor2tensor.utils import registry -from tensor2tensor.utils import trainer_utils as utils # pylint: disable=unused-import +from tensor2tensor.utils import trainer_utils import tensorflow as tf @@ -76,7 +76,7 @@ def testHParamsImported(self): def testSingleStep(self): model_name = "transformer" FLAGS.hparams_set = "transformer_test" - exp = utils.create_experiment( + exp = trainer_utils.create_experiment( output_dir=tf.test.get_temp_dir(), data_dir=TrainerUtilsTest.data_dir, model_name=model_name, From fbe8c61a1aaea95b9b32fb56d49c1e790660ea09 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 17:13:21 -0700 Subject: [PATCH 5/6] v1.1.5 PiperOrigin-RevId: 164061568 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fd8e77a46..38b2fcc48 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.4', + version='1.1.5', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From eee190b3b770d917931b3ccb3972109b27b48f6d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 2 Aug 2017 17:25:40 -0700 Subject: [PATCH 6/6] Add layers init and update gitignore for nose --- .gitignore | 1 + tensor2tensor/layers/__init__.py | 0 2 files changed, 1 insertion(+) create mode 100644 tensor2tensor/layers/__init__.py diff --git a/.gitignore b/.gitignore index c9dd3db88..362753caa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ _pycache__/ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info +/*.egg # PyPI distribution artifacts. 
build/ diff --git a/tensor2tensor/layers/__init__.py b/tensor2tensor/layers/__init__.py new file mode 100644 index 000000000..e69de29bb