diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index 8c2d75fbe..a5d4816b7 100644
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -29,7 +29,7 @@
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3
 
-from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder
+from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.tokenizer import Tokenizer
 
 import tensorflow as tf
@@ -218,15 +218,18 @@ def gunzip_file(gz_path, new_path):
 ]
 
 
-def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
-  """Generate a vocabulary from the datasets listed in _DATA_FILE_URLS."""
+def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None):
+  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
   vocab_filepath = os.path.join(tmp_dir, vocab_filename)
   if os.path.exists(vocab_filepath):
-    vocab = SubwordTextEncoder(vocab_filepath)
+    tf.logging.info("Found vocab file: %s", vocab_filepath)
+    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
     return vocab
 
+  sources = sources or _DATA_FILE_URLS
+  tf.logging.info("Generating vocab from: %s", str(sources))
   tokenizer = Tokenizer()
-  for source in _DATA_FILE_URLS:
+  for source in sources:
     url = source[0]
     filename = os.path.basename(url)
     read_type = "r:gz" if "tgz" in filename else "r"
@@ -259,9 +262,9 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
             break
           line = line.strip()
           file_byte_budget -= len(line)
-          _ = tokenizer.encode(line)
+          _ = tokenizer.encode(text_encoder.native_to_unicode(line))
 
-  vocab = SubwordTextEncoder.build_to_target_size(
+  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
       vocab_size, tokenizer.token_counts, 1, 1e3)
   vocab.store_to_file(vocab_filepath)
   return vocab
diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py
index 1a61a6690..7ad0a57ad 100644
--- a/tensor2tensor/data_generators/problem_hparams.py
+++ b/tensor2tensor/data_generators/problem_hparams.py
@@ -664,8 +664,17 @@ def image_mscoco_tokens(model_hparams, vocab_count):
   }
   p.batch_size_multiplier = 256
   p.max_expected_batch_size_per_shard = 2
+
+
+def img2img_imagenet(unused_model_hparams):
+  """Image 2 Image for imagenet dataset."""
+  p = default_problem_hparams()
+  p.input_modality = {"inputs": ("image:identity", None)}
+  p.target_modality = ("image:identity", None)
+  p.batch_size_multiplier = 256
+  p.max_expected_batch_size_per_shard = 4
   p.input_space_id = 1
-  p.target_space_id = 3
+  p.target_space_id = 1
   return p
 
 
@@ -732,4 +741,5 @@ def image_mscoco_tokens(model_hparams, vocab_count):
     "image_mscoco_tokens_128k_tune": lambda p: image_mscoco_tokens(p, 2**17),
     "image_mscoco_tokens_128k_test": lambda p: image_mscoco_tokens(p, 2**17),
     "image_imagenet": image_imagenet,
+    "img2img_imagenet": img2img_imagenet,
 }
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 7b00a85d2..0a05cb721 100644
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -36,10 +36,10 @@
 
 
 # Conversion between Unicode and UTF-8, if required (on Python2)
-_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
+native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
 
 
-_unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
+unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
 
 
 # Reserved tokens for things like padding and EOS symbols.
@@ -220,7 +220,7 @@ def encode(self, raw_text):
       a list of integers in the range [0, vocab_size)
     """
     return self._tokens_to_subtokens(self._tokenizer.encode(
-        _native_to_unicode(raw_text)))
+        native_to_unicode(raw_text)))
 
   def decode(self, subtokens):
     """Converts a sequence of subtoken ids to a native string.
@@ -230,7 +230,7 @@ def decode(self, subtokens):
     Returns:
       a native string
     """
-    return _unicode_to_native(self._tokenizer.decode(
+    return unicode_to_native(self._tokenizer.decode(
         self._subtokens_to_tokens(subtokens)))
 
   @property
@@ -335,6 +335,9 @@ def bisect(min_val, max_val):
       else:
         other_subtokenizer = bisect(min_val, present_count - 1)
 
+      if other_subtokenizer is None:
+        return subtokenizer
+
       if (abs(other_subtokenizer.vocab_size - target_size) <
           abs(subtokenizer.vocab_size - target_size)):
         return other_subtokenizer
@@ -449,13 +452,13 @@ def _load_from_file(self, filename):
     subtoken_strings = []
     with tf.gfile.Open(filename) as f:
       for line in f:
-        subtoken_strings.append(_native_to_unicode(line.strip()[1:-1]))
+        subtoken_strings.append(native_to_unicode(line.strip()[1:-1]))
     self._init_from_list(subtoken_strings)
 
   def store_to_file(self, filename):
     with tf.gfile.Open(filename, "w") as f:
       for subtoken_string in self._all_subtoken_strings:
-        f.write("'" + _unicode_to_native(subtoken_string) + "'\n")
+        f.write("'" + unicode_to_native(subtoken_string) + "'\n")
 
   def _escape_token(self, token):
     r"""Escape away underscores and OOV characters and append '_'.
@@ -524,7 +527,7 @@ def get_token_counts(cls, text_filepattern, corpus_max_lines):
       with tf.gfile.Open(text_filename) as f:
         for line in f:
           # The tokenizer updates token_counts in encode()
-          tok.encode(_native_to_unicode(line.strip()))
+          tok.encode(native_to_unicode(line.strip()))
           lines_read += 1
           if corpus_max_lines > 0 and lines_read > corpus_max_lines:
             return tok.token_counts
diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py
index 253b79944..c0533ee42 100644
--- a/tensor2tensor/models/bluenet.py
+++ b/tensor2tensor/models/bluenet.py
@@ -18,9 +18,12 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 # Dependency imports
 
-import collections
+import numpy as np
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensor2tensor.models import common_hparams
@@ -28,16 +31,17 @@
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
-import numpy as np
 import tensorflow as tf
 
+
 # var:          1d tensor, raw weights for each choice
 # tempered_var: raw weights with temperature applied
 # inv_t:        inverse of the temperature to use when normalizing `var`
 # normalized:   same shape as var, but where each item is between 0 and 1, and
 #               the sum is 1
 SelectionWeights = collections.namedtuple(
-    'SelectionWeights', ['var', 'tempered_var', 'inv_t', 'normalized'])
+    "SelectionWeights", ["var", "tempered_var", "inv_t", "normalized"])
+
 
 def create_selection_weights(name,
                              type_,
@@ -50,19 +54,20 @@ def create_selection_weights(name,
 
   Args:
     name: Name for the underlying variable containing the unnormalized weights.
-    type_: 'softmax' or 'sigmoid' or ('softmax_topk', k) where k is an int.
+    type_: "softmax" or "sigmoid" or ("softmax_topk", k) where k is an int.
     shape: Shape for the variable.
+    inv_t: Inverse of the temperature to use in normalization.
     initializer: Initializer for the variable, passed to `tf.get_variable`.
     regularizer: Regularizer for the variable. A callable which accepts
       `tempered_var` and `normalized`.
-    inv_t: Inverse of the temperature to use in normalization.
     names: Name of each selection.
 
   Returns:
     The created SelectionWeights tuple.
-  """
-
 
+  Raises:
+    ValueError: if type_ is not in the supported range.
+  """
   var = tf.get_variable(name, shape, initializer=initializer)
 
   if callable(inv_t):
@@ -72,22 +77,21 @@ def create_selection_weights(name,
   else:
     tempered_var = var * inv_t
 
-  if type_ == 'softmax':
+  if type_ == "softmax":
     weights = tf.nn.softmax(tempered_var)
-  elif type_ == 'sigmoid':
+  elif type_ == "sigmoid":
     weights = tf.nn.sigmoid(tempered_var)
-  elif isinstance(type_, (list, tuple)) and type_[0] == 'softmax_topk':
+  elif isinstance(type_, (list, tuple)) and type_[0] == "softmax_topk":
     assert len(shape) == 1
-
     # TODO(rshin): Change this to select without replacement?
-    selection = tf.multinomial(tf.expand_dims(var, axis=0), k)
+    selection = tf.multinomial(tf.expand_dims(var, axis=0), 4)
     selection = tf.squeeze(selection, axis=0)   # [k] selected classes.
     to_run = tf.one_hot(selection, shape[0])  # [k x nmodules] one-hot.
     # [nmodules], 0=not run, 1=run.
     to_run = tf.minimum(tf.reduce_sum(to_run, axis=0), 1)
     weights = tf.nn.softmax(tempered_var - 1e9 * (1.0 - to_run))
   else:
-    return ValueError(type)
+    raise ValueError("Unknown type: %s" % type_)
 
   if regularizer is not None:
     loss = regularizer(tempered_var, weights)
@@ -95,10 +99,10 @@ def create_selection_weights(name,
       tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, loss)
 
   if names is not None:
-    tf.get_collection_ref('selection_weight_names/' + var.name).extend(
+    tf.get_collection_ref("selection_weight_names/" + var.name).extend(
         names.flatten()
         if isinstance(names, np.ndarray) else names)
-    tf.add_to_collection('selection_weight_names_tensor/' + var.name,
+    tf.add_to_collection("selection_weight_names_tensor/" + var.name,
                          tf.constant(names))
 
   return SelectionWeights(
@@ -110,7 +114,7 @@ def create_selection_weights(name,
 
 def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels,
                          kernel_selection_weights, channel_selection_weights):
-  '''Get weights to multiply the kernel with, before convolving.
+  """Get weights to multiply the kernel with, before convolving.
 
   Args:
     max_kernel_size: (int, int) tuple giving the largest kernel size.
@@ -122,9 +126,10 @@ def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels,
       among kernel sizes.
     channel_selection_weights: SelectionWeights object to use for choosing
       among which input channels to use.
-  '''
-
 
+  Returns:
+    The multiplier.
+  """
   kernel_weights = []
   for kernel_i, (h, w) in enumerate(kernel_sizes):
     top = (max_kernel_size[0] - h) // 2
@@ -153,6 +158,7 @@ def kernel_premultiplier(max_kernel_size, kernel_sizes, input_channels,
                 tf.reshape(channel_weight, (1, 1, -1, 1)))
   return multiplier
 
+
 def make_subseparable_kernel(
     kernel_size,
     input_channels,
@@ -160,7 +166,7 @@ def make_subseparable_kernel(
     separability,
     kernel_initializer,
     kernel_regularizer):
-  '''Make a kernel to do subseparable convolution wiht  `tf.nn.conv2d`.
+  """Make a kernel to do subseparable convolution wiht  `tf.nn.conv2d`.
 
   Args:
     kernel_size: (height, width) tuple.
@@ -172,12 +178,11 @@ def make_subseparable_kernel(
 
   Returns:
     A 4D tensor.
-  '''
-
+  """
   if separability == 1:
     # Non-separable convolution
     return tf.get_variable(
-        'kernel',
+        "kernel",
         kernel_size + (input_channels, filters),
         initializer=kernel_initializer,
         regularizer=kernel_regularizer)
@@ -186,13 +191,13 @@ def make_subseparable_kernel(
     # Separable convolution
     # TODO(rshin): Check initialization is as expected, as these are not 4D.
     depthwise_kernel = tf.get_variable(
-        'depthwise_kernel',
+        "depthwise_kernel",
         kernel_size + (input_channels,),
         initializer=kernel_initializer,
         regularizer=kernel_regularizer)
 
     pointwise_kernel = tf.get_variable(
-        'pointwise_kernel',
+        "pointwise_kernel",
         (input_channels, filters),
         initializer=kernel_initializer,
         regularizer=kernel_regularizer)
@@ -230,22 +235,16 @@ def multi_subseparable_conv(
     kernel_sizes,
     input_channels,
     separabilities,
-
     kernel_selection_weights=None,
     channel_selection_weights=None,
     separability_selection_weights=None,
-
-    kernel_selection_weights_params={},
-    channel_selection_weights_params={},
-    separability_selection_weights_params={},
-
+    kernel_selection_weights_params=None,
+    channel_selection_weights_params=None,
+    separability_selection_weights_params=None,
     kernel_initializer=None,
     kernel_regularizer=None,
-
     scope=None):
-  '''
-  Simultaneously compute different kinds of convolutions on
-  different subsets of the input.
+  """Simultaneously compute different kinds of convolutions on subsets of input.
 
   Args:
     inputs: 4D tensor containing the input, in NHWC format.
@@ -254,30 +253,38 @@ def multi_subseparable_conv(
       different kernel sizes to use.
     input_channels: A list of (begin, end) pairs of integers, which describe
       which channels in the input to use.
-
+    separabilities: An integer or a list, how separable are the convolutions.
     kernel_selection_weights: SelectionWeights object to use for choosing
       among kernel sizes.
     channel_selection_weights: SelectionWeights object to use for choosing
       among which input channels to use.
-
-    kernel_size_seletion_weights_params: dict with up to three keys
+    separability_selection_weights: SelectionWeights object to use for choosing
+      separability.
+    kernel_selection_weights_params: dict with up to three keys
       - initializer
       - regularizer
       - inv_t
-    channel_seletion_weights_params: dict with up to three keys
+    channel_selection_weights_params: dict with up to three keys
+      - initializer
+      - regularizer
+      - inv_t
+    separability_selection_weights_params: dict with up to three keys
       - initializer
       - regularizer
       - inv_t
-
     kernel_initializer: Initializer to use for kernels.
     kernel_regularizer: Regularizer to use for kernels.
+    scope: the scope to use.
 
   Returns:
     Result of convolution.
+  """
+  kernel_selection_weights_params = kernel_selection_weights_params or {}
+  channel_selection_weights_params = channel_selection_weights_params or {}
+  if separability_selection_weights_params is None:
+    separability_selection_weights_params = {}
 
-  '''
-
-  # Get input image size
+  # Get input image size.
   input_shape = inputs.get_shape().as_list()
   assert len(input_shape) == 4
   in_channels = input_shape[3]
@@ -285,41 +292,38 @@ def multi_subseparable_conv(
 
   max_kernel_size = tuple(np.max(kernel_sizes, axis=0))
   max_num_channels = np.max(input_channels) - np.min(input_channels)
-  # kernel height x kernel width x
-  # number of input channels x number of output channels
-  max_kernel_shape = max_kernel_size + (max_num_channels, filters)
 
-  with tf.variable_scope('selection_weights'):
+  with tf.variable_scope(scope or "selection_weights"):
     if kernel_selection_weights is None:
       kernel_selection_weights = create_selection_weights(
-          'kernels',
-          'softmax', (len(kernel_sizes),),
+          "kernels",
+          "softmax", (len(kernel_sizes),),
           names=[
-              'kernel_h{}_w{}'.format(h, w) for h, w in kernel_sizes
+              "kernel_h{}_w{}".format(h, w) for h, w in kernel_sizes
           ],
           **kernel_selection_weights_params)
 
     if channel_selection_weights is None:
       channel_selection_weights = create_selection_weights(
-          'channels',
-          'softmax', (len(input_channels),),
+          "channels",
+          "softmax", (len(input_channels),),
           names=[
-              'channels_{}_{}'.format(c1, c2) for c1, c2 in input_channels
+              "channels_{}_{}".format(c1, c2) for c1, c2 in input_channels
           ],
           **channel_selection_weights_params)
 
     if separability_selection_weights is None:
       separability_selection_weights = create_selection_weights(
-          'separability',
-          'softmax', (len(separabilities),),
+          "separability",
+          "softmax", (len(separabilities),),
           names=[
-              'separability_{}'.format(s) for s in separabilities
+              "separability_{}".format(s) for s in separabilities
           ],
           **separability_selection_weights_params)
 
   kernels = []
   for separability in separabilities:
-    with tf.variable_scope('separablity_{}'.format(separability)):
+    with tf.variable_scope("separablity_{}".format(separability)):
       kernel = make_subseparable_kernel(
           max_kernel_size,
           max_num_channels,
@@ -347,9 +351,9 @@ def multi_subseparable_conv(
       inputs,
       filter=kernel,
       strides=[1, 1, 1, 1],
-      padding='SAME',
-      data_format='NHWC',
-      name='conv2d')
+      padding="SAME",
+      data_format="NHWC",
+      name="conv2d")
 
 
 def conv_module(kw, kh, sep, div):
@@ -360,13 +364,14 @@ def convfn(x, hparams):
         name="conv_%d%d_sep%d_div%d" % (kw, kh, sep, div))
   return convfn
 
+
 def multi_conv_module(kernel_sizes, seps):
   def convfn(x, hparams):
     return multi_subseparable_conv(x, hparams.hidden_size, kernel_sizes,
                                    [(0, hparams.hidden_size)], seps)
-
   return convfn
 
+
 def layernorm_module(x, hparams):
   return common_layers.layer_norm(x, hparams.hidden_size, name="layer_norm")
 
@@ -404,12 +409,11 @@ def shakeshake_binary_module(x, y, hparams):
 def run_binary_modules(modules, cur1, cur2, hparams):
   """Run binary modules."""
   selection_weights = create_selection_weights(
-      'selection',
-      'softmax',
+      "selection",
+      "softmax",
       shape=[len(modules)],
       inv_t=100.0 * common_layers.inverse_exp_decay(
           hparams.anneal_until, min_value=0.01))
-
   all_res = [modules[n](cur1, cur2, hparams) for n in xrange(len(modules))]
   all_res = tf.concat([tf.expand_dims(r, axis=0) for r in all_res], axis=0)
   res = all_res * tf.reshape(selection_weights.normalized, [-1, 1, 1, 1, 1])
@@ -419,12 +423,11 @@ def run_binary_modules(modules, cur1, cur2, hparams):
 def run_unary_modules_basic(modules, cur, hparams):
   """Run unary modules."""
   selection_weights = create_selection_weights(
-      'selection',
-      'softmax',
+      "selection",
+      "softmax",
       shape=[len(modules)],
       inv_t=100.0 * common_layers.inverse_exp_decay(
           hparams.anneal_until, min_value=0.01))
-
   all_res = [modules[n](cur, hparams) for n in xrange(len(modules))]
   all_res = tf.concat([tf.expand_dims(r, axis=0) for r in all_res], axis=0)
   res = all_res * tf.reshape(selection_weights.normalized, [-1, 1, 1, 1, 1])
@@ -434,12 +437,11 @@ def run_unary_modules_basic(modules, cur, hparams):
 def run_unary_modules_sample(modules, cur, hparams, k):
   """Run modules, sampling k."""
   selection_weights = create_selection_weights(
-      'selection',
-      ('softmax_topk', k),
+      "selection",
+      ("softmax_topk", k),
       shape=[len(modules)],
       inv_t=100.0 * common_layers.inverse_exp_decay(
           hparams.anneal_until, min_value=0.01))
-
   all_res = [tf.cond(tf.less(selection_weights.normalized[n], 1e-6),
                      lambda: tf.zeros_like(cur),
                      lambda i=n: modules[i](cur, hparams))
@@ -468,12 +470,10 @@ class BlueNet(t2t_model.T2TModel):
 
   def model_fn_body(self, features):
     hparams = self._hparams
-    # TODO(rshin): Add back div.
     # TODO(rshin): Give identity_module lower weight by default.
-    conv_modules = [
-        multi_conv_module(
-            kernel_sizes=[(3, 3), (5, 5), (7, 7)], seps=[0, 1]), identity_module
-    ]
+    multi_conv = multi_conv_module(
+        kernel_sizes=[(3, 3), (5, 5), (7, 7)], seps=[0, 1])
+    conv_modules = [multi_conv, identity_module]
     activation_modules = [identity_module,
                           lambda x, _: tf.nn.relu(x),
                           lambda x, _: tf.nn.elu(x),
@@ -498,20 +498,24 @@ def run_unary(x, name):
           x.set_shape(x_shape)
       return tf.nn.dropout(x, 1.0 - hparams.dropout), batch_deviation(x)
 
-    cur1, cur2, extra_loss = inputs, inputs, 0.0
+    cur1, cur2, cur3, extra_loss = inputs, inputs, inputs, 0.0
     cur_shape = inputs.get_shape()
     for i in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % i):
         cur1, loss1 = run_unary(cur1, "unary1")
         cur2, loss2 = run_unary(cur2, "unary2")
-        extra_loss += (loss1 + loss2) / float(hparams.num_hidden_layers)
+        cur3, loss3 = run_unary(cur2, "unary3")
+        extra_loss += (loss1 + loss2 + loss3) / float(hparams.num_hidden_layers)
         with tf.variable_scope("binary1"):
           next1 = run_binary_modules(binary_modules, cur1, cur2, hparams)
           next1.set_shape(cur_shape)
         with tf.variable_scope("binary2"):
-          next2 = run_binary_modules(binary_modules, cur1, cur2, hparams)
+          next2 = run_binary_modules(binary_modules, cur1, cur3, hparams)
           next2.set_shape(cur_shape)
-        cur1, cur2 = next1, next2
+        with tf.variable_scope("binary3"):
+          next3 = run_binary_modules(binary_modules, cur2, cur3, hparams)
+          next3.set_shape(cur_shape)
+        cur1, cur2, cur3 = next1, next2, next3
 
     anneal = common_layers.inverse_exp_decay(hparams.anneal_until)
     extra_loss *= hparams.batch_deviation_loss_factor * anneal
@@ -525,7 +529,7 @@ def bluenet_base():
   hparams.batch_size = 4096
   hparams.hidden_size = 256
   hparams.dropout = 0.2
-  hparams.symbol_dropout = 0.2
+  hparams.symbol_dropout = 0.5
   hparams.label_smoothing = 0.1
   hparams.clip_grad_norm = 2.0
   hparams.num_hidden_layers = 8
@@ -543,7 +547,7 @@ def bluenet_base():
   hparams.optimizer_adam_beta2 = 0.997
   hparams.add_hparam("imagenet_use_2d", True)
   hparams.add_hparam("anneal_until", 40000)
-  hparams.add_hparam("batch_deviation_loss_factor", 0.001)
+  hparams.add_hparam("batch_deviation_loss_factor", 5.0)
   return hparams
 
 
diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py
index 7895acd04..7a6ce96fb 100644
--- a/tensor2tensor/models/common_layers.py
+++ b/tensor2tensor/models/common_layers.py
@@ -293,7 +293,6 @@ def conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs):
   static_shape = inputs.get_shape()
   if not static_shape or len(static_shape) != 4:
     raise ValueError("Inputs to conv must have statically known rank 4.")
-  #inputs.set_shape([static_shape[0], None, None, static_shape[3]])
   # Add support for left padding.
   if "padding" in kwargs and kwargs["padding"] == "LEFT":
     dilation_rate = (1, 1)
@@ -331,6 +330,7 @@ def conv2d_kernel(kernel_size_arg, name_suffix):
 
   return conv2d_kernel(kernel_size, "single")
 
+
 def conv(inputs, filters, kernel_size, **kwargs):
   return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs)
 
@@ -556,7 +556,7 @@ def pool(inputs, window_size, pooling_type, padding, strides=(1, 1)):
       inputs.set_shape([static_shape[0], None, None, static_shape[3]])
       padding = "VALID"
 
-    return tf.nn.pool(inputs, window_size, pooling_type, padding, strides=strides)
+  return tf.nn.pool(inputs, window_size, pooling_type, padding, strides=strides)
 
 
 def conv_block_downsample(x,
@@ -1352,7 +1352,8 @@ def padded_cross_entropy(logits,
   vocab_size = tf.shape(logits)[-1]
   with tf.name_scope("padded_cross_entropy", [logits, labels]):
     pad_logits, pad_labels = pad_with_zeros(logits, labels)
-    xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size, confidence)
+    xent = smoothing_cross_entropy(pad_logits, pad_labels,
+                                   vocab_size, confidence)
     weights = weights_fn(pad_labels)
     if not reduce_sum:
       return xent * weights, weights
diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py
index 13ef8dc4c..eb8b10cd2 100644
--- a/tensor2tensor/models/lstm.py
+++ b/tensor2tensor/models/lstm.py
@@ -12,68 +12,66 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Baseline models."""
+"""RNN LSTM models."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 # Dependency imports
 
-from tensor2tensor.models import common_layers
 from tensor2tensor.models import common_hparams
+from tensor2tensor.models import common_layers
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
 import tensorflow as tf
-from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.util import nest
 
-import collections
 
 # Track Tuple of state and attention values
-AttentionTuple = collections.namedtuple("AttentionTuple", ("state", "attention"))
+AttentionTuple = collections.namedtuple("AttentionTuple",
+                                        ("state", "attention"))
+
 
+class ExternalAttentionCellWrapper(tf.contrib.rnn.RNNCell):
+  """Wrapper for external attention states for an encoder-decoder setup."""
 
-class ExternalAttentionCellWrapper(rnn_cell_impl.RNNCell):
-  """
-    Wrapper for external attention states. To be used in an encoder-decoder setup
-  """
   def __init__(self, cell, attn_states, attn_vec_size=None,
-              input_size=None, state_is_tuple=True, reuse=None):
+               input_size=None, state_is_tuple=True, reuse=None):
     """Create a cell with attention.
-      Args:
-        cell: an RNNCell, an attention is added to it.
-        attn_states: External attention states typically the encoder output in the
-            form [batch_size, time steps, hidden size]
-        attn_vec_size: integer, the number of convolutional features calculated
-            on attention state and a size of the hidden layer built from
-            base cell state. Equal attn_size to by default.
-        input_size: integer, the size of a hidden linear layer,
-            built from inputs and attention. Derived from the input tensor
-            by default.
-        state_is_tuple: If True, accepted and returned states are n-tuples, where
-          `n = len(cells)`.  Must be set to True else will raise an exception
-          concatenated along the column axis.
-        reuse: (optional) Python boolean describing whether to reuse variables
-          in an existing scope.  If not `True`, and the existing scope already has
-          the given variables, an error is raised.
-      Raises:
-        TypeError: if cell is not an RNNCell.
-        ValueError: if the flag `state_is_tuple` is `False` or 
-        if shape of attn_states is not 3 or if innermost dimension (hidden size) is None.
+
+    Args:
+      cell: an RNNCell, an attention is added to it.
+      attn_states: External attention states typically the encoder output in the
+        form [batch_size, time steps, hidden size]
+      attn_vec_size: integer, the number of convolutional features calculated
+        on attention state and a size of the hidden layer built from
+        base cell state. Equal attn_size to by default.
+      input_size: integer, the size of a hidden linear layer,
+        built from inputs and attention. Derived from the input tensor
+        by default.
+      state_is_tuple: If True, accepted and returned states are n-tuples, where
+        `n = len(cells)`.  Must be set to True else will raise an exception
+        concatenated along the column axis.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+    Raises:
+      TypeError: if cell is not an RNNCell.
+      ValueError: if the flag `state_is_tuple` is `False` or if shape of
+        `attn_states` is not 3 or if innermost dimension (hidden size) is None.
     """
     super(ExternalAttentionCellWrapper, self).__init__(_reuse=reuse)
-    if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access
-      raise TypeError("The parameter cell is not RNNCell.")
-
     if not state_is_tuple:
       raise ValueError("Only tuple state is supported")
 
     self._cell = cell
     self._input_size = input_size
 
-    #Validate attn_states shape
+    # Validate attn_states shape.
     attn_shape = attn_states.get_shape()
     if not attn_shape or len(attn_shape) != 3:
       raise ValueError("attn_shape must be rank 3")
@@ -82,32 +80,32 @@ def __init__(self, cell, attn_states, attn_vec_size=None,
     self._attn_size = attn_shape[2].value
     if self._attn_size is None:
       raise ValueError("Hidden size of attn_states cannot be None")
-    
+
     self._attn_vec_size = attn_vec_size
     if self._attn_vec_size is None:
       self._attn_vec_size = self._attn_size
 
     self._reuse = reuse
-  
+
   @property
   def state_size(self):
     return AttentionTuple(self._cell.state_size, self._attn_size)
 
-
   @property
   def output_size(self):
     return self._attn_size
 
   def combine_state(self, previous_state):
-    """
-      Combines previous state (usually from an encoder) with the internal attention values
-      You must use this function to derive the initial state passed into this cell as it expects
-      a named tuple (AttentionTuple)
-      Args:
-        previous_state: State from another block that will be fed into this cell. Must have same 
-        structure as the state of the cell wrapped by this
-      Returns:
-        Combined state (AttentionTuple)
+    """Combines previous state (from encoder) with internal attention values.
+
+    You must use this function to derive the initial state passed into
+    this cell as it expects a named tuple (AttentionTuple).
+
+    Args:
+      previous_state: State from another block that will be fed into this cell;
+        Must have same structure as the state of the cell wrapped by this.
+    Returns:
+      Combined state (AttentionTuple).
     """
     batch_size = self._attn_states.get_shape()[0].value
     if batch_size is None:
@@ -118,28 +116,28 @@ def combine_state(self, previous_state):
   def call(self, inputs, state):
     """Long short-term memory cell with attention (LSTMA)."""
 
-    if(not isinstance(state, AttentionTuple)):
+    if not isinstance(state, AttentionTuple):
       raise TypeError("State must be of type AttentionTuple")
-    
+
     state, attns = state
-    attn_states = self._attn_states  
+    attn_states = self._attn_states
     attn_length = attn_states.get_shape()[1].value
     if attn_length is None:
       attn_length = tf.shape(attn_states)[1]
-    
 
     input_size = self._input_size
     if input_size is None:
       input_size = inputs.get_shape().as_list()[1]
-    if(attns is not None):
-      inputs = rnn_cell_impl._linear([inputs, attns], input_size, True)
+    if attns is not None:
+      inputs = tf.layers.dense(tf.concat([inputs, attns], axis=1), input_size)
     lstm_output, new_state = self._cell(inputs, state)
-    
+
     new_state_cat = tf.concat(nest.flatten(new_state), 1)
     new_attns = self._attention(new_state_cat, attn_states, attn_length)
-    
+
     with tf.variable_scope("attn_output_projection"):
-      output = rnn_cell_impl._linear([lstm_output, new_attns], self._attn_size, True)
+      output = tf.layers.dense(tf.concat([lstm_output, new_attns], axis=1),
+                               self._attn_size)
 
     new_state = AttentionTuple(new_state, new_attns)
 
@@ -156,9 +154,9 @@ def _attention(self, query, attn_states, attn_length):
           "attn_w", [1, 1, self._attn_size, self._attn_vec_size])
       v = tf.get_variable("attn_v", [self._attn_vec_size, 1])
       hidden = tf.reshape(attn_states,
-                                 [-1, attn_length, 1, self._attn_size])
+                          [-1, attn_length, 1, self._attn_size])
       hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME")
-      y = rnn_cell_impl._linear(query, self._attn_vec_size, True)
+      y = tf.layers.dense(query, self._attn_vec_size)
       y = tf.reshape(y, [-1, 1, 1, self._attn_vec_size])
       s = reduce_sum(v * tanh(hidden_features + y), [2, 3])
       a = softmax(s)
@@ -168,6 +166,7 @@ def _attention(self, query, attn_states, attn_length):
 
       return new_attns
 
+
 def lstm(inputs, hparams, train, name, initial_state=None):
   """Run LSTM cell on inputs, assuming they are [batch x time x size]."""
 
@@ -185,8 +184,10 @@ def dropout_lstm_cell():
         dtype=tf.float32,
         time_major=False)
 
-def lstm_attention_decoder(inputs, hparams, train, name, initial_state, attn_states):
-  """Run LSTM cell with attention on inputs, assuming they are [batch x time x size]."""
+
+def lstm_attention_decoder(inputs, hparams, train, name,
+                           initial_state, attn_states):
+  """Run LSTM cell with attention on inputs of shape [batch x time x size]."""
 
   def dropout_lstm_cell():
     return tf.contrib.rnn.DropoutWrapper(
@@ -194,8 +195,9 @@ def dropout_lstm_cell():
         input_keep_prob=1.0 - hparams.dropout * tf.to_float(train))
 
   layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)]
-  cell = ExternalAttentionCellWrapper(tf.nn.rnn_cell.MultiRNNCell(layers), attn_states, 
-         attn_vec_size=hparams.attn_vec_size)
+  cell = ExternalAttentionCellWrapper(tf.nn.rnn_cell.MultiRNNCell(layers),
+                                      attn_states,
+                                      attn_vec_size=hparams.attn_vec_size)
   initial_state = cell.combine_state(initial_state)
   with tf.variable_scope(name):
     return tf.nn.dynamic_rnn(
@@ -205,6 +207,7 @@ def dropout_lstm_cell():
         dtype=tf.float32,
         time_major=False)
 
+
 def lstm_seq2seq_internal(inputs, targets, hparams, train):
   """The basic LSTM seq2seq model, main step used for training."""
   with tf.variable_scope("lstm_seq2seq"):
@@ -223,6 +226,7 @@ def lstm_seq2seq_internal(inputs, targets, hparams, train):
         initial_state=final_encoder_state)
     return tf.expand_dims(decoder_outputs, axis=2)
 
+
 def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
   """LSTM seq2seq model with attention, main step used for training."""
   with tf.variable_scope("lstm_seq2seq_attention"):
@@ -241,6 +245,7 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
         final_encoder_state, encoder_outputs)
     return tf.expand_dims(decoder_outputs, axis=2)
 
+
 @registry.register_model("baseline_lstm_seq2seq")
 class LSTMSeq2Seq(t2t_model.T2TModel):
 
@@ -249,13 +254,15 @@ def model_fn_body(self, features):
     return lstm_seq2seq_internal(features["inputs"], features["targets"],
                                  self._hparams, train)
 
+
 @registry.register_model("baseline_lstm_seq2seq_attention")
 class LSTMSeq2SeqAttention(t2t_model.T2TModel):
 
   def model_fn_body(self, features):
     train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN
-    return lstm_seq2seq_internal_attention(features["inputs"], features["targets"],
-                                 self._hparams, train)
+    return lstm_seq2seq_internal_attention(
+        features["inputs"], features["targets"], self._hparams, train)
+
 
 @registry.register_hparams
 def lstm_attention():
@@ -267,4 +274,4 @@ def lstm_attention():
 
   # Attention
   hparams.add_hparam("attn_vec_size", hparams.hidden_size)
-  return hparams
\ No newline at end of file
+  return hparams
diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py
index a216a3832..4c4c42909 100644
--- a/tensor2tensor/models/lstm_test.py
+++ b/tensor2tensor/models/lstm_test.py
@@ -51,7 +51,7 @@ def testLSTMSeq2Seq(self):
       res = session.run(logits)
     self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
 
-  def testLSTMSeq2Seq_attention(self):
+  def testLSTMSeq2SeqAttention(self):
     vocab_size = 9
     x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
     y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
@@ -61,7 +61,7 @@ def testLSTMSeq2Seq_attention(self):
                                                      vocab_size)
     x = tf.constant(x, dtype=tf.int32)
     x._shape = tf.TensorShape([None, None, 1, 1])
-    
+
     with self.test_session() as session:
       features = {
           "inputs": x,
@@ -75,5 +75,6 @@ def testLSTMSeq2Seq_attention(self):
       res = session.run(logits)
     self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
 
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py
index 66a8491f2..60f098e5e 100644
--- a/tensor2tensor/models/multimodel.py
+++ b/tensor2tensor/models/multimodel.py
@@ -19,52 +19,66 @@
 
 # Dependency imports
 
+from tensor2tensor.models import common_attention
+from tensor2tensor.models import common_hparams
 from tensor2tensor.models import common_layers
 from tensor2tensor.models import modalities
 from tensor2tensor.models import slicenet
-from tensor2tensor.utils import expert_utils as eu
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
 import tensorflow as tf
 
 
-def experts(xs, moe_n1, moe_n2, hidden_size, filter_size, dp, ps, train):
-  """Mixture-of-Experts layer."""
-  # Set up the hyperparameters for the gating networks.
-  primary_gating_hp = eu.NoisyTopKGatingParams()
-  primary_gating_hp.num_experts = moe_n1
-  if moe_n2:
-    # Hierarchical MoE containing moe_n1 groups of moe_n2 experts.
-    assert moe_n2 > 1
-    secondary_gating_hp = eu.NoisyTopKGatingParams()
-    secondary_gating_hp.num_experts = moe_n2
-  else:
-    # Flat mixture of moe_n1 experts.
-    secondary_gating_hp = None
-  # Set up the hyperparameters for the expert networks.
-  # Each expert contains a hidden RELU layer of size filter_size
-  expert_hp = eu.FeedForwardExpertParams()
-  expert_hp.hidden_layer_sizes = [filter_size]
-  # Create the mixture of experts.
-  moe = eu.DistributedMixtureOfExperts(primary_gating_hp, secondary_gating_hp,
-                                       expert_hp, hidden_size, hidden_size, ps,
-                                       "moe")
-  # MoE expects input tensors to be 2d.  Flatten out spatial dimensions.
-  xs_2d = dp(tf.reshape, xs, [[-1, hidden_size]] * dp.n)
-  # Call the MoE
-  moe_out_2d, importance, load, _, _ = moe.Eval(
-      dp.devices, xs_2d, train, summaries=False, identifiers=None)
-  # Reshape the output to the original shape.
-  moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs))
-  # These losses encourage equal load on the different experts.
-  loss = eu.CVSquared(importance) + eu.CVSquared(load)
-
-  # Apply residual and normalize.
-  def add_and_normalize(x, y):
-    return common_layers.layer_norm(x + y, hidden_size, name="moe_norm")
-
-  return dp(add_and_normalize, xs, moe_out), loss
+def conv_res_step(x, hparams, padding, mask):
+  """One step of convolutions and mid-residual."""
+  k = (hparams.kernel_height, hparams.kernel_width)
+  k2 = (hparams.large_kernel_size, 1)
+  dilations_and_kernels1 = [((1, 1), k), ((1, 1), k)]
+  dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)]
+  with tf.variable_scope("conv_res_step"):
+    y = common_layers.subseparable_conv_block(
+        x, hparams.filter_size, dilations_and_kernels1,
+        padding=padding, mask=mask, separabilities=0, name="residual1")
+    y = tf.nn.dropout(y, 1.0 - hparams.dropout)
+    return common_layers.subseparable_conv_block(
+        y, hparams.hidden_size, dilations_and_kernels2,
+        padding=padding, mask=mask, separabilities=0, name="residual2")
+
+
+def residual_fn2(x, y, hparams):
+  y = tf.nn.dropout(y, 1.0 - hparams.dropout)
+  return common_layers.layer_norm(x + y)
+
+
+def residual_fn3(x, y, z, hparams):
+  y = tf.nn.dropout(y, 1.0 - hparams.dropout)
+  z = tf.nn.dropout(z, 1.0 - hparams.dropout)
+  return common_layers.layer_norm(x + y + z)
+
+
+def conv_experts(xs, hparams, dp, ps, padding, mask, layer_id):
+  """Convolutions + Mixture-of-Experts layer."""
+  del layer_id  # Unused.
+  train = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
+  conv_out = dp(conv_res_step, xs, hparams, padding, mask)
+  loss = 0.0
+  moe_out, loss = common_layers.moe_layer(
+      dp, ps, xs, train, hparams.hidden_size, hparams.filter_size,
+      hparams.moe_n1, hparams.moe_n2, 1.0)
+  return dp(residual_fn3, xs, moe_out, conv_out, hparams), loss
+
+
+def prepare_decoder(targets, target_space_emb):
+  """Prepare decoder."""
+  decoder_self_attention_bias = (
+      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  target_space_emb = tf.reshape(target_space_emb, [1, 1, -1])
+  target_space_emb = tf.tile(target_space_emb, [tf.shape(targets)[0], 1, 1])
+  decoder_input = common_layers.shift_left_3d(
+      targets, pad_value=target_space_emb)
+  decoder_input = common_attention.add_timing_signal_1d(decoder_input)
+  return (decoder_input, decoder_self_attention_bias)
 
 
 @registry.register_model
@@ -74,87 +88,119 @@ def model_fn_body_sharded(self, sharded_features):
     train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN
     dp = self._data_parallelism
     hparams = self._hparams
-    targets = sharded_features["targets"]
 
     def flatten(inputs):
       return tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
 
     inputs = dp(flatten, sharded_features["inputs"])
-
-    # Encode inputs.
-    def encode_half(inputs, inputs_mask, hparams):
-      # Add timing and encode.
-      inputs = common_layers.add_timing_signal(inputs)
-      return slicenet.multi_conv_res(inputs, "SAME", "encoder1",
-                                     hparams.num_hidden_layers // 2,
-                                     hparams, mask=inputs_mask)
-
-    target_space_emb = dp(slicenet.embed_target_space,
-                          sharded_features["target_space_id"],
-                          hparams.hidden_size)
     inputs_pad = dp(slicenet.embedding_to_padding, inputs)
     inputs_mask = dp(lambda x: 1.0 - x, inputs_pad)
-    inputs_encoded = dp(encode_half, inputs, inputs_mask, hparams)
-    with tf.variable_scope("experts_enc"):
-      inputs_encoded, expert_loss = experts(
-          inputs_encoded, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size,
-          hparams.hidden_size, dp, self._ps_devices, train)
-      expert_loss *= hparams.moe_loss_coef
-    inputs_encoded = dp(
-        slicenet.multi_conv_res, inputs_encoded, "SAME",
-        "encoder2", hparams.num_hidden_layers, hparams,
-        mask=inputs_mask)
+    inputs_encoded = dp(common_layers.add_timing_signal, inputs)
+    expert_loss = 0.0
+    for i in xrange(hparams.num_hidden_layers):
+      with tf.variable_scope("enc_layer_%d" % i):
+        inputs_encoded, moe_loss = conv_experts(
+            inputs_encoded, hparams, dp, self._ps_devices, "SAME",
+            inputs_mask, i)
+        expert_loss += tf.reduce_mean(moe_loss) * hparams.moe_loss_coef
 
     # If we're just predicing a class, there is no use for a decoder, return.
     if isinstance(hparams.problems[self._problem_idx].target_modality,
                   modalities.ClassLabelModality):
       return inputs_encoded, tf.reduce_mean(expert_loss)
 
-    # Do the middle part.
-    decoder_start, similarity_loss = dp(
-        slicenet.slicenet_middle, inputs_encoded, targets,
-        target_space_emb, inputs_mask, hparams)
-
-    # Decode.
-    decoder_half = dp(
-        slicenet.multi_conv_res,
-        decoder_start,
-        "LEFT",
-        "decoder1",
-        hparams.num_hidden_layers // 2,
-        hparams,
-        train,
-        mask=inputs_mask,
-        source=inputs_encoded)
-    with tf.variable_scope("experts_dec"):
-      decoder_half, expert_dec_loss = experts(
-          decoder_half, hparams.moe_n1, hparams.moe_n2, hparams.hidden_size,
-          hparams.hidden_size, dp, self._ps_devices, train)
-      expert_loss += expert_dec_loss * hparams.moe_loss_coef
-    decoder_final = dp(
-        slicenet.multi_conv_res,
-        decoder_half,
-        "LEFT",
-        "decoder2",
-        hparams.num_hidden_layers // 2,
-        hparams,
-        mask=inputs_mask,
-        source=inputs_encoded)
-
-    total_loss = tf.reduce_mean(expert_loss) + tf.reduce_mean(similarity_loss)
-    return decoder_final, total_loss
-
-
-@registry.register_hparams("multimodel_1p8")
-def multimodel_params1_p8():
-  """Version for eight problem runs."""
-  hparams = slicenet.slicenet_params1()
-  hparams.problem_choice = "distributed"
-  hparams.attention_type = "simple"  # TODO(lukaszkaiser): add transformer.
-  hparams.hidden_size = 1536
-  hparams.moe_n1 = 120
-  hparams.shared_embedding_and_softmax_weights = int(False)
+    # Decoder.
+    inputs3d = dp(tf.squeeze, inputs, 2)
+    inputs_encoded3d = dp(tf.squeeze, inputs_encoded, 2)
+    encoder_padding = dp(common_attention.embedding_to_padding, inputs3d)
+    encoder_attention_bias = dp(
+        common_attention.attention_bias_ignore_padding, encoder_padding)
+    targets = dp(common_layers.flatten4d3d, sharded_features["targets"])
+    target_space_emb = dp(slicenet.embed_target_space,
+                          sharded_features["target_space_id"],
+                          hparams.hidden_size)
+
+    (decoder_input, decoder_self_attention_bias) = dp(
+        prepare_decoder, targets, target_space_emb)
+
+    x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.dropout)
+    for layer in xrange(hparams.num_hidden_layers):
+      with tf.variable_scope("dec_layer_%d" % layer):
+        with tf.variable_scope("attention"):
+          y = dp(common_attention.multihead_attention,
+                 x,
+                 None,
+                 decoder_self_attention_bias,
+                 hparams.hidden_size,
+                 hparams.hidden_size,
+                 hparams.hidden_size,
+                 hparams.num_heads,
+                 hparams.attention_dropout,
+                 summaries=False,
+                 name="decoder_self_attention")
+          z = dp(common_attention.multihead_attention,
+                 y,
+                 inputs_encoded3d,
+                 encoder_attention_bias,
+                 hparams.hidden_size,
+                 hparams.hidden_size,
+                 hparams.hidden_size,
+                 hparams.num_heads,
+                 hparams.attention_dropout,
+                 summaries=False,
+                 name="encdec_attention")
+          x = dp(residual_fn3, x, y, z, hparams)
+        with tf.variable_scope("ffn"):
+          if str(layer) in hparams.moe_layers.split(","):
+            y, moe_loss = common_layers.moe_layer(
+                dp, self._ps_devices, x, train,
+                hparams.hidden_size, hparams.filter_size,
+                hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef)
+            expert_loss += tf.reduce_mean(moe_loss)
+          else:
+            y = dp(common_layers.conv_hidden_relu,
+                   x,
+                   hparams.filter_size,
+                   hparams.hidden_size,
+                   dropout=hparams.dropout)
+          x = dp(residual_fn2, x, y, hparams)
+
+    x = dp(tf.expand_dims, x, 2)
+    return x, tf.reduce_mean(expert_loss)
+
+
+@registry.register_hparams
+def multimodel_base():
+  """Base parameters for MultiModel."""
+  hparams = common_hparams.basic_params1()
+  hparams.hidden_size = 512
+  hparams.batch_size = 2048
+  hparams.num_hidden_layers = 4
+  hparams.learning_rate_decay_scheme = "noam"
+  hparams.learning_rate = 0.1
+  hparams.learning_rate_warmup_steps = 4000
+  hparams.initializer_gain = 1.0
   hparams.dropout = 0.1
-  hparams.attention_dropout = 0.1
-  hparams.learning_rate_decay_scheme = "exp500k"
+  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
+  hparams.add_hparam("large_kernel_size", 15)
+  hparams.add_hparam("attention_dropout", 0.1)
+  hparams.add_hparam("num_heads", 8)
+  hparams.add_hparam("moe_n1", 30)
+  hparams.add_hparam("moe_n2", 0)
+  hparams.add_hparam("moe_layers", "2")
+  hparams.add_hparam("moe_loss_coef", 1e-2)
+  hparams.add_hparam("imagenet_use_2d", int(True))
+  return hparams
+
+
+@registry.register_hparams
+def multimodel_tiny():
+  """Tiny parameters for MultiModel."""
+  hparams = multimodel_base()
+  hparams.hidden_size = 128
+  hparams.filter_size = 512
+  hparams.batch_size = 512
+  hparams.num_hidden_layers = 2
+  hparams.moe_n1 = 10
+  hparams.moe_layers = "0"
   return hparams
diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py
index 72fe4a326..dbbd3fa8e 100644
--- a/tensor2tensor/models/multimodel_test.py
+++ b/tensor2tensor/models/multimodel_test.py
@@ -24,7 +24,6 @@
 
 from tensor2tensor.data_generators import problem_hparams
 from tensor2tensor.models import multimodel
-from tensor2tensor.models import slicenet
 
 import tensorflow as tf
 
@@ -34,7 +33,7 @@ class MultiModelTest(tf.test.TestCase):
   def testMultiModel(self):
     x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3))
     y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1))
-    hparams = slicenet.slicenet_params1_tiny()
+    hparams = multimodel.multimodel_tiny()
     p_hparams = problem_hparams.image_cifar10(hparams)
     hparams.problems = [p_hparams]
     with self.test_session() as session:
diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py
index d09787ae4..7b0663cf8 100644
--- a/tensor2tensor/utils/data_reader.py
+++ b/tensor2tensor/utils/data_reader.py
@@ -130,6 +130,52 @@ def examples_queue(data_sources,
     }
 
 
+def preprocessing(examples, data_file_pattern, mode):
+  """Preprocessing of examples."""
+  if "image" in data_file_pattern:
+    # Small single-example pre-processing for images.
+    def resize(img, size):
+      return tf.to_int64(tf.image.resize_images(img, [size, size]))
+    def preprocess(img):
+      img = tf.image.resize_images(img, [360, 360])
+      img = common_layers.image_augmentation(tf.to_float(img) / 255.)
+      return tf.to_int64(img * 255.)
+    if ("image_imagenet" in data_file_pattern or
+        "image_mscoco" in data_file_pattern):
+      examples["inputs"] = tf.cast(examples["inputs"], tf.int64)
+      # For imagnet/coco, resize images to 299x299 as is standard.
+      inputs = examples["inputs"]
+      if mode == tf.contrib.learn.ModeKeys.TRAIN:
+        examples["inputs"] = tf.cond(  # Preprocess 80% of the time.
+            tf.less(tf.random_uniform([]), 0.8),
+            lambda img=inputs: preprocess(img),
+            lambda img=inputs: resize(img, 299))
+      else:
+        examples["inputs"] = tf.to_int64(resize(inputs, 299))
+    elif ("image_cifar10" in data_file_pattern
+          and mode == tf.contrib.learn.ModeKeys.TRAIN):
+      examples["inputs"] = common_layers.cifar_image_augmentation(
+          examples["inputs"])
+    elif "img2img" in data_file_pattern:
+      inputs = examples["inputs"]
+      examples["inputs"] = resize(inputs, 16)
+      examples["targets"] = resize(inputs, 64)
+
+  elif "audio" in data_file_pattern:
+    # Reshape audio to proper shape
+    sample_count = tf.to_int32(examples.pop("audio/sample_count"))
+    sample_width = tf.to_int32(examples.pop("audio/sample_width"))
+    channel_count = 1
+    examples["inputs"] = tf.reshape(examples["inputs"],
+                                    [sample_count, sample_width, channel_count])
+    if "wsj" in data_file_pattern:
+      examples["inputs"] = tf.bitcast(examples["inputs"], tf.int32)
+  elif "a2q_20161229" in data_file_pattern:
+    # we forgot the EOS when we preprocessed this data.
+    examples["targets"] = tf.concat([examples["targets"], [1]], 0)
+  return examples
+
+
 def input_pipeline(data_file_pattern, capacity, mode):
   """Input pipeline, returns a dictionary of tensors from queues."""
   # Read from image TFRecords if the file has "image" in its name.
@@ -181,44 +227,7 @@ def input_pipeline(data_file_pattern, capacity, mode):
       capacity=capacity,
       data_items_to_decoders=data_items_to_decoders)
 
-  if "image" in data_file_pattern:
-    # Small single-example pre-processing for images.
-    examples["inputs"] = tf.cast(examples["inputs"], tf.int64)
-    if ("image_imagenet" in data_file_pattern or
-        "image_mscoco" in data_file_pattern):
-      # For imagnet/coco, resize images to 299x299 as is standard.
-      def resize(img):
-        return tf.to_int64(tf.image.resize_images(img, [299, 299]))
-
-      def preprocess(img):
-        img = tf.image.resize_images(img, [360, 360])
-        img = common_layers.image_augmentation(tf.to_float(img) / 255.)
-        return tf.to_int64(img * 255.)
-
-      inputs = examples["inputs"]
-      if mode == tf.contrib.learn.ModeKeys.TRAIN:
-        examples["inputs"] = tf.cond(  # Preprocess 80% of the time.
-            tf.less(tf.random_uniform([]), 0.8),
-            lambda img=inputs: preprocess(img),
-            lambda img=inputs: resize(img))
-      else:
-        examples["inputs"] = tf.to_int64(resize(inputs))
-    elif ("image_cifar10" in data_file_pattern
-          and mode == tf.contrib.learn.ModeKeys.TRAIN):
-      examples["inputs"] = common_layers.cifar_image_augmentation(
-          examples["inputs"])
-  elif "audio" in data_file_pattern:
-    # Reshape audio to proper shape
-    sample_count = tf.to_int32(examples.pop("audio/sample_count"))
-    sample_width = tf.to_int32(examples.pop("audio/sample_width"))
-    channel_count = 1
-    examples["inputs"] = tf.reshape(examples["inputs"],
-                                    [sample_count, sample_width, channel_count])
-    if "wsj" in data_file_pattern:
-      examples["inputs"] = tf.bitcast(examples["inputs"], tf.int32)
-  elif "a2q_20161229" in data_file_pattern:
-    # we forgot the EOS when we preprocessed this data.
-    examples["targets"] = tf.concat([examples["targets"], [1]], 0)
+  examples = preprocessing(examples, data_file_pattern, mode)
 
   # We do not want int64s as they do are not supported on GPUs.
   return {k: tf.to_int32(v) for (k, v) in six.iteritems(examples)}
diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py
index eca6143c7..97da4cd35 100644
--- a/tensor2tensor/utils/metrics.py
+++ b/tensor2tensor/utils/metrics.py
@@ -37,7 +37,8 @@ def padded_accuracy_topk(predictions,
                          weights_fn=common_layers.weights_nonzero):
   """Percentage of times that top-k predictions matches labels on non-0s."""
   with tf.variable_scope("padded_accuracy_topk", values=[predictions, labels]):
-    padded_predictions, padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    padded_predictions, padded_labels = common_layers.pad_with_zeros(
+        predictions, labels)
     weights = weights_fn(padded_labels)
     effective_k = tf.minimum(k, tf.shape(padded_predictions)[-1])
     _, outputs = tf.nn.top_k(padded_predictions, k=effective_k)
@@ -61,9 +62,10 @@ def padded_sequence_accuracy(predictions,
   """Percentage of times that predictions matches labels everywhere (non-0)."""
   with tf.variable_scope(
       "padded_sequence_accuracy", values=[predictions, labels]):
-    paded_predictions, padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    padded_predictions, padded_labels = common_layers.pad_with_zeros(
+        predictions, labels)
     weights = weights_fn(padded_labels)
-    outputs = tf.to_int32(tf.argmax(paded_predictions, axis=-1))
+    outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1))
     not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
     axis = list(range(1, len(outputs.get_shape())))
     correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
@@ -84,7 +86,8 @@ def padded_accuracy(predictions,
                     weights_fn=common_layers.weights_nonzero):
   """Percentage of times that predictions matches labels on non-0s."""
   with tf.variable_scope("padded_accuracy", values=[predictions, labels]):
-    padded_predictions, padded_labels = common_layers.pad_with_zeros(predictions, labels)
+    padded_predictions, padded_labels = common_layers.pad_with_zeros(
+        predictions, labels)
     weights = weights_fn(padded_labels)
     outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1))
     return tf.to_float(tf.equal(outputs, padded_labels)), weights
@@ -119,8 +122,9 @@ def fn(predictions, labels, weights, idx, weights_fn):
 
     for i, problem in enumerate(problems):
       name = "metrics-%s/%s" % (problem, metric_name)
-      weights_fn = (common_layers.weights_concatenated
-                    if "concat" in problem else common_layers.weights_nonzero)
+      class_output = "image" in problem and "coco" not in problem
+      weights_fn = (common_layers.weights_all if class_output
+                    else common_layers.weights_nonzero)
       eval_metrics[name] = functools.partial(fn, idx=i, weights_fn=weights_fn)
 
     def global_fn(predictions, labels, weights):