diff --git a/setup.py b/setup.py index ff1503990..dd80dfd48 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.8', + version='1.1.9', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 30784fa60..19de46fbf 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -66,6 +66,8 @@ flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for " "registered Problems.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") +flags.DEFINE_bool("only_list", False, + "If true, we only list the problems that will be generated.") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") flags.DEFINE_integer("task_id", -1, "For distributed data generation.") flags.DEFINE_string("t2t_usr_dir", "", @@ -81,33 +83,33 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "wmt_parsing_tokens_8k": ( + "parsing_english_ptb8k": ( lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13), lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 2**13)), - "wsj_parsing_tokens_16k": ( + "parsing_english_ptb16k": ( lambda: wsj_parsing.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**14, 2**9), lambda: wsj_parsing.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 2**14, 2**9)), - "wmt_ende_bpe32k": ( + "translate_ende_wmt_bpe32k": ( lambda: wmt.ende_bpe_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True), lambda: wmt.ende_bpe_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False)), - "lm1b_32k": ( + "languagemodel_1b32k": ( lambda: lm1b.generator(FLAGS.tmp_dir, True), lambda: lm1b.generator(FLAGS.tmp_dir, False) ), - "lm1b_characters": ( + "languagemodel_1b_characters": ( lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True), lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True) ), "image_celeba_tune": ( lambda: image.celeba_generator(FLAGS.tmp_dir, 162770), lambda: image.celeba_generator(FLAGS.tmp_dir, 19867, 162770)), - "snli_32k": ( + "inference_snli32k": ( lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), ), @@ -181,7 +183,11 @@ def main(_): "Data will be written to default data_dir=%s.", FLAGS.data_dir) - tf.logging.info("Generating problems:\n * %s\n" % "\n * ".join(problems)) + tf.logging.info("Generating problems:\n%s" + % registry.display_list_by_prefix(problems, + starting_spaces=4)) + if FLAGS.only_list: + return for problem in problems: set_random_seed() @@ -210,7 +216,7 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): - tf.logging.info("Generating training data for %s.", problem_name) + tf.logging.info("Generating data for %s.", problem_name) if FLAGS.num_shards: raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) diff --git a/tensor2tensor/data_generators/cipher.py b/tensor2tensor/data_generators/cipher.py index a11776b84..977174880 100644 --- a/tensor2tensor/data_generators/cipher.py +++ b/tensor2tensor/data_generators/cipher.py @@ -29,7 +29,7 @@ @registry.register_problem -class CipherShift5(algorithmic.AlgorithmicProblem): +class 
AlgorithmicCipherShift5(algorithmic.AlgorithmicProblem): """Shift cipher.""" @property @@ -62,7 +62,7 @@ def dev_length(self): @registry.register_problem -class CipherVigenere5(algorithmic.AlgorithmicProblem): +class AlgorithmicCipherVigenere5(algorithmic.AlgorithmicProblem): """Vinegre cipher.""" @property @@ -95,7 +95,7 @@ def dev_length(self): @registry.register_problem -class CipherShift200(CipherShift5): +class AlgorithmicCipherShift200(AlgorithmicCipherShift5): """Shift cipher.""" @property @@ -110,7 +110,7 @@ def distribution(self): @registry.register_problem -class CipherVigenere200(CipherVigenere5): +class AlgorithmicCipherVigenere200(AlgorithmicCipherVigenere5): """Vinegre cipher.""" @property diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 438c116c8..1e26b000c 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -209,8 +209,8 @@ def generator_target(): } -@registry.register_problem("desc2code_py") -class Desc2CodePyProblem(Desc2CodeProblem): +@registry.register_problem +class ProgrammingDesc2codePy(Desc2CodeProblem): """Description2Code for python problem.""" @property @@ -222,8 +222,8 @@ def preprocess_target(self, target): return target.replace("\t", " ") -@registry.register_problem("desc2code_cpp") -class Desc2CodeCppProblem(Desc2CodeProblem): +@registry.register_problem +class ProgrammingDesc2codeCpp(Desc2CodeProblem): """Description2Code for C++ problem.""" @property diff --git a/tensor2tensor/data_generators/desc2code_test.py b/tensor2tensor/data_generators/desc2code_test.py index 24b7568d0..79992296b 100644 --- a/tensor2tensor/data_generators/desc2code_test.py +++ b/tensor2tensor/data_generators/desc2code_test.py @@ -47,7 +47,7 @@ class Desc2codeTest(tf.test.TestCase): def testCppPreprocess(self): """Check that the file correctly preprocess the code source.""" - cpp_pb = desc2code.Desc2CodeCppProblem() + cpp_pb = desc2code.ProgrammingDesc2codeCpp() self.assertEqual( # Add space beween two lines cpp_pb.preprocess_target("firstline//comm1\nsecondline//comm2\n"), diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 341a20c71..0607aad15 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -176,8 +176,8 @@ def eval_metrics(self): return [metrics.Metrics.LOG_POISSON, metrics.Metrics.R2] -@registry.register_problem("gene_expression_cage10") -class GeneExpressionCAGE10(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionCage10(GeneExpressionProblem): @property def download_url(self): @@ -188,8 +188,8 @@ def h5_file(self): return "cage10.h5" -@registry.register_problem("gene_expression_gm12878") -class GeneExpressionGM12878(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionGm12878(GeneExpressionProblem): @property def download_url(self): @@ -200,8 +200,8 @@ def h5_file(self): return "gm12878.h5" -@registry.register_problem("gene_expression_l262k") -class GeneExpressionL262k(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionL262k(GeneExpressionProblem): @property def h5_file(self): diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index 591b205da..4fb0424bb 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -62,8 +62,8 @@ def 
tabbed_parsing_character_generator(tmp_dir, train): return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): +@registry.register_problem +class ParsingIcelandic16k(problem.Problem): """Problem spec for parsing tokenized Icelandic text to constituency trees.""" @property diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index f61f85b54..d9a6be6ff 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -214,10 +214,21 @@ def dataset_filename(self): def is_small(self): return True # Modalities like for CIFAR. - def preprocess_examples(self, examples, mode): - examples = imagenet_preprocess_examples(examples, mode) - examples["inputs"] = tf.to_int64( - tf.image.resize_images(examples["inputs"], [32, 32])) + @property + def num_classes(self): + return 1000 + + def preprocess_examples(self, examples, mode, hparams): + # Just resize with area. + if self._was_reversed: + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32], + tf.image.ResizeMethod.AREA)) + else: + examples = imagenet_preprocess_examples(examples, mode) + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32])) + return examples def image_generator(images, labels): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 7a84aac93..60b1e842b 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -98,7 +98,7 @@ def preprocess_examples_common(examples, hparams): examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] if hparams.max_target_seq_length > 0: examples["targets"] = examples["targets"][:hparams.max_target_seq_length] - if hparams.prepend_inputs_to_targets: + if hparams.prepend_mode != "none": examples["targets"] = tf.concat( [examples["inputs"], [0], examples["targets"]], 0) return examples @@ -410,11 +410,12 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_files( self.generator(data_dir, tmp_dir, True), all_paths) generator_utils.shuffle_dataset(all_paths) - generator_utils.generate_dataset_and_shuffle( - self.generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, self.num_shards, shuffled=False), - self.generator(data_dir, tmp_dir, False), - self.dev_filepaths(data_dir, self.num_dev_shards, shuffled=False)) + else: + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.num_dev_shards, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index b33438d6d..4a6053613 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -492,16 +492,16 @@ def image_celeba(unused_model_hparams): lambda p: audio_wsj_tokens(p, 2**13), "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), - "lm1b_characters": + "languagemodel_1b_characters": lm1b_characters, - "lm1b_32k": + "languagemodel_1b32k": lm1b_32k, - "wmt_parsing_tokens_8k": + "parsing_english_ptb8k": lambda p: wmt_parsing_tokens(p, 2**13), - "wsj_parsing_tokens_16k": 
+ "parsing_english_ptb16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), - "wmt_ende_bpe32k": + "translate_ende_wmt_bpe32k": wmt_ende_bpe32k, "image_celeba_tune": image_celeba, diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index b9014bcd6..893c2b77c 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -157,8 +157,8 @@ def _generator(self, filename, encoder): yield {"inputs": [0], "targets": tok} -@registry.register_problem("lm_ptb_10k") -class LmPtb10k(PTBProblem): +@registry.register_problem +class LanguagemodelPtb10k(PTBProblem): """A class for generating PTB data, 10k vocab.""" @property @@ -167,7 +167,7 @@ def is_character_level(self): @registry.register_problem -class LmPtbCharacters(PTBProblem): +class LanguagemodelPtbCharacters(PTBProblem): """A class for generating PTB data, character-level.""" @property diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 1e427dbe8..3cdbac5db 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -81,8 +81,8 @@ def _page_title(page): @registry.register_problem -class Wiki32k(problem.Text2TextProblem): - """A class for generating PTB data.""" +class LanguagemodelWikiFull32k(problem.Text2TextProblem): + """A language model on full English Wikipedia.""" @property def is_character_level(self): @@ -129,3 +129,12 @@ def generator(self, data_dir, tmp_dir, _): encoded = encoder.encode(page) + [EOS] encoded_title = encoder.encode(title) + [EOS] yield {"inputs": encoded_title, "targets": encoded} + + +@registry.register_problem +class LanguagemodelWikiFull8k(problem.Text2TextProblem): + """A language model on full English Wikipedia.""" + + @property + def targeted_vocab_size(self): + return 2**13 # 8192 diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 52990eb5f..93fc27ac5 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Data generators for WMT data-sets.""" +"""Data generators for translation data-sets.""" from __future__ import absolute_import from __future__ import division @@ -39,8 +39,8 @@ EOS = text_encoder.EOS_ID -class WMTProblem(problem.Text2TextProblem): - """Base class for WMT problems.""" +class TranslateProblem(problem.Text2TextProblem): + """Base class for translation problems.""" @property def is_character_level(self): @@ -381,8 +381,8 @@ def _compile_data(tmp_dir, datasets, filename): return filename -@registry.register_problem("wmt_ende_tokens_8k") -class WMTEnDeTokens8k(WMTProblem): +@registry.register_problem +class TranslateEndeWmt8k(TranslateProblem): """Problem spec for WMT En-De translation.""" @property @@ -407,16 +407,16 @@ def target_space_id(self): return problem.SpaceID.DE_TOK -@registry.register_problem("wmt_ende_tokens_32k") -class WMTEnDeTokens32k(WMTEnDeTokens8k): +@registry.register_problem +class TranslateEndeWmt32k(TranslateEndeWmt8k): @property def targeted_vocab_size(self): return 2**15 # 32768 -@registry.register_problem("wmt_ende_characters") -class WMTEnDeCharacters(WMTProblem): +@registry.register_problem +class TranslateEndeWmtCharacters(TranslateProblem): """Problem spec for WMT En-De translation.""" @property @@ -440,8 +440,8 @@ def target_space_id(self): return problem.SpaceID.DE_CHR -@registry.register_problem("wmt_zhen_tokens_8k") -class WMTZhEnTokens8k(WMTProblem): +@registry.register_problem +class TranslateEnzhWmt8k(TranslateProblem): """Problem spec for WMT Zh-En translation.""" @property @@ -466,7 +466,10 @@ def generator(self, data_dir, tmp_dir, train): target_vocab_size, target_datasets) tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + # We generate English->X data by convention, to train reverse translation + # just add the "_rev" suffix to the problem name, e.g., like this. 
+ # --problems=translate_enzh_wmt8k_rev + return bi_vocabs_token_generator(data_path + ".lang2", data_path + ".lang1", source_vocab, target_vocab, EOS) @property @@ -491,8 +494,8 @@ def feature_encoders(self, data_dir): } -@registry.register_problem("wmt_enfr_tokens_8k") -class WMTEnFrTokens8k(WMTProblem): +@registry.register_problem +class TranslateEnfrWmt8k(TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property @@ -517,16 +520,16 @@ def target_space_id(self): return problem.SpaceID.FR_TOK -@registry.register_problem("wmt_enfr_tokens_32k") -class WMTEnFrTokens32k(WMTEnFrTokens8k): +@registry.register_problem +class TranslateEnfrWmt32k(TranslateEnfrWmt8k): @property def targeted_vocab_size(self): return 2**15 # 32768 -@registry.register_problem("wmt_enfr_characters") -class WMTEnFrCharacters(WMTProblem): +@registry.register_problem +class TranslateEnfrWmtCharacters(TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property @@ -550,8 +553,8 @@ def target_space_id(self): return problem.SpaceID.FR_CHR -@registry.register_problem("setimes_mken_tokens_32k") -class SETimesMkEnTokens32k(WMTProblem): +@registry.register_problem +class TranslateEnmkSetimes32k(TranslateProblem): """Problem spec for SETimes Mk-En translation.""" @property @@ -571,7 +574,10 @@ def generator(self, data_dir, tmp_dir, train): source_datasets + target_datasets) tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", + # We generate English->X data by convention, to train reverse translation + # just add the "_rev" suffix to the problem name, e.g., like this. + # --problems=translate_enmk_setimes32k_rev + return token_generator(data_path + ".lang2", data_path + ".lang1", symbolizer_vocab, EOS) @property @@ -583,8 +589,8 @@ def target_space_id(self): return problem.SpaceID.EN_TOK -@registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(WMTProblem): +@registry.register_problem +class TranslateEncsWmt32k(TranslateProblem): """Problem spec for WMT English-Czech translation.""" @property @@ -616,8 +622,8 @@ def target_space_id(self): return problem.SpaceID.CS_TOK -@registry.register_problem("wmt_encs_characters") -class WMTEnCsCharacters(WMTProblem): +@registry.register_problem +class TranslateEncsWmtCharacters(TranslateProblem): """Problem spec for WMT En-Cs character-based translation.""" @property diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 4f1273163..2c3e4b71f 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -17,12 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from functools import partial import math # Dependency imports from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils import tensorflow as tf @@ -206,6 +208,39 @@ def attention_bias_ignore_padding(memory_padding): return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1) +def attention_bias_prepend_inputs_full_attention(padding): + """Create a bias tensor for prepend_mode="prepend_inputs_full_attention". + + See prepend_inputs in common_hparams.py. + + Produces a bias tensor to be used in self-attention. + + This bias tensor allows for full connectivity in the "inputs" part of + the sequence and masked connectivity in the targets part. 
+ + Args: + padding: a float `Tensor` with shape [batch, length] with + ones in positions corresponding to padding. In each row, a single + padding position separates the input part from the target part. + + Returns: + a `Tensor` with shape [batch, 1, length, length]. + """ + # Everything past the first padding position is part of the target. + # This Tensor has zeros for the source portion and separator, + # and ones for the target portion. + in_target = tf.cumsum(padding, axis=1, exclusive=True) + # The position within the target, or 0 if part of the source. + target_pos = tf.cumsum(in_target, axis=1) + # A position with a lesser target_pos cannot see a position with greater + # target_pos. + illegal_connections = tf.greater(tf.expand_dims(target_pos, 1), + tf.expand_dims(target_pos, 2)) + bias = tf.to_float(illegal_connections) * -1e9 + bias = tf.expand_dims(bias, 1) + return bias + + def attention_bias_proximal(length): """Bias for self-attention to encourage attention to close positions. @@ -646,6 +681,70 @@ def gather_blocks(x, indices): return tf.reshape(output, v_shape) +def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth, q_filter_width=1, kv_filter_width=1, + q_padding="VALID", kv_padding="VALID"): + """Computes query, key and value. + + Args: + query_antecedent: a Tensor with shape [batch, length_q, channels] + memory_antecedent: a Tensor with shape [batch, length_m, channels] + total_key_depth: an integer + total_value_depth: and integer + q_filter_width: An integer specifying how wide you want the query to be. + kv_filter_width: An integer specifying how wide you want the keys and values + to be. + q_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + kv_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + + Returns: + q, k, v : [batch, length, depth] tensors + """ + if memory_antecedent is None and q_filter_width == kv_filter_width == 1: + # self attention with single position q, k, and v + combined = common_layers.conv1d( + query_antecedent, + total_key_depth * 2 + total_value_depth, + 1, + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=2) + return q, k, v + + if memory_antecedent is None: + # self attention + q = common_layers.conv1d( + query_antecedent, + total_key_depth, + q_filter_width, + padding=q_padding, + name="q_transform") + kv_combined = common_layers.conv1d( + query_antecedent, + total_key_depth + total_value_depth, + kv_filter_width, + padding=kv_padding, + name="kv_transform") + k, v = tf.split(kv_combined, [total_key_depth, total_value_depth], + axis=2) + return q, k, v + + # encoder-decoder attention + q = common_layers.conv1d( + query_antecedent, total_key_depth, q_filter_width, padding=q_padding, + name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + padding=kv_padding, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + + return q, k, v + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -658,6 +757,10 @@ def multihead_attention(query_antecedent, attention_type="dot_product", block_length=128, block_width=128, + q_filter_width=1, + kv_filter_width=1, + q_padding="VALID", + kv_padding="VALID", name=None): """Multihead scaled-dot-product attention with input/output transformations. 
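A small NumPy re-derivation (illustration only, not part of the diff) of the cumsum trick used by attention_bias_prepend_inputs_full_attention above:

    import numpy as np

    # One toy row: three input positions, one separator (the single padding 1),
    # then two target positions.
    padding = np.array([[0., 0., 0., 1., 0., 0.]])      # [batch=1, length=6]
    in_target = np.cumsum(padding, axis=1) - padding    # exclusive cumsum
    # -> [[0, 0, 0, 0, 1, 1]]  1 marks positions in the targets portion
    target_pos = np.cumsum(in_target, axis=1)
    # -> [[0, 0, 0, 0, 1, 2]]  position within the targets, 0 for inputs/separator
    # illegal[b, query, key] is True when the key is further along in the targets
    # than the query, matching tf.greater(expand_dims(..., 1), expand_dims(..., 2)).
    illegal = target_pos[:, None, :] > target_pos[:, :, None]
    bias = illegal.astype(np.float32) * -1e9             # then expanded to [b, 1, len, len]
    # The inputs portion (target_pos == 0) is fully connected to itself, while
    # each target position only sees keys at or before its own target position.
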
@@ -676,6 +779,12 @@ def multihead_attention(query_antecedent, "local_unmasked" block_length: an integer - relevant for "local_mask_right" block_width: an integer - relevant for "local_unmasked" + q_filter_width: An integer specifying how wide you want the query to be. + kv_filter_width: An integer specifying how wide you want the keys and values + to be. + q_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + kv_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + name: an optional string Returns: @@ -691,30 +800,14 @@ def multihead_attention(query_antecedent, if total_value_depth % num_heads != 0: raise ValueError("Value depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_value_depth, num_heads)) - with tf.variable_scope( name, default_name="multihead_attention", values=[query_antecedent, memory_antecedent]): - if memory_antecedent is None: - # self attention - combined = common_layers.conv1d( - query_antecedent, - total_key_depth * 2 + total_value_depth, - 1, - name="qkv_transform") - q, k, v = tf.split( - combined, [total_key_depth, total_key_depth, total_value_depth], - axis=2) - else: - q = common_layers.conv1d( - query_antecedent, total_key_depth, 1, name="q_transform") - combined = common_layers.conv1d( - memory_antecedent, - total_key_depth + total_value_depth, - 1, - name="kv_transform") - k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + q, k, v = compute_qkv(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth, q_filter_width, kv_filter_width, + q_padding, kv_padding) + q = split_heads(q, num_heads) k = split_heads(k, num_heads) v = split_heads(v, num_heads) @@ -861,5 +954,106 @@ def parameter_attention(x, y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") - return y + + +def coordinate_tensor(shape, axis): + """Return a tensor with given shape containing coordinte along given axis. + + Args: + shape: a Tensor representing the shape of the output Tensor + axis: an integer + + Returns: + A tensor with shape shape and type tf.int32, where each elements its + coordinate along the given axis. + """ + + r = tf.range(shape[axis]) + r_shape = tf.one_hot( + axis, tf.size(shape), on_value=-1, off_value=1, dtype=tf.int32) + return tf.zeros(shape, dtype=tf.int32) + tf.reshape(r, r_shape) + + +def self_attention_expert(x, batch_coordinate, mask_right=True): + """Implementing attention that runs inside each expert. + + Args: + x: A tensor of shape[batch, depth]. Contains representations from + different positions, which are lexicographically ordered. + batch_coordinate: A tensor of shape [batch, 1] containing the batch + coordinate of each element in x. This is needed to make sure that + positions from different sequences don't attend to each other. + mask_right: A bool. If true, we will not attend to positions on the right, + just as decoder self attention. + + Returns: + out: A tensor of shape [batch, depth]. + example use: + expert_utils.local_moe( + ... 
+ expert_fn=functools.partial(self_attention_expert, mask_right=) + ) + """ + depth = x.get_shape().as_list()[-1] + length = tf.shape(batch_coordinate)[0] + batch_coordinate = tf.squeeze(batch_coordinate, 1) + bias = tf.to_float( + tf.not_equal(tf.expand_dims(batch_coordinate, 1), + tf.expand_dims(batch_coordinate, 0))) * -1e9 + if mask_right: + bias += tf.reshape( + attention_bias_lower_triangle(length), [length, length]) + # bias has shape [length, length] + bias = tf.reshape(bias, [1, 1, length, length]) + x = tf.reshape(x, [1, length, depth]) + out = multihead_attention(x, + None, + bias, + total_key_depth=depth, + total_value_depth=depth, + output_depth=depth, + num_heads=1, + dropout_rate=0.0) + out = tf.squeeze(out, 0) + return out + +# functools.partial(self_attention_expert, mask_right=, depth=) + + +def local_expert_attention(x, k, loss_coef, attention_num_experts, train=True, + mask_right=True): + """Attention using a mixture of experts. + + Positions sent to the same expert can attend to each other. + The mixture of experts is "local" in that it is replicated on each + datashard. + + Args: + x: a Tensor with shape [batch, length, depth] + k: The number of experts to dispatch each example to + loss_coef: a scalar. A multiplier for the expert loss + attention_num_experts: The number of experts to use + train: a boolean for the current mode + mask_right: A boolean. If true, we will mask out positions to the right + for self-attention. + + Returns: + y: a Tensor with shape [batch, length, depth] + loss: a Scalar + """ + with tf.variable_scope("local_expert_attention"): + additional_dispatch_params = { + "batch_coordinate": tf.expand_dims( + coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) + } + return expert_utils.local_moe( + x, + train, + partial(self_attention_expert, mask_right=mask_right), + attention_num_experts, + k=k, + loss_coef=loss_coef, + pass_x=True, + pass_gates=False, + additional_dispatch_params=additional_dispatch_params) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 0ed62685f..6bb4d3e9d 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -124,11 +124,24 @@ def basic_params1(): # You can change this behavior by overridding preprocess_examples() method # in your problem class. max_target_seq_length=0, - # Treat a seq-to-seq problem as a language model by prepending the - # inputs to the targets. During training, the loss is on both the - # inputs and the targets. During eval, metrics are computed only on the - # target portion. - prepend_inputs_to_targets=int(False), + # This flag allows us to optionally treat a seq-to-seq problem + # as a language model. Legal values are: + # + # "none" - Do not prepend the inputs to the targets. + # "prepend_inputs_masked_attention" + # replace "targets" in preprocessing with + # tf.concat([inputs, [0], targets], axis=1) + # i.e. we prepend the inputs to the targets with a single + # padding token in between. Use masked self-attention on the + # entire resulting sequence. During training, we compute losses on + # the combined sequence. During eval, we compute the metrics + # on only the targets portion. + # "prepend_inputs_full_attention" + # similar to the previous option except that each + # position in the inputs portion can see the + # entire inputs portion. This removes the challenge of + # autoregressively predicting the inputs portion. 
+ prepend_mode="none", ) diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index e9b195195..8621ddcb1 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1361,10 +1361,11 @@ def weights_nonzero(labels): return tf.to_float(tf.not_equal(labels, 0)) -def weights_second_part(labels): - """Weights function for 'prepend_inputs_to_targets'. +def weights_prepend_inputs_to_targets(labels): + """Assign weight 1.0 to only the "targets" portion of the labels. Weight 1.0 is assigned to all nonzero labels past the first zero. + See prepend_mode in common_hparams.py Args: labels: A Tensor of int32s. @@ -1372,7 +1373,7 @@ def weights_second_part(labels): Returns: A Tensor of floats. """ - past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0))) + past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0)), axis=1) nonzero = tf.to_float(labels) return tf.to_float(tf.not_equal(past_first_zero * nonzero, 0)) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 84f9adbe7..01728ba24 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -510,6 +510,10 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" + @property + def top_dimensionality(self): + return 256 + @property def targets_dimensionality(self): return self._vocab_size diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py index d6fb95cf3..4dd1cde03 100644 --- a/tensor2tensor/layers/rev_block.py +++ b/tensor2tensor/layers/rev_block.py @@ -23,6 +23,7 @@ from __future__ import division from __future__ import print_function +import random import re # Dependency imports @@ -34,122 +35,304 @@ LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*") -def _rev_layer_forward(xs, f, g): +def _acc_grads(*lists_of_grads): + """Accumulates lists of gradients.""" + acc_grads = [] + for grads in zip(*lists_of_grads): + grads = [g for g in grads if g is not None] + if grads: + acc_grads.append(tf.add_n(grads)) + else: + acc_grads.append(None) + return acc_grads + + +def _rev_layer_forward(xs, f, g, f_side_input, g_side_input, + gate_outputs=False): """Forward for 1 reversible layer.""" x1, x2 = xs with tf.variable_scope("f"): - y1 = x1 + f(x2) + y1 = x1 + (f(x2, f_side_input) if f_side_input else f(x2)) with tf.variable_scope("g"): - y2 = x2 + g(y1) - return tf.tuple([y1, y2]) + y2 = x2 + (g(y1, g_side_input) if g_side_input else g(y1)) + if gate_outputs: + return tf.tuple([y1, y2]) + else: + return (y1, y2) -def _rev_layer_backward(ys, grad_ys, f, g, f_vars, g_vars): +def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars, + g_side_input): """Backprop for 1 layer.""" y1, y2 = ys grad_y1, grad_y2 = grad_ys # Reconstruct intermediates and inputs (x1, x2) - # stop_gradients required on y1 and x2 to prevent infinite recursion into this + # stop_gradients required on fn inputs to prevent infinite recursion into this # grad function on the calls to tf.gradients. 
y1_stop = tf.stop_gradient(y1) + g_side_input = [tf.stop_gradient(t) for t in g_side_input] with tf.variable_scope("g"): - gy1 = g(y1_stop) + gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop) x2 = y2 - gy1 x2_stop = tf.stop_gradient(x2) + f_side_input = [tf.stop_gradient(t) for t in f_side_input] with tf.variable_scope("f"): - fx2 = f(x2_stop) + fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop) x1 = y1 - fx2 # Compute gradients wrt to inputs # dL/dy2 * dG(y1)/y1 - grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2, gate_gradients=True)[0] + grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2)[0] grad_x1 = grad_y1 + grad_gy1_y2 - grad_x2 = ( - tf.gradients(fx2, x2_stop, grad_y1, gate_gradients=True)[0] + grad_y2 + - tf.gradients(fx2, x2_stop, grad_gy1_y2, gate_gradients=True)[0]) + grad_x2 = (tf.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 + tf.gradients( + fx2, x2_stop, grad_gy1_y2)[0]) - # Compute gradients wrt to vars in f and g - grad_g_vars = tf.gradients(gy1, g_vars, grad_y2, gate_gradients=True) - grad_f_y1 = tf.gradients(fx2, f_vars, grad_y1, gate_gradients=True) - grad_f_y2 = tf.gradients(fx2, f_vars, grad_gy1_y2, gate_gradients=True) - grad_f_vars = [tf.add_n(grads) for grads in zip(grad_f_y1, grad_f_y2)] + # Compute gradients wrt to vars and side inputs in f and g + grads1 = tf.gradients(gy1, g_vars + g_side_input, grad_y2) + grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):] + grads2 = tf.gradients(fx2, f_vars + f_side_input, grad_y1) + grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):] + grads3 = tf.gradients(fx2, f_vars + f_side_input, grad_gy1_y2) + grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):] + grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2) + + grad_f_side = _acc_grads(grad_f_side1, grad_f_side2) # Put returns in a tuple to ensure a constant memory budget (i.e. don't want # the subsequent layer to start computing and consuming memory based on a # subset of these values). 
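The reconstruction in the backward pass above is what makes the layer "reversible"; a minimal NumPy sketch (toy stand-ins for f and g, purely illustrative) of the round trip:

    import numpy as np

    def f(x): return 2.0 * x + 1.0   # stand-ins for the attention / ffn
    def g(x): return x * x           # sub-layers; any shape-preserving functions

    x1 = np.array([1.0, 2.0]); x2 = np.array([3.0, 4.0])
    # Forward (as in _rev_layer_forward):
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    # Backward recovers the inputs from the outputs, so the x1/x2 activations
    # never need to be stored (as in _rev_layer_backward):
    x2_rec = y2 - g(y1)
    x1_rec = y1 - f(x2_rec)
    assert np.allclose(x1_rec, x1) and np.allclose(x2_rec, x2)
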
- outs = tf.tuple([x1, x2, grad_x1, grad_x2] + grad_f_vars + grad_g_vars) + outs = tf.tuple([x1, x2, grad_x1, grad_x2] + grad_f_vars + grad_g_vars + + grad_f_side + grad_g_side) x1, x2, grad_x1, grad_x2 = outs[:4] - grad_f_vars = outs[4:4 + len(grad_f_vars)] - grad_g_vars = outs[4 + len(grad_f_vars):] - - return (x1, x2), (grad_x1, grad_x2), grad_f_vars, grad_g_vars - - -def _rev_block_forward(x, f, g, num_layers=1, layer_scopes=None, name=None): + grad_f_vars_end = 4 + len(grad_f_vars) + grad_g_vars_end = grad_f_vars_end + len(grad_g_vars) + grad_f_side_end = grad_g_vars_end + len(grad_f_side) + + grad_f_vars = outs[4:grad_f_vars_end] + grad_g_vars = outs[grad_f_vars_end:grad_g_vars_end] + grad_f_side = outs[grad_g_vars_end:grad_f_side_end] + grad_g_side = outs[grad_f_side_end:] + + return ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side), + (grad_g_vars, grad_g_side)) + + +def _rev_block_forward(x1, + x2, + f, + g, + num_layers=1, + f_side_input=None, + g_side_input=None, + layer_scopes=None, + gate_outputs=False, + name=None): """Forward for a series of reversible layers.""" - x1, x2 = tf.split(x, 2, axis=len(x.get_shape()) - 1) out = (x1, x2) with tf.variable_scope(name, default_name="revblock"): for i in xrange(num_layers): with tf.variable_scope("revlayer_%d" % i) as layer_vs: if layer_scopes is not None: layer_scopes.append(layer_vs) - out = _rev_layer_forward(out, f, g) + out = _rev_layer_forward( + out, + f[i], + g[i], + f_side_input, + g_side_input, + gate_outputs=gate_outputs) y1, y2 = out - y = tf.concat([y1, y2], axis=-1) - return y + return y1, y2 + + +def _underlying_variable(t): + """Find the underlying variable ref, ignoring Identity ops.""" + while t.op.type == "Identity": + t = t.op.inputs[0] + if t.dtype == dtypes.float32_ref and "Variable" in t.op.type: + return t + else: + return None + + +def fn_with_custom_grad(grad_fn): + """Decorator to create a subgraph with a custom gradient function. + + The subgraph created by the decorated function is NOT put in a Defun and so + does not suffer from the limitations of the Defun (all subgraph ops on the + same device, no summaries). + + Args: + grad_fn: function with signature + (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars), + all of which are lists of Tensors. + + Returns: + Decorator for function such that the gradient is defined by grad_fn. + """ + + def dec(fn): + + def wrapped(*args): + return _fn_with_custom_grad(fn, args, grad_fn) + + return wrapped + return dec -def rev_block(x, f, g, num_layers=1, is_training=True): + +def _fn_with_custom_grad(fn, inputs, grad_fn): + """Create a subgraph with a custom gradient. + + Args: + fn: function that takes inputs as arguments and produces 1 or more Tensors. + inputs: list, will be passed as fn(*inputs). + grad_fn: function with signature + (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars), + all of which are lists of Tensors. 
+ + Returns: + fn(*inputs) + """ + with tf.variable_scope(None, default_name="fn_with_custom_grad") as vs: + inputs = list(inputs) + outputs = fn(*inputs) + train_vars = list(vs.trainable_variables()) + + if grad_fn is None: + return outputs + else: + if not (isinstance(outputs, tuple) or isinstance(outputs, list)): + outputs = [outputs] + outputs = list(outputs) + + in_types = [t.dtype for t in inputs] + out_types = [t.dtype for t in outputs] + var_types = [t.dtype for t in train_vars] + + def custom_grad_fn(op, *dys): + """Custom grad fn applying grad_fn for identity Defun.""" + dys = list(dys) + fn_inputs = op.inputs[:len(inputs)] + fn_vars = op.inputs[len(inputs):len(inputs) + len(train_vars)] + fn_outputs = op.inputs[len(inputs) + len(train_vars):] + assert len(fn_outputs) == len(outputs) + assert len(fn_outputs) == len(dys) + + grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys) + grad_outputs = [None] * len(fn_outputs) + return tuple(grad_inputs + grad_vars + grad_outputs) + + # The Defun takes as input the original inputs, the trainable variables + # created in fn, and the outputs. In the forward it passes through the + # outputs. In the backwards, it produces gradients for the original inputs + # and the trainable variables. + @function.Defun( + *(in_types + var_types + out_types), + func_name="identity_custom_grad%d" % random.randint(1, 10**9), + python_grad_func=custom_grad_fn, + shape_func=lambda _: [t.get_shape() for t in outputs]) + def identity(*args): + outs = args[len(inputs) + len(train_vars):] + return tuple([tf.identity(t) for t in outs]) + + id_out = identity(*(inputs + train_vars + outputs)) + return id_out + + +def rev_block(x1, + x2, + f, + g, + num_layers=1, + f_side_input=None, + g_side_input=None, + is_training=True): """A block of reversible residual layers. A reversible residual layer is defined as: ``` - x1, x2 = tf.split(x, 2, axis=-1) - y1 = x1 + f(x2) - y2 = x2 + g(y1) - y = tf.concat([y1, y2], axis=-1) + y1 = x1 + f(x2, f_side_input) + y2 = x2 + g(y1, g_side_input) ``` + A reversible residual block, defined here, is a series of reversible residual + layers. + + Limitations: + * f and g must not close over any Tensors; all side inputs to f and g should + be passed in with f_side_input and g_side_input which will be forwarded to + f and g. + * f and g must not change the dimensionality of their inputs in order for the + addition in the equations above to work. + Args: - x: a float Tensor, input, will be split evenly across the last dim. - f: a function, (Tensor) -> (Tensor). Should not change the shape of the - Tensor. May create variables. Should NOT close over any Tensor values. - g: a function, (Tensor) -> (Tensor). Should not change the shape of the - Tensor. May create variables. Should NOT close over any Tensor values. + x1: a float Tensor. + x2: a float Tensor. + f: a function, (Tensor) -> (Tensor) (or list of such of length num_layers). + Should not change the shape of the Tensor. Expected to create variables. + See f_side_input if there are side inputs. + g: a function, (Tensor) -> (Tensor) (or list of such of length num_layers). + Should not change the shape of the Tensor. Expected to create variables. + See g_side_input if there are side inputs. num_layers: int, number of reversible residual layers. Each layer will apply f and g according to the equations above, with new variables in each layer. + f_side_input: list of Tensors, side input to f. If not None, signature of f + should be (Tensor, list) -> (Tensor). 
+ g_side_input: list of Tensors, side input to g. If not None, signature of g + should be (Tensor, list) -> (Tensor). is_training: bool, whether to actually use the efficient backprop codepath. Returns: - y: a float Tensor, output. + y1, y2: tuple of float Tensors. """ + if f_side_input is None: + f_side_input = [] + if g_side_input is None: + g_side_input = [] + if isinstance(f, list): + assert len(f) == num_layers + else: + f = [f] * num_layers + if isinstance(g, list): + assert len(g) == num_layers + else: + g = [g] * num_layers + + # Filled by the forward function below layer_scopes = [] - def rev_block_grad(op, grad_y): + def custom_grad_fn(inputs, variables, ys, grad_ys): """Custom gradient fn for a block of reversible residual layers.""" - y = op.outputs[0] - ys = tf.split(y, 2, axis=len(y.get_shape()) - 1) - grad_ys = tf.split(grad_y, 2, axis=len(y.get_shape()) - 1) + side_inputs = inputs[2:] + f_side_idxs = [None] * len(f_side_input) + g_side_idxs = [None] * len(g_side_input) + assert len(side_inputs) == len(f_side_input) + len(g_side_input) + + for i, t in enumerate(side_inputs): + if t in f_side_input: + f_side_idxs[f_side_input.index(t)] = i + elif t in g_side_input: + g_side_idxs[g_side_input.index(t)] = i + else: + assert False - # Find all variables from f and from g - # Keep track of their positions in all_vars - all_vars = op.inputs[1:] f_vars = [[] for _ in range(num_layers)] g_vars = [[] for _ in range(num_layers)] f_vars_idxs = [[] for _ in range(num_layers)] g_vars_idxs = [[] for _ in range(num_layers)] - for i, v in enumerate(all_vars): - ref = v.op.inputs[0] - assert ref.dtype == dtypes.float32_ref - regex = LAYER_RE.match(v.name) + for i, t in enumerate(variables): + ref = _underlying_variable(t) + + # Use the name to identify the layer number and function (f or g) + regex = LAYER_RE.match(ref.name) layer_no = int(regex.group(1)) fn_name = regex.group(2) if fn_name == "f": @@ -160,45 +343,71 @@ def rev_block_grad(op, grad_y): g_vars[layer_no].append(ref) g_vars_idxs[layer_no].append(i) - f_grads = [] - g_grads = [] + f_var_grads = [] + g_var_grads = [] + f_side_grads = [] + g_side_grads = [] - # Reverse state containers to go backward + # Reverse variable containers to go backward layer_scopes.reverse() f_vars.reverse() g_vars.reverse() + f.reverse() + g.reverse() for i in xrange(num_layers): with tf.variable_scope(layer_scopes[i], reuse=True): - ys, grad_ys, grad_f_vars, grad_g_vars = _rev_layer_backward( - ys, grad_ys, f, g, f_vars[i], g_vars[i]) - f_grads.append(grad_f_vars) - g_grads.append(grad_g_vars) - - # Gradients were collected in reverse layer order - f_grads.reverse() - g_grads.reverse() - - # Reorder the gradients so they match the original order of all_vars - var_grads = [None] * len(all_vars) - for idxs, grads in zip(f_vars_idxs, f_grads) + zip(g_vars_idxs, g_grads): + + ys, grad_ys, f_ret, g_ret = _rev_layer_backward(ys, grad_ys, f[i], g[i], + f_vars[i], f_side_input, + g_vars[i], g_side_input) + + grad_f_vars, grad_f_side = f_ret + grad_g_vars, grad_g_side = g_ret + f_var_grads.append(grad_f_vars) + g_var_grads.append(grad_g_vars) + f_side_grads.append(grad_f_side) + g_side_grads.append(grad_g_side) + + # Accumulate layer gradients for f_side_input and g_side_input + acc_f_side_grads = _acc_grads(*f_side_grads) + acc_g_side_grads = _acc_grads(*g_side_grads) + + # Use the stored idxs to put gradients in the passed-in order. 
+ side_input_grads = [None] * len(side_inputs) + variable_grads = [None] * len(variables) + + # Variable gradients were collected in reverse layer order. Reverse to match + # idxs. + f_var_grads.reverse() + g_var_grads.reverse() + for idxs, grads in zip(f_vars_idxs, f_var_grads) + zip( + g_vars_idxs, g_var_grads): for i, grad in zip(idxs, grads): - var_grads[i] = grad - - grad_x = tf.concat(grad_ys, axis=-1) - all_grads = [grad_x] + var_grads - return all_grads - - @function.Defun( - tf.float32, - python_grad_func=rev_block_grad, - shape_func=lambda _: [x.get_shape()]) - def rev_block_defun(inp): - inp.set_shape(x.get_shape()) - return _rev_block_forward( - inp, f, g, num_layers=num_layers, layer_scopes=layer_scopes) + variable_grads[i] = grad - if is_training: - return rev_block_defun(x) - else: - return _rev_block_forward(x, f, g, num_layers=num_layers) + for i, grad in zip(f_side_idxs, acc_f_side_grads): + side_input_grads[i] = grad + for i, grad in zip(g_side_idxs, acc_g_side_grads): + side_input_grads[i] = grad + + grad_x1, grad_x2 = grad_ys + return [grad_x1, grad_x2] + side_input_grads, variable_grads + + # Need a forward function with positional arguments + @fn_with_custom_grad(custom_grad_fn if is_training else None) + def forward(x1, x2, *side_inputs): + f_side = side_inputs[:len(f_side_input)] + g_side = side_inputs[len(f_side_input):] + return _rev_block_forward( + x1, + x2, + f, + g, + num_layers=num_layers, + f_side_input=f_side, + g_side_input=g_side, + layer_scopes=layer_scopes, + gate_outputs=is_training) + + return forward(x1, x2, *(f_side_input + g_side_input)) diff --git a/tensor2tensor/layers/rev_block_test.py b/tensor2tensor/layers/rev_block_test.py index bc4bcc6a4..dd4a62993 100644 --- a/tensor2tensor/layers/rev_block_test.py +++ b/tensor2tensor/layers/rev_block_test.py @@ -27,66 +27,177 @@ class RevBlockTest(tf.test.TestCase): - - def testSmoke(self): - channels = 8 - num_layers = 4 - batch_size = 16 - use_defun = True + CHANNELS = 8 + NUM_LAYERS = 4 + BATCH_SIZE = 16 + + def _testRevBlock(self, + x=None, + f=None, + g=None, + f_side_input=None, + g_side_input=None): tf.set_random_seed(1234) - def f(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if f is None: - def g(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + def f(x): # pylint: disable=function-redefined + return tf.layers.dense(x, self.CHANNELS // 2, use_bias=True) - x = tf.random_uniform([batch_size, channels], dtype=tf.float32) - y = rev_block.rev_block( - x, f, g, num_layers=num_layers, is_training=use_defun) - loss = tf.reduce_mean(y + 10.) 
- grads = tf.gradients(loss, [x] + tf.global_variables()) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - _ = sess.run(grads) + if g is None: - def testRevBlock(self): - channels = 8 - num_layers = 4 - batch_size = 16 - tf.set_random_seed(1234) + def g(x): # pylint: disable=function-redefined + return tf.layers.dense(x, self.CHANNELS // 2, use_bias=True) - def f(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if f_side_input is None: + f_side_input = [] - def g(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if g_side_input is None: + g_side_input = [] - x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + x = tf.random_uniform([self.BATCH_SIZE, self.CHANNELS], dtype=tf.float32) + x1, x2 = tf.split(x, 2, axis=1) - with tf.variable_scope("defun") as vs: - y_defun = rev_block.rev_block(x, f, g, num_layers=num_layers) + with tf.variable_scope("rev_test") as vs: + y1_rev, y2_rev = rev_block.rev_block( + x1, + x2, + f, + g, + f_side_input=f_side_input, + g_side_input=g_side_input, + num_layers=self.NUM_LAYERS) + y_rev = tf.concat([y1_rev, y2_rev], axis=1) fg_vars = vs.trainable_variables() num_vars = len(tf.global_variables()) with tf.variable_scope(vs, reuse=True): - y = rev_block.rev_block(x, f, g, num_layers=num_layers, is_training=False) + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + f_side_input=f_side_input, + g_side_input=g_side_input, + num_layers=self.NUM_LAYERS, + is_training=False) + y = tf.concat([y1, y2], axis=1) # Ensure no new vars were created - full reuse assert len(tf.global_variables()) == num_vars - loss_defun = tf.reduce_mean(y_defun + 10.) + loss_rev = tf.reduce_mean(y_rev + 10.) loss = tf.reduce_mean(y + 10.) - grads_defun = tf.gradients(loss_defun, [x] + fg_vars) - grads = tf.gradients(loss, [x] + fg_vars) + wrt = [x] + f_side_input + g_side_input + fg_vars + grads_rev = tf.gradients(loss_rev, wrt) + grads = tf.gradients(loss, wrt) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) - y_val, yd_val, gd_val, g_val = sess.run([y, y_defun, grads_defun, grads]) + y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads]) self.assertAllClose(y_val, yd_val) for g1, g2 in zip(gd_val, g_val): self.assertAllClose(g1, g2) + def testRevBlock(self): + self._testRevBlock() + + def testSideInput(self): + f_side_input = tf.random_uniform([self.BATCH_SIZE, self.CHANNELS // 2]) + + def f(x, side_input): + return tf.layers.dense( + x, self.CHANNELS // 2, use_bias=True) + side_input[0] + + self._testRevBlock(f=f, f_side_input=[f_side_input]) + + def testMultipleFns(self): + + def f1(x): + return tf.layers.dense(x, self.CHANNELS // 2) + + def f2(x): + return tf.layers.dense(x, self.CHANNELS // 2, activation=tf.nn.relu) + + self._testRevBlock(f=[f1, f2, f1, f2]) + + +class FnWithCustomGradTest(tf.test.TestCase): + + def testCorrectness(self): + + w = tf.random_uniform([6, 10]) + + def fn(a, b, c): + return tf.layers.dense( + a, + 10, + use_bias=False, + kernel_initializer=lambda shape, dtype, partition_info: w + ) + tf.matmul(b, c) + + def grad_fn(inputs, variables, outputs, grad_outputs): + outputs = outputs[0] + grad_outputs = grad_outputs[0] + grad_inputs = tf.gradients(outputs, inputs, grad_ys=grad_outputs) + grad_vars = tf.gradients(outputs, variables, grad_ys=grad_outputs) + return grad_inputs, grad_vars + + custom_fn = rev_block.fn_with_custom_grad(grad_fn)(fn) + + a = tf.random_uniform([11, 6]) + b = tf.random_uniform([11, 7]) + c = tf.random_uniform([7, 10]) 
+ + out = fn(a, b, c) + custom_out = custom_fn(a, b, c) + self.assertEqual(out.get_shape().as_list(), + custom_out.get_shape().as_list()) + + loss = tf.reduce_mean(out) + custom_loss = tf.reduce_mean(custom_out) + + grads = tf.gradients(loss, [a, b, c] + [tf.trainable_variables()[0]]) + custom_grads = tf.gradients(custom_loss, + [a, b, c] + [tf.trainable_variables()[1]]) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + out_val, custom_out_val, grads_val, custom_grads_val = sess.run( + [out, custom_out, grads, custom_grads]) + self.assertAllClose(out_val, custom_out_val) + for g1, g2 in zip(grads_val, custom_grads_val): + self.assertAllClose(g1, g2) + + def testCustomGrad(self): + + def fn(a, b, c): + return tf.layers.dense(a, 10, use_bias=False) + tf.matmul(b, c) + + def grad_fn(inputs, variables, unused_outputs, unused_grad_outputs): + grad_inputs = [tf.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)] + grad_vars = [ + tf.ones_like(t) * (i + len(inputs) + 1.) + for i, t in enumerate(variables) + ] + return grad_inputs, grad_vars + + a = tf.random_uniform([11, 6]) + b = tf.random_uniform([11, 7]) + c = tf.random_uniform([7, 10]) + w = tf.random_uniform([6, 10]) + out = rev_block.fn_with_custom_grad(grad_fn)(fn)(a, b, c) + loss = tf.reduce_mean(out) + grads = tf.gradients(loss, [a, b, c, tf.trainable_variables()[0]]) + expected_grads = [ + tf.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w]) + ] + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + g_val, eg_val = sess.run([grads, expected_grads]) + for g1, g2 in zip(g_val, eg_val): + self.assertAllClose(g1, g2) + if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 19f1915e8..3302f45be 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -72,8 +72,13 @@ def attention_lm_prepare_decoder(targets, hparams): decoder_self_attention_bias: a Tensor, containing large negative values to implement masked attention and possibly baises for diagonal alignments """ - decoder_self_attention_bias = ( - common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + if hparams.prepend_mode == "prepend_inputs_full_attention": + decoder_self_attention_bias = ( + common_attention.attention_bias_prepended( + common_attention.embedding_to_padding(targets))) + else: + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) decoder_input = common_layers.shift_left_3d(targets) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) @@ -153,6 +158,7 @@ def attention_lm_base(): hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("encoder_full_attention", int(False)) return hparams @@ -181,9 +187,26 @@ def attention_lm_translation(): hparams = attention_lm_base() hparams.layer_preprocess_sequence = "n" hparams.layer_postprocess_sequence = "da" - hparams.learning_rate = 0.1 - hparams.prepend_inputs_to_targets = int(True) + hparams.learning_rate = 0.4 + hparams.prepend_mode = "prepend_inputs_masked_attention" hparams.max_length = 512 hparams.label_smoothing = 0.1 hparams.shared_embedding_and_softmax_weights = int(True) return hparams + + +@registry.register_hparams +def attention_lm_translation_l12(): + """Version to use for seq2seq.""" + hparams = 
attention_lm_translation() + hparams.batch_size = 4096 + hparams.num_hidden_layers = 12 + return hparams + + +@registry.register_hparams +def attention_lm_translation_full_attention(): + """Version to use for seq2seq.""" + hparams = attention_lm_translation() + hparams.prepend_mode = "prepend_inputs_full_attention" + return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 268e93f7b..9c55eadd6 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -39,6 +39,19 @@ import tensorflow as tf +class AttentionMoeType(object): + NONE = "none" + LOCAL = "local" + GLOBAL = "global" + + @staticmethod + def get_choices(): + return [ + AttentionMoeType.NONE, + AttentionMoeType.LOCAL, + ] + + @registry.register_model class AttentionLmMoe(t2t_model.T2TModel): """Attention net. See file docstring.""" @@ -66,17 +79,33 @@ def postprocess(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp( - common_attention.multihead_attention, - preprocess(x), - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") + x = preprocess(x) + if hparams.attention_moe_type == AttentionMoeType.NONE: + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") + elif hparams.attention_moe_type == AttentionMoeType.LOCAL: + y, loss = dp( + common_attention.local_expert_attention, + x, + k=2, + loss_coef=1e-2, + attention_num_experts=hparams.attention_num_experts, + train=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + mask_right=True) + # TODO(avaswani, epot, noam): Do we need to divide by num shards ? + extra_loss += tf.add_n(loss)/dp.n + else: + raise ValueError("Only {} supported for now.".format( + AttentionMoeType.get_choices())) x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): @@ -118,8 +147,13 @@ def attention_lm_moe_prepare_decoder(targets, hparams): decoder_self_attention_bias: a Tensor, containing large negative values to implement masked attention and possibly baises for diagonal alignments """ - decoder_self_attention_bias = ( - common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + if hparams.prepend_mode == "prepend_inputs_full_attention": + decoder_self_attention_bias = ( + common_attention.attention_bias_prepended( + common_attention.embedding_to_padding(targets))) + else: + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) decoder_input = common_layers.shift_left_3d(targets) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) @@ -169,6 +203,9 @@ def attention_lm_moe_base(): hparams.add_hparam("relu_dropout", 0.0) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("moe_layers", "2") # comma separated list of layer numbers + # moe params. local attention moe. 
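For the AttentionMoeType.LOCAL path above, a toy NumPy view (illustration only) of the batch coordinates that common_attention.local_expert_attention hands to each expert:

    import numpy as np

    # coordinate_tensor(shape, axis=0) fills a tensor with its index along the
    # batch axis; e.g. for a [batch=2, length=3] input:
    batch_coordinate = np.zeros((2, 3), dtype=np.int32) + np.arange(2).reshape(-1, 1)
    # [[0, 0, 0],
    #  [1, 1, 1]]
    # self_attention_expert() compares these coordinates pairwise and adds a
    # -1e9 bias wherever two positions belong to different sequences, so tokens
    # dispatched to the same expert never attend across batch elements.
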
+ hparams.add_hparam("attention_moe_type", AttentionMoeType.NONE) + hparams.add_hparam("attention_num_experts", 16) return hparams @@ -206,6 +243,21 @@ def attention_lm_moe_tiny(): return hparams +@registry.register_hparams +def attention_lm_attention_moe_tiny(): + """Cheap model for debugging. + + Returns: + an hparams object. + """ + hparams = attention_lm_moe_small() + hparams.moe_layers = "" + hparams.attention_num_experts = 16 + hparams.filter_size = 512 + hparams.attention_moe_type = AttentionMoeType.LOCAL + return hparams + + @registry.register_hparams def attention_lm_no_moe_small(): """Without the mixture of experts (for comparison). @@ -249,3 +301,20 @@ def attention_lm_moe_large(): hparams.moe_num_experts = 128 hparams.layer_prepostprocess_dropout = 0.2 return hparams + + +@registry.register_hparams +def attention_lm_moe_translation(): + """Version to use for seq2seq.""" + hparams = attention_lm_moe_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.4 + hparams.prepend_mode = "prepend_inputs_masked_attention" + hparams.max_length = 512 + hparams.label_smoothing = 0.1 + hparams.layer_prepostprocess_dropout = 0.2 + hparams.num_hidden_layers = 6 + hparams.moe_layers = "0,1,2,3,4,5" + hparams.shared_embedding_and_softmax_weights = int(True) + return hparams diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index 27aa631c6..9d676632e 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -130,8 +130,14 @@ def fc_layer(x, num_out, dropout_rate, name="fc"): def gene_expression_conv_base(): """Hparams for GeneExpressionConv model.""" hparams = common_hparams.basic_params1() - hparams.max_length = 10000000 - hparams.batch_size = 1024 + + batch_size = 10 + output_length = 2048 + inputs_per_output = 128 + chunk_size = 4 + input_length = output_length * inputs_per_output // chunk_size + hparams.batch_size = input_length * batch_size + hparams.dropout = 0.1 hparams.add_hparam("num_conv_layers", 4) hparams.add_hparam("num_dconv_layers", 7) diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index e2307f49f..cc4cd1200 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -70,7 +70,7 @@ def testGeneExpressionModels(self): gene_expression_conv_test())] for model_cls, hparams in models_hparams: hparams.add_hparam("data_dir", None) - p_hparams = gene_data.GeneExpressionCAGE10().internal_hparams(hparams) + p_hparams = gene_data.GenomicsExpressionCage10().internal_hparams(hparams) hparams.problems = [p_hparams] self._testModel(hparams, model_cls) diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index d4514408d..af609e22c 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -33,6 +33,7 @@ from tensor2tensor.models import lstm from tensor2tensor.models import multimodel from tensor2tensor.models import neural_gpu +from tensor2tensor.models import rev_transformer from tensor2tensor.models import shake_shake from tensor2tensor.models import slicenet from tensor2tensor.models import transformer diff --git a/tensor2tensor/models/rev_transformer.py b/tensor2tensor/models/rev_transformer.py new file mode 100644 index 000000000..d1392a1ee --- /dev/null +++ b/tensor2tensor/models/rev_transformer.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reversible Residual Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.layers import rev_block +from tensor2tensor.models import transformer +from tensor2tensor.utils import registry + +import tensorflow as tf + + +@registry.register_model +class RevTransformer(transformer.Transformer): + """Reversible Residual Transformer. + + Layers are reversible and are recomputed on the backward pass. + + y1 = x1 + f(x2) + y2 = x2 + g(y1) + + f: Attention + g: Feed-forward + """ + + def model_fn_body(self, features): + hparams = self._hparams + targets = features["targets"] + inputs = features["inputs"] + target_space = features["target_space_id"] + + inputs = common_layers.flatten4d3d(inputs) + targets = common_layers.flatten4d3d(targets) + + (encoder_input, encoder_self_attention_bias, + encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, + decoder_self_attention_bias) = transformer.transformer_prepare_decoder( + targets, hparams) + + encoder_input = tf.nn.dropout(encoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + decoder_input = tf.nn.dropout(decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = rev_transformer_encoder( + encoder_input, encoder_self_attention_bias, hparams) + + decoder_output = rev_transformer_decoder( + decoder_input, encoder_output, decoder_self_attention_bias, + encoder_decoder_attention_bias, hparams) + decoder_output = tf.expand_dims(decoder_output, 2) + + return decoder_output + + +def rev_transformer_encoder(encoder_input, + encoder_self_attention_bias, + hparams, + name="encoder"): + """A stack of transformer layers. 
+ + Args: + encoder_input: a Tensor + encoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + + def f(x, side_input): + """f(x) for reversible layer, self-attention layer.""" + encoder_self_attention_bias = side_input[0] + + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), None, encoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + def g(x): + """g(x) for reversible layer, feed-forward layer.""" + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("ffn"): + y = transformer.transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + x1, x2 = tf.split(encoder_input, 2, axis=-1) + + with tf.variable_scope(name): + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + num_layers=hparams.num_hidden_layers, + f_side_input=[encoder_self_attention_bias], + is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + y = tf.concat([y1, y2], axis=-1) + + return common_layers.layer_preprocess(y, hparams) + + +def rev_transformer_decoder(decoder_input, + encoder_output, + decoder_self_attention_bias, + encoder_decoder_attention_bias, + hparams, + name="decoder"): + """A stack of transformer layers.
+ + Args: + decoder_input: a Tensor + encoder_output: a Tensor + decoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + + def f(x, side_input): + """f(x) for reversible layer, self-attention and enc-dec attention.""" + decoder_self_attention_bias = side_input[0] + encoder_decoder_attention_bias = side_input[1] + encoder_output = side_input[2] + + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), None, decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + if encoder_output is not None: + with tf.variable_scope("encdec_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), encoder_output, encoder_decoder_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + def g(x): + """g(x) for reversible layer, feed-forward layer.""" + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + with tf.variable_scope("ffn"): + y = transformer.transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + x1, x2 = tf.split(decoder_input, 2, axis=-1) + + with tf.variable_scope(name): + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + num_layers=hparams.num_hidden_layers, + f_side_input=[ + decoder_self_attention_bias, encoder_decoder_attention_bias, + encoder_output + ], + is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + y = tf.concat([y1, y2], axis=-1) + return common_layers.layer_preprocess(y, hparams) + + +@registry.register_hparams +def rev_transformer_base(): + """Base hparams for RevTransformer.""" + hparams = transformer.transformer_big() + + # Use settings from transformer_n_da + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.4 + + return hparams + + +@registry.register_hparams +def rev_transformer_big(): + """Big hparams for RevTransformer.""" + hparams = rev_transformer_base() + + # The RevTransformer uses significantly less memory than the Transformer. + # Increase batch size and model size. + hparams.batch_size *= 2 + hparams.hidden_size *= 2 + hparams.num_heads *= 2 + hparams.num_hidden_layers += 1 + return hparams diff --git a/tensor2tensor/models/rev_transformer_test.py b/tensor2tensor/models/rev_transformer_test.py new file mode 100644 index 000000000..da9e15f72 --- /dev/null +++ b/tensor2tensor/models/rev_transformer_test.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for RevTransformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import rev_transformer + +import tensorflow as tf + + +def rev_transformer_test(): + hparams = rev_transformer.rev_transformer_base() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 128 + hparams.filter_size = 512 + hparams.num_heads = 2 + return hparams + + +class RevTransformerTest(tf.test.TestCase): + + def testTransformer(self): + batch_size = 3 + input_length = 5 + target_length = 7 + vocab_size = 9 + hparams = rev_transformer_test() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + hparams.problems = [p_hparams] + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, target_length, 1, 1)) + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = rev_transformer.RevTransformer( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _ = model.model_fn(features) + logits = tf.concat(sharded_logits, 0) + grads = tf.gradients( + tf.reduce_mean(logits), [features["inputs"]] + tf.global_variables()) + grads = [g for g in grads if g is not None] + + with self.test_session() as session: + session.run(tf.global_variables_initializer()) + logits_val, _ = session.run([logits, grads]) + self.assertEqual(logits_val.shape, (batch_size, target_length, 1, 1, + vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ffd791a04..6a3f3afdf 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -26,6 +26,7 @@ from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer +from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -78,18 +79,45 @@ def decompress_step(source, c, hparams, first_relu, name): shape = tf.shape(source) if c is not None: source = attend(source, c, hparams, "decompress_attend") - first = common_layers.conv_block( - source, - hparams.hidden_size, [((1, 1), (3, 1)), ((1, 1), (3, 1))], - first_relu=first_relu, padding="SAME", name="decompress_conv1") - second = common_layers.conv_block( - tf.concat([source, first], axis=3), - hparams.hidden_size, [((1, 1), (3, 1)), ((1, 1), (3, 1))], - first_relu=first_relu, padding="SAME", name="decompress_conv2") - thicker = interleave(first, second) + thicker = common_layers.conv_block( + source, hparams.hidden_size * 2, [((1, 1), (1, 1))], + first_relu=first_relu, name="decompress_conv") return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size]) 
+def top_k_softmax(x, k): + """Calculate softmax(x), select top-k and rescale to sum to 1.""" + x = tf.nn.softmax(x) + top_x, _ = tf.nn.top_k(x, k=k+1) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) + x = tf.nn.relu((x - min_top) + 1e-12) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) + return x, tf.reduce_max(top_x, axis=-1) + + +def top_k_experts(x, k, hparams): + x_shape = tf.shape(x) + x_flat = tf.reshape(x, [-1, x.get_shape().as_list()[-1]]) + is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + gates, load = expert_utils.noisy_top_k_gating( + x_flat, hparams.v_size, is_training, k) + gates_shape = [x_shape[0], x_shape[1], x_shape[2], hparams.v_size] + gates = tf.reshape(gates, gates_shape) + load_loss = expert_utils.cv_squared(load) + return gates, load_loss + + +def dvae(x, k, hparams, name): + with tf.variable_scope(name): + m = tf.layers.dense(x, hparams.v_size, name="mask") + if k is None: + m = tf.nn.softmax(m) + kl = - tf.reduce_max(m, axis=-1) + else: + m, kl = top_k_softmax(m, k) + return m, 1.0 - tf.reduce_mean(kl) + + def vae(x, hparams, name): with tf.variable_scope(name): mu = tf.layers.dense(x, hparams.z_size, name="mu") @@ -117,24 +145,59 @@ def compress(x, c, hparams, name): return cur +def mix(x1, x2, steps, min_prob=0.0, max_prob=1.0, mode="lin"): + if mode == "lin": + alpha_p = common_layers.inverse_lin_decay(steps) + 0.001 + else: + alpha_p = common_layers.inverse_exp_decay(steps) + 0.001 + alpha_p = alpha_p * (max_prob - min_prob) + min_prob + alpha = tf.random_uniform(tf.shape(x1)) + alpha = tf.to_float(tf.less(alpha, alpha_p)) + return alpha * x1 + (1.0 - alpha) * x2 + + def vae_compress(x, c, hparams, compress_name, decompress_name, reuse=None): """Compress, then VAE.""" + mix_k = 8 with tf.variable_scope(compress_name, reuse=reuse): - cur = compress(x, c, hparams, "compress") + cur = compress(x, None, hparams, "compress") # Convolve and ReLu to get state. cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv") - z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + # z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + z, kl_loss = dvae(cur, None, hparams, name="dvae") + z1, kl_loss1 = top_k_experts(cur, mix_k, hparams) + mu, log_sigma = None, None + + # Mix expert-selection and flat selection. + alpha_p = common_layers.inverse_lin_decay(60000) + 0.001 + z = alpha_p * z1 + (1 - alpha_p) * z + kl_loss += kl_loss1 + + # Compress context. + with tf.variable_scope(compress_name, reuse=reuse): + compress_c = compress(c, None, hparams, "compress_context") + c_z = tf.layers.dense(compress_c, hparams.v_size, name="mask_context") + reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits( + labels=z, logits=c_z) + + # If not training, use the predicted z instead of the autoregressive one. + # if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN: + # z = mix(c_z, z, 50000, max_prob=0.3, mode="exp") + # z, _ = top_k_softmax(c_z, mix_k) with tf.variable_scope(decompress_name, reuse=reuse): # Decompress. z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") + # Leak at the beginning to help train. 
+ z = mix(z, cur, 30000) + for i in xrange(hparams.num_compress_steps): j = hparams.num_compress_steps - i - 1 z = residual_conv(z, 1, hparams, "decompress_rc_%d" % j) - z = decompress_step(z, c, hparams, i > 0, "decompress__step_%d" % j) - return z, kl_loss, mu, log_sigma + z = decompress_step(z, c, hparams, i > 0, "decompress_step_%d" % j) + return z, kl_loss + 0.0001 * reconstruct_loss, mu, log_sigma def encode(x, x_space, hparams, name): @@ -167,7 +230,6 @@ def ffn(x, hparams, name): def vae_transformer_internal(inputs, targets, target_space, hparams): """VAE Transformer, main step used for training.""" with tf.variable_scope("vae_transformer"): - is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN # Prepare inputs, targets, and k. inputs = common_layers.flatten4d3d(inputs) input_len = tf.shape(inputs)[1] # Double input size to cover targets. @@ -179,39 +241,25 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): inputs, targets, final_length_divisible_by=k) inputs = encode(inputs, target_space, hparams, "input_enc") - # Dropout targets or swap for zeros 5% of the time. - targets_nodrop = targets - max_prestep = hparams.kl_warmup_steps - prob_targets = 0.95 if is_training else 1.0 - targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 - targets = dropmask(targets, targets_dropout_max * 0.7, is_training) - targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets), - lambda: targets, lambda: tf.zeros_like(targets)) - targets = targets_nodrop - # Compress and vae. - z = tf.get_variable("z", [hparams.hidden_size]) - z = tf.reshape(z, [1, 1, 1, -1]) - z = tf.tile(z, [tf.shape(inputs)[0], 1, 1, 1]) - - z = attend(z, inputs, hparams, "z_attendsi") - z = ffn(z, hparams, "zff2") - z = attend(z, targets, hparams, "z_attendst2") - z = ffn(z, hparams, "zff3") - z, kl_loss, _, _ = vae(z, hparams, name="vae") - z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") - - # z, kl_loss, _, _ = vae_compress( - # tf.expand_dims(targets, axis=2), tf.expand_dims(inputs, axis=2), - # hparams, "vae_compress", "vae_decompress") - - decoder_in = tf.squeeze(z, axis=2) + tf.zeros_like(targets) - (decoder_input, decoder_self_attention_bias) = ( - transformer.transformer_prepare_decoder(decoder_in, hparams)) - ret = transformer.transformer_decoder( - decoder_input, inputs, decoder_self_attention_bias, None, hparams) - - kl_loss *= common_layers.inverse_exp_decay(int(max_prestep * 1.5)) * 5.0 + z, kl_loss, _, _ = vae_compress(tf.expand_dims(targets, axis=2), + tf.expand_dims(inputs, axis=2), + hparams, "vae_compress", "vae_decompress") + + # Join z with inputs, run decoder. 
+ to_decode = common_layers.conv_block( + tf.concat([z, tf.expand_dims(inputs, axis=2)], axis=3), + hparams.hidden_size, [((1, 1), (1, 1))], name="join_z") + ret = encode(tf.squeeze(to_decode, axis=2), target_space, hparams, "dec") + + # For experiments with one-sided decoder: + # decoder_in = tf.squeeze(to_decode, axis=2) + # (decoder_input, decoder_self_attention_bias) = ( + # transformer.transformer_prepare_decoder(decoder_in, hparams)) + # ret = transformer.transformer_decoder( + # decoder_input, inputs, decoder_self_attention_bias, None, hparams) + + kl_loss *= common_layers.inverse_exp_decay(hparams.kl_warmup_steps) * 3.0 losses = {"kl": kl_loss} return tf.expand_dims(ret, axis=2), losses @@ -267,10 +315,11 @@ def transformer_vae_small(): """Set of hyperparameters.""" hparams = transformer.transformer_small() hparams.batch_size = 2048 - hparams.learning_rate_warmup_steps = 16000 + hparams.learning_rate_warmup_steps = 4000 hparams.add_hparam("z_size", 128) + hparams.add_hparam("v_size", 1024*8) hparams.add_hparam("num_compress_steps", 4) - hparams.add_hparam("kl_warmup_steps", 60000) + hparams.add_hparam("kl_warmup_steps", 50000) return hparams @@ -283,6 +332,6 @@ def transformer_vae_base(): hparams.attention_dropout = 0.0 hparams.relu_dropout = 0.0 hparams.dropout = 0.0 - hparams.num_hidden_layers = 3 + hparams.num_hidden_layers = 4 hparams.z_size = 256 return hparams diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index dd8275204..be6c28559 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -256,7 +256,7 @@ def grow_topk(i, alive_seq, alive_log_probs): topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2) - # Recovering the log probs becuase we will need to send them back + # Recovering the log probs because we will need to send them back topk_log_probs = topk_scores * length_penalty # Work out what beam the top probs are in. diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 5c7041014..03e7720b6 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -133,7 +133,8 @@ def preprocessing(examples, data_file_pattern): # all to the Problem class and its preprocess_examples method. Don't add. if "image" in data_file_pattern: def resize(img, size): - return tf.to_int64(tf.image.resize_images(img, [size, size])) + return tf.to_int64(tf.image.resize_images( + img, [size, size], tf.image.ResizeMethod.AREA)) if "img2img" in data_file_pattern: inputs = examples["inputs"] @@ -141,6 +142,9 @@ def resize(img, size): examples["targets"] = resize(inputs, 64) elif "image_celeba" in data_file_pattern: inputs = examples["inputs"] + # Remove boundaries in CelebA images. Remove 40 pixels each side + # vertically and 20 pixels each side horizontally. 
+ inputs = tf.image.crop_to_bounding_box(inputs, 40, 20, 218-80, 178-40) examples["inputs"] = resize(inputs, 8) examples["targets"] = resize(inputs, 32) elif "audio" in data_file_pattern: diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index da33cf90e..4ba8dc71a 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -271,7 +271,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index ac58ef3cd..6f26f20fa 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -34,6 +34,8 @@ from tensorflow.python.framework import function +DEFAULT_DEV_STRING = "existing_device" + @function.Defun( python_grad_func=lambda x, dy: tf.convert_to_tensor(dy), @@ -180,7 +182,14 @@ def caching_getter(getter, name, *args, **kwargs): reuse=True if i > 0 and self._reuse else None, caching_device=self._caching_devices[i], custom_getter=custom_getter): - with tf.device(self._devices[i]): + # TODO(noam, epot, avaswani) + # Allows for passing no device in case you want to default to the + # existing device. This is needed when we put all experts on a single + # device, for example in local_moe. + if self._devices[i] != DEFAULT_DEV_STRING: + with tf.device(self._devices[i]): + outputs.append(fns[i](*my_args[i], **my_kwargs[i])) + else: outputs.append(fns[i](*my_args[i], **my_kwargs[i])) if isinstance(outputs[0], tuple): outputs = list(zip(*outputs)) @@ -361,7 +370,6 @@ def _my_top_k(x, k): def noisy_top_k_gating(x, - input_size, num_experts, train, k=2, @@ -375,7 +383,6 @@ def noisy_top_k_gating(x, Args: x: input Tensor with shape [batch_size, input_size] - input_size: an integer num_experts: an integer train: a boolean - we only add noise at training time. k: an integer - number of experts per example @@ -389,6 +396,7 @@ def noisy_top_k_gating(x, load: a Tensor with shape [num_experts] """ with tf.variable_scope(name, default_name="noisy_top_k_gating"): + input_size = x.get_shape().as_list()[-1] w_gate = tf.get_variable( "w_gate", [input_size, num_experts], tf.float32, initializer) if noisy_gating: @@ -431,6 +439,25 @@ def noisy_top_k_gating(x, class SparseDispatcher(object): """Helper for implementing a mixture of experts. + The purpose of this class is to create input minibatches for the + experts and to combine the results of the experts to form a unified + output tensor. + + There are two functions: + dispatch - take an input Tensor and create input Tensors for each expert. + combine - take output Tensors from each expert and form a combined output + Tensor. Outputs from different experts for the same batch element are + summed together, weighted by the provided "gates". + + The class is initialized with a "gates" Tensor, which specifies which + batch elements go to which experts, and the weights to use when combining + the outputs. Batch element b is sent to expert e iff gates[b, e] != 0. + + The inputs and outputs are all two-dimensional [batch, depth]. + Caller is responsible for collapsing additional dimensions prior to + calling this class and reshaping the output to the original shape. + See reshape_like(). 
+ Example use: gates: a float32 `Tensor` with shape `[batch_size, num_experts]` @@ -526,8 +553,8 @@ class DistributedSparseDispatcher(object): """A distributed version of SparseDispatcher. Instead of one batch of input examples, we simultaneously process - num_datashards batches of input examples. The per-expert `Tensor`s contain - a combination of examples from the different datashards. + a list of num_datashards batches of input examples. The per-expert + `Tensor`s contain a combination of examples from the different datashards. Each datashard is associated with a particular device and each expert is associated with a particular device. All per-datashard and per-expert @@ -655,6 +682,13 @@ def reshape_like(a, b): return ret +def flatten_all_but_last(a): + """Flatten all dimensions of a except the last.""" + ret = tf.reshape(a, [-1, tf.shape(a)[-1]]) + ret.set_shape([None] + a.get_shape().as_list()[-1:]) + return ret + + def distributed_moe(data_parallelism, expert_devices, xs, @@ -676,7 +710,8 @@ def distributed_moe(data_parallelism, input_size: an integer (input size for this layer) expert_fn: a unary function for each expert to run It should take a Tensor with shape [batch_size, input_size] - and return a Tensor with shape [batch_size, output_size] + and return a Tensor with shape [batch_size, output_size]. + e.g. ffn_expert_fn(...) num_experts: an integer - number of experts k: an integer - how many experts to use for each batch element loss_coef: a scalar - multiplier on load-balancing losses @@ -703,7 +738,6 @@ def distributed_moe(data_parallelism, # load is a measure of approximately how many examples go to each expert gates, load = dp(noisy_top_k_gating, xs_flat, - input_size, num_experts, train, k, @@ -721,3 +755,67 @@ def distributed_moe(data_parallelism, importance = tf.add_n(dp(tf.reduce_sum, gates, 0)) loss = loss_coef * (cv_squared(importance) + cv_squared(load)) return ys, loss + + +def local_moe(x, + train, + expert_fn, + num_experts, + k=2, + loss_coef=1e-2, + pass_x=True, + pass_gates=False, + additional_dispatch_params=None, + name=None): + """Call a local mixture of experts. + + Args: + x: a tensors with shape [... , input_size] + train: a boolean scalar. + expert_fn: a function. + num_experts: an integer - number of experts + k: an integer - how many experts to use for each batch element + loss_coef: a scalar - multiplier on load-balancing losses + pass_x: a boolean. If true, x will also be dispatched to the experts. + pass_gates: a boolean. If true, gates will be passed to experts. Might be + necessary when dealing with sparse encoder-encoder decoder attention + additional_dispatch_params: The extra tensors that need to be sent to each + expert. Examples include batch batch coordinates (see + common_attention.local_expert_attention) + name: a string + + Returns: + y: a tensor. Has the same shape as x, except for the last dimension, + which is output_size. + extra_training_loss: a scalar. This should be added into the overall + training loss of the model. The backpropagation of this loss + encourages all experts to be approximately equally used across a batch. + """ + with tf.variable_scope(name, default_name="local_moe"): + x_flat = flatten_all_but_last(x) + # The gates indicate which batch elements go to which tensors. 
+ # load is a measure of approximately how many examples go to each expert + gates, load = noisy_top_k_gating( + x_flat, + num_experts, + train, + k, + initializer=tf.zeros_initializer(), + noisy_gating=True, + noise_epsilon=1e-2) + # This magic object helps us shuffle data between datashards and experts. + dispatcher = SparseDispatcher(num_experts, gates) + expert_kwargs = {} + if pass_x: + expert_kwargs["x"] = dispatcher.dispatch(x_flat) + if pass_gates: + expert_kwargs["gates"] = dispatcher.expert_to_gates() + for k, v in six.iteritems(additional_dispatch_params or {}): + expert_kwargs[k] = dispatcher.dispatch(flatten_all_but_last(v)) + ep = Parallelism([DEFAULT_DEV_STRING] * num_experts) + expert_outputs = ep(expert_fn, **expert_kwargs) + y_flat = dispatcher.combine(expert_outputs) + y = reshape_like(y_flat, x) + importance = tf.reduce_sum(gates, 0) + loss = loss_coef * (cv_squared(importance) + cv_squared(load)) + return y, loss diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index fd82adc30..e5cb88ddf 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -196,9 +196,12 @@ def problem_metric_fn(predictions, labels, weights): class_output = "image" in problem_name and "coco" not in problem_name real_output = "gene_expression" in problem_name - if model_hparams.prepend_inputs_to_targets: + if model_hparams.prepend_mode != "none": + assert ( + model_hparams.prepend_mode == "prepend_inputs_masked_attention" or + model_hparams.prepend_mode == "prepend_inputs_full_attention") assert not class_output - weights_fn = common_layers.weights_second_part + weights_fn = common_layers.weights_prepend_inputs_to_targets elif class_output or real_output: weights_fn = common_layers.weights_all else: diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 6ce650ac3..f5d83cbf1 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -44,7 +44,6 @@ class MyModel(T2TModel): from __future__ import division from __future__ import print_function -import collections import inspect import re @@ -391,17 +390,18 @@ def create_modality(modality_spec, model_hparams): return retrieval_fns[modality_type](modality_name)(model_hparams, vocab_size) -def _hparams_help_string(): - hparams_names = list_hparams() - prefixes = zip([name.split("_")[0] for name in hparams_names], hparams_names) - names_by_prefix = collections.defaultdict(list) - for (prefix, full_name) in prefixes: - names_by_prefix[prefix].append(full_name) - return "\n".join( - sorted([ - " * %s: %s" % (prefix, sorted(names)) - for prefix, names in six.iteritems(names_by_prefix) - ])) +def display_list_by_prefix(names_list, starting_spaces=0): + """Creates a help string for names_list grouped by prefix.""" + cur_prefix, result_lines = None, [] + space = " " * starting_spaces + for name in sorted(names_list): + split = name.split("_", 1) + prefix = split[0] + if cur_prefix != prefix: + result_lines.append(space + prefix + ":") + cur_prefix = prefix + result_lines.append(space + " * " + name) + return "\n".join(result_lines) def help_string(): @@ -410,24 +410,29 @@ def help_string(): Registry contents: ------------------ - Models: %s + Models: +%s - HParams (by model): + HParams: %s - RangedHParams: %s + RangedHParams: +%s - Modalities: %s + Modalities: +%s - Problems: %s + Problems: +%s """ - m, rhp, mod, probs = [ - sorted(entries) + m, hp, rhp, mod, probs = [ + display_list_by_prefix(entries, starting_spaces=4) for entries in [ list_models(), 
+ list_hparams(), list_ranged_hparams(), list_modalities(), list_problems() ] ] - return help_str % (m, _hparams_help_string(), rhp, mod, probs) + return help_str % (m, hp, rhp, mod, probs)
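The new rev_transformer.py above relies on the additive coupling that rev_block.rev_block applies across the two channel halves: y1 = x1 + f(x2) and y2 = x2 + g(y1), with f the self-attention sublayer and g the feed-forward sublayer. Because the coupling is exactly invertible, activations can be recomputed on the backward pass instead of stored, which is what lets rev_transformer_big increase the batch size and model size. A minimal NumPy sketch of the forward map and its inverse; f and g here are toy stand-ins, not the real sublayers:

import numpy as np


def rev_layer_forward(x1, x2, f, g):
  # Additive coupling used by the reversible layers: y1 = x1 + f(x2), y2 = x2 + g(y1).
  y1 = x1 + f(x2)
  y2 = x2 + g(y1)
  return y1, y2


def rev_layer_inverse(y1, y2, f, g):
  # Exact inverse: the inputs are recomputed from the outputs,
  # so intermediate activations need not be kept in memory.
  x2 = y2 - g(y1)
  x1 = y1 - f(x2)
  return x1, x2


rng = np.random.RandomState(0)
x1, x2 = rng.randn(4, 8), rng.randn(4, 8)
f = np.tanh            # toy stand-in for the attention sublayer
g = lambda t: 0.5 * t  # toy stand-in for the feed-forward sublayer
y1, y2 = rev_layer_forward(x1, x2, f, g)
r1, r2 = rev_layer_inverse(y1, y2, f, g)
print(np.allclose(r1, x1), np.allclose(r2, x2))  # True True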
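The expanded SparseDispatcher docstring in expert_utils.py describes the dispatch/combine contract shared by local_moe and distributed_moe: batch element b is routed to expert e iff gates[b, e] != 0, and the expert outputs are summed back per element, weighted by the gate values. A small NumPy sketch of that contract with hand-picked gates and toy experts (the real gates come from noisy_top_k_gating, which this does not model):

import numpy as np


def dispatch(x, gates):
  # Expert e receives exactly the rows b with gates[b, e] != 0.
  return [x[np.nonzero(gates[:, e])[0]] for e in range(gates.shape[1])]


def combine(expert_outputs, gates):
  # Sum the expert outputs per batch element, weighted by the gate values.
  y = np.zeros((gates.shape[0], expert_outputs[0].shape[-1]))
  for e in range(gates.shape[1]):
    rows = np.nonzero(gates[:, e])[0]
    y[rows] += gates[rows, e][:, None] * expert_outputs[e]
  return y


gates = np.array([[0.7, 0.3, 0.0],
                  [0.0, 0.4, 0.6]])   # 2 batch elements, 3 experts, k=2
x = np.arange(4.0).reshape(2, 2)      # [batch, depth], already flattened
experts = [lambda t: t + 1.0, lambda t: 2.0 * t, lambda t: -t]  # toy experts
expert_outputs = [fn(inp) for fn, inp in zip(experts, dispatch(x, gates))]
print(combine(expert_outputs, gates))  # weighted recombination per element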