diff --git a/setup.py b/setup.py index ff1503990..dd80dfd48 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.8', + version='1.1.9', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 30784fa60..19de46fbf 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -66,6 +66,8 @@ flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for " "registered Problems.") flags.DEFINE_integer("max_cases", 0, "Maximum number of cases to generate (unbounded if 0).") +flags.DEFINE_bool("only_list", False, + "If true, we only list the problems that will be generated.") flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") flags.DEFINE_integer("task_id", -1, "For distributed data generation.") flags.DEFINE_string("t2t_usr_dir", "", @@ -81,33 +83,33 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "wmt_parsing_tokens_8k": ( + "parsing_english_ptb8k": ( lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13), lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 2**13)), - "wsj_parsing_tokens_16k": ( + "parsing_english_ptb16k": ( lambda: wsj_parsing.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**14, 2**9), lambda: wsj_parsing.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 2**14, 2**9)), - "wmt_ende_bpe32k": ( + "translate_ende_wmt_bpe32k": ( lambda: wmt.ende_bpe_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True), lambda: wmt.ende_bpe_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, False)), - "lm1b_32k": ( + "languagemodel_1b32k": ( lambda: lm1b.generator(FLAGS.tmp_dir, True), lambda: lm1b.generator(FLAGS.tmp_dir, False) ), - "lm1b_characters": ( + "languagemodel_1b_characters": ( lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True), lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True) ), "image_celeba_tune": ( lambda: image.celeba_generator(FLAGS.tmp_dir, 162770), lambda: image.celeba_generator(FLAGS.tmp_dir, 19867, 162770)), - "snli_32k": ( + "inference_snli32k": ( lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), ), @@ -181,7 +183,11 @@ def main(_): "Data will be written to default data_dir=%s.", FLAGS.data_dir) - tf.logging.info("Generating problems:\n * %s\n" % "\n * ".join(problems)) + tf.logging.info("Generating problems:\n%s" + % registry.display_list_by_prefix(problems, + starting_spaces=4)) + if FLAGS.only_list: + return for problem in problems: set_random_seed() @@ -210,7 +216,7 @@ def generate_data_for_problem(problem): def generate_data_for_registered_problem(problem_name): - tf.logging.info("Generating training data for %s.", problem_name) + tf.logging.info("Generating data for %s.", problem_name) if FLAGS.num_shards: raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) diff --git a/tensor2tensor/data_generators/cipher.py b/tensor2tensor/data_generators/cipher.py index a11776b84..977174880 100644 --- a/tensor2tensor/data_generators/cipher.py +++ b/tensor2tensor/data_generators/cipher.py @@ -29,7 +29,7 @@ @registry.register_problem -class CipherShift5(algorithmic.AlgorithmicProblem): +class 
AlgorithmicCipherShift5(algorithmic.AlgorithmicProblem): """Shift cipher.""" @property @@ -62,7 +62,7 @@ def dev_length(self): @registry.register_problem -class CipherVigenere5(algorithmic.AlgorithmicProblem): +class AlgorithmicCipherVigenere5(algorithmic.AlgorithmicProblem): """Vinegre cipher.""" @property @@ -95,7 +95,7 @@ def dev_length(self): @registry.register_problem -class CipherShift200(CipherShift5): +class AlgorithmicCipherShift200(AlgorithmicCipherShift5): """Shift cipher.""" @property @@ -110,7 +110,7 @@ def distribution(self): @registry.register_problem -class CipherVigenere200(CipherVigenere5): +class AlgorithmicCipherVigenere200(AlgorithmicCipherVigenere5): """Vinegre cipher.""" @property diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 438c116c8..1e26b000c 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -209,8 +209,8 @@ def generator_target(): } -@registry.register_problem("desc2code_py") -class Desc2CodePyProblem(Desc2CodeProblem): +@registry.register_problem +class ProgrammingDesc2codePy(Desc2CodeProblem): """Description2Code for python problem.""" @property @@ -222,8 +222,8 @@ def preprocess_target(self, target): return target.replace("\t", " ") -@registry.register_problem("desc2code_cpp") -class Desc2CodeCppProblem(Desc2CodeProblem): +@registry.register_problem +class ProgrammingDesc2codeCpp(Desc2CodeProblem): """Description2Code for C++ problem.""" @property diff --git a/tensor2tensor/data_generators/desc2code_test.py b/tensor2tensor/data_generators/desc2code_test.py index 24b7568d0..79992296b 100644 --- a/tensor2tensor/data_generators/desc2code_test.py +++ b/tensor2tensor/data_generators/desc2code_test.py @@ -47,7 +47,7 @@ class Desc2codeTest(tf.test.TestCase): def testCppPreprocess(self): """Check that the file correctly preprocess the code source.""" - cpp_pb = desc2code.Desc2CodeCppProblem() + cpp_pb = desc2code.ProgrammingDesc2codeCpp() self.assertEqual( # Add space beween two lines cpp_pb.preprocess_target("firstline//comm1\nsecondline//comm2\n"), diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 341a20c71..0607aad15 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -176,8 +176,8 @@ def eval_metrics(self): return [metrics.Metrics.LOG_POISSON, metrics.Metrics.R2] -@registry.register_problem("gene_expression_cage10") -class GeneExpressionCAGE10(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionCage10(GeneExpressionProblem): @property def download_url(self): @@ -188,8 +188,8 @@ def h5_file(self): return "cage10.h5" -@registry.register_problem("gene_expression_gm12878") -class GeneExpressionGM12878(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionGm12878(GeneExpressionProblem): @property def download_url(self): @@ -200,8 +200,8 @@ def h5_file(self): return "gm12878.h5" -@registry.register_problem("gene_expression_l262k") -class GeneExpressionL262k(GeneExpressionProblem): +@registry.register_problem +class GenomicsExpressionL262k(GeneExpressionProblem): @property def h5_file(self): diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index 591b205da..4fb0424bb 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -62,8 +62,8 @@ def 
tabbed_parsing_character_generator(tmp_dir, train): return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): +@registry.register_problem +class ParsingIcelandic16k(problem.Problem): """Problem spec for parsing tokenized Icelandic text to constituency trees.""" @property diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index f61f85b54..d9a6be6ff 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -214,10 +214,21 @@ def dataset_filename(self): def is_small(self): return True # Modalities like for CIFAR. - def preprocess_examples(self, examples, mode): - examples = imagenet_preprocess_examples(examples, mode) - examples["inputs"] = tf.to_int64( - tf.image.resize_images(examples["inputs"], [32, 32])) + @property + def num_classes(self): + return 1000 + + def preprocess_examples(self, examples, mode, hparams): + # Just resize with area. + if self._was_reversed: + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32], + tf.image.ResizeMethod.AREA)) + else: + examples = imagenet_preprocess_examples(examples, mode) + examples["inputs"] = tf.to_int64( + tf.image.resize_images(examples["inputs"], [32, 32])) + return examples def image_generator(images, labels): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 7a84aac93..60b1e842b 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -98,7 +98,7 @@ def preprocess_examples_common(examples, hparams): examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] if hparams.max_target_seq_length > 0: examples["targets"] = examples["targets"][:hparams.max_target_seq_length] - if hparams.prepend_inputs_to_targets: + if hparams.prepend_mode != "none": examples["targets"] = tf.concat( [examples["inputs"], [0], examples["targets"]], 0) return examples @@ -410,11 +410,12 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_files( self.generator(data_dir, tmp_dir, True), all_paths) generator_utils.shuffle_dataset(all_paths) - generator_utils.generate_dataset_and_shuffle( - self.generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, self.num_shards, shuffled=False), - self.generator(data_dir, tmp_dir, False), - self.dev_filepaths(data_dir, self.num_dev_shards, shuffled=False)) + else: + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.num_dev_shards, shuffled=False)) def feature_encoders(self, data_dir): if self.is_character_level: diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index b33438d6d..4a6053613 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -492,16 +492,16 @@ def image_celeba(unused_model_hparams): lambda p: audio_wsj_tokens(p, 2**13), "audio_wsj_tokens_8k_test": lambda p: audio_wsj_tokens(p, 2**13), - "lm1b_characters": + "languagemodel_1b_characters": lm1b_characters, - "lm1b_32k": + "languagemodel_1b32k": lm1b_32k, - "wmt_parsing_tokens_8k": + "parsing_english_ptb8k": lambda p: wmt_parsing_tokens(p, 2**13), - "wsj_parsing_tokens_16k": 
+ "parsing_english_ptb16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), - "wmt_ende_bpe32k": + "translate_ende_wmt_bpe32k": wmt_ende_bpe32k, "image_celeba_tune": image_celeba, diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index b9014bcd6..893c2b77c 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -157,8 +157,8 @@ def _generator(self, filename, encoder): yield {"inputs": [0], "targets": tok} -@registry.register_problem("lm_ptb_10k") -class LmPtb10k(PTBProblem): +@registry.register_problem +class LanguagemodelPtb10k(PTBProblem): """A class for generating PTB data, 10k vocab.""" @property @@ -167,7 +167,7 @@ def is_character_level(self): @registry.register_problem -class LmPtbCharacters(PTBProblem): +class LanguagemodelPtbCharacters(PTBProblem): """A class for generating PTB data, character-level.""" @property diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 1e427dbe8..3cdbac5db 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -81,8 +81,8 @@ def _page_title(page): @registry.register_problem -class Wiki32k(problem.Text2TextProblem): - """A class for generating PTB data.""" +class LanguagemodelWikiFull32k(problem.Text2TextProblem): + """A language model on full English Wikipedia.""" @property def is_character_level(self): @@ -129,3 +129,12 @@ def generator(self, data_dir, tmp_dir, _): encoded = encoder.encode(page) + [EOS] encoded_title = encoder.encode(title) + [EOS] yield {"inputs": encoded_title, "targets": encoded} + + +@registry.register_problem +class LanguagemodelWikiFull8k(problem.Text2TextProblem): + """A language model on full English Wikipedia.""" + + @property + def targeted_vocab_size(self): + return 2**13 # 8192 diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 52990eb5f..93fc27ac5 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Data generators for WMT data-sets.""" +"""Data generators for translation data-sets.""" from __future__ import absolute_import from __future__ import division @@ -39,8 +39,8 @@ EOS = text_encoder.EOS_ID -class WMTProblem(problem.Text2TextProblem): - """Base class for WMT problems.""" +class TranslateProblem(problem.Text2TextProblem): + """Base class for translation problems.""" @property def is_character_level(self): @@ -381,8 +381,8 @@ def _compile_data(tmp_dir, datasets, filename): return filename -@registry.register_problem("wmt_ende_tokens_8k") -class WMTEnDeTokens8k(WMTProblem): +@registry.register_problem +class TranslateEndeWmt8k(TranslateProblem): """Problem spec for WMT En-De translation.""" @property @@ -407,16 +407,16 @@ def target_space_id(self): return problem.SpaceID.DE_TOK -@registry.register_problem("wmt_ende_tokens_32k") -class WMTEnDeTokens32k(WMTEnDeTokens8k): +@registry.register_problem +class TranslateEndeWmt32k(TranslateEndeWmt8k): @property def targeted_vocab_size(self): return 2**15 # 32768 -@registry.register_problem("wmt_ende_characters") -class WMTEnDeCharacters(WMTProblem): +@registry.register_problem +class TranslateEndeWmtCharacters(TranslateProblem): """Problem spec for WMT En-De translation.""" @property @@ -440,8 +440,8 @@ def target_space_id(self): return problem.SpaceID.DE_CHR -@registry.register_problem("wmt_zhen_tokens_8k") -class WMTZhEnTokens8k(WMTProblem): +@registry.register_problem +class TranslateEnzhWmt8k(TranslateProblem): """Problem spec for WMT Zh-En translation.""" @property @@ -466,7 +466,10 @@ def generator(self, data_dir, tmp_dir, train): target_vocab_size, target_datasets) tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag) - return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", + # We generate English->X data by convention, to train reverse translation + # just add the "_rev" suffix to the problem name, e.g., like this. 
+ # --problems=translate_enzh_wmt8k_rev + return bi_vocabs_token_generator(data_path + ".lang2", data_path + ".lang1", source_vocab, target_vocab, EOS) @property @@ -491,8 +494,8 @@ def feature_encoders(self, data_dir): } -@registry.register_problem("wmt_enfr_tokens_8k") -class WMTEnFrTokens8k(WMTProblem): +@registry.register_problem +class TranslateEnfrWmt8k(TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property @@ -517,16 +520,16 @@ def target_space_id(self): return problem.SpaceID.FR_TOK -@registry.register_problem("wmt_enfr_tokens_32k") -class WMTEnFrTokens32k(WMTEnFrTokens8k): +@registry.register_problem +class TranslateEnfrWmt32k(TranslateEnfrWmt8k): @property def targeted_vocab_size(self): return 2**15 # 32768 -@registry.register_problem("wmt_enfr_characters") -class WMTEnFrCharacters(WMTProblem): +@registry.register_problem +class TranslateEnfrWmtCharacters(TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property @@ -550,8 +553,8 @@ def target_space_id(self): return problem.SpaceID.FR_CHR -@registry.register_problem("setimes_mken_tokens_32k") -class SETimesMkEnTokens32k(WMTProblem): +@registry.register_problem +class TranslateEnmkSetimes32k(TranslateProblem): """Problem spec for SETimes Mk-En translation.""" @property @@ -571,7 +574,10 @@ def generator(self, data_dir, tmp_dir, train): source_datasets + target_datasets) tag = "train" if train else "dev" data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag) - return token_generator(data_path + ".lang1", data_path + ".lang2", + # We generate English->X data by convention, to train reverse translation + # just add the "_rev" suffix to the problem name, e.g., like this. + # --problems=translate_enmk_setimes32k_rev + return token_generator(data_path + ".lang2", data_path + ".lang1", symbolizer_vocab, EOS) @property @@ -583,8 +589,8 @@ def target_space_id(self): return problem.SpaceID.EN_TOK -@registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(WMTProblem): +@registry.register_problem +class TranslateEncsWmt32k(TranslateProblem): """Problem spec for WMT English-Czech translation.""" @property @@ -616,8 +622,8 @@ def target_space_id(self): return problem.SpaceID.CS_TOK -@registry.register_problem("wmt_encs_characters") -class WMTEnCsCharacters(WMTProblem): +@registry.register_problem +class TranslateEncsWmtCharacters(TranslateProblem): """Problem spec for WMT En-Cs character-based translation.""" @property diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 4f1273163..2c3e4b71f 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -17,12 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from functools import partial import math # Dependency imports from tensor2tensor.layers import common_layers +from tensor2tensor.utils import expert_utils import tensorflow as tf @@ -206,6 +208,39 @@ def attention_bias_ignore_padding(memory_padding): return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1) +def attention_bias_prepend_inputs_full_attention(padding): + """Create a bias tensor for prepend_mode="prepend_inputs_full_attention". + + See prepend_inputs in common_hparams.py. + + Produces a bias tensor to be used in self-attention. + + This bias tensor allows for full connectivity in the "inputs" part of + the sequence and masked connectivity in the targets part. 
+ + Args: + padding: a float `Tensor` with shape [batch, length] with + ones in positions corresponding to padding. In each row, a single + padding position separates the input part from the target part. + + Returns: + a `Tensor` with shape [batch, 1, length, length]. + """ + # Everything past the first padding position is part of the target. + # This Tensor has zeros for the source portion and separator, + # and ones for the target portion. + in_target = tf.cumsum(padding, axis=1, exclusive=True) + # The position within the target, or 0 if part of the source. + target_pos = tf.cumsum(in_target, axis=1) + # A position with a lesser target_pos cannot see a position with greater + # target_pos. + illegal_connections = tf.greater(tf.expand_dims(target_pos, 1), + tf.expand_dims(target_pos, 2)) + bias = tf.to_float(illegal_connections) * -1e9 + bias = tf.expand_dims(bias, 1) + return bias + + def attention_bias_proximal(length): """Bias for self-attention to encourage attention to close positions. @@ -646,6 +681,70 @@ def gather_blocks(x, indices): return tf.reshape(output, v_shape) +def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth, q_filter_width=1, kv_filter_width=1, + q_padding="VALID", kv_padding="VALID"): + """Computes query, key and value. + + Args: + query_antecedent: a Tensor with shape [batch, length_q, channels] + memory_antecedent: a Tensor with shape [batch, length_m, channels] + total_key_depth: an integer + total_value_depth: and integer + q_filter_width: An integer specifying how wide you want the query to be. + kv_filter_width: An integer specifying how wide you want the keys and values + to be. + q_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + kv_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + + Returns: + q, k, v : [batch, length, depth] tensors + """ + if memory_antecedent is None and q_filter_width == kv_filter_width == 1: + # self attention with single position q, k, and v + combined = common_layers.conv1d( + query_antecedent, + total_key_depth * 2 + total_value_depth, + 1, + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=2) + return q, k, v + + if memory_antecedent is None: + # self attention + q = common_layers.conv1d( + query_antecedent, + total_key_depth, + q_filter_width, + padding=q_padding, + name="q_transform") + kv_combined = common_layers.conv1d( + query_antecedent, + total_key_depth + total_value_depth, + kv_filter_width, + padding=kv_padding, + name="kv_transform") + k, v = tf.split(kv_combined, [total_key_depth, total_value_depth], + axis=2) + return q, k, v + + # encoder-decoder attention + q = common_layers.conv1d( + query_antecedent, total_key_depth, q_filter_width, padding=q_padding, + name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + padding=kv_padding, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + + return q, k, v + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -658,6 +757,10 @@ def multihead_attention(query_antecedent, attention_type="dot_product", block_length=128, block_width=128, + q_filter_width=1, + kv_filter_width=1, + q_padding="VALID", + kv_padding="VALID", name=None): """Multihead scaled-dot-product attention with input/output transformations. 
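A small NumPy re-derivation (illustration only, not part of the diff) of the cumsum trick used by attention_bias_prepend_inputs_full_attention above:

    import numpy as np

    # One toy row: three input positions, one separator (the single padding 1),
    # then two target positions.
    padding = np.array([[0., 0., 0., 1., 0., 0.]])      # [batch=1, length=6]
    in_target = np.cumsum(padding, axis=1) - padding    # exclusive cumsum
    # -> [[0, 0, 0, 0, 1, 1]]  1 marks positions in the targets portion
    target_pos = np.cumsum(in_target, axis=1)
    # -> [[0, 0, 0, 0, 1, 2]]  position within the targets, 0 for inputs/separator
    # illegal[b, query, key] is True when the key is further along in the targets
    # than the query, matching tf.greater(expand_dims(..., 1), expand_dims(..., 2)).
    illegal = target_pos[:, None, :] > target_pos[:, :, None]
    bias = illegal.astype(np.float32) * -1e9             # then expanded to [b, 1, len, len]
    # The inputs portion (target_pos == 0) is fully connected to itself, while
    # each target position only sees keys at or before its own target position.
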
@@ -676,6 +779,12 @@ def multihead_attention(query_antecedent, "local_unmasked" block_length: an integer - relevant for "local_mask_right" block_width: an integer - relevant for "local_unmasked" + q_filter_width: An integer specifying how wide you want the query to be. + kv_filter_width: An integer specifying how wide you want the keys and values + to be. + q_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + kv_padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding. + name: an optional string Returns: @@ -691,30 +800,14 @@ def multihead_attention(query_antecedent, if total_value_depth % num_heads != 0: raise ValueError("Value depth (%d) must be divisible by the number of " "attention heads (%d)." % (total_value_depth, num_heads)) - with tf.variable_scope( name, default_name="multihead_attention", values=[query_antecedent, memory_antecedent]): - if memory_antecedent is None: - # self attention - combined = common_layers.conv1d( - query_antecedent, - total_key_depth * 2 + total_value_depth, - 1, - name="qkv_transform") - q, k, v = tf.split( - combined, [total_key_depth, total_key_depth, total_value_depth], - axis=2) - else: - q = common_layers.conv1d( - query_antecedent, total_key_depth, 1, name="q_transform") - combined = common_layers.conv1d( - memory_antecedent, - total_key_depth + total_value_depth, - 1, - name="kv_transform") - k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + q, k, v = compute_qkv(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth, q_filter_width, kv_filter_width, + q_padding, kv_padding) + q = split_heads(q, num_heads) k = split_heads(k, num_heads) v = split_heads(v, num_heads) @@ -861,5 +954,106 @@ def parameter_attention(x, y = tf.reshape(y, [batch_size, length, total_value_depth]) y.set_shape([None, None, total_value_depth]) y = common_layers.conv1d(y, output_depth, 1, name="output_transform") - return y + + +def coordinate_tensor(shape, axis): + """Return a tensor with given shape containing coordinte along given axis. + + Args: + shape: a Tensor representing the shape of the output Tensor + axis: an integer + + Returns: + A tensor with shape shape and type tf.int32, where each elements its + coordinate along the given axis. + """ + + r = tf.range(shape[axis]) + r_shape = tf.one_hot( + axis, tf.size(shape), on_value=-1, off_value=1, dtype=tf.int32) + return tf.zeros(shape, dtype=tf.int32) + tf.reshape(r, r_shape) + + +def self_attention_expert(x, batch_coordinate, mask_right=True): + """Implementing attention that runs inside each expert. + + Args: + x: A tensor of shape[batch, depth]. Contains representations from + different positions, which are lexicographically ordered. + batch_coordinate: A tensor of shape [batch, 1] containing the batch + coordinate of each element in x. This is needed to make sure that + positions from different sequences don't attend to each other. + mask_right: A bool. If true, we will not attend to positions on the right, + just as decoder self attention. + + Returns: + out: A tensor of shape [batch, depth]. + example use: + expert_utils.local_moe( + ... 
+ expert_fn=functools.partial(self_attention_expert, mask_right=) + ) + """ + depth = x.get_shape().as_list()[-1] + length = tf.shape(batch_coordinate)[0] + batch_coordinate = tf.squeeze(batch_coordinate, 1) + bias = tf.to_float( + tf.not_equal(tf.expand_dims(batch_coordinate, 1), + tf.expand_dims(batch_coordinate, 0))) * -1e9 + if mask_right: + bias += tf.reshape( + attention_bias_lower_triangle(length), [length, length]) + # bias has shape [length, length] + bias = tf.reshape(bias, [1, 1, length, length]) + x = tf.reshape(x, [1, length, depth]) + out = multihead_attention(x, + None, + bias, + total_key_depth=depth, + total_value_depth=depth, + output_depth=depth, + num_heads=1, + dropout_rate=0.0) + out = tf.squeeze(out, 0) + return out + +# functools.partial(self_attention_expert, mask_right=, depth=) + + +def local_expert_attention(x, k, loss_coef, attention_num_experts, train=True, + mask_right=True): + """Attention using a mixture of experts. + + Positions sent to the same expert can attend to each other. + The mixture of experts is "local" in that it is replicated on each + datashard. + + Args: + x: a Tensor with shape [batch, length, depth] + k: The number of experts to dispatch each example to + loss_coef: a scalar. A multiplier for the expert loss + attention_num_experts: The number of experts to use + train: a boolean for the current mode + mask_right: A boolean. If true, we will mask out positions to the right + for self-attention. + + Returns: + y: a Tensor with shape [batch, length, depth] + loss: a Scalar + """ + with tf.variable_scope("local_expert_attention"): + additional_dispatch_params = { + "batch_coordinate": tf.expand_dims( + coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) + } + return expert_utils.local_moe( + x, + train, + partial(self_attention_expert, mask_right=mask_right), + attention_num_experts, + k=k, + loss_coef=loss_coef, + pass_x=True, + pass_gates=False, + additional_dispatch_params=additional_dispatch_params) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 0ed62685f..6bb4d3e9d 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -124,11 +124,24 @@ def basic_params1(): # You can change this behavior by overridding preprocess_examples() method # in your problem class. max_target_seq_length=0, - # Treat a seq-to-seq problem as a language model by prepending the - # inputs to the targets. During training, the loss is on both the - # inputs and the targets. During eval, metrics are computed only on the - # target portion. - prepend_inputs_to_targets=int(False), + # This flag allows us to optionally treat a seq-to-seq problem + # as a language model. Legal values are: + # + # "none" - Do not prepend the inputs to the targets. + # "prepend_inputs_masked_attention" + # replace "targets" in preprocessing with + # tf.concat([inputs, [0], targets], axis=1) + # i.e. we prepend the inputs to the targets with a single + # padding token in between. Use masked self-attention on the + # entire resulting sequence. During training, we compute losses on + # the combined sequence. During eval, we compute the metrics + # on only the targets portion. + # "prepend_inputs_full_attention" + # similar to the previous option except that each + # position in the inputs portion can see the + # entire inputs portion. This removes the challenge of + # autoregressively predicting the inputs portion. 
+ prepend_mode="none", ) diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index e9b195195..8621ddcb1 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1361,10 +1361,11 @@ def weights_nonzero(labels): return tf.to_float(tf.not_equal(labels, 0)) -def weights_second_part(labels): - """Weights function for 'prepend_inputs_to_targets'. +def weights_prepend_inputs_to_targets(labels): + """Assign weight 1.0 to only the "targets" portion of the labels. Weight 1.0 is assigned to all nonzero labels past the first zero. + See prepend_mode in common_hparams.py Args: labels: A Tensor of int32s. @@ -1372,7 +1373,7 @@ def weights_second_part(labels): Returns: A Tensor of floats. """ - past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0))) + past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0)), axis=1) nonzero = tf.to_float(labels) return tf.to_float(tf.not_equal(past_first_zero * nonzero, 0)) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 84f9adbe7..01728ba24 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -510,6 +510,10 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" + @property + def top_dimensionality(self): + return 256 + @property def targets_dimensionality(self): return self._vocab_size diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py index d6fb95cf3..4dd1cde03 100644 --- a/tensor2tensor/layers/rev_block.py +++ b/tensor2tensor/layers/rev_block.py @@ -23,6 +23,7 @@ from __future__ import division from __future__ import print_function +import random import re # Dependency imports @@ -34,122 +35,304 @@ LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*") -def _rev_layer_forward(xs, f, g): +def _acc_grads(*lists_of_grads): + """Accumulates lists of gradients.""" + acc_grads = [] + for grads in zip(*lists_of_grads): + grads = [g for g in grads if g is not None] + if grads: + acc_grads.append(tf.add_n(grads)) + else: + acc_grads.append(None) + return acc_grads + + +def _rev_layer_forward(xs, f, g, f_side_input, g_side_input, + gate_outputs=False): """Forward for 1 reversible layer.""" x1, x2 = xs with tf.variable_scope("f"): - y1 = x1 + f(x2) + y1 = x1 + (f(x2, f_side_input) if f_side_input else f(x2)) with tf.variable_scope("g"): - y2 = x2 + g(y1) - return tf.tuple([y1, y2]) + y2 = x2 + (g(y1, g_side_input) if g_side_input else g(y1)) + if gate_outputs: + return tf.tuple([y1, y2]) + else: + return (y1, y2) -def _rev_layer_backward(ys, grad_ys, f, g, f_vars, g_vars): +def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars, + g_side_input): """Backprop for 1 layer.""" y1, y2 = ys grad_y1, grad_y2 = grad_ys # Reconstruct intermediates and inputs (x1, x2) - # stop_gradients required on y1 and x2 to prevent infinite recursion into this + # stop_gradients required on fn inputs to prevent infinite recursion into this # grad function on the calls to tf.gradients. 
y1_stop = tf.stop_gradient(y1) + g_side_input = [tf.stop_gradient(t) for t in g_side_input] with tf.variable_scope("g"): - gy1 = g(y1_stop) + gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop) x2 = y2 - gy1 x2_stop = tf.stop_gradient(x2) + f_side_input = [tf.stop_gradient(t) for t in f_side_input] with tf.variable_scope("f"): - fx2 = f(x2_stop) + fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop) x1 = y1 - fx2 # Compute gradients wrt to inputs # dL/dy2 * dG(y1)/y1 - grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2, gate_gradients=True)[0] + grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2)[0] grad_x1 = grad_y1 + grad_gy1_y2 - grad_x2 = ( - tf.gradients(fx2, x2_stop, grad_y1, gate_gradients=True)[0] + grad_y2 + - tf.gradients(fx2, x2_stop, grad_gy1_y2, gate_gradients=True)[0]) + grad_x2 = (tf.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 + tf.gradients( + fx2, x2_stop, grad_gy1_y2)[0]) - # Compute gradients wrt to vars in f and g - grad_g_vars = tf.gradients(gy1, g_vars, grad_y2, gate_gradients=True) - grad_f_y1 = tf.gradients(fx2, f_vars, grad_y1, gate_gradients=True) - grad_f_y2 = tf.gradients(fx2, f_vars, grad_gy1_y2, gate_gradients=True) - grad_f_vars = [tf.add_n(grads) for grads in zip(grad_f_y1, grad_f_y2)] + # Compute gradients wrt to vars and side inputs in f and g + grads1 = tf.gradients(gy1, g_vars + g_side_input, grad_y2) + grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):] + grads2 = tf.gradients(fx2, f_vars + f_side_input, grad_y1) + grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):] + grads3 = tf.gradients(fx2, f_vars + f_side_input, grad_gy1_y2) + grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):] + grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2) + + grad_f_side = _acc_grads(grad_f_side1, grad_f_side2) # Put returns in a tuple to ensure a constant memory budget (i.e. don't want # the subsequent layer to start computing and consuming memory based on a # subset of these values). 
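The reconstruction in the backward pass above is what makes the layer "reversible"; a minimal NumPy sketch (toy stand-ins for f and g, purely illustrative) of the round trip:

    import numpy as np

    def f(x): return 2.0 * x + 1.0   # stand-ins for the attention / ffn
    def g(x): return x * x           # sub-layers; any shape-preserving functions

    x1 = np.array([1.0, 2.0]); x2 = np.array([3.0, 4.0])
    # Forward (as in _rev_layer_forward):
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    # Backward recovers the inputs from the outputs, so the x1/x2 activations
    # never need to be stored (as in _rev_layer_backward):
    x2_rec = y2 - g(y1)
    x1_rec = y1 - f(x2_rec)
    assert np.allclose(x1_rec, x1) and np.allclose(x2_rec, x2)
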
- outs = tf.tuple([x1, x2, grad_x1, grad_x2] + grad_f_vars + grad_g_vars) + outs = tf.tuple([x1, x2, grad_x1, grad_x2] + grad_f_vars + grad_g_vars + + grad_f_side + grad_g_side) x1, x2, grad_x1, grad_x2 = outs[:4] - grad_f_vars = outs[4:4 + len(grad_f_vars)] - grad_g_vars = outs[4 + len(grad_f_vars):] - - return (x1, x2), (grad_x1, grad_x2), grad_f_vars, grad_g_vars - - -def _rev_block_forward(x, f, g, num_layers=1, layer_scopes=None, name=None): + grad_f_vars_end = 4 + len(grad_f_vars) + grad_g_vars_end = grad_f_vars_end + len(grad_g_vars) + grad_f_side_end = grad_g_vars_end + len(grad_f_side) + + grad_f_vars = outs[4:grad_f_vars_end] + grad_g_vars = outs[grad_f_vars_end:grad_g_vars_end] + grad_f_side = outs[grad_g_vars_end:grad_f_side_end] + grad_g_side = outs[grad_f_side_end:] + + return ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side), + (grad_g_vars, grad_g_side)) + + +def _rev_block_forward(x1, + x2, + f, + g, + num_layers=1, + f_side_input=None, + g_side_input=None, + layer_scopes=None, + gate_outputs=False, + name=None): """Forward for a series of reversible layers.""" - x1, x2 = tf.split(x, 2, axis=len(x.get_shape()) - 1) out = (x1, x2) with tf.variable_scope(name, default_name="revblock"): for i in xrange(num_layers): with tf.variable_scope("revlayer_%d" % i) as layer_vs: if layer_scopes is not None: layer_scopes.append(layer_vs) - out = _rev_layer_forward(out, f, g) + out = _rev_layer_forward( + out, + f[i], + g[i], + f_side_input, + g_side_input, + gate_outputs=gate_outputs) y1, y2 = out - y = tf.concat([y1, y2], axis=-1) - return y + return y1, y2 + + +def _underlying_variable(t): + """Find the underlying variable ref, ignoring Identity ops.""" + while t.op.type == "Identity": + t = t.op.inputs[0] + if t.dtype == dtypes.float32_ref and "Variable" in t.op.type: + return t + else: + return None + + +def fn_with_custom_grad(grad_fn): + """Decorator to create a subgraph with a custom gradient function. + + The subgraph created by the decorated function is NOT put in a Defun and so + does not suffer from the limitations of the Defun (all subgraph ops on the + same device, no summaries). + + Args: + grad_fn: function with signature + (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars), + all of which are lists of Tensors. + + Returns: + Decorator for function such that the gradient is defined by grad_fn. + """ + + def dec(fn): + + def wrapped(*args): + return _fn_with_custom_grad(fn, args, grad_fn) + + return wrapped + return dec -def rev_block(x, f, g, num_layers=1, is_training=True): + +def _fn_with_custom_grad(fn, inputs, grad_fn): + """Create a subgraph with a custom gradient. + + Args: + fn: function that takes inputs as arguments and produces 1 or more Tensors. + inputs: list, will be passed as fn(*inputs). + grad_fn: function with signature + (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars), + all of which are lists of Tensors. 
+ + Returns: + fn(*inputs) + """ + with tf.variable_scope(None, default_name="fn_with_custom_grad") as vs: + inputs = list(inputs) + outputs = fn(*inputs) + train_vars = list(vs.trainable_variables()) + + if grad_fn is None: + return outputs + else: + if not (isinstance(outputs, tuple) or isinstance(outputs, list)): + outputs = [outputs] + outputs = list(outputs) + + in_types = [t.dtype for t in inputs] + out_types = [t.dtype for t in outputs] + var_types = [t.dtype for t in train_vars] + + def custom_grad_fn(op, *dys): + """Custom grad fn applying grad_fn for identity Defun.""" + dys = list(dys) + fn_inputs = op.inputs[:len(inputs)] + fn_vars = op.inputs[len(inputs):len(inputs) + len(train_vars)] + fn_outputs = op.inputs[len(inputs) + len(train_vars):] + assert len(fn_outputs) == len(outputs) + assert len(fn_outputs) == len(dys) + + grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys) + grad_outputs = [None] * len(fn_outputs) + return tuple(grad_inputs + grad_vars + grad_outputs) + + # The Defun takes as input the original inputs, the trainable variables + # created in fn, and the outputs. In the forward it passes through the + # outputs. In the backwards, it produces gradients for the original inputs + # and the trainable variables. + @function.Defun( + *(in_types + var_types + out_types), + func_name="identity_custom_grad%d" % random.randint(1, 10**9), + python_grad_func=custom_grad_fn, + shape_func=lambda _: [t.get_shape() for t in outputs]) + def identity(*args): + outs = args[len(inputs) + len(train_vars):] + return tuple([tf.identity(t) for t in outs]) + + id_out = identity(*(inputs + train_vars + outputs)) + return id_out + + +def rev_block(x1, + x2, + f, + g, + num_layers=1, + f_side_input=None, + g_side_input=None, + is_training=True): """A block of reversible residual layers. A reversible residual layer is defined as: ``` - x1, x2 = tf.split(x, 2, axis=-1) - y1 = x1 + f(x2) - y2 = x2 + g(y1) - y = tf.concat([y1, y2], axis=-1) + y1 = x1 + f(x2, f_side_input) + y2 = x2 + g(y1, g_side_input) ``` + A reversible residual block, defined here, is a series of reversible residual + layers. + + Limitations: + * f and g must not close over any Tensors; all side inputs to f and g should + be passed in with f_side_input and g_side_input which will be forwarded to + f and g. + * f and g must not change the dimensionality of their inputs in order for the + addition in the equations above to work. + Args: - x: a float Tensor, input, will be split evenly across the last dim. - f: a function, (Tensor) -> (Tensor). Should not change the shape of the - Tensor. May create variables. Should NOT close over any Tensor values. - g: a function, (Tensor) -> (Tensor). Should not change the shape of the - Tensor. May create variables. Should NOT close over any Tensor values. + x1: a float Tensor. + x2: a float Tensor. + f: a function, (Tensor) -> (Tensor) (or list of such of length num_layers). + Should not change the shape of the Tensor. Expected to create variables. + See f_side_input if there are side inputs. + g: a function, (Tensor) -> (Tensor) (or list of such of length num_layers). + Should not change the shape of the Tensor. Expected to create variables. + See g_side_input if there are side inputs. num_layers: int, number of reversible residual layers. Each layer will apply f and g according to the equations above, with new variables in each layer. + f_side_input: list of Tensors, side input to f. If not None, signature of f + should be (Tensor, list) -> (Tensor). 
+ g_side_input: list of Tensors, side input to g. If not None, signature of g + should be (Tensor, list) -> (Tensor). is_training: bool, whether to actually use the efficient backprop codepath. Returns: - y: a float Tensor, output. + y1, y2: tuple of float Tensors. """ + if f_side_input is None: + f_side_input = [] + if g_side_input is None: + g_side_input = [] + if isinstance(f, list): + assert len(f) == num_layers + else: + f = [f] * num_layers + if isinstance(g, list): + assert len(g) == num_layers + else: + g = [g] * num_layers + + # Filled by the forward function below layer_scopes = [] - def rev_block_grad(op, grad_y): + def custom_grad_fn(inputs, variables, ys, grad_ys): """Custom gradient fn for a block of reversible residual layers.""" - y = op.outputs[0] - ys = tf.split(y, 2, axis=len(y.get_shape()) - 1) - grad_ys = tf.split(grad_y, 2, axis=len(y.get_shape()) - 1) + side_inputs = inputs[2:] + f_side_idxs = [None] * len(f_side_input) + g_side_idxs = [None] * len(g_side_input) + assert len(side_inputs) == len(f_side_input) + len(g_side_input) + + for i, t in enumerate(side_inputs): + if t in f_side_input: + f_side_idxs[f_side_input.index(t)] = i + elif t in g_side_input: + g_side_idxs[g_side_input.index(t)] = i + else: + assert False - # Find all variables from f and from g - # Keep track of their positions in all_vars - all_vars = op.inputs[1:] f_vars = [[] for _ in range(num_layers)] g_vars = [[] for _ in range(num_layers)] f_vars_idxs = [[] for _ in range(num_layers)] g_vars_idxs = [[] for _ in range(num_layers)] - for i, v in enumerate(all_vars): - ref = v.op.inputs[0] - assert ref.dtype == dtypes.float32_ref - regex = LAYER_RE.match(v.name) + for i, t in enumerate(variables): + ref = _underlying_variable(t) + + # Use the name to identify the layer number and function (f or g) + regex = LAYER_RE.match(ref.name) layer_no = int(regex.group(1)) fn_name = regex.group(2) if fn_name == "f": @@ -160,45 +343,71 @@ def rev_block_grad(op, grad_y): g_vars[layer_no].append(ref) g_vars_idxs[layer_no].append(i) - f_grads = [] - g_grads = [] + f_var_grads = [] + g_var_grads = [] + f_side_grads = [] + g_side_grads = [] - # Reverse state containers to go backward + # Reverse variable containers to go backward layer_scopes.reverse() f_vars.reverse() g_vars.reverse() + f.reverse() + g.reverse() for i in xrange(num_layers): with tf.variable_scope(layer_scopes[i], reuse=True): - ys, grad_ys, grad_f_vars, grad_g_vars = _rev_layer_backward( - ys, grad_ys, f, g, f_vars[i], g_vars[i]) - f_grads.append(grad_f_vars) - g_grads.append(grad_g_vars) - - # Gradients were collected in reverse layer order - f_grads.reverse() - g_grads.reverse() - - # Reorder the gradients so they match the original order of all_vars - var_grads = [None] * len(all_vars) - for idxs, grads in zip(f_vars_idxs, f_grads) + zip(g_vars_idxs, g_grads): + + ys, grad_ys, f_ret, g_ret = _rev_layer_backward(ys, grad_ys, f[i], g[i], + f_vars[i], f_side_input, + g_vars[i], g_side_input) + + grad_f_vars, grad_f_side = f_ret + grad_g_vars, grad_g_side = g_ret + f_var_grads.append(grad_f_vars) + g_var_grads.append(grad_g_vars) + f_side_grads.append(grad_f_side) + g_side_grads.append(grad_g_side) + + # Accumulate layer gradients for f_side_input and g_side_input + acc_f_side_grads = _acc_grads(*f_side_grads) + acc_g_side_grads = _acc_grads(*g_side_grads) + + # Use the stored idxs to put gradients in the passed-in order. 
+ side_input_grads = [None] * len(side_inputs) + variable_grads = [None] * len(variables) + + # Variable gradients were collected in reverse layer order. Reverse to match + # idxs. + f_var_grads.reverse() + g_var_grads.reverse() + for idxs, grads in zip(f_vars_idxs, f_var_grads) + zip( + g_vars_idxs, g_var_grads): for i, grad in zip(idxs, grads): - var_grads[i] = grad - - grad_x = tf.concat(grad_ys, axis=-1) - all_grads = [grad_x] + var_grads - return all_grads - - @function.Defun( - tf.float32, - python_grad_func=rev_block_grad, - shape_func=lambda _: [x.get_shape()]) - def rev_block_defun(inp): - inp.set_shape(x.get_shape()) - return _rev_block_forward( - inp, f, g, num_layers=num_layers, layer_scopes=layer_scopes) + variable_grads[i] = grad - if is_training: - return rev_block_defun(x) - else: - return _rev_block_forward(x, f, g, num_layers=num_layers) + for i, grad in zip(f_side_idxs, acc_f_side_grads): + side_input_grads[i] = grad + for i, grad in zip(g_side_idxs, acc_g_side_grads): + side_input_grads[i] = grad + + grad_x1, grad_x2 = grad_ys + return [grad_x1, grad_x2] + side_input_grads, variable_grads + + # Need a forward function with positional arguments + @fn_with_custom_grad(custom_grad_fn if is_training else None) + def forward(x1, x2, *side_inputs): + f_side = side_inputs[:len(f_side_input)] + g_side = side_inputs[len(f_side_input):] + return _rev_block_forward( + x1, + x2, + f, + g, + num_layers=num_layers, + f_side_input=f_side, + g_side_input=g_side, + layer_scopes=layer_scopes, + gate_outputs=is_training) + + return forward(x1, x2, *(f_side_input + g_side_input)) diff --git a/tensor2tensor/layers/rev_block_test.py b/tensor2tensor/layers/rev_block_test.py index bc4bcc6a4..dd4a62993 100644 --- a/tensor2tensor/layers/rev_block_test.py +++ b/tensor2tensor/layers/rev_block_test.py @@ -27,66 +27,177 @@ class RevBlockTest(tf.test.TestCase): - - def testSmoke(self): - channels = 8 - num_layers = 4 - batch_size = 16 - use_defun = True + CHANNELS = 8 + NUM_LAYERS = 4 + BATCH_SIZE = 16 + + def _testRevBlock(self, + x=None, + f=None, + g=None, + f_side_input=None, + g_side_input=None): tf.set_random_seed(1234) - def f(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if f is None: - def g(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + def f(x): # pylint: disable=function-redefined + return tf.layers.dense(x, self.CHANNELS // 2, use_bias=True) - x = tf.random_uniform([batch_size, channels], dtype=tf.float32) - y = rev_block.rev_block( - x, f, g, num_layers=num_layers, is_training=use_defun) - loss = tf.reduce_mean(y + 10.) 
- grads = tf.gradients(loss, [x] + tf.global_variables()) - with self.test_session() as sess: - sess.run(tf.global_variables_initializer()) - _ = sess.run(grads) + if g is None: - def testRevBlock(self): - channels = 8 - num_layers = 4 - batch_size = 16 - tf.set_random_seed(1234) + def g(x): # pylint: disable=function-redefined + return tf.layers.dense(x, self.CHANNELS // 2, use_bias=True) - def f(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if f_side_input is None: + f_side_input = [] - def g(x): - return tf.layers.dense(x, channels // 2, use_bias=True) + if g_side_input is None: + g_side_input = [] - x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + x = tf.random_uniform([self.BATCH_SIZE, self.CHANNELS], dtype=tf.float32) + x1, x2 = tf.split(x, 2, axis=1) - with tf.variable_scope("defun") as vs: - y_defun = rev_block.rev_block(x, f, g, num_layers=num_layers) + with tf.variable_scope("rev_test") as vs: + y1_rev, y2_rev = rev_block.rev_block( + x1, + x2, + f, + g, + f_side_input=f_side_input, + g_side_input=g_side_input, + num_layers=self.NUM_LAYERS) + y_rev = tf.concat([y1_rev, y2_rev], axis=1) fg_vars = vs.trainable_variables() num_vars = len(tf.global_variables()) with tf.variable_scope(vs, reuse=True): - y = rev_block.rev_block(x, f, g, num_layers=num_layers, is_training=False) + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + f_side_input=f_side_input, + g_side_input=g_side_input, + num_layers=self.NUM_LAYERS, + is_training=False) + y = tf.concat([y1, y2], axis=1) # Ensure no new vars were created - full reuse assert len(tf.global_variables()) == num_vars - loss_defun = tf.reduce_mean(y_defun + 10.) + loss_rev = tf.reduce_mean(y_rev + 10.) loss = tf.reduce_mean(y + 10.) - grads_defun = tf.gradients(loss_defun, [x] + fg_vars) - grads = tf.gradients(loss, [x] + fg_vars) + wrt = [x] + f_side_input + g_side_input + fg_vars + grads_rev = tf.gradients(loss_rev, wrt) + grads = tf.gradients(loss, wrt) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) - y_val, yd_val, gd_val, g_val = sess.run([y, y_defun, grads_defun, grads]) + y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads]) self.assertAllClose(y_val, yd_val) for g1, g2 in zip(gd_val, g_val): self.assertAllClose(g1, g2) + def testRevBlock(self): + self._testRevBlock() + + def testSideInput(self): + f_side_input = tf.random_uniform([self.BATCH_SIZE, self.CHANNELS // 2]) + + def f(x, side_input): + return tf.layers.dense( + x, self.CHANNELS // 2, use_bias=True) + side_input[0] + + self._testRevBlock(f=f, f_side_input=[f_side_input]) + + def testMultipleFns(self): + + def f1(x): + return tf.layers.dense(x, self.CHANNELS // 2) + + def f2(x): + return tf.layers.dense(x, self.CHANNELS // 2, activation=tf.nn.relu) + + self._testRevBlock(f=[f1, f2, f1, f2]) + + +class FnWithCustomGradTest(tf.test.TestCase): + + def testCorrectness(self): + + w = tf.random_uniform([6, 10]) + + def fn(a, b, c): + return tf.layers.dense( + a, + 10, + use_bias=False, + kernel_initializer=lambda shape, dtype, partition_info: w + ) + tf.matmul(b, c) + + def grad_fn(inputs, variables, outputs, grad_outputs): + outputs = outputs[0] + grad_outputs = grad_outputs[0] + grad_inputs = tf.gradients(outputs, inputs, grad_ys=grad_outputs) + grad_vars = tf.gradients(outputs, variables, grad_ys=grad_outputs) + return grad_inputs, grad_vars + + custom_fn = rev_block.fn_with_custom_grad(grad_fn)(fn) + + a = tf.random_uniform([11, 6]) + b = tf.random_uniform([11, 7]) + c = tf.random_uniform([7, 10]) 
+ + out = fn(a, b, c) + custom_out = custom_fn(a, b, c) + self.assertEqual(out.get_shape().as_list(), + custom_out.get_shape().as_list()) + + loss = tf.reduce_mean(out) + custom_loss = tf.reduce_mean(custom_out) + + grads = tf.gradients(loss, [a, b, c] + [tf.trainable_variables()[0]]) + custom_grads = tf.gradients(custom_loss, + [a, b, c] + [tf.trainable_variables()[1]]) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + out_val, custom_out_val, grads_val, custom_grads_val = sess.run( + [out, custom_out, grads, custom_grads]) + self.assertAllClose(out_val, custom_out_val) + for g1, g2 in zip(grads_val, custom_grads_val): + self.assertAllClose(g1, g2) + + def testCustomGrad(self): + + def fn(a, b, c): + return tf.layers.dense(a, 10, use_bias=False) + tf.matmul(b, c) + + def grad_fn(inputs, variables, unused_outputs, unused_grad_outputs): + grad_inputs = [tf.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)] + grad_vars = [ + tf.ones_like(t) * (i + len(inputs) + 1.) + for i, t in enumerate(variables) + ] + return grad_inputs, grad_vars + + a = tf.random_uniform([11, 6]) + b = tf.random_uniform([11, 7]) + c = tf.random_uniform([7, 10]) + w = tf.random_uniform([6, 10]) + out = rev_block.fn_with_custom_grad(grad_fn)(fn)(a, b, c) + loss = tf.reduce_mean(out) + grads = tf.gradients(loss, [a, b, c, tf.trainable_variables()[0]]) + expected_grads = [ + tf.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w]) + ] + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + g_val, eg_val = sess.run([grads, expected_grads]) + for g1, g2 in zip(g_val, eg_val): + self.assertAllClose(g1, g2) + if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 19f1915e8..3302f45be 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -72,8 +72,13 @@ def attention_lm_prepare_decoder(targets, hparams): decoder_self_attention_bias: a Tensor, containing large negative values to implement masked attention and possibly baises for diagonal alignments """ - decoder_self_attention_bias = ( - common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + if hparams.prepend_mode == "prepend_inputs_full_attention": + decoder_self_attention_bias = ( + common_attention.attention_bias_prepended( + common_attention.embedding_to_padding(targets))) + else: + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) decoder_input = common_layers.shift_left_3d(targets) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) @@ -153,6 +158,7 @@ def attention_lm_base(): hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("encoder_full_attention", int(False)) return hparams @@ -181,9 +187,26 @@ def attention_lm_translation(): hparams = attention_lm_base() hparams.layer_preprocess_sequence = "n" hparams.layer_postprocess_sequence = "da" - hparams.learning_rate = 0.1 - hparams.prepend_inputs_to_targets = int(True) + hparams.learning_rate = 0.4 + hparams.prepend_mode = "prepend_inputs_masked_attention" hparams.max_length = 512 hparams.label_smoothing = 0.1 hparams.shared_embedding_and_softmax_weights = int(True) return hparams + + +@registry.register_hparams +def attention_lm_translation_l12(): + """Version to use for seq2seq.""" + hparams = 
attention_lm_translation() + hparams.batch_size = 4096 + hparams.num_hidden_layers = 12 + return hparams + + +@registry.register_hparams +def attention_lm_translation_full_attention(): + """Version to use for seq2seq.""" + hparams = attention_lm_translation() + hparams.prepend_mode = "prepend_inputs_full_attention" + return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 268e93f7b..9c55eadd6 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -39,6 +39,19 @@ import tensorflow as tf +class AttentionMoeType(object): + NONE = "none" + LOCAL = "local" + GLOBAL = "global" + + @staticmethod + def get_choices(): + return [ + AttentionMoeType.NONE, + AttentionMoeType.LOCAL, + ] + + @registry.register_model class AttentionLmMoe(t2t_model.T2TModel): """Attention net. See file docstring.""" @@ -66,17 +79,33 @@ def postprocess(x, y): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): - y = dp( - common_attention.multihead_attention, - preprocess(x), - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention") + x = preprocess(x) + if hparams.attention_moe_type == AttentionMoeType.NONE: + y = dp( + common_attention.multihead_attention, + x, + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + name="decoder_self_attention") + elif hparams.attention_moe_type == AttentionMoeType.LOCAL: + y, loss = dp( + common_attention.local_expert_attention, + x, + k=2, + loss_coef=1e-2, + attention_num_experts=hparams.attention_num_experts, + train=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + mask_right=True) + # TODO(avaswani, epot, noam): Do we need to divide by num shards ? + extra_loss += tf.add_n(loss)/dp.n + else: + raise ValueError("Only {} supported for now.".format( + AttentionMoeType.get_choices())) x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): @@ -118,8 +147,13 @@ def attention_lm_moe_prepare_decoder(targets, hparams): decoder_self_attention_bias: a Tensor, containing large negative values to implement masked attention and possibly baises for diagonal alignments """ - decoder_self_attention_bias = ( - common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + if hparams.prepend_mode == "prepend_inputs_full_attention": + decoder_self_attention_bias = ( + common_attention.attention_bias_prepended( + common_attention.embedding_to_padding(targets))) + else: + decoder_self_attention_bias = ( + common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) decoder_input = common_layers.shift_left_3d(targets) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) @@ -169,6 +203,9 @@ def attention_lm_moe_base(): hparams.add_hparam("relu_dropout", 0.0) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("moe_layers", "2") # comma separated list of layer numbers + # moe params. local attention moe. 
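For the AttentionMoeType.LOCAL path above, a toy NumPy view (illustration only) of the batch coordinates that common_attention.local_expert_attention hands to each expert:

    import numpy as np

    # coordinate_tensor(shape, axis=0) fills a tensor with its index along the
    # batch axis; e.g. for a [batch=2, length=3] input:
    batch_coordinate = np.zeros((2, 3), dtype=np.int32) + np.arange(2).reshape(-1, 1)
    # [[0, 0, 0],
    #  [1, 1, 1]]
    # self_attention_expert() compares these coordinates pairwise and adds a
    # -1e9 bias wherever two positions belong to different sequences, so tokens
    # dispatched to the same expert never attend across batch elements.
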
+ hparams.add_hparam("attention_moe_type", AttentionMoeType.NONE) + hparams.add_hparam("attention_num_experts", 16) return hparams @@ -206,6 +243,21 @@ def attention_lm_moe_tiny(): return hparams +@registry.register_hparams +def attention_lm_attention_moe_tiny(): + """Cheap model for debugging. + + Returns: + an hparams object. + """ + hparams = attention_lm_moe_small() + hparams.moe_layers = "" + hparams.attention_num_experts = 16 + hparams.filter_size = 512 + hparams.attention_moe_type = AttentionMoeType.LOCAL + return hparams + + @registry.register_hparams def attention_lm_no_moe_small(): """Without the mixture of experts (for comparison). @@ -249,3 +301,20 @@ def attention_lm_moe_large(): hparams.moe_num_experts = 128 hparams.layer_prepostprocess_dropout = 0.2 return hparams + + +@registry.register_hparams +def attention_lm_moe_translation(): + """Version to use for seq2seq.""" + hparams = attention_lm_moe_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.4 + hparams.prepend_mode = "prepend_inputs_masked_attention" + hparams.max_length = 512 + hparams.label_smoothing = 0.1 + hparams.layer_prepostprocess_dropout = 0.2 + hparams.num_hidden_layers = 6 + hparams.moe_layers = "0,1,2,3,4,5" + hparams.shared_embedding_and_softmax_weights = int(True) + return hparams diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index 27aa631c6..9d676632e 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -130,8 +130,14 @@ def fc_layer(x, num_out, dropout_rate, name="fc"): def gene_expression_conv_base(): """Hparams for GeneExpressionConv model.""" hparams = common_hparams.basic_params1() - hparams.max_length = 10000000 - hparams.batch_size = 1024 + + batch_size = 10 + output_length = 2048 + inputs_per_output = 128 + chunk_size = 4 + input_length = output_length * inputs_per_output // chunk_size + hparams.batch_size = input_length * batch_size + hparams.dropout = 0.1 hparams.add_hparam("num_conv_layers", 4) hparams.add_hparam("num_dconv_layers", 7) diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index e2307f49f..cc4cd1200 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -70,7 +70,7 @@ def testGeneExpressionModels(self): gene_expression_conv_test())] for model_cls, hparams in models_hparams: hparams.add_hparam("data_dir", None) - p_hparams = gene_data.GeneExpressionCAGE10().internal_hparams(hparams) + p_hparams = gene_data.GenomicsExpressionCage10().internal_hparams(hparams) hparams.problems = [p_hparams] self._testModel(hparams, model_cls) diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index d4514408d..af609e22c 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -33,6 +33,7 @@ from tensor2tensor.models import lstm from tensor2tensor.models import multimodel from tensor2tensor.models import neural_gpu +from tensor2tensor.models import rev_transformer from tensor2tensor.models import shake_shake from tensor2tensor.models import slicenet from tensor2tensor.models import transformer diff --git a/tensor2tensor/models/rev_transformer.py b/tensor2tensor/models/rev_transformer.py new file mode 100644 index 000000000..d1392a1ee --- /dev/null +++ b/tensor2tensor/models/rev_transformer.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reversible Residual Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import common_attention +from tensor2tensor.layers import common_layers +from tensor2tensor.layers import rev_block +from tensor2tensor.models import transformer +from tensor2tensor.utils import registry + +import tensorflow as tf + + +@registry.register_model +class RevTransformer(transformer.Transformer): + """Reversible Residual Transformer. + + Layers are reversible and are recomputed on the backward pass. + + y1 = x1 + f(x2) + y2 = x2 + g(y1) + + f: Attention + g: Feed-forward + """ + + def model_fn_body(self, features): + hparams = self._hparams + targets = features["targets"] + inputs = features["inputs"] + target_space = features["target_space_id"] + + inputs = common_layers.flatten4d3d(inputs) + targets = common_layers.flatten4d3d(targets) + + (encoder_input, encoder_self_attention_bias, + encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder( + inputs, target_space, hparams)) + (decoder_input, + decoder_self_attention_bias) = transformer.transformer_prepare_decoder( + targets, hparams) + + encoder_input = tf.nn.dropout(encoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + decoder_input = tf.nn.dropout(decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = rev_transformer_encoder( + encoder_input, encoder_self_attention_bias, hparams) + + decoder_output = rev_transformer_decoder( + decoder_input, encoder_output, decoder_self_attention_bias, + encoder_decoder_attention_bias, hparams) + decoder_output = tf.expand_dims(decoder_output, 2) + + return decoder_output + + +def rev_transformer_encoder(encoder_input, + encoder_self_attention_bias, + hparams, + name="encoder"): + """A stack of transformer layers. 
+ + Args: + encoder_input: a Tensor + encoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + + def f(x, side_input): + """f(x) for reversible layer, self-attention layer.""" + encoder_self_attention_bias = side_input[0] + + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), None, encoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + def g(x): + """g(x) for reversible layer, feed-forward layer.""" + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("ffn"): + y = transformer.transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + x1, x2 = tf.split(encoder_input, 2, axis=-1) + + with tf.variable_scope(name): + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + num_layers=hparams.num_hidden_layers, + f_side_input=[encoder_self_attention_bias], + is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + y = tf.concat([y1, y2], axis=-1) + + return common_layers.layer_preprocess(y, hparams) + + +def rev_transformer_decoder(decoder_input, + encoder_output, + decoder_self_attention_bias, + encoder_decoder_attention_bias, + hparams, + name="decoder"): + """A stack of transformer layers.
+ + Args: + decoder_input: a Tensor + encoder_output: a Tensor + decoder_self_attention_bias: bias Tensor for self-attention + (see common_attention.attention_bias()) + encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention + (see common_attention.attention_bias()) + hparams: hyperparameters for model + name: a string + + Returns: + y: a Tensor + """ + + def f(x, side_input): + """f(x) for reversible layer, self-attention and enc-dec attention.""" + decoder_self_attention_bias = side_input[0] + encoder_decoder_attention_bias = side_input[1] + encoder_output = side_input[2] + + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), None, decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + if encoder_output is not None: + with tf.variable_scope("encdec_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess( + x, hparams), encoder_output, encoder_decoder_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + def g(x): + """g(x) for reversible layer, feed-forward layer.""" + old_hid_size = hparams.hidden_size + hparams.hidden_size = old_hid_size // 2 + with tf.variable_scope("ffn"): + y = transformer.transformer_ffn_layer( + common_layers.layer_preprocess(x, hparams), hparams) + y = common_layers.layer_postprocess(x, y, hparams) + hparams.hidden_size = old_hid_size + return y + + x1, x2 = tf.split(decoder_input, 2, axis=-1) + + with tf.variable_scope(name): + y1, y2 = rev_block.rev_block( + x1, + x2, + f, + g, + num_layers=hparams.num_hidden_layers, + f_side_input=[ + decoder_self_attention_bias, encoder_decoder_attention_bias, + encoder_output + ], + is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + y = tf.concat([y1, y2], axis=-1) + return common_layers.layer_preprocess(y, hparams) + + +@registry.register_hparams +def rev_transformer_base(): + """Base hparams for RevTransformer.""" + hparams = transformer.transformer_big() + + # Use settings from transformer_n_da + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.4 + + return hparams + + +@registry.register_hparams +def rev_transformer_big(): + """Big hparams for RevTransformer.""" + hparams = rev_transformer_base() + + # The RevTransformer uses significantly less memory than the Transformer. + # Increase batch size and model size. + hparams.batch_size *= 2 + hparams.hidden_size *= 2 + hparams.num_heads *= 2 + hparams.num_hidden_layers += 1 + return hparams diff --git a/tensor2tensor/models/rev_transformer_test.py b/tensor2tensor/models/rev_transformer_test.py new file mode 100644 index 000000000..da9e15f72 --- /dev/null +++ b/tensor2tensor/models/rev_transformer_test.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for RevTransformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import rev_transformer + +import tensorflow as tf + + +def rev_transformer_test(): + hparams = rev_transformer.rev_transformer_base() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 128 + hparams.filter_size = 512 + hparams.num_heads = 2 + return hparams + + +class RevTransformerTest(tf.test.TestCase): + + def testTransformer(self): + batch_size = 3 + input_length = 5 + target_length = 7 + vocab_size = 9 + hparams = rev_transformer_test() + p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size, + vocab_size) + hparams.problems = [p_hparams] + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, input_length, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, target_length, 1, 1)) + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = rev_transformer.RevTransformer( + hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + sharded_logits, _ = model.model_fn(features) + logits = tf.concat(sharded_logits, 0) + grads = tf.gradients( + tf.reduce_mean(logits), [features["inputs"]] + tf.global_variables()) + grads = [g for g in grads if g is not None] + + with self.test_session() as session: + session.run(tf.global_variables_initializer()) + logits_val, _ = session.run([logits, grads]) + self.assertEqual(logits_val.shape, (batch_size, target_length, 1, 1, + vocab_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index ffd791a04..6a3f3afdf 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -26,6 +26,7 @@ from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.models import transformer +from tensor2tensor.utils import expert_utils from tensor2tensor.utils import registry from tensor2tensor.utils import t2t_model @@ -78,18 +79,45 @@ def decompress_step(source, c, hparams, first_relu, name): shape = tf.shape(source) if c is not None: source = attend(source, c, hparams, "decompress_attend") - first = common_layers.conv_block( - source, - hparams.hidden_size, [((1, 1), (3, 1)), ((1, 1), (3, 1))], - first_relu=first_relu, padding="SAME", name="decompress_conv1") - second = common_layers.conv_block( - tf.concat([source, first], axis=3), - hparams.hidden_size, [((1, 1), (3, 1)), ((1, 1), (3, 1))], - first_relu=first_relu, padding="SAME", name="decompress_conv2") - thicker = interleave(first, second) + thicker = common_layers.conv_block( + source, hparams.hidden_size * 2, [((1, 1), (1, 1))], + first_relu=first_relu, name="decompress_conv") return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size]) 
+def top_k_softmax(x, k): + """Calculate softmax(x), select top-k and rescale to sum to 1.""" + x = tf.nn.softmax(x) + top_x, _ = tf.nn.top_k(x, k=k+1) + min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True) + x = tf.nn.relu((x - min_top) + 1e-12) + x /= tf.reduce_sum(x, axis=-1, keep_dims=True) + return x, tf.reduce_max(top_x, axis=-1) + + +def top_k_experts(x, k, hparams): + x_shape = tf.shape(x) + x_flat = tf.reshape(x, [-1, x.get_shape().as_list()[-1]]) + is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + gates, load = expert_utils.noisy_top_k_gating( + x_flat, hparams.v_size, is_training, k) + gates_shape = [x_shape[0], x_shape[1], x_shape[2], hparams.v_size] + gates = tf.reshape(gates, gates_shape) + load_loss = expert_utils.cv_squared(load) + return gates, load_loss + + +def dvae(x, k, hparams, name): + with tf.variable_scope(name): + m = tf.layers.dense(x, hparams.v_size, name="mask") + if k is None: + m = tf.nn.softmax(m) + kl = - tf.reduce_max(m, axis=-1) + else: + m, kl = top_k_softmax(m, k) + return m, 1.0 - tf.reduce_mean(kl) + + def vae(x, hparams, name): with tf.variable_scope(name): mu = tf.layers.dense(x, hparams.z_size, name="mu") @@ -117,24 +145,59 @@ def compress(x, c, hparams, name): return cur +def mix(x1, x2, steps, min_prob=0.0, max_prob=1.0, mode="lin"): + if mode == "lin": + alpha_p = common_layers.inverse_lin_decay(steps) + 0.001 + else: + alpha_p = common_layers.inverse_exp_decay(steps) + 0.001 + alpha_p = alpha_p * (max_prob - min_prob) + min_prob + alpha = tf.random_uniform(tf.shape(x1)) + alpha = tf.to_float(tf.less(alpha, alpha_p)) + return alpha * x1 + (1.0 - alpha) * x2 + + def vae_compress(x, c, hparams, compress_name, decompress_name, reuse=None): """Compress, then VAE.""" + mix_k = 8 with tf.variable_scope(compress_name, reuse=reuse): - cur = compress(x, c, hparams, "compress") + cur = compress(x, None, hparams, "compress") # Convolve and ReLu to get state. cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv") - z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + # z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + z, kl_loss = dvae(cur, None, hparams, name="dvae") + z1, kl_loss1 = top_k_experts(cur, mix_k, hparams) + mu, log_sigma = None, None + + # Mix expert-selection and flat selection. + alpha_p = common_layers.inverse_lin_decay(60000) + 0.001 + z = alpha_p * z1 + (1 - alpha_p) * z + kl_loss += kl_loss1 + + # Compress context. + with tf.variable_scope(compress_name, reuse=reuse): + compress_c = compress(c, None, hparams, "compress_context") + c_z = tf.layers.dense(compress_c, hparams.v_size, name="mask_context") + reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits( + labels=z, logits=c_z) + + # If not training, use the predicted z instead of the autoregressive one. + # if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN: + # z = mix(c_z, z, 50000, max_prob=0.3, mode="exp") + # z, _ = top_k_softmax(c_z, mix_k) with tf.variable_scope(decompress_name, reuse=reuse): # Decompress. z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") + # Leak at the beginning to help train. 
+ z = mix(z, cur, 30000) + for i in xrange(hparams.num_compress_steps): j = hparams.num_compress_steps - i - 1 z = residual_conv(z, 1, hparams, "decompress_rc_%d" % j) - z = decompress_step(z, c, hparams, i > 0, "decompress__step_%d" % j) - return z, kl_loss, mu, log_sigma + z = decompress_step(z, c, hparams, i > 0, "decompress_step_%d" % j) + return z, kl_loss + 0.0001 * reconstruct_loss, mu, log_sigma def encode(x, x_space, hparams, name): @@ -167,7 +230,6 @@ def ffn(x, hparams, name): def vae_transformer_internal(inputs, targets, target_space, hparams): """VAE Transformer, main step used for training.""" with tf.variable_scope("vae_transformer"): - is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN # Prepare inputs, targets, and k. inputs = common_layers.flatten4d3d(inputs) input_len = tf.shape(inputs)[1] # Double input size to cover targets. @@ -179,39 +241,25 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): inputs, targets, final_length_divisible_by=k) inputs = encode(inputs, target_space, hparams, "input_enc") - # Dropout targets or swap for zeros 5% of the time. - targets_nodrop = targets - max_prestep = hparams.kl_warmup_steps - prob_targets = 0.95 if is_training else 1.0 - targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 - targets = dropmask(targets, targets_dropout_max * 0.7, is_training) - targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets), - lambda: targets, lambda: tf.zeros_like(targets)) - targets = targets_nodrop - # Compress and vae. - z = tf.get_variable("z", [hparams.hidden_size]) - z = tf.reshape(z, [1, 1, 1, -1]) - z = tf.tile(z, [tf.shape(inputs)[0], 1, 1, 1]) - - z = attend(z, inputs, hparams, "z_attendsi") - z = ffn(z, hparams, "zff2") - z = attend(z, targets, hparams, "z_attendst2") - z = ffn(z, hparams, "zff3") - z, kl_loss, _, _ = vae(z, hparams, name="vae") - z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") - - # z, kl_loss, _, _ = vae_compress( - # tf.expand_dims(targets, axis=2), tf.expand_dims(inputs, axis=2), - # hparams, "vae_compress", "vae_decompress") - - decoder_in = tf.squeeze(z, axis=2) + tf.zeros_like(targets) - (decoder_input, decoder_self_attention_bias) = ( - transformer.transformer_prepare_decoder(decoder_in, hparams)) - ret = transformer.transformer_decoder( - decoder_input, inputs, decoder_self_attention_bias, None, hparams) - - kl_loss *= common_layers.inverse_exp_decay(int(max_prestep * 1.5)) * 5.0 + z, kl_loss, _, _ = vae_compress(tf.expand_dims(targets, axis=2), + tf.expand_dims(inputs, axis=2), + hparams, "vae_compress", "vae_decompress") + + # Join z with inputs, run decoder. 
+ to_decode = common_layers.conv_block( + tf.concat([z, tf.expand_dims(inputs, axis=2)], axis=3), + hparams.hidden_size, [((1, 1), (1, 1))], name="join_z") + ret = encode(tf.squeeze(to_decode, axis=2), target_space, hparams, "dec") + + # For experiments with one-sided decoder: + # decoder_in = tf.squeeze(to_decode, axis=2) + # (decoder_input, decoder_self_attention_bias) = ( + # transformer.transformer_prepare_decoder(decoder_in, hparams)) + # ret = transformer.transformer_decoder( + # decoder_input, inputs, decoder_self_attention_bias, None, hparams) + + kl_loss *= common_layers.inverse_exp_decay(hparams.kl_warmup_steps) * 3.0 losses = {"kl": kl_loss} return tf.expand_dims(ret, axis=2), losses @@ -267,10 +315,11 @@ def transformer_vae_small(): """Set of hyperparameters.""" hparams = transformer.transformer_small() hparams.batch_size = 2048 - hparams.learning_rate_warmup_steps = 16000 + hparams.learning_rate_warmup_steps = 4000 hparams.add_hparam("z_size", 128) + hparams.add_hparam("v_size", 1024*8) hparams.add_hparam("num_compress_steps", 4) - hparams.add_hparam("kl_warmup_steps", 60000) + hparams.add_hparam("kl_warmup_steps", 50000) return hparams @@ -283,6 +332,6 @@ def transformer_vae_base(): hparams.attention_dropout = 0.0 hparams.relu_dropout = 0.0 hparams.dropout = 0.0 - hparams.num_hidden_layers = 3 + hparams.num_hidden_layers = 4 hparams.z_size = 256 return hparams diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index dd8275204..be6c28559 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -256,7 +256,7 @@ def grow_topk(i, alive_seq, alive_log_probs): topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2) - # Recovering the log probs becuase we will need to send them back + # Recovering the log probs because we will need to send them back topk_log_probs = topk_scores * length_penalty # Work out what beam the top probs are in. diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 5c7041014..03e7720b6 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -133,7 +133,8 @@ def preprocessing(examples, data_file_pattern): # all to the Problem class and its preprocess_examples method. Don't add. if "image" in data_file_pattern: def resize(img, size): - return tf.to_int64(tf.image.resize_images(img, [size, size])) + return tf.to_int64(tf.image.resize_images( + img, [size, size], tf.image.ResizeMethod.AREA)) if "img2img" in data_file_pattern: inputs = examples["inputs"] @@ -141,6 +142,9 @@ def resize(img, size): examples["targets"] = resize(inputs, 64) elif "image_celeba" in data_file_pattern: inputs = examples["inputs"] + # Remove boundaries in CelebA images. Remove 40 pixels each side + # vertically and 20 pixels each side horizontally. 
+ inputs = tf.image.crop_to_bounding_box(inputs, 40, 20, 218-80, 178-40) examples["inputs"] = resize(inputs, 8) examples["targets"] = resize(inputs, 32) elif "audio" in data_file_pattern: diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index da33cf90e..4ba8dc71a 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -271,7 +271,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index ac58ef3cd..6f26f20fa 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -34,6 +34,8 @@ from tensorflow.python.framework import function +DEFAULT_DEV_STRING = "existing_device" + @function.Defun( python_grad_func=lambda x, dy: tf.convert_to_tensor(dy), @@ -180,7 +182,14 @@ def caching_getter(getter, name, *args, **kwargs): reuse=True if i > 0 and self._reuse else None, caching_device=self._caching_devices[i], custom_getter=custom_getter): - with tf.device(self._devices[i]): + # TODO(noam, epot, avaswani) + # Allows for passing no device in case you want to default to the + # existing device. This is needed when we put all experts on a single + # device, for example in local_moe. + if self._devices[i] != DEFAULT_DEV_STRING: + with tf.device(self._devices[i]): + outputs.append(fns[i](*my_args[i], **my_kwargs[i])) + else: outputs.append(fns[i](*my_args[i], **my_kwargs[i])) if isinstance(outputs[0], tuple): outputs = list(zip(*outputs)) @@ -361,7 +370,6 @@ def _my_top_k(x, k): def noisy_top_k_gating(x, - input_size, num_experts, train, k=2, @@ -375,7 +383,6 @@ def noisy_top_k_gating(x, Args: x: input Tensor with shape [batch_size, input_size] - input_size: an integer num_experts: an integer train: a boolean - we only add noise at training time. k: an integer - number of experts per example @@ -389,6 +396,7 @@ def noisy_top_k_gating(x, load: a Tensor with shape [num_experts] """ with tf.variable_scope(name, default_name="noisy_top_k_gating"): + input_size = x.get_shape().as_list()[-1] w_gate = tf.get_variable( "w_gate", [input_size, num_experts], tf.float32, initializer) if noisy_gating: @@ -431,6 +439,25 @@ def noisy_top_k_gating(x, class SparseDispatcher(object): """Helper for implementing a mixture of experts. + The purpose of this class is to create input minibatches for the + experts and to combine the results of the experts to form a unified + output tensor. + + There are two functions: + dispatch - take an input Tensor and create input Tensors for each expert. + combine - take output Tensors from each expert and form a combined output + Tensor. Outputs from different experts for the same batch element are + summed together, weighted by the provided "gates". + + The class is initialized with a "gates" Tensor, which specifies which + batch elements go to which experts, and the weights to use when combining + the outputs. Batch element b is sent to expert e iff gates[b, e] != 0. + + The inputs and outputs are all two-dimensional [batch, depth]. + Caller is responsible for collapsing additional dimensions prior to + calling this class and reshaping the output to the original shape. + See reshape_like(). 
+ Example use: gates: a float32 `Tensor` with shape `[batch_size, num_experts]` @@ -526,8 +553,8 @@ class DistributedSparseDispatcher(object): """A distributed version of SparseDispatcher. Instead of one batch of input examples, we simultaneously process - num_datashards batches of input examples. The per-expert `Tensor`s contain - a combination of examples from the different datashards. + a list of num_datashards batches of input examples. The per-expert + `Tensor`s contain a combination of examples from the different datashards. Each datashard is associated with a particular device and each expert is associated with a particular device. All per-datashard and per-expert @@ -655,6 +682,13 @@ def reshape_like(a, b): return ret +def flatten_all_but_last(a): + """Flatten all dimensions of a except the last.""" + ret = tf.reshape(a, [-1, tf.shape(a)[-1]]) + ret.set_shape([None] + a.get_shape().as_list()[-1:]) + return ret + + def distributed_moe(data_parallelism, expert_devices, xs, @@ -676,7 +710,8 @@ def distributed_moe(data_parallelism, input_size: an integer (input size for this layer) expert_fn: a unary function for each expert to run It should take a Tensor with shape [batch_size, input_size] - and return a Tensor with shape [batch_size, output_size] + and return a Tensor with shape [batch_size, output_size]. + e.g. ffn_expert_fn(...) num_experts: an integer - number of experts k: an integer - how many experts to use for each batch element loss_coef: a scalar - multiplier on load-balancing losses @@ -703,7 +738,6 @@ def distributed_moe(data_parallelism, # load is a measure of approximately how many examples go to each expert gates, load = dp(noisy_top_k_gating, xs_flat, - input_size, num_experts, train, k, @@ -721,3 +755,67 @@ def distributed_moe(data_parallelism, importance = tf.add_n(dp(tf.reduce_sum, gates, 0)) loss = loss_coef * (cv_squared(importance) + cv_squared(load)) return ys, loss + + +def local_moe(x, + train, + expert_fn, + num_experts, + k=2, + loss_coef=1e-2, + pass_x=True, + pass_gates=False, + additional_dispatch_params=None, + name=None): + """Call a local mixture of experts. + + Args: + x: a tensors with shape [... , input_size] + train: a boolean scalar. + expert_fn: a function. + num_experts: an integer - number of experts + k: an integer - how many experts to use for each batch element + loss_coef: a scalar - multiplier on load-balancing losses + pass_x: a boolean. If true, x will also be dispatched to the experts. + pass_gates: a boolean. If true, gates will be passed to experts. Might be + necessary when dealing with sparse encoder-encoder decoder attention + additional_dispatch_params: The extra tensors that need to be sent to each + expert. Examples include batch batch coordinates (see + common_attention.local_expert_attention) + name: a string + + Returns: + y: a tensor. Has the same shape as x, except for the last dimension, + which is output_size. + extra_training_loss: a scalar. This should be added into the overall + training loss of the model. The backpropagation of this loss + encourages all experts to be approximately equally used across a batch. + """ + with tf.variable_scope(name, default_name="local_moe"): + x_flat = flatten_all_but_last(x) + # The gates indicate which batch elements go to which tensors. 
+ # load is a measure of approximately how many examples go to each expert + gates, load = noisy_top_k_gating( + x_flat, + num_experts, + train, + k, + initializer=tf.zeros_initializer(), + noisy_gating=True, + noise_epsilon=1e-2) + # This magic object helps us shuffle data between datashards and experts. + dispatcher = SparseDispatcher(num_experts, gates) + expert_kwargs = {} + if pass_x: + expert_kwargs["x"] = dispatcher.dispatch(x_flat) + if pass_gates: + expert_kwargs["gates"] = dispatcher.expert_to_gates() + for k, v in six.iteritems(additional_dispatch_params or {}): + expert_kwargs[k] = dispatcher.dispatch(flatten_all_but_last(v)) + ep = Parallelism([DEFAULT_DEV_STRING] * num_experts) + expert_outputs = ep(expert_fn, **expert_kwargs) + y_flat = dispatcher.combine(expert_outputs) + y = reshape_like(y_flat, x) + importance = tf.reduce_sum(gates, 0) + loss = loss_coef * (cv_squared(importance) + cv_squared(load)) + return y, loss diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index fd82adc30..e5cb88ddf 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -196,9 +196,12 @@ def problem_metric_fn(predictions, labels, weights): class_output = "image" in problem_name and "coco" not in problem_name real_output = "gene_expression" in problem_name - if model_hparams.prepend_inputs_to_targets: + if model_hparams.prepend_mode != "none": + assert ( + model_hparams.prepend_mode == "prepend_inputs_masked_attention" or + model_hparams.prepend_mode == "prepend_inputs_full_attention") assert not class_output - weights_fn = common_layers.weights_second_part + weights_fn = common_layers.weights_prepend_inputs_to_targets elif class_output or real_output: weights_fn = common_layers.weights_all else: diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 6ce650ac3..f5d83cbf1 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -44,7 +44,6 @@ class MyModel(T2TModel): from __future__ import division from __future__ import print_function -import collections import inspect import re @@ -391,17 +390,18 @@ def create_modality(modality_spec, model_hparams): return retrieval_fns[modality_type](modality_name)(model_hparams, vocab_size) -def _hparams_help_string(): - hparams_names = list_hparams() - prefixes = zip([name.split("_")[0] for name in hparams_names], hparams_names) - names_by_prefix = collections.defaultdict(list) - for (prefix, full_name) in prefixes: - names_by_prefix[prefix].append(full_name) - return "\n".join( - sorted([ - " * %s: %s" % (prefix, sorted(names)) - for prefix, names in six.iteritems(names_by_prefix) - ])) +def display_list_by_prefix(names_list, starting_spaces=0): + """Creates a help string for names_list grouped by prefix.""" + cur_prefix, result_lines = None, [] + space = " " * starting_spaces + for name in sorted(names_list): + split = name.split("_", 1) + prefix = split[0] + if cur_prefix != prefix: + result_lines.append(space + prefix + ":") + cur_prefix = prefix + result_lines.append(space + " * " + name) + return "\n".join(result_lines) def help_string(): @@ -410,24 +410,29 @@ def help_string(): Registry contents: ------------------ - Models: %s + Models: +%s - HParams (by model): + HParams: %s - RangedHParams: %s + RangedHParams: +%s - Modalities: %s + Modalities: +%s - Problems: %s + Problems: +%s """ - m, rhp, mod, probs = [ - sorted(entries) + m, hp, rhp, mod, probs = [ + display_list_by_prefix(entries, starting_spaces=4) for entries in [ list_models(), 
+ list_hparams(), list_ranged_hparams(), list_modalities(), list_problems() ] ] - return help_str % (m, _hparams_help_string(), rhp, mod, probs) + return help_str % (m, hp, rhp, mod, probs)
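The new rev_transformer.py above relies on the additive coupling that rev_block.rev_block applies across the two channel halves: y1 = x1 + f(x2) and y2 = x2 + g(y1), with f the self-attention sublayer and g the feed-forward sublayer. Because the coupling is exactly invertible, activations can be recomputed on the backward pass instead of stored, which is what lets rev_transformer_big increase the batch size and model size. A minimal NumPy sketch of the forward map and its inverse; f and g here are toy stand-ins, not the real sublayers:

import numpy as np


def rev_layer_forward(x1, x2, f, g):
  # Additive coupling used by the reversible layers: y1 = x1 + f(x2), y2 = x2 + g(y1).
  y1 = x1 + f(x2)
  y2 = x2 + g(y1)
  return y1, y2


def rev_layer_inverse(y1, y2, f, g):
  # Exact inverse: the inputs are recomputed from the outputs,
  # so intermediate activations need not be kept in memory.
  x2 = y2 - g(y1)
  x1 = y1 - f(x2)
  return x1, x2


rng = np.random.RandomState(0)
x1, x2 = rng.randn(4, 8), rng.randn(4, 8)
f = np.tanh            # toy stand-in for the attention sublayer
g = lambda t: 0.5 * t  # toy stand-in for the feed-forward sublayer
y1, y2 = rev_layer_forward(x1, x2, f, g)
r1, r2 = rev_layer_inverse(y1, y2, f, g)
print(np.allclose(r1, x1), np.allclose(r2, x2))  # True True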
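The expanded SparseDispatcher docstring in expert_utils.py describes the dispatch/combine contract shared by local_moe and distributed_moe: batch element b is routed to expert e iff gates[b, e] != 0, and the expert outputs are summed back per element, weighted by the gate values. A small NumPy sketch of that contract with hand-picked gates and toy experts (the real gates come from noisy_top_k_gating, which this does not model):

import numpy as np


def dispatch(x, gates):
  # Expert e receives exactly the rows b with gates[b, e] != 0.
  return [x[np.nonzero(gates[:, e])[0]] for e in range(gates.shape[1])]


def combine(expert_outputs, gates):
  # Sum the expert outputs per batch element, weighted by the gate values.
  y = np.zeros((gates.shape[0], expert_outputs[0].shape[-1]))
  for e in range(gates.shape[1]):
    rows = np.nonzero(gates[:, e])[0]
    y[rows] += gates[rows, e][:, None] * expert_outputs[e]
  return y


gates = np.array([[0.7, 0.3, 0.0],
                  [0.0, 0.4, 0.6]])   # 2 batch elements, 3 experts, k=2
x = np.arange(4.0).reshape(2, 2)      # [batch, depth], already flattened
experts = [lambda t: t + 1.0, lambda t: 2.0 * t, lambda t: -t]  # toy experts
expert_outputs = [fn(inp) for fn, inp in zip(experts, dispatch(x, gates))]
print(combine(expert_outputs, gates))  # weighted recombination per element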