This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit 9bdc801: Merge pull request #383 from rsepassi/push

v1.2.6

lukaszkaiser authored Oct 27, 2017
2 parents a836d66 + ba47b61
Showing 40 changed files with 932 additions and 314 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -24,6 +24,6 @@ script:
   - mkdir $T2T_TRAIN_DIR
   - t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR
   - t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
-  - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
+  - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10,use_last_position_only=True'
 git:
   depth: 3
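
One note on the flag change above: --decode_hparams takes a comma-separated key=value string that decoding.decode_hparams parses into an HParams object (see the t2t-decoder diff further down). A minimal sketch, assuming this release's API:

    # Sketch: parsing the decode_hparams string from the .travis.yml line above.
    from tensor2tensor.utils import decoding

    decode_hp = decoding.decode_hparams(
        "num_samples=10,use_last_position_only=True")
    print(decode_hp.num_samples)  # 10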
2 changes: 1 addition & 1 deletion README.md
@@ -286,7 +286,7 @@ registrations.
 To add a new dataset, subclass
 [`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
 and register it with `@registry.register_problem`. See
-[`TranslateEndeWmt8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+[`TranslateEndeWmt8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate_ende.py)
 for an example.
 
 Also see the [data generators
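
For readers skimming this hunk, a minimal sketch of the subclass-and-register pattern the README describes; the class name and method body are hypothetical, not part of this commit:

    # Hypothetical sketch of the pattern described above; the class name is
    # made up, and generate_data's signature follows the Problem API.
    from tensor2tensor.data_generators import problem
    from tensor2tensor.utils import registry


    @registry.register_problem
    class MyTextProblem(problem.Problem):
      """Registered under the snake_case name "my_text_problem"."""

      def generate_data(self, data_dir, tmp_dir, task_id=-1):
        # Write train/dev TFRecord files of tensorflow.Example protos here.
        raise NotImplementedError()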
2 changes: 1 addition & 1 deletion docs/new_problem.md
@@ -105,7 +105,7 @@ We're almost done. `generator` generates the training and evaluation data and
 stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully
 several commonly used methods like `character_generator`, and `token_generator`
 are already written in the file
-[`wmt.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py).
+[`translate.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate.py).
 We will import `character_generator` and
 [`text_encoder`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/text_encoder.py)
 to write:
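
The snippet that follows "to write:" is collapsed in this view. As a hypothetical sketch only (file names made up, assuming translate.py's character_generator(source_path, target_path, vocab, eos) signature), such a generator might look like:

    # Hypothetical sketch of a character-level generator method on the
    # Problem subclass the doc builds; paths and names are made up.
    import os

    from tensor2tensor.data_generators import text_encoder
    from tensor2tensor.data_generators.translate import character_generator

    EOS = text_encoder.EOS_ID


    def generator(self, data_dir, tmp_dir, train):
      """Yields dicts like {"inputs": [...], "targets": [...]} of token ids."""
      del data_dir  # unused in this sketch
      character_vocab = text_encoder.ByteTextEncoder()
      tag = "train" if train else "dev"
      return character_generator(
          os.path.join(tmp_dir, "word2def_%s.lang1" % tag),
          os.path.join(tmp_dir, "word2def_%s.lang2" % tag),
          character_vocab, EOS)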
2 changes: 1 addition & 1 deletion docs/walkthrough.md
@@ -286,7 +286,7 @@ registrations.
 To add a new dataset, subclass
 [`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
 and register it with `@registry.register_problem`. See
-[`TranslateEndeWmt8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+[`TranslateEndeWmt8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate_ende.py)
 for an example.
 
 Also see the [data generators
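
walkthrough.md repeats the README hunk above verbatim, so instead of repeating the sketch, one practical note: the registry snake_cases the class name to form the CLI-facing problem name, so `TranslateEndeWmt8k` is addressed as `translate_ende_wmt8k`. A small lookup sketch, assuming this release's registry API:

    # Sketch: resolving a registered problem by its derived snake_case name.
    # Importing all_problems triggers the @registry.register_problem calls.
    from tensor2tensor.data_generators import all_problems  # noqa: F401
    from tensor2tensor.utils import registry

    prob = registry.problem("translate_ende_wmt8k")
    print(type(prob).__name__)  # -> TranslateEndeWmt8k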
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.2.5',
+    version='1.2.6',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
4 changes: 2 additions & 2 deletions tensor2tensor/bin/t2t-datagen
100755 → 100644
@@ -82,9 +82,9 @@ _SUPPORTED_PROBLEM_GENERATORS = {
         lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000),
         lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)),
     "parsing_english_ptb8k": (
-        lambda: wmt.parsing_token_generator(
+        lambda: translate.parsing_token_generator(
             FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13),
-        lambda: wmt.parsing_token_generator(
+        lambda: translate.parsing_token_generator(
             FLAGS.data_dir, FLAGS.tmp_dir, False, 2**13)),
     "parsing_english_ptb16k": (
         lambda: wsj_parsing.parsing_token_generator(
1 change: 1 addition & 0 deletions tensor2tensor/bin/t2t-decoder
100755 → 100644
@@ -84,6 +84,7 @@ def main(_):
 
   decode_hp = decoding.decode_hparams(FLAGS.decode_hparams)
   decode_hp.add_hparam("shards", FLAGS.decode_shards)
+  decode_hp.add_hparam("shard_id", FLAGS.worker_id)
 if FLAGS.decode_interactive:
   decoding.decode_interactively(estimator, decode_hp)
 elif FLAGS.decode_from_file:
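
The new line mirrors the `shards` hparam added just above it: `add_hparam` extends an HParams object after construction so later decoding code can read `decode_hp.shard_id`. A minimal sketch of the mechanics, with placeholder values standing in for the FLAGS:

    # Sketch of the add_hparam mechanics from the diff above; the values
    # are placeholders for FLAGS.decode_shards / FLAGS.worker_id.
    from tensor2tensor.utils import decoding

    decode_hp = decoding.decode_hparams("")  # start from the defaults
    decode_hp.add_hparam("shards", 1)
    decode_hp.add_hparam("shard_id", 0)
    assert decode_hp.shard_id == 0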
Empty file modified tensor2tensor/bin/t2t-make-tf-configs
100755 → 100644
Empty file modified tensor2tensor/bin/t2t-trainer
100755 → 100644
6 changes: 3 additions & 3 deletions tensor2tensor/data_generators/README.md
@@ -23,7 +23,7 @@ All tasks produce TFRecord files of `tensorflow.Example` protocol buffers.
 To add a new problem, subclass
 [`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
 and register it with `@registry.register_problem`. See
-[`WMTEnDeTokens8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+[`TranslateEndeWmt8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate_ende.py)
 for an example.
 
 `Problem`s support data generation, training, and decoding.
@@ -37,7 +37,7 @@ for training/decoding, e.g. a vocabulary file.
 A particularly easy way to implement `Problem.generate_data` for your dataset is
 to create 2 Python generators, one for the training data and another for the
 dev data, and pass them to `generator_utils.generate_dataset_and_shuffle`. See
-[`WMTEnDeTokens8k.generate_data`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+[`TranslateEndeWmt8k.generate_data`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate_ende.py)
 for an example of usage.
 
 The generators should yield dictionaries with string keys and values being lists
@@ -66,5 +66,5 @@ Some examples:
 
 * [Algorithmic problems](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/algorithmic.py)
   and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/algorithmic_test.py)
-* [WMT problems](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+* [WMT En-De problems](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate_ende.py)
   and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt_test.py)
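
A hypothetical sketch of the two-generator pattern this README describes; `my_generator` is made up, while `generate_dataset_and_shuffle` and the filepath helpers are the ones referenced above. Each generator yields dicts with string keys and list-of-int values, e.g. {"inputs": [3, 7, 1], "targets": [12, 1]}:

    # Hypothetical sketch of Problem.generate_data using two generators;
    # my_generator is made up, and the shard counts are arbitrary.
    from tensor2tensor.data_generators import generator_utils


    def generate_data(self, data_dir, tmp_dir, task_id=-1):
      generator_utils.generate_dataset_and_shuffle(
          my_generator(tmp_dir, train=True),
          self.training_filepaths(data_dir, 100, shuffled=False),
          my_generator(tmp_dir, train=False),
          self.dev_filepaths(data_dir, 1, shuffled=False))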
10 changes: 5 additions & 5 deletions tensor2tensor/data_generators/all_problems.py
@@ -29,16 +29,16 @@
 from tensor2tensor.data_generators import image
 from tensor2tensor.data_generators import imdb
 from tensor2tensor.data_generators import lm1b
+from tensor2tensor.data_generators import multinli
 from tensor2tensor.data_generators import problem_hparams
 from tensor2tensor.data_generators import ptb
 from tensor2tensor.data_generators import snli
-from tensor2tensor.data_generators import wiki
 from tensor2tensor.data_generators import translate
-from tensor2tensor.data_generators import translate_enfr
-from tensor2tensor.data_generators import translate_ende
 from tensor2tensor.data_generators import translate_encs
-from tensor2tensor.data_generators import translate_enzh
+from tensor2tensor.data_generators import translate_ende
+from tensor2tensor.data_generators import translate_enfr
+from tensor2tensor.data_generators import translate_enmk
+from tensor2tensor.data_generators import translate_enzh
+from tensor2tensor.data_generators import wiki
 from tensor2tensor.data_generators import wsj_parsing
 
 
85 changes: 53 additions & 32 deletions tensor2tensor/data_generators/cnn_dailymail.py
@@ -19,9 +19,9 @@
 from __future__ import division
 from __future__ import print_function
 
+import hashlib
 import os
 import tarfile
-import hashlib
 
 # Dependency imports
 
@@ -39,6 +39,7 @@
 
 _DAILYMAIL_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs"
 
+
 # Note: using See et al. (2017) as reference for data generation
 # For more info, use the links below
 
@@ -47,23 +48,29 @@
 _DEV_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt"
 _TEST_URLS = "https://github.com/abisee/cnn-dailymail/blob/master/url_lists/all_test.txt"
 
 
 # End-of-sentence marker.
 EOS = text_encoder.EOS_ID
 
 
 # Techniques for data prep from See et al. (2017)
-dm_single_close_quote = u'\u2019' # unicode
-dm_double_close_quote = u'\u201d'
-END_TOKENS = [u'.', u'!', u'?', u'...', u"'", u"`", u'"', dm_single_close_quote, dm_double_close_quote, u")"] # acceptable ways to end a sentence
+dm_single_close_quote = u"\u2019" # unicode
+dm_double_close_quote = u"\u201d"
+# Acceptable ways to end a sentence.
+END_TOKENS = [u".", u"!", u"?", u"...", u"'", u"`", u"\"",
+              dm_single_close_quote, dm_double_close_quote, u")"]
 
 
 def _maybe_download_corpora(tmp_dir, is_training):
   """Download corpora if necessary and unzip them.
 
   Args:
     tmp_dir: directory containing dataset.
     is_training: whether we're in training mode or not.
 
   Returns:
-    list of all files generated and path to file containing train/dev/test split info.
+    List of all files generated and path to file containing
+    train/dev/test split info.
   """
   cnn_filename = "cnn_stories.tgz"
   cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
@@ -85,43 +92,52 @@ def _maybe_download_corpora(tmp_dir, is_training):
   all_files = cnn_files + dailymail_files
 
   if is_training:
-    urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt", _TRAIN_URLS)
+    urls_path = generator_utils.maybe_download(
+        tmp_dir, "all_train.txt", _TRAIN_URLS)
   else:
-    urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt", _DEV_URLS)
+    urls_path = generator_utils.maybe_download(
+        tmp_dir, "all_val.txt", _DEV_URLS)
 
   return all_files, urls_path
 
 
 def example_splits(url_file, all_files):
   """Generate splits of the data."""
   def generate_hash(inp):
-      """Generate a sha1 hash to match the raw url to the filename extracted"""
-      h = hashlib.sha1()
-      h.update(inp)
-      return h.hexdigest()
+    """Generate a sha1 hash to match the raw url to the filename extracted."""
+    h = hashlib.sha1()
+    h.update(inp)
+    return h.hexdigest()
 
-  all_files_map = {f.split("/")[-1]:f for f in all_files}
+  all_files_map = {f.split("/")[-1]: f for f in all_files}
 
   urls = []
   for line in tf.gfile.Open(url_file):
-    urls.append(line.strip().encode('utf-8'))
+    urls.append(line.strip().encode("utf-8"))
 
   filelist = []
   for url in urls:
-      url_hash = generate_hash(url)
-      filename = url_hash + ".story"
-      if filename not in all_files_map:
-        tf.logging.info("Missing file: %s" % url)
-        continue
-      filelist.append(all_files_map[filename])
+    url_hash = generate_hash(url)
+    filename = url_hash + ".story"
+    if filename not in all_files_map:
+      tf.logging.info("Missing file: %s" % url)
+      continue
+    filelist.append(all_files_map[filename])
 
   tf.logging.info("Found %d examples" % len(filelist))
 
   return filelist
 
 
 def example_generator(tmp_dir, is_training, sum_token):
   """Generate examples."""
   def fix_run_on_sents(line):
-    if u"@highlight" in line: return line
-    if line=="": return line
-    if line[-1] in END_TOKENS: return line
+    if u"@highlight" in line:
+      return line
+    if not line:
+      return line
+    if line[-1] in END_TOKENS:
+      return line
     return line + u"."
 
   all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training)
@@ -133,28 +149,33 @@ def fix_run_on_sents(line):
     summary = []
     reading_highlights = False
     for line in tf.gfile.Open(story_file, "rb"):
-      line = unicode(line.strip(), "utf-8") if six.PY2 else line.strip().decode("utf-8")
+      if six.PY2:
+        line = unicode(line.strip(), "utf-8")
+      else:
+        line = line.strip().decode("utf-8")
       line = fix_run_on_sents(line)
-      if line == "":
-        continue
+      if not line:
+        continue
       elif line.startswith(u"@highlight"):
-        if len(story) == 0: break  # No article text
-        reading_highlights = True
+        if not story:
+          break  # No article text.
+        reading_highlights = True
       elif reading_highlights:
-          summary.append(line)
+        summary.append(line)
       else:
-          story.append(line)
+        story.append(line)
 
-    if len(story) == 0 or len(summary) == 0:
-      continue
+    if (not story) or not summary:
+      continue
 
     yield " ".join(story) + story_summary_split_token + " ".join(summary)
 
 
 def _story_summary_split(story):
   split_str = u" <summary> "
   split_str_len = len(split_str)
   split_pos = story.find(split_str)
-  return story[:split_pos], story[split_pos+split_str_len:] # story, summary
+  return story[:split_pos], story[split_pos+split_str_len:]  # story, summary
 
 
 @registry.register_problem
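
A quick illustration of the join/split convention in the code above: example_generator joins article and summary with the split token, and _story_summary_split recovers the two parts (the sample text is made up):

    # Illustration of the " <summary> " split convention; text is made up.
    joined = u"An article sentence. <summary> A highlight sentence."
    story, summary = _story_summary_split(joined)
    assert story == u"An article sentence."
    assert summary == u"A highlight sentence."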
6 changes: 2 additions & 4 deletions tensor2tensor/data_generators/generator_utils.py
@@ -263,6 +263,7 @@ def gunzip_file(gz_path, new_path):
       for line in gz_file:
         new_file.write(line)
 
+
 def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                 generator):
   """Inner implementation for vocab generators.
@@ -301,10 +302,7 @@ def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
   return vocab
 
 
-def get_or_generate_vocab(data_dir,
-                          tmp_dir,
-                          vocab_filename,
-                          vocab_size,
+def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size,
                           sources):
   """Generate a vocabulary from the datasets in sources."""
 
8 changes: 5 additions & 3 deletions tensor2tensor/data_generators/ice_parsing.py
@@ -32,7 +32,7 @@
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
-from tensor2tensor.data_generators.translate import tabbed_generator
+from tensor2tensor.data_generators import translate
 from tensor2tensor.utils import registry
 
 
@@ -51,15 +51,17 @@ def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
       data_dir, tmp_dir, filename, 1,
       prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size)
   pair_filepath = os.path.join(tmp_dir, filename)
-  return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS)
+  return translate.tabbed_generator(pair_filepath, source_vocab, target_vocab,
+                                    EOS)
 
 
 def tabbed_parsing_character_generator(tmp_dir, train):
   """Generate source and target data from a single file."""
   character_vocab = text_encoder.ByteTextEncoder()
   filename = "parsing_{0}.pairs".format("train" if train else "dev")
   pair_filepath = os.path.join(tmp_dir, filename)
-  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
+  return translate.tabbed_generator(pair_filepath, character_vocab,
+                                    character_vocab, EOS)
 
 
 @registry.register_problem
4 changes: 2 additions & 2 deletions tensor2tensor/data_generators/image.py
@@ -227,7 +227,7 @@ def feature_encoders(self, data_dir):
     # This vocab file must be present within the data directory.
     vocab_filename = os.path.join(data_dir, "charset_size134.txt")
     return {
-        "inputs": text_encoder.TextEncoder(),
+        "inputs": text_encoder.ImageEncoder(),
         "targets": text_encoder.SubwordTextEncoder(vocab_filename)
     }
 
@@ -273,7 +273,7 @@ def class_labels(self):
   def feature_encoders(self, data_dir):
     del data_dir
     return {
-        "inputs": text_encoder.TextEncoder(),
+        "inputs": text_encoder.ImageEncoder(),
         "targets": text_encoder.ClassLabelEncoder(self.class_labels)
     }
 
0 comments on commit 9bdc801
