This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Merge pull request #570 from rsepassi/push
v1.4.4
lukaszkaiser authored Feb 8, 2018
2 parents 1c98b8e + 290a12a commit 80b2f73
Showing 43 changed files with 1,854 additions and 365 deletions.
14 changes: 13 additions & 1 deletion .travis.yml
@@ -3,8 +3,11 @@ python:
- "2.7"
- "3.6"
before_install:
- echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list
- curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add -
- sudo apt-get update -qq
- sudo apt-get install -qq libhdf5-dev
- sudo apt-get install -qq tensorflow-model-server
install:
- pip install -q .[tensorflow]
- pip install -q .[tests]
@@ -21,7 +24,7 @@ script:
- python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)"

# Run tests
- pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py
- pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py --ignore=tensor2tensor/bin/t2t_trainer_test.py
- pytest tensor2tensor/utils/registry_test.py
- pytest tensor2tensor/utils/trainer_lib_test.py

@@ -36,5 +39,14 @@ script:
- t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR
- t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
- t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'

# Export and query (on Python 2 only)
- t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
pip install tensorflow-serving-api;
tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo &
sleep 10;
t2t-query-server --problem=$T2T_PROBLEM --server=localhost:9000 --servable_name=my_model --data_dir=$T2T_DATA_DIR --inputs_once='1 0 1 0 1 0';
fi
git:
depth: 3
80 changes: 80 additions & 0 deletions docs/cloud_mlengine.md
@@ -0,0 +1,80 @@
# Running on Cloud ML Engine

Google Cloud Platform offers a managed training environment for TensorFlow
models called [Cloud ML Engine](https://cloud.google.com/ml-engine/), and
you can easily launch Tensor2Tensor on it, including for hyperparameter tuning.

# Launch

It's the same `t2t-trainer` you know and love with the addition of the
`--cloud_mlengine` flag, which by default will launch on a 1-GPU machine.

```
# Note that both the data dir and output dir have to be on GCS
DATA_DIR=gs://my-bucket/data
OUTPUT_DIR=gs://my-bucket/train
t2t-trainer \
--problems=translate_ende_wmt32k \
--model=transformer \
--hparams_set=transformer_base \
--data_dir=$DATA_DIR \
--output_dir=$OUTPUT_DIR \
--cloud_mlengine
```

Passing `--worker_gpu=4` or `--worker_gpu=8` will automatically launch the job
on a machine with 4 or 8 GPUs.

You can additionally pass the `--cloud_mlengine_master_type` flag to select
another kind of machine (see the [docs for
`masterType`](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput)
for your options). If you provide this flag yourself, make sure you pass the
correct value for `--worker_gpu`.

**Note**: `t2t-trainer` currently only supports launching on single machines,
possibly with multiple GPUs. Multi-machine setups are not yet supported out of
the box with the `--cloud_mlengine` flag, though multi-machine training should
in principle work just fine. Contributions/testers welcome.

## `--t2t_usr_dir`

Launching on Cloud ML Engine works with `--t2t_usr_dir` as well, as long as the
directory is fully self-contained (i.e. the imports only refer to other modules
in the directory). If you need additional PyPI dependencies, you can include a
`setup.py` file in your directory (ensure that it uses
`setuptools.find_packages`).
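
For illustration, here is a minimal `setup.py` sketch for such a user
directory; the package name and the extra dependency below are placeholders,
not requirements of Tensor2Tensor:

```
from setuptools import find_packages
from setuptools import setup

setup(
    name='my_t2t_usr_dir',  # Hypothetical name for your usr-dir package.
    version='0.1',
    packages=find_packages(),  # Packages all modules in the directory.
    install_requires=['h5py'],  # Any extra PyPI dependencies you need.
)
```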

# Hyperparameter Tuning

Hyperparameter tuning with `t2t-trainer` and Cloud ML Engine is also a breeze
with `--hparams_range` and the `--autotune_*` flags:

```
t2t-trainer \
--problems=translate_ende_wmt32k \
--model=transformer \
--hparams_set=transformer_base \
--data_dir=$DATA_DIR \
--output_dir=$OUTPUT_DIR \
--cloud_mlengine \
--hparams_range=transformer_base_range \
--autotune_objective='metrics-translate_ende_wmt32k/neg_log_perplexity' \
--autotune_maximize \
--autotune_max_trials=100 \
--autotune_parallel_trials=3
```

The `--hparams_range` flag specifies the search space and should name a
function registered with `@register_ranged_hparams`. That function receives a
`RangedHParams` object and sets search ranges and scales for various
hyperparameters. See `transformer_base_range` in
[`transformer.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py)
for an example.
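
As a rough sketch (the function name `my_model_range` below is hypothetical,
and the exact `RangedHParams` method names follow the current codebase and may
change), a registered range function looks like this:

```
from tensor2tensor.utils import registry


@registry.register_ranged_hparams("my_model_range")
def my_model_range(rhp):
  """Define the search space for a few hyperparameters."""
  # Sample the learning rate on a log scale between 0.3 and 3.0.
  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
  # Choose the warmup schedule from a fixed set of values.
  rhp.set_discrete("learning_rate_warmup_steps", [1000, 2000, 4000, 8000])
  # Search integer depths between 2 and 4 hidden layers.
  rhp.set_int("num_hidden_layers", 2, 4)
```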

The metric name passed as `--autotune_objective` should be exactly what you'd
see in TensorBoard. To minimize a metric, set `--autotune_maximize=False`.

You control how many total trials to run with `--autotune_max_trials` and the
number of jobs to launch in parallel with `--autotune_parallel_trials`.

Happy tuning!
4 changes: 2 additions & 2 deletions setup.py
@@ -5,7 +5,7 @@

setup(
name='tensor2tensor',
version='1.4.3',
version='1.4.4',
description='Tensor2Tensor',
author='Google Inc.',
author_email='[email protected]',
@@ -35,9 +35,9 @@
'flask',
'future',
'gevent',
'google-api-python-client',
'gunicorn',
'gym<=0.9.5', # gym in version 0.9.6 has some temporary issues.
'munch',
'numpy',
'requests',
'scipy',
16 changes: 0 additions & 16 deletions tensor2tensor/bin/t2t-rl-trainer

This file was deleted.

70 changes: 67 additions & 3 deletions tensor2tensor/bin/t2t_trainer.py
@@ -26,7 +26,8 @@

from tensor2tensor import models # pylint: disable=unused-import
from tensor2tensor import problems as problems_lib # pylint: disable=unused-import
from tensor2tensor.utils import cloud
from tensor2tensor.utils import cloud_mlengine
from tensor2tensor.utils import cloud_tpu
from tensor2tensor.utils import decoding
from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import
from tensor2tensor.utils import registry
@@ -81,13 +82,68 @@
flags.DEFINE_bool("cloud_delete_on_done", False,
"Whether to delete the VM and TPU instance when done.")

# Google Cloud ML Engine
flags.DEFINE_bool("cloud_mlengine", False,
"Whether to launch on Cloud ML Engine.")
flags.DEFINE_string("cloud_mlengine_master_type", None,
"Machine type for master on Cloud ML Engine. "
"If provided, overrides default selections based on "
"--worker_gpu. User is responsible for ensuring "
"type is valid and that --worker_gpu matches number of "
"GPUs on machine type. See documentation: "
"https://cloud.google.com/ml-engine/reference/rest/v1/"
"projects.jobs#traininginput")
# Hyperparameter tuning on Cloud ML Engine
# Pass an --hparams_range to enable
flags.DEFINE_string("autotune_objective", None,
"TensorBoard metric name to optimize.")
flags.DEFINE_bool("autotune_maximize", True,
"Whether to maximize (vs. minimize) autotune_objective.")
flags.DEFINE_integer("autotune_max_trials", 10,
"Maximum number of tuning experiments to run.")
flags.DEFINE_integer("autotune_parallel_trials", 1,
"How many trials to run in parallel (will spin up this "
"many jobs.")
# Note that in open-source TensorFlow, the dash gets converted to an underscore,
# so access is FLAGS.job_dir.
flags.DEFINE_string("job-dir", None,
"DO NOT USE. Exists only for Cloud ML Engine to pass in "
"during hyperparameter tuning. Overrides --output_dir.")


def get_problem_name():
problems = FLAGS.problems.split("-")
assert len(problems) == 1
return problems[0]


def set_hparams_from_args(args):
"""Set hparams overrides from unparsed args list."""
if not args:
return

hp_prefix = "--hp_"
tf.logging.info("Found unparsed command-line arguments. Checking if any "
"start with %s and interpreting those as hparams "
"settings.", hp_prefix)

pairs = []
i = 0
while i < len(args):
arg = args[i]
if arg.startswith(hp_prefix):
# Strip only the literal "--hp_" prefix; str.lstrip would also drop any
# leading characters of the hparam name that happen to appear in the prefix.
pairs.append((arg[len(hp_prefix):], args[i+1]))
i += 2
else:
tf.logging.warn("Found unknown flag: %s", arg)
i += 1

as_hparams = ",".join(["%s=%s" % (key, val) for key, val in pairs])
if FLAGS.hparams:
as_hparams = "," + as_hparams
FLAGS.hparams += as_hparams


def create_hparams():
if (FLAGS.cloud_tpu or FLAGS.use_tpu) and "tpu" not in FLAGS.hparams_set:
tf.logging.warn("Not all hyperparameter sets work on TPU. "
@@ -244,23 +300,31 @@ def maybe_cloud_tpu():
"be gs:// paths, i.e. on Google Cloud Storage.")

FLAGS.use_tpu = True
with cloud.cloud_tpu(
with cloud_tpu.cloud_tpu(
FLAGS.cloud_vm_name,
FLAGS.cloud_tpu_name,
delete_on_done=FLAGS.cloud_delete_on_done) as tpu_master:
FLAGS.master = tpu_master
yield


def main(_):
def main(argv):
tf.logging.set_verbosity(tf.logging.INFO)
trainer_lib.set_random_seed(FLAGS.random_seed)
usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
log_registry()

if FLAGS.cloud_mlengine:
return cloud_mlengine.launch()

if FLAGS.generate_data:
generate_data()

if hasattr(FLAGS, "job_dir") and FLAGS.job_dir:
FLAGS.output_dir = FLAGS.job_dir

if argv:
set_hparams_from_args(argv[1:])
hparams = create_hparams()
if is_chief():
save_metadata(hparams)
50 changes: 50 additions & 0 deletions tensor2tensor/bin/t2t_trainer_test.py
@@ -0,0 +1,50 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for t2t_trainer."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

from tensor2tensor.bin import t2t_trainer
from tensor2tensor.utils import trainer_lib_test

import tensorflow as tf

FLAGS = tf.flags.FLAGS


class TrainerTest(tf.test.TestCase):

@classmethod
def setUpClass(cls):
trainer_lib_test.TrainerLibTest.setUpClass()

def testTrain(self):
FLAGS.problems = "tiny_algo"
FLAGS.model = "transformer"
FLAGS.hparams_set = "transformer_tiny"
FLAGS.train_steps = 1
FLAGS.eval_steps = 1
FLAGS.output_dir = tf.test.get_temp_dir()
FLAGS.data_dir = tf.test.get_temp_dir()
t2t_trainer.main(None)


if __name__ == "__main__":
tf.test.main()
3 changes: 1 addition & 2 deletions tensor2tensor/data_generators/librispeech.py
@@ -66,8 +66,7 @@ def _collect_data(directory, input_ext, transcription_ext):
transcript_path = os.path.join(root, transcript)
with open(transcript_path, "r") as transcript_file:
for transcript_line in transcript_file:
line_contents = transcript_line.split(" ", 1)
assert len(line_contents) == 2
line_contents = transcript_line.strip().split(" ", 1)
media_base, label = line_contents
key = os.path.join(root, media_base)
assert key not in data_files
26 changes: 19 additions & 7 deletions tensor2tensor/data_generators/problem.py
@@ -517,16 +517,26 @@ def _maybe_reverse_and_copy(example):
if shuffle_files:
random.shuffle(data_files)
dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
dataset = dataset.apply(
tf.contrib.data.parallel_interleave(
_load_records, sloppy=is_training, cycle_length=8))

if hasattr(tf.contrib.data, "parallel_interleave"):
dataset = dataset.apply(
tf.contrib.data.parallel_interleave(
_load_records, sloppy=is_training, cycle_length=8))
else:
dataset = dataset.interleave(_load_records, cycle_length=8,
block_length=16)

if repeat:
dataset = dataset.repeat()
dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads)
if preprocess:
dataset = dataset.apply(
tf.contrib.data.parallel_interleave(
_preprocess, sloppy=is_training, cycle_length=8))
if hasattr(tf.contrib.data, "parallel_interleave"):
dataset = dataset.apply(
tf.contrib.data.parallel_interleave(
_preprocess, sloppy=is_training, cycle_length=8))
else:
dataset = dataset.interleave(_preprocess, cycle_length=8,
block_length=16)
dataset = dataset.map(
_maybe_reverse_and_copy, num_parallel_calls=num_threads)

@@ -633,6 +643,8 @@ def _dataset_partition(self, mode, config):
num_partitions: an integer
"""
if mode != tf.estimator.ModeKeys.TRAIN or not hasattr(config, "tpu_config"):
# Reset in the case when using TPU but alternating TRAIN and EVAL.
self._next_partition_id = 0
return 0, 1
if config.tpu_config.per_host_input_for_training:
num_partitions = max(config.tpu_config.num_shards // 8, 1)
@@ -670,7 +682,7 @@ def input_fn(self,
partition_id, num_partitions = self._dataset_partition(mode, config)

is_training = mode == tf.estimator.ModeKeys.TRAIN
if config.use_tpu:
if config and config.use_tpu:
num_threads = 64
else:
num_threads = 4 if is_training else 1

