diff --git a/README.md b/README.md
index ae512752564..d6482570fe1 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,10 @@
 
 This repository contains a number of different models implemented in [TensorFlow](https://www.tensorflow.org):
 
-The [official models](official) are a collection of example models that use TensorFlow's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend newer TensorFlow users to start here.
+The [official models](official) are a collection of example models that use TensorFlow 2's high-level APIs. They are intended to be well-maintained, tested, and kept up to date with the latest stable TensorFlow API. They should also be reasonably optimized for fast performance while still being easy to read. We especially recommend that newer TensorFlow users start here.
 
 The [research models](https://github.com/tensorflow/models/tree/master/research) are a large collection of models implemented in TensorFlow by researchers. They are not officially supported or available in release branches; it is up to the individual researchers to maintain the models and/or provide support on issues and pull requests.
 
-The [tutorials folder](tutorials) is a collection of models described in the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
-
 ## Contribution guidelines
 
 If you want to contribute to models, be sure to review the [contribution guidelines](CONTRIBUTING.md).
diff --git a/official/README.md b/official/README.md
index 2ff0f2096cb..359b9d97eeb 100644
--- a/official/README.md
+++ b/official/README.md
@@ -1,6 +1,6 @@
 # TensorFlow Official Models
 
-The TensorFlow official models are a collection of example models that use
+The TensorFlow official models are a collection of models that use
 TensorFlow's high-level APIs. They are intended to be well-maintained, tested,
 and kept up to date with the latest TensorFlow API. They should also be
 reasonably optimized for fast performance while still being easy to read.
@@ -83,7 +83,7 @@ installable Official Models package. This is being tracked in
 *   [bert](nlp/bert): A powerful pre-trained language representation model:
     BERT, which stands for Bidirectional Encoder Representations from
     Transformers.
-*   [transformer](transformer): A transformer model to translate the WMT English
+*   [transformer](nlp/transformer): A transformer model to translate the WMT English
     to German dataset.
 *   [xlnet](nlp/xlnet): XLNet: Generalized Autoregressive Pretraining for
     Language Understanding.
diff --git a/official/benchmark/keras_imagenet_benchmark.py b/official/benchmark/keras_imagenet_benchmark.py
index d46266aa429..d50a17891dc 100644
--- a/official/benchmark/keras_imagenet_benchmark.py
+++ b/official/benchmark/keras_imagenet_benchmark.py
@@ -23,7 +23,7 @@
 
 from official.benchmark import keras_benchmark
 from official.utils.testing import benchmark_wrappers
-from official.vision.image_classification import resnet_imagenet_main
+from official.vision.image_classification.resnet import resnet_imagenet_main
 
 MIN_TOP_1_ACCURACY = 0.76
 MAX_TOP_1_ACCURACY = 0.77
@@ -61,18 +61,6 @@ def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
     super(Resnet50KerasAccuracy, self).__init__(
         output_dir=output_dir, flag_methods=flag_methods)
 
-  def benchmark_graph_8_gpu(self):
-    """Test Keras model with Keras fit/dist_strat and 8 GPUs."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
-
   def benchmark_8_gpu(self):
     """Test Keras model with eager, dist_strat and 8 GPUs."""
     self._setup()
@@ -135,30 +123,6 @@ def benchmark_xla_8_gpu_fp16(self):
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
     self._run_and_report_benchmark()
 
-  def benchmark_8_gpu_mlperf_like(self):
-    """Test similar to the rules for MLPerf 0.5.
-
-    Listed below are reasons this comparison is not to the MLSpec, but this is
-    still a decent directional measurement:
-      - Eval is every 4 epochs and again at the end. ~2 extra times.
-      - Learning rate is not tuned to hit 75%, but we know the model is correct.
-      - We measure total time and MLPerf 0.5 excluded some startup time.
-      - Eval is not on the total set, need to set eval batch_size where
-        8*batch_size/50K is even. 250 is a good number.
-      - Not sure if we are doing any extra or too few steps due to epoch bleed.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_epochs = 61
-    FLAGS.epochs_between_evals = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mlperf_like')
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(top_1_min=0.736)
-
   def benchmark_xla_8_gpu_fp16_dynamic(self):
     """Test Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
     self._setup()
@@ -921,129 +885,353 @@ def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
     # Cache dataset so performance is stable after the first epoch.
     def_flags['training_dataset_cache'] = True
     def_flags['log_steps'] = 100
+    # Note that single-GPU and pure eager tests, which are less likely to be
+    # input bound and are more stable, run for a shorter time by overriding
+    # FLAGS.train_epochs, train_steps, and log_steps in the benchmark
+    # methods, and skip_steps in _run_and_report_benchmark().
 
     super(Resnet50KerasBenchmarkRemoteData, self).__init__(
         output_dir=output_dir, default_flags=def_flags)
 
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    # skip the first epoch for performance measurement.
-    super(Resnet50KerasBenchmarkRemoteData,
-          self)._run_and_report_benchmark(skip_steps=600)
+  def _override_flags_to_run_test_shorter(self):
+    FLAGS.train_epochs = 1
+    FLAGS.train_steps = 300
+    FLAGS.log_steps = 10
 
+  def benchmark_1_gpu_no_dist_strat(self):
+    """Test Keras model with 1 GPU, no distribution strategy."""
+    self._setup()
 
-class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
-  """Trivial model with real data benchmark tests."""
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
 
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
+  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
 
-    def_flags = {}
-    def_flags['use_trivial_model'] = True
-    def_flags['skip_eval'] = True
-    def_flags['report_accuracy_metrics'] = False
-    def_flags['dtype'] = 'fp16'
-    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-    def_flags['train_steps'] = 600
-    def_flags['log_steps'] = 100
-    def_flags['distribution_strategy'] = 'mirrored'
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
 
-    super(TrivialKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=def_flags)
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
+    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
+    self._setup()
 
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
+    FLAGS.batch_size = 64
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
 
-    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
+    self._setup()
 
-  def benchmark_8_gpu_warmup(self):
-    """Dummy test that runs over an epoch to warmup the machine."""
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.run_eagerly = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
+    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
     self._setup()
 
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_steps = 700
+    FLAGS.run_eagerly = True
+    FLAGS.explicit_gpu_placement = True
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_1_gpu_no_dist_strat(self):
+    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
+    FLAGS.batch_size = 96  # BatchNorm is less efficient in legacy graph mode
+    # due to its reliance on v1 cond.
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
   def benchmark_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+    """Test Keras model with 1 GPU."""
     self._setup()
 
     FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
     FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_amp(self):
+    """Test Keras model with 1 GPU with automatic mixed precision."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
-  def benchmark_graph_1_gpu(self):
-    """Test trivial Keras model (input pipeline) with 1 GPU."""
+  def benchmark_xla_1_gpu(self):
+    """Test Keras model with XLA and 1 GPU."""
     self._setup()
 
     FLAGS.num_gpus = 1
-    FLAGS.enable_eager = False
+    FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_xla_1_gpu_amp(self):
+    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
     FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
-  def benchmark_8_gpu(self):
-    """Test trivial Keras model (input pipeline) with 8 GPUs."""
+  def benchmark_1_gpu_fp16(self):
+    """Test Keras model with 1 GPU and fp16."""
     self._setup()
 
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_1_gpu_fp16_dynamic(self):
+    """Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_xla_1_gpu_fp16(self):
+    """Test Keras model with XLA, 1 GPU and fp16."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
-  def benchmark_8_gpu_tweaked(self):
-    """Test trivial Keras model with tuning and 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
     self._setup()
 
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = True
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
-  def benchmark_graph_8_gpu(self):
-    """Test trivial Keras model in legacy graph mode with 8 GPUs."""
+  def benchmark_xla_1_gpu_fp16_dynamic(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
     self._setup()
 
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.loss_scale = 'dynamic'
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_1_gpu(self):
+    """Test Keras model in legacy graph mode with 1 GPU."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_xla_1_gpu(self):
+    """Test Keras model in legacy graph mode with XLA and 1 GPU."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu')
+    FLAGS.batch_size = 128
+    self._override_flags_to_run_test_shorter()
     self._run_and_report_benchmark()
 
-  def benchmark_graph_8_gpu_tweaked(self):
-    """Test trivial Keras model in legacy graph mode with tuning and 8 GPUs."""
+  def benchmark_graph_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU and fp16."""
     self._setup()
 
-    FLAGS.num_gpus = 8
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
+    FLAGS.enable_eager = False
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_xla_1_gpu_fp16(self):
+    """Test Keras model in legacy graph mode with 1 GPU, fp16 and XLA."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.dtype = 'fp16'
     FLAGS.enable_eager = False
     FLAGS.enable_xla = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu_tweaked')
-    FLAGS.batch_size = 256 * 8
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_xla_1_gpu_fp16')
+    FLAGS.batch_size = 256
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model in legacy graph with 1 GPU, fp16, XLA, and tuning."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
+    self._override_flags_to_run_test_shorter()
+    self._run_and_report_benchmark()
+
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
+      # For single GPU and pure eager tests which are less likely to be input
+      # bound and more stable, run for shorter time and use the default
+      # skip_steps.
+      skip_steps = None
+    else:
+      # skip the first epoch for performance measurement.
+      skip_steps = 600
+    super(Resnet50KerasBenchmarkRemoteData,
+          self)._run_and_report_benchmark(skip_steps=skip_steps)
+
+
+class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
+  """Trivial model with real data benchmark tests."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
+
+    def_flags = {}
+    def_flags['use_trivial_model'] = True
+    def_flags['skip_eval'] = True
+    def_flags['report_accuracy_metrics'] = False
+    def_flags['dtype'] = 'fp16'
+    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
+    def_flags['train_steps'] = 600
+    def_flags['log_steps'] = 100
+    def_flags['distribution_strategy'] = 'mirrored'
+
+    super(TrivialKerasBenchmarkReal, self).__init__(
+        output_dir=output_dir,
+        flag_methods=flag_methods,
+        default_flags=def_flags)
+
+  @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = resnet_imagenet_main.run(FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+
+    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
+        stats,
+        wall_time_sec,
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps)
+
+  def benchmark_8_gpu_warmup(self):
+    """Dummy test that runs over an epoch to warmup the machine."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.enable_eager = True
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
+    FLAGS.batch_size = 256 * 8
+    FLAGS.train_steps = 700
     self._run_and_report_benchmark()
 
   def fill_report_object(self, stats):
diff --git a/official/benchmark/models/resnet_cifar_main.py b/official/benchmark/models/resnet_cifar_main.py
index 6b8656c8e62..87cf96d6416 100644
--- a/official/benchmark/models/resnet_cifar_main.py
+++ b/official/benchmark/models/resnet_cifar_main.py
@@ -28,8 +28,8 @@
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
-from official.vision.image_classification import cifar_preprocessing
-from official.vision.image_classification import common
+from official.vision.image_classification.resnet import cifar_preprocessing
+from official.vision.image_classification.resnet import common
 
 
 LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
diff --git a/official/benchmark/models/resnet_cifar_test.py b/official/benchmark/models/resnet_cifar_test.py
index 0997556b4db..6dbb2fa8111 100644
--- a/official/benchmark/models/resnet_cifar_test.py
+++ b/official/benchmark/models/resnet_cifar_test.py
@@ -27,7 +27,7 @@
 from official.benchmark.models import resnet_cifar_main
 from official.utils.misc import keras_utils
 from official.utils.testing import integration
-from official.vision.image_classification import cifar_preprocessing
+from official.vision.image_classification.resnet import cifar_preprocessing
 
 
 class KerasCifarTest(googletest.TestCase):
diff --git a/official/benchmark/resnet_ctl_imagenet_benchmark.py b/official/benchmark/resnet_ctl_imagenet_benchmark.py
index 2a33e396e08..a632bc29289 100644
--- a/official/benchmark/resnet_ctl_imagenet_benchmark.py
+++ b/official/benchmark/resnet_ctl_imagenet_benchmark.py
@@ -22,8 +22,8 @@
 from absl import flags
 import tensorflow as tf
 
-from official.vision.image_classification import common
-from official.vision.image_classification import resnet_ctl_imagenet_main
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import resnet_ctl_imagenet_main
 from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
 from official.utils.testing import benchmark_wrappers
 from official.utils.flags import core as flags_core
@@ -87,10 +87,9 @@ def _report_benchmark(self,
       # first entry in the time_log is start of step 0. The rest of the
       # entries are the end of each step recorded
       time_log = stats['step_timestamp_log']
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      examples_per_sec = num_examples / elapsed
+      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
+      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
+      examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
       metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
 
     if 'avg_exp_per_second' in stats:
diff --git a/official/benchmark/tfhub_memory_usage_benchmark.py b/official/benchmark/tfhub_memory_usage_benchmark.py
index 2dda28cfcf1..4000f9900e8 100644
--- a/official/benchmark/tfhub_memory_usage_benchmark.py
+++ b/official/benchmark/tfhub_memory_usage_benchmark.py
@@ -41,8 +41,14 @@ def __init__(self,
         output_dir=output_dir, default_flags=default_flags, **kwargs)
     if hub_model_handle_list:
       for hub_model_handle in hub_model_handle_list.split(';'):
+        # Converts a model handle of the form
+        # https://tfhub.dev/google/nnlm-en-dim128/1 to a valid Python method
+        # name such as google_nnlm_en_dim128_1.
+        hub_model_method_name = hub_model_handle.replace(
+            'https://tfhub.dev',
+            '').replace('/', '_').replace('-', '_').strip('_')
         setattr(
-            self, 'benchmark_' + hub_model_handle,
+            self, 'benchmark_' + hub_model_method_name,
             functools.partial(self.benchmark_memory_usage, hub_model_handle))
 
   def benchmark_memory_usage(
diff --git a/official/modeling/model_training_utils.py b/official/modeling/model_training_utils.py
index 49a1e2a4afe..2f66d1c922f 100644
--- a/official/modeling/model_training_utils.py
+++ b/official/modeling/model_training_utils.py
@@ -102,6 +102,7 @@ def run_customized_training_loop(
     strategy=None,
     model_fn=None,
     loss_fn=None,
+    scale_loss=True,
     model_dir=None,
     train_input_fn=None,
     steps_per_epoch=None,
@@ -129,6 +130,8 @@ def run_customized_training_loop(
         to be used for initial checkpoint -- if provided.
       loss_fn: Function with signature func(labels, logits) and returns a loss
         tensor.
+      scale_loss: Whether to divide the raw loss by the number of replicas
+        before computing gradients.
       model_dir: Model directory used during training for restoring/saving model
         weights.
       train_input_fn: Function that returns a tf.data.Dataset used for training.
@@ -211,7 +214,7 @@ def run_customized_training_loop(
   if run_eagerly:
     if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
       raise ValueError(
-          'TPUStrategy should not run eagerly as it heavily replies on graph'
+          'TPUStrategy should not run eagerly as it heavily relies on graph'
           ' optimization for the distributed system.')
 
   if eval_input_fn and (eval_steps is None or metric_fn is None):
@@ -223,9 +226,6 @@ def run_customized_training_loop(
         'if `metric_fn` is specified, metric_fn must be a callable.')
 
   total_training_steps = steps_per_epoch * epochs
-
-  # To reduce unnecessary send/receive input pipeline operation, we place input
-  # pipeline ops in worker task.
   train_iterator = _get_input_iterator(train_input_fn, strategy)
 
   with distribution_utils.get_strategy_scope(strategy):
@@ -287,6 +287,12 @@ def _replicated_step(inputs):
       with tf.GradientTape() as tape:
         model_outputs = model(inputs, training=True)
         loss = loss_fn(labels, model_outputs)
+        # Raw loss is used for reporting in metrics/logs.
+        raw_loss = loss
+        if scale_loss:
+          # Scale down the loss so gradients are invariant to replica count.
+          loss = loss / strategy.num_replicas_in_sync
+
       if explicit_allreduce:
         grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                      training_vars,
@@ -303,7 +309,7 @@ def _replicated_step(inputs):
           grads = tape.gradient(loss, training_vars)
         optimizer.apply_gradients(zip(grads, training_vars))
       # For reporting, the metric takes the mean of losses.
-      train_loss_metric.update_state(loss)
+      train_loss_metric.update_state(raw_loss)
       for metric in train_metrics:
         metric.update_state(labels, model_outputs)
 
@@ -324,7 +330,7 @@ def train_steps(iterator, steps):
                          'retracing.')
 
       for _ in tf.range(steps):
-        strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+        strategy.run(_replicated_step, args=(next(iterator),))
 
     def train_single_step(iterator):
       """Performs a distributed training step.
@@ -335,7 +341,7 @@ def train_single_step(iterator):
       Raises:
         ValueError: Any of the arguments or tensor shapes are invalid.
       """
-      strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+      strategy.run(_replicated_step, args=(next(iterator),))
 
     def test_step(iterator):
       """Calculates evaluation metrics on distributed devices."""
@@ -348,7 +354,7 @@ def _test_step_fn(inputs):
         for metric in eval_metrics:
           metric.update_state(labels, model_outputs)
 
-      strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      strategy.run(_test_step_fn, args=(next(iterator),))
 
     if not run_eagerly:
       train_single_step = tf.function(train_single_step)
diff --git a/official/modeling/model_training_utils_test.py b/official/modeling/model_training_utils_test.py
index d03c926469a..28e19b4b6c6 100644
--- a/official/modeling/model_training_utils_test.py
+++ b/official/modeling/model_training_utils_test.py
@@ -233,5 +233,4 @@ def test_train_check_artifacts_non_chief(self, distribution):
 
 
 if __name__ == '__main__':
-  assert tf.version.VERSION.startswith('2.')
   tf.test.main()
diff --git a/official/modeling/training/distributed_executor.py b/official/modeling/training/distributed_executor.py
index a76d89e1159..19062946e8e 100644
--- a/official/modeling/training/distributed_executor.py
+++ b/official/modeling/training/distributed_executor.py
@@ -243,10 +243,10 @@ def train_step(iterator, num_steps):
         raise ValueError('steps should be an Tensor. Python object may cause '
                          'retracing.')
 
-      per_replica_losses = strategy.experimental_run_v2(
+      per_replica_losses = strategy.run(
           _replicated_step, args=(next(iterator),))
       for _ in tf.range(num_steps - 1):
-        per_replica_losses = strategy.experimental_run_v2(
+        per_replica_losses = strategy.run(
             _replicated_step, args=(next(iterator),))
 
       # For reporting, we returns the mean of losses.
@@ -278,7 +278,7 @@ def _test_step_fn(inputs):
         metric.update_state(labels, model_outputs)
         return labels, model_outputs
 
-      return strategy.experimental_run_v2(_test_step_fn, args=(next(iterator),))
+      return strategy.run(_test_step_fn, args=(next(iterator),))
 
     return test_step
 
diff --git a/official/nlp/bert/run_classifier.py b/official/nlp/bert/run_classifier.py
index a8333f3d7a3..2734d028805 100644
--- a/official/nlp/bert/run_classifier.py
+++ b/official/nlp/bert/run_classifier.py
@@ -61,7 +61,7 @@
 FLAGS = flags.FLAGS
 
 
-def get_loss_fn(num_classes, loss_factor=1.0):
+def get_loss_fn(num_classes):
   """Gets the classification loss function."""
 
   def classification_loss_fn(labels, logits):
@@ -72,9 +72,7 @@ def classification_loss_fn(labels, logits):
         tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
     per_example_loss = -tf.reduce_sum(
         tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
-    loss = tf.reduce_mean(per_example_loss)
-    loss *= loss_factor
-    return loss
+    return tf.reduce_mean(per_example_loss)
 
   return classification_loss_fn
 
@@ -135,17 +133,7 @@ def _get_classifier_model():
         use_graph_rewrite=common_flags.use_graph_rewrite())
     return classifier_model, core_model
 
-  # During distributed training, loss used for gradient computation is
-  # summed over from all replicas. When Keras compile/fit() API is used,
-  # the fit() API internally normalizes the loss by dividing the loss by
-  # the number of replicas used for computation. However, when custom
-  # training loop is used this is not done automatically and should be
-  # done manually by the end user.
-  loss_multiplier = 1.0
-  if FLAGS.scale_loss and not use_keras_compile_fit:
-    loss_multiplier = 1.0 / strategy.num_replicas_in_sync
-
-  loss_fn = get_loss_fn(num_classes, loss_factor=loss_multiplier)
+  loss_fn = get_loss_fn(num_classes)
 
   # Defines evaluation metrics function, which will create metrics in the
   # correct device and strategy scope.
@@ -267,7 +255,7 @@ def _test_step_fn(inputs):
       model_outputs = trained_model(inputs, training=False)
       return model_outputs, labels
 
-    outputs, labels = strategy.experimental_run_v2(
+    outputs, labels = strategy.run(
         _test_step_fn, args=(next(iterator),))
     # outputs: current batch logits as a tuple of shard logits
     outputs = tf.nest.map_structure(strategy.experimental_local_results,
diff --git a/official/nlp/bert/run_pretraining.py b/official/nlp/bert/run_pretraining.py
index 5a0c296979c..b7d28fb4354 100644
--- a/official/nlp/bert/run_pretraining.py
+++ b/official/nlp/bert/run_pretraining.py
@@ -74,11 +74,11 @@ def _dataset_fn(ctx=None):
   return _dataset_fn
 
 
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Returns loss function for BERT pretraining."""
 
   def _bert_pretrain_loss_fn(unused_labels, losses, **unused_args):
-    return tf.reduce_mean(losses) * loss_factor
+    return tf.reduce_mean(losses)
 
   return _bert_pretrain_loss_fn
 
@@ -116,9 +116,8 @@ def _get_pretrain_model():
   trained_model = model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_pretrain_model,
-      loss_fn=get_loss_fn(
-          loss_factor=1.0 /
-          strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0),
+      loss_fn=get_loss_fn(),
+      scale_loss=FLAGS.scale_loss,
       model_dir=model_dir,
       train_input_fn=train_input_fn,
       steps_per_epoch=steps_per_epoch,
diff --git a/official/nlp/bert/run_squad_helper.py b/official/nlp/bert/run_squad_helper.py
index 764a429f2d8..4fc62d44ead 100644
--- a/official/nlp/bert/run_squad_helper.py
+++ b/official/nlp/bert/run_squad_helper.py
@@ -24,6 +24,7 @@
 import tensorflow as tf
 
 from official.modeling import model_training_utils
+from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
 from official.nlp.bert import common_flags
@@ -89,8 +90,7 @@ def define_common_squad_flags():
 def squad_loss_fn(start_positions,
                   end_positions,
                   start_logits,
-                  end_logits,
-                  loss_factor=1.0):
+                  end_logits):
   """Returns sparse categorical crossentropy for start/end logits."""
   start_loss = tf.keras.losses.sparse_categorical_crossentropy(
       start_positions, start_logits, from_logits=True)
@@ -98,11 +98,10 @@ def squad_loss_fn(start_positions,
       end_positions, end_logits, from_logits=True)
 
   total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
-  total_loss *= loss_factor
   return total_loss
 
 
-def get_loss_fn(loss_factor=1.0):
+def get_loss_fn():
   """Gets a loss function for squad task."""
 
   def _loss_fn(labels, model_outputs):
@@ -113,8 +112,7 @@ def _loss_fn(labels, model_outputs):
         start_positions,
         end_positions,
         start_logits,
-        end_logits,
-        loss_factor=loss_factor)
+        end_logits)
 
   return _loss_fn
 
@@ -194,8 +192,7 @@ def _replicated_step(inputs):
           start_logits=start_logits,
           end_logits=end_logits)
 
-    outputs = strategy.experimental_run_v2(
-        _replicated_step, args=(next(iterator),))
+    outputs = strategy.run(_replicated_step, args=(next(iterator),))
     return tf.nest.map_structure(strategy.experimental_local_results, outputs)
 
   all_results = []
@@ -219,10 +216,7 @@ def train_squad(strategy,
                  ' strategy.')
   # Enables XLA in Session Config. Should not be set for TPU.
   keras_utils.set_config_v2(FLAGS.enable_xla)
-
-  use_float16 = common_flags.use_float16()
-  if use_float16:
-    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+  performance.set_mixed_precision_policy(common_flags.dtype())
 
   epochs = FLAGS.num_train_epochs
   num_train_examples = input_meta_data['train_data_size']
@@ -242,33 +236,16 @@ def _get_squad_model():
         max_seq_length,
         hub_module_url=FLAGS.hub_module_url,
         hub_module_trainable=FLAGS.hub_module_trainable)
-    squad_model.optimizer = optimization.create_optimizer(
-        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
-    if use_float16:
-      # Wraps optimizer with a LossScaleOptimizer. This is done automatically
-      # in compile() with the "mixed_float16" policy, but since we do not call
-      # compile(), we must wrap the optimizer manually.
-      squad_model.optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              squad_model.optimizer, loss_scale=common_flags.get_loss_scale()))
-    if FLAGS.fp16_implementation == 'graph_rewrite':
-      # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
-      # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
-      # which will ensure tf.compat.v2.keras.mixed_precision and
-      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
-      # up.
-      squad_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-          squad_model.optimizer)
+    optimizer = optimization.create_optimizer(FLAGS.learning_rate,
+                                              steps_per_epoch * epochs,
+                                              warmup_steps)
+
+    squad_model.optimizer = performance.configure_optimizer(
+        optimizer,
+        use_float16=common_flags.use_float16(),
+        use_graph_rewrite=common_flags.use_graph_rewrite())
     return squad_model, core_model
 
-  # The original BERT model does not scale the loss by
-  # 1/num_replicas_in_sync. It could be an accident. So, in order to use
-  # the same hyper parameter, we do the same thing here by keeping each
-  # replica loss as it is.
-  loss_fn = get_loss_fn(
-      loss_factor=1.0 /
-      strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)
-
   # If explicit_allreduce = True, apply_gradients() no longer implicitly
   # allreduce gradients, users manually allreduce gradient and pass the
   # allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will be
@@ -281,7 +258,7 @@ def clip_by_global_norm_callback(grads_and_vars):
   model_training_utils.run_customized_training_loop(
       strategy=strategy,
       model_fn=_get_squad_model,
-      loss_fn=loss_fn,
+      loss_fn=get_loss_fn(),
       model_dir=FLAGS.model_dir,
       steps_per_epoch=steps_per_epoch,
       steps_per_loop=FLAGS.steps_per_loop,
diff --git a/official/nlp/bert/tf2_encoder_checkpoint_converter.py b/official/nlp/bert/tf2_encoder_checkpoint_converter.py
index 68fea9dc284..203b238a77e 100644
--- a/official/nlp/bert/tf2_encoder_checkpoint_converter.py
+++ b/official/nlp/bert/tf2_encoder_checkpoint_converter.py
@@ -98,7 +98,6 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint):
 
 
 def main(_):
-  tf.enable_v2_behavior()
   output_path = FLAGS.converted_checkpoint_path
   v1_checkpoint = FLAGS.checkpoint_to_convert
   bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
diff --git a/official/nlp/data/classifier_data_lib.py b/official/nlp/data/classifier_data_lib.py
index bccabf9f9f2..2bed942d345 100644
--- a/official/nlp/data/classifier_data_lib.py
+++ b/official/nlp/data/classifier_data_lib.py
@@ -24,6 +24,7 @@
 
 from absl import logging
 import tensorflow as tf
+import tensorflow_datasets as tfds
 
 from official.nlp.bert import tokenization
 
@@ -386,6 +387,99 @@ def _create_examples(self, lines, set_type):
     return examples
 
 
+class TfdsProcessor(DataProcessor):
+  """Processor for generic text classification TFDS data set.
+
+  The TFDS parameters are expected to be provided in the tfds_params string, in
+  a comma-separated list of parameter assignments.
+  Examples:
+    tfds_params="dataset=scicite,text_key=string"
+    tfds_params="dataset=imdb_reviews,test_split=,dev_split=test"
+    tfds_params="dataset=glue/cola,text_key=sentence"
+    tfds_params="dataset=glue/sst2,text_key=sentence"
+    tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
+    tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
+  Possible parameters (please refer to the documentation of Tensorflow Datasets
+  (TFDS) for the meaning of individual parameters):
+    dataset: Required dataset name (potentially with subset and version number).
+    data_dir: Optional TFDS source root directory.
+    train_split: Name of the train split (defaults to `train`).
+    dev_split: Name of the dev split (defaults to `validation`).
+    test_split: Name of the test split (defaults to `test`).
+    text_key: Key of the text_a feature (defaults to `text`).
+    text_b_key: Key of the second text feature if available.
+    label_key: Key of the label feature (defaults to `label`).
+    test_text_key: Key of the text feature to use in test set.
+    test_text_b_key: Key of the second text feature to use in test set.
+    test_label: String to be used as the label for all test examples.
+  """
+
+  def __init__(self, tfds_params,
+               process_text_fn=tokenization.convert_to_unicode):
+    super(TfdsProcessor, self).__init__(process_text_fn)
+    self._process_tfds_params_str(tfds_params)
+    self.dataset, info = tfds.load(self.dataset_name, data_dir=self.data_dir,
+                                   with_info=True)
+    self._labels = list(range(info.features[self.label_key].num_classes))
+
+  def _process_tfds_params_str(self, params_str):
+    """Extracts TFDS parameters from a comma-separated assignments string."""
+    tuples = [x.split("=") for x in params_str.split(",")]
+    d = {k.strip(): v.strip() for k, v in tuples}
+    self.dataset_name = d["dataset"]  # Required.
+    self.data_dir = d.get("data_dir", None)
+    self.train_split = d.get("train_split", "train")
+    self.dev_split = d.get("dev_split", "validation")
+    self.test_split = d.get("test_split", "test")
+    self.text_key = d.get("text_key", "text")
+    self.text_b_key = d.get("text_b_key", None)
+    self.label_key = d.get("label_key", "label")
+    self.test_text_key = d.get("test_text_key", self.text_key)
+    self.test_text_b_key = d.get("test_text_b_key", self.text_b_key)
+    self.test_label = d.get("test_label", "test_example")
+
+  def get_train_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.train_split, "train")
+
+  def get_dev_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.dev_split, "dev")
+
+  def get_test_examples(self, data_dir):
+    assert data_dir is None
+    return self._create_examples(self.test_split, "test")
+
+  def get_labels(self):
+    return self._labels
+
+  def get_processor_name(self):
+    return "TFDS_" + self.dataset_name
+
+  def _create_examples(self, split_name, set_type):
+    """Creates examples for the train, dev, or test split."""
+    if split_name not in self.dataset:
+      raise ValueError("Split {} not available.".format(split_name))
+    dataset = self.dataset[split_name].as_numpy_iterator()
+    examples = []
+    text_b = None
+    for i, example in enumerate(dataset):
+      guid = "%s-%s" % (set_type, i)
+      if set_type == "test":
+        text_a = self.process_text_fn(example[self.test_text_key])
+        if self.test_text_b_key:
+          text_b = self.process_text_fn(example[self.test_text_b_key])
+        label = self.test_label
+      else:
+        text_a = self.process_text_fn(example[self.text_key])
+        if self.text_b_key:
+          text_b = self.process_text_fn(example[self.text_b_key])
+        label = int(example[self.label_key])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
 def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
diff --git a/official/nlp/data/create_finetuning_data.py b/official/nlp/data/create_finetuning_data.py
index d7b775b9685..939286e19b5 100644
--- a/official/nlp/data/create_finetuning_data.py
+++ b/official/nlp/data/create_finetuning_data.py
@@ -104,22 +104,16 @@
     "or sentence_piece tokenizer. Canonical BERT uses word_piece tokenizer, "
     "while ALBERT uses sentence_piece tokenizer.")
 
+flags.DEFINE_string("tfds_params", "",
+                    "Comma-separated list of TFDS parameter assigments for "
+                    "generic classfication data import (for more details "
+                    "see the TfdsProcessor class documentation).")
+
 
 def generate_classifier_dataset():
   """Generates classifier dataset and returns input meta data."""
-  assert FLAGS.input_data_dir and FLAGS.classification_task_name
-
-  processors = {
-      "cola": classifier_data_lib.ColaProcessor,
-      "mnli": classifier_data_lib.MnliProcessor,
-      "mrpc": classifier_data_lib.MrpcProcessor,
-      "qnli": classifier_data_lib.QnliProcessor,
-      "sst-2": classifier_data_lib.SstProcessor,
-      "xnli": classifier_data_lib.XnliProcessor,
-  }
-  task_name = FLAGS.classification_task_name.lower()
-  if task_name not in processors:
-    raise ValueError("Task not found: %s" % (task_name))
+  assert (FLAGS.input_data_dir and FLAGS.classification_task_name
+          or FLAGS.tfds_params)
 
   if FLAGS.tokenizer_impl == "word_piece":
     tokenizer = tokenization.FullTokenizer(
@@ -131,14 +125,38 @@ def generate_classifier_dataset():
     processor_text_fn = functools.partial(
         tokenization.preprocess_text, lower=FLAGS.do_lower_case)
 
-  processor = processors[task_name](processor_text_fn)
-  return classifier_data_lib.generate_tf_record_from_data_file(
-      processor,
-      FLAGS.input_data_dir,
-      tokenizer,
-      train_data_output_path=FLAGS.train_data_output_path,
-      eval_data_output_path=FLAGS.eval_data_output_path,
-      max_seq_length=FLAGS.max_seq_length)
+  if FLAGS.tfds_params:
+    processor = classifier_data_lib.TfdsProcessor(
+        tfds_params=FLAGS.tfds_params,
+        process_text_fn=processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        None,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
+  else:
+    processors = {
+        "cola": classifier_data_lib.ColaProcessor,
+        "mnli": classifier_data_lib.MnliProcessor,
+        "mrpc": classifier_data_lib.MrpcProcessor,
+        "qnli": classifier_data_lib.QnliProcessor,
+        "sst-2": classifier_data_lib.SstProcessor,
+        "xnli": classifier_data_lib.XnliProcessor,
+    }
+    task_name = FLAGS.classification_task_name.lower()
+    if task_name not in processors:
+      raise ValueError("Task not found: %s" % (task_name))
+
+    processor = processors[task_name](processor_text_fn)
+    return classifier_data_lib.generate_tf_record_from_data_file(
+        processor,
+        FLAGS.input_data_dir,
+        tokenizer,
+        train_data_output_path=FLAGS.train_data_output_path,
+        eval_data_output_path=FLAGS.eval_data_output_path,
+        max_seq_length=FLAGS.max_seq_length)
 
 
 def generate_squad_dataset():
diff --git a/official/nlp/transformer/compute_bleu.py b/official/nlp/transformer/compute_bleu.py
index d8729a55e63..92d54c30ecb 100644
--- a/official/nlp/transformer/compute_bleu.py
+++ b/official/nlp/transformer/compute_bleu.py
@@ -47,8 +47,10 @@ def __init__(self):
     self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
 
   def property_chars(self, prefix):
-    return "".join(six.unichr(x) for x in range(sys.maxunicode)
-                   if unicodedata.category(six.unichr(x)).startswith(prefix))
+    return "".join(
+        six.unichr(x)
+        for x in range(sys.maxunicode)
+        if unicodedata.category(six.unichr(x)).startswith(prefix))
 
 
 uregex = UnicodeRegex()
@@ -92,9 +94,10 @@ def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
       tf.io.gfile.GFile(hyp_filename).read()).strip().splitlines()
 
   if len(ref_lines) != len(hyp_lines):
-    raise ValueError("Reference and translation files have different number of "
-                     "lines. If training only a few steps (100-200), the "
-                     "translation may be empty.")
+    raise ValueError(
+        "Reference and translation files have different number of "
+        "lines (%d VS %d). If training only a few steps (100-200), the "
+        "translation may be empty." % (len(ref_lines), len(hyp_lines)))
   if not case_sensitive:
     ref_lines = [x.lower() for x in ref_lines]
     hyp_lines = [x.lower() for x in hyp_lines]
@@ -116,18 +119,23 @@ def main(unused_argv):
 def define_compute_bleu_flags():
   """Add flags for computing BLEU score."""
   flags.DEFINE_string(
-      name="translation", default=None,
+      name="translation",
+      default=None,
       help=flags_core.help_wrap("File containing translated text."))
   flags.mark_flag_as_required("translation")
 
   flags.DEFINE_string(
-      name="reference", default=None,
+      name="reference",
+      default=None,
       help=flags_core.help_wrap("File containing reference translation."))
   flags.mark_flag_as_required("reference")
 
   flags.DEFINE_enum(
-      name="bleu_variant", short_name="bv", default="both",
-      enum_values=["both", "uncased", "cased"], case_sensitive=False,
+      name="bleu_variant",
+      short_name="bv",
+      default="both",
+      enum_values=["both", "uncased", "cased"],
+      case_sensitive=False,
       help=flags_core.help_wrap(
           "Specify one or more BLEU variants to calculate. Variants: \"cased\""
           ", \"uncased\", or \"both\"."))
diff --git a/official/nlp/transformer/transformer_main.py b/official/nlp/transformer/transformer_main.py
index ff91153f858..126262fc6a7 100644
--- a/official/nlp/transformer/transformer_main.py
+++ b/official/nlp/transformer/transformer_main.py
@@ -280,7 +280,7 @@ def _step_fn(inputs):
 
       for _ in tf.range(steps):
         train_loss_metric.reset_states()
-        self.distribution_strategy.experimental_run_v2(
+        self.distribution_strategy.run(
             _step_fn, args=(next(iterator),))
 
     cased_score, uncased_score = None, None
diff --git a/official/nlp/transformer/translate.py b/official/nlp/transformer/translate.py
index 0cda3139bd4..8a377226b82 100644
--- a/official/nlp/transformer/translate.py
+++ b/official/nlp/transformer/translate.py
@@ -132,7 +132,7 @@ def _step_fn(inputs):
       val_outputs, _ = model([val_inputs], training=False)
       return tag, val_outputs
 
-    return distribution_strategy.experimental_run_v2(_step_fn, args=(inputs,))
+    return distribution_strategy.run(_step_fn, args=(inputs,))
 
   translations = []
   if distribution_strategy:
@@ -151,7 +151,7 @@ def text_as_per_replica():
         replica_id = replica_context.replica_id_in_sync_group
         return replica_id, text[replica_id]
 
-      text = distribution_strategy.experimental_run_v2(text_as_per_replica)
+      text = distribution_strategy.run(text_as_per_replica)
       outputs = distribution_strategy.experimental_local_results(
           predict_step(text))
       tags, unordered_val_outputs = outputs[0]
diff --git a/official/nlp/transformer/utils/tokenizer.py b/official/nlp/transformer/utils/tokenizer.py
index 52c7ddc27b8..9a3b472fe58 100644
--- a/official/nlp/transformer/utils/tokenizer.py
+++ b/official/nlp/transformer/utils/tokenizer.py
@@ -29,6 +29,8 @@
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
+
+# pylint: disable=g-complex-comprehension
 PAD = "<pad>"
 PAD_ID = 0
 EOS = "<EOS>"
@@ -46,27 +48,36 @@
 
 _UNDEFINED_UNICODE = u"\u3013"
 
+
+def alphanumeric_char_set():
+  return set(
+      six.unichr(i)
+      for i in xrange(sys.maxunicode)
+      if (unicodedata.category(six.unichr(i)).startswith("L") or
+          unicodedata.category(six.unichr(i)).startswith("N")))
+
+
 # Set contains all letter and number characters.
-_ALPHANUMERIC_CHAR_SET = set(
-    six.unichr(i) for i in xrange(sys.maxunicode)
-    if (unicodedata.category(six.unichr(i)).startswith("L") or
-        unicodedata.category(six.unichr(i)).startswith("N")))
+_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
 
 # min_count is the minimum number of times a subtoken must appear in the data
 # before before it is added to the vocabulary. The value is found using binary
 # search to obtain the target vocabulary size.
-_MIN_MIN_COUNT = 1     # min value to use when binary searching for min_count
+_MIN_MIN_COUNT = 1  # min value to use when binary searching for min_count
 _MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
 
 
 class Subtokenizer(object):
   """Encodes and decodes strings to/from integer IDs."""
 
-  def __init__(self, vocab_file, reserved_tokens=None):
+  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
     """Initializes class, creating a vocab file if data_files is provided."""
     logging.info("Initializing Subtokenizer from file %s." %
                               vocab_file)
 
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
+
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
 
@@ -79,13 +90,20 @@ def __init__(self, vocab_file, reserved_tokens=None):
       self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
 
     # Create cache to speed up subtokenization
-    self._cache_size = 2 ** 20
+    self._cache_size = 2**20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
 
   @staticmethod
-  def init_from_files(
-      vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+  def init_from_files(vocab_file,
+                      files,
+                      target_vocab_size,
+                      threshold,
+                      min_count=None,
+                      file_byte_limit=1e6,
+                      reserved_tokens=None,
+                      correct_strip=True,
+                      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.
 
     Args:
@@ -102,10 +120,13 @@ def init_from_files(
       reserved_tokens: List of string tokens that are guaranteed to be at the
         beginning of the subtoken vocabulary list.
       correct_strip: Whether to convert text to unicode before strip.
+      master_char_set: Chars defining token boundaries; None uses alphanumerics.
 
     Returns:
       Subtokenizer object
     """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
 
@@ -113,7 +134,8 @@ def init_from_files(
       logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
+                                   master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -121,15 +143,18 @@ def init_from_files(
       logging.info("Generated vocabulary with %d subtokens." %
                                 len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)
 
   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(
+        native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
       ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
+      assert EOS in self.subtoken_list, \
+          "Can't append 'EOS' because it is not in list of known subtokens."
       ret.append(EOS_ID)
     return ret
 
@@ -162,13 +187,14 @@ def decode(self, subtokens):
         "Subtokens argument passed into decode() must be a list of integers.")
 
     return _unicode_to_native(
-        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+        _join_tokens_to_string(
+            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
 
   def _subtoken_ids_to_tokens(self, subtokens):
     """Convert list of int subtoken ids to a list of string tokens."""
     escaped_tokens = "".join([
-        self.subtoken_list[s] for s in subtokens
-        if s < len(self.subtoken_list)])
+        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
+    ])
     escaped_tokens = escaped_tokens.split("_")
 
     # All tokens in the vocabulary list have been escaped (see _escape_token())
@@ -205,7 +231,7 @@ def _load_vocab_file(vocab_file, reserved_tokens=None):
 
 def native_to_unicode(s):
   """Convert string to unicode (required in Python 2)."""
-  try:               # Python 2
+  try:  # Python 2
     return s if isinstance(s, unicode) else s.decode("utf-8")
   except NameError:  # Python 3
     return s
@@ -213,22 +239,22 @@ def native_to_unicode(s):
 
 def _unicode_to_native(s):
   """Convert string from unicode to native format (required in Python 2)."""
-  try:               # Python 2
+  try:  # Python 2
     return s.encode("utf-8") if isinstance(s, unicode) else s
   except NameError:  # Python 3
     return s
 
 
-def _split_string_to_tokens(text):
+def _split_string_to_tokens(text, master_char_set):
   """Splits text to a list of string tokens."""
   if not text:
     return []
   ret = []
   token_start = 0
   # Classify each character in the input string
-  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  is_master = [c in master_char_set for c in text]
   for pos in xrange(1, len(text)):
-    if is_alnum[pos] != is_alnum[pos - 1]:
+    if is_master[pos] != is_master[pos - 1]:
       token = text[token_start:pos]
       if token != u" " or token_start == 0:
         ret.append(token)
@@ -238,12 +264,12 @@ def _split_string_to_tokens(text):
   return ret
 
 
-def _join_tokens_to_string(tokens):
+def _join_tokens_to_string(tokens, master_char_set):
   """Join a list of string tokens into a single string."""
-  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  token_is_master = [t[0] in master_char_set for t in tokens]
   ret = []
   for i, token in enumerate(tokens):
-    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
       ret.append(u" ")
     ret.append(token)
   return "".join(ret)
@@ -325,7 +351,10 @@ def match(m):
   return _UNESCAPE_REGEX.sub(match, token)
 
 
-def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
+def _count_tokens(files,
+                  file_byte_limit=1e6,
+                  correct_strip=True,
+                  master_char_set=None):
   """Return token counts of words in the files.
 
   Samples file_byte_limit bytes from each file, and counts the words that appear
@@ -338,11 +367,15 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
       vocabulary generation for PY2. Sets correct_strip to False in PY2 to
       reproduce previous common public result. Sets correct_strip to True will
       let PY2 and PY3 get a consistent vocabulary.
+    master_char_set: Chars defining token boundaries; None uses alphanumerics.
 
   Returns:
     Dictionary mapping tokens to the number of times they appear in the sampled
     lines from the files.
   """
+  if master_char_set is None:
+    master_char_set = _ALPHANUMERIC_CHAR_SET
+
   token_counts = collections.defaultdict(int)
 
   for filepath in files:
@@ -363,7 +396,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
           counter = 0
 
           # Add words to token counts
-          for token in _split_string_to_tokens(native_to_unicode(line)):
+          for token in _split_string_to_tokens(
+              native_to_unicode(line), master_char_set):
             token_counts[token] += 1
   return token_counts
 
@@ -395,9 +429,12 @@ def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
   return ret
 
 
-def _generate_subtokens_with_target_vocab_size(
-    token_counts, alphabet, target_size, threshold, min_count=None,
-    reserved_tokens=None):
+def _generate_subtokens_with_target_vocab_size(token_counts,
+                                               alphabet,
+                                               target_size,
+                                               threshold,
+                                               min_count=None,
+                                               reserved_tokens=None):
   """Generate subtoken vocabulary close to the target size."""
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
@@ -450,8 +487,8 @@ def _generate_alphabet_dict(iterable, reserved_tokens=None):
   return alphabet
 
 
-def _count_and_gen_subtokens(
-    token_counts, alphabet, subtoken_dict, max_subtoken_length):
+def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
+                             max_subtoken_length):
   """Count number of times subtokens appear, and generate new subtokens.
 
   Args:
@@ -469,8 +506,8 @@ def _count_and_gen_subtokens(
   subtoken_counts = collections.defaultdict(int)
   for token, count in six.iteritems(token_counts):
     token = _escape_token(token, alphabet)
-    subtokens = _split_token_to_subtokens(
-        token, subtoken_dict, max_subtoken_length)
+    subtokens = _split_token_to_subtokens(token, subtoken_dict,
+                                          max_subtoken_length)
 
     # Generate new subtokens by taking substrings from token.
     start = 0
@@ -504,8 +541,10 @@ def _filter_and_bucket_subtokens(subtoken_counts, min_count):
   return subtoken_buckets
 
 
-def _gen_new_subtoken_list(
-    subtoken_counts, min_count, alphabet, reserved_tokens=None):
+def _gen_new_subtoken_list(subtoken_counts,
+                           min_count,
+                           alphabet,
+                           reserved_tokens=None):
   """Generate candidate subtokens ordered by count, and new max subtoken length.
 
   Add subtokens to the candiate list in order of length (longest subtokens
@@ -576,9 +615,11 @@ def _gen_new_subtoken_list(
   return subtoken_list, max_subtoken_length
 
 
-def _generate_subtokens(
-    token_counts, alphabet, min_count, num_iterations=4,
-    reserved_tokens=None):
+def _generate_subtokens(token_counts,
+                        alphabet,
+                        min_count,
+                        num_iterations=4,
+                        reserved_tokens=None):
   """Create a list of subtokens in decreasing order of frequency.
 
   Args:
@@ -610,8 +651,9 @@ def _generate_subtokens(
 
     # Create dict mapping subtoken->count, with additional subtokens created
     # from substrings taken from the tokens.
-    subtoken_counts = _count_and_gen_subtokens(
-        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    subtoken_counts = _count_and_gen_subtokens(token_counts, alphabet,
+                                               subtoken_dict,
+                                               max_subtoken_length)
 
     # Generate new list of subtokens sorted by subtoken count.
     subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
diff --git a/official/nlp/transformer/utils/tokenizer_test.py b/official/nlp/transformer/utils/tokenizer_test.py
index 6ca0495e656..307398fd3ae 100644
--- a/official/nlp/transformer/utils/tokenizer_test.py
+++ b/official/nlp/transformer/utils/tokenizer_test.py
@@ -59,13 +59,15 @@ class StringHelperTest(tf.test.TestCase):
   def test_split_string_to_tokens(self):
     text = "test? testing 123."
 
-    tokens = tokenizer._split_string_to_tokens(text)
+    tokens = tokenizer._split_string_to_tokens(text,
+                                               tokenizer._ALPHANUMERIC_CHAR_SET)
     self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
 
   def test_join_tokens_to_string(self):
     tokens = ["test", "? ", "testing", "123", "."]
 
-    s = tokenizer._join_tokens_to_string(tokens)
+    s = tokenizer._join_tokens_to_string(tokens,
+                                         tokenizer._ALPHANUMERIC_CHAR_SET)
     self.assertEqual("test? testing 123.", s)
 
   def test_escape_token(self):
@@ -79,8 +81,7 @@ def test_unescape_token(self):
     escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
 
     unescaped_token = tokenizer._unescape_token(escaped_token)
-    self.assertEqual(
-        "Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
+    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
 
   def test_list_to_index_dict(self):
     lst = ["test", "strings"]
@@ -93,8 +94,8 @@ def test_split_token_to_subtokens(self):
     subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
     max_subtoken_length = 2
 
-    subtokens = tokenizer._split_token_to_subtokens(
-        token, subtoken_dict, max_subtoken_length)
+    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
+                                                    max_subtoken_length)
     self.assertEqual(["ab", "c"], subtokens)
 
   def test_generate_alphabet_dict(self):
@@ -124,12 +125,28 @@ def test_count_and_gen_subtokens(self):
 
     self.assertIsInstance(subtoken_counts, collections.defaultdict)
     self.assertDictEqual(
-        {"a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
-         "abc": 5, "bc_": 5, "abc_": 5}, subtoken_counts)
+        {
+            "a": 5,
+            "b": 5,
+            "c": 5,
+            "_": 5,
+            "ab": 5,
+            "bc": 5,
+            "c_": 5,
+            "abc": 5,
+            "bc_": 5,
+            "abc_": 5
+        }, subtoken_counts)
 
   def test_filter_and_bucket_subtokens(self):
-    subtoken_counts = collections.defaultdict(
-        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
+    subtoken_counts = collections.defaultdict(int, {
+        "a": 2,
+        "b": 4,
+        "c": 1,
+        "ab": 6,
+        "ac": 3,
+        "abbc": 5
+    })
     min_count = 3
 
     subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
@@ -142,8 +159,12 @@ def test_filter_and_bucket_subtokens(self):
     self.assertEqual(set(["abbc"]), subtoken_buckets[4])
 
   def test_gen_new_subtoken_list(self):
-    subtoken_counts = collections.defaultdict(
-        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
+    subtoken_counts = collections.defaultdict(int, {
+        "translate": 10,
+        "t": 40,
+        "tr": 16,
+        "tra": 12
+    })
     min_count = 5
     alphabet = set("translate")
     reserved_tokens = ["reserved", "tokens"]
@@ -167,8 +188,9 @@ def test_generate_subtokens(self):
     num_iterations = 1
     reserved_tokens = ["reserved", "tokens"]
 
-    vocab_list = tokenizer._generate_subtokens(
-        token_counts, alphabet, min_count, num_iterations, reserved_tokens)
+    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
+                                               min_count, num_iterations,
+                                               reserved_tokens)
 
     # Check that reserved tokens are at the front of the list
     self.assertEqual(vocab_list[:2], reserved_tokens)
diff --git a/official/nlp/xlnet/run_classifier.py b/official/nlp/xlnet/run_classifier.py
index f8f9f1be49b..79a27f244d8 100644
--- a/official/nlp/xlnet/run_classifier.py
+++ b/official/nlp/xlnet/run_classifier.py
@@ -87,7 +87,7 @@ def _test_step_fn(inputs):
   @tf.function
   def _run_evaluation(test_iterator):
     """Runs validation steps."""
-    logits, labels, masks = strategy.experimental_run_v2(
+    logits, labels, masks = strategy.run(
         _test_step_fn, args=(next(test_iterator),))
     return logits, labels, masks
 
diff --git a/official/nlp/xlnet/run_squad.py b/official/nlp/xlnet/run_squad.py
index fce19444c1e..013893f1a28 100644
--- a/official/nlp/xlnet/run_squad.py
+++ b/official/nlp/xlnet/run_squad.py
@@ -130,7 +130,7 @@ def _test_step_fn(inputs):
   @tf.function
   def _run_evaluation(test_iterator):
     """Runs validation steps."""
-    res, unique_ids = strategy.experimental_run_v2(
+    res, unique_ids = strategy.run(
         _test_step_fn, args=(next(test_iterator),))
     return res, unique_ids
 
diff --git a/official/nlp/xlnet/training_utils.py b/official/nlp/xlnet/training_utils.py
index 2f88c850bee..627fa975113 100644
--- a/official/nlp/xlnet/training_utils.py
+++ b/official/nlp/xlnet/training_utils.py
@@ -222,16 +222,16 @@ def cache_fn():
         return mems
 
       if input_meta_data["mem_len"] > 0:
-        mem = strategy.experimental_run_v2(cache_fn)
+        mem = strategy.run(cache_fn)
         for _ in tf.range(steps):
-          mem = strategy.experimental_run_v2(
+          mem = strategy.run(
               _replicated_step, args=(
                   next(iterator),
                   mem,
               ))
       else:
         for _ in tf.range(steps):
-          strategy.experimental_run_v2(_replicated_step, args=(next(iterator),))
+          strategy.run(_replicated_step, args=(next(iterator),))
 
     if not run_eagerly:
       train_steps = tf.function(train_steps)
diff --git a/official/pip_package/setup.py b/official/pip_package/setup.py
index 83d9fa1dbc7..bfd95a1d3cd 100644
--- a/official/pip_package/setup.py
+++ b/official/pip_package/setup.py
@@ -13,30 +13,76 @@
 # limitations under the License.
 # ==============================================================================
 """Sets up TensorFlow Official Models."""
+import datetime
+import os
+import sys
+
 from setuptools import find_packages
 from setuptools import setup
 
+version = '2.2.0'
+
+project_name = 'tf-models-official'
+
+long_description = """The TensorFlow official models are a collection of
+models that use TensorFlow's high-level APIs.
+They are intended to be well-maintained, tested, and kept up to date with the
+latest TensorFlow API. They should also be reasonably optimized for fast
+performance while still being easy to read."""
+
+if '--project_name' in sys.argv:
+  project_name_idx = sys.argv.index('--project_name')
+  project_name = sys.argv[project_name_idx + 1]
+  sys.argv.remove('--project_name')
+  sys.argv.pop(project_name_idx)
+
+
+def _get_requirements():
+  """Parses requirements.txt file."""
+  install_requires_tmp = []
+  dependency_links_tmp = []
+  with open(
+      os.path.join(os.path.dirname(__file__), '../requirements.txt'), 'r') as f:
+    for line in f:
+      package_name = line.strip()
+      if package_name.startswith('-e '):
+        dependency_links_tmp.append(package_name[3:].strip())
+      else:
+        install_requires_tmp.append(package_name)
+  return install_requires_tmp, dependency_links_tmp
+
+install_requires, dependency_links = _get_requirements()
+
+if project_name == 'tf-models-nightly':
+  version += '.dev' + datetime.datetime.now().strftime('%Y%m%d')
+  install_requires.append('tf-nightly')
+else:
+  install_requires.append('tensorflow>=2.1.0')
+
+print('install_requires: ', install_requires)
+print('dependency_links: ', dependency_links)
+
 setup(
-    name='tf-models-official',
-    version='0.0.3.dev1',
+    name=project_name,
+    version=version,
     description='TensorFlow Official Models',
+    long_description=long_description,
     author='Google Inc.',
     author_email='no-reply@google.com',
     url='https://github.com/tensorflow/models',
     license='Apache 2.0',
-    packages=find_packages(exclude=["research*", "tutorials*", "samples*"]),
+    packages=find_packages(exclude=[
+        'research*',
+        'tutorials*',
+        'samples*',
+        'official.r1*',
+        'official.pip_package*',
+        'official.benchmark*',
+    ]),
     exclude_package_data={
-            '': [
-                '*_test.py',
-            ],
-        },
-    install_requires=[
-        'six',
-    ],
-    extras_require={
-        'tensorflow': ['tensorflow>=2.0.0'],
-        'tensorflow_gpu': ['tensorflow-gpu>=2.0.0'],
-        'tensorflow-hub': ['tensorflow-hub>=0.6.0'],
+        '': ['*_test.py',],
     },
+    install_requires=install_requires,
+    dependency_links=dependency_links,
     python_requires='>=3.6',
 )
diff --git a/official/r1/README.md b/official/r1/README.md
index 0871718bb0e..203b29580d8 100644
--- a/official/r1/README.md
+++ b/official/r1/README.md
@@ -3,6 +3,12 @@
 The R1 folder contains legacy model implmentation and models that will not
 update to TensorFlow 2.x. They do not have solid performance tracking.
 
+**Note: models will be removed from the master branch by 2020/06.**
+
+After removal, you can still access these legacy models in previously
+released tags, e.g. [v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0).
+
+
 ## Legacy model implmentation
 
 Transformer and MNIST implementation uses pure TF 1.x TF-Estimator.
diff --git a/official/r1/mnist/mnist.py b/official/r1/mnist/mnist.py
index 9cc5889c96d..4abbec2f3e9 100644
--- a/official/r1/mnist/mnist.py
+++ b/official/r1/mnist/mnist.py
@@ -38,8 +38,6 @@ def create_model(data_format):
 
   Network structure is equivalent to:
   https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
-  and
-  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
   But uses the tf.keras API.
 
diff --git a/official/r1/wide_deep/movielens_main.py b/official/r1/wide_deep/movielens_main.py
index 5c82b0d2edf..39e98ce03ce 100644
--- a/official/r1/wide_deep/movielens_main.py
+++ b/official/r1/wide_deep/movielens_main.py
@@ -69,7 +69,7 @@ def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op):
       model_dir=model_dir,
       feature_columns=deep_columns,
       hidden_units=hidden_units,
-      optimizer=tf.train.AdamOptimizer(),
+      optimizer=tf.compat.v1.train.AdamOptimizer(),
       activation_fn=tf.nn.sigmoid,
       dropout=0.3,
       loss_reduction=tf.losses.Reduction.MEAN)
diff --git a/official/recommendation/ncf_keras_main.py b/official/recommendation/ncf_keras_main.py
index c52eb9d2bc0..d5f13c33538 100644
--- a/official/recommendation/ncf_keras_main.py
+++ b/official/recommendation/ncf_keras_main.py
@@ -405,7 +405,7 @@ def step_fn(features):
       optimizer.apply_gradients(grads)
       return loss
 
-    per_replica_losses = strategy.experimental_run_v2(
+    per_replica_losses = strategy.run(
         step_fn, args=(next(train_iterator),))
     mean_loss = strategy.reduce(
         tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
@@ -425,7 +425,7 @@ def step_fn(features):
       return hr_sum, hr_count
 
     per_replica_hr_sum, per_replica_hr_count = (
-        strategy.experimental_run_v2(
+        strategy.run(
             step_fn, args=(next(eval_iterator),)))
     hr_sum = strategy.reduce(
         tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
diff --git a/official/requirements.txt b/official/requirements.txt
index 8030b730ecc..3e30f54f30a 100644
--- a/official/requirements.txt
+++ b/official/requirements.txt
@@ -1,3 +1,4 @@
+six
 google-api-python-client>=1.6.7
 google-cloud-bigquery>=0.31.0
 kaggle>=1.3.9
diff --git a/official/staging/training/controller.py b/official/staging/training/controller.py
index 134126e02bc..939aa09d262 100644
--- a/official/staging/training/controller.py
+++ b/official/staging/training/controller.py
@@ -78,9 +78,10 @@ def __init__(
       eval_summary_dir: The directory to write eval summaries. If None, it will
         be set to `summary_dir`.
       eval_steps: Number of steps to run evaluation.
-      eval_interval: Step interval for evaluation. If None, will skip
-        evaluation. Note that evaluation only happens outside the training loop,
-        which the loop iteration is specify by `steps_per_loop` parameter.
+      eval_interval: Step interval for evaluation. If None, will skip evaluation
+        in the middle of training. Note that evaluation only happens outside the
+        training loop, whose iteration count is specified by the
+        `steps_per_loop` parameter.
 
     Raises:
       ValueError: If both `train_fn` and `eval_fn` are None.
@@ -111,35 +112,41 @@ def __init__(
     self.train_fn = train_fn
     self.eval_fn = eval_fn
     self.global_step = global_step
-
-    self.train_steps = train_steps
-
-    self.steps_per_loop = steps_per_loop
-
-    self.summary_dir = summary_dir or checkpoint_manager.directory
     self.checkpoint_manager = checkpoint_manager
 
-    self.summary_interval = summary_interval
-    summary_writer = tf.summary.create_file_writer(
-        self.summary_dir) if self.summary_interval else None
-    # TODO(rxsang): Consider pass SummaryManager directly into Controller for
-    # maximum customizability.
-    self.summary_manager = utils.SummaryManager(
-        summary_writer,
-        tf.summary.scalar,
-        global_step=self.global_step,
-        summary_interval=self.summary_interval)
+    if self.train_fn is not None:
+      self.train_steps = train_steps
+      self.steps_per_loop = steps_per_loop
+      self.summary_dir = summary_dir or checkpoint_manager.directory
+
+      self.summary_interval = summary_interval
+      summary_writer = tf.summary.create_file_writer(
+          self.summary_dir) if self.summary_interval else None
+      # TODO(rxsang): Consider pass SummaryManager directly into Controller for
+      # maximum customizability.
+      self.summary_manager = utils.SummaryManager(
+          summary_writer,
+          tf.summary.scalar,
+          global_step=self.global_step,
+          summary_interval=self.summary_interval)
+
+    if self.eval_fn is not None:
+      eval_summary_dir = eval_summary_dir or self.summary_dir
+      eval_summary_writer = tf.summary.create_file_writer(
+          eval_summary_dir) if eval_summary_dir else None
+      self.eval_summary_manager = utils.SummaryManager(
+          eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
+
+      self.eval_steps = eval_steps
+      self.eval_interval = eval_interval
+
+      # Create and initialize the interval triggers.
+      self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
+                                                self.global_step.numpy())
+
     if self.global_step:
       tf.summary.experimental.set_step(self.global_step)
 
-    self.eval_summary_dir = eval_summary_dir or self.summary_dir
-    eval_summary_writer = tf.summary.create_file_writer(self.eval_summary_dir)
-    self.eval_summary_manager = utils.SummaryManager(
-        eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
-
-    self.eval_steps = eval_steps
-    self.eval_interval = eval_interval
-
     # Restore Model if needed.
     if self.checkpoint_manager is not None:
       model_restored = self._restore_model()
@@ -150,10 +157,6 @@ def __init__(
             checkpoint_number=self.global_step)
         logging.info("Saved checkpoins in %s", ckpt_path)
 
-    # Create and initialize the interval triggers.
-    self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
-                                              self.global_step.numpy())
-
   def _restore_model(self, checkpoint_path=None):
     """Restore or initialize the model.
 
@@ -186,11 +189,12 @@ def _evaluate_once(self, current_step):
     self._log_info(info)
 
     self.eval_summary_manager.write_summaries(eval_outputs)
+    self.eval_summary_manager.flush()
 
   def _maybe_save_checkpoints(self, current_step, force_trigger=False):
     if self.checkpoint_manager.checkpoint_interval:
       ckpt_path = self.checkpoint_manager.save(
-          checkpoint_number=current_step, check_interval=force_trigger)
+          checkpoint_number=current_step, check_interval=not force_trigger)
       if ckpt_path is not None:
         logging.info("Saved checkpoins in %s", ckpt_path)
 
@@ -265,6 +269,7 @@ def train(self, evaluate=True):
         self._maybe_evaluate(current_step)
 
     self.summary_manager.write_summaries(train_outputs, always_write=True)
+    self.summary_manager.flush()
     self._maybe_save_checkpoints(current_step, force_trigger=True)
     if evaluate:
       self._maybe_evaluate(current_step, force_trigger=True)
diff --git a/official/staging/training/controller_test.py b/official/staging/training/controller_test.py
new file mode 100644
index 00000000000..d7a7282fc99
--- /dev/null
+++ b/official/staging/training/controller_test.py
@@ -0,0 +1,262 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.staging.training.controller."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.staging.training import controller
+from official.staging.training import standard_runnable
+
+
+def all_strategy_combinations():
+  """Gets combinations of distribution strategies."""
+  return combinations.combine(
+      strategy=[
+          strategy_combinations.one_device_strategy,
+          strategy_combinations.tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+      ],
+      mode="eager",
+  )
+
+
+def create_model():
+  x = tf.keras.layers.Input(shape=(3,), name="input")
+  y = tf.keras.layers.Dense(4, name="dense")(x)
+  model = tf.keras.Model(x, y)
+  return model
+
+
+def summaries_with_matching_keyword(keyword, summary_dir):
+  """Yields summary protos matching given keyword from event file."""
+  event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*"))
+  for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
+    if event.summary is not None:
+      for value in event.summary.value:
+        if keyword in value.tag:
+          tf.compat.v1.logging.error(event)
+          yield event.summary
+
+
+def check_eventfile_for_keyword(keyword, summary_dir):
+  """Checks event files for the keyword."""
+  return any(summaries_with_matching_keyword(keyword, summary_dir))
+
+
+def dataset_fn(ctx):
+  del ctx
+  inputs = np.zeros((10, 3), dtype=np.float32)
+  targets = np.zeros((10, 4), dtype=np.float32)
+  dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
+  dataset = dataset.repeat(100)
+  dataset = dataset.batch(10, drop_remainder=True)
+  return dataset
+
+
+class TestRunnable(standard_runnable.StandardTrainable,
+                   standard_runnable.StandardEvaluable):
+  """Implements the training and evaluation APIs for the test model."""
+
+  def __init__(self):
+    standard_runnable.StandardTrainable.__init__(self)
+    standard_runnable.StandardEvaluable.__init__(self)
+    self.strategy = tf.distribute.get_strategy()
+    self.model = create_model()
+    self.optimizer = tf.keras.optimizers.RMSprop()
+    self.global_step = self.optimizer.iterations
+    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
+    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
+
+  def build_train_dataset(self):
+    return self.strategy.experimental_distribute_datasets_from_function(
+        dataset_fn)
+
+  def train_step(self, iterator):
+
+    def _replicated_step(inputs):
+      """Replicated training step."""
+      inputs, targets = inputs
+      with tf.GradientTape() as tape:
+        outputs = self.model(inputs)
+        loss = tf.math.reduce_sum(outputs - targets)
+      grads = tape.gradient(loss, self.model.variables)
+      self.optimizer.apply_gradients(zip(grads, self.model.variables))
+      self.train_loss.update_state(loss)
+
+    self.strategy.run(_replicated_step, args=(next(iterator),))
+
+  def train_loop_end(self):
+    return {
+        "loss": self.train_loss.result(),
+    }
+
+  def build_eval_dataset(self):
+    return self.strategy.experimental_distribute_datasets_from_function(
+        dataset_fn)
+
+  def eval_begin(self):
+    self.eval_loss.reset_states()
+
+  def eval_step(self, iterator):
+
+    def _replicated_step(inputs):
+      """Replicated evaluation step."""
+      inputs, targets = inputs
+      outputs = self.model(inputs)
+      loss = tf.math.reduce_sum(outputs - targets)
+      self.eval_loss.update_state(loss)
+
+    self.strategy.run(_replicated_step, args=(next(iterator),))
+
+  def eval_end(self):
+    return {
+        "eval_loss": self.eval_loss.result(),
+    }
+
+
+class ControllerTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(ControllerTest, self).setUp()
+    self.model_dir = self.get_temp_dir()
+
+  @combinations.generate(all_strategy_combinations())
+  def test_train_and_evaluate(self, strategy):
+    with strategy.scope():
+      test_runnable = TestRunnable()
+
+    checkpoint = tf.train.Checkpoint(
+        model=test_runnable.model, optimizer=test_runnable.optimizer)
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint,
+        self.model_dir,
+        max_to_keep=None,
+        step_counter=test_runnable.global_step,
+        checkpoint_interval=10)
+    test_controller = controller.Controller(
+        strategy=strategy,
+        train_fn=test_runnable.train,
+        eval_fn=test_runnable.evaluate,
+        global_step=test_runnable.global_step,
+        train_steps=10,
+        steps_per_loop=2,
+        summary_dir=os.path.join(self.model_dir, "summaries/train"),
+        summary_interval=2,
+        checkpoint_manager=checkpoint_manager,
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
+        eval_steps=2,
+        eval_interval=5)
+    test_controller.train(evaluate=True)
+
+    # Checkpoints are saved.
+    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
+
+    # Train and eval loss values should be written into summaries.
+    self.assertNotEmpty(
+        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
+    self.assertTrue(
+        check_eventfile_for_keyword(
+            "loss", os.path.join(self.model_dir, "summaries/train")))
+    self.assertNotEmpty(
+        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
+    self.assertTrue(
+        check_eventfile_for_keyword(
+            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
+
+  @combinations.generate(all_strategy_combinations())
+  def test_train_only(self, strategy):
+    with strategy.scope():
+      test_runnable = TestRunnable()
+
+    checkpoint = tf.train.Checkpoint(
+        model=test_runnable.model, optimizer=test_runnable.optimizer)
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint,
+        self.model_dir,
+        max_to_keep=None,
+        step_counter=test_runnable.global_step,
+        checkpoint_interval=10)
+    test_controller = controller.Controller(
+        strategy=strategy,
+        train_fn=test_runnable.train,
+        global_step=test_runnable.global_step,
+        train_steps=10,
+        steps_per_loop=2,
+        summary_dir=os.path.join(self.model_dir, "summaries/train"),
+        summary_interval=2,
+        checkpoint_manager=checkpoint_manager,
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
+    )
+    test_controller.train(evaluate=False)
+
+    # Checkpoints are saved.
+    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
+
+    # Only train summaries are written.
+    self.assertNotEmpty(
+        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
+    self.assertTrue(
+        check_eventfile_for_keyword(
+            "loss", os.path.join(self.model_dir, "summaries/train")))
+    self.assertFalse(
+        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval")))
+
+  @combinations.generate(all_strategy_combinations())
+  def test_evaluate_only(self, strategy):
+    with strategy.scope():
+      test_runnable = TestRunnable()
+
+    checkpoint = tf.train.Checkpoint(model=test_runnable.model)
+    checkpoint.save(os.path.join(self.model_dir, "ckpt"))
+
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint,
+        self.model_dir,
+        max_to_keep=None,
+        step_counter=test_runnable.global_step)
+    test_controller = controller.Controller(
+        strategy=strategy,
+        eval_fn=test_runnable.evaluate,
+        global_step=test_runnable.global_step,
+        checkpoint_manager=checkpoint_manager,
+        summary_dir=os.path.join(self.model_dir, "summaries/train"),
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
+        eval_steps=2,
+        eval_interval=5)
+    test_controller.evaluate()
+
+    # Only eval summaries are written
+    self.assertFalse(
+        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/train")))
+    self.assertNotEmpty(
+        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
+    self.assertTrue(
+        check_eventfile_for_keyword(
+            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/official/staging/training/runnable.py b/official/staging/training/runnable.py
index cc82466de31..1af6eca06a3 100644
--- a/official/staging/training/runnable.py
+++ b/official/staging/training/runnable.py
@@ -39,7 +39,7 @@ def train(self,
     python callbacks. This is necessary for getting good performance in TPU
     training, as the overhead for launching a multi worker tf.function may be
     large in Eager mode. It is usually encouraged to create a host training loop
-    (e.g. using a `tf.range` wrapping `strategy.experimental_run_v2` inside a
+    (e.g. using a `tf.range` wrapping `strategy.run` inside a
     `tf.function`) in the TPU case. For the cases that don't require host
     training loop to acheive peak performance, users can just implement a simple
     python loop to drive each step.
diff --git a/official/staging/training/standard_runnable.py b/official/staging/training/standard_runnable.py
index 806fc1ba0a1..e30be9fe3fe 100644
--- a/official/staging/training/standard_runnable.py
+++ b/official/staging/training/standard_runnable.py
@@ -87,7 +87,7 @@ def train_step(self, iterator):
     What a "step" consists of is up to the implementer. If using distribution
     strategies, the call to this method should take place in the "cross-replica
     context" for generality, to allow e.g. multiple iterator dequeues and calls
-    to `strategy.experimental_run_v2`.
+    to `strategy.run`.
 
     Args:
       iterator: A tf.nest-compatible structure of tf.data Iterator or
@@ -163,7 +163,7 @@ def eval_step(self, iterator):
     What a "step" consists of is up to the implementer. If using distribution
     strategies, the call to this method should take place in the "cross-replica
     context" for generality, to allow e.g. multiple iterator dequeues and calls
-    to `strategy.experimental_run_v2`.
+    to `strategy.run`.
 
     Args:
       iterator: A tf.nest-compatible structure of tf.data Iterator or
diff --git a/official/staging/training/utils.py b/official/staging/training/utils.py
index 15708c78170..33fa368b7b9 100644
--- a/official/staging/training/utils.py
+++ b/official/staging/training/utils.py
@@ -193,6 +193,11 @@ def summary_writer(self):
     """Returns the underlying summary writer."""
     return self._summary_writer
 
+  def flush(self):
+    """Flush the underlying summary writer."""
+    if self._enabled:
+      tf.summary.flush(self._summary_writer)
+
   def write_summaries(self, items, always_write=True):
     """Write a bulk of summaries.
 
diff --git a/official/utils/testing/perfzero_benchmark.py b/official/utils/testing/perfzero_benchmark.py
index 546e6efb507..3a9b3ff827c 100644
--- a/official/utils/testing/perfzero_benchmark.py
+++ b/official/utils/testing/perfzero_benchmark.py
@@ -48,15 +48,26 @@ def __init__(self,
       flag_methods: Set of flag methods to run during setup.
       tpu: (optional) TPU name to use in a TPU benchmark.
     """
-    if not output_dir:
-      output_dir = '/tmp'
-    self.output_dir = output_dir
+    if os.getenv('BENCHMARK_OUTPUT_DIR'):
+      self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR')
+    elif output_dir:
+      self.output_dir = output_dir
+    else:
+      self.output_dir = '/tmp'
     self.default_flags = default_flags or {}
     self.flag_methods = flag_methods or {}
-    if tpu:
+
+    if os.getenv('BENCHMARK_TPU'):
+      resolved_tpu = os.getenv('BENCHMARK_TPU')
+    elif tpu:
+      resolved_tpu = tpu
+    else:
+      resolved_tpu = None
+
+    if resolved_tpu:
       # TPU models are expected to accept a --tpu=name flag. PerfZero creates
       # the TPU at runtime and passes the TPU's name to this flag.
-      self.default_flags['tpu'] = tpu
+      self.default_flags['tpu'] = resolved_tpu
 
   def _get_model_dir(self, folder_name):
     """Returns directory to store info, e.g. saved model and event log."""
diff --git a/official/vision/detection/executor/detection_executor.py b/official/vision/detection/executor/detection_executor.py
index 49daa6a34f2..799a239bcd9 100644
--- a/official/vision/detection/executor/detection_executor.py
+++ b/official/vision/detection/executor/detection_executor.py
@@ -80,12 +80,11 @@ def _replicated_step(inputs):
         all_losses = loss_fn(labels, outputs)
         losses = {}
         for k, v in all_losses.items():
-          v = tf.reduce_mean(v) / strategy.num_replicas_in_sync
-          losses[k] = v
-        loss = losses['total_loss']
+          losses[k] = tf.reduce_mean(v)
+        per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
         _update_state(labels, outputs)
 
-      grads = tape.gradient(loss, trainable_variables)
+      grads = tape.gradient(per_replica_loss, trainable_variables)
       optimizer.apply_gradients(zip(grads, trainable_variables))
       return losses
 
@@ -119,7 +118,7 @@ def _test_step_fn(inputs, eval_steps):
 
         return labels, prediction_outputs
 
-      labels, outputs = strategy.experimental_run_v2(
+      labels, outputs = strategy.run(
           _test_step_fn, args=(
               next(iterator),
               eval_steps,
diff --git a/official/vision/detection/modeling/base_model.py b/official/vision/detection/modeling/base_model.py
index 58eb28e370c..65778a40bf1 100644
--- a/official/vision/detection/modeling/base_model.py
+++ b/official/vision/detection/modeling/base_model.py
@@ -21,8 +21,6 @@
 import abc
 import functools
 import re
-from absl import logging
-
 import tensorflow.compat.v2 as tf
 from official.vision.detection.modeling import checkpoint_utils
 from official.vision.detection.modeling import learning_rates
@@ -60,11 +58,10 @@ def __call__(self, learning_rate):
 
 
 def _make_filter_trainable_variables_fn(frozen_variable_prefix):
-  """Creates a function for filtering trainable varialbes.
-  """
+  """Creates a function for filtering trainable variables."""
 
   def _filter_trainable_variables(variables):
-    """Filters trainable varialbes
+    """Filters trainable variables.
 
     Args:
       variables: a list of tf.Variable to be filtered.
@@ -141,8 +138,7 @@ def build_optimizer(self):
     return self._optimizer_fn(self._learning_rate)
 
   def make_filter_trainable_variables_fn(self):
-    """Creates a function for filtering trainable varialbes.
-    """
+    """Creates a function for filtering trainable variables."""
     return _make_filter_trainable_variables_fn(self._frozen_variable_prefix)
 
   def weight_decay_loss(self, trainable_variables):
@@ -151,8 +147,6 @@ def weight_decay_loss(self, trainable_variables):
         if self._regularization_var_regex is None
         or re.match(self._regularization_var_regex, v.name)
     ]
-    logging.info('Regularization Variables: %s',
-                 [v.name for v in reg_variables])
 
     return self._l2_weight_decay * tf.add_n(
         [tf.nn.l2_loss(v) for v in reg_variables])
diff --git a/official/vision/detection/modeling/factory.py b/official/vision/detection/modeling/factory.py
index 25c5d432460..bc7c33fc44a 100644
--- a/official/vision/detection/modeling/factory.py
+++ b/official/vision/detection/modeling/factory.py
@@ -15,6 +15,7 @@
 """Factory to build detection model."""
 
 
+from official.vision.detection.modeling import maskrcnn_model
 from official.vision.detection.modeling import retinanet_model
 
 
@@ -22,6 +23,8 @@ def model_generator(params):
   """Model function generator."""
   if params.type == 'retinanet':
     model_fn = retinanet_model.RetinanetModel(params)
+  elif params.type == 'mask_rcnn':
+    model_fn = maskrcnn_model.MaskrcnnModel(params)
   else:
     raise ValueError('Model %s is not supported.'% params.type)
 
diff --git a/official/vision/detection/modeling/losses.py b/official/vision/detection/modeling/losses.py
index 9e36dd5fc9f..757334950c1 100644
--- a/official/vision/detection/modeling/losses.py
+++ b/official/vision/detection/modeling/losses.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from absl import logging
 import tensorflow.compat.v2 as tf
 
 
@@ -76,7 +77,7 @@ def focal_loss(logits, targets, alpha, gamma, normalizer):
     #      (1 - p_t)^r = exp(-r * z * x - r * log(1 + exp(-x))).
     neg_logits = -1.0 * logits
     modulator = tf.math.exp(gamma * targets * neg_logits -
-                       gamma * tf.math.log1p(tf.math.exp(neg_logits)))
+                            gamma * tf.math.log1p(tf.math.exp(neg_logits)))
     loss = modulator * cross_entropy
     weighted_loss = tf.where(positive_label_mask, alpha * loss,
                              (1.0 - alpha) * loss)
@@ -89,6 +90,8 @@ class RpnScoreLoss(object):
 
   def __init__(self, params):
     self._rpn_batch_size_per_im = params.rpn_batch_size_per_im
+    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
 
   def __call__(self, score_outputs, labels):
     """Computes total RPN detection loss.
@@ -128,17 +131,16 @@ def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
     # (3) score_targets[i]=-1, the anchor is don't care (ignore).
     with tf.name_scope('rpn_score_loss'):
       mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
-                           tf.math.equal(score_targets, 0))
-      score_targets = tf.math.maximum(score_targets, tf.zeros_like(score_targets))
-      # RPN score loss is sum over all except ignored samples.
-      # Keep the compat.v1 loss because Keras does not have a
-      # sigmoid_cross_entropy substitution yet.
-      # TODO(b/143720144): replace this loss.
-      score_loss = tf.compat.v1.losses.sigmoid_cross_entropy(
-          score_targets,
-          score_outputs,
-          weights=mask,
-          reduction=tf.compat.v1.losses.Reduction.SUM)
+                                tf.math.equal(score_targets, 0))
+
+      score_targets = tf.math.maximum(score_targets,
+                                      tf.zeros_like(score_targets))
+
+      score_targets = tf.expand_dims(score_targets, axis=-1)
+      score_outputs = tf.expand_dims(score_outputs, axis=-1)
+      score_loss = self._binary_crossentropy(
+          score_targets, score_outputs, sample_weight=mask)
+
       score_loss /= normalizer
       return score_loss
 
@@ -147,7 +149,10 @@ class RpnBoxLoss(object):
   """Region Proposal Network box regression loss function."""
 
   def __init__(self, params):
-    self._delta = params.huber_loss_delta
+    logging.info('RpnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
+    # The delta is typically around the mean value of regression target.
+    # For instance, the regression targets of 512x512 input with 6 anchors on
+    # P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
     self._huber_loss = tf.keras.losses.Huber(
         delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
 
@@ -171,35 +176,32 @@ def __call__(self, box_outputs, labels):
 
       box_losses = []
       for level in levels:
-        box_losses.append(
-            self._rpn_box_loss(
-                box_outputs[level], labels[level], delta=self._delta))
+        box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
 
       # Sum per level losses to total loss.
       return tf.add_n(box_losses)
 
-  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0, delta=1./9):
+  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
     """Computes box regression loss."""
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
     with tf.name_scope('rpn_box_loss'):
-      mask = tf.math.not_equal(box_targets, 0.0)
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      box_loss = tf.compat.v1.losses.huber_loss(
-          box_targets,
-          box_outputs,
-          weights=mask,
-          delta=delta,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-      box_loss /= normalizer
+      mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
+      box_targets = tf.expand_dims(box_targets, axis=-1)
+      box_outputs = tf.expand_dims(box_outputs, axis=-1)
+      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
+      # The loss is normalized by the sum of non-zero weights and additional
+      # normalizer provided by the function caller. Using + 0.01 here to avoid
+      # division by zero.
+      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
       return box_loss
 
 
 class FastrcnnClassLoss(object):
   """Fast R-CNN classification loss function."""
 
+  def __init__(self):
+    self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
+
   def __call__(self, class_outputs, class_targets):
     """Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
 
@@ -218,24 +220,19 @@ def __call__(self, class_outputs, class_targets):
       a scalar tensor representing total class loss.
     """
     with tf.name_scope('fast_rcnn_loss'):
-      _, _, num_classes = class_outputs.get_shape().as_list()
+      batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
       class_targets = tf.cast(class_targets, dtype=tf.int32)
       class_targets_one_hot = tf.one_hot(class_targets, num_classes)
-      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot)
+      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
+                                        normalizer=batch_size * num_boxes / 2.0)
 
   def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
-                            normalizer=1.0):
+                            normalizer):
     """Computes classification loss."""
     with tf.name_scope('fast_rcnn_class_loss'):
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      # Keep the compat.v1 loss because Keras does not have a
-      # softmax_cross_entropy substitution yet.
-      # TODO(b/143720144): replace this loss.
-      class_loss = tf.compat.v1.losses.softmax_cross_entropy(
-          class_targets_one_hot,
-          class_outputs,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+      class_loss = self._categorical_crossentropy(class_targets_one_hot,
+                                                  class_outputs)
+
       class_loss /= normalizer
       return class_loss
 
@@ -244,7 +241,12 @@ class FastrcnnBoxLoss(object):
   """Fast R-CNN box regression loss function."""
 
   def __init__(self, params):
-    self._delta = params.huber_loss_delta
+    logging.info('FastrcnnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
+    # The delta is typically around the mean value of regression target.
+    # For instance, the regression targets of 512x512 input with 6 anchors on
+    # P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+    self._huber_loss = tf.keras.losses.Huber(
+        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
 
   def __call__(self, box_outputs, class_targets, box_targets):
     """Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
@@ -296,36 +298,32 @@ def __call__(self, box_outputs, class_targets, box_targets):
               dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
       box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
 
-      return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets,
-                                      delta=self._delta)
+      return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)
 
   def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
-                          normalizer=1.0, delta=1.):
+                          normalizer=1.0):
     """Computes box regression loss."""
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
     with tf.name_scope('fast_rcnn_box_loss'):
       mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
                      [1, 1, 4])
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      # Keep the compat.v1 loss because Keras does not have a
-      # Reduction.SUM_BY_NONZERO_WEIGHTS substitution yet.
-      # TODO(b/143720144): replace this loss.
-      box_loss = tf.compat.v1.losses.huber_loss(
-          box_targets,
-          box_outputs,
-          weights=mask,
-          delta=delta,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-      box_loss /= normalizer
+      mask = tf.cast(mask, dtype=tf.float32)
+      box_targets = tf.expand_dims(box_targets, axis=-1)
+      box_outputs = tf.expand_dims(box_outputs, axis=-1)
+      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
+      # The loss is normalized by the number of ones in mask and by the
+      # additional normalizer provided by the caller; 0.01 is added to avoid
+      # division by 0.
+      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
       return box_loss
 
 
 class MaskrcnnLoss(object):
   """Mask R-CNN instance segmentation mask loss function."""
 
+  def __init__(self):
+    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
+
   def __call__(self, mask_outputs, mask_targets, select_class_targets):
     """Computes the mask loss of Mask-RCNN.
 
@@ -358,11 +356,16 @@ def __call__(self, mask_outputs, mask_targets, select_class_targets):
           tf.reshape(tf.greater(select_class_targets, 0),
                      [batch_size, num_masks, 1, 1]),
           [1, 1, mask_height, mask_width])
-      return tf.compat.v1.losses.sigmoid_cross_entropy(
-          mask_targets,
-          mask_outputs,
-          weights=weights,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+      weights = tf.cast(weights, dtype=tf.float32)
+
+      mask_targets = tf.expand_dims(mask_targets, axis=-1)
+      mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
+      mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
+                                            sample_weight=weights)
+
+      # The loss is normalized by the number of 1's in weights and
+      # + 0.01 is used to avoid division by zero.
+      return mask_loss / (tf.reduce_sum(weights) + 0.01)
 
 
 class RetinanetClassLoss(object):
diff --git a/official/vision/detection/modeling/maskrcnn_model.py b/official/vision/detection/modeling/maskrcnn_model.py
new file mode 100644
index 00000000000..e196a31d1c7
--- /dev/null
+++ b/official/vision/detection/modeling/maskrcnn_model.py
@@ -0,0 +1,342 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model definition for the Mask R-CNN Model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.keras import backend
+from official.vision.detection.dataloader import anchor
+from official.vision.detection.dataloader import mode_keys
+from official.vision.detection.evaluation import factory as eval_factory
+from official.vision.detection.modeling import base_model
+from official.vision.detection.modeling import losses
+from official.vision.detection.modeling.architecture import factory
+from official.vision.detection.ops import postprocess_ops
+from official.vision.detection.ops import roi_ops
+from official.vision.detection.ops import sampling_ops
+from official.vision.detection.ops import spatial_transform_ops
+from official.vision.detection.utils import box_utils
+
+
+class MaskrcnnModel(base_model.Model):
+  """Mask R-CNN model function."""
+
+  def __init__(self, params):
+    super(MaskrcnnModel, self).__init__(params)
+
+    # For eval metrics.
+    self._params = params
+    self._keras_model = None
+
+    self._include_mask = params.architecture.include_mask
+
+    # Architecture generators.
+    self._backbone_fn = factory.backbone_generator(params)
+    self._fpn_fn = factory.multilevel_features_generator(params)
+    self._rpn_head_fn = factory.rpn_head_generator(params.rpn_head)
+    self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
+    self._sample_rois_fn = sampling_ops.ROISampler(params.roi_sampling)
+    self._sample_masks_fn = sampling_ops.MaskSampler(params.mask_sampling)
+
+    self._frcnn_head_fn = factory.fast_rcnn_head_generator(params.frcnn_head)
+    if self._include_mask:
+      self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params.mrcnn_head)
+
+    # Loss function.
+    self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
+    self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
+    self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
+    self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
+    if self._include_mask:
+      self._mask_loss_fn = losses.MaskrcnnLoss()
+
+    self._generate_detections_fn = postprocess_ops.GenericDetectionGenerator(
+        params.postprocess)
+
+    self._transpose_input = params.train.transpose_input
+    assert not self._transpose_input, 'Transpose input is not supportted.'
+
+  def build_outputs(self, inputs, mode):
+    is_training = mode == mode_keys.TRAIN
+    model_outputs = {}
+
+    image = inputs['image']
+    _, image_height, image_width, _ = image.get_shape().as_list()
+    backbone_features = self._backbone_fn(image, is_training)
+    fpn_features = self._fpn_fn(backbone_features, is_training)
+
+    rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
+        fpn_features, is_training)
+    model_outputs.update({
+        'rpn_score_outputs':
+            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
+                                  rpn_score_outputs),
+        'rpn_box_outputs':
+            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
+                                  rpn_box_outputs),
+    })
+    input_anchor = anchor.Anchor(self._params.anchor.min_level,
+                                 self._params.anchor.max_level,
+                                 self._params.anchor.num_scales,
+                                 self._params.anchor.aspect_ratios,
+                                 self._params.anchor.anchor_size,
+                                 (image_height, image_width))
+    rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
+                                         input_anchor.multilevel_boxes,
+                                         inputs['image_info'][:, 1, :],
+                                         is_training)
+    if is_training:
+      rpn_rois = tf.stop_gradient(rpn_rois)
+
+      # Sample proposals.
+      rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
+          self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
+                               inputs['gt_classes']))
+
+      # Create bounding box training targets.
+      box_targets = box_utils.encode_boxes(
+          matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
+      # If the target is background, the box target is set to all 0s.
+      box_targets = tf.where(
+          tf.tile(
+              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
+              [1, 1, 4]),
+          tf.zeros_like(box_targets),
+          box_targets)
+      model_outputs.update({
+          'class_targets': matched_gt_classes,
+          'box_targets': box_targets,
+      })
+
+    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        fpn_features, rpn_rois, output_size=7)
+
+    class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)
+
+    model_outputs.update({
+        'class_outputs':
+            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
+                                  class_outputs),
+        'box_outputs':
+            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
+                                  box_outputs),
+    })
+
+    # Add this output to train to make the checkpoint loadable in predict mode.
+    # If we skip it in train mode, the heads will be out-of-order and checkpoint
+    # loading will fail.
+    boxes, scores, classes, valid_detections = self._generate_detections_fn(
+        box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])
+    model_outputs.update({
+        'num_detections': valid_detections,
+        'detection_boxes': boxes,
+        'detection_classes': classes,
+        'detection_scores': scores,
+    })
+
+    if not self._include_mask:
+      return model_outputs
+
+    if is_training:
+      rpn_rois, classes, mask_targets = self._sample_masks_fn(
+          rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
+          inputs['gt_masks'])
+      mask_targets = tf.stop_gradient(mask_targets)
+
+      classes = tf.cast(classes, dtype=tf.int32)
+
+      model_outputs.update({
+          'mask_targets': mask_targets,
+          'sampled_class_targets': classes,
+      })
+    else:
+      rpn_rois = boxes
+      classes = tf.cast(classes, dtype=tf.int32)
+
+    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+        fpn_features, rpn_rois, output_size=14)
+
+    mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)
+
+    if is_training:
+      model_outputs.update({
+          'mask_outputs':
+              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
+                                    mask_outputs),
+      })
+    else:
+      model_outputs.update({
+          'detection_masks': tf.nn.sigmoid(mask_outputs)
+      })
+
+    return model_outputs
+
+  def build_loss_fn(self):
+    if self._keras_model is None:
+      raise ValueError('build_loss_fn() must be called after build_model().')
+
+    filter_fn = self.make_filter_trainable_variables_fn()
+    trainable_variables = filter_fn(self._keras_model.trainable_variables)
+
+    def _total_loss_fn(labels, outputs):
+      rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
+                                               labels['rpn_score_targets'])
+      rpn_box_loss = self._rpn_box_loss_fn(outputs['rpn_box_outputs'],
+                                           labels['rpn_box_targets'])
+
+      frcnn_class_loss = self._frcnn_class_loss_fn(outputs['class_outputs'],
+                                                   outputs['class_targets'])
+      frcnn_box_loss = self._frcnn_box_loss_fn(outputs['box_outputs'],
+                                               outputs['class_targets'],
+                                               outputs['box_targets'])
+
+      if self._include_mask:
+        mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
+                                       outputs['mask_targets'],
+                                       outputs['sampled_class_targets'])
+      else:
+        mask_loss = 0.0
+
+      model_loss = (
+          rpn_score_loss + rpn_box_loss + frcnn_class_loss + frcnn_box_loss +
+          mask_loss)
+
+      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
+      total_loss = model_loss + l2_regularization_loss
+      return {
+          'total_loss': total_loss,
+          'loss': total_loss,
+          'fast_rcnn_class_loss': frcnn_class_loss,
+          'fast_rcnn_box_loss': frcnn_box_loss,
+          'mask_loss': mask_loss,
+          'model_loss': model_loss,
+          'l2_regularization_loss': l2_regularization_loss,
+          'rpn_score_loss': rpn_score_loss,
+          'rpn_box_loss': rpn_box_loss,
+      }
+
+    return _total_loss_fn
+
+  def build_input_layers(self, params, mode):
+    is_training = mode == mode_keys.TRAIN
+    input_shape = (
+        params.maskrcnn_parser.output_size +
+        [params.maskrcnn_parser.num_channels])
+    if is_training:
+      batch_size = params.train.batch_size
+      input_layer = {
+          'image':
+              tf.keras.layers.Input(
+                  shape=input_shape,
+                  batch_size=batch_size,
+                  name='image',
+                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
+          'image_info':
+              tf.keras.layers.Input(
+                  shape=[4, 2],
+                  batch_size=batch_size,
+                  name='image_info',
+              ),
+          'gt_boxes':
+              tf.keras.layers.Input(
+                  shape=[params.maskrcnn_parser.max_num_instances, 4],
+                  batch_size=batch_size,
+                  name='gt_boxes'),
+          'gt_classes':
+              tf.keras.layers.Input(
+                  shape=[params.maskrcnn_parser.max_num_instances],
+                  batch_size=batch_size,
+                  name='gt_classes',
+                  dtype=tf.int64),
+      }
+      if self._include_mask:
+        input_layer['gt_masks'] = tf.keras.layers.Input(
+            shape=[
+                params.maskrcnn_parser.max_num_instances,
+                params.maskrcnn_parser.mask_crop_size,
+                params.maskrcnn_parser.mask_crop_size
+            ],
+            batch_size=batch_size,
+            name='gt_masks')
+    else:
+      batch_size = params.eval.batch_size
+      input_layer = {
+          'image':
+              tf.keras.layers.Input(
+                  shape=input_shape,
+                  batch_size=batch_size,
+                  name='image',
+                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
+          'image_info':
+              tf.keras.layers.Input(
+                  shape=[4, 2],
+                  batch_size=batch_size,
+                  name='image_info',
+              ),
+      }
+    return input_layer
+
+  def build_model(self, params, mode):
+    if self._keras_model is None:
+      input_layers = self.build_input_layers(self._params, mode)
+      with backend.get_graph().as_default():
+        outputs = self.model_outputs(input_layers, mode)
+
+        model = tf.keras.models.Model(
+            inputs=input_layers, outputs=outputs, name='maskrcnn')
+        assert model is not None, 'Fail to build tf.keras.Model.'
+        model.optimizer = self.build_optimizer()
+        self._keras_model = model
+
+    return self._keras_model
+
+  def post_processing(self, labels, outputs):
+    required_output_fields = ['class_outputs', 'box_outputs']
+    for field in required_output_fields:
+      if field not in outputs:
+        raise ValueError('"%s" is missing in outputs, requried %s found %s'
+                         %(field, required_output_fields, outputs.keys()))
+    predictions = {
+        'image_info': labels['image_info'],
+        'num_detections': outputs['num_detections'],
+        'detection_boxes': outputs['detection_boxes'],
+        'detection_classes': outputs['detection_classes'],
+        'detection_scores': outputs['detection_scores'],
+    }
+    if self._include_mask:
+      predictions.update({
+          'detection_masks': outputs['detection_masks'],
+      })
+
+    if 'groundtruths' in labels:
+      predictions['source_id'] = labels['groundtruths']['source_id']
+      predictions['gt_source_id'] = labels['groundtruths']['source_id']
+      predictions['gt_height'] = labels['groundtruths']['height']
+      predictions['gt_width'] = labels['groundtruths']['width']
+      predictions['gt_image_info'] = labels['image_info']
+      predictions['gt_num_detections'] = (
+          labels['groundtruths']['num_detections'])
+      predictions['gt_boxes'] = labels['groundtruths']['boxes']
+      predictions['gt_classes'] = labels['groundtruths']['classes']
+      predictions['gt_areas'] = labels['groundtruths']['areas']
+      predictions['gt_is_crowds'] = labels['groundtruths']['is_crowds']
+    return labels, predictions
+
+  def eval_metrics(self):
+    return eval_factory.evaluator_generator(self._params.eval)
diff --git a/official/vision/detection/utils/object_detection/visualization_utils.py b/official/vision/detection/utils/object_detection/visualization_utils.py
index 3ffa3dfb600..6f7b8ad9011 100644
--- a/official/vision/detection/utils/object_detection/visualization_utils.py
+++ b/official/vision/detection/utils/object_detection/visualization_utils.py
@@ -21,6 +21,7 @@
 """
 import collections
 import functools
+from absl import logging
 # Set headless-friendly backend.
 import matplotlib; matplotlib.use('Agg')  # pylint: disable=multiple-statements
 import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
@@ -97,6 +98,12 @@ def encode_image_array_as_png_str(image):
 def visualize_images_with_bounding_boxes(images, box_outputs, step,
                                          summary_writer):
   """Records subset of evaluation images with bounding boxes."""
+  if not isinstance(images, list):
+    logging.warning('visualize_images_with_bounding_boxes expects list of '
+                    'images but received type: %s and value: %s',
+                    type(images), images)
+    return
+
   image_shape = tf.shape(images[0])
   image_height = tf.cast(image_shape[0], tf.float32)
   image_width = tf.cast(image_shape[1], tf.float32)
diff --git a/official/vision/image_classification/README.md b/official/vision/image_classification/README.md
index f6889de2bba..8b4e2d13422 100644
--- a/official/vision/image_classification/README.md
+++ b/official/vision/image_classification/README.md
@@ -13,7 +13,7 @@ For more information about other types of models, please refer to this
 Similar to the [estimator implementation](../../r1/resnet), the Keras
 implementation has code for the ImageNet dataset. The ImageNet
 version uses a ResNet50 model implemented in
-[`resnet_model.py`](./resnet_model.py).
+[`resnet_model.py`](./resnet/resnet_model.py).
 
 Please make sure that you have the latest version of TensorFlow
 installed and
@@ -36,14 +36,14 @@ provide a few options.
 Once your dataset is ready, you can begin training the model as follows:
 
 ```bash
-python resnet_imagenet_main.py
+python resnet/resnet_imagenet_main.py
 ```
 
 Again, if you did not download the data to the default directory, specify the
 location with the `--data_dir` flag:
 
 ```bash
-python resnet_imagenet_main.py --data_dir=/path/to/imagenet
+python resnet/resnet_imagenet_main.py --data_dir=/path/to/imagenet
 ```
 
 There are more flag options you can specify. Here are some examples:
@@ -62,7 +62,7 @@ For example, this is a typical command line to run with ImageNet data with
 batch size 128 per GPU:
 
 ```bash
-python -m resnet_imagenet_main \
+python resnet/resnet_imagenet_main.py \
     --model_dir=/tmp/model_dir/something \
     --num_gpus=2 \
     --batch_size=128 \
@@ -120,7 +120,7 @@ From a GCE VM, you can run the following command to train ResNet for one epoch
 on a v2-8 or v3-8 TPU:
 
 ```bash
-python resnet_ctl_imagenet_main.py \
+python resnet/resnet_ctl_imagenet_main.py \
   --tpu=$TPU_NAME \
   --model_dir=$MODEL_DIR \
   --data_dir=$DATA_DIR \
@@ -140,7 +140,7 @@ python resnet_ctl_imagenet_main.py \
 To train the ResNet to convergence, run it for 90 epochs:
 
 ```bash
-python resnet_ctl_imagenet_main.py \
+python resnet/resnet_ctl_imagenet_main.py \
   --tpu=$TPU_NAME \
   --model_dir=$MODEL_DIR \
   --data_dir=$DATA_DIR \
diff --git a/official/vision/image_classification/common_test.py b/official/vision/image_classification/common_test.py
deleted file mode 100644
index a6a967e8aa6..00000000000
--- a/official/vision/image_classification/common_test.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the common module."""
-from __future__ import absolute_import
-from __future__ import print_function
-
-# pylint: disable=g-bad-import-order
-from mock import Mock
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.python.platform import googletest
-from official.utils.misc import keras_utils
-from official.vision.image_classification import common
-
-
-class KerasCommonTests(tf.test.TestCase):
-  """Tests for common."""
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    super(KerasCommonTests, cls).setUpClass()
-
-  def test_build_stats(self):
-
-    history = self._build_history(1.145, cat_accuracy=.99988)
-    eval_output = self._build_eval_output(.56432111, 5.990)
-    th = keras_utils.TimeHistory(128, 100)
-
-    th.timestamp_log = [keras_utils.BatchTimestamp(0, 1),
-                        keras_utils.BatchTimestamp(1, 2),
-                        keras_utils.BatchTimestamp(2, 3)]
-    th.train_finish_time = 12345
-    stats = common.build_stats(history, eval_output, [th])
-
-    self.assertEqual(1.145, stats['loss'])
-    self.assertEqual(.99988, stats['training_accuracy_top_1'])
-
-    self.assertEqual(.56432111, stats['accuracy_top_1'])
-    self.assertEqual(5.990, stats['eval_loss'])
-
-    self.assertEqual(3, stats['step_timestamp_log'][2].timestamp)
-    self.assertEqual(12345, stats['train_finish_time'])
-
-  def test_build_stats_sparse(self):
-
-    history = self._build_history(1.145, cat_accuracy_sparse=.99988)
-    eval_output = self._build_eval_output(.928, 1.9844)
-    stats = common.build_stats(history, eval_output, None)
-
-    self.assertEqual(1.145, stats['loss'])
-    self.assertEqual(.99988, stats['training_accuracy_top_1'])
-
-    self.assertEqual(.928, stats['accuracy_top_1'])
-    self.assertEqual(1.9844, stats['eval_loss'])
-
-  def test_time_history(self):
-    th = keras_utils.TimeHistory(batch_size=128, log_steps=3)
-
-    th.on_train_begin()
-    th.on_batch_begin(0)
-    th.on_batch_end(0)
-    th.on_batch_begin(1)
-    th.on_batch_end(1)
-    th.on_batch_begin(2)
-    th.on_batch_end(2)
-    th.on_batch_begin(3)
-    th.on_batch_end(3)
-    th.on_batch_begin(4)
-    th.on_batch_end(4)
-    th.on_batch_begin(5)
-    th.on_batch_end(5)
-    th.on_batch_begin(6)
-    th.on_batch_end(6)
-    th.on_train_end()
-
-    self.assertEqual(3, len(th.timestamp_log))
-
-  def _build_history(self, loss, cat_accuracy=None,
-                     cat_accuracy_sparse=None):
-    history_p = Mock()
-    history = {}
-    history_p.history = history
-    history['loss'] = [np.float64(loss)]
-    if cat_accuracy:
-      history['categorical_accuracy'] = [np.float64(cat_accuracy)]
-    if cat_accuracy_sparse:
-      history['sparse_categorical_accuracy'] = [np.float64(cat_accuracy_sparse)]
-
-    return history_p
-
-  def _build_eval_output(self, top_1, eval_loss):
-    eval_output = [np.float64(eval_loss), np.float64(top_1)]
-    return eval_output
-
-if __name__ == '__main__':
-  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
-  googletest.main()
diff --git a/official/vision/image_classification/mnist_main.py b/official/vision/image_classification/mnist_main.py
index 9f72c04bae6..1470c02d05b 100644
--- a/official/vision/image_classification/mnist_main.py
+++ b/official/vision/image_classification/mnist_main.py
@@ -28,7 +28,7 @@
 from official.utils.flags import core as flags_core
 from official.utils.misc import distribution_utils
 from official.utils.misc import model_helpers
-from official.vision.image_classification import common
+from official.vision.image_classification.resnet import common
 
 FLAGS = flags.FLAGS
 
diff --git a/official/vision/image_classification/cifar_preprocessing.py b/official/vision/image_classification/resnet/cifar_preprocessing.py
similarity index 98%
rename from official/vision/image_classification/cifar_preprocessing.py
rename to official/vision/image_classification/resnet/cifar_preprocessing.py
index 53ac7340f12..18d7fe630e1 100644
--- a/official/vision/image_classification/cifar_preprocessing.py
+++ b/official/vision/image_classification/resnet/cifar_preprocessing.py
@@ -22,7 +22,7 @@
 from absl import logging
 import tensorflow as tf
 
-from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification.resnet import imagenet_preprocessing
 
 HEIGHT = 32
 WIDTH = 32
diff --git a/official/vision/image_classification/common.py b/official/vision/image_classification/resnet/common.py
similarity index 100%
rename from official/vision/image_classification/common.py
rename to official/vision/image_classification/resnet/common.py
diff --git a/official/vision/image_classification/imagenet_preprocessing.py b/official/vision/image_classification/resnet/imagenet_preprocessing.py
similarity index 100%
rename from official/vision/image_classification/imagenet_preprocessing.py
rename to official/vision/image_classification/resnet/imagenet_preprocessing.py
diff --git a/official/vision/image_classification/resnet_ctl_imagenet_main.py b/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
similarity index 94%
rename from official/vision/image_classification/resnet_ctl_imagenet_main.py
rename to official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
index d8f2f34ff33..9339e9da19d 100644
--- a/official/vision/image_classification/resnet_ctl_imagenet_main.py
+++ b/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
@@ -30,9 +30,9 @@
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 from official.utils.misc import model_helpers
-from official.vision.image_classification import common
-from official.vision.image_classification import imagenet_preprocessing
-from official.vision.image_classification import resnet_runnable
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import imagenet_preprocessing
+from official.vision.image_classification.resnet import resnet_runnable
 
 flags.DEFINE_boolean(name='use_tf_function', default=True,
                      help='Wrap the train and test step inside a '
@@ -147,9 +147,7 @@ def run(flags_obj):
     runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
                                               per_epoch_steps)
 
-  eval_interval = (
-      flags_obj.epochs_between_evals *
-      per_epoch_steps if not flags_obj.skip_eval else None)
+  eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
   checkpoint_interval = (
       per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
   summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
@@ -174,7 +172,7 @@ def run(flags_obj):
       eval_interval=eval_interval)
 
   time_callback.on_train_begin()
-  resnet_controller.train(evaluate=True)
+  resnet_controller.train(evaluate=not flags_obj.skip_eval)
   time_callback.on_train_end()
 
   stats = build_stats(runnable, time_callback)
diff --git a/official/vision/image_classification/resnet_imagenet_main.py b/official/vision/image_classification/resnet/resnet_imagenet_main.py
similarity index 97%
rename from official/vision/image_classification/resnet_imagenet_main.py
rename to official/vision/image_classification/resnet/resnet_imagenet_main.py
index f8424a9c02c..285df847dbc 100644
--- a/official/vision/image_classification/resnet_imagenet_main.py
+++ b/official/vision/image_classification/resnet/resnet_imagenet_main.py
@@ -26,16 +26,15 @@
 import tensorflow as tf
 
 import tensorflow_model_optimization as tfmot
-
-from official.benchmark.models import trivial_model
 from official.modeling import performance
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 from official.utils.misc import model_helpers
-from official.vision.image_classification import common
-from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification import test_utils
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import imagenet_preprocessing
 from official.vision.image_classification.resnet import resnet_model
 
 
@@ -180,8 +179,7 @@ def run(flags_obj):
 
     # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
     if flags_obj.use_trivial_model:
-      model = trivial_model.trivial_model(
-          imagenet_preprocessing.NUM_CLASSES)
+      model = test_utils.trivial_model(imagenet_preprocessing.NUM_CLASSES)
     elif flags_obj.model == 'resnet50_v1.5':
       model = resnet_model.resnet50(
           num_classes=imagenet_preprocessing.NUM_CLASSES)
diff --git a/official/vision/image_classification/resnet/resnet_model.py b/official/vision/image_classification/resnet/resnet_model.py
index 6faab6a6392..643ca5ad18c 100644
--- a/official/vision/image_classification/resnet/resnet_model.py
+++ b/official/vision/image_classification/resnet/resnet_model.py
@@ -33,7 +33,7 @@
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import models
 from tensorflow.python.keras import regularizers
-from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification.resnet import imagenet_preprocessing
 
 L2_WEIGHT_DECAY = 1e-4
 BATCH_NORM_DECAY = 0.9
@@ -255,9 +255,7 @@ def resnet50(num_classes,
     x = img_input
 
   if backend.image_data_format() == 'channels_first':
-    x = layers.Lambda(
-        lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
-        name='transpose')(x)
+    x = layers.Permute((3, 1, 2))(x)
     bn_axis = 1
   else:  # channels_last
     bn_axis = 3
@@ -382,8 +380,7 @@ def resnet50(num_classes,
       block='c',
       use_l2_regularizer=use_l2_regularizer)
 
-  rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
-  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
+  x = layers.GlobalAveragePooling2D()(x)
   x = layers.Dense(
       num_classes,
       kernel_initializer=initializers.RandomNormal(stddev=0.01),
diff --git a/official/vision/image_classification/resnet_runnable.py b/official/vision/image_classification/resnet/resnet_runnable.py
similarity index 96%
rename from official/vision/image_classification/resnet_runnable.py
rename to official/vision/image_classification/resnet/resnet_runnable.py
index 01c8e9d20f5..b6992ae631d 100644
--- a/official/vision/image_classification/resnet_runnable.py
+++ b/official/vision/image_classification/resnet/resnet_runnable.py
@@ -25,8 +25,8 @@
 from official.staging.training import standard_runnable
 from official.staging.training import utils
 from official.utils.flags import core as flags_core
-from official.vision.image_classification import common
-from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification.resnet import common
+from official.vision.image_classification.resnet import imagenet_preprocessing
 from official.vision.image_classification.resnet import resnet_model
 
 
@@ -175,7 +175,7 @@ def step_fn(inputs):
       self.train_loss.update_state(loss)
       self.train_accuracy.update_state(labels, logits)
 
-    self.strategy.experimental_run_v2(step_fn, args=(next(iterator),))
+    self.strategy.run(step_fn, args=(next(iterator),))
 
   def train_loop_end(self):
     """See base class."""
@@ -204,7 +204,7 @@ def step_fn(inputs):
       self.test_loss.update_state(loss)
       self.test_accuracy.update_state(labels, logits)
 
-    self.strategy.experimental_run_v2(step_fn, args=(next(iterator),))
+    self.strategy.run(step_fn, args=(next(iterator),))
 
   def eval_end(self):
     """See base class."""
diff --git a/official/vision/image_classification/resnet/tfhub_export.py b/official/vision/image_classification/resnet/tfhub_export.py
index 44d92b33a08..8ec000a220a 100644
--- a/official/vision/image_classification/resnet/tfhub_export.py
+++ b/official/vision/image_classification/resnet/tfhub_export.py
@@ -26,7 +26,7 @@
 
 import tensorflow.compat.v2 as tf
 
-from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification.resnet import imagenet_preprocessing
 from official.vision.image_classification.resnet import resnet_model
 
 FLAGS = flags.FLAGS
diff --git a/official/vision/image_classification/resnet_ctl_imagenet_test.py b/official/vision/image_classification/resnet_ctl_imagenet_test.py
deleted file mode 100644
index fbc8d0f850a..00000000000
--- a/official/vision/image_classification/resnet_ctl_imagenet_test.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test the ResNet model with ImageNet data using CTL."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tempfile
-import os
-
-import tensorflow.compat.v2 as tf
-
-from tensorflow.python.eager import context
-from official.utils.testing import integration
-from official.vision.image_classification import common
-from official.vision.image_classification import imagenet_preprocessing
-from official.vision.image_classification import resnet_ctl_imagenet_main
-
-
-class CtlImagenetTest(tf.test.TestCase):
-  """Unit tests for Keras ResNet with ImageNet using CTL."""
-
-  _extra_flags = [
-      '-batch_size', '4',
-      '-train_steps', '4',
-      '-use_synthetic_data', 'true'
-  ]
-  _tempdir = None
-
-  def get_temp_dir(self):
-    if not self._tempdir:
-      self._tempdir = tempfile.mkdtemp(
-          dir=super(CtlImagenetTest, self).get_temp_dir())
-    return self._tempdir
-
-  @classmethod
-  def setUpClass(cls):
-    super(CtlImagenetTest, cls).setUpClass()
-    common.define_keras_flags()
-
-  def setUp(self):
-    super(CtlImagenetTest, self).setUp()
-    imagenet_preprocessing.NUM_IMAGES['validation'] = 4
-    self.policy = \
-        tf.compat.v2.keras.mixed_precision.experimental.global_policy()
-
-  def tearDown(self):
-    super(CtlImagenetTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    tf.compat.v2.keras.mixed_precision.experimental.set_policy(self.policy)
-
-  def test_end_to_end_no_dist_strat(self):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-
-    model_dir = os.path.join(self.get_temp_dir(), 'ctl_imagenet_no_dist_strat')
-    extra_flags = [
-        '-distribution_strategy', 'off',
-        '-model_dir', model_dir,
-        '-data_format', 'channels_last',
-    ]
-    extra_flags = extra_flags + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_ctl_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_2_gpu(self):
-    """Test Keras model with 2 GPUs."""
-    num_gpus = '2'
-    if context.num_gpus() < 2:
-      num_gpus = '0'
-
-    model_dir = os.path.join(self.get_temp_dir(), 'ctl_imagenet_2_gpu')
-    extra_flags = [
-        '-num_gpus', num_gpus,
-        '-distribution_strategy', 'mirrored',
-        '-model_dir', model_dir,
-        '-data_format', 'channels_last',
-    ]
-    extra_flags = extra_flags + self._extra_flags
-
-    integration.run_synthetic(
-        main=resnet_ctl_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-if __name__ == '__main__':
-  assert tf.version.VERSION.startswith('2.')
-  tf.test.main()
diff --git a/official/vision/image_classification/resnet_imagenet_test.py b/official/vision/image_classification/resnet_imagenet_test.py
deleted file mode 100644
index e7ed553dc82..00000000000
--- a/official/vision/image_classification/resnet_imagenet_test.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test the keras ResNet model with ImageNet data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import tensorflow as tf
-
-from tensorflow.python.eager import context
-from official.utils.misc import keras_utils
-from official.utils.testing import integration
-from official.vision.image_classification import imagenet_preprocessing
-from official.vision.image_classification import resnet_imagenet_main
-
-
-@parameterized.parameters(
-    "resnet",
-    "resnet_polynomial_decay",
-    "mobilenet",
-    "mobilenet_polynomial_decay")
-class KerasImagenetTest(tf.test.TestCase):
-  """Unit tests for Keras Models with ImageNet."""
-  _default_flags_dict = [
-      "-batch_size", "4",
-      "-train_steps", "1",
-      "-use_synthetic_data", "true",
-      "-data_format", "channels_last",
-  ]
-  _extra_flags_dict = {
-      "resnet": [
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-      ],
-      "resnet_polynomial_decay": [
-          "-model", "resnet50_v1.5",
-          "-optimizer", "resnet50_default",
-          "-pruning_method", "polynomial_decay",
-      ],
-      "mobilenet": [
-          "-model", "mobilenet",
-          "-optimizer", "mobilenet_default",
-      ],
-      "mobilenet_polynomial_decay": [
-          "-model", "mobilenet",
-          "-optimizer", "mobilenet_default",
-          "-pruning_method", "polynomial_decay",
-      ],
-  }
-  _tempdir = None
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    super(KerasImagenetTest, cls).setUpClass()
-    resnet_imagenet_main.define_imagenet_keras_flags()
-
-  def setUp(self):
-    super(KerasImagenetTest, self).setUp()
-    imagenet_preprocessing.NUM_IMAGES["validation"] = 4
-    self.policy = \
-        tf.compat.v2.keras.mixed_precision.experimental.global_policy()
-
-  def tearDown(self):
-    super(KerasImagenetTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    tf.compat.v2.keras.mixed_precision.experimental.set_policy(self.policy)
-
-  def get_extra_flags_dict(self, flags_key):
-    return self._extra_flags_dict[flags_key] + self._default_flags_dict
-
-  def test_end_to_end_no_dist_strat(self, flags_key):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    extra_flags = [
-        "-distribution_strategy", "off",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_graph_no_dist_strat(self, flags_key):
-    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
-    extra_flags = [
-        "-enable_eager", "false",
-        "-distribution_strategy", "off",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_1_gpu(self, flags_key):
-    """Test Keras model with 1 GPU."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 1:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(1, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "1",
-        "-distribution_strategy", "mirrored",
-        "-enable_checkpoint_and_export", "1",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_1_gpu_fp16(self, flags_key):
-    """Test Keras model with 1 GPU and fp16."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 1:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available"
-          .format(1, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "1",
-        "-dtype", "fp16",
-        "-distribution_strategy", "mirrored",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    if "polynomial_decay" in extra_flags:
-      self.skipTest("Pruning with fp16 is not currently supported.")
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_2_gpu(self, flags_key):
-    """Test Keras model with 2 GPUs."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 2:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(2, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "2",
-        "-distribution_strategy", "mirrored",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_xla_2_gpu(self, flags_key):
-    """Test Keras model with XLA and 2 GPUs."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 2:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(2, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "2",
-        "-enable_xla", "true",
-        "-distribution_strategy", "mirrored",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_2_gpu_fp16(self, flags_key):
-    """Test Keras model with 2 GPUs and fp16."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 2:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(2, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "2",
-        "-dtype", "fp16",
-        "-distribution_strategy", "mirrored",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    if "polynomial_decay" in extra_flags:
-      self.skipTest("Pruning with fp16 is not currently supported.")
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-  def test_end_to_end_xla_2_gpu_fp16(self, flags_key):
-    """Test Keras model with XLA, 2 GPUs and fp16."""
-    config = keras_utils.get_config_proto_v1()
-    tf.compat.v1.enable_eager_execution(config=config)
-
-    if context.num_gpus() < 2:
-      self.skipTest(
-          "{} GPUs are not available for this test. {} GPUs are available".
-          format(2, context.num_gpus()))
-
-    extra_flags = [
-        "-num_gpus", "2",
-        "-dtype", "fp16",
-        "-enable_xla", "true",
-        "-distribution_strategy", "mirrored",
-    ]
-    extra_flags = extra_flags + self.get_extra_flags_dict(flags_key)
-
-    if "polynomial_decay" in extra_flags:
-      self.skipTest("Pruning with fp16 is not currently supported.")
-
-    integration.run_synthetic(
-        main=resnet_imagenet_main.run,
-        tmp_root=self.get_temp_dir(),
-        extra_flags=extra_flags
-    )
-
-
-if __name__ == "__main__":
-  tf.compat.v1.enable_v2_behavior()
-  tf.test.main()
diff --git a/official/benchmark/models/trivial_model.py b/official/vision/image_classification/test_utils.py
similarity index 96%
rename from official/benchmark/models/trivial_model.py
rename to official/vision/image_classification/test_utils.py
index 96f8b729832..a6dc91dc775 100644
--- a/official/benchmark/models/trivial_model.py
+++ b/official/vision/image_classification/test_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A trivial model for Keras."""
+"""Test utilities for image classification tasks."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tutorials/README.md b/tutorials/README.md
deleted file mode 100644
index 7dd5f46c8ef..00000000000
--- a/tutorials/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Tutorial Models
-
-This folder contains models referenced to from the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
diff --git a/tutorials/__init__.py b/tutorials/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tutorials/embedding/README.md b/tutorials/embedding/README.md
deleted file mode 100644
index cb84f532f5c..00000000000
--- a/tutorials/embedding/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains models for unsupervised training of word embeddings
-using the model described in:
-
-(Mikolov, et. al.) [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/abs/1301.3781),
-ICLR 2013.
-
-Detailed instructions on how to get started and use them are available in the
-tutorials. Brief instructions are below.
-
-* [Word2Vec Tutorial](http://tensorflow.org/tutorials/word2vec)
-
-Assuming you have cloned the git repository, navigate into this directory. To download the example text and evaluation data:
-
-```shell
-curl http://mattmahoney.net/dc/text8.zip > text8.zip
-unzip text8.zip
-curl https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip > source-archive.zip
-unzip -p source-archive.zip  word2vec/trunk/questions-words.txt > questions-words.txt
-rm text8.zip source-archive.zip
-```
-
-You will need to compile the ops as follows (See 
-[Adding a New Op to TensorFlow](https://www.tensorflow.org/how_tos/adding_an_op/#building_the_op_library)
-for more details).:
-
-```shell
-TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
-TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
-g++ -std=c++11 -shared word2vec_ops.cc word2vec_kernels.cc -o word2vec_ops.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2 -D_GLIBCXX_USE_CXX11_ABI=0
-```
-
-On Mac, add `-undefined dynamic_lookup` to the g++ command. The flag `-D_GLIBCXX_USE_CXX11_ABI=0` is included to support newer versions of gcc. However, if you compiled TensorFlow from source using gcc 5 or later, you may need to exclude the flag. Specifically, if you get an error similar to the following: `word2vec_ops.so: undefined symbol: _ZN10tensorflow7strings6StrCatERKNS0_8AlphaNumES3_S3_S3_` then you likely need to exclude the flag.
-
-Once you've successfully compiled the ops, run the model as follows:
-
-```shell
-python word2vec_optimized.py \
-  --train_data=text8 \
-  --eval_data=questions-words.txt \
-  --save_path=/tmp/
-```
-
-Here is a short overview of what is in this directory.
-
-File | What's in it?
---- | ---
-`word2vec.py` | A version of word2vec implemented using TensorFlow ops and minibatching.
-`word2vec_test.py` | Integration test for word2vec.
-`word2vec_optimized.py` | A version of word2vec implemented using C ops that does no minibatching.
-`word2vec_optimized_test.py` | Integration test for word2vec_optimized.
-`word2vec_kernels.cc` | Kernels for the custom input and training ops.
-`word2vec_ops.cc` | The declarations of the custom ops.
diff --git a/tutorials/embedding/__init__.py b/tutorials/embedding/__init__.py
deleted file mode 100644
index ea3259cd34e..00000000000
--- a/tutorials/embedding/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Import generated word2vec optimized ops into embedding package."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
diff --git a/tutorials/embedding/word2vec.py b/tutorials/embedding/word2vec.py
deleted file mode 100644
index 72158647389..00000000000
--- a/tutorials/embedding/word2vec.py
+++ /dev/null
@@ -1,534 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Multi-threaded word2vec mini-batched skip-gram model.
-
-Trains the model described in:
-(Mikolov, et. al.) Efficient Estimation of Word Representations in Vector Space
-ICLR 2013.
-http://arxiv.org/abs/1301.3781
-This model does traditional minibatching.
-
-The key ops used are:
-* placeholder for feeding in tensors for each example.
-* embedding_lookup for fetching rows from the embedding matrix.
-* sigmoid_cross_entropy_with_logits to calculate the loss.
-* GradientDescentOptimizer for optimizing the loss.
-* skipgram custom op that does input processing.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import threading
-import time
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-import numpy as np
-import tensorflow as tf
-
-word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so'))
-
-flags = tf.app.flags
-
-flags.DEFINE_string("save_path", None, "Directory to write the model and "
-                    "training summaries.")
-flags.DEFINE_string("train_data", None, "Training text file. "
-                    "E.g., unzipped file http://mattmahoney.net/dc/text8.zip.")
-flags.DEFINE_string(
-    "eval_data", None, "File consisting of analogies of four tokens."
-    "embedding 2 - embedding 1 + embedding 3 should be close "
-    "to embedding 4."
-    "See README.md for how to get 'questions-words.txt'.")
-flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.")
-flags.DEFINE_integer(
-    "epochs_to_train", 15,
-    "Number of epochs to train. Each epoch processes the training data once "
-    "completely.")
-flags.DEFINE_float("learning_rate", 0.2, "Initial learning rate.")
-flags.DEFINE_integer("num_neg_samples", 100,
-                     "Negative samples per training example.")
-flags.DEFINE_integer("batch_size", 16,
-                     "Number of training examples processed per step "
-                     "(size of a minibatch).")
-flags.DEFINE_integer("concurrent_steps", 12,
-                     "The number of concurrent training steps.")
-flags.DEFINE_integer("window_size", 5,
-                     "The number of words to predict to the left and right "
-                     "of the target word.")
-flags.DEFINE_integer("min_count", 5,
-                     "The minimum number of word occurrences for it to be "
-                     "included in the vocabulary.")
-flags.DEFINE_float("subsample", 1e-3,
-                   "Subsample threshold for word occurrence. Words that appear "
-                   "with higher frequency will be randomly down-sampled. Set "
-                   "to 0 to disable.")
-flags.DEFINE_boolean(
-    "interactive", False,
-    "If true, enters an IPython interactive session to play with the trained "
-    "model. E.g., try model.analogy(b'france', b'paris', b'russia') and "
-    "model.nearby([b'proton', b'elephant', b'maxwell'])")
-flags.DEFINE_integer("statistics_interval", 5,
-                     "Print statistics every n seconds.")
-flags.DEFINE_integer("summary_interval", 5,
-                     "Save training summary to file every n seconds (rounded "
-                     "up to statistics interval).")
-flags.DEFINE_integer("checkpoint_interval", 600,
-                     "Checkpoint the model (i.e. save the parameters) every n "
-                     "seconds (rounded up to statistics interval).")
-
-FLAGS = flags.FLAGS
-
-
-class Options(object):
-  """Options used by our word2vec model."""
-
-  def __init__(self):
-    # Model options.
-
-    # Embedding dimension.
-    self.emb_dim = FLAGS.embedding_size
-
-    # Training options.
-    # The training text file.
-    self.train_data = FLAGS.train_data
-
-    # Number of negative samples per example.
-    self.num_samples = FLAGS.num_neg_samples
-
-    # The initial learning rate.
-    self.learning_rate = FLAGS.learning_rate
-
-    # Number of epochs to train. After these many epochs, the learning
-    # rate decays linearly to zero and the training stops.
-    self.epochs_to_train = FLAGS.epochs_to_train
-
-    # Concurrent training steps.
-    self.concurrent_steps = FLAGS.concurrent_steps
-
-    # Number of examples for one training step.
-    self.batch_size = FLAGS.batch_size
-
-    # The number of words to predict to the left and right of the target word.
-    self.window_size = FLAGS.window_size
-
-    # The minimum number of word occurrences for it to be included in the
-    # vocabulary.
-    self.min_count = FLAGS.min_count
-
-    # Subsampling threshold for word occurrence.
-    self.subsample = FLAGS.subsample
-
-    # How often to print statistics.
-    self.statistics_interval = FLAGS.statistics_interval
-
-    # How often to write to the summary file (rounds up to the nearest
-    # statistics_interval).
-    self.summary_interval = FLAGS.summary_interval
-
-    # How often to write checkpoints (rounds up to the nearest statistics
-    # interval).
-    self.checkpoint_interval = FLAGS.checkpoint_interval
-
-    # Where to write out summaries.
-    self.save_path = FLAGS.save_path
-    if not os.path.exists(self.save_path):
-      os.makedirs(self.save_path)
-
-    # Eval options.
-    # The text file for eval.
-    self.eval_data = FLAGS.eval_data
-
-
-class Word2Vec(object):
-  """Word2Vec model (Skipgram)."""
-
-  def __init__(self, options, session):
-    self._options = options
-    self._session = session
-    self._word2id = {}
-    self._id2word = []
-    self.build_graph()
-    self.build_eval_graph()
-    self.save_vocab()
-
-  def read_analogies(self):
-    """Reads through the analogy question file.
-
-    Returns:
-      questions: a [n, 4] numpy array containing the analogy question's
-                 word ids.
-      questions_skipped: questions skipped due to unknown words.
-    """
-    questions = []
-    questions_skipped = 0
-    with open(self._options.eval_data, "rb") as analogy_f:
-      for line in analogy_f:
-        if line.startswith(b":"):  # Skip comments.
-          continue
-        words = line.strip().lower().split(b" ")
-        ids = [self._word2id.get(w.strip()) for w in words]
-        if None in ids or len(ids) != 4:
-          questions_skipped += 1
-        else:
-          questions.append(np.array(ids))
-    print("Eval analogy file: ", self._options.eval_data)
-    print("Questions: ", len(questions))
-    print("Skipped: ", questions_skipped)
-    self._analogy_questions = np.array(questions, dtype=np.int32)
-
-  def forward(self, examples, labels):
-    """Build the graph for the forward pass."""
-    opts = self._options
-
-    # Declare all variables we need.
-    # Embedding: [vocab_size, emb_dim]
-    init_width = 0.5 / opts.emb_dim
-    emb = tf.Variable(
-        tf.random_uniform(
-            [opts.vocab_size, opts.emb_dim], -init_width, init_width),
-        name="emb")
-    self._emb = emb
-
-    # Softmax weight: [vocab_size, emb_dim]. Transposed.
-    sm_w_t = tf.Variable(
-        tf.zeros([opts.vocab_size, opts.emb_dim]),
-        name="sm_w_t")
-
-    # Softmax bias: [vocab_size].
-    sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b")
-
-    # Global step: scalar, i.e., shape [].
-    self.global_step = tf.Variable(0, name="global_step")
-
-    # Nodes to compute the nce loss w/ candidate sampling.
-    labels_matrix = tf.reshape(
-        tf.cast(labels,
-                dtype=tf.int64),
-        [opts.batch_size, 1])
-
-    # Negative sampling.
-    sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
-        true_classes=labels_matrix,
-        num_true=1,
-        num_sampled=opts.num_samples,
-        unique=True,
-        range_max=opts.vocab_size,
-        distortion=0.75,
-        unigrams=opts.vocab_counts.tolist()))
-
-    # Embeddings for examples: [batch_size, emb_dim]
-    example_emb = tf.nn.embedding_lookup(emb, examples)
-
-    # Weights for labels: [batch_size, emb_dim]
-    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
-    # Biases for labels: [batch_size, 1]
-    true_b = tf.nn.embedding_lookup(sm_b, labels)
-
-    # Weights for sampled ids: [num_sampled, emb_dim]
-    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
-    # Biases for sampled ids: [num_sampled, 1]
-    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)
-
-    # True logits: [batch_size, 1]
-    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b
-
-    # Sampled logits: [batch_size, num_sampled]
-    # We replicate sampled noise labels for all examples in the batch
-    # using the matmul.
-    sampled_b_vec = tf.reshape(sampled_b, [opts.num_samples])
-    sampled_logits = tf.matmul(example_emb,
-                               sampled_w,
-                               transpose_b=True) + sampled_b_vec
-    return true_logits, sampled_logits
-
-  def nce_loss(self, true_logits, sampled_logits):
-    """Build the graph for the NCE loss."""
-
-    # cross-entropy(logits, labels)
-    opts = self._options
-    true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(true_logits), logits=true_logits)
-    sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.zeros_like(sampled_logits), logits=sampled_logits)
-
-    # NCE-loss is the sum of the true and noise (sampled words)
-    # contributions, averaged over the batch.
-    nce_loss_tensor = (tf.reduce_sum(true_xent) +
-                       tf.reduce_sum(sampled_xent)) / opts.batch_size
-    return nce_loss_tensor
-
-  def optimize(self, loss):
-    """Build the graph to optimize the loss function."""
-
-    # Optimizer nodes.
-    # Linear learning rate decay.
-    opts = self._options
-    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
-    lr = opts.learning_rate * tf.maximum(
-        0.0001, 1.0 - tf.cast(self._words, tf.float32) / words_to_train)
-    self._lr = lr
-    optimizer = tf.train.GradientDescentOptimizer(lr)
-    train = optimizer.minimize(loss,
-                               global_step=self.global_step,
-                               gate_gradients=optimizer.GATE_NONE)
-    self._train = train
-
-  def build_eval_graph(self):
-    """Build the eval graph."""
-    # Eval graph
-
-    # Each analogy task is to predict the 4th word (d) given three
-    # words: a, b, c.  E.g., a=italy, b=rome, c=france, we should
-    # predict d=paris.
-
-    # The eval feeds three vectors of word ids for a, b, c, each of
-    # which is of size N, where N is the number of analogies we want to
-    # evaluate in one batch.
-    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
-    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
-    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]
-
-    # Normalized word embeddings of shape [vocab_size, emb_dim].
-    nemb = tf.nn.l2_normalize(self._emb, 1)
-
-    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
-    # They all have the shape [N, emb_dim]
-    a_emb = tf.gather(nemb, analogy_a)  # a's embs
-    b_emb = tf.gather(nemb, analogy_b)  # b's embs
-    c_emb = tf.gather(nemb, analogy_c)  # c's embs
-
-    # We expect that d's embedding vectors on the unit hyper-sphere is
-    # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
-    target = c_emb + (b_emb - a_emb)
-
-    # Compute cosine distance between each pair of target and vocab.
-    # dist has shape [N, vocab_size].
-    dist = tf.matmul(target, nemb, transpose_b=True)
-
-    # For each question (row in dist), find the top 4 words.
-    _, pred_idx = tf.nn.top_k(dist, 4)
-
-    # Nodes for computing neighbors for a given word according to
-    # their cosine distance.
-    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
-    nearby_emb = tf.gather(nemb, nearby_word)
-    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
-    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
-                                         min(1000, self._options.vocab_size))
-
-    # Nodes in the construct graph which are used by training and
-    # evaluation to run/feed/fetch.
-    self._analogy_a = analogy_a
-    self._analogy_b = analogy_b
-    self._analogy_c = analogy_c
-    self._analogy_pred_idx = pred_idx
-    self._nearby_word = nearby_word
-    self._nearby_val = nearby_val
-    self._nearby_idx = nearby_idx
-
-  def build_graph(self):
-    """Build the graph for the full model."""
-    opts = self._options
-    # The training data. A text file.
-    (words, counts, words_per_epoch, self._epoch, self._words, examples,
-     labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
-                                          batch_size=opts.batch_size,
-                                          window_size=opts.window_size,
-                                          min_count=opts.min_count,
-                                          subsample=opts.subsample)
-    (opts.vocab_words, opts.vocab_counts,
-     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
-    opts.vocab_size = len(opts.vocab_words)
-    print("Data file: ", opts.train_data)
-    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
-    print("Words per epoch: ", opts.words_per_epoch)
-    self._examples = examples
-    self._labels = labels
-    self._id2word = opts.vocab_words
-    for i, w in enumerate(self._id2word):
-      self._word2id[w] = i
-    true_logits, sampled_logits = self.forward(examples, labels)
-    loss = self.nce_loss(true_logits, sampled_logits)
-    tf.summary.scalar("NCE loss", loss)
-    self._loss = loss
-    self.optimize(loss)
-
-    # Properly initialize all variables.
-    tf.global_variables_initializer().run()
-
-    self.saver = tf.train.Saver()
-
-  def save_vocab(self):
-    """Save the vocabulary to a file so the model can be reloaded."""
-    opts = self._options
-    with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
-      for i in xrange(opts.vocab_size):
-        vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8")
-        f.write("%s %d\n" % (vocab_word,
-                             opts.vocab_counts[i]))
-
-  def _train_thread_body(self):
-    initial_epoch, = self._session.run([self._epoch])
-    while True:
-      _, epoch = self._session.run([self._train, self._epoch])
-      if epoch != initial_epoch:
-        break
-
-  def train(self):
-    """Train the model."""
-    opts = self._options
-
-    initial_epoch, initial_words = self._session.run([self._epoch, self._words])
-
-    summary_op = tf.summary.merge_all()
-    summary_writer = tf.summary.FileWriter(opts.save_path, self._session.graph)
-    workers = []
-    for _ in xrange(opts.concurrent_steps):
-      t = threading.Thread(target=self._train_thread_body)
-      t.start()
-      workers.append(t)
-
-    last_words, last_time, last_summary_time = initial_words, time.time(), 0
-    last_checkpoint_time = 0
-    while True:
-      time.sleep(opts.statistics_interval)  # Reports our progress once a while.
-      (epoch, step, loss, words, lr) = self._session.run(
-          [self._epoch, self.global_step, self._loss, self._words, self._lr])
-      now = time.time()
-      last_words, last_time, rate = words, now, (words - last_words) / (
-          now - last_time)
-      print("Epoch %4d Step %8d: lr = %5.3f loss = %6.2f words/sec = %8.0f\r" %
-            (epoch, step, lr, loss, rate), end="")
-      sys.stdout.flush()
-      if now - last_summary_time > opts.summary_interval:
-        summary_str = self._session.run(summary_op)
-        summary_writer.add_summary(summary_str, step)
-        last_summary_time = now
-      if now - last_checkpoint_time > opts.checkpoint_interval:
-        self.saver.save(self._session,
-                        os.path.join(opts.save_path, "model.ckpt"),
-                        global_step=step.astype(int))
-        last_checkpoint_time = now
-      if epoch != initial_epoch:
-        break
-
-    for t in workers:
-      t.join()
-
-    return epoch
-
-  def _predict(self, analogy):
-    """Predict the top 4 answers for analogy questions."""
-    idx, = self._session.run([self._analogy_pred_idx], {
-        self._analogy_a: analogy[:, 0],
-        self._analogy_b: analogy[:, 1],
-        self._analogy_c: analogy[:, 2]
-    })
-    return idx
-
-  def eval(self):
-    """Evaluate analogy questions and reports accuracy."""
-
-    # How many questions we get right at precision@1.
-    correct = 0
-
-    try:
-      total = self._analogy_questions.shape[0]
-    except AttributeError as e:
-      raise AttributeError("Need to read analogy questions.")
-
-    start = 0
-    while start < total:
-      limit = start + 2500
-      sub = self._analogy_questions[start:limit, :]
-      idx = self._predict(sub)
-      start = limit
-      for question in xrange(sub.shape[0]):
-        for j in xrange(4):
-          if idx[question, j] == sub[question, 3]:
-            # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
-            correct += 1
-            break
-          elif idx[question, j] in sub[question, :3]:
-            # We need to skip words already in the question.
-            continue
-          else:
-            # The correct label is not the precision@1
-            break
-    print()
-    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total,
-                                              correct * 100.0 / total))
-
-  def analogy(self, w0, w1, w2):
-    """Predict word w3 as in w0:w1 vs w2:w3."""
-    wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]])
-    idx = self._predict(wid)
-    for c in [self._id2word[i] for i in idx[0, :]]:
-      if c not in [w0, w1, w2]:
-        print(c)
-        return
-    print("unknown")
-
-  def nearby(self, words, num=20):
-    """Prints out nearby words given a list of words."""
-    ids = np.array([self._word2id.get(x, 0) for x in words])
-    vals, idx = self._session.run(
-        [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
-    for i in xrange(len(words)):
-      print("\n%s\n=====================================" % (words[i]))
-      for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
-        print("%-20s %6.4f" % (self._id2word[neighbor], distance))
-
-
-def _start_shell(local_ns=None):
-  # An interactive shell is useful for debugging/development.
-  import IPython
-  user_ns = {}
-  if local_ns:
-    user_ns.update(local_ns)
-  user_ns.update(globals())
-  IPython.start_ipython(argv=[], user_ns=user_ns)
-
-
-def main(_):
-  """Train a word2vec model."""
-  if not FLAGS.train_data or not FLAGS.eval_data or not FLAGS.save_path:
-    print("--train_data --eval_data and --save_path must be specified.")
-    sys.exit(1)
-  opts = Options()
-  with tf.Graph().as_default(), tf.Session() as session:
-    with tf.device("/cpu:0"):
-      model = Word2Vec(opts, session)
-      model.read_analogies() # Read analogy questions
-    for _ in xrange(opts.epochs_to_train):
-      model.train()  # Process one epoch
-      model.eval()  # Eval analogies.
-    # Perform a final save.
-    model.saver.save(session,
-                     os.path.join(opts.save_path, "model.ckpt"),
-                     global_step=model.global_step)
-    if FLAGS.interactive:
-      # E.g.,
-      # [0]: model.analogy(b'france', b'paris', b'russia')
-      # [1]: model.nearby([b'proton', b'elephant', b'maxwell'])
-      _start_shell(locals())
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tutorials/embedding/word2vec_kernels.cc b/tutorials/embedding/word2vec_kernels.cc
deleted file mode 100644
index 989ab1639c2..00000000000
--- a/tutorials/embedding/word2vec_kernels.cc
+++ /dev/null
@@ -1,355 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/random/distribution_sampler.h"
-#include "tensorflow/core/lib/random/philox_random.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/util/guarded_philox_random.h"
-
-namespace tensorflow {
-
-// Number of examples to precalculate.
-const int kPrecalc = 3000;
-// Number of words to read into a sentence before processing.
-const int kSentenceSize = 1000;
-
-namespace {
-
-bool ScanWord(StringPiece* input, string* word) {
-  str_util::RemoveLeadingWhitespace(input);
-  StringPiece tmp;
-  if (str_util::ConsumeNonWhitespace(input, &tmp)) {
-    word->assign(tmp.data(), tmp.size());
-    return true;
-  } else {
-    return false;
-  }
-}
-
-}  // end namespace
-
-class SkipgramWord2vecOp : public OpKernel {
- public:
-  explicit SkipgramWord2vecOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), rng_(&philox_) {
-    string filename;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("filename", &filename));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("batch_size", &batch_size_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size", &window_size_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("min_count", &min_count_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("subsample", &subsample_));
-    OP_REQUIRES_OK(ctx, Init(ctx->env(), filename));
-
-    mutex_lock l(mu_);
-    example_pos_ = corpus_size_;
-    label_pos_ = corpus_size_;
-    label_limit_ = corpus_size_;
-    sentence_index_ = kSentenceSize;
-    for (int i = 0; i < kPrecalc; ++i) {
-      NextExample(&precalc_examples_[i].input, &precalc_examples_[i].label);
-    }
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    Tensor words_per_epoch(DT_INT64, TensorShape({}));
-    Tensor current_epoch(DT_INT32, TensorShape({}));
-    Tensor total_words_processed(DT_INT64, TensorShape({}));
-    Tensor examples(DT_INT32, TensorShape({batch_size_}));
-    auto Texamples = examples.flat<int32>();
-    Tensor labels(DT_INT32, TensorShape({batch_size_}));
-    auto Tlabels = labels.flat<int32>();
-    {
-      mutex_lock l(mu_);
-      for (int i = 0; i < batch_size_; ++i) {
-        Texamples(i) = precalc_examples_[precalc_index_].input;
-        Tlabels(i) = precalc_examples_[precalc_index_].label;
-        precalc_index_++;
-        if (precalc_index_ >= kPrecalc) {
-          precalc_index_ = 0;
-          for (int j = 0; j < kPrecalc; ++j) {
-            NextExample(&precalc_examples_[j].input,
-                        &precalc_examples_[j].label);
-          }
-        }
-      }
-      words_per_epoch.scalar<int64>()() = corpus_size_;
-      current_epoch.scalar<int32>()() = current_epoch_;
-      total_words_processed.scalar<int64>()() = total_words_processed_;
-    }
-    ctx->set_output(0, word_);
-    ctx->set_output(1, freq_);
-    ctx->set_output(2, words_per_epoch);
-    ctx->set_output(3, current_epoch);
-    ctx->set_output(4, total_words_processed);
-    ctx->set_output(5, examples);
-    ctx->set_output(6, labels);
-  }
-
- private:
-  struct Example {
-    int32 input;
-    int32 label;
-  };
-
-  int32 batch_size_ = 0;
-  int32 window_size_ = 5;
-  float subsample_ = 1e-3;
-  int min_count_ = 5;
-  int32 vocab_size_ = 0;
-  Tensor word_;
-  Tensor freq_;
-  int64 corpus_size_ = 0;
-  std::vector<int32> corpus_;
-  std::vector<Example> precalc_examples_;
-  int precalc_index_ = 0;
-  std::vector<int32> sentence_;
-  int sentence_index_ = 0;
-
-  mutex mu_;
-  random::PhiloxRandom philox_ GUARDED_BY(mu_);
-  random::SimplePhilox rng_ GUARDED_BY(mu_);
-  int32 current_epoch_ GUARDED_BY(mu_) = -1;
-  int64 total_words_processed_ GUARDED_BY(mu_) = 0;
-  int64 example_pos_ GUARDED_BY(mu_);
-  int32 label_pos_ GUARDED_BY(mu_);
-  int32 label_limit_ GUARDED_BY(mu_);
-
-  // {example_pos_, label_pos_} is the cursor for the next example.
-  // example_pos_ wraps around at the end of corpus_. For each
-  // example, we randomly generate [label_pos_, label_limit) for
-  // labels.
-  void NextExample(int32* example, int32* label) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    while (true) {
-      if (label_pos_ >= label_limit_) {
-        ++total_words_processed_;
-        ++sentence_index_;
-        if (sentence_index_ >= kSentenceSize) {
-          sentence_index_ = 0;
-          for (int i = 0; i < kSentenceSize; ++i, ++example_pos_) {
-            if (example_pos_ >= corpus_size_) {
-              ++current_epoch_;
-              example_pos_ = 0;
-            }
-            if (subsample_ > 0) {
-              int32 word_freq = freq_.flat<int32>()(corpus_[example_pos_]);
-              // See Eq. 5 in http://arxiv.org/abs/1310.4546
-              float keep_prob =
-                  (std::sqrt(word_freq / (subsample_ * corpus_size_)) + 1) *
-                  (subsample_ * corpus_size_) / word_freq;
-              if (rng_.RandFloat() > keep_prob) {
-                i--;
-                continue;
-              }
-            }
-            sentence_[i] = corpus_[example_pos_];
-          }
-        }
-        const int32 skip = 1 + rng_.Uniform(window_size_);
-        label_pos_ = std::max<int32>(0, sentence_index_ - skip);
-        label_limit_ =
-            std::min<int32>(kSentenceSize, sentence_index_ + skip + 1);
-      }
-      if (sentence_index_ != label_pos_) {
-        break;
-      }
-      ++label_pos_;
-    }
-    *example = sentence_[sentence_index_];
-    *label = sentence_[label_pos_++];
-  }
-
-  Status Init(Env* env, const string& filename) {
-    string data;
-    TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &data));
-    StringPiece input = data;
-    string w;
-    corpus_size_ = 0;
-    std::unordered_map<string, int32> word_freq;
-    while (ScanWord(&input, &w)) {
-      ++(word_freq[w]);
-      ++corpus_size_;
-    }
-    if (corpus_size_ < window_size_ * 10) {
-      return errors::InvalidArgument("The text file ", filename,
-                                     " contains too little data: ",
-                                     corpus_size_, " words");
-    }
-    typedef std::pair<string, int32> WordFreq;
-    std::vector<WordFreq> ordered;
-    for (const auto& p : word_freq) {
-      if (p.second >= min_count_) ordered.push_back(p);
-    }
-    LOG(INFO) << "Data file: " << filename << " contains " << data.size()
-              << " bytes, " << corpus_size_ << " words, " << word_freq.size()
-              << " unique words, " << ordered.size()
-              << " unique frequent words.";
-    word_freq.clear();
-    std::sort(ordered.begin(), ordered.end(),
-              [](const WordFreq& x, const WordFreq& y) {
-                return x.second > y.second;
-              });
-    vocab_size_ = static_cast<int32>(1 + ordered.size());
-    Tensor word(DT_STRING, TensorShape({vocab_size_}));
-    Tensor freq(DT_INT32, TensorShape({vocab_size_}));
-    word.flat<tstring>()(0) = "UNK";
-    static const int32 kUnkId = 0;
-    std::unordered_map<string, int32> word_id;
-    int64 total_counted = 0;
-    for (std::size_t i = 0; i < ordered.size(); ++i) {
-      const auto& w = ordered[i].first;
-      auto id = i + 1;
-      word.flat<tstring>()(id) = w;
-      auto word_count = ordered[i].second;
-      freq.flat<int32>()(id) = word_count;
-      total_counted += word_count;
-      word_id[w] = id;
-    }
-    freq.flat<int32>()(kUnkId) = corpus_size_ - total_counted;
-    word_ = word;
-    freq_ = freq;
-    corpus_.reserve(corpus_size_);
-    input = data;
-    while (ScanWord(&input, &w)) {
-      corpus_.push_back(gtl::FindWithDefault(word_id, w, kUnkId));
-    }
-    precalc_examples_.resize(kPrecalc);
-    sentence_.resize(kSentenceSize);
-    return Status::OK();
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("SkipgramWord2vec").Device(DEVICE_CPU), SkipgramWord2vecOp);
-
-class NegTrainWord2vecOp : public OpKernel {
- public:
-  explicit NegTrainWord2vecOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    base_.Init(0, 0);
-
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_negative_samples", &num_samples_));
-
-    std::vector<int32> vocab_count;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("vocab_count", &vocab_count));
-
-    std::vector<float> vocab_weights;
-    vocab_weights.reserve(vocab_count.size());
-    for (const auto& f : vocab_count) {
-      float r = std::pow(static_cast<float>(f), 0.75f);
-      vocab_weights.push_back(r);
-    }
-    sampler_ = new random::DistributionSampler(vocab_weights);
-  }
-
-  ~NegTrainWord2vecOp() { delete sampler_; }
-
-  void Compute(OpKernelContext* ctx) override {
-    Tensor w_in = ctx->mutable_input(0, false);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(w_in.shape()),
-                errors::InvalidArgument("Must be a matrix"));
-    Tensor w_out = ctx->mutable_input(1, false);
-    OP_REQUIRES(ctx, w_in.shape() == w_out.shape(),
-                errors::InvalidArgument("w_in.shape == w_out.shape"));
-    const Tensor& examples = ctx->input(2);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(examples.shape()),
-                errors::InvalidArgument("Must be a vector"));
-    const Tensor& labels = ctx->input(3);
-    OP_REQUIRES(ctx, examples.shape() == labels.shape(),
-                errors::InvalidArgument("examples.shape == labels.shape"));
-    const Tensor& learning_rate = ctx->input(4);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(learning_rate.shape()),
-                errors::InvalidArgument("Must be a scalar"));
-
-    auto Tw_in = w_in.matrix<float>();
-    auto Tw_out = w_out.matrix<float>();
-    auto Texamples = examples.flat<int32>();
-    auto Tlabels = labels.flat<int32>();
-    auto lr = learning_rate.scalar<float>()();
-    const int64 vocab_size = w_in.dim_size(0);
-    const int64 dims = w_in.dim_size(1);
-    const int64 batch_size = examples.dim_size(0);
-    OP_REQUIRES(ctx, vocab_size == sampler_->num(),
-                errors::InvalidArgument("vocab_size mismatches: ", vocab_size,
-                                        " vs. ", sampler_->num()));
-
-    // Gradient accumulator for v_in.
-    Tensor buf(DT_FLOAT, TensorShape({dims}));
-    auto Tbuf = buf.flat<float>();
-
-    // Scalar buffer to hold sigmoid(+/- dot).
-    Tensor g_buf(DT_FLOAT, TensorShape({}));
-    auto g = g_buf.scalar<float>();
-
-    // The following loop needs 2 random 32-bit values per negative
-    // sample.  We reserve 8 values per sample just in case the
-    // underlying implementation changes.
-    auto rnd = base_.ReserveSamples32(batch_size * num_samples_ * 8);
-    random::SimplePhilox srnd(&rnd);
-
-    for (int64 i = 0; i < batch_size; ++i) {
-      const int32 example = Texamples(i);
-      DCHECK(0 <= example && example < vocab_size) << example;
-      const int32 label = Tlabels(i);
-      DCHECK(0 <= label && label < vocab_size) << label;
-      auto v_in = Tw_in.chip<0>(example);
-
-      // Positive: example predicts label.
-      //   forward: x = v_in' * v_out
-      //            l = log(sigmoid(x))
-      //   backward: dl/dx = g = sigmoid(-x)
-      //             dl/d(v_in) = g * v_out'
-      //             dl/d(v_out) = v_in' * g
-      {
-        auto v_out = Tw_out.chip<0>(label);
-        auto dot = (v_in * v_out).sum();
-        g = (dot.exp() + 1.f).inverse();
-        Tbuf = v_out * (g() * lr);
-        v_out += v_in * (g() * lr);
-      }
-
-      // Negative samples:
-      //   forward: x = v_in' * v_sample
-      //            l = log(sigmoid(-x))
-      //   backward: dl/dx = g = -sigmoid(x)
-      //             dl/d(v_in) = g * v_out'
-      //             dl/d(v_out) = v_in' * g
-      for (int j = 0; j < num_samples_; ++j) {
-        const int sample = sampler_->Sample(&srnd);
-        if (sample == label) continue;  // Skip.
-        auto v_sample = Tw_out.chip<0>(sample);
-        auto dot = (v_in * v_sample).sum();
-        g = -((-dot).exp() + 1.f).inverse();
-        Tbuf += v_sample * (g() * lr);
-        v_sample += v_in * (g() * lr);
-      }
-
-      // Applies the gradient on v_in.
-      v_in += Tbuf;
-    }
-  }
-
- private:
-  int32 num_samples_ = 0;
-  random::DistributionSampler* sampler_ = nullptr;
-  GuardedPhiloxRandom base_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("NegTrainWord2vec").Device(DEVICE_CPU), NegTrainWord2vecOp);
-
-}  // end namespace tensorflow
diff --git a/tutorials/embedding/word2vec_ops.cc b/tutorials/embedding/word2vec_ops.cc
deleted file mode 100644
index cdffa4a7725..00000000000
--- a/tutorials/embedding/word2vec_ops.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("SkipgramWord2vec")
-    .Output("vocab_word: string")
-    .Output("vocab_freq: int32")
-    .Output("words_per_epoch: int64")
-    .Output("current_epoch: int32")
-    .Output("total_words_processed: int64")
-    .Output("examples: int32")
-    .Output("labels: int32")
-    .SetIsStateful()
-    .Attr("filename: string")
-    .Attr("batch_size: int")
-    .Attr("window_size: int = 5")
-    .Attr("min_count: int = 5")
-    .Attr("subsample: float = 1e-3")
-    .Doc(R"doc(
-Parses a text file and creates a batch of examples.
-
-vocab_word: A vector of words in the corpus.
-vocab_freq: Frequencies of words. Sorted in the non-ascending order.
-words_per_epoch: Number of words per epoch in the data file.
-current_epoch: The current epoch number.
-total_words_processed: The total number of words processed so far.
-examples: A vector of word ids.
-labels: A vector of word ids.
-filename: The corpus's text file name.
-batch_size: The size of produced batch.
-window_size: The number of words to predict to the left and right of the target.
-min_count: The minimum number of word occurrences for it to be included in the
-    vocabulary.
-subsample: Threshold for word occurrence. Words that appear with higher
-    frequency will be randomly down-sampled. Set to 0 to disable.
-)doc");
-
-REGISTER_OP("NegTrainWord2vec")
-    .Input("w_in: Ref(float)")
-    .Input("w_out: Ref(float)")
-    .Input("examples: int32")
-    .Input("labels: int32")
-    .Input("lr: float")
-    .SetIsStateful()
-    .Attr("vocab_count: list(int)")
-    .Attr("num_negative_samples: int")
-    .Doc(R"doc(
-Training via negative sampling.
-
-w_in: input word embedding.
-w_out: output word embedding.
-examples: A vector of word ids.
-labels: A vector of word ids.
-vocab_count: Count of words in the vocabulary.
-num_negative_samples: Number of negative samples per example.
-)doc");
-
-}  // end namespace tensorflow
diff --git a/tutorials/embedding/word2vec_optimized.py b/tutorials/embedding/word2vec_optimized.py
deleted file mode 100644
index 420991a8a5d..00000000000
--- a/tutorials/embedding/word2vec_optimized.py
+++ /dev/null
@@ -1,439 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Multi-threaded word2vec unbatched skip-gram model.
-
-Trains the model described in:
-(Mikolov, et. al.) Efficient Estimation of Word Representations in Vector Space
-ICLR 2013.
-http://arxiv.org/abs/1301.3781
-This model does true SGD (i.e. no minibatching). To do this efficiently, custom
-ops are used to sequentially process data within a 'batch'.
-
-The key ops used are:
-* skipgram custom op that does input processing.
-* neg_train custom op that efficiently calculates and applies the gradient using
-  true SGD.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import threading
-import time
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-import numpy as np
-import tensorflow as tf
-
-word2vec = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'word2vec_ops.so'))
-
-flags = tf.app.flags
-
-flags.DEFINE_string("save_path", None, "Directory to write the model.")
-flags.DEFINE_string(
-    "train_data", None,
-    "Training data. E.g., unzipped file http://mattmahoney.net/dc/text8.zip.")
-flags.DEFINE_string(
-    "eval_data", None, "Analogy questions. "
-    "See README.md for how to get 'questions-words.txt'.")
-flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.")
-flags.DEFINE_integer(
-    "epochs_to_train", 15,
-    "Number of epochs to train. Each epoch processes the training data once "
-    "completely.")
-flags.DEFINE_float("learning_rate", 0.025, "Initial learning rate.")
-flags.DEFINE_integer("num_neg_samples", 25,
-                     "Negative samples per training example.")
-flags.DEFINE_integer("batch_size", 500,
-                     "Numbers of training examples each step processes "
-                     "(no minibatching).")
-flags.DEFINE_integer("concurrent_steps", 12,
-                     "The number of concurrent training steps.")
-flags.DEFINE_integer("window_size", 5,
-                     "The number of words to predict to the left and right "
-                     "of the target word.")
-flags.DEFINE_integer("min_count", 5,
-                     "The minimum number of word occurrences for it to be "
-                     "included in the vocabulary.")
-flags.DEFINE_float("subsample", 1e-3,
-                   "Subsample threshold for word occurrence. Words that appear "
-                   "with higher frequency will be randomly down-sampled. Set "
-                   "to 0 to disable.")
-flags.DEFINE_boolean(
-    "interactive", False,
-    "If true, enters an IPython interactive session to play with the trained "
-    "model. E.g., try model.analogy(b'france', b'paris', b'russia') and "
-    "model.nearby([b'proton', b'elephant', b'maxwell'])")
-
-FLAGS = flags.FLAGS
-
-
-class Options(object):
-  """Options used by our word2vec model."""
-
-  def __init__(self):
-    # Model options.
-
-    # Embedding dimension.
-    self.emb_dim = FLAGS.embedding_size
-
-    # Training options.
-
-    # The training text file.
-    self.train_data = FLAGS.train_data
-
-    # Number of negative samples per example.
-    self.num_samples = FLAGS.num_neg_samples
-
-    # The initial learning rate.
-    self.learning_rate = FLAGS.learning_rate
-
-    # Number of epochs to train. After these many epochs, the learning
-    # rate decays linearly to zero and the training stops.
-    self.epochs_to_train = FLAGS.epochs_to_train
-
-    # Concurrent training steps.
-    self.concurrent_steps = FLAGS.concurrent_steps
-
-    # Number of examples for one training step.
-    self.batch_size = FLAGS.batch_size
-
-    # The number of words to predict to the left and right of the target word.
-    self.window_size = FLAGS.window_size
-
-    # The minimum number of word occurrences for it to be included in the
-    # vocabulary.
-    self.min_count = FLAGS.min_count
-
-    # Subsampling threshold for word occurrence.
-    self.subsample = FLAGS.subsample
-
-    # Where to write out summaries.
-    self.save_path = FLAGS.save_path
-    if not os.path.exists(self.save_path):
-      os.makedirs(self.save_path)
-
-    # Eval options.
-
-    # The text file for eval.
-    self.eval_data = FLAGS.eval_data
-
-
-class Word2Vec(object):
-  """Word2Vec model (Skipgram)."""
-
-  def __init__(self, options, session):
-    self._options = options
-    self._session = session
-    self._word2id = {}
-    self._id2word = []
-    self.build_graph()
-    self.build_eval_graph()
-    self.save_vocab()
-
-  def read_analogies(self):
-    """Reads through the analogy question file.
-
-    Returns:
-      questions: a [n, 4] numpy array containing the analogy question's
-                 word ids.
-      questions_skipped: questions skipped due to unknown words.
-    """
-    questions = []
-    questions_skipped = 0
-    with open(self._options.eval_data, "rb") as analogy_f:
-      for line in analogy_f:
-        if line.startswith(b":"):  # Skip comments.
-          continue
-        words = line.strip().lower().split(b" ")
-        ids = [self._word2id.get(w.strip()) for w in words]
-        if None in ids or len(ids) != 4:
-          questions_skipped += 1
-        else:
-          questions.append(np.array(ids))
-    print("Eval analogy file: ", self._options.eval_data)
-    print("Questions: ", len(questions))
-    print("Skipped: ", questions_skipped)
-    self._analogy_questions = np.array(questions, dtype=np.int32)
-
-  def build_graph(self):
-    """Build the model graph."""
-    opts = self._options
-
-    # The training data. A text file.
-    (words, counts, words_per_epoch, current_epoch, total_words_processed,
-     examples, labels) = word2vec.skipgram_word2vec(filename=opts.train_data,
-                                                    batch_size=opts.batch_size,
-                                                    window_size=opts.window_size,
-                                                    min_count=opts.min_count,
-                                                    subsample=opts.subsample)
-    (opts.vocab_words, opts.vocab_counts,
-     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
-    opts.vocab_size = len(opts.vocab_words)
-    print("Data file: ", opts.train_data)
-    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
-    print("Words per epoch: ", opts.words_per_epoch)
-
-    self._id2word = opts.vocab_words
-    for i, w in enumerate(self._id2word):
-      self._word2id[w] = i
-
-    # Declare all variables we need.
-    # Input words embedding: [vocab_size, emb_dim]
-    w_in = tf.Variable(
-        tf.random_uniform(
-            [opts.vocab_size,
-             opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
-        name="w_in")
-
-    # Global step: scalar, i.e., shape [].
-    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")
-
-    # Global step: []
-    global_step = tf.Variable(0, name="global_step")
-
-    # Linear learning rate decay.
-    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
-    lr = opts.learning_rate * tf.maximum(
-        0.0001,
-        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
-
-    # Training nodes.
-    inc = global_step.assign_add(1)
-    with tf.control_dependencies([inc]):
-      train = word2vec.neg_train_word2vec(w_in,
-                                          w_out,
-                                          examples,
-                                          labels,
-                                          lr,
-                                          vocab_count=opts.vocab_counts.tolist(),
-                                          num_negative_samples=opts.num_samples)
-
-    self._w_in = w_in
-    self._examples = examples
-    self._labels = labels
-    self._lr = lr
-    self._train = train
-    self.global_step = global_step
-    self._epoch = current_epoch
-    self._words = total_words_processed
-
-  def save_vocab(self):
-    """Save the vocabulary to a file so the model can be reloaded."""
-    opts = self._options
-    with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
-      for i in xrange(opts.vocab_size):
-        vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8")
-        f.write("%s %d\n" % (vocab_word,
-                             opts.vocab_counts[i]))
-
-  def build_eval_graph(self):
-    """Build the evaluation graph."""
-    # Eval graph
-    opts = self._options
-
-    # Each analogy task is to predict the 4th word (d) given three
-    # words: a, b, c.  E.g., a=italy, b=rome, c=france, we should
-    # predict d=paris.
-
-    # The eval feeds three vectors of word ids for a, b, c, each of
-    # which is of size N, where N is the number of analogies we want to
-    # evaluate in one batch.
-    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
-    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
-    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]
-
-    # Normalized word embeddings of shape [vocab_size, emb_dim].
-    nemb = tf.nn.l2_normalize(self._w_in, 1)
-
-    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
-    # They all have the shape [N, emb_dim]
-    a_emb = tf.gather(nemb, analogy_a)  # a's embs
-    b_emb = tf.gather(nemb, analogy_b)  # b's embs
-    c_emb = tf.gather(nemb, analogy_c)  # c's embs
-
-    # We expect that d's embedding vectors on the unit hyper-sphere is
-    # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
-    target = c_emb + (b_emb - a_emb)
-
-    # Compute cosine distance between each pair of target and vocab.
-    # dist has shape [N, vocab_size].
-    dist = tf.matmul(target, nemb, transpose_b=True)
-
-    # For each question (row in dist), find the top 4 words.
-    _, pred_idx = tf.nn.top_k(dist, 4)
-
-    # Nodes for computing neighbors for a given word according to
-    # their cosine distance.
-    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
-    nearby_emb = tf.gather(nemb, nearby_word)
-    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
-    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
-                                         min(1000, opts.vocab_size))
-
-    # Nodes in the construct graph which are used by training and
-    # evaluation to run/feed/fetch.
-    self._analogy_a = analogy_a
-    self._analogy_b = analogy_b
-    self._analogy_c = analogy_c
-    self._analogy_pred_idx = pred_idx
-    self._nearby_word = nearby_word
-    self._nearby_val = nearby_val
-    self._nearby_idx = nearby_idx
-
-    # Properly initialize all variables.
-    tf.global_variables_initializer().run()
-
-    self.saver = tf.train.Saver()
-
-  def _train_thread_body(self):
-    initial_epoch, = self._session.run([self._epoch])
-    while True:
-      _, epoch = self._session.run([self._train, self._epoch])
-      if epoch != initial_epoch:
-        break
-
-  def train(self):
-    """Train the model."""
-    opts = self._options
-
-    initial_epoch, initial_words = self._session.run([self._epoch, self._words])
-
-    workers = []
-    for _ in xrange(opts.concurrent_steps):
-      t = threading.Thread(target=self._train_thread_body)
-      t.start()
-      workers.append(t)
-
-    last_words, last_time = initial_words, time.time()
-    while True:
-      time.sleep(5)  # Reports our progress once a while.
-      (epoch, step, words, lr) = self._session.run(
-          [self._epoch, self.global_step, self._words, self._lr])
-      now = time.time()
-      last_words, last_time, rate = words, now, (words - last_words) / (
-          now - last_time)
-      print("Epoch %4d Step %8d: lr = %5.3f words/sec = %8.0f\r" % (epoch, step,
-                                                                    lr, rate),
-            end="")
-      sys.stdout.flush()
-      if epoch != initial_epoch:
-        break
-
-    for t in workers:
-      t.join()
-
-  def _predict(self, analogy):
-    """Predict the top 4 answers for analogy questions."""
-    idx, = self._session.run([self._analogy_pred_idx], {
-        self._analogy_a: analogy[:, 0],
-        self._analogy_b: analogy[:, 1],
-        self._analogy_c: analogy[:, 2]
-    })
-    return idx
-
-  def eval(self):
-    """Evaluate analogy questions and reports accuracy."""
-
-    # How many questions we get right at precision@1.
-    correct = 0
-
-    try:
-      total = self._analogy_questions.shape[0]
-    except AttributeError as e:
-      raise AttributeError("Need to read analogy questions.")
-
-    start = 0
-    while start < total:
-      limit = start + 2500
-      sub = self._analogy_questions[start:limit, :]
-      idx = self._predict(sub)
-      start = limit
-      for question in xrange(sub.shape[0]):
-        for j in xrange(4):
-          if idx[question, j] == sub[question, 3]:
-            # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
-            correct += 1
-            break
-          elif idx[question, j] in sub[question, :3]:
-            # We need to skip words already in the question.
-            continue
-          else:
-            # The correct label is not the precision@1
-            break
-    print()
-    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total,
-                                              correct * 100.0 / total))
-
-  def analogy(self, w0, w1, w2):
-    """Predict word w3 as in w0:w1 vs w2:w3."""
-    wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]])
-    idx = self._predict(wid)
-    for c in [self._id2word[i] for i in idx[0, :]]:
-      if c not in [w0, w1, w2]:
-        print(c)
-        break
-    print("unknown")
-
-  def nearby(self, words, num=20):
-    """Prints out nearby words given a list of words."""
-    ids = np.array([self._word2id.get(x, 0) for x in words])
-    vals, idx = self._session.run(
-        [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
-    for i in xrange(len(words)):
-      print("\n%s\n=====================================" % (words[i]))
-      for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
-        print("%-20s %6.4f" % (self._id2word[neighbor], distance))
-
-
-def _start_shell(local_ns=None):
-  # An interactive shell is useful for debugging/development.
-  import IPython
-  user_ns = {}
-  if local_ns:
-    user_ns.update(local_ns)
-  user_ns.update(globals())
-  IPython.start_ipython(argv=[], user_ns=user_ns)
-
-
-def main(_):
-  """Train a word2vec model."""
-  if not FLAGS.train_data or not FLAGS.eval_data or not FLAGS.save_path:
-    print("--train_data --eval_data and --save_path must be specified.")
-    sys.exit(1)
-  opts = Options()
-  with tf.Graph().as_default(), tf.Session() as session:
-    with tf.device("/cpu:0"):
-      model = Word2Vec(opts, session)
-      model.read_analogies() # Read analogy questions
-    for _ in xrange(opts.epochs_to_train):
-      model.train()  # Process one epoch
-      model.eval()  # Eval analogies.
-    # Perform a final save.
-    model.saver.save(session, os.path.join(opts.save_path, "model.ckpt"),
-                     global_step=model.global_step)
-    if FLAGS.interactive:
-      # E.g.,
-      # [0]: model.analogy(b'france', b'paris', b'russia')
-      # [1]: model.nearby([b'proton', b'elephant', b'maxwell'])
-      _start_shell(locals())
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tutorials/embedding/word2vec_optimized_test.py b/tutorials/embedding/word2vec_optimized_test.py
deleted file mode 100644
index d00a14f991d..00000000000
--- a/tutorials/embedding/word2vec_optimized_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for word2vec_optimized module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import tensorflow as tf
-
-import word2vec_optimized
-
-flags = tf.app.flags
-
-FLAGS = flags.FLAGS
-
-
-class Word2VecTest(tf.test.TestCase):
-
-  def setUp(self):
-    FLAGS.train_data = os.path.join(self.get_temp_dir() + "test-text.txt")
-    FLAGS.eval_data = os.path.join(self.get_temp_dir() + "eval-text.txt")
-    FLAGS.save_path = self.get_temp_dir()
-    with open(FLAGS.train_data, "w") as f:
-      f.write(
-          """alice was beginning to get very tired of sitting by her sister on
-          the bank, and of having nothing to do: once or twice she had peeped
-          into the book her sister was reading, but it had no pictures or
-          conversations in it, 'and what is the use of a book,' thought alice
-          'without pictures or conversations?' So she was considering in her own
-          mind (as well as she could, for the hot day made her feel very sleepy
-          and stupid), whether the pleasure of making a daisy-chain would be
-          worth the trouble of getting up and picking the daisies, when suddenly
-          a White rabbit with pink eyes ran close by her.\n""")
-      with open(FLAGS.eval_data, "w") as f:
-        f.write("alice she rabbit once\n")
-
-  def testWord2VecOptimized(self):
-    FLAGS.batch_size = 5
-    FLAGS.num_neg_samples = 10
-    FLAGS.epochs_to_train = 1
-    FLAGS.min_count = 0
-    word2vec_optimized.main([])
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tutorials/embedding/word2vec_test.py b/tutorials/embedding/word2vec_test.py
deleted file mode 100644
index b5068d85b47..00000000000
--- a/tutorials/embedding/word2vec_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for word2vec module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import tensorflow as tf
-
-import word2vec
-
-flags = tf.app.flags
-
-FLAGS = flags.FLAGS
-
-
-class Word2VecTest(tf.test.TestCase):
-
-  def setUp(self):
-    FLAGS.train_data = os.path.join(self.get_temp_dir(), "test-text.txt")
-    FLAGS.eval_data = os.path.join(self.get_temp_dir(), "eval-text.txt")
-    FLAGS.save_path = self.get_temp_dir()
-    with open(FLAGS.train_data, "w") as f:
-      f.write(
-          """alice was beginning to get very tired of sitting by her sister on
-          the bank, and of having nothing to do: once or twice she had peeped
-          into the book her sister was reading, but it had no pictures or
-          conversations in it, 'and what is the use of a book,' thought alice
-          'without pictures or conversations?' So she was considering in her own
-          mind (as well as she could, for the hot day made her feel very sleepy
-          and stupid), whether the pleasure of making a daisy-chain would be
-          worth the trouble of getting up and picking the daisies, when suddenly
-          a White rabbit with pink eyes ran close by her.\n""")
-      with open(FLAGS.eval_data, "w") as f:
-        f.write("alice she rabbit once\n")
-
-  def testWord2Vec(self):
-    FLAGS.batch_size = 5
-    FLAGS.num_neg_samples = 10
-    FLAGS.epochs_to_train = 1
-    FLAGS.min_count = 0
-    word2vec.main([])
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tutorials/image/__init__.py b/tutorials/image/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tutorials/image/alexnet/BUILD b/tutorials/image/alexnet/BUILD
deleted file mode 100644
index bbe29da6f5c..00000000000
--- a/tutorials/image/alexnet/BUILD
+++ /dev/null
@@ -1,29 +0,0 @@
-# Description:
-# Benchmark for AlexNet.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "alexnet_benchmark",
-    srcs = [
-        "alexnet_benchmark.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/image/alexnet/__init__.py b/tutorials/image/alexnet/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tutorials/image/alexnet/alexnet_benchmark.py b/tutorials/image/alexnet/alexnet_benchmark.py
deleted file mode 100644
index 39fcb109f0a..00000000000
--- a/tutorials/image/alexnet/alexnet_benchmark.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Timing benchmark for AlexNet inference.
-
-To run, use:
-  bazel run -c opt --config=cuda \
-      models/tutorials/image/alexnet:alexnet_benchmark
-
-Across 100 steps on batch size = 128.
-
-Forward pass:
-Run on Tesla K40c: 145 +/- 1.5 ms / batch
-Run on Titan X:     70 +/- 0.1 ms / batch
-
-Forward-backward pass:
-Run on Tesla K40c: 480 +/- 48 ms / batch
-Run on Titan X:    244 +/- 30 ms / batch
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-from datetime import datetime
-import math
-import sys
-import time
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-FLAGS = None
-
-
-def print_activations(t):
-  print(t.op.name, ' ', t.get_shape().as_list())
-
-
-def inference(images):
-  """Build the AlexNet model.
-
-  Args:
-    images: Images Tensor
-
-  Returns:
-    pool5: the last Tensor in the convolutional component of AlexNet.
-    parameters: a list of Tensors corresponding to the weights and biases of the
-        AlexNet model.
-  """
-  parameters = []
-  # conv1
-  with tf.name_scope('conv1') as scope:
-    kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=tf.float32,
-                                             stddev=1e-1), name='weights')
-    conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME')
-    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
-                         trainable=True, name='biases')
-    bias = tf.nn.bias_add(conv, biases)
-    conv1 = tf.nn.relu(bias, name=scope)
-    print_activations(conv1)
-    parameters += [kernel, biases]
-
-  # lrn1
-  with tf.name_scope('lrn1') as scope:
-    lrn1 = tf.nn.local_response_normalization(conv1,
-                                              alpha=1e-4,
-                                              beta=0.75,
-                                              depth_radius=2,
-                                              bias=2.0)
-
-  # pool1
-  pool1 = tf.nn.max_pool(lrn1,
-                         ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1],
-                         padding='VALID',
-                         name='pool1')
-  print_activations(pool1)
-
-  # conv2
-  with tf.name_scope('conv2') as scope:
-    kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=tf.float32,
-                                             stddev=1e-1), name='weights')
-    conv = tf.nn.conv2d(pool1, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=tf.float32),
-                         trainable=True, name='biases')
-    bias = tf.nn.bias_add(conv, biases)
-    conv2 = tf.nn.relu(bias, name=scope)
-    parameters += [kernel, biases]
-  print_activations(conv2)
-
-  # lrn2
-  with tf.name_scope('lrn2') as scope:
-    lrn2 = tf.nn.local_response_normalization(conv2,
-                                              alpha=1e-4,
-                                              beta=0.75,
-                                              depth_radius=2,
-                                              bias=2.0)
-
-  # pool2
-  pool2 = tf.nn.max_pool(lrn2,
-                         ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1],
-                         padding='VALID',
-                         name='pool2')
-  print_activations(pool2)
-
-  # conv3
-  with tf.name_scope('conv3') as scope:
-    kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384],
-                                             dtype=tf.float32,
-                                             stddev=1e-1), name='weights')
-    conv = tf.nn.conv2d(pool2, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=tf.float32),
-                         trainable=True, name='biases')
-    bias = tf.nn.bias_add(conv, biases)
-    conv3 = tf.nn.relu(bias, name=scope)
-    parameters += [kernel, biases]
-    print_activations(conv3)
-
-  # conv4
-  with tf.name_scope('conv4') as scope:
-    kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256],
-                                             dtype=tf.float32,
-                                             stddev=1e-1), name='weights')
-    conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32),
-                         trainable=True, name='biases')
-    bias = tf.nn.bias_add(conv, biases)
-    conv4 = tf.nn.relu(bias, name=scope)
-    parameters += [kernel, biases]
-    print_activations(conv4)
-
-  # conv5
-  with tf.name_scope('conv5') as scope:
-    kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256],
-                                             dtype=tf.float32,
-                                             stddev=1e-1), name='weights')
-    conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32),
-                         trainable=True, name='biases')
-    bias = tf.nn.bias_add(conv, biases)
-    conv5 = tf.nn.relu(bias, name=scope)
-    parameters += [kernel, biases]
-    print_activations(conv5)
-
-  # pool5
-  pool5 = tf.nn.max_pool(conv5,
-                         ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1],
-                         padding='VALID',
-                         name='pool5')
-  print_activations(pool5)
-
-  return pool5, parameters
-
-
-def time_tensorflow_run(session, target, info_string):
-  """Run the computation to obtain the target tensor and print timing stats.
-
-  Args:
-    session: the TensorFlow session to run the computation under.
-    target: the target Tensor that is passed to the session's run() function.
-    info_string: a string summarizing this run, to be printed with the stats.
-
-  Returns:
-    None
-  """
-  num_steps_burn_in = 10
-  total_duration = 0.0
-  total_duration_squared = 0.0
-  for i in xrange(FLAGS.num_batches + num_steps_burn_in):
-    start_time = time.time()
-    _ = session.run(target)
-    duration = time.time() - start_time
-    if i >= num_steps_burn_in:
-      if not i % 10:
-        print ('%s: step %d, duration = %.3f' %
-               (datetime.now(), i - num_steps_burn_in, duration))
-      total_duration += duration
-      total_duration_squared += duration * duration
-  mn = total_duration / FLAGS.num_batches
-  vr = total_duration_squared / FLAGS.num_batches - mn * mn
-  sd = math.sqrt(vr)
-  print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
-         (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
-
-
-
-def run_benchmark():
-  """Run the benchmark on AlexNet."""
-  with tf.Graph().as_default():
-    # Generate some dummy images.
-    image_size = 224
-    # Note that our padding definition is slightly different the cuda-convnet.
-    # In order to force the model to start with the same activations sizes,
-    # we add 3 to the image_size and employ VALID padding above.
-    images = tf.Variable(tf.random_normal([FLAGS.batch_size,
-                                           image_size,
-                                           image_size, 3],
-                                          dtype=tf.float32,
-                                          stddev=1e-1))
-
-    # Build a Graph that computes the logits predictions from the
-    # inference model.
-    pool5, parameters = inference(images)
-
-    # Build an initialization operation.
-    init = tf.global_variables_initializer()
-
-    # Start running operations on the Graph.
-    config = tf.ConfigProto()
-    config.gpu_options.allocator_type = 'BFC'
-    sess = tf.Session(config=config)
-    sess.run(init)
-
-    # Run the forward benchmark.
-    time_tensorflow_run(sess, pool5, "Forward")
-
-    # Add a simple objective so we can calculate the backward pass.
-    objective = tf.nn.l2_loss(pool5)
-    # Compute the gradient with respect to all the parameters.
-    grad = tf.gradients(objective, parameters)
-    # Run the backward benchmark.
-    time_tensorflow_run(sess, grad, "Forward-backward")
-
-
-def main(_):
-  run_benchmark()
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=128,
-      help='Batch size.'
-  )
-  parser.add_argument(
-      '--num_batches',
-      type=int,
-      default=100,
-      help='Number of batches to run.'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tutorials/image/cifar10/BUILD b/tutorials/image/cifar10/BUILD
deleted file mode 100644
index 9cf574f605e..00000000000
--- a/tutorials/image/cifar10/BUILD
+++ /dev/null
@@ -1,87 +0,0 @@
-# Description:
-# Example TensorFlow models for CIFAR-10
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "cifar10_input",
-    srcs = ["cifar10_input.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_test(
-    name = "cifar10_input_test",
-    size = "small",
-    srcs = ["cifar10_input_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cifar10_input",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-py_library(
-    name = "cifar10",
-    srcs = ["cifar10.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cifar10_input",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_binary(
-    name = "cifar10_eval",
-    srcs = [
-        "cifar10_eval.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        ":cifar10",
-    ],
-)
-
-py_binary(
-    name = "cifar10_train",
-    srcs = [
-        "cifar10_train.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        ":cifar10",
-    ],
-)
-
-py_binary(
-    name = "cifar10_multi_gpu_train",
-    srcs = [
-        "cifar10_multi_gpu_train.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        ":cifar10",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/image/cifar10/README.md b/tutorials/image/cifar10/README.md
deleted file mode 100644
index 69b6d08e431..00000000000
--- a/tutorials/image/cifar10/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-**NOTE: For users interested in multi-GPU, we recommend looking at the newer [cifar10_estimator](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator) example instead.**
-
----
-
-CIFAR-10 is a common benchmark in machine learning for image recognition.
-
-http://www.cs.toronto.edu/~kriz/cifar.html
-
-Code in this directory demonstrates how to use TensorFlow to train and evaluate a convolutional neural network (CNN) on both CPU and GPU. We also demonstrate how to train a CNN over multiple GPUs.
-
-Detailed instructions on how to get started available at:
-
-https://www.tensorflow.org/tutorials/images/deep_cnn
diff --git a/tutorials/image/cifar10/__init__.py b/tutorials/image/cifar10/__init__.py
deleted file mode 100644
index 6b2729e7e0b..00000000000
--- a/tutorials/image/cifar10/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Makes helper libraries available in the cifar10 package."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import cifar10
-import cifar10_input
diff --git a/tutorials/image/cifar10/cifar10.py b/tutorials/image/cifar10/cifar10.py
deleted file mode 100644
index c725a890b82..00000000000
--- a/tutorials/image/cifar10/cifar10.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Builds the CIFAR-10 network.
-
-Summary of available functions:
-
- # Compute input images and labels for training. If you would like to run
- # evaluations, use inputs() instead.
- inputs, labels = distorted_inputs()
-
- # Compute inference on the model inputs to make a prediction.
- predictions = inference(inputs)
-
- # Compute the total loss of the prediction with respect to the labels.
- loss = loss(predictions, labels)
-
- # Create a graph to run one step of training with respect to the loss.
- train_op = train(loss, global_step)
-"""
-# pylint: disable=missing-docstring
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-
-import tensorflow as tf
-
-import cifar10_input
-
-FLAGS = tf.app.flags.FLAGS
-
-# Basic model parameters.
-tf.app.flags.DEFINE_integer('batch_size', 128,
-                            """Number of images to process in a batch.""")
-tf.app.flags.DEFINE_boolean('use_fp16', True,
-                            """Train the model using fp16.""")
-
-# Global constants describing the CIFAR-10 data set.
-IMAGE_SIZE = cifar10_input.IMAGE_SIZE
-NUM_CLASSES = cifar10_input.NUM_CLASSES
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
-NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
-
-
-# Constants describing the training process.
-MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
-NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
-LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
-INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
-
-# If a model is trained with multiple GPUs, prefix all Op names with tower_name
-# to differentiate the operations. Note that this prefix is removed from the
-# names of the summaries when visualizing a model.
-TOWER_NAME = 'tower'
-
-
-def _activation_summary(x):
-  """Helper to create summaries for activations.
-
-  Creates a summary that provides a histogram of activations.
-  Creates a summary that measures the sparsity of activations.
-
-  Args:
-    x: Tensor
-  Returns:
-    nothing
-  """
-  # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
-  # session. This helps the clarity of presentation on tensorboard.
-  tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
-  tf.summary.histogram(tensor_name + '/activations', x)
-  tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
-
-
-def _variable_on_cpu(name, shape, initializer):
-  """Helper to create a Variable stored on CPU memory.
-
-  Args:
-    name: name of the variable
-    shape: list of ints
-    initializer: initializer for Variable
-
-  Returns:
-    Variable Tensor
-  """
-  with tf.device('/cpu:0'):
-    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
-    var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
-  return var
-
-
-def _variable_with_weight_decay(name, shape, stddev, wd):
-  """Helper to create an initialized Variable with weight decay.
-
-  Note that the Variable is initialized with a truncated normal distribution.
-  A weight decay is added only if one is specified.
-
-  Args:
-    name: name of the variable
-    shape: list of ints
-    stddev: standard deviation of a truncated Gaussian
-    wd: add L2Loss weight decay multiplied by this float. If None, weight
-        decay is not added for this Variable.
-
-  Returns:
-    Variable Tensor
-  """
-  dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
-  var = _variable_on_cpu(
-      name,
-      shape,
-      tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
-  if wd is not None:
-    weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
-    tf.add_to_collection('losses', weight_decay)
-  return var
-
-
-def distorted_inputs():
-  """Construct distorted input for CIFAR training using the Reader ops.
-
-  Returns:
-    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
-    labels: Labels. 1D tensor of [batch_size] size.
-  """
-  images, labels = cifar10_input.distorted_inputs(batch_size=FLAGS.batch_size)
-  if FLAGS.use_fp16:
-    images = tf.cast(images, tf.float16)
-    labels = tf.cast(labels, tf.float16)
-  return images, labels
-
-
-def inputs(eval_data):
-  """Construct input for CIFAR evaluation using the Reader ops.
-  Args:
-    eval_data: bool, indicating if one should use the train or eval data set.
-
-  Returns:
-    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
-    labels: Labels. 1D tensor of [batch_size] size.
-  """
-  images, labels = cifar10_input.inputs(eval_data=eval_data, batch_size=FLAGS.batch_size)
-  if FLAGS.use_fp16:
-    images = tf.cast(images, tf.float16)
-    labels = tf.cast(labels, tf.float16)
-  return images, labels
-
-
-def inference(images):
-  """Build the CIFAR-10 model.
-
-  Args:
-    images: Images returned from distorted_inputs() or inputs().
-
-  Returns:
-    Logits.
-  """
-  # We instantiate all variables using tf.get_variable() instead of
-  # tf.Variable() in order to share variables across multiple GPU training runs.
-  # If we only ran this model on a single GPU, we could simplify this function
-  # by replacing all instances of tf.get_variable() with tf.Variable().
-  #
-  # conv1
-  with tf.variable_scope('conv1') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 3, 64],
-                                         stddev=5e-2,
-                                         wd=None)
-    conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
-    pre_activation = tf.nn.bias_add(conv, biases)
-    conv1 = tf.nn.relu(pre_activation, name=scope.name)
-    _activation_summary(conv1)
-
-  # pool1
-  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
-                         padding='SAME', name='pool1')
-  # norm1
-  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm1')
-
-  # conv2
-  with tf.variable_scope('conv2') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 64, 64],
-                                         stddev=5e-2,
-                                         wd=None)
-    conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
-    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
-    pre_activation = tf.nn.bias_add(conv, biases)
-    conv2 = tf.nn.relu(pre_activation, name=scope.name)
-    _activation_summary(conv2)
-
-  # norm2
-  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm2')
-  # pool2
-  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')
-
-  # local3
-  with tf.variable_scope('local3') as scope:
-    # Move everything into depth so we can perform a single matrix multiply.
-    reshape = tf.keras.layers.Flatten()(pool2)
-    dim = reshape.get_shape()[1].value
-    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
-                                          stddev=0.04, wd=0.004)
-    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
-    local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
-    _activation_summary(local3)
-
-  # local4
-  with tf.variable_scope('local4') as scope:
-    weights = _variable_with_weight_decay('weights', shape=[384, 192],
-                                          stddev=0.04, wd=0.004)
-    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
-    local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
-    _activation_summary(local4)
-
-  # linear layer(WX + b),
-  # We don't apply softmax here because
-  # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
-  # and performs the softmax internally for efficiency.
-  with tf.variable_scope('softmax_linear') as scope:
-    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
-                                          stddev=1/192.0, wd=None)
-    biases = _variable_on_cpu('biases', [NUM_CLASSES],
-                              tf.constant_initializer(0.0))
-    softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
-    _activation_summary(softmax_linear)
-
-  return softmax_linear
-
-
-def loss(logits, labels):
-  """Add L2Loss to all the trainable variables.
-
-  Add summary for "Loss" and "Loss/avg".
-  Args:
-    logits: Logits from inference().
-    labels: Labels from distorted_inputs or inputs(). 1-D tensor
-            of shape [batch_size]
-
-  Returns:
-    Loss tensor of type float.
-  """
-  # Calculate the average cross entropy loss across the batch.
-  labels = tf.cast(labels, tf.int64)
-  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits, name='cross_entropy_per_example')
-  cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
-  tf.add_to_collection('losses', cross_entropy_mean)
-
-  # The total loss is defined as the cross entropy loss plus all of the weight
-  # decay terms (L2 loss).
-  return tf.add_n(tf.get_collection('losses'), name='total_loss')
-
-
-def _add_loss_summaries(total_loss):
-  """Add summaries for losses in CIFAR-10 model.
-
-  Generates moving average for all losses and associated summaries for
-  visualizing the performance of the network.
-
-  Args:
-    total_loss: Total loss from loss().
-  Returns:
-    loss_averages_op: op for generating moving averages of losses.
-  """
-  # Compute the moving average of all individual losses and the total loss.
-  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
-  losses = tf.get_collection('losses')
-  loss_averages_op = loss_averages.apply(losses + [total_loss])
-
-  # Attach a scalar summary to all individual losses and the total loss; do the
-  # same for the averaged version of the losses.
-  for l in losses + [total_loss]:
-    # Name each loss as '(raw)' and name the moving average version of the loss
-    # as the original loss name.
-    tf.summary.scalar(l.op.name + ' (raw)', l)
-    tf.summary.scalar(l.op.name, loss_averages.average(l))
-
-  return loss_averages_op
-
-
-def train(total_loss, global_step):
-  """Train CIFAR-10 model.
-
-  Create an optimizer and apply to all trainable variables. Add moving
-  average for all trainable variables.
-
-  Args:
-    total_loss: Total loss from loss().
-    global_step: Integer Variable counting the number of training steps
-      processed.
-  Returns:
-    train_op: op for training.
-  """
-  # Variables that affect learning rate.
-  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
-  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
-
-  # Decay the learning rate exponentially based on the number of steps.
-  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
-                                  global_step,
-                                  decay_steps,
-                                  LEARNING_RATE_DECAY_FACTOR,
-                                  staircase=True)
-  tf.summary.scalar('learning_rate', lr)
-
-  # Generate moving averages of all losses and associated summaries.
-  loss_averages_op = _add_loss_summaries(total_loss)
-
-  # Compute gradients.
-  with tf.control_dependencies([loss_averages_op]):
-    opt = tf.train.GradientDescentOptimizer(lr)
-    grads = opt.compute_gradients(total_loss)
-
-  # Apply gradients.
-  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
-
-  # Add histograms for trainable variables.
-  for var in tf.trainable_variables():
-    tf.summary.histogram(var.op.name, var)
-
-  # Add histograms for gradients.
-  for grad, var in grads:
-    if grad is not None:
-      tf.summary.histogram(var.op.name + '/gradients', grad)
-
-  # Track the moving averages of all trainable variables.
-  variable_averages = tf.train.ExponentialMovingAverage(
-      MOVING_AVERAGE_DECAY, global_step)
-  with tf.control_dependencies([apply_gradient_op]):
-    variables_averages_op = variable_averages.apply(tf.trainable_variables())
-
-  return variables_averages_op
diff --git a/tutorials/image/cifar10/cifar10_eval.py b/tutorials/image/cifar10/cifar10_eval.py
deleted file mode 100644
index fab39d1caf8..00000000000
--- a/tutorials/image/cifar10/cifar10_eval.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Evaluation for CIFAR-10.
-
-Accuracy:
-cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs
-of data) as judged by cifar10_eval.py.
-
-Speed:
-On a single Tesla K40, cifar10_train.py processes a single batch of 128 images
-in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86%
-accuracy after 100K steps in 8 hours of training time.
-
-Usage:
-Please see the tutorial and website for how to download the CIFAR-10
-data set, compile the program and train the model.
-
-http://tensorflow.org/tutorials/deep_cnn/
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from datetime import datetime
-import math
-import time
-
-import numpy as np
-import tensorflow as tf
-
-import cifar10
-
-FLAGS = tf.app.flags.FLAGS
-
-tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
-                           """Directory where to write event logs.""")
-tf.app.flags.DEFINE_string('eval_data', 'test',
-                           """Either 'test' or 'train_eval'.""")
-tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
-                           """Directory where to read model checkpoints.""")
-tf.app.flags.DEFINE_integer('eval_interval_secs', 5,
-                            """How often to run the eval.""")
-tf.app.flags.DEFINE_integer('num_examples', 1000,
-                            """Number of examples to run.""")
-tf.app.flags.DEFINE_boolean('run_once', False,
-                            """Whether to run eval only once.""")
-
-
-def eval_once(saver, summary_writer, top_k_op, summary_op):
-  """Run Eval once.
-
-  Args:
-    saver: Saver.
-    summary_writer: Summary writer.
-    top_k_op: Top K op.
-    summary_op: Summary op.
-  """
-  with tf.Session() as sess:
-    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
-    if ckpt and ckpt.model_checkpoint_path:
-      # Restores from checkpoint
-      saver.restore(sess, ckpt.model_checkpoint_path)
-      # Assuming model_checkpoint_path looks something like:
-      #   /my-favorite-path/cifar10_train/model.ckpt-0,
-      # extract global_step from it.
-      global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
-    else:
-      print('No checkpoint file found')
-      return
-
-    # Start the queue runners.
-    coord = tf.train.Coordinator()
-    try:
-      threads = []
-      for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
-        threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
-                                         start=True))
-
-      num_iter = int(math.ceil(float(FLAGS.num_examples) / FLAGS.batch_size))
-      true_count = 0  # Counts the number of correct predictions.
-      total_sample_count = num_iter * FLAGS.batch_size
-      step = 0
-      while step < num_iter and not coord.should_stop():
-        predictions = sess.run([top_k_op])
-        true_count += np.sum(predictions)
-        step += 1
-
-      # Compute precision @ 1.
-      precision = true_count / total_sample_count
-      print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))
-
-      summary = tf.Summary()
-      summary.ParseFromString(sess.run(summary_op))
-      summary.value.add(tag='Precision @ 1', simple_value=precision)
-      summary_writer.add_summary(summary, global_step)
-    except Exception as e:  # pylint: disable=broad-except
-      coord.request_stop(e)
-
-    coord.request_stop()
-    coord.join(threads, stop_grace_period_secs=10)
-
-
-def evaluate():
-  """Eval CIFAR-10 for a number of steps."""
-  with tf.Graph().as_default() as g:
-    # Get images and labels for CIFAR-10.
-    images, labels = cifar10.inputs(eval_data=FLAGS.eval_data)
-
-    # Build a Graph that computes the logits predictions from the
-    # inference model.
-    logits = cifar10.inference(images)
-
-    logits = tf.cast(logits, "float32")
-    labels = tf.cast(labels, "int32")
-
-    # Calculate predictions.
-    top_k_op = tf.nn.in_top_k(logits, labels, 1)
-
-    # Restore the moving average version of the learned variables for eval.
-    variable_averages = tf.train.ExponentialMovingAverage(
-        cifar10.MOVING_AVERAGE_DECAY)
-    variables_to_restore = variable_averages.variables_to_restore()
-    saver = tf.train.Saver(variables_to_restore)
-
-    # Build the summary operation based on the TF collection of Summaries.
-    summary_op = tf.summary.merge_all()
-
-    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)
-
-    while True:
-      eval_once(saver, summary_writer, top_k_op, summary_op)
-      if FLAGS.run_once:
-        break
-      time.sleep(FLAGS.eval_interval_secs)
-
-
-def main(argv=None):  # pylint: disable=unused-argument
-  if tf.gfile.Exists(FLAGS.eval_dir):
-    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
-  tf.gfile.MakeDirs(FLAGS.eval_dir)
-  evaluate()
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tutorials/image/cifar10/cifar10_input.py b/tutorials/image/cifar10/cifar10_input.py
deleted file mode 100644
index 82d460e2133..00000000000
--- a/tutorials/image/cifar10/cifar10_input.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Routine for decoding the CIFAR-10 binary file format."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import tensorflow_datasets as tfds
-
-# Process images of this size. Note that this differs from the original CIFAR
-# image size of 32 x 32. If one alters this number, then the entire model
-# architecture will change and any model would need to be retrained.
-IMAGE_SIZE = 24
-
-# Global constants describing the CIFAR-10 data set.
-NUM_CLASSES = 10
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
-NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000
-
-
-def _get_images_labels(batch_size, split, distords=False):
-  """Returns Dataset for given split."""
-  dataset = tfds.load(name='cifar10', split=split)
-  scope = 'data_augmentation' if distords else 'input'
-  with tf.name_scope(scope):
-    dataset = dataset.map(DataPreprocessor(distords), num_parallel_calls=10)
-  # Dataset is small enough to be fully loaded on memory:
-  dataset = dataset.prefetch(-1)
-  dataset = dataset.repeat().batch(batch_size)
-  iterator = dataset.make_one_shot_iterator()
-  images_labels = iterator.get_next()
-  images, labels = images_labels['input'], images_labels['target']
-  tf.summary.image('images', images)
-  return images, labels
-
-
-class DataPreprocessor(object):
-  """Applies transformations to dataset record."""
-
-  def __init__(self, distords):
-    self._distords = distords
-
-  def __call__(self, record):
-    """Process img for training or eval."""
-    img = record['image']
-    img = tf.cast(img, tf.float32)
-    if self._distords:  # training
-      # Randomly crop a [height, width] section of the image.
-      img = tf.random_crop(img, [IMAGE_SIZE, IMAGE_SIZE, 3])
-      # Randomly flip the image horizontally.
-      img = tf.image.random_flip_left_right(img)
-      # Because these operations are not commutative, consider randomizing
-      # the order their operation.
-      # NOTE: since per_image_standardization zeros the mean and makes
-      # the stddev unit, this likely has no effect see tensorflow#1458.
-      img = tf.image.random_brightness(img, max_delta=63)
-      img = tf.image.random_contrast(img, lower=0.2, upper=1.8)
-    else:  # Image processing for evaluation.
-      # Crop the central [height, width] of the image.
-      img = tf.image.resize_image_with_crop_or_pad(img, IMAGE_SIZE, IMAGE_SIZE)
-    # Subtract off the mean and divide by the variance of the pixels.
-    img = tf.image.per_image_standardization(img)
-    return dict(input=img, target=record['label'])
-
-
-def distorted_inputs(batch_size):
-  """Construct distorted input for CIFAR training using the Reader ops.
-
-  Args:
-    batch_size: Number of images per batch.
-
-  Returns:
-    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
-    labels: Labels. 1D tensor of [batch_size] size.
-  """
-  return _get_images_labels(batch_size, tfds.Split.TRAIN, distords=True)
-
-
-def inputs(eval_data, batch_size):
-  """Construct input for CIFAR evaluation using the Reader ops.
-
-  Args:
-    eval_data: bool, indicating if one should use the train or eval data set.
-    batch_size: Number of images per batch.
-
-  Returns:
-    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
-    labels: Labels. 1D tensor of [batch_size] size.
-  """
-  split = tfds.Split.TEST if eval_data == 'test' else tfds.Split.TRAIN
-  return _get_images_labels(batch_size, split)
diff --git a/tutorials/image/cifar10/cifar10_input_test.py b/tutorials/image/cifar10/cifar10_input_test.py
deleted file mode 100644
index dbae1cab411..00000000000
--- a/tutorials/image/cifar10/cifar10_input_test.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for cifar10 input."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import tensorflow as tf
-
-import cifar10_input
-
-
-class CIFAR10InputTest(tf.test.TestCase):
-
-  def _record(self, label, red, green, blue):
-    image_size = 32 * 32
-    record = bytes(bytearray([label] + [red] * image_size +
-                             [green] * image_size + [blue] * image_size))
-    expected = [[[red, green, blue]] * 32] * 32
-    return record, expected
-
-  def testSimple(self):
-    labels = [9, 3, 0]
-    records = [self._record(labels[0], 0, 128, 255),
-               self._record(labels[1], 255, 0, 1),
-               self._record(labels[2], 254, 255, 0)]
-    contents = b"".join([record for record, _ in records])
-    expected = [expected for _, expected in records]
-    filename = os.path.join(self.get_temp_dir(), "cifar")
-    open(filename, "wb").write(contents)
-
-    with self.test_session() as sess:
-      q = tf.FIFOQueue(99, [tf.string], shapes=())
-      q.enqueue([filename]).run()
-      q.close().run()
-      result = cifar10_input.read_cifar10(q)
-
-      for i in range(3):
-        key, label, uint8image = sess.run([
-            result.key, result.label, result.uint8image])
-        self.assertEqual("%s:%d" % (filename, i), tf.compat.as_text(key))
-        self.assertEqual(labels[i], label)
-        self.assertAllEqual(expected[i], uint8image)
-
-      with self.assertRaises(tf.errors.OutOfRangeError):
-        sess.run([result.key, result.uint8image])
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tutorials/image/cifar10/cifar10_multi_gpu_train.py b/tutorials/image/cifar10/cifar10_multi_gpu_train.py
deleted file mode 100644
index 8cb8a096f84..00000000000
--- a/tutorials/image/cifar10/cifar10_multi_gpu_train.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A binary to train CIFAR-10 using multiple GPUs with synchronous updates.
-
-Accuracy:
-cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256
-epochs of data) as judged by cifar10_eval.py.
-
-Speed: With batch_size 128.
-
-System        | Step Time (sec/batch)  |     Accuracy
---------------------------------------------------------------------
-1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
-1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
-2 Tesla K20m  | 0.13-0.20              | ~84% at 30K steps  (2.5 hours)
-3 Tesla K20m  | 0.13-0.18              | ~84% at 30K steps
-4 Tesla K20m  | ~0.10                  | ~84% at 30K steps
-
-Usage:
-Please see the tutorial and website for how to download the CIFAR-10
-data set, compile the program and train the model.
-
-http://tensorflow.org/tutorials/deep_cnn/
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-import re
-import time
-from datetime import datetime
-
-import numpy as np
-import tensorflow as tf
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-import cifar10
-
-FLAGS = tf.app.flags.FLAGS
-
-tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
-                           """Directory where to write event logs """
-                           """and checkpoint.""")
-tf.app.flags.DEFINE_integer('max_steps', 1000000,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('num_gpus', 1,
-                            """How many GPUs to use.""")
-tf.app.flags.DEFINE_boolean('log_device_placement', False,
-                            """Whether to log device placement.""")
-
-
-def tower_loss(scope, images, labels):
-  """Calculate the total loss on a single tower running the CIFAR model.
-
-  Args:
-    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
-    images: Images. 4D tensor of shape [batch_size, height, width, 3].
-    labels: Labels. 1D tensor of shape [batch_size].
-
-  Returns:
-     Tensor of shape [] containing the total loss for a batch of data
-  """
-
-  # Build inference Graph.
-  logits = cifar10.inference(images)
-
-  # Build the portion of the Graph calculating the losses. Note that we will
-  # assemble the total_loss using a custom function below.
-  _ = cifar10.loss(logits, labels)
-
-  # Assemble all of the losses for the current tower only.
-  losses = tf.get_collection('losses', scope)
-
-  # Calculate the total loss for the current tower.
-  total_loss = tf.add_n(losses, name='total_loss')
-
-  # Attach a scalar summary to all individual losses and the total loss; do the
-  # same for the averaged version of the losses.
-  for l in losses + [total_loss]:
-    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
-    # session. This helps the clarity of presentation on tensorboard.
-    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
-    tf.summary.scalar(loss_name, l)
-
-  return total_loss
-
-
-def average_gradients(tower_grads):
-  """Calculate the average gradient for each shared variable across all towers.
-
-  Note that this function provides a synchronization point across all towers.
-
-  Args:
-    tower_grads: List of lists of (gradient, variable) tuples. The outer list
-      is over individual gradients. The inner list is over the gradient
-      calculation for each tower.
-  Returns:
-     List of pairs of (gradient, variable) where the gradient has been averaged
-     across all towers.
-  """
-  average_grads = []
-  for grad_and_vars in zip(*tower_grads):
-    # Note that each grad_and_vars looks like the following:
-    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
-    grads = []
-    for g, _ in grad_and_vars:
-      # Add 0 dimension to the gradients to represent the tower.
-      expanded_g = tf.expand_dims(g, 0)
-
-      # Append on a 'tower' dimension which we will average over below.
-      grads.append(expanded_g)
-
-    # Average over the 'tower' dimension.
-    grad = tf.concat(axis=0, values=grads)
-    grad = tf.reduce_mean(grad, 0)
-
-    # Keep in mind that the Variables are redundant because they are shared
-    # across towers. So .. we will just return the first tower's pointer to
-    # the Variable.
-    v = grad_and_vars[0][1]
-    grad_and_var = (grad, v)
-    average_grads.append(grad_and_var)
-  return average_grads
-
-
-def train():
-  """Train CIFAR-10 for a number of steps."""
-  with tf.Graph().as_default(), tf.device('/cpu:0'):
-    # Create a variable to count the number of train() calls. This equals the
-    # number of batches processed * FLAGS.num_gpus.
-    global_step = tf.get_variable(
-        'global_step', [],
-        initializer=tf.constant_initializer(0), trainable=False)
-
-    # Calculate the learning rate schedule.
-    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
-                             FLAGS.batch_size / FLAGS.num_gpus)
-    decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
-
-    # Decay the learning rate exponentially based on the number of steps.
-    lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
-                                    global_step,
-                                    decay_steps,
-                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
-                                    staircase=True)
-
-    # Create an optimizer that performs gradient descent.
-    opt = tf.train.GradientDescentOptimizer(lr)
-
-    # Get images and labels for CIFAR-10.
-    images, labels = cifar10.distorted_inputs()
-    images = tf.reshape(images, [cifar10.FLAGS.batch_size, 24, 24, 3])
-    labels = tf.reshape(labels, [cifar10.FLAGS.batch_size])
-    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
-          [images, labels], capacity=2 * FLAGS.num_gpus)
-    # Calculate the gradients for each model tower.
-    tower_grads = []
-    with tf.variable_scope(tf.get_variable_scope()):
-      for i in xrange(FLAGS.num_gpus):
-        with tf.device('/gpu:%d' % i):
-          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
-            # Dequeues one batch for the GPU
-            image_batch, label_batch = batch_queue.dequeue()
-            # Calculate the loss for one tower of the CIFAR model. This function
-            # constructs the entire CIFAR model but shares the variables across
-            # all towers.
-            loss = tower_loss(scope, image_batch, label_batch)
-
-            # Reuse variables for the next tower.
-            tf.get_variable_scope().reuse_variables()
-
-            # Retain the summaries from the final tower.
-            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
-
-            # Calculate the gradients for the batch of data on this CIFAR tower.
-            grads = opt.compute_gradients(loss)
-
-            # Keep track of the gradients across all towers.
-            tower_grads.append(grads)
-
-    # We must calculate the mean of each gradient. Note that this is the
-    # synchronization point across all towers.
-    grads = average_gradients(tower_grads)
-
-    # Add a summary to track the learning rate.
-    summaries.append(tf.summary.scalar('learning_rate', lr))
-
-    # Add histograms for gradients.
-    for grad, var in grads:
-      if grad is not None:
-        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))
-
-    # Apply the gradients to adjust the shared variables.
-    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
-
-    # Add histograms for trainable variables.
-    for var in tf.trainable_variables():
-      summaries.append(tf.summary.histogram(var.op.name, var))
-
-    # Track the moving averages of all trainable variables.
-    variable_averages = tf.train.ExponentialMovingAverage(
-        cifar10.MOVING_AVERAGE_DECAY, global_step)
-    variables_averages_op = variable_averages.apply(tf.trainable_variables())
-
-    # Group all updates to into a single train op.
-    train_op = tf.group(apply_gradient_op, variables_averages_op)
-
-    # Create a saver.
-    saver = tf.train.Saver(tf.global_variables())
-
-    # Build the summary operation from the last tower summaries.
-    summary_op = tf.summary.merge(summaries)
-
-    # Build an initialization operation to run below.
-    init = tf.global_variables_initializer()
-
-    # Start running operations on the Graph. allow_soft_placement must be set to
-    # True to build towers on GPU, as some of the ops do not have GPU
-    # implementations.
-    sess = tf.Session(config=tf.ConfigProto(
-        allow_soft_placement=True,
-        log_device_placement=FLAGS.log_device_placement))
-    sess.run(init)
-
-    # Start the queue runners.
-    tf.train.start_queue_runners(sess=sess)
-
-    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
-
-    for step in xrange(FLAGS.max_steps):
-      start_time = time.time()
-      _, loss_value = sess.run([train_op, loss])
-      duration = time.time() - start_time
-
-      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
-
-      if step % 10 == 0:
-        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
-        examples_per_sec = num_examples_per_step / duration
-        sec_per_batch = duration / FLAGS.num_gpus
-
-        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
-                      'sec/batch)')
-        print (format_str % (datetime.now(), step, loss_value,
-                             examples_per_sec, sec_per_batch))
-
-      if step % 100 == 0:
-        summary_str = sess.run(summary_op)
-        summary_writer.add_summary(summary_str, step)
-
-      # Save the model checkpoint periodically.
-      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
-        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
-        saver.save(sess, checkpoint_path, global_step=step)
-
-
-def main(argv=None):  # pylint: disable=unused-argument
-  if tf.gfile.Exists(FLAGS.train_dir):
-    tf.gfile.DeleteRecursively(FLAGS.train_dir)
-  tf.gfile.MakeDirs(FLAGS.train_dir)
-  train()
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tutorials/image/cifar10/cifar10_train.py b/tutorials/image/cifar10/cifar10_train.py
deleted file mode 100644
index 4b4d967bc0e..00000000000
--- a/tutorials/image/cifar10/cifar10_train.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A binary to train CIFAR-10 using a single GPU.
-
-Accuracy:
-cifar10_train.py achieves ~86% accuracy after 100K steps (256 epochs of
-data) as judged by cifar10_eval.py.
-
-Speed: With batch_size 128.
-
-System        | Step Time (sec/batch)  |     Accuracy
-------------------------------------------------------------------
-1 Tesla K20m  | 0.35-0.60              | ~86% at 60K steps  (5 hours)
-1 Tesla K40m  | 0.25-0.35              | ~86% at 100K steps (4 hours)
-
-Usage:
-Please see the tutorial and website for how to download the CIFAR-10
-data set, compile the program and train the model.
-
-http://tensorflow.org/tutorials/deep_cnn/
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from datetime import datetime
-import time
-
-import tensorflow as tf
-
-import cifar10
-
-FLAGS = tf.app.flags.FLAGS
-
-tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
-                           """Directory where to write event logs """
-                           """and checkpoint.""")
-tf.app.flags.DEFINE_integer('max_steps', 100000,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_boolean('log_device_placement', False,
-                            """Whether to log device placement.""")
-tf.app.flags.DEFINE_integer('log_frequency', 10,
-                            """How often to log results to the console.""")
-
-
-def train():
-  """Train CIFAR-10 for a number of steps."""
-  with tf.Graph().as_default():
-    global_step = tf.train.get_or_create_global_step()
-
-    # Get images and labels for CIFAR-10.
-    # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
-    # GPU and resulting in a slow down.
-    with tf.device('/cpu:0'):
-      images, labels = cifar10.distorted_inputs()
-
-    # Build a Graph that computes the logits predictions from the
-    # inference model.
-    logits = cifar10.inference(images)
-
-    # Calculate loss.
-    loss = cifar10.loss(logits, labels)
-
-    # Build a Graph that trains the model with one batch of examples and
-    # updates the model parameters.
-    train_op = cifar10.train(loss, global_step)
-
-    class _LoggerHook(tf.train.SessionRunHook):
-      """Logs loss and runtime."""
-
-      def begin(self):
-        self._step = -1
-        self._start_time = time.time()
-
-      def before_run(self, run_context):
-        self._step += 1
-        return tf.train.SessionRunArgs(loss)  # Asks for loss value.
-
-      def after_run(self, run_context, run_values):
-        if self._step % FLAGS.log_frequency == 0:
-          current_time = time.time()
-          duration = current_time - self._start_time
-          self._start_time = current_time
-
-          loss_value = run_values.results
-          examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
-          sec_per_batch = float(duration / FLAGS.log_frequency)
-
-          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
-                        'sec/batch)')
-          print (format_str % (datetime.now(), self._step, loss_value,
-                               examples_per_sec, sec_per_batch))
-
-    with tf.train.MonitoredTrainingSession(
-        checkpoint_dir=FLAGS.train_dir,
-        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
-               tf.train.NanTensorHook(loss),
-               _LoggerHook()],
-        config=tf.ConfigProto(
-            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
-      while not mon_sess.should_stop():
-        mon_sess.run(train_op)
-
-
-def main(argv=None):  # pylint: disable=unused-argument
-  if tf.gfile.Exists(FLAGS.train_dir):
-    tf.gfile.DeleteRecursively(FLAGS.train_dir)
-  tf.gfile.MakeDirs(FLAGS.train_dir)
-  train()
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tutorials/image/cifar10_estimator/README.md b/tutorials/image/cifar10_estimator/README.md
deleted file mode 100644
index 5627e9b9f59..00000000000
--- a/tutorials/image/cifar10_estimator/README.md
+++ /dev/null
@@ -1,523 +0,0 @@
-CIFAR-10 is a common benchmark in machine learning for image recognition.
-
-http://www.cs.toronto.edu/~kriz/cifar.html
-
-Code in this directory focuses on how to use TensorFlow Estimators to train and 
-evaluate a CIFAR-10 ResNet model on:
-
-* A single host with one CPU;
-* A single host with multiple GPUs;
-* Multiple hosts with CPU or multiple GPUs;
-
-Before trying to run the model we highly encourage you to read all the README.
-
-## Prerequisite
-
-1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.9.0 or
-later.
-
-2. Download the CIFAR-10 dataset and generate TFRecord files using the provided
-script.  The script and associated command below will download the CIFAR-10
-dataset and then generate a TFRecord for the training, validation, and
-evaluation datasets. 
-
-```shell
-python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data
-```
-
-After running the command above, you should see the following files in the
---data-dir (```ls -R cifar-10-data```):
-
-* train.tfrecords
-* validation.tfrecords
-* eval.tfrecords
-
-
-## Training on a single machine with GPUs or CPU
-
-Run the training on CPU only. After training, it runs the evaluation.
-
-```
-python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
-                       --job-dir=/tmp/cifar10 \
-                       --num-gpus=0 \
-                       --train-steps=1000
-```
-
-Run the model on 2 GPUs using CPU as parameter server. After training, it runs
-the evaluation.
-```
-python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
-                       --job-dir=/tmp/cifar10 \
-                       --num-gpus=2 \
-                       --train-steps=1000
-```
-
-Run the model on 2 GPUs using GPU as parameter server.
-It will run an experiment, which for local setting basically means it will run
-stop training
-a couple of times to perform evaluation.
-
-```
-python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
-                       --job-dir=/tmp/cifar10 \
-                       --variable-strategy GPU \
-                       --num-gpus=2 \
-```
-
-There are more command line flags to play with; run
-`python cifar10_main.py --help` for details.
-
-## Run distributed training
-
-### (Optional) Running on Google Cloud Machine Learning Engine
-
-This example can be run on Google Cloud Machine Learning Engine (ML Engine),
-which will configure the environment and take care of running workers,
-parameters servers, and masters in a fault tolerant way.
-
-To install the command line tool, and set up a project and billing, see the
-quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
-
-You'll also need a Google Cloud Storage bucket for the data. If you followed the
-instructions above, you can just run:
-
-```
-MY_BUCKET=gs://<my-bucket-name>
-gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
-```
-
-Then run the following command from the `tutorials/image` directory of this
-repository (the parent directory of this README):
-
-```
-gcloud ml-engine jobs submit training cifarmultigpu \
-    --runtime-version 1.2 \
-    --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
-    --config cifar10_estimator/cmle_config.yaml \
-    --package-path cifar10_estimator/ \
-    --module-name cifar10_estimator.cifar10_main \
-    -- \
-    --data-dir=$MY_BUCKET/cifar-10-data \
-    --num-gpus=4 \
-    --train-steps=1000
-```
-
-
-### Set TF_CONFIG
-
-Considering that you already have multiple hosts configured, all you need is a
-`TF_CONFIG` environment variable on each host. You can set up the hosts manually
-or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
-instructions about how to set up a Cluster.
-
-The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
-their task: `master`, `ps` or `worker`.
-
-Here's an example of `TF_CONFIG`.
-
-```python
-cluster = {'master': ['master-ip:8000'],
-           'ps': ['ps-ip:8000'],
-           'worker': ['worker-ip:8000']}
-
-TF_CONFIG = json.dumps(
-  {'cluster': cluster,
-   'task': {'type': master, 'index': 0},
-   'model_dir': 'gs://<bucket_path>/<dir_path>',
-   'environment': 'cloud'
-  })
-```
-
-*Cluster*
-
-A cluster spec, which is basically a dictionary that describes all of the tasks
-in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
-
-In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker.
-
-* `ps`: saves the parameters among all workers. All workers can
-   read/write/update the parameters for model via ps. As some models are
-   extremely large the parameters are shared among the ps (each ps stores a
-   subset).
-
-* `worker`: does the training.
-
-* `master`: basically a special worker, it does training, but also restores and
-   saves checkpoints and do evaluation.
-
-*Task*
-
-The Task defines what is the role of the current node, for this example the node
-is the master on index 0 on the cluster spec, the task will be different for
-each node. An example of the `TF_CONFIG` for a worker would be:
-
-```python
-cluster = {'master': ['master-ip:8000'],
-           'ps': ['ps-ip:8000'],
-           'worker': ['worker-ip:8000']}
-
-TF_CONFIG = json.dumps(
-  {'cluster': cluster,
-   'task': {'type': worker, 'index': 0},
-   'model_dir': 'gs://<bucket_path>/<dir_path>',
-   'environment': 'cloud'
-  })
-```
-
-*Model_dir*
-
-This is the path where the master will save the checkpoints, graph and
-TensorBoard files. For a multi host environment you may want to use a
-Distributed File System, Google Storage and DFS are supported.
-
-*Environment*
-
-By the default environment is *local*, for a distributed setting we need to
-change it to *cloud*.
-
-### Running script
-
-Once you have a `TF_CONFIG` configured properly on each host you're ready to run
-on distributed settings.
-
-#### Master
-Run this on master:
-Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
-40000 steps. It will run evaluation a couple of times during training. The
-num_workers arugument is used only to update the learning rate correctly. Make
-sure the model_dir is the same as defined on the TF_CONFIG.
-
-```shell
-python cifar10_main.py --data-dir=gs://path/cifar-10-data \
-                       --job-dir=gs://path/model_dir/ \
-                       --num-gpus=4 \
-                       --train-steps=40000 \
-                       --sync \
-                       --num-workers=2
-```
-
-*Output:*
-
-```shell
-INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
-INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd16fb2be10>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
-gpu_options {
-}
-allow_soft_placement: true
-, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
-  per_process_gpu_memory_fraction: 1.0
-}
-, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
-...
-2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: 
-name: Tesla K80
-major: 3 minor: 7 memoryClockRate (GHz) 0.8235
-pciBusID 0000:00:04.0
-Total memory: 11.17GiB
-Free memory: 11.09GiB
-2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: 
-name: Tesla K80
-major: 3 minor: 7 memoryClockRate (GHz) 0.8235
-pciBusID 0000:00:05.0
-Total memory: 11.17GiB
-Free memory: 11.10GiB
-...
-2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
-INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
-INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1
-INFO:tensorflow:Create CheckpointSaverHook.
-INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0
-2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config: 
-intra_op_parallelism_threads: 1
-gpu_options {
-  per_process_gpu_memory_fraction: 1
-}
-allow_soft_placement: true
-
-INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt.
-INFO:tensorflow:loss = 1.20682, step = 1
-INFO:tensorflow:loss = 1.20682, learning_rate = 0.1
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
-INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
-INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
-INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14
-2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0)
-2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0)
-2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0)
-2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0)
-2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0)
-2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0)
-2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0)
-2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0)
-INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023
-INFO:tensorflow:Evaluation [1/100]
-INFO:tensorflow:Evaluation [2/100]
-INFO:tensorflow:Evaluation [3/100]
-INFO:tensorflow:Evaluation [4/100]
-INFO:tensorflow:Evaluation [5/100]
-INFO:tensorflow:Evaluation [6/100]
-INFO:tensorflow:Evaluation [7/100]
-INFO:tensorflow:Evaluation [8/100]
-INFO:tensorflow:Evaluation [9/100]
-INFO:tensorflow:Evaluation [10/100]
-INFO:tensorflow:Evaluation [11/100]
-INFO:tensorflow:Evaluation [12/100]
-INFO:tensorflow:Evaluation [13/100]
-...
-INFO:tensorflow:Evaluation [100/100]
-INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31
-INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425
-```
-
-#### Worker
-
-Run this on worker:
-Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
-40000 steps. It will run evaluation a couple of times during training. Make sure
-the model_dir is the same as defined on the TF_CONFIG.
-
-```shell
-python cifar10_main.py --data-dir=gs://path/cifar-10-data \
-                       --job-dir=gs://path/model_dir/ \
-                       --num-gpus=4 \
-                       --train-steps=40000 \
-                       --sync
-```
-
-*Output:*
-
-```shell
-INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
-INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600,
-'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker',
-'_is_chief': False, '_cluster_spec':
-<tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6918438e10>,
-'_model_dir': 'gs://<path>/model_dir/',
-'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000,
-'_session_config': intra_op_parallelism_threads: 1
-gpu_options {
-}
-allow_soft_placement: true
-, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1,
-'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
-  per_process_gpu_memory_fraction: 1.0
-  }
-...
-2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: 
-name: Tesla K80
-major: 3 minor: 7 memoryClockRate (GHz) 0.8235
-pciBusID 0000:00:04.0
-Total memory: 11.17GiB
-Free memory: 11.09GiB
-2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: 
-name: Tesla K80
-major: 3 minor: 7 memoryClockRate (GHz) 0.8235
-pciBusID 0000:00:05.0
-Total memory: 11.17GiB
-Free memory: 11.10GiB
-...
-2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
-INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
-INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
-INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
-INFO:tensorflow:Create CheckpointSaverHook.
-2017-07-31 22:38:04.629150: I
-tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting
-for response from worker: /job:master/replica:0/task:0
-2017-07-31 22:38:09.263492: I
-tensorflow/core/distributed_runtime/master_session.cc:999] Start master
-session cc58f93b1e259b0c with config: 
-intra_op_parallelism_threads: 1
-gpu_options {
-per_process_gpu_memory_fraction: 1
-}
-allow_soft_placement: true
-INFO:tensorflow:loss = 5.82382, step = 0
-INFO:tensorflow:loss = 5.82382, learning_rate = 0.8
-INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10
-INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20
-INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30
-INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40
-INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50
-INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60
-INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70
-INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec)
-INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec)
-INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80
-INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90
-INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100
-INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110
-INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120
-INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130
-INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140
-INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150
-INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160
-INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170
-INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
-...
-```
-
-#### PS
-
-Run this on ps:
-The ps will not do training so most of the arguments won't affect the execution
-
-```shell
-python cifar10_main.py --job-dir=gs://path/model_dir/
-```
-
-*Output:*
-
-```shell
-INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
-INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48f1addf90>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
-gpu_options {
-}
-allow_soft_placement: true
-, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
-  per_process_gpu_memory_fraction: 1.0
-}
-, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
-2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000}
-2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
-2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000}
-2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
-```
-
-## Visualizing results with TensorBoard
-
-When using Estimators you can also visualize your data in TensorBoard, with no
-changes in your code. You can use TensorBoard to visualize your TensorFlow
-graph, plot quantitative metrics about the execution of your graph, and show
-additional data like images that pass through it.
-
-You'll see something similar to this if you "point" TensorBoard to the
-`job dir` parameter you used to train or evaluate your model.
-
-Check TensorBoard during training or after it. Just point TensorBoard to the
-model_dir you chose on the previous step.
-
-```shell
-tensorboard --log-dir="<job dir>"
-```
-
-## Warnings
-
-When runninng `cifar10_main.py` with `--sync` argument you may see an error
-similar to:
-
-```python
-File "cifar10_main.py", line 538, in <module>
-    tf.app.run()
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
-    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
-File "cifar10_main.py", line 518, in main
-    hooks), run_config=config)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run
-    return _execute_schedule(experiment, schedule)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule
-    return task()
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate
-    hooks=self._eval_hooks)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate
-    hooks=hooks)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate
-    name=name)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model
-    features, labels, model_fn_lib.ModeKeys.EVAL)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
-    features=features, labels=labels, **kwargs)
-File "cifar10_main.py", line 331, in _resnet_model_fn
-    gradvars, global_step=tf.train.get_global_step())
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients
-    variables.global_variables())
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped
-    return _add_should_use_warning(fn(*args, **kwargs))
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning
-    wrapped = TFShouldUseWarningWrapper(x)
-File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__
-    stack = [s.strip() for s in traceback.format_stack()]
-```
-
-This should not affect your training, and should be fixed on the next releases.
diff --git a/tutorials/image/cifar10_estimator/__init__.py b/tutorials/image/cifar10_estimator/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tutorials/image/cifar10_estimator/cifar10.py b/tutorials/image/cifar10_estimator/cifar10.py
deleted file mode 100644
index 5e1a70895ad..00000000000
--- a/tutorials/image/cifar10_estimator/cifar10.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""CIFAR-10 data set.
-
-See http://www.cs.toronto.edu/~kriz/cifar.html.
-"""
-import os
-
-import tensorflow as tf
-
-HEIGHT = 32
-WIDTH = 32
-DEPTH = 3
-
-
-class Cifar10DataSet(object):
-  """Cifar10 data set.
-
-  Described by http://www.cs.toronto.edu/~kriz/cifar.html.
-  """
-
-  def __init__(self, data_dir, subset='train', use_distortion=True):
-    self.data_dir = data_dir
-    self.subset = subset
-    self.use_distortion = use_distortion
-
-  def get_filenames(self):
-    if self.subset in ['train', 'validation', 'eval']:
-      return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
-    else:
-      raise ValueError('Invalid data subset "%s"' % self.subset)
-
-  def parser(self, serialized_example):
-    """Parses a single tf.Example into image and label tensors."""
-    # Dimensions of the images in the CIFAR-10 dataset.
-    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
-    # input format.
-    features = tf.parse_single_example(
-        serialized_example,
-        features={
-            'image': tf.FixedLenFeature([], tf.string),
-            'label': tf.FixedLenFeature([], tf.int64),
-        })
-    image = tf.decode_raw(features['image'], tf.uint8)
-    image.set_shape([DEPTH * HEIGHT * WIDTH])
-
-    # Reshape from [depth * height * width] to [depth, height, width].
-    image = tf.cast(
-        tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
-        tf.float32)
-    label = tf.cast(features['label'], tf.int32)
-
-    # Custom preprocessing.
-    image = self.preprocess(image)
-
-    return image, label
-
-  def make_batch(self, batch_size):
-    """Read the images and labels from 'filenames'."""
-    filenames = self.get_filenames()
-    # Repeat infinitely.
-    dataset = tf.data.TFRecordDataset(filenames).repeat()
-
-    # Parse records.
-    dataset = dataset.map(
-        self.parser, num_parallel_calls=batch_size)
-
-    # Potentially shuffle records.
-    if self.subset == 'train':
-      min_queue_examples = int(
-          Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4)
-      # Ensure that the capacity is sufficiently large to provide good random
-      # shuffling.
-      dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
-
-    # Batch it up.
-    dataset = dataset.batch(batch_size)
-    iterator = dataset.make_one_shot_iterator()
-    image_batch, label_batch = iterator.get_next()
-
-    return image_batch, label_batch
-
-  def preprocess(self, image):
-    """Preprocess a single image in [height, width, depth] layout."""
-    if self.subset == 'train' and self.use_distortion:
-      # Pad 4 pixels on each dimension of feature map, done in mini-batch
-      image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
-      image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])
-      image = tf.image.random_flip_left_right(image)
-    return image
-
-  @staticmethod
-  def num_examples_per_epoch(subset='train'):
-    if subset == 'train':
-      return 45000
-    elif subset == 'validation':
-      return 5000
-    elif subset == 'eval':
-      return 10000
-    else:
-      raise ValueError('Invalid data subset "%s"' % subset)
diff --git a/tutorials/image/cifar10_estimator/cifar10_main.py b/tutorials/image/cifar10_estimator/cifar10_main.py
deleted file mode 100644
index 51da6b94fa2..00000000000
--- a/tutorials/image/cifar10_estimator/cifar10_main.py
+++ /dev/null
@@ -1,521 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ResNet model for classifying images from CIFAR-10 dataset.
-
-Support single-host training with one or multiple devices.
-
-ResNet as proposed in:
-Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-Deep Residual Learning for Image Recognition. arXiv:1512.03385
-
-CIFAR-10 as in:
-http://www.cs.toronto.edu/~kriz/cifar.html
-
-
-"""
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import functools
-import itertools
-import os
-
-import cifar10
-import cifar10_model
-import cifar10_utils
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-tf.logging.set_verbosity(tf.logging.INFO)
-
-
-def get_model_fn(num_gpus, variable_strategy, num_workers):
-  """Returns a function that will build the resnet model."""
-
-  def _resnet_model_fn(features, labels, mode, params):
-    """Resnet model body.
-
-    Support single host, one or more GPU training. Parameter distribution can
-    be either one of the following scheme.
-    1. CPU is the parameter server and manages gradient updates.
-    2. Parameters are distributed evenly across all GPUs, and the first GPU
-       manages gradient updates.
-
-    Args:
-      features: a list of tensors, one for each tower
-      labels: a list of tensors, one for each tower
-      mode: ModeKeys.TRAIN or EVAL
-      params: Hyperparameters suitable for tuning
-    Returns:
-      A EstimatorSpec object.
-    """
-    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-    weight_decay = params.weight_decay
-    momentum = params.momentum
-
-    tower_features = features
-    tower_labels = labels
-    tower_losses = []
-    tower_gradvars = []
-    tower_preds = []
-
-    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
-    # on CPU. The exception is Intel MKL on CPU which is optimal with
-    # channels_last.
-    data_format = params.data_format
-    if not data_format:
-      if num_gpus == 0:
-        data_format = 'channels_last'
-      else:
-        data_format = 'channels_first'
-
-    if num_gpus == 0:
-      num_devices = 1
-      device_type = 'cpu'
-    else:
-      num_devices = num_gpus
-      device_type = 'gpu'
-
-    for i in range(num_devices):
-      worker_device = '/{}:{}'.format(device_type, i)
-      if variable_strategy == 'CPU':
-        device_setter = cifar10_utils.local_device_setter(
-            worker_device=worker_device)
-      elif variable_strategy == 'GPU':
-        device_setter = cifar10_utils.local_device_setter(
-            ps_device_type='gpu',
-            worker_device=worker_device,
-            ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
-                num_gpus, tf.contrib.training.byte_size_load_fn))
-      with tf.variable_scope('resnet', reuse=bool(i != 0)):
-        with tf.name_scope('tower_%d' % i) as name_scope:
-          with tf.device(device_setter):
-            loss, gradvars, preds = _tower_fn(
-                is_training, weight_decay, tower_features[i], tower_labels[i],
-                data_format, params.num_layers, params.batch_norm_decay,
-                params.batch_norm_epsilon)
-            tower_losses.append(loss)
-            tower_gradvars.append(gradvars)
-            tower_preds.append(preds)
-            if i == 0:
-              # Only trigger batch_norm moving mean and variance update from
-              # the 1st tower. Ideally, we should grab the updates from all
-              # towers but these stats accumulate extremely fast so we can
-              # ignore the other stats from the other towers without
-              # significant detriment.
-              update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
-                                             name_scope)
-
-    # Now compute global loss and gradients.
-    gradvars = []
-    with tf.name_scope('gradient_averaging'):
-      all_grads = {}
-      for grad, var in itertools.chain(*tower_gradvars):
-        if grad is not None:
-          all_grads.setdefault(var, []).append(grad)
-      for var, grads in six.iteritems(all_grads):
-        # Average gradients on the same device as the variables
-        # to which they apply.
-        with tf.device(var.device):
-          if len(grads) == 1:
-            avg_grad = grads[0]
-          else:
-            avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
-        gradvars.append((avg_grad, var))
-
-    # Device that runs the ops to apply global gradient updates.
-    consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
-    with tf.device(consolidation_device):
-      # Suggested learning rate scheduling from
-      # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
-      num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
-          'train') // (params.train_batch_size * num_workers)
-      boundaries = [
-          num_batches_per_epoch * x
-          for x in np.array([82, 123, 300], dtype=np.int64)
-      ]
-      staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
-
-      learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
-                                                  boundaries, staged_lr)
-
-      loss = tf.reduce_mean(tower_losses, name='loss')
-
-      examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
-          params.train_batch_size, every_n_steps=10)
-
-      tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
-
-      logging_hook = tf.train.LoggingTensorHook(
-          tensors=tensors_to_log, every_n_iter=100)
-
-      train_hooks = [logging_hook, examples_sec_hook]
-
-      optimizer = tf.train.MomentumOptimizer(
-          learning_rate=learning_rate, momentum=momentum)
-
-      if params.sync:
-        optimizer = tf.train.SyncReplicasOptimizer(
-            optimizer, replicas_to_aggregate=num_workers)
-        sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
-        train_hooks.append(sync_replicas_hook)
-
-      # Create single grouped train op
-      train_op = [
-          optimizer.apply_gradients(
-              gradvars, global_step=tf.train.get_global_step())
-      ]
-      train_op.extend(update_ops)
-      train_op = tf.group(*train_op)
-
-      predictions = {
-          'classes':
-              tf.concat([p['classes'] for p in tower_preds], axis=0),
-          'probabilities':
-              tf.concat([p['probabilities'] for p in tower_preds], axis=0)
-      }
-      stacked_labels = tf.concat(labels, axis=0)
-      metrics = {
-          'accuracy':
-              tf.metrics.accuracy(stacked_labels, predictions['classes'])
-      }
-
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        training_hooks=train_hooks,
-        eval_metric_ops=metrics)
-
-  return _resnet_model_fn
-
-
-def _tower_fn(is_training, weight_decay, feature, label, data_format,
-              num_layers, batch_norm_decay, batch_norm_epsilon):
-  """Build computation tower (Resnet).
-
-  Args:
-    is_training: true if is training graph.
-    weight_decay: weight regularization strength, a float.
-    feature: a Tensor.
-    label: a Tensor.
-    data_format: channels_last (NHWC) or channels_first (NCHW).
-    num_layers: number of layers, an int.
-    batch_norm_decay: decay for batch normalization, a float.
-    batch_norm_epsilon: epsilon for batch normalization, a float.
-
-  Returns:
-    A tuple with the loss for the tower, the gradients and parameters, and
-    predictions.
-
-  """
-  model = cifar10_model.ResNetCifar10(
-      num_layers,
-      batch_norm_decay=batch_norm_decay,
-      batch_norm_epsilon=batch_norm_epsilon,
-      is_training=is_training,
-      data_format=data_format)
-  logits = model.forward_pass(feature, input_data_format='channels_last')
-  tower_pred = {
-      'classes': tf.argmax(input=logits, axis=1),
-      'probabilities': tf.nn.softmax(logits)
-  }
-
-  tower_loss = tf.losses.sparse_softmax_cross_entropy(
-      logits=logits, labels=label)
-  tower_loss = tf.reduce_mean(tower_loss)
-
-  model_params = tf.trainable_variables()
-  tower_loss += weight_decay * tf.add_n(
-      [tf.nn.l2_loss(v) for v in model_params])
-
-  tower_grad = tf.gradients(tower_loss, model_params)
-
-  return tower_loss, zip(tower_grad, model_params), tower_pred
-
-
-def input_fn(data_dir,
-             subset,
-             num_shards,
-             batch_size,
-             use_distortion_for_training=True):
-  """Create input graph for model.
-
-  Args:
-    data_dir: Directory where TFRecords representing the dataset are located.
-    subset: one of 'train', 'validate' and 'eval'.
-    num_shards: num of towers participating in data-parallel training.
-    batch_size: total batch size for training to be divided by the number of
-    shards.
-    use_distortion_for_training: True to use distortions.
-  Returns:
-    two lists of tensors for features and labels, each of num_shards length.
-  """
-  with tf.device('/cpu:0'):
-    use_distortion = subset == 'train' and use_distortion_for_training
-    dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
-    image_batch, label_batch = dataset.make_batch(batch_size)
-    if num_shards <= 1:
-      # No GPU available or only 1 GPU.
-      return [image_batch], [label_batch]
-
-    # Note that passing num=batch_size is safe here, even though
-    # dataset.batch(batch_size) can, in some cases, return fewer than batch_size
-    # examples. This is because it does so only when repeating for a limited
-    # number of epochs, but our dataset repeats forever.
-    image_batch = tf.unstack(image_batch, num=batch_size, axis=0)
-    label_batch = tf.unstack(label_batch, num=batch_size, axis=0)
-    feature_shards = [[] for i in range(num_shards)]
-    label_shards = [[] for i in range(num_shards)]
-    for i in xrange(batch_size):
-      idx = i % num_shards
-      feature_shards[idx].append(image_batch[i])
-      label_shards[idx].append(label_batch[i])
-    feature_shards = [tf.parallel_stack(x) for x in feature_shards]
-    label_shards = [tf.parallel_stack(x) for x in label_shards]
-    return feature_shards, label_shards
-
-
-def get_experiment_fn(data_dir,
-                      num_gpus,
-                      variable_strategy,
-                      use_distortion_for_training=True):
-  """Returns an Experiment function.
-
-  Experiments perform training on several workers in parallel,
-  in other words experiments know how to invoke train and eval in a sensible
-  fashion for distributed training. Arguments passed directly to this
-  function are not tunable, all other arguments should be passed within
-  tf.HParams, passed to the enclosed function.
-
-  Args:
-      data_dir: str. Location of the data for input_fns.
-      num_gpus: int. Number of GPUs on each worker.
-      variable_strategy: String. CPU to use CPU as the parameter server
-      and GPU to use the GPUs as the parameter server.
-      use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
-  Returns:
-      A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
-      tf.contrib.learn.Experiment.
-
-      Suitable for use by tf.contrib.learn.learn_runner, which will run various
-      methods on Experiment (train, evaluate) based on information
-      about the current runner in `run_config`.
-  """
-
-  def _experiment_fn(run_config, hparams):
-    """Returns an Experiment."""
-    # Create estimator.
-    train_input_fn = functools.partial(
-        input_fn,
-        data_dir,
-        subset='train',
-        num_shards=num_gpus,
-        batch_size=hparams.train_batch_size,
-        use_distortion_for_training=use_distortion_for_training)
-
-    eval_input_fn = functools.partial(
-        input_fn,
-        data_dir,
-        subset='eval',
-        batch_size=hparams.eval_batch_size,
-        num_shards=num_gpus)
-
-    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
-    if num_eval_examples % hparams.eval_batch_size != 0:
-      raise ValueError(
-          'validation set size must be multiple of eval_batch_size')
-
-    train_steps = hparams.train_steps
-    eval_steps = num_eval_examples // hparams.eval_batch_size
- 
-    classifier = tf.estimator.Estimator(
-        model_fn=get_model_fn(num_gpus, variable_strategy,
-                              run_config.num_worker_replicas or 1),
-        config=run_config,
-        params=hparams)
-
-    # Create experiment.
-    return tf.contrib.learn.Experiment(
-        classifier,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        train_steps=train_steps,
-        eval_steps=eval_steps)
-
-  return _experiment_fn
-
-
-def main(job_dir, data_dir, num_gpus, variable_strategy,
-         use_distortion_for_training, log_device_placement, num_intra_threads,
-         **hparams):
-  # The env variable is on deprecation path, default is set to off.
-  os.environ['TF_SYNC_ON_FINISH'] = '0'
-  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
-
-  # Session configuration.
-  sess_config = tf.ConfigProto(
-      allow_soft_placement=True,
-      log_device_placement=log_device_placement,
-      intra_op_parallelism_threads=num_intra_threads,
-      gpu_options=tf.GPUOptions(force_gpu_compatible=True))
-
-  config = cifar10_utils.RunConfig(
-      session_config=sess_config, model_dir=job_dir)
-  tf.contrib.learn.learn_runner.run(
-      get_experiment_fn(data_dir, num_gpus, variable_strategy,
-                        use_distortion_for_training),
-      run_config=config,
-      hparams=tf.contrib.training.HParams(
-          is_chief=config.is_chief,
-          **hparams))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data-dir',
-      type=str,
-      required=True,
-      help='The directory where the CIFAR-10 input data is stored.')
-  parser.add_argument(
-      '--job-dir',
-      type=str,
-      required=True,
-      help='The directory where the model will be stored.')
-  parser.add_argument(
-      '--variable-strategy',
-      choices=['CPU', 'GPU'],
-      type=str,
-      default='CPU',
-      help='Where to locate variable operations')
-  parser.add_argument(
-      '--num-gpus',
-      type=int,
-      default=1,
-      help='The number of gpus used. Uses only CPU if set to 0.')
-  parser.add_argument(
-      '--num-layers',
-      type=int,
-      default=44,
-      help='The number of layers of the model.')
-  parser.add_argument(
-      '--train-steps',
-      type=int,
-      default=80000,
-      help='The number of steps to use for training.')
-  parser.add_argument(
-      '--train-batch-size',
-      type=int,
-      default=128,
-      help='Batch size for training.')
-  parser.add_argument(
-      '--eval-batch-size',
-      type=int,
-      default=100,
-      help='Batch size for validation.')
-  parser.add_argument(
-      '--momentum',
-      type=float,
-      default=0.9,
-      help='Momentum for MomentumOptimizer.')
-  parser.add_argument(
-      '--weight-decay',
-      type=float,
-      default=2e-4,
-      help='Weight decay for convolutions.')
-  parser.add_argument(
-      '--learning-rate',
-      type=float,
-      default=0.1,
-      help="""\
-      This is the inital learning rate value. The learning rate will decrease
-      during training. For more details check the model_fn implementation in
-      this file.\
-      """)
-  parser.add_argument(
-      '--use-distortion-for-training',
-      type=bool,
-      default=True,
-      help='If doing image distortion for training.')
-  parser.add_argument(
-      '--sync',
-      action='store_true',
-      default=False,
-      help="""\
-      If present when running in a distributed environment will run on sync mode.\
-      """)
-  parser.add_argument(
-      '--num-intra-threads',
-      type=int,
-      default=0,
-      help="""\
-      Number of threads to use for intra-op parallelism. When training on CPU
-      set to 0 to have the system pick the appropriate number or alternatively
-      set it to the number of physical CPU cores.\
-      """)
-  parser.add_argument(
-      '--num-inter-threads',
-      type=int,
-      default=0,
-      help="""\
-      Number of threads to use for inter-op parallelism. If set to 0, the
-      system will pick an appropriate number.\
-      """)
-  parser.add_argument(
-      '--data-format',
-      type=str,
-      default=None,
-      help="""\
-      If not set, the data format best for the training device is used. 
-      Allowed values: channels_first (NCHW) channels_last (NHWC).\
-      """)
-  parser.add_argument(
-      '--log-device-placement',
-      action='store_true',
-      default=False,
-      help='Whether to log device placement.')
-  parser.add_argument(
-      '--batch-norm-decay',
-      type=float,
-      default=0.997,
-      help='Decay for batch norm.')
-  parser.add_argument(
-      '--batch-norm-epsilon',
-      type=float,
-      default=1e-5,
-      help='Epsilon for batch norm.')
-  args = parser.parse_args()
-
-  if args.num_gpus > 0:
-    assert tf.test.is_gpu_available(), "Requested GPUs but none found."
-  if args.num_gpus < 0:
-    raise ValueError(
-        'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
-  if args.num_gpus == 0 and args.variable_strategy == 'GPU':
-    raise ValueError('num-gpus=0, CPU must be used as parameter server. Set'
-                     '--variable-strategy=CPU.')
-  if (args.num_layers - 2) % 6 != 0:
-    raise ValueError('Invalid --num-layers parameter.')
-  if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
-    raise ValueError('--train-batch-size must be multiple of --num-gpus.')
-  if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
-    raise ValueError('--eval-batch-size must be multiple of --num-gpus.')
-
-  main(**vars(args))
diff --git a/tutorials/image/cifar10_estimator/cifar10_model.py b/tutorials/image/cifar10_estimator/cifar10_model.py
deleted file mode 100644
index d67c233dbba..00000000000
--- a/tutorials/image/cifar10_estimator/cifar10_model.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Model class for Cifar10 Dataset."""
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-import model_base
-
-
-class ResNetCifar10(model_base.ResNet):
-  """Cifar10 model with ResNetV1 and basic residual block."""
-
-  def __init__(self,
-               num_layers,
-               is_training,
-               batch_norm_decay,
-               batch_norm_epsilon,
-               data_format='channels_first'):
-    super(ResNetCifar10, self).__init__(
-        is_training,
-        data_format,
-        batch_norm_decay,
-        batch_norm_epsilon
-    )
-    self.n = (num_layers - 2) // 6
-    # Add one in case label starts with 1. No impact if label starts with 0.
-    self.num_classes = 10 + 1
-    self.filters = [16, 16, 32, 64]
-    self.strides = [1, 2, 2]
-
-  def forward_pass(self, x, input_data_format='channels_last'):
-    """Build the core model within the graph."""
-    if self._data_format != input_data_format:
-      if input_data_format == 'channels_last':
-        # Computation requires channels_first.
-        x = tf.transpose(x, [0, 3, 1, 2])
-      else:
-        # Computation requires channels_last.
-        x = tf.transpose(x, [0, 2, 3, 1])
-
-    # Image standardization.
-    x = x / 128 - 1
-
-    x = self._conv(x, 3, 16, 1)
-    x = self._batch_norm(x)
-    x = self._relu(x)
-
-    # Use basic (non-bottleneck) block and ResNet V1 (post-activation).
-    res_func = self._residual_v1
-
-    # 3 stages of block stacking.
-    for i in range(3):
-      with tf.name_scope('stage'):
-        for j in range(self.n):
-          if j == 0:
-            # First block in a stage, filters and strides may change.
-            x = res_func(x, 3, self.filters[i], self.filters[i + 1],
-                         self.strides[i])
-          else:
-            # Following blocks in a stage, constant filters and unit stride.
-            x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1)
-
-    x = self._global_avg_pool(x)
-    x = self._fully_connected(x, self.num_classes)
-
-    return x
diff --git a/tutorials/image/cifar10_estimator/cifar10_utils.py b/tutorials/image/cifar10_estimator/cifar10_utils.py
deleted file mode 100644
index 9082cbfece4..00000000000
--- a/tutorials/image/cifar10_estimator/cifar10_utils.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import collections
-import six
-
-import tensorflow as tf
-
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-from tensorflow.python.training import device_setter
-from tensorflow.contrib.learn.python.learn import run_config
-
-
-# TODO(b/64848083) Remove once uid bug is fixed
-class RunConfig(tf.contrib.learn.RunConfig): 
-  def uid(self, whitelist=None):
-    """Generates a 'Unique Identifier' based on all internal fields.
-    Caller should use the uid string to check `RunConfig` instance integrity
-    in one session use, but should not rely on the implementation details, which
-    is subject to change.
-    Args:
-      whitelist: A list of the string names of the properties uid should not
-        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
-        includes most properties user allowes to change.
-    Returns:
-      A uid string.
-    """
-    if whitelist is None:
-      whitelist = run_config._DEFAULT_UID_WHITE_LIST
-
-    state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
-    # Pop out the keys in whitelist.
-    for k in whitelist:
-      state.pop('_' + k, None)
-
-    ordered_state = collections.OrderedDict(
-        sorted(state.items(), key=lambda t: t[0]))
-    # For class instance without __repr__, some special cares are required.
-    # Otherwise, the object address will be used.
-    if '_cluster_spec' in ordered_state:
-      ordered_state['_cluster_spec'] = collections.OrderedDict(
-         sorted(ordered_state['_cluster_spec'].as_dict().items(),
-                key=lambda t: t[0])
-      )
-    return ', '.join(
-        '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) 
-
-
-class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
-  """Hook to print out examples per second.
-
-    Total time is tracked and then divided by the total number of steps
-    to get the average step time and then batch_size is used to determine
-    the running average of examples per second. The examples per second for the
-    most recent interval is also logged.
-  """
-
-  def __init__(
-      self,
-      batch_size,
-      every_n_steps=100,
-      every_n_secs=None,):
-    """Initializer for ExamplesPerSecondHook.
-
-      Args:
-      batch_size: Total batch size used to calculate examples/second from
-      global time.
-      every_n_steps: Log stats every n steps.
-      every_n_secs: Log stats every n seconds.
-    """
-    if (every_n_steps is None) == (every_n_secs is None):
-      raise ValueError('exactly one of every_n_steps'
-                       ' and every_n_secs should be provided.')
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_steps=every_n_steps, every_secs=every_n_secs)
-
-    self._step_train_time = 0
-    self._total_steps = 0
-    self._batch_size = batch_size
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          'Global step should be created to use StepCounterHook.')
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-
-    global_step = run_values.results
-    if self._timer.should_trigger_for_step(global_step):
-      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
-          global_step)
-      if elapsed_time is not None:
-        steps_per_sec = elapsed_steps / elapsed_time
-        self._step_train_time += elapsed_time
-        self._total_steps += elapsed_steps
-
-        average_examples_per_sec = self._batch_size * (
-            self._total_steps / self._step_train_time)
-        current_examples_per_sec = steps_per_sec * self._batch_size
-        # Average examples/sec followed by current examples/sec
-        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
-                     average_examples_per_sec, current_examples_per_sec,
-                     self._total_steps)
-
-def local_device_setter(num_devices=1,
-                        ps_device_type='cpu',
-                        worker_device='/cpu:0',
-                        ps_ops=None,
-                        ps_strategy=None):
-  if ps_ops == None:
-    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
-
-  if ps_strategy is None:
-    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
-  if not six.callable(ps_strategy):
-    raise TypeError("ps_strategy must be callable")
-
-  def _local_device_chooser(op):
-    current_device = pydev.DeviceSpec.from_string(op.device or "")
-
-    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
-    if node_def.op in ps_ops:
-      ps_device_spec = pydev.DeviceSpec.from_string(
-          '/{}:{}'.format(ps_device_type, ps_strategy(op)))
-
-      ps_device_spec.merge_from(current_device)
-      return ps_device_spec.to_string()
-    else:
-      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
-      worker_device_spec.merge_from(current_device)
-      return worker_device_spec.to_string()
-  return _local_device_chooser
diff --git a/tutorials/image/cifar10_estimator/cmle_config.yaml b/tutorials/image/cifar10_estimator/cmle_config.yaml
deleted file mode 100644
index 76f920534ef..00000000000
--- a/tutorials/image/cifar10_estimator/cmle_config.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-trainingInput:
-  scaleTier: CUSTOM
-  masterType: complex_model_m_gpu
-  workerType: complex_model_m_gpu
-  parameterServerType: complex_model_m
-  workerCount: 1
diff --git a/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py b/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py
deleted file mode 100644
index d1a599c31bf..00000000000
--- a/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords.
-
-Generates tf.train.Example protos and writes them to TFRecord files from the
-python version of the CIFAR-10 dataset downloaded from
-https://www.cs.toronto.edu/~kriz/cifar.html.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-
-import tarfile
-from six.moves import cPickle as pickle
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-CIFAR_FILENAME = 'cifar-10-python.tar.gz'
-CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
-CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
-
-
-def download_and_extract(data_dir):
-  # download CIFAR-10 if not already downloaded.
-  tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir,
-                                                CIFAR_DOWNLOAD_URL)
-  tarfile.open(os.path.join(data_dir, CIFAR_FILENAME),
-               'r:gz').extractall(data_dir)
-
-
-def _int64_feature(value):
-  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
-
-
-def _bytes_feature(value):
-  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
-
-
-def _get_file_names():
-  """Returns the file names expected to exist in the input_dir."""
-  file_names = {}
-  file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
-  file_names['validation'] = ['data_batch_5']
-  file_names['eval'] = ['test_batch']
-  return file_names
-
-
-def read_pickle_from_file(filename):
-  with tf.gfile.Open(filename, 'rb') as f:
-    if sys.version_info >= (3, 0):
-      data_dict = pickle.load(f, encoding='bytes')
-    else:
-      data_dict = pickle.load(f)
-  return data_dict
-
-
-def convert_to_tfrecord(input_files, output_file):
-  """Converts a file to TFRecords."""
-  print('Generating %s' % output_file)
-  with tf.python_io.TFRecordWriter(output_file) as record_writer:
-    for input_file in input_files:
-      data_dict = read_pickle_from_file(input_file)
-      data = data_dict[b'data']
-      labels = data_dict[b'labels']
-      num_entries_in_batch = len(labels)
-      for i in range(num_entries_in_batch):
-        example = tf.train.Example(features=tf.train.Features(
-            feature={
-                'image': _bytes_feature(data[i].tobytes()),
-                'label': _int64_feature(labels[i])
-            }))
-        record_writer.write(example.SerializeToString())
-
-
-def main(data_dir):
-  print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
-  download_and_extract(data_dir)
-  file_names = _get_file_names()
-  input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
-  for mode, files in file_names.items():
-    input_files = [os.path.join(input_dir, f) for f in files]
-    output_file = os.path.join(data_dir, mode + '.tfrecords')
-    try:
-      os.remove(output_file)
-    except OSError:
-      pass
-    # Convert to tf.train.Example and write the to TFRecords.
-    convert_to_tfrecord(input_files, output_file)
-  print('Done!')
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data-dir',
-      type=str,
-      default='',
-      help='Directory to download and extract CIFAR-10 to.')
-
-  args = parser.parse_args()
-  main(args.data_dir)
diff --git a/tutorials/image/cifar10_estimator/model_base.py b/tutorials/image/cifar10_estimator/model_base.py
deleted file mode 100644
index 35e52b8355d..00000000000
--- a/tutorials/image/cifar10_estimator/model_base.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ResNet model.
-
-Related papers:
-https://arxiv.org/pdf/1603.05027v2.pdf
-https://arxiv.org/pdf/1512.03385v1.pdf
-https://arxiv.org/pdf/1605.07146v1.pdf
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-class ResNet(object):
-  """ResNet model."""
-
-  def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
-    """ResNet constructor.
-
-    Args:
-      is_training: if build training or inference model.
-      data_format: the data_format used during computation.
-                   one of 'channels_first' or 'channels_last'.
-    """
-    self._batch_norm_decay = batch_norm_decay
-    self._batch_norm_epsilon = batch_norm_epsilon
-    self._is_training = is_training
-    assert data_format in ('channels_first', 'channels_last')
-    self._data_format = data_format
-
-  def forward_pass(self, x):
-    raise NotImplementedError(
-        'forward_pass() is implemented in ResNet sub classes')
-
-  def _residual_v1(self,
-                   x,
-                   kernel_size,
-                   in_filter,
-                   out_filter,
-                   stride,
-                   activate_before_residual=False):
-    """Residual unit with 2 sub layers, using Plan A for shortcut connection."""
-
-    del activate_before_residual
-    with tf.name_scope('residual_v1') as name_scope:
-      orig_x = x
-
-      x = self._conv(x, kernel_size, out_filter, stride)
-      x = self._batch_norm(x)
-      x = self._relu(x)
-
-      x = self._conv(x, kernel_size, out_filter, 1)
-      x = self._batch_norm(x)
-
-      if in_filter != out_filter:
-        orig_x = self._avg_pool(orig_x, stride, stride)
-        pad = (out_filter - in_filter) // 2
-        if self._data_format == 'channels_first':
-          orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
-        else:
-          orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
-
-      x = self._relu(tf.add(x, orig_x))
-
-      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-      return x
-
-  def _residual_v2(self,
-                   x,
-                   in_filter,
-                   out_filter,
-                   stride,
-                   activate_before_residual=False):
-    """Residual unit with 2 sub layers with preactivation, plan A shortcut."""
-
-    with tf.name_scope('residual_v2') as name_scope:
-      if activate_before_residual:
-        x = self._batch_norm(x)
-        x = self._relu(x)
-        orig_x = x
-      else:
-        orig_x = x
-        x = self._batch_norm(x)
-        x = self._relu(x)
-
-      x = self._conv(x, 3, out_filter, stride)
-
-      x = self._batch_norm(x)
-      x = self._relu(x)
-      x = self._conv(x, 3, out_filter, [1, 1, 1, 1])
-
-      if in_filter != out_filter:
-        pad = (out_filter - in_filter) // 2
-        orig_x = self._avg_pool(orig_x, stride, stride)
-        if self._data_format == 'channels_first':
-          orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
-        else:
-          orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
-
-      x = tf.add(x, orig_x)
-
-      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-      return x
-
-  def _bottleneck_residual_v2(self,
-                              x,
-                              in_filter,
-                              out_filter,
-                              stride,
-                              activate_before_residual=False):
-    """Bottleneck residual unit with 3 sub layers, plan B shortcut."""
-
-    with tf.name_scope('bottle_residual_v2') as name_scope:
-      if activate_before_residual:
-        x = self._batch_norm(x)
-        x = self._relu(x)
-        orig_x = x
-      else:
-        orig_x = x
-        x = self._batch_norm(x)
-        x = self._relu(x)
-
-      x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True)
-
-      x = self._batch_norm(x)
-      x = self._relu(x)
-      # pad when stride isn't unit
-      x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True)
-
-      x = self._batch_norm(x)
-      x = self._relu(x)
-      x = self._conv(x, 1, out_filter, 1, is_atrous=True)
-
-      if in_filter != out_filter:
-        orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True)
-      x = tf.add(x, orig_x)
-
-      tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-      return x
-
-  def _conv(self, x, kernel_size, filters, strides, is_atrous=False):
-    """Convolution."""
-
-    padding = 'SAME'
-    if not is_atrous and strides > 1:
-      pad = kernel_size - 1
-      pad_beg = pad // 2
-      pad_end = pad - pad_beg
-      if self._data_format == 'channels_first':
-        x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
-      else:
-        x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
-      padding = 'VALID'
-    return tf.layers.conv2d(
-        inputs=x,
-        kernel_size=kernel_size,
-        filters=filters,
-        strides=strides,
-        padding=padding,
-        use_bias=False,
-        data_format=self._data_format)
-
-  def _batch_norm(self, x):
-    if self._data_format == 'channels_first':
-      data_format = 'NCHW'
-    else:
-      data_format = 'NHWC'
-    return tf.contrib.layers.batch_norm(
-        x,
-        decay=self._batch_norm_decay,
-        center=True,
-        scale=True,
-        epsilon=self._batch_norm_epsilon,
-        is_training=self._is_training,
-        fused=True,
-        data_format=data_format)
-
-  def _relu(self, x):
-    return tf.nn.relu(x)
-
-  def _fully_connected(self, x, out_dim):
-    with tf.name_scope('fully_connected') as name_scope:
-      x = tf.layers.dense(x, out_dim)
-
-    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-    return x
-
-  def _avg_pool(self, x, pool_size, stride):
-    with tf.name_scope('avg_pool') as name_scope:
-      x = tf.layers.average_pooling2d(
-          x, pool_size, stride, 'SAME', data_format=self._data_format)
-
-    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-    return x
-
-  def _global_avg_pool(self, x):
-    with tf.name_scope('global_avg_pool') as name_scope:
-      assert x.get_shape().ndims == 4
-      if self._data_format == 'channels_first':
-        x = tf.reduce_mean(x, [2, 3])
-      else:
-        x = tf.reduce_mean(x, [1, 2])
-    tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
-    return x
diff --git a/tutorials/image/imagenet/BUILD b/tutorials/image/imagenet/BUILD
deleted file mode 100644
index b3ff258eb4e..00000000000
--- a/tutorials/image/imagenet/BUILD
+++ /dev/null
@@ -1,30 +0,0 @@
-# Description:
-# Example TensorFlow models for ImageNet.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "classify_image",
-    srcs = [
-        "classify_image.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/image/imagenet/classify_image.py b/tutorials/image/imagenet/classify_image.py
deleted file mode 100644
index c2850f58ea3..00000000000
--- a/tutorials/image/imagenet/classify_image.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Simple image classification with Inception.
-
-Run image classification with Inception trained on ImageNet 2012 Challenge data
-set.
-
-This program creates a graph from a saved GraphDef protocol buffer,
-and runs inference on an input JPEG image. It outputs human readable
-strings of the top 5 predictions along with their probabilities.
-
-Change the --image_file argument to any jpg image to compute a
-classification of that image.
-
-Please see the tutorial and website for a detailed description of how
-to use this script to perform image recognition.
-
-https://tensorflow.org/tutorials/image_recognition/
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os.path
-import re
-import sys
-import tarfile
-
-import numpy as np
-from six.moves import urllib
-import tensorflow as tf
-
-FLAGS = None
-
-# pylint: disable=line-too-long
-DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
-# pylint: enable=line-too-long
-
-
-class NodeLookup(object):
-  """Converts integer node ID's to human readable labels."""
-
-  def __init__(self,
-               label_lookup_path=None,
-               uid_lookup_path=None):
-    if not label_lookup_path:
-      label_lookup_path = os.path.join(
-          FLAGS.model_dir, 'imagenet_2012_challenge_label_map_proto.pbtxt')
-    if not uid_lookup_path:
-      uid_lookup_path = os.path.join(
-          FLAGS.model_dir, 'imagenet_synset_to_human_label_map.txt')
-    self.node_lookup = self.load(label_lookup_path, uid_lookup_path)
-
-  def load(self, label_lookup_path, uid_lookup_path):
-    """Loads a human readable English name for each softmax node.
-
-    Args:
-      label_lookup_path: string UID to integer node ID.
-      uid_lookup_path: string UID to human-readable string.
-
-    Returns:
-      dict from integer node ID to human-readable string.
-    """
-    if not tf.gfile.Exists(uid_lookup_path):
-      tf.logging.fatal('File does not exist %s', uid_lookup_path)
-    if not tf.gfile.Exists(label_lookup_path):
-      tf.logging.fatal('File does not exist %s', label_lookup_path)
-
-    # Loads mapping from string UID to human-readable string
-    proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines()
-    uid_to_human = {}
-    p = re.compile(r'[n\d]*[ \S,]*')
-    for line in proto_as_ascii_lines:
-      parsed_items = p.findall(line)
-      uid = parsed_items[0]
-      human_string = parsed_items[2]
-      uid_to_human[uid] = human_string
-
-    # Loads mapping from string UID to integer node ID.
-    node_id_to_uid = {}
-    proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines()
-    for line in proto_as_ascii:
-      if line.startswith('  target_class:'):
-        target_class = int(line.split(': ')[1])
-      if line.startswith('  target_class_string:'):
-        target_class_string = line.split(': ')[1]
-        node_id_to_uid[target_class] = target_class_string[1:-2]
-
-    # Loads the final mapping of integer node ID to human-readable string
-    node_id_to_name = {}
-    for key, val in node_id_to_uid.items():
-      if val not in uid_to_human:
-        tf.logging.fatal('Failed to locate: %s', val)
-      name = uid_to_human[val]
-      node_id_to_name[key] = name
-
-    return node_id_to_name
-
-  def id_to_string(self, node_id):
-    if node_id not in self.node_lookup:
-      return ''
-    return self.node_lookup[node_id]
-
-
-def create_graph():
-  """Creates a graph from saved GraphDef file and returns a saver."""
-  # Creates graph from saved graph_def.pb.
-  with tf.gfile.FastGFile(os.path.join(
-      FLAGS.model_dir, 'classify_image_graph_def.pb'), 'rb') as f:
-    graph_def = tf.GraphDef()
-    graph_def.ParseFromString(f.read())
-    _ = tf.import_graph_def(graph_def, name='')
-
-
-def run_inference_on_image(image):
-  """Runs inference on an image.
-
-  Args:
-    image: Image file name.
-
-  Returns:
-    Nothing
-  """
-  if not tf.gfile.Exists(image):
-    tf.logging.fatal('File does not exist %s', image)
-  image_data = tf.gfile.FastGFile(image, 'rb').read()
-
-  # Creates graph from saved GraphDef.
-  create_graph()
-
-  with tf.Session() as sess:
-    # Some useful tensors:
-    # 'softmax:0': A tensor containing the normalized prediction across
-    #   1000 labels.
-    # 'pool_3:0': A tensor containing the next-to-last layer containing 2048
-    #   float description of the image.
-    # 'DecodeJpeg/contents:0': A tensor containing a string providing JPEG
-    #   encoding of the image.
-    # Runs the softmax tensor by feeding the image_data as input to the graph.
-    softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
-    predictions = sess.run(softmax_tensor,
-                           {'DecodeJpeg/contents:0': image_data})
-    predictions = np.squeeze(predictions)
-
-    # Creates node ID --> English string lookup.
-    node_lookup = NodeLookup()
-
-    top_k = predictions.argsort()[-FLAGS.num_top_predictions:][::-1]
-    for node_id in top_k:
-      human_string = node_lookup.id_to_string(node_id)
-      score = predictions[node_id]
-      print('%s (score = %.5f)' % (human_string, score))
-
-
-def maybe_download_and_extract():
-  """Download and extract model tar file."""
-  dest_directory = FLAGS.model_dir
-  if not os.path.exists(dest_directory):
-    os.makedirs(dest_directory)
-  filename = DATA_URL.split('/')[-1]
-  filepath = os.path.join(dest_directory, filename)
-  if not os.path.exists(filepath):
-    def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-          filename, float(count * block_size) / float(total_size) * 100.0))
-      sys.stdout.flush()
-    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
-    print()
-    statinfo = os.stat(filepath)
-    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
-  tarfile.open(filepath, 'r:gz').extractall(dest_directory)
-
-
-def main(_):
-  maybe_download_and_extract()
-  image = (FLAGS.image_file if FLAGS.image_file else
-           os.path.join(FLAGS.model_dir, 'cropped_panda.jpg'))
-  run_inference_on_image(image)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  # classify_image_graph_def.pb:
-  #   Binary representation of the GraphDef protocol buffer.
-  # imagenet_synset_to_human_label_map.txt:
-  #   Map from synset ID to a human readable string.
-  # imagenet_2012_challenge_label_map_proto.pbtxt:
-  #   Text representation of a protocol buffer mapping a label to synset ID.
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='/tmp/imagenet',
-      help="""\
-      Path to classify_image_graph_def.pb,
-      imagenet_synset_to_human_label_map.txt, and
-      imagenet_2012_challenge_label_map_proto.pbtxt.\
-      """
-  )
-  parser.add_argument(
-      '--image_file',
-      type=str,
-      default='',
-      help='Absolute path to image file.'
-  )
-  parser.add_argument(
-      '--num_top_predictions',
-      type=int,
-      default=5,
-      help='Display this many predictions.'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tutorials/image/mnist/BUILD b/tutorials/image/mnist/BUILD
deleted file mode 100644
index a9b6d78e5e6..00000000000
--- a/tutorials/image/mnist/BUILD
+++ /dev/null
@@ -1,42 +0,0 @@
-# Description:
-# Example TensorFlow models for MNIST that achieves high accuracy
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "convolutional",
-    srcs = [
-        "convolutional.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_test(
-    name = "convolutional_test",
-    size = "medium",
-    srcs = [
-        "convolutional.py",
-    ],
-    args = [
-        "--self_test",
-    ],
-    main = "convolutional.py",
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/image/mnist/__init__.py b/tutorials/image/mnist/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/tutorials/image/mnist/convolutional.py b/tutorials/image/mnist/convolutional.py
deleted file mode 100644
index b38d4bd351b..00000000000
--- a/tutorials/image/mnist/convolutional.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example.
-
-This should achieve a test error of 0.7%. Please keep this model as simple and
-linear as possible, it is meant as a tutorial for simple convolutional models.
-Run with --self_test on the command line to execute a short self-test.
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import gzip
-import os
-import sys
-import time
-
-import numpy
-from six.moves import urllib
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-# CVDF mirror of http://yann.lecun.com/exdb/mnist/
-SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
-WORK_DIRECTORY = 'data'
-IMAGE_SIZE = 28
-NUM_CHANNELS = 1
-PIXEL_DEPTH = 255
-NUM_LABELS = 10
-VALIDATION_SIZE = 5000  # Size of the validation set.
-SEED = 66478  # Set to None for random seed.
-BATCH_SIZE = 64
-NUM_EPOCHS = 10
-EVAL_BATCH_SIZE = 64
-EVAL_FREQUENCY = 100  # Number of steps between evaluations.
-
-
-FLAGS = None
-
-
-def data_type():
-  """Return the type of the activations, weights, and placeholder variables."""
-  if FLAGS.use_fp16:
-    return tf.float16
-  else:
-    return tf.float32
-
-
-def maybe_download(filename):
-  """Download the data from Yann's website, unless it's already here."""
-  if not tf.gfile.Exists(WORK_DIRECTORY):
-    tf.gfile.MakeDirs(WORK_DIRECTORY)
-  filepath = os.path.join(WORK_DIRECTORY, filename)
-  if not tf.gfile.Exists(filepath):
-    filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)
-    with tf.gfile.GFile(filepath) as f:
-      size = f.size()
-    print('Successfully downloaded', filename, size, 'bytes.')
-  return filepath
-
-
-def extract_data(filename, num_images):
-  """Extract the images into a 4D tensor [image index, y, x, channels].
-
-  Values are rescaled from [0, 255] down to [-0.5, 0.5].
-  """
-  print('Extracting', filename)
-  with gzip.open(filename) as bytestream:
-    bytestream.read(16)
-    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
-    data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
-    data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
-    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
-    return data
-
-
-def extract_labels(filename, num_images):
-  """Extract the labels into a vector of int64 label IDs."""
-  print('Extracting', filename)
-  with gzip.open(filename) as bytestream:
-    bytestream.read(8)
-    buf = bytestream.read(1 * num_images)
-    labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64)
-  return labels
-
-
-def fake_data(num_images):
-  """Generate a fake dataset that matches the dimensions of MNIST."""
-  data = numpy.ndarray(
-      shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS),
-      dtype=numpy.float32)
-  labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64)
-  for image in xrange(num_images):
-    label = image % 2
-    data[image, :, :, 0] = label - 0.5
-    labels[image] = label
-  return data, labels
-
-
-def error_rate(predictions, labels):
-  """Return the error rate based on dense predictions and sparse labels."""
-  return 100.0 - (
-      100.0 *
-      numpy.sum(numpy.argmax(predictions, 1) == labels) /
-      predictions.shape[0])
-
-
-def main(_):
-  if FLAGS.self_test:
-    print('Running self-test.')
-    train_data, train_labels = fake_data(256)
-    validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE)
-    test_data, test_labels = fake_data(EVAL_BATCH_SIZE)
-    num_epochs = 1
-  else:
-    # Get the data.
-    train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
-    train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
-    test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
-    test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')
-
-    # Extract it into numpy arrays.
-    train_data = extract_data(train_data_filename, 60000)
-    train_labels = extract_labels(train_labels_filename, 60000)
-    test_data = extract_data(test_data_filename, 10000)
-    test_labels = extract_labels(test_labels_filename, 10000)
-
-    # Generate a validation set.
-    validation_data = train_data[:VALIDATION_SIZE, ...]
-    validation_labels = train_labels[:VALIDATION_SIZE]
-    train_data = train_data[VALIDATION_SIZE:, ...]
-    train_labels = train_labels[VALIDATION_SIZE:]
-    num_epochs = NUM_EPOCHS
-  train_size = train_labels.shape[0]
-
-  # This is where training samples and labels are fed to the graph.
-  # These placeholder nodes will be fed a batch of training data at each
-  # training step using the {feed_dict} argument to the Run() call below.
-  train_data_node = tf.placeholder(
-      data_type(),
-      shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
-  train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
-  eval_data = tf.placeholder(
-      data_type(),
-      shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
-
-  # The variables below hold all the trainable weights. They are passed an
-  # initial value which will be assigned when we call:
-  # {tf.global_variables_initializer().run()}
-  conv1_weights = tf.Variable(
-      tf.truncated_normal([5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
-                          stddev=0.1,
-                          seed=SEED, dtype=data_type()))
-  conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
-  conv2_weights = tf.Variable(tf.truncated_normal(
-      [5, 5, 32, 64], stddev=0.1,
-      seed=SEED, dtype=data_type()))
-  conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
-  fc1_weights = tf.Variable(  # fully connected, depth 512.
-      tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
-                          stddev=0.1,
-                          seed=SEED,
-                          dtype=data_type()))
-  fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
-  fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
-                                                stddev=0.1,
-                                                seed=SEED,
-                                                dtype=data_type()))
-  fc2_biases = tf.Variable(tf.constant(
-      0.1, shape=[NUM_LABELS], dtype=data_type()))
-
-  # We will replicate the model structure for the training subgraph, as well
-  # as the evaluation subgraphs, while sharing the trainable parameters.
-  def model(data, train=False):
-    """The Model definition."""
-    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
-    # the same size as the input). Note that {strides} is a 4D array whose
-    # shape matches the data layout: [image index, y, x, depth].
-    conv = tf.nn.conv2d(data,
-                        conv1_weights,
-                        strides=[1, 1, 1, 1],
-                        padding='SAME')
-    # Bias and rectified linear non-linearity.
-    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
-    # Max pooling. The kernel size spec {ksize} also follows the layout of
-    # the data. Here we have a pooling window of 2, and a stride of 2.
-    pool = tf.nn.max_pool(relu,
-                          ksize=[1, 2, 2, 1],
-                          strides=[1, 2, 2, 1],
-                          padding='SAME')
-    conv = tf.nn.conv2d(pool,
-                        conv2_weights,
-                        strides=[1, 1, 1, 1],
-                        padding='SAME')
-    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
-    pool = tf.nn.max_pool(relu,
-                          ksize=[1, 2, 2, 1],
-                          strides=[1, 2, 2, 1],
-                          padding='SAME')
-    # Reshape the feature map cuboid into a 2D matrix to feed it to the
-    # fully connected layers.
-    pool_shape = pool.get_shape().as_list()
-    reshape = tf.reshape(
-        pool,
-        [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
-    # Fully connected layer. Note that the '+' operation automatically
-    # broadcasts the biases.
-    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
-    # Add a 50% dropout during training only. Dropout also scales
-    # activations such that no rescaling is needed at evaluation time.
-    if train:
-      hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
-    return tf.matmul(hidden, fc2_weights) + fc2_biases
-
-  # Training computation: logits + cross-entropy loss.
-  logits = model(train_data_node, True)
-  loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
-      labels=train_labels_node, logits=logits))
-
-  # L2 regularization for the fully connected parameters.
-  regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
-                  tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
-  # Add the regularization term to the loss.
-  loss += 5e-4 * regularizers
-
-  # Optimizer: set up a variable that's incremented once per batch and
-  # controls the learning rate decay.
-  batch = tf.Variable(0, dtype=data_type())
-  # Decay once per epoch, using an exponential schedule starting at 0.01.
-  learning_rate = tf.train.exponential_decay(
-      0.01,                # Base learning rate.
-      batch * BATCH_SIZE,  # Current index into the dataset.
-      train_size,          # Decay step.
-      0.95,                # Decay rate.
-      staircase=True)
-  # Use simple momentum for the optimization.
-  optimizer = tf.train.MomentumOptimizer(learning_rate,
-                                         0.9).minimize(loss,
-                                                       global_step=batch)
-
-  # Predictions for the current training minibatch.
-  train_prediction = tf.nn.softmax(logits)
-
-  # Predictions for the test and validation, which we'll compute less often.
-  eval_prediction = tf.nn.softmax(model(eval_data))
-
-  # Small utility function to evaluate a dataset by feeding batches of data to
-  # {eval_data} and pulling the results from {eval_predictions}.
-  # Saves memory and enables this to run on smaller GPUs.
-  def eval_in_batches(data, sess):
-    """Get all predictions for a dataset by running it in small batches."""
-    size = data.shape[0]
-    if size < EVAL_BATCH_SIZE:
-      raise ValueError("batch size for evals larger than dataset: %d" % size)
-    predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
-    for begin in xrange(0, size, EVAL_BATCH_SIZE):
-      end = begin + EVAL_BATCH_SIZE
-      if end <= size:
-        predictions[begin:end, :] = sess.run(
-            eval_prediction,
-            feed_dict={eval_data: data[begin:end, ...]})
-      else:
-        batch_predictions = sess.run(
-            eval_prediction,
-            feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
-        predictions[begin:, :] = batch_predictions[begin - size:, :]
-    return predictions
-
-  # Create a local session to run the training.
-  start_time = time.time()
-  with tf.Session() as sess:
-    # Run all the initializers to prepare the trainable parameters.
-    tf.global_variables_initializer().run()
-    print('Initialized!')
-    # Loop through training steps.
-    for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
-      # Compute the offset of the current minibatch in the data.
-      # Note that we could use better randomization across epochs.
-      offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
-      batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
-      batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
-      # This dictionary maps the batch data (as a numpy array) to the
-      # node in the graph it should be fed to.
-      feed_dict = {train_data_node: batch_data,
-                   train_labels_node: batch_labels}
-      # Run the optimizer to update weights.
-      sess.run(optimizer, feed_dict=feed_dict)
-      # print some extra information once reach the evaluation frequency
-      if step % EVAL_FREQUENCY == 0:
-        # fetch some extra nodes' data
-        l, lr, predictions = sess.run([loss, learning_rate, train_prediction],
-                                      feed_dict=feed_dict)
-        elapsed_time = time.time() - start_time
-        start_time = time.time()
-        print('Step %d (epoch %.2f), %.1f ms' %
-              (step, float(step) * BATCH_SIZE / train_size,
-               1000 * elapsed_time / EVAL_FREQUENCY))
-        print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
-        print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
-        print('Validation error: %.1f%%' % error_rate(
-            eval_in_batches(validation_data, sess), validation_labels))
-        sys.stdout.flush()
-    # Finally print the result!
-    test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
-    print('Test error: %.1f%%' % test_error)
-    if FLAGS.self_test:
-      print('test_error', test_error)
-      assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % (
-          test_error,)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--use_fp16',
-      default=False,
-      help='Use half floats instead of full floats if True.',
-      action='store_true')
-  parser.add_argument(
-      '--self_test',
-      default=False,
-      action='store_true',
-      help='True if running a self test.')
-
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tutorials/rnn/BUILD b/tutorials/rnn/BUILD
deleted file mode 100644
index 118884fd28d..00000000000
--- a/tutorials/rnn/BUILD
+++ /dev/null
@@ -1,80 +0,0 @@
-# Description:
-# Example RNN models, including language models and sequence-to-sequence models.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "linear",
-    srcs = [
-        "linear.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "rnn_cell",
-    srcs = [
-        "rnn_cell.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":linear",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "package",
-    srcs = [
-        "__init__.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":rnn",
-        ":rnn_cell",
-        ":seq2seq",
-    ],
-)
-
-py_library(
-    name = "rnn",
-    srcs = [
-        "rnn.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":rnn_cell",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "seq2seq",
-    srcs = [
-        "seq2seq.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":rnn",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/rnn/README.md b/tutorials/rnn/README.md
deleted file mode 100644
index 5166d14c3c6..00000000000
--- a/tutorials/rnn/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-This directory contains functions for creating recurrent neural networks
-and sequence-to-sequence models. Detailed instructions on how to get started
-and use them are available in the
-[tutorials on tensorflow.org](http://tensorflow.org/tutorials/).
-
-Here is a short overview of what is in this directory:
-
-
-File         | What's in it?
------------- | -------------
-`ptb/`       | PTB language model, see the [RNN Tutorial](http://tensorflow.org/tutorials/recurrent/)
-`quickdraw/` | Quick, Draw! model, see the [RNN Tutorial for Drawing Classification](https://www.tensorflow.org/versions/master/tutorials/recurrent_quickdraw)
-
-If you're looking for the 
-[`seq2seq` tutorial code](http://tensorflow.org/tutorials/seq2seq/), it lives
-in [its own repo](https://github.com/tensorflow/nmt).
\ No newline at end of file
diff --git a/tutorials/rnn/__init__.py b/tutorials/rnn/__init__.py
deleted file mode 100644
index 844cc0b854e..00000000000
--- a/tutorials/rnn/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Libraries to build Recurrent Neural Networks."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
diff --git a/tutorials/rnn/ptb/BUILD b/tutorials/rnn/ptb/BUILD
deleted file mode 100644
index a79fa202784..00000000000
--- a/tutorials/rnn/ptb/BUILD
+++ /dev/null
@@ -1,69 +0,0 @@
-# Description:
-# Python support for TensorFlow.
-
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "package",
-    srcs = [
-        "__init__.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":reader",
-    ],
-)
-
-py_library(
-    name = "reader",
-    srcs = ["reader.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_test(
-    name = "reader_test",
-    size = "small",
-    srcs = ["reader_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":reader",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "util",
-    srcs = ["util.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_binary(
-    name = "ptb_word_lm",
-    srcs = [
-        "ptb_word_lm.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":reader",
-        ":util",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tutorials/rnn/ptb/__init__.py b/tutorials/rnn/ptb/__init__.py
deleted file mode 100644
index 47ba9a74fb1..00000000000
--- a/tutorials/rnn/ptb/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Makes helper libraries available in the ptb package."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import reader
-import util
diff --git a/tutorials/rnn/ptb/ptb_word_lm.py b/tutorials/rnn/ptb/ptb_word_lm.py
deleted file mode 100644
index 502863de3f2..00000000000
--- a/tutorials/rnn/ptb/ptb_word_lm.py
+++ /dev/null
@@ -1,529 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Example / benchmark for building a PTB LSTM model.
-
-Trains the model described in:
-(Zaremba, et. al.) Recurrent Neural Network Regularization
-http://arxiv.org/abs/1409.2329
-
-There are 3 supported model configurations:
-===========================================
-| config | epochs | train | valid  | test
-===========================================
-| small  | 13     | 37.99 | 121.39 | 115.91
-| medium | 39     | 48.45 |  86.16 |  82.07
-| large  | 55     | 37.87 |  82.62 |  78.29
-The exact results may vary depending on the random initialization.
-
-The hyperparameters used in the model:
-- init_scale - the initial scale of the weights
-- learning_rate - the initial value of the learning rate
-- max_grad_norm - the maximum permissible norm of the gradient
-- num_layers - the number of LSTM layers
-- num_steps - the number of unrolled steps of LSTM
-- hidden_size - the number of LSTM units
-- max_epoch - the number of epochs trained with the initial learning rate
-- max_max_epoch - the total number of epochs for training
-- keep_prob - the probability of keeping weights in the dropout layer
-- lr_decay - the decay of the learning rate for each epoch after "max_epoch"
-- batch_size - the batch size
-- rnn_mode - the low level implementation of lstm cell: one of CUDNN,
-             BASIC, or BLOCK, representing cudnn_lstm, basic_lstm, and
-             lstm_block_cell classes.
-
-The data required for this example is in the data/ dir of the
-PTB dataset from Tomas Mikolov's webpage:
-
-$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
-$ tar xvf simple-examples.tgz
-
-To run:
-
-$ python ptb_word_lm.py --data_path=simple-examples/data/
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-import tensorflow as tf
-
-import reader
-import util
-
-from tensorflow.python.client import device_lib
-
-from distutils.version import StrictVersion
-
-flags = tf.flags
-logging = tf.logging
-
-flags.DEFINE_string(
-    "model", "small",
-    "A type of model. Possible options are: small, medium, large.")
-flags.DEFINE_string("data_path", None,
-                    "Where the training/test data is stored.")
-flags.DEFINE_string("save_path", None,
-                    "Model output directory.")
-flags.DEFINE_bool("use_fp16", False,
-                  "Train using 16-bit floats instead of 32bit floats")
-flags.DEFINE_integer("num_gpus", 1,
-                     "If larger than 1, Grappler AutoParallel optimizer "
-                     "will create multiple training replicas with each GPU "
-                     "running one replica.")
-flags.DEFINE_string("rnn_mode", None,
-                    "The low level implementation of lstm cell: one of CUDNN, "
-                    "BASIC, and BLOCK, representing cudnn_lstm, basic_lstm, "
-                    "and lstm_block_cell classes.")
-FLAGS = flags.FLAGS
-BASIC = "basic"
-CUDNN = "cudnn"
-BLOCK = "block"
-
-
-def data_type():
-  return tf.float16 if FLAGS.use_fp16 else tf.float32
-
-
-class PTBInput(object):
-  """The input data."""
-
-  def __init__(self, config, data, name=None):
-    self.batch_size = batch_size = config.batch_size
-    self.num_steps = num_steps = config.num_steps
-    self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
-    self.input_data, self.targets = reader.ptb_producer(
-        data, batch_size, num_steps, name=name)
-
-
-class PTBModel(object):
-  """The PTB model."""
-
-  def __init__(self, is_training, config, input_):
-    self._is_training = is_training
-    self._input = input_
-    self._rnn_params = None
-    self._cell = None
-    self.batch_size = input_.batch_size
-    self.num_steps = input_.num_steps
-    size = config.hidden_size
-    vocab_size = config.vocab_size
-
-    with tf.device("/cpu:0"):
-      embedding = tf.get_variable(
-          "embedding", [vocab_size, size], dtype=data_type())
-      inputs = tf.nn.embedding_lookup(embedding, input_.input_data)
-
-    if is_training and config.keep_prob < 1:
-      inputs = tf.nn.dropout(inputs, config.keep_prob)
-
-    output, state = self._build_rnn_graph(inputs, config, is_training)
-
-    softmax_w = tf.get_variable(
-        "softmax_w", [size, vocab_size], dtype=data_type())
-    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
-    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
-     # Reshape logits to be a 3-D tensor for sequence loss
-    logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])
-
-    # Use the contrib sequence loss and average over the batches
-    loss = tf.contrib.seq2seq.sequence_loss(
-        logits,
-        input_.targets,
-        tf.ones([self.batch_size, self.num_steps], dtype=data_type()),
-        average_across_timesteps=False,
-        average_across_batch=True)
-
-    # Update the cost
-    self._cost = tf.reduce_sum(loss)
-    self._final_state = state
-
-    if not is_training:
-      return
-
-    self._lr = tf.Variable(0.0, trainable=False)
-    tvars = tf.trainable_variables()
-    grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars),
-                                      config.max_grad_norm)
-    optimizer = tf.train.GradientDescentOptimizer(self._lr)
-    self._train_op = optimizer.apply_gradients(
-        zip(grads, tvars),
-        global_step=tf.train.get_or_create_global_step())
-
-    self._new_lr = tf.placeholder(
-        tf.float32, shape=[], name="new_learning_rate")
-    self._lr_update = tf.assign(self._lr, self._new_lr)
-
-  def _build_rnn_graph(self, inputs, config, is_training):
-    if config.rnn_mode == CUDNN:
-      return self._build_rnn_graph_cudnn(inputs, config, is_training)
-    else:
-      return self._build_rnn_graph_lstm(inputs, config, is_training)
-
-  def _build_rnn_graph_cudnn(self, inputs, config, is_training):
-    """Build the inference graph using CUDNN cell."""
-    inputs = tf.transpose(inputs, [1, 0, 2])
-    self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(
-        num_layers=config.num_layers,
-        num_units=config.hidden_size,
-        input_size=config.hidden_size,
-        dropout=1 - config.keep_prob if is_training else 0)
-    params_size_t = self._cell.params_size()
-    self._rnn_params = tf.get_variable(
-        "lstm_params",
-        initializer=tf.random_uniform(
-            [params_size_t], -config.init_scale, config.init_scale),
-        validate_shape=False)
-    c = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
-                 tf.float32)
-    h = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
-                 tf.float32)
-    self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)
-    outputs, h, c = self._cell(inputs, h, c, self._rnn_params, is_training)
-    outputs = tf.transpose(outputs, [1, 0, 2])
-    outputs = tf.reshape(outputs, [-1, config.hidden_size])
-    return outputs, (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)
-
-  def _get_lstm_cell(self, config, is_training):
-    if config.rnn_mode == BASIC:
-      return tf.contrib.rnn.BasicLSTMCell(
-          config.hidden_size, forget_bias=0.0, state_is_tuple=True,
-          reuse=not is_training)
-    if config.rnn_mode == BLOCK:
-      return tf.contrib.rnn.LSTMBlockCell(
-          config.hidden_size, forget_bias=0.0)
-    raise ValueError("rnn_mode %s not supported" % config.rnn_mode)
-
-  def _build_rnn_graph_lstm(self, inputs, config, is_training):
-    """Build the inference graph using canonical LSTM cells."""
-    # Slightly better results can be obtained with forget gate biases
-    # initialized to 1 but the hyperparameters of the model would need to be
-    # different than reported in the paper.
-    def make_cell():
-      cell = self._get_lstm_cell(config, is_training)
-      if is_training and config.keep_prob < 1:
-        cell = tf.contrib.rnn.DropoutWrapper(
-            cell, output_keep_prob=config.keep_prob)
-      return cell
-
-    cell = tf.contrib.rnn.MultiRNNCell(
-        [make_cell() for _ in range(config.num_layers)], state_is_tuple=True)
-
-    self._initial_state = cell.zero_state(config.batch_size, data_type())
-    state = self._initial_state
-    # Simplified version of tf.nn.static_rnn().
-    # This builds an unrolled LSTM for tutorial purposes only.
-    # In general, use tf.nn.static_rnn() or tf.nn.static_state_saving_rnn().
-    #
-    # The alternative version of the code below is:
-    #
-    # inputs = tf.unstack(inputs, num=self.num_steps, axis=1)
-    # outputs, state = tf.nn.static_rnn(cell, inputs,
-    #                                   initial_state=self._initial_state)
-    outputs = []
-    with tf.variable_scope("RNN"):
-      for time_step in range(self.num_steps):
-        if time_step > 0: tf.get_variable_scope().reuse_variables()
-        (cell_output, state) = cell(inputs[:, time_step, :], state)
-        outputs.append(cell_output)
-    output = tf.reshape(tf.concat(outputs, 1), [-1, config.hidden_size])
-    return output, state
-
-  def assign_lr(self, session, lr_value):
-    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
-
-  def export_ops(self, name):
-    """Exports ops to collections."""
-    self._name = name
-    ops = {util.with_prefix(self._name, "cost"): self._cost}
-    if self._is_training:
-      ops.update(lr=self._lr, new_lr=self._new_lr, lr_update=self._lr_update)
-      if self._rnn_params:
-        ops.update(rnn_params=self._rnn_params)
-    for name, op in ops.items():
-      tf.add_to_collection(name, op)
-    self._initial_state_name = util.with_prefix(self._name, "initial")
-    self._final_state_name = util.with_prefix(self._name, "final")
-    util.export_state_tuples(self._initial_state, self._initial_state_name)
-    util.export_state_tuples(self._final_state, self._final_state_name)
-
-  def import_ops(self):
-    """Imports ops from collections."""
-    if self._is_training:
-      self._train_op = tf.get_collection_ref("train_op")[0]
-      self._lr = tf.get_collection_ref("lr")[0]
-      self._new_lr = tf.get_collection_ref("new_lr")[0]
-      self._lr_update = tf.get_collection_ref("lr_update")[0]
-      rnn_params = tf.get_collection_ref("rnn_params")
-      if self._cell and rnn_params:
-        params_saveable = tf.contrib.cudnn_rnn.RNNParamsSaveable(
-            self._cell,
-            self._cell.params_to_canonical,
-            self._cell.canonical_to_params,
-            rnn_params,
-            base_variable_scope="Model/RNN")
-        tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
-    self._cost = tf.get_collection_ref(util.with_prefix(self._name, "cost"))[0]
-    num_replicas = FLAGS.num_gpus if self._name == "Train" else 1
-    self._initial_state = util.import_state_tuples(
-        self._initial_state, self._initial_state_name, num_replicas)
-    self._final_state = util.import_state_tuples(
-        self._final_state, self._final_state_name, num_replicas)
-
-  @property
-  def input(self):
-    return self._input
-
-  @property
-  def initial_state(self):
-    return self._initial_state
-
-  @property
-  def cost(self):
-    return self._cost
-
-  @property
-  def final_state(self):
-    return self._final_state
-
-  @property
-  def lr(self):
-    return self._lr
-
-  @property
-  def train_op(self):
-    return self._train_op
-
-  @property
-  def initial_state_name(self):
-    return self._initial_state_name
-
-  @property
-  def final_state_name(self):
-    return self._final_state_name
-
-
-class SmallConfig(object):
-  """Small config."""
-  init_scale = 0.1
-  learning_rate = 1.0
-  max_grad_norm = 5
-  num_layers = 2
-  num_steps = 20
-  hidden_size = 200
-  max_epoch = 4
-  max_max_epoch = 13
-  keep_prob = 1.0
-  lr_decay = 0.5
-  batch_size = 20
-  vocab_size = 10000
-  rnn_mode = BLOCK
-
-
-class MediumConfig(object):
-  """Medium config."""
-  init_scale = 0.05
-  learning_rate = 1.0
-  max_grad_norm = 5
-  num_layers = 2
-  num_steps = 35
-  hidden_size = 650
-  max_epoch = 6
-  max_max_epoch = 39
-  keep_prob = 0.5
-  lr_decay = 0.8
-  batch_size = 20
-  vocab_size = 10000
-  rnn_mode = BLOCK
-
-
-class LargeConfig(object):
-  """Large config."""
-  init_scale = 0.04
-  learning_rate = 1.0
-  max_grad_norm = 10
-  num_layers = 2
-  num_steps = 35
-  hidden_size = 1500
-  max_epoch = 14
-  max_max_epoch = 55
-  keep_prob = 0.35
-  lr_decay = 1 / 1.15
-  batch_size = 20
-  vocab_size = 10000
-  rnn_mode = BLOCK
-
-
-class TestConfig(object):
-  """Tiny config, for testing."""
-  init_scale = 0.1
-  learning_rate = 1.0
-  max_grad_norm = 1
-  num_layers = 1
-  num_steps = 2
-  hidden_size = 2
-  max_epoch = 1
-  max_max_epoch = 1
-  keep_prob = 1.0
-  lr_decay = 0.5
-  batch_size = 20
-  vocab_size = 10000
-  rnn_mode = BLOCK
-
-
-def run_epoch(session, model, eval_op=None, verbose=False):
-  """Runs the model on the given data."""
-  start_time = time.time()
-  costs = 0.0
-  iters = 0
-  state = session.run(model.initial_state)
-
-  fetches = {
-      "cost": model.cost,
-      "final_state": model.final_state,
-  }
-  if eval_op is not None:
-    fetches["eval_op"] = eval_op
-
-  for step in range(model.input.epoch_size):
-    feed_dict = {}
-    for i, (c, h) in enumerate(model.initial_state):
-      feed_dict[c] = state[i].c
-      feed_dict[h] = state[i].h
-
-    vals = session.run(fetches, feed_dict)
-    cost = vals["cost"]
-    state = vals["final_state"]
-
-    costs += cost
-    iters += model.input.num_steps
-
-    if verbose and step % (model.input.epoch_size // 10) == 10:
-      print("%.3f perplexity: %.3f speed: %.0f wps" %
-            (step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
-             iters * model.input.batch_size * max(1, FLAGS.num_gpus) /
-             (time.time() - start_time)))
-
-  return np.exp(costs / iters)
-
-
-def get_config():
-  """Get model config."""
-  config = None
-  if FLAGS.model == "small":
-    config = SmallConfig()
-  elif FLAGS.model == "medium":
-    config = MediumConfig()
-  elif FLAGS.model == "large":
-    config = LargeConfig()
-  elif FLAGS.model == "test":
-    config = TestConfig()
-  else:
-    raise ValueError("Invalid model: %s", FLAGS.model)
-  if FLAGS.rnn_mode:
-    config.rnn_mode = FLAGS.rnn_mode
-  if FLAGS.num_gpus != 1 or StrictVersion(tf.__version__) < StrictVersion("1.3.0") :
-    config.rnn_mode = BASIC
-  return config
-
-
-def main(_):
-  if not FLAGS.data_path:
-    raise ValueError("Must set --data_path to PTB data directory")
-  gpus = [
-      x.name for x in device_lib.list_local_devices() if x.device_type == "GPU"
-  ]
-  if FLAGS.num_gpus > len(gpus):
-    raise ValueError(
-        "Your machine has only %d gpus "
-        "which is less than the requested --num_gpus=%d."
-        % (len(gpus), FLAGS.num_gpus))
-
-  raw_data = reader.ptb_raw_data(FLAGS.data_path)
-  train_data, valid_data, test_data, _ = raw_data
-
-  config = get_config()
-  eval_config = get_config()
-  eval_config.batch_size = 1
-  eval_config.num_steps = 1
-
-  with tf.Graph().as_default():
-    initializer = tf.random_uniform_initializer(-config.init_scale,
-                                                config.init_scale)
-
-    with tf.name_scope("Train"):
-      train_input = PTBInput(config=config, data=train_data, name="TrainInput")
-      with tf.variable_scope("Model", reuse=None, initializer=initializer):
-        m = PTBModel(is_training=True, config=config, input_=train_input)
-      tf.summary.scalar("Training Loss", m.cost)
-      tf.summary.scalar("Learning Rate", m.lr)
-
-    with tf.name_scope("Valid"):
-      valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
-      with tf.variable_scope("Model", reuse=True, initializer=initializer):
-        mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
-      tf.summary.scalar("Validation Loss", mvalid.cost)
-
-    with tf.name_scope("Test"):
-      test_input = PTBInput(
-          config=eval_config, data=test_data, name="TestInput")
-      with tf.variable_scope("Model", reuse=True, initializer=initializer):
-        mtest = PTBModel(is_training=False, config=eval_config,
-                         input_=test_input)
-
-    models = {"Train": m, "Valid": mvalid, "Test": mtest}
-    for name, model in models.items():
-      model.export_ops(name)
-    metagraph = tf.train.export_meta_graph()
-    if StrictVersion(tf.__version__) < StrictVersion("1.1.0") and FLAGS.num_gpus > 1:
-      raise ValueError("num_gpus > 1 is not supported for TensorFlow versions "
-                       "below 1.1.0")
-    soft_placement = False
-    if FLAGS.num_gpus > 1:
-      soft_placement = True
-      util.auto_parallel(metagraph, m)
-
-  with tf.Graph().as_default():
-    tf.train.import_meta_graph(metagraph)
-    for model in models.values():
-      model.import_ops()
-    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
-    config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
-    with sv.managed_session(config=config_proto) as session:
-      for i in range(config.max_max_epoch):
-        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
-        m.assign_lr(session, config.learning_rate * lr_decay)
-
-        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
-        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
-                                     verbose=True)
-        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
-        valid_perplexity = run_epoch(session, mvalid)
-        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
-
-      test_perplexity = run_epoch(session, mtest)
-      print("Test Perplexity: %.3f" % test_perplexity)
-
-      if FLAGS.save_path:
-        print("Saving model to %s." % FLAGS.save_path)
-        sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tutorials/rnn/ptb/reader.py b/tutorials/rnn/ptb/reader.py
deleted file mode 100644
index da1dee32b2b..00000000000
--- a/tutorials/rnn/ptb/reader.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-
-"""Utilities for parsing PTB text files."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import sys
-
-import tensorflow as tf
-
-Py3 = sys.version_info[0] == 3
-
-def _read_words(filename):
-  with tf.gfile.GFile(filename, "r") as f:
-    if Py3:
-      return f.read().replace("\n", "<eos>").split()
-    else:
-      return f.read().decode("utf-8").replace("\n", "<eos>").split()
-
-
-def _build_vocab(filename):
-  data = _read_words(filename)
-
-  counter = collections.Counter(data)
-  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
-
-  words, _ = list(zip(*count_pairs))
-  word_to_id = dict(zip(words, range(len(words))))
-
-  return word_to_id
-
-
-def _file_to_word_ids(filename, word_to_id):
-  data = _read_words(filename)
-  return [word_to_id[word] for word in data if word in word_to_id]
-
-
-def ptb_raw_data(data_path=None):
-  """Load PTB raw data from data directory "data_path".
-
-  Reads PTB text files, converts strings to integer ids,
-  and performs mini-batching of the inputs.
-
-  The PTB dataset comes from Tomas Mikolov's webpage:
-
-  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
-
-  Args:
-    data_path: string path to the directory where simple-examples.tgz has
-      been extracted.
-
-  Returns:
-    tuple (train_data, valid_data, test_data, vocabulary)
-    where each of the data objects can be passed to PTBIterator.
-  """
-
-  train_path = os.path.join(data_path, "ptb.train.txt")
-  valid_path = os.path.join(data_path, "ptb.valid.txt")
-  test_path = os.path.join(data_path, "ptb.test.txt")
-
-  word_to_id = _build_vocab(train_path)
-  train_data = _file_to_word_ids(train_path, word_to_id)
-  valid_data = _file_to_word_ids(valid_path, word_to_id)
-  test_data = _file_to_word_ids(test_path, word_to_id)
-  vocabulary = len(word_to_id)
-  return train_data, valid_data, test_data, vocabulary
-
-
-def ptb_producer(raw_data, batch_size, num_steps, name=None):
-  """Iterate on the raw PTB data.
-
-  This chunks up raw_data into batches of examples and returns Tensors that
-  are drawn from these batches.
-
-  Args:
-    raw_data: one of the raw data outputs from ptb_raw_data.
-    batch_size: int, the batch size.
-    num_steps: int, the number of unrolls.
-    name: the name of this operation (optional).
-
-  Returns:
-    A pair of Tensors, each shaped [batch_size, num_steps]. The second element
-    of the tuple is the same data time-shifted to the right by one.
-
-  Raises:
-    tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
-  """
-  with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
-    raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)
-
-    data_len = tf.size(raw_data)
-    batch_len = data_len // batch_size
-    data = tf.reshape(raw_data[0 : batch_size * batch_len],
-                      [batch_size, batch_len])
-
-    epoch_size = (batch_len - 1) // num_steps
-    assertion = tf.assert_positive(
-        epoch_size,
-        message="epoch_size == 0, decrease batch_size or num_steps")
-    with tf.control_dependencies([assertion]):
-      epoch_size = tf.identity(epoch_size, name="epoch_size")
-
-    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
-    x = tf.strided_slice(data, [0, i * num_steps],
-                         [batch_size, (i + 1) * num_steps])
-    x.set_shape([batch_size, num_steps])
-    y = tf.strided_slice(data, [0, i * num_steps + 1],
-                         [batch_size, (i + 1) * num_steps + 1])
-    y.set_shape([batch_size, num_steps])
-    return x, y
diff --git a/tutorials/rnn/ptb/reader_test.py b/tutorials/rnn/ptb/reader_test.py
deleted file mode 100644
index ab0191aef6e..00000000000
--- a/tutorials/rnn/ptb/reader_test.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for models.tutorials.rnn.ptb.reader."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-import tensorflow as tf
-
-import reader
-
-
-class PtbReaderTest(tf.test.TestCase):
-
-  def setUp(self):
-    self._string_data = "\n".join(
-        [" hello there i am",
-         " rain as day",
-         " want some cheesy puffs ?"])
-
-  def testPtbRawData(self):
-    tmpdir = tf.test.get_temp_dir()
-    for suffix in "train", "valid", "test":
-      filename = os.path.join(tmpdir, "ptb.%s.txt" % suffix)
-      with tf.gfile.GFile(filename, "w") as fh:
-        fh.write(self._string_data)
-    # Smoke test
-    output = reader.ptb_raw_data(tmpdir)
-    self.assertEqual(len(output), 4)
-
-  def testPtbProducer(self):
-    raw_data = [4, 3, 2, 1, 0, 5, 6, 1, 1, 1, 1, 0, 3, 4, 1]
-    batch_size = 3
-    num_steps = 2
-    x, y = reader.ptb_producer(raw_data, batch_size, num_steps)
-    with self.test_session() as session:
-      coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
-      try:
-        xval, yval = session.run([x, y])
-        self.assertAllEqual(xval, [[4, 3], [5, 6], [1, 0]])
-        self.assertAllEqual(yval, [[3, 2], [6, 1], [0, 3]])
-        xval, yval = session.run([x, y])
-        self.assertAllEqual(xval, [[2, 1], [1, 1], [3, 4]])
-        self.assertAllEqual(yval, [[1, 0], [1, 1], [4, 1]])
-      finally:
-        coord.request_stop()
-        coord.join()
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tutorials/rnn/ptb/util.py b/tutorials/rnn/ptb/util.py
deleted file mode 100644
index f23581e69b2..00000000000
--- a/tutorials/rnn/ptb/util.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for Grappler autoparallel optimizer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.core.framework import variable_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-
-FLAGS = tf.flags.FLAGS
-
-
-def export_state_tuples(state_tuples, name):
-  for state_tuple in state_tuples:
-    tf.add_to_collection(name, state_tuple.c)
-    tf.add_to_collection(name, state_tuple.h)
-
-
-def import_state_tuples(state_tuples, name, num_replicas):
-  restored = []
-  for i in range(len(state_tuples) * num_replicas):
-    c = tf.get_collection_ref(name)[2 * i + 0]
-    h = tf.get_collection_ref(name)[2 * i + 1]
-    restored.append(tf.contrib.rnn.LSTMStateTuple(c, h))
-  return tuple(restored)
-
-
-def with_prefix(prefix, name):
-  """Adds prefix to name."""
-  return "/".join((prefix, name))
-
-
-def with_autoparallel_prefix(replica_id, name):
-  return with_prefix("AutoParallel-Replica-%d" % replica_id, name)
-
-
-class UpdateCollection(object):
-  """Update collection info in MetaGraphDef for AutoParallel optimizer."""
-
-  def __init__(self, metagraph, model):
-    self._metagraph = metagraph
-    self.replicate_states(model.initial_state_name)
-    self.replicate_states(model.final_state_name)
-    self.update_snapshot_name("variables")
-    self.update_snapshot_name("trainable_variables")
-
-  def update_snapshot_name(self, var_coll_name):
-    var_list = self._metagraph.collection_def[var_coll_name]
-    for i, value in enumerate(var_list.bytes_list.value):
-      var_def = variable_pb2.VariableDef()
-      var_def.ParseFromString(value)
-      # Somehow node Model/global_step/read doesn't have any fanout and seems to
-      # be only used for snapshot; this is different from all other variables.
-      if var_def.snapshot_name != "Model/global_step/read:0":
-        var_def.snapshot_name = with_autoparallel_prefix(
-            0, var_def.snapshot_name)
-      value = var_def.SerializeToString()
-      var_list.bytes_list.value[i] = value
-
-  def replicate_states(self, state_coll_name):
-    state_list = self._metagraph.collection_def[state_coll_name]
-    num_states = len(state_list.node_list.value)
-    for replica_id in range(1, FLAGS.num_gpus):
-      for i in range(num_states):
-        state_list.node_list.value.append(state_list.node_list.value[i])
-    for replica_id in range(FLAGS.num_gpus):
-      for i in range(num_states):
-        index = replica_id * num_states + i
-        state_list.node_list.value[index] = with_autoparallel_prefix(
-            replica_id, state_list.node_list.value[index])
-
-
-def auto_parallel(metagraph, model):
-  from tensorflow.python.grappler import tf_optimizer
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
-  rewriter_config.optimizers.append("autoparallel")
-  rewriter_config.auto_parallel.enable = True
-  rewriter_config.auto_parallel.num_replicas = FLAGS.num_gpus
-  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
-  metagraph.graph_def.CopyFrom(optimized_graph)
-  UpdateCollection(metagraph, model)
diff --git a/tutorials/rnn/quickdraw/BUILD b/tutorials/rnn/quickdraw/BUILD
deleted file mode 100644
index 33c3faeced4..00000000000
--- a/tutorials/rnn/quickdraw/BUILD
+++ /dev/null
@@ -1,42 +0,0 @@
-# Description:
-# Example classification model on Quick, Draw! dataset.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "train_model",
-    srcs = [
-        "train_model.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//third_party/py/tensorflow",
-    ],
-)
-
-py_binary(
-    name = "create_dataset",
-    srcs = [
-        "create_dataset.py",
-    ],
-    deps = [
-        "//third_party/py/numpy",
-        "//third_party/py/tensorflow",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//third_party/tensorflow:__subpackages__"],
-)
diff --git a/tutorials/rnn/quickdraw/create_dataset.py b/tutorials/rnn/quickdraw/create_dataset.py
deleted file mode 100644
index af2f019ebac..00000000000
--- a/tutorials/rnn/quickdraw/create_dataset.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-r"""Creates training and eval data from Quickdraw NDJSON files.
-
-This tool reads the NDJSON files from https://quickdraw.withgoogle.com/data
-and converts them into tensorflow.Example stored in TFRecord files.
-
-The tensorflow example will contain 3 features:
- shape - contains the shape of the sequence [length, dim] where dim=3.
- class_index - the class index of the class for the example.
- ink - a length * dim vector of the ink.
-
-It creates disjoint training and evaluation sets.
-
-python create_dataset.py \
-  --ndjson_path ${HOME}/ndjson \
-  --output_path ${HOME}/tfrecord
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import json
-import os
-import random
-import sys
-import numpy as np
-import tensorflow as tf
-
-
-def parse_line(ndjson_line):
-  """Parse an ndjson line and return ink (as np array) and classname."""
-  sample = json.loads(ndjson_line)
-  class_name = sample["word"]
-  if not class_name:
-    print ("Empty classname")
-    return None, None
-  inkarray = sample["drawing"]
-  stroke_lengths = [len(stroke[0]) for stroke in inkarray]
-  total_points = sum(stroke_lengths)
-  np_ink = np.zeros((total_points, 3), dtype=np.float32)
-  current_t = 0
-  if not inkarray:
-    print("Empty inkarray")
-    return None, None
-  for stroke in inkarray:
-    if len(stroke[0]) != len(stroke[1]):
-      print("Inconsistent number of x and y coordinates.")
-      return None, None
-    for i in [0, 1]:
-      np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i]
-    current_t += len(stroke[0])
-    np_ink[current_t - 1, 2] = 1  # stroke_end
-  # Preprocessing.
-  # 1. Size normalization.
-  lower = np.min(np_ink[:, 0:2], axis=0)
-  upper = np.max(np_ink[:, 0:2], axis=0)
-  scale = upper - lower
-  scale[scale == 0] = 1
-  np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale
-  # 2. Compute deltas.
-  np_ink[1:, 0:2] -= np_ink[0:-1, 0:2]
-  np_ink = np_ink[1:, :]
-  return np_ink, class_name
-
-
-def convert_data(trainingdata_dir,
-                 observations_per_class,
-                 output_file,
-                 classnames,
-                 output_shards=10,
-                 offset=0):
-  """Convert training data from ndjson files into tf.Example in tf.Record.
-
-  Args:
-   trainingdata_dir: path to the directory containin the training data.
-     The training data is stored in that directory as ndjson files.
-   observations_per_class: the number of items to load per class.
-   output_file: path where to write the output.
-   classnames: array with classnames - is auto created if not passed in.
-   output_shards: the number of shards to write the output in.
-   offset: the number of items to skip at the beginning of each file.
-
-  Returns:
-    classnames: the class names as strings. classnames[classes[i]] is the
-      textual representation of the class of the i-th data point.
-  """
-
-  def _pick_output_shard():
-    return random.randint(0, output_shards - 1)
-
-  file_handles = []
-  # Open all input files.
-  for filename in sorted(tf.gfile.ListDirectory(trainingdata_dir)):
-    if not filename.endswith(".ndjson"):
-      print("Skipping", filename)
-      continue
-    file_handles.append(
-        tf.gfile.GFile(os.path.join(trainingdata_dir, filename), "r"))
-    if offset:  # Fast forward all files to skip the offset.
-      count = 0
-      for _ in file_handles[-1]:
-        count += 1
-        if count == offset:
-          break
-
-  writers = []
-  for i in range(FLAGS.output_shards):
-    writers.append(
-        tf.python_io.TFRecordWriter("%s-%05i-of-%05i" % (output_file, i,
-                                                         output_shards)))
-
-  reading_order = list(range(len(file_handles))) * observations_per_class
-  random.shuffle(reading_order)
-
-  for c in reading_order:
-    line = file_handles[c].readline()
-    ink = None
-    while ink is None:
-      ink, class_name = parse_line(line)
-      if ink is None:
-        print ("Couldn't parse ink from '" + line + "'.")
-    if class_name not in classnames:
-      classnames.append(class_name)
-    features = {}
-    features["class_index"] = tf.train.Feature(int64_list=tf.train.Int64List(
-        value=[classnames.index(class_name)]))
-    features["ink"] = tf.train.Feature(float_list=tf.train.FloatList(
-        value=ink.flatten()))
-    features["shape"] = tf.train.Feature(int64_list=tf.train.Int64List(
-        value=ink.shape))
-    f = tf.train.Features(feature=features)
-    example = tf.train.Example(features=f)
-    writers[_pick_output_shard()].write(example.SerializeToString())
-
-  # Close all files
-  for w in writers:
-    w.close()
-  for f in file_handles:
-    f.close()
-  # Write the class list.
-  with tf.gfile.GFile(output_file + ".classes", "w") as f:
-    for class_name in classnames:
-      f.write(class_name + "\n")
-  return classnames
-
-
-def main(argv):
-  del argv
-  classnames = convert_data(
-      FLAGS.ndjson_path,
-      FLAGS.train_observations_per_class,
-      os.path.join(FLAGS.output_path, "training.tfrecord"),
-      classnames=[],
-      output_shards=FLAGS.output_shards,
-      offset=0)
-  convert_data(
-      FLAGS.ndjson_path,
-      FLAGS.eval_observations_per_class,
-      os.path.join(FLAGS.output_path, "eval.tfrecord"),
-      classnames=classnames,
-      output_shards=FLAGS.output_shards,
-      offset=FLAGS.train_observations_per_class)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--ndjson_path",
-      type=str,
-      default="",
-      help="Directory where the ndjson files are stored.")
-  parser.add_argument(
-      "--output_path",
-      type=str,
-      default="",
-      help="Directory where to store the output TFRecord files.")
-  parser.add_argument(
-      "--train_observations_per_class",
-      type=int,
-      default=10000,
-      help="How many items per class to load for training.")
-  parser.add_argument(
-      "--eval_observations_per_class",
-      type=int,
-      default=1000,
-      help="How many items per class to load for evaluation.")
-  parser.add_argument(
-      "--output_shards",
-      type=int,
-      default=10,
-      help="Number of shards for the output.")
-
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tutorials/rnn/quickdraw/train_model.py b/tutorials/rnn/quickdraw/train_model.py
deleted file mode 100644
index f98d8202355..00000000000
--- a/tutorials/rnn/quickdraw/train_model.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-r"""Binary for training a RNN-based classifier for the Quick, Draw! data.
-
-python train_model.py \
-  --training_data train_data \
-  --eval_data eval_data \
-  --model_dir /tmp/quickdraw_model/ \
-  --cell_type cudnn_lstm
-
-When running on GPUs using --cell_type cudnn_lstm is much faster.
-
-The expected performance is ~75% in 1.5M steps with the default configuration.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import argparse
-import ast
-import functools
-import sys
-
-import tensorflow as tf
-
-
-def get_num_classes():
-  classes = []
-  with tf.gfile.GFile(FLAGS.classes_file, "r") as f:
-    classes = [x for x in f]
-  num_classes = len(classes)
-  return num_classes
-
-
-def get_input_fn(mode, tfrecord_pattern, batch_size):
-  """Creates an input_fn that stores all the data in memory.
-
-  Args:
-   mode: one of tf.contrib.learn.ModeKeys.{TRAIN, INFER, EVAL}
-   tfrecord_pattern: path to a TF record file created using create_dataset.py.
-   batch_size: the batch size to output.
-
-  Returns:
-    A valid input_fn for the model estimator.
-  """
-
-  def _parse_tfexample_fn(example_proto, mode):
-    """Parse a single record which is expected to be a tensorflow.Example."""
-    feature_to_type = {
-        "ink": tf.VarLenFeature(dtype=tf.float32),
-        "shape": tf.FixedLenFeature([2], dtype=tf.int64)
-    }
-    if mode != tf.estimator.ModeKeys.PREDICT:
-      # The labels won't be available at inference time, so don't add them
-      # to the list of feature_columns to be read.
-      feature_to_type["class_index"] = tf.FixedLenFeature([1], dtype=tf.int64)
-
-    parsed_features = tf.parse_single_example(example_proto, feature_to_type)
-    labels = None
-    if mode != tf.estimator.ModeKeys.PREDICT:
-      labels = parsed_features["class_index"]
-    parsed_features["ink"] = tf.sparse_tensor_to_dense(parsed_features["ink"])
-    return parsed_features, labels
-
-  def _input_fn():
-    """Estimator `input_fn`.
-
-    Returns:
-      A tuple of:
-      - Dictionary of string feature name to `Tensor`.
-      - `Tensor` of target labels.
-    """
-    dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      dataset = dataset.shuffle(buffer_size=10)
-    dataset = dataset.repeat()
-    # Preprocesses 10 files concurrently and interleaves records from each file.
-    dataset = dataset.interleave(
-        tf.data.TFRecordDataset,
-        cycle_length=10,
-        block_length=1)
-    dataset = dataset.map(
-        functools.partial(_parse_tfexample_fn, mode=mode),
-        num_parallel_calls=10)
-    dataset = dataset.prefetch(10000)
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      dataset = dataset.shuffle(buffer_size=1000000)
-    # Our inputs are variable length, so pad them.
-    dataset = dataset.padded_batch(
-        batch_size, padded_shapes=dataset.output_shapes)
-    features, labels = dataset.make_one_shot_iterator().get_next()
-    return features, labels
-
-  return _input_fn
-
-
-def model_fn(features, labels, mode, params):
-  """Model function for RNN classifier.
-
-  This function sets up a neural network which applies convolutional layers (as
-  configured with params.num_conv and params.conv_len) to the input.
-  The output of the convolutional layers is given to LSTM layers (as configured
-  with params.num_layers and params.num_nodes).
-  The final state of the all LSTM layers are concatenated and fed to a fully
-  connected layer to obtain the final classification scores.
-
-  Args:
-    features: dictionary with keys: inks, lengths.
-    labels: one hot encoded classes
-    mode: one of tf.estimator.ModeKeys.{TRAIN, INFER, EVAL}
-    params: a parameter dictionary with the following keys: num_layers,
-      num_nodes, batch_size, num_conv, conv_len, num_classes, learning_rate.
-
-  Returns:
-    ModelFnOps for Estimator API.
-  """
-
-  def _get_input_tensors(features, labels):
-    """Converts the input dict into inks, lengths, and labels tensors."""
-    # features[ink] is a sparse tensor that is [8, batch_maxlen, 3]
-    # inks will be a dense tensor of [8, maxlen, 3]
-    # shapes is [batchsize, 2]
-    shapes = features["shape"]
-    # lengths will be [batch_size]
-    lengths = tf.squeeze(
-        tf.slice(shapes, begin=[0, 0], size=[params.batch_size, 1]))
-    inks = tf.reshape(features["ink"], [params.batch_size, -1, 3])
-    if labels is not None:
-      labels = tf.squeeze(labels)
-    return inks, lengths, labels
-
-  def _add_conv_layers(inks, lengths):
-    """Adds convolution layers."""
-    convolved = inks
-    for i in range(len(params.num_conv)):
-      convolved_input = convolved
-      if params.batch_norm:
-        convolved_input = tf.layers.batch_normalization(
-            convolved_input,
-            training=(mode == tf.estimator.ModeKeys.TRAIN))
-      # Add dropout layer if enabled and not first convolution layer.
-      if i > 0 and params.dropout:
-        convolved_input = tf.layers.dropout(
-            convolved_input,
-            rate=params.dropout,
-            training=(mode == tf.estimator.ModeKeys.TRAIN))
-      convolved = tf.layers.conv1d(
-          convolved_input,
-          filters=params.num_conv[i],
-          kernel_size=params.conv_len[i],
-          activation=None,
-          strides=1,
-          padding="same",
-          name="conv1d_%d" % i)
-    return convolved, lengths
-
-  def _add_regular_rnn_layers(convolved, lengths):
-    """Adds RNN layers."""
-    if params.cell_type == "lstm":
-      cell = tf.nn.rnn_cell.BasicLSTMCell
-    elif params.cell_type == "block_lstm":
-      cell = tf.contrib.rnn.LSTMBlockCell
-    cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)]
-    cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)]
-    if params.dropout > 0.0:
-      cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw]
-      cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw]
-    outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
-        cells_fw=cells_fw,
-        cells_bw=cells_bw,
-        inputs=convolved,
-        sequence_length=lengths,
-        dtype=tf.float32,
-        scope="rnn_classification")
-    return outputs
-
-  def _add_cudnn_rnn_layers(convolved):
-    """Adds CUDNN LSTM layers."""
-    # Convolutions output [B, L, Ch], while CudnnLSTM is time-major.
-    convolved = tf.transpose(convolved, [1, 0, 2])
-    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
-        num_layers=params.num_layers,
-        num_units=params.num_nodes,
-        dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0,
-        direction="bidirectional")
-    outputs, _ = lstm(convolved)
-    # Convert back from time-major outputs to batch-major outputs.
-    outputs = tf.transpose(outputs, [1, 0, 2])
-    return outputs
-
-  def _add_rnn_layers(convolved, lengths):
-    """Adds recurrent neural network layers depending on the cell type."""
-    if params.cell_type != "cudnn_lstm":
-      outputs = _add_regular_rnn_layers(convolved, lengths)
-    else:
-      outputs = _add_cudnn_rnn_layers(convolved)
-    # outputs is [batch_size, L, N] where L is the maximal sequence length and N
-    # the number of nodes in the last layer.
-    mask = tf.tile(
-        tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
-        [1, 1, tf.shape(outputs)[2]])
-    zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
-    outputs = tf.reduce_sum(zero_outside, axis=1)
-    return outputs
-
-  def _add_fc_layers(final_state):
-    """Adds a fully connected layer."""
-    return tf.layers.dense(final_state, params.num_classes)
-
-  # Build the model.
-  inks, lengths, labels = _get_input_tensors(features, labels)
-  convolved, lengths = _add_conv_layers(inks, lengths)
-  final_state = _add_rnn_layers(convolved, lengths)
-  logits = _add_fc_layers(final_state)
-  # Add the loss.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.sparse_softmax_cross_entropy_with_logits(
-          labels=labels, logits=logits))
-  # Add the optimizer.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss=cross_entropy,
-      global_step=tf.train.get_global_step(),
-      learning_rate=params.learning_rate,
-      optimizer="Adam",
-      # some gradient clipping stabilizes training in the beginning.
-      clip_gradients=params.gradient_clipping_norm,
-      summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
-  # Compute current predictions.
-  predictions = tf.argmax(logits, axis=1)
-  return tf.estimator.EstimatorSpec(
-      mode=mode,
-      predictions={"logits": logits, "predictions": predictions},
-      loss=cross_entropy,
-      train_op=train_op,
-      eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)})
-
-
-def create_estimator_and_specs(run_config):
-  """Creates an Experiment configuration based on the estimator and input fn."""
-  model_params = tf.contrib.training.HParams(
-      num_layers=FLAGS.num_layers,
-      num_nodes=FLAGS.num_nodes,
-      batch_size=FLAGS.batch_size,
-      num_conv=ast.literal_eval(FLAGS.num_conv),
-      conv_len=ast.literal_eval(FLAGS.conv_len),
-      num_classes=get_num_classes(),
-      learning_rate=FLAGS.learning_rate,
-      gradient_clipping_norm=FLAGS.gradient_clipping_norm,
-      cell_type=FLAGS.cell_type,
-      batch_norm=FLAGS.batch_norm,
-      dropout=FLAGS.dropout)
-
-  estimator = tf.estimator.Estimator(
-      model_fn=model_fn,
-      config=run_config,
-      params=model_params)
-
-  train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
-      mode=tf.estimator.ModeKeys.TRAIN,
-      tfrecord_pattern=FLAGS.training_data,
-      batch_size=FLAGS.batch_size), max_steps=FLAGS.steps)
-
-  eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
-      mode=tf.estimator.ModeKeys.EVAL,
-      tfrecord_pattern=FLAGS.eval_data,
-      batch_size=FLAGS.batch_size))
-
-  return estimator, train_spec, eval_spec
-
-
-def main(unused_args):
-  estimator, train_spec, eval_spec = create_estimator_and_specs(
-      run_config=tf.estimator.RunConfig(
-          model_dir=FLAGS.model_dir,
-          save_checkpoints_secs=300,
-          save_summary_steps=100))
-  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--training_data",
-      type=str,
-      default="",
-      help="Path to training data (tf.Example in TFRecord format)")
-  parser.add_argument(
-      "--eval_data",
-      type=str,
-      default="",
-      help="Path to evaluation data (tf.Example in TFRecord format)")
-  parser.add_argument(
-      "--classes_file",
-      type=str,
-      default="",
-      help="Path to a file with the classes - one class per line")
-  parser.add_argument(
-      "--num_layers",
-      type=int,
-      default=3,
-      help="Number of recurrent neural network layers.")
-  parser.add_argument(
-      "--num_nodes",
-      type=int,
-      default=128,
-      help="Number of node per recurrent network layer.")
-  parser.add_argument(
-      "--num_conv",
-      type=str,
-      default="[48, 64, 96]",
-      help="Number of conv layers along with number of filters per layer.")
-  parser.add_argument(
-      "--conv_len",
-      type=str,
-      default="[5, 5, 3]",
-      help="Length of the convolution filters.")
-  parser.add_argument(
-      "--cell_type",
-      type=str,
-      default="lstm",
-      help="Cell type used for rnn layers: cudnn_lstm, lstm or block_lstm.")
-  parser.add_argument(
-      "--batch_norm",
-      type="bool",
-      default="False",
-      help="Whether to enable batch normalization or not.")
-  parser.add_argument(
-      "--learning_rate",
-      type=float,
-      default=0.0001,
-      help="Learning rate used for training.")
-  parser.add_argument(
-      "--gradient_clipping_norm",
-      type=float,
-      default=9.0,
-      help="Gradient clipping norm used during training.")
-  parser.add_argument(
-      "--dropout",
-      type=float,
-      default=0.3,
-      help="Dropout used for convolutions and bidi lstm layers.")
-  parser.add_argument(
-      "--steps",
-      type=int,
-      default=100000,
-      help="Number of training steps.")
-  parser.add_argument(
-      "--batch_size",
-      type=int,
-      default=8,
-      help="Batch size to use for training/evaluation.")
-  parser.add_argument(
-      "--model_dir",
-      type=str,
-      default="",
-      help="Path for storing the model checkpoints.")
-  parser.add_argument(
-      "--self_test",
-      type="bool",
-      default="False",
-      help="Whether to enable batch normalization or not.")
-
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)