diff --git a/official/nlp/albert/run_squad.py b/official/nlp/albert/run_squad.py
index 242e349b128..ed3c2da5372 100644
--- a/official/nlp/albert/run_squad.py
+++ b/official/nlp/albert/run_squad.py
@@ -19,9 +19,12 @@ from __future__ import print_function
 
 import json
+import os
+import time
 
 from absl import app
 from absl import flags
+from absl import logging
 import tensorflow as tf
 
 from official.nlp.albert import configs as albert_configs
@@ -53,7 +56,7 @@ def train_squad(strategy,
 
 
 def predict_squad(strategy, input_meta_data):
-  """Makes predictions for a squad dataset."""
+  """Makes predictions for the squad dataset."""
   bert_config = albert_configs.AlbertConfig.from_json_file(
       FLAGS.bert_config_file)
   tokenizer = tokenization.FullSentencePieceTokenizer(
@@ -63,6 +66,18 @@ def predict_squad(strategy, input_meta_data):
       bert_config, squad_lib_sp)
 
 
+def eval_squad(strategy, input_meta_data):
+  """Evaluate on the squad dataset."""
+  bert_config = albert_configs.AlbertConfig.from_json_file(
+      FLAGS.bert_config_file)
+  tokenizer = tokenization.FullSentencePieceTokenizer(
+      sp_model_file=FLAGS.sp_model_file)
+
+  eval_metrics = run_squad_helper.eval_squad(
+      strategy, input_meta_data, tokenizer, bert_config, squad_lib_sp)
+  return eval_metrics
+
+
 def export_squad(model_export_path, input_meta_data):
   """Exports a trained model as a `SavedModel` for inference.
 
@@ -97,10 +112,25 @@ def main(_):
       num_gpus=FLAGS.num_gpus,
       all_reduce_alg=FLAGS.all_reduce_alg,
       tpu_address=FLAGS.tpu)
-  if FLAGS.mode in ('train', 'train_and_predict'):
+
+  if 'train' in FLAGS.mode:
     train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
-  if FLAGS.mode in ('predict', 'train_and_predict'):
+  if 'predict' in FLAGS.mode:
     predict_squad(strategy, input_meta_data)
+  if 'eval' in FLAGS.mode:
+    eval_metrics = eval_squad(strategy, input_meta_data)
+    f1_score = eval_metrics['final_f1']
+    logging.info('SQuAD eval F1-score: %f', f1_score)
+    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
+    summary_writer = tf.summary.create_file_writer(summary_dir)
+    with summary_writer.as_default():
+      # TODO(lehou): write to the correct step number.
+      tf.summary.scalar('F1-score', f1_score, step=0)
+      summary_writer.flush()
+    # Also write eval_metrics to json file.
+    squad_lib_sp.write_to_json_files(
+        eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
+    time.sleep(60)
 
 
 if __name__ == '__main__':
diff --git a/official/nlp/bert/run_squad.py b/official/nlp/bert/run_squad.py
index 7ca62b13f98..2c84c7bcdee 100644
--- a/official/nlp/bert/run_squad.py
+++ b/official/nlp/bert/run_squad.py
@@ -20,7 +20,6 @@
 
 import json
 import os
-import tempfile
 import time
 
 from absl import app
@@ -130,18 +129,15 @@ def main(_):
     eval_metrics = eval_squad(strategy, input_meta_data)
     f1_score = eval_metrics['final_f1']
     logging.info('SQuAD eval F1-score: %f', f1_score)
-    if (not strategy) or strategy.extended.should_save_summary:
-      summary_dir = os.path.join(FLAGS.model_dir, 'summaries')
-    else:
-      summary_dir = tempfile.mkdtemp()
-    summary_writer = tf.summary.create_file_writer(
-        os.path.join(summary_dir, 'eval'))
+    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
+    summary_writer = tf.summary.create_file_writer(summary_dir)
     with summary_writer.as_default():
       # TODO(lehou): write to the correct step number.
       tf.summary.scalar('F1-score', f1_score, step=0)
       summary_writer.flush()
-    # Wait for some time, for the depending mldash/tensorboard jobs to finish
-    # exporting the final F1-score.
+    # Also write eval_metrics to json file.
+    squad_lib_wp.write_to_json_files(
+        eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
     time.sleep(60)
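
Note: because the new dispatch checks substring membership ('train' in FLAGS.mode) rather than exact mode names, a combined mode string such as 'train_and_eval' runs both the training and eval branches. Both files now write eval summaries the same way; a minimal self-contained sketch of that pattern follows (the model_dir value and metrics dict are illustrative, and json.dump stands in for the squad_lib write_to_json_files helper used in the diff):

    import json
    import os

    import tensorflow as tf

    model_dir = '/tmp/squad'          # illustrative; the scripts use FLAGS.model_dir
    eval_metrics = {'final_f1': 0.9}  # illustrative; returned by eval_squad

    # Write the F1 score as a TF2 summary under <model_dir>/summaries/eval.
    summary_dir = os.path.join(model_dir, 'summaries', 'eval')
    summary_writer = tf.summary.create_file_writer(summary_dir)
    with summary_writer.as_default():
      # Step 0 mirrors the placeholder noted in the TODO above.
      tf.summary.scalar('F1-score', eval_metrics['final_f1'], step=0)
      summary_writer.flush()

    # Mirror the metrics to a JSON file next to the summary event files.
    with tf.io.gfile.GFile(
        os.path.join(summary_dir, 'eval_metrics.json'), 'w') as f:
      json.dump(eval_metrics, f)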