From 574c981c140b43b4e66e7c43d6e1247b3acc842a Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Fri, 29 Jan 2016 17:40:54 -0800
Subject: [PATCH] Final tweaks for lower batch size.

---
 neural_gpu/neural_gpu.py         | 2 +-
 neural_gpu/neural_gpu_trainer.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/neural_gpu/neural_gpu.py b/neural_gpu/neural_gpu.py
index fd61a479131..f40d5d24573 100644
--- a/neural_gpu/neural_gpu.py
+++ b/neural_gpu/neural_gpu.py
@@ -151,7 +151,7 @@ def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
         tf.constant(0, dtype=tf.int32, shape=[1]),
         tf.zeros([1, vec_size]))
 
-    adam = tf.train.AdamOptimizer(0.01*self.lr, epsilon=1e-4)
+    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
 
     # Main graph creation loop, for every bin in data_utils.
     self.steps = []
diff --git a/neural_gpu/neural_gpu_trainer.py b/neural_gpu/neural_gpu_trainer.py
index ee5ed853bcd..9fe5b932ef4 100644
--- a/neural_gpu/neural_gpu_trainer.py
+++ b/neural_gpu/neural_gpu_trainer.py
@@ -31,7 +31,7 @@
 import data_utils as data
 import neural_gpu
 
-tf.app.flags.DEFINE_float("lr", 0.3, "Learning rate.")
+tf.app.flags.DEFINE_float("lr", 0.003, "Learning rate.")
 tf.app.flags.DEFINE_float("init_weight", 1.0, "Initial weights deviation.")
 tf.app.flags.DEFINE_float("max_grad_norm", 0.05, "Clip gradients to this norm.")
 tf.app.flags.DEFINE_float("cutoff", 1.2, "Cutoff at the gates.")
@@ -215,7 +215,7 @@ def train():
       start_time = time.time()
       inp, target = data.get_batch(l, batch_size, True, task)
       noise_param = math.sqrt(math.pow(global_step, -0.55) *
-                              (20 * prev_seq_err)) * FLAGS.grad_noise_scale
+                              prev_seq_err) * FLAGS.grad_noise_scale
       loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
       step_time += time.time() - start_time
       acc_grad_norm += float(gnorm)
@@ -234,7 +234,7 @@ def train():
       acc_loss /= step_count
       step_time /= FLAGS.steps_per_checkpoint
       acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
-      prev_seq_err = acc_seq_err
+      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
       acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
       msg1 = "step %d step-time %.2f" % (global_step, step_time)
       msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
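
Note: the two learning-rate hunks cancel out numerically (the old effective
Adam rate was 0.01 * 0.3 = 0.003, which is now the flag's literal value), so
the behavioral changes are to the gradient-noise schedule: the 20x multiplier
on prev_seq_err is dropped, and the noise is zeroed out once per-sequence
error falls below 2%. Below is a minimal runnable sketch of the schedule
these hunks leave in place; the helper name gradient_noise_stddev and the
example constants are hypothetical, not part of the repo:

    import math

    def gradient_noise_stddev(global_step, prev_seq_err, grad_noise_scale):
      """Gradient-noise magnitude as computed after this patch.

      Noise decays as global_step**-0.55 over training and scales with the
      recent per-sequence error (no 20x multiplier anymore).
      """
      return math.sqrt(math.pow(global_step, -0.55) *
                       prev_seq_err) * grad_noise_scale

    # prev_seq_err is floored at zero once per-sequence error drops below
    # 2%, so a nearly converged model takes noise-free gradient steps.
    acc_seq_err = 0.015  # e.g. 1.5% sequence error at the last checkpoint
    prev_seq_err = max(0.0, acc_seq_err - 0.02)
    assert gradient_noise_stddev(1000, prev_seq_err, 1.0) == 0.0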