# dqn.py (forked from buaazhangfan/Composable-RL-with-EX2)
import uuid
import time
import pickle
import sys
import gym.spaces
import itertools
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.layers as layers
from collections import namedtuple
from dqn_utils import *
from my_exemplar import Exemplar
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])

def set_global_seeds(i):
    try:
        import tensorflow as tf
    except ImportError:
        pass
    else:
        tf.set_random_seed(i)
    np.random.seed(i)
    random.seed(i)

class QLearner(object):

    def __init__(
            self,
            env,
            q_func,
            optimizer_spec,
            session,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=None,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10,
            rew_file=None,
            double_q=True,
            lander=False,
            explore='e-greedy',
            ex2=False,
            min_replay_size=10000,
            # not sure
            ex2_len=1000,
            coef=0.01,
            seed=250,
            evaluation=False,
            directory='./models/model1'):
"""Run Deep Q-learning algorithm.
You can specify your own convnet using q_func.
All schedules are w.r.t. total number of steps taken in the environment.
Parameters
----------
env: gym.Env
gym environment to train on.
q_func: function
Model to use for computing the q function. It should accept the
following named arguments:
img_in: tf.Tensor
tensorflow tensor representing the input image
num_actions: int
number of actions
scope: str
scope in which all the model related variables
should be created
reuse: bool
whether previously created variables should be reused.
optimizer_spec: OptimizerSpec
Specifying the constructor and kwargs, as well as learning rate schedule
for the optimizer
session: tf.Session
tensorflow session to use.
exploration: rl_algs.deepq.utils.schedules.Schedule
schedule for probability of chosing random action.
stopping_criterion: (env, t) -> bool
should return true when it's ok for the RL algorithm to stop.
takes in env and the number of steps executed so far.
replay_buffer_size: int
How many memories to store in the replay buffer.
batch_size: int
How many transitions to sample each time experience is replayed.
gamma: float
Discount Factor
learning_starts: int
After how many environment steps to start replaying experiences
learning_freq: int
How many steps of environment to take between every experience replay
frame_history_len: int
How many past frames to include as input to the model.
target_update_freq: int
How many experience replay rounds (not steps!) to perform between
each update to the target Q network
grad_norm_clipping: float or None
If not None gradients' norms are clipped to this value.
double_q: bool
If True, then use double Q-learning to compute target values. Otherwise, use vanilla DQN.
https://papers.nips.cc/paper/3964-double-q-learning.pdf
"""
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        self.target_update_freq = target_update_freq
        self.optimizer_spec = optimizer_spec
        self.batch_size = batch_size
        self.learning_freq = learning_freq
        self.learning_starts = learning_starts
        self.stopping_criterion = stopping_criterion
        self.env = env
        self.session = session
        self.exploration = exploration
        self.rew_file = str(uuid.uuid4()) + '.pkl' if rew_file is None else rew_file
        self.double_q = double_q
        self.explore = explore
        self.directory = directory

        # EX2
        # [1e-3, 1e-4, 1e-5]
        self.coef = coef
        self.first_train = True
        self.first_train_itrs = int(5e3)
        self.train_itrs = int(1e3)
        self.ex2 = ex2
        self.min_replay_size = min_replay_size
        self.ex2_len = ex2_len
        self.count = 0
        self.seed = seed
        self.eval = evaluation

        print('eval?', self.eval)
        print('exploration strategy', explore)
        print('using ex2', ex2)
        print('using coef', coef)
        set_global_seeds(self.seed)
        ###############
        # BUILD MODEL #
        ###############

        if len(self.env.observation_space.shape) == 1:
            # This means we are running on low-dimensional observations (e.g. RAM),
            # which is the setting used for debugging.
            input_shape = self.env.observation_space.shape
        else:
            img_h, img_w, img_c = self.env.observation_space.shape
            input_shape = (img_h, img_w, frame_history_len * img_c)
        self.num_actions = self.env.action_space.n
        if self.eval:
            saver1 = tf.train.import_meta_graph(self.directory + '.meta')
            saver1.restore(self.session, self.directory)
            self.obs_t_ph = tf.get_collection('obs_t_ph')[0]
            self.Temp = tf.get_collection('Temp')[0]
            self.keep_per = tf.get_collection('keep_per')[0]
            self.q_dist = tf.get_collection('q_dist')[0]
            self.q_t = tf.get_collection('q_t')[0]
            # EX2
            if self.ex2:
                self.ex2_in1 = tf.get_collection('ex2_in1')[0]
                self.ex2_in2 = tf.get_collection('ex2_in2')[0]
                self.ex2_dis_output = tf.get_collection('ex2_dis_output')[0]
                self.ex2_prob = tf.get_collection('ex2_prob')[0]
            self.model_initialized = True
            # print('obs is here', self.obs_t_ph)
            # print(self.Temp)
            # print(self.keep_per)
            # print(self.q_dist)
            print('restored and initialized the model')
        else:
            # set up placeholders
            # placeholder for current observation (or state)
            self.obs_t_ph = tf.placeholder(
                tf.float32 if lander else tf.uint8, [None] + list(input_shape))
            # placeholder for current action
            self.act_t_ph = tf.placeholder(tf.int32, [None])
            # placeholder for current reward
            self.rew_t_ph = tf.placeholder(tf.float32, [None])
            # placeholder for next observation (or state)
            self.obs_tp1_ph = tf.placeholder(
                tf.float32 if lander else tf.uint8, [None] + list(input_shape))
            # placeholder for end of episode mask
            # this value is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target, not the
            # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
            self.done_mask_ph = tf.placeholder(tf.float32, [None])

            # casting to float on GPU ensures lower data transfer times.
            # TO-DO: WHY?
            if lander:
                obs_t_float = self.obs_t_ph
                obs_tp1_float = self.obs_tp1_ph
            else:
                obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
                obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0
            # Here, you should fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # TensorFlow will differentiate this error for you, you just need to pass it to the
            # optimizer. See assignment text for details.
            # Your code should produce one scalar-valued tensor: total_error
            # This will be passed to the optimizer in the provided code below.
            # Your code should also produce two collections of variables:
            #     q_func_vars
            #     target_q_func_vars
            # These should hold all of the variables of the Q-function network and target network,
            # respectively. A convenient way to get these is to make use of TF's "scope" feature.
            # For example, you can create your Q-function network with the scope "q_func" like this:
            #     <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
            # And then you can obtain the variables like this:
            #     q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
            # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"
            # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error
            ######

            # YOUR CODE HERE
            # For Bayesian exploration: add a dropout keep probability to the network.
            # Get the Q-function and target networks.
            self.keep_per = tf.placeholder(shape=None, dtype=tf.float32)
            if self.explore == 'bayesian':
                print('Bayesian variables defined!')
                dropout = True
            else:
                dropout = False
            # EX2
            if self.ex2:
                print('Use Exemplar Model')
                self.exemplar = Exemplar(input_dim=input_shape[0], seed=self.seed, eval=self.eval)
            q_t = q_func(obs_t_float, self.num_actions, scope='q_func', reuse=False,
                         dropout=dropout, keep_prob=self.keep_per)
            q_tp1 = q_func(obs_tp1_float, self.num_actions, scope='target_q_func_vars', reuse=False,
                           dropout=dropout, keep_prob=self.keep_per)
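            # q_t is the online Q-network on the current observation; q_tp1 uses a separate
            # set of variables (scope 'target_q_func_vars') on the next observation and acts
            # as the target network. Dropout is only enabled for Bayesian exploration; in all
            # other paths keep_per is fed as 1.0.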
            # For Boltzmann exploration
            if self.explore == 'soft_q':
                print('Boltzmann variables defined!')
                self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
                # print(q_t)
                # value = tf.reduce_mean(q_t, 1)
                # print(value)
                # print(q_t - value)
                # print(self.q_dist)
                # exit()
                # self.q_dist = tf.nn.softmax(q_t/self.Temp)
                # # Old version
                # value = tf.log( tf.reduce_sum(tf.exp(q_t),1) )
                # self.q_dist = tf.exp(q_t - value)
                # New version
                self.q_dist = tf.nn.softmax(q_t / self.Temp)
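                # Boltzmann (softmax) policy: pi(a|s) is proportional to exp(Q(s,a) / Temp), so a
                # low temperature approaches greedy selection and a high temperature approaches
                # a uniform distribution over actions.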
            # Max operation
            self.q_t_action = tf.argmax(q_t, axis=1)
            # value = tf.reduce_mean(q_t)
            # self.q_t_action = tf.nn.softmax(q_t - value)

            # Specify the double-Q target
            if self.double_q:
                print('using double q learning')
                # TO-DO: VERY VERY IMPORTANT TO REUSE VARIABLES
                # TO-DO: DO WE NEED TO SET GRADIENT NOT UPDATE
                q_tp1_target = q_func(obs_tp1_float, self.num_actions, scope='q_func', reuse=True)
                q_tp1_target_action = tf.argmax(q_tp1_target, axis=1)
                q_tp1_max = tf.reduce_sum(q_tp1 * tf.one_hot(indices=q_tp1_target_action,
                                                             depth=self.num_actions,
                                                             on_value=1.0, off_value=0.0), axis=1)
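                # Double Q-learning: the greedy action at s' is chosen by the online network
                # (scope 'q_func', reused here as q_tp1_target) but evaluated with the target
                # network q_tp1, which reduces the overestimation bias of a plain max.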
            else:
                # Soft maximum
                if self.explore == 'soft_q':
                    print('using soft q learning')
                    # q_tp1_max = tf.log( tf.reduce_sum(tf.exp(q_tp1),1) )
                    q_tp1_max = tf.reduce_logsumexp(q_tp1, 1)
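                    # Soft Q-learning backup: the bootstrap value is the soft maximum
                    # V(s') = logsumexp_a Q(s', a) instead of the hard max over actions.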
                    # print(q_tp1_max)
                    # exit()
                else:
                    q_tp1_max = tf.reduce_max(q_tp1, 1)

            # Get the target value
            q_tp1 = gamma * (1.0 - self.done_mask_ph) * q_tp1_max
            target = self.rew_t_ph + q_tp1

            # Get Q_phi(s_i, a_i)
            # TO-DO: VERY VERY IMPORTANT! Use reduce_sum instead of reduce_max, since Q-values can be negative.
            q_t_target = tf.reduce_sum(q_t * tf.one_hot(indices=self.act_t_ph,
                                                        depth=self.num_actions,
                                                        on_value=1.0, off_value=0.0), axis=1)

            # Calculate the loss
            self.total_error = target - q_t_target
            self.total_error = tf.reduce_mean(huber_loss(self.total_error))

            # Produce collections of variables to update separately
            q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
            target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func_vars')
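            # The TD error (r + gamma * (1 - done) * V(s')) - Q(s, a) is passed through the
            # Huber loss and averaged over the batch. Only q_func_vars are handed to the
            # optimizer below, so gradient updates never touch the target network.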
            if 0:
                print(q_t.get_shape())
                print(q_tp1.get_shape())
                print(self.q_t_action.get_shape())
                print(self.done_mask_ph.get_shape())
                print(q_tp1_max.get_shape())
                print(q_tp1.get_shape())
                print(q_t_target.get_shape())
                print(self.total_error.get_shape())
                exit()
            ######

            # construct optimization op (with gradient clipping)
            self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
            optimizer = self.optimizer_spec.constructor(learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
            self.train_fn = minimize_and_clip(optimizer, self.total_error,
                                              var_list=q_func_vars, clip_val=grad_norm_clipping)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_fn = []
            for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                       sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_fn.append(var_target.assign(var))
            self.update_target_fn = tf.group(*update_target_fn)
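            # Hard target update: online and target variables are paired by sorted name and
            # copied with assign; tf.group bundles the copies into the single op that
            # update_model runs every target_update_freq parameter updates.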
        # construct the replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len, lander=lander)
        self.replay_buffer_idx = None

        ###############
        # RUN ENV     #
        ###############
        if not self.eval:
            self.model_initialized = False
        self.num_param_updates = 0
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
        # last_obs is initialized here
        self.last_obs = self.env.reset()
        self.log_every_n_steps = 10000
        self.timesteps = []
        self.mean_episode_rewards = []
        self.best_mean_episode_rewards = []
        self.start_time = None
        self.t = 0

        # EX2
        if not self.eval:
            self.saver = tf.train.Saver()
            tf.add_to_collection('obs_t_ph', self.obs_t_ph)
            if self.explore == 'soft_q':
                tf.add_to_collection('Temp', self.Temp)
                tf.add_to_collection('q_dist', self.q_dist)
            tf.add_to_collection('keep_per', self.keep_per)
            tf.add_to_collection('q_t', q_t)
            if self.ex2:
                in1, in2, dis_output, prob = self.exemplar.model.predict_tensor()
                tf.add_to_collection('ex2_in1', in1)
                tf.add_to_collection('ex2_in2', in2)
                tf.add_to_collection('ex2_dis_output', dis_output)
                tf.add_to_collection('ex2_prob', prob)
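
        # The collections registered above are what allow evaluation mode to rebuild its
        # tensor handles with tf.get_collection after import_meta_graph.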
        if self.ex2 and not self.eval:
            self.exemplar.model.init_tf_sess(self.session)
            self.model_initialized = True

        """
        # eval
        if self.eval:
            print("Initialize Evaluation Mode")
            self.saver.restore(self.session, "./bstmodel/model.ckpt")
            self.model_initialized = True
            print("Initialized models")
        """

    def stopping_criterion_met(self):
        return self.stopping_criterion is not None and self.stopping_criterion(self.env, self.t)

    def step_env(self):
        ### 2. Step the env and store the transition
        # At this point, "self.last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, self.last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #         this steps the environment forward one step
        #     obs = env.reset()
        #         this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note on "context": the network input is the last frame_history_len frames stacked
        # together; see encode_recent_observation in the replay buffer.
        # Note that you cannot use "self.last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)

        # EX2
        self.count += 1
        # Store the observation
        ret = self.replay_buffer.store_frame(self.last_obs)
        self.e_current_idx = ret
        # print(np.shape(self.last_obs))
        # For exploration, the schedule value gradually decreases.
        if self.explore == 'greedy':
            # print("using greedy exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                action = self.session.run(self.q_t_action, feed_dict={self.obs_t_ph: [recent_obs],
                                                                      self.keep_per: 1.0})
                action = action[0]
        if self.explore == 'e-greedy':
            # print("using e-greedy exploration!")
            # random.random() returns a value in [0, 1)
            if (not self.model_initialized) or (random.random() < self.exploration.value(self.t)):
                action = np.random.randint(0, self.num_actions)
            else:
                # Understanding: the context has at least two frames so velocity information can be encoded.
                # recent_obs shape: RAM (128,), lander (9,), Atari (84, 84, 4)
                # Action shape: (1,)
                # Encode the recent observation
                recent_obs = self.replay_buffer.encode_recent_observation()
                # print(np.shape(recent_obs))
                action = self.session.run(self.q_t_action, feed_dict={self.obs_t_ph: [recent_obs],
                                                                      self.keep_per: 1.0})
                action = action[0]
                # print(np.shape(action))
                # exit()
        if self.explore == 'soft_q':
            # print("using boltzmann exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                # print(recent_obs.shape)
                # print(recent_obs)
                # print(self.q_dist)
                # print(self.obs_t_ph)
                # print(self.Temp)
                # print(self.keep_per)
                # exit()
                q_d = self.session.run(self.q_dist, feed_dict={self.obs_t_ph: [recent_obs],
                                                               self.Temp: self.exploration.value(self.t),
                                                               self.keep_per: 1.0})
                if self.eval and (self.replay_buffer.num_in_buffer > self.min_replay_size) and (self.count >= self.ex2_len):
                    self.count = 0
                    # paths = self.replay_buffer.get_all_positive(self.ex2_len)
                    # ex2_out, ex2_pb = self.session.run([self.ex2_dis_output, self.ex2_prob],
                    #                                    feed_dict={self.ex2_in1: paths, self.ex2_in2: paths})
                    # print("ex2 dis_out", ex2_out)
                    # print("ex2 pb_out", ex2_pb)
                    # for _ in range(10):
                    ex2_out, ex2_pb = self.session.run([self.ex2_dis_output, self.ex2_prob],
                                                       feed_dict={self.ex2_in1: [recent_obs],
                                                                  self.ex2_in2: [recent_obs]})
                    # print("ex2_pb", ex2_pb)
                    # exit()
                if 0:
                    print('in', input_q)
                    print('qt', q_t)
                    print('qd', q_d)
                action = np.random.choice(self.num_actions, p=q_d[0])
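                # The action is sampled from the Boltzmann distribution q_d computed above;
                # the exploration schedule value is reused as the temperature, so sampling
                # becomes increasingly greedy as training proceeds.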
        if self.explore == 'bayesian':
            # print("using bayesian exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                keep_per = (1.0 - self.exploration.value(self.t)) + 0.1
                # Deal with the larger-than-1.0 case
                keep_per = 1.0 if keep_per > 1.0 else keep_per
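                # Dropout-based ('bayesian') exploration: the keep probability starts near 0.1
                # while the exploration schedule value is high and anneals toward 1.0, so Q
                # estimates are noisy early on and close to deterministic later.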
                # print(keep_per)
                action = self.session.run(self.q_t_action, feed_dict={self.obs_t_ph: [recent_obs],
                                                                      self.keep_per: keep_per})
                action = action[0]
                # print(action)
                # exit()

        # Step the environment forward one step (the action passed to env.step is an int).
        obs, reward, done, info = self.env.step(action)
        # print(np.shape(obs))
        # exit()
        # Point to the newest observation
        if done:
            obs = self.env.reset()
        self.last_obs = obs
        # Store the action, reward and done flag for the stored frame.
        self.replay_buffer.store_effect(ret, action, reward, done)

        # Update the EX2 model and rewards
        if not self.eval:
            if self.ex2 and (self.replay_buffer.num_in_buffer > self.min_replay_size) and (self.count >= self.ex2_len):
                self.count = 0
                # fit the EX2 model
                if self.first_train:
                    train_itrs = self.first_train_itrs
                    self.first_train = False
                else:
                    train_itrs = self.train_itrs
                for _ in range(train_itrs):
                    positive = self.replay_buffer.sample_positive(self.ex2_len, 128)
                    negative = self.replay_buffer.sample_negative(self.ex2_len, 128)
                    # positive_np = np.asarray(positive)
                    # print(positive_np.shape)
                    # print(self.replay_buffer.num_in_buffer)
                    # print(positive)
                    # print(len(positive))
                    # exit()
                    self.exemplar.fit(positive, negative)
                # update rewards
                paths = self.replay_buffer.get_all_positive(self.ex2_len)
                bonus_reward = self.exemplar.predict(paths)
                self.replay_buffer.update_reward(self.ex2_len, bonus_reward, self.coef)
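                # EX2-style bonus: the exemplar model is fit to discriminate the most recent
                # ex2_len transitions (positives) from older replay data (negatives), and
                # exemplar.predict supplies a novelty bonus that update_reward adds to the
                # stored rewards, scaled by self.coef.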
        if self.eval:
            self.t += 1
        # exit()
        #####
        # YOUR CODE HERE

    def update_model(self):
        ### 3. Perform experience replay and train the network.
        # Note: this step can take a long time.
        # Training is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken.
        if (self.t > self.learning_starts and
                self.t % self.learning_freq == 0 and
                self.replay_buffer.can_sample(self.batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # batch_size = 32, observation shape = 128
            obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = self.replay_buffer.sample(self.batch_size)
            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #     initialize_interdependent_variables(self.session, tf.global_variables(), {
            #         self.obs_t_ph: obs_t_batch,
            #         self.obs_tp1_ph: obs_tp1_batch,
            #     })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!
            # TO-DO: is this only initialized once, at the first start?
            if not self.model_initialized:
                print("initializing model")
                if self.ex2:
                    # already initialized in the Siamese model
                    print("Ex2 no need to initialize")
                    pass
                else:
                    print("interdependent init")
                    initialize_interdependent_variables(self.session, tf.global_variables(), {
                        self.obs_t_ph: obs_t_batch,
                        self.obs_tp1_ph: obs_tp1_batch,
                    })
                    # self.session.run(tf.global_variables_initializer())
                # TO-DO: VERY VERY IMPORTANT!!
                # self.saver = tf.train.Saver()
                print("set model_initialized True")
                self.model_initialized = True
            # 3.c: train the model. To do this, you'll need to use the self.train_fn and
            # self.total_error ops that were created earlier: self.total_error is what you
            # created to compute the total Bellman error in a batch, and self.train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling self.session.run on these you'll need to
            # populate the following placeholders:
            #     self.obs_t_ph
            #     self.act_t_ph
            #     self.rew_t_ph
            #     self.obs_tp1_ph
            #     self.done_mask_ph
            #     (these are needed for computing self.total_error)
            #     self.learning_rate -- you can get this from self.optimizer_spec.lr_schedule.value(t)
            #     (this is needed by the optimizer to choose the learning rate)
            # TO-DO: check whether the feed_dict written below is okay
            _, error = self.session.run([self.train_fn, self.total_error], feed_dict={
                self.obs_t_ph: obs_t_batch,
                self.act_t_ph: act_batch,
                self.rew_t_ph: rew_batch,
                self.obs_tp1_ph: obs_tp1_batch,
                self.done_mask_ph: done_mask,
                self.learning_rate: self.optimizer_spec.lr_schedule.value(self.t),
                self.keep_per: 1.0})
            # print('error', error)
            # exit()
            # 3.d: periodically update the target network by calling
            # self.session.run(self.update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable self.num_param_updates useful for this (it was initialized to 0)
            #####
            # YOUR CODE HERE
            self.num_param_updates += 1
            if (self.num_param_updates % self.target_update_freq == 0):
                print("actually update")
                self.session.run(self.update_target_fn)
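            # The target network is synced once every target_update_freq parameter updates
            # (not every environment step), which keeps the bootstrap targets stable
            # between syncs.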
            # exit()
        self.t += 1
        # print('self.t', self.t)

    def log_progress(self):
        # print(self.t)
        episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 50:
            if self.mean_episode_reward > self.best_mean_episode_reward:
                # store the best mean reward
                self.best_mean_episode_reward = self.mean_episode_reward
                # print("init?", self.model_initialized)
                # print("eval?", self.eval)
                if self.model_initialized and not self.eval:
                    # store the best model
                    save_path = self.saver.save(self.session, self.directory)
                    print("Model saved in path: %s" % save_path)
            # self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

        # print("Exemplar test output")
        # for _ in range(10):
        #     self.exemplar.predict(np.ones((1, 9)) / 9)

        if self.t % self.log_every_n_steps == 0 and self.model_initialized:
            print("Timestep %d" % (self.t,))
            print("mean reward (100 episodes) %f" % self.mean_episode_reward)
            print("best mean reward %f" % self.best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % self.exploration.value(self.t))
            print("learning_rate %f" % self.optimizer_spec.lr_schedule.value(self.t))
            if self.start_time is not None:
                print("running time %f" % ((time.time() - self.start_time) / 60.))
            self.start_time = time.time()
            sys.stdout.flush()

            # Store variables
            self.timesteps.append(self.t)
            self.mean_episode_rewards.append(self.mean_episode_reward)
            self.best_mean_episode_rewards.append(self.best_mean_episode_reward)
            # TO-DO: this rewrites the whole pickle file on every logging step;
            # if fewer steps are needed, we could dump only once at the end.
            with open(self.rew_file, 'wb') as f:
                store_result = {'timestep': np.array(self.timesteps), 'reward': np.array(episode_rewards),
                                'mean_reward': np.array(self.mean_episode_rewards),
                                'best_reward': np.array(self.best_mean_episode_rewards)}
                pickle.dump(store_result, f, pickle.HIGHEST_PROTOCOL)

def learn(*args, **kwargs):
    alg = QLearner(*args, **kwargs)
    while not alg.stopping_criterion_met():
        alg.step_env()
        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and self.last_obs should point to the new latest
        # observation
        if not alg.eval:
            alg.update_model()
        alg.log_progress()
        # print('end')
        # exit()