# Matt Corsaro
# Brown University CS 2951X Final Project
# Skill chaining for continuous Lunar Lander
# Original DQN code from:
# https://gist.github.com/heerad/d2b92c2f3a83b5e4be395546c17b274c#file-dqn-lunarlander-v2-py
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
from sklearn import svm
import matplotlib.pyplot as plt
import time
import datetime
import os
from os import path
import sys
import random
from collections import deque
from anytree import NodeMixin, RenderTree
import argparse
# DQN Params
gamma = 0.99
# Hidden layer sizes
h1 = 200
h2 = 200
h3 = 200
lr = 5e-5
# per-episode multiplicative learning-rate decay (1 = no decay)
lr_decay = 1
l2_reg = 1e-6
dropout = 0
num_episodes = 1000
# gym cuts off after 1000, anyway
max_steps_ep = 1000
update_slow_target_every = 100
train_every = 1
replay_memory_capacity = int(1e6)
minibatch_size = 1024
epsilon_start = 1.0
epsilon_end = 0.05
epsilon_decay_length = 10000
epsilon_decay_exp = 0.98
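# Worked example of the schedule implemented in updateEpsilon below: each option decays its
# epsilon linearly by (1.0 - 0.05) / 10000 = 9.5e-5 per step for its first 10000 steps, then
# multiplies it by 0.98 at the end of every episode.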
# Skill chain params
# Gestation period: a new option learns off-policy without being executed for this many episodes (currently unused)
gestation = 10
# Stop adding options after this episode
add_opt_cutoff = num_episodes // 5
# Maximum number of steps in one option
max_steps_opt = 25
max_neg_traj = max_steps_opt*10
# Option completion reward - unused, since the global MDP currently must choose an option whenever one is available
opt_r = 35
# Number of episodes to gather initiation classifier data for, and the maximum number of examples
# to collect before the classifier is considered fully trained
num_ep_init_class = 50
max_num_init_ex = 6000
# unused
max_branching_factor = 2
# Episode at which every option's epsilon is forced to 0
epsilon_drop_episode = 4 * num_episodes // 5
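# With the defaults above, options stop being added after episode 1000/5 = 200 and every
# option's epsilon is forced to zero from episode 4*1000/5 = 800 onward.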
def atGoal(state, done):
# If landed in the target zone (between the two flags)
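    # Illustrative: a LunarLander-v2 state is [x, y, vx, vy, angle, angular velocity,
    # left leg contact, right leg contact]; e.g. x = 0.1, y = 0.05 with done == True counts
    # as a goal state, but the same position mid-flight (done == False) does not.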
x = state[0]
y = state[1]
return -0.2 < x < 0.2 and -0.1 < y < 0.1 and done
def getMinibatchElem(minibatch, i):
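    # Experience tuples are (state, action, reward, next_state, not_terminal) -- the layout of
    # step_experience in the training loop below -- so e.g. i = 2 stacks the sampled rewards.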
return np.asarray([elem[i] for elem in minibatch])
def statesFromExperiences(experiences):
    # Keep only the (x, y) position (first two state dims) from each experience tuple
    return [example[0][:2] for example in experiences]
def make_meshgrid(x_min, x_max, y_min, y_max, h=.02):
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
"""Plot the decision boundaries for a classifier.
Parameters
----------
ax: matplotlib axes object
clf: a classifier
xx: meshgrid ndarray
yy: meshgrid ndarray
params: dictionary of params to pass to contourf, optional
"""
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
out = ax.contourf(xx, yy, Z, **params)
return out
# TODO: factor the repeated breadth-first traversals below into a single shared helper
def optTreeToList(root_option):
optList = []
queue = [root_option]
while len(queue) != 0:
opt = queue.pop(0)
optList.append(opt)
queue += opt.children
return optList
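# Illustrative (hypothetical tree): for a root with children A and B, where A has child C,
# optTreeToList(root) returns [root, A, B, C] -- breadth-first, so the shallowest matching
# option is found first by findOptForState below.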
def findOptForState(position, root_option, ep):
# BFS Search
queue = [root_option]
while len(queue) != 0:
opt = queue.pop(0)
        # If the state is in the initiation set and the initiation-set classifier has been fully trained
if opt.inInitiationSet(position) and opt.classifierTrained():
return opt
else:
queue += opt.children
return None
def writeAllEpsilon(root_option, ep):
# BFS iteration
queue = [root_option]
while len(queue) != 0:
option = queue.pop(0)
option.writeEpsilon(ep)
queue += option.children
def dropAllEpsilon(root_option):
# BFS iteration
queue = [root_option]
while len(queue) != 0:
option = queue.pop(0)
option.epsilon = 0.0
queue += option.children
def main():
parser = argparse.ArgumentParser(description = "Lunar Lander")
parser.add_argument('--visualize', dest='visualize', action='store_true')
parser.add_argument('--no-visualize', dest='visualize', action='store_false')
parser.set_defaults(visualize=False)
args = parser.parse_args()
# game parameters
env = gym.make("LunarLander-v2")
state_dim = np.prod(np.array(env.observation_space.shape))
n_actions = env.action_space.n
####################################################################################################################
## Tensorflow
tf.reset_default_graph()
# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim]) # input to Q network
next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim]) # input to slow target network
action_ph = tf.placeholder(dtype=tf.int32, shape=[None]) # action indices (indices of Q network output)
reward_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # rewards (go into target computation)
is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout
episode_reward = tf.Variable(0.)
tf.summary.scalar("Episode Reward", episode_reward)
r_summary_placeholder = tf.placeholder("float")
update_ep_reward = episode_reward.assign(r_summary_placeholder)
plot_epsilon = tf.Variable(0.)
tf.summary.scalar("Epsilon", plot_epsilon)
eps_summary_placeholder = tf.placeholder("float")
update_plot_epsilon = plot_epsilon.assign(eps_summary_placeholder)
# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# will use this to initialize both Q network and slowly-changing target network with same structure
def generate_network(s, trainable, reuse):
hidden = tf.layers.dense(s, h1, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
hidden_drop = tf.layers.dropout(hidden, rate = dropout, training = trainable & is_training_ph)
hidden_2 = tf.layers.dense(hidden_drop, h2, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', \
reuse = reuse)
hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout, training = trainable & is_training_ph)
hidden_3 = tf.layers.dense(hidden_drop_2, h3, activation = tf.nn.relu, trainable = trainable, name = 'dense_2',\
reuse = reuse)
hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout, training = trainable & is_training_ph)
action_values = tf.squeeze(tf.layers.dense(hidden_drop_3, n_actions, trainable = trainable, name = 'dense_3', \
reuse = reuse))
return action_values
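    # Resulting architecture: state -> 200 -> 200 -> 200 (ReLU) -> n_actions (linear),
    # with dropout after each hidden layer (a no-op here since dropout = 0).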
with tf.variable_scope('q_network') as scope:
# Q network applied to state_ph
q_action_values = generate_network(state_ph, trainable = True, reuse = False)
# Q network applied to next_state_ph (for double Q learning)
q_action_values_next = tf.stop_gradient(generate_network(next_state_ph, trainable = False, reuse = True))
# slow target network
with tf.variable_scope('slow_target_network', reuse=False):
# use stop_gradient to treat the output values as constant targets when doing backprop
slow_target_action_values = tf.stop_gradient(generate_network(next_state_ph, trainable = False, reuse = False))
# isolate vars for each network
q_network_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_network')
slow_target_network_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_network')
# update values for slowly-changing target network to match current critic network
update_slow_target_ops = []
for i, slow_target_var in enumerate(slow_target_network_vars):
update_slow_target_op = slow_target_var.assign(q_network_vars[i])
update_slow_target_ops.append(update_slow_target_op)
update_slow_target_op = tf.group(*update_slow_target_ops, name='update_slow_target')
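    # Double-DQN target below: y = r + gamma * (1 - terminal) * Q_slow(s', argmax_a Q_online(s', a)).
    # The online network selects the next action and the slow target network evaluates it,
    # which reduces the overestimation bias of vanilla Q-learning targets.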
targets = reward_ph + is_not_terminal_ph * gamma * \
tf.gather_nd(slow_target_action_values, tf.stack((tf.range(minibatch_size), \
tf.cast(tf.argmax(q_action_values_next, axis=1), tf.int32)), axis=1))
    # Estimated Q values for the (s, a) pairs drawn from experience replay
    estim_taken_action_values = tf.gather_nd(q_action_values, tf.stack((tf.range(minibatch_size), action_ph), axis=1))
    # loss function (with regularization)
    loss = tf.reduce_mean(tf.square(targets - estim_taken_action_values))
for var in q_network_vars:
        if 'bias' not in var.name:
loss += l2_reg * 0.5 * tf.nn.l2_loss(var)
# optimizer
train_op = tf.train.AdamOptimizer(lr*lr_decay**episodes).minimize(loss)
## Tensorflow
####################################################################################################################
## Option and Skill classes
    # date and time, with a microsecond-resolution unix timestamp appended
    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S___') + str(int(time.time() * 1e6))
class Option:
def __init__(self, n, start_ep):
self.n = n
self.start_ep = start_ep
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
self.writer = tf.summary.FileWriter("board_" + timestamp + '_' + str(n))
self.writer.add_graph(self.sess.graph)
self.saver = tf.train.Saver()
self.directory = timestamp + '/' + str(self.n)
if not os.path.exists(self.directory):
os.makedirs(self.directory)
self.initiation_examples = []
self.initiation_labels = []
            # Per-episode diagnostics, printed at the end of every episode
            self.num_updates_per_ep = [0]*num_episodes
            self.size_exp_buff_per_ep = [0]*num_episodes
self.initiation_classifier = svm.SVC(kernel="rbf")
self.experience = deque(maxlen=replay_memory_capacity)
self.initTrained = False
self.epsilon = epsilon_start
self.epsilon_linear_step = (epsilon_start-epsilon_end)/epsilon_decay_length
self.total_steps = 0
def writeReward(self, r, ep):
self.sess.run(update_ep_reward, feed_dict={r_summary_placeholder: r})
summary_str = self.sess.run(tf.summary.merge_all())
self.writer.add_summary(summary_str, ep)
def writeEpsilon(self, ep):
self.sess.run(update_plot_epsilon, feed_dict={eps_summary_placeholder: self.epsilon})
if self.n != "GlobalMDP":
summary_str = self.sess.run(tf.summary.merge_all())
self.writer.add_summary(summary_str, ep)
        def retrainInitiationClassifier(self, ep):
            self.num_pos_examples = self.initiation_labels.count(1)
            self.num_neg_examples = self.initiation_labels.count(0)
            if self.num_pos_examples != 0 and self.num_neg_examples != 0:
                print("Training classifier with", self.num_pos_examples,
                      "positive examples and", self.num_neg_examples, "negative examples.")
                class_start_time = time.time()
                self.initiation_classifier.fit(self.initiation_examples, self.initiation_labels)
                print("Retrained option", self.n, "classifier in", (time.time() - class_start_time), "seconds.")
                self.saveInitiationPlot(ep)
                self.initTrained = True
        def classifierTrained(self):
            # `ep` here is the episode counter from the enclosing training loop in main()
            return ep - self.start_ep > num_ep_init_class or len(self.initiation_labels) > max_num_init_ex
        def loadDQNWeights(self, model_file):
            print("Loading weights for new option", self.n, "from", model_file)
            self.saver.restore(self.sess, model_file)
        def saveDQNWeights(self, model_file):
            assert self.n == "GlobalMDP"
            print("Saving", self.n, "DQN weights to", model_file)
            self.saver.save(self.sess, model_file)
def saveInitiationPlot(self, ep):
try:
# very rarely, the legend doesn't fit correctly, and this fails
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
X0, X1 = np.array(self.initiation_examples)[:, 0], np.array(self.initiation_examples)[:, 1]
xx, yy = make_meshgrid(-1, 1, -1./3, 1)
labels = [str(self.num_pos_examples) + " positive examples", \
str(self.num_neg_examples) + " negative examples"]
fig, sub = plt.subplots(1, 1)
plot_contours(sub, self.initiation_classifier, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
sub.scatter(X0, X1, c=self.initiation_labels, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
sub.set_xlim(xx.min(), xx.max())
sub.set_ylim(yy.min(), yy.max())
sub.set_xticks(())
sub.set_yticks(())
sub.set_xlabel("Option " + str(self.n) + " at episode " + str(ep))
sub.set_ylabel(str(self.num_pos_examples) + " pos, " + str(self.num_neg_examples) + " neg")
#sub.legend(labels=labels, bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
plt.plot([-0.2, 0.2], [0, 0], 'k-')
plt.savefig(self.directory + '/' + str(ep) + '.png')
plt.close()
            except Exception:
                print("Failed to generate plot for option", self.n, "at episode", ep)
                print(sys.exc_info()[0])
def addInitiationExamples(self, states, label):
self.initiation_examples += states
self.initiation_labels += [label]*len(states)
def inInitiationSet(self, state):
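            # predict() returns the SVM's 0/1 label for this (x, y) position, so this reads as
            # "classifier trained AND position classified as inside the initiation set"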
return self.initTrained and self.initiation_classifier.predict([state])[0]
        # Two-phase epsilon decay: linear per step, then exponential per episode
def updateEpsilon(self, done, ep):
            # Only update if epsilon hasn't been forced to zero
if ep < epsilon_drop_episode:
# linearly decay epsilon from epsilon_start to epsilon_end over epsilon_decay_length steps
decay = ""
old_epsilon = self.epsilon
if self.total_steps < epsilon_decay_length:
self.epsilon -= self.epsilon_linear_step
decay = "linear"
# then exponentially decay it every episode
elif done:
self.epsilon *= epsilon_decay_exp
decay = "exponential"
#print "Updating option", self.n, "epsilon from", old_epsilon, "to", self.epsilon, "with", decay, "decay."
def updateDQN(self, step_experience, episode):
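            # With the defaults above (train_every = 1, minibatch_size = 1024): every step performs
            # one minibatch update once the buffer holds 1024 transitions, and the slow target
            # network is hard-copied from the online network every 100 of this option's steps.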
self.experience.append(step_experience)
# update the slow target's weights to match the latest q network if it's time to do so
if self.total_steps%update_slow_target_every == 0:
_ = self.sess.run(update_slow_target_op)
# update network weights to fit a minibatch of experience
if self.total_steps%train_every == 0 and len(self.experience) >= minibatch_size:
# grab N (s,a,r,s') tuples from experience
minibatch = random.sample(self.experience, minibatch_size)
# do a train_op with all the inputs required
_ = self.sess.run(train_op,
feed_dict = {state_ph: getMinibatchElem(minibatch, 0), action_ph: getMinibatchElem(minibatch, 1), \
reward_ph: getMinibatchElem(minibatch, 2), next_state_ph: getMinibatchElem(minibatch, 3), \
is_not_terminal_ph: getMinibatchElem(minibatch, 4), is_training_ph: True})
self.num_updates_per_ep[episode] += 1
self.size_exp_buff_per_ep[episode] = len(self.experience)
# http://anytree.readthedocs.io/en/latest/api/anytree.node.html#anytree.node.nodemixin.NodeMixin
class Skill(Option, NodeMixin):
        def __init__(self, n, start_ep, parent=None):
            super(Skill, self).__init__(n, start_ep)
            self.parent = parent
            self.name = str(n)
            # Not the global MDP or the goal option
            if self.parent is not None and self.parent.parent is not None:
                global_mdp = self.parent
                while global_mdp.parent is not None:
                    global_mdp = global_mdp.parent
# Timestamp shouldn't be necessary since only one model will be saved for each option
timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')
weights_file = self.directory + '/' + timestamp + ".ckpt"
global_mdp.saveDQNWeights(weights_file)
self.loadDQNWeights(weights_file)
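        # Skill-chaining invariant: an option terminates exactly where its parent can initiate, so
        # executing the chain child -> parent -> ... -> goal funnels the lander toward the pad.
        # The global MDP never terminates; the goal option (n == 0) terminates on landing at the pad.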
def inTerminationSet(self, full_state, done):
if self.n == "GlobalMDP":
return False
elif self.n == 0:
return atGoal(full_state, done)
else:
return self.parent.inInitiationSet(full_state[:2])
def updateInit(self, experiences, ep):
# Only called if `self.inTerminationSet(experiences[-1][0], (not experiences[-1][-1]))`
if not self.classifierTrained():
# List of (x, y) states for experiences less than max_steps_opt time steps away from the goal
positive_examples = statesFromExperiences(experiences[-max_steps_opt:])
# Only use the last max_neg_traj negative examples, not the hovering at the beginning
negative_examples = statesFromExperiences(experiences[-max_steps_opt-max_neg_traj:-max_steps_opt])
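                # Illustrative slicing with the defaults (max_steps_opt = 25, max_neg_traj = 250):
                # positives are the last 25 (x, y) states of the trajectory and negatives are the
                # 250 states before those, i.e. experiences[-275:-25].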
                # If there are no examples yet, no negatives, or the first negative example is not already in the initiation set
                if len(self.initiation_examples) == 0 or len(negative_examples) == 0 or not self.inInitiationSet(negative_examples[0]):
                    self.addInitiationExamples(positive_examples, 1)
                    self.addInitiationExamples(negative_examples, 0)
                    self.retrainInitiationClassifier(ep)
                elif len(negative_examples) != 0:
                    print("Trajectory began at state", negative_examples[0], "which is in the initiation set. Skipping.")
## Option and Skill classes
####################################################################################################################
## Training
    # initialize the global MDP and the goal option (each creates its own TF session)
globalMDP = Skill("GlobalMDP", 0)
num_skills = 0
goalOpt = Skill(num_skills, 0, parent=globalMDP)
num_skills += 1
    # continually updated: points to the newest option whose initiation classifier is not yet fully trained, else None
new_opt = goalOpt
start_time = time.time()
for ep in range(num_episodes):
newopt_episode_terminated = False
total_reward = 0
raw_reward = 0
steps_in_ep = 0
epi_experience = []
observation = env.reset()
# Option to use at each step of this episode
opt = globalMDP
# Check to see if initiation classification is done
        if new_opt is not None and new_opt.classifierTrained():
new_opt = None
# Drop all epsilons in the tree to zero
if ep >= epsilon_drop_episode:
dropAllEpsilon(globalMDP)
for t in range(max_steps_ep):
current_position = observation[:2]
# determine if we should switch to an option, create a new one, or continue to use global MDP
            if opt == globalMDP:
                current_opt = findOptForState(current_position, goalOpt, ep)
                if current_opt is not None:
                    opt = current_opt
                    print("Switching from global MDP to option", opt.name)
                    # When transitioning to an option from the global MDP, and no option is currently being initialized
                    if new_opt is None and ep < add_opt_cutoff:
                        print("Creating a new option with parent", opt.name)
                        new_opt = Skill(num_skills, ep, parent=opt)
                        num_skills += 1
                else:
                    opt = globalMDP
if np.random.random() < opt.epsilon:
action = np.random.randint(n_actions)
else:
q_s = opt.sess.run(q_action_values, feed_dict = {state_ph: observation[None], is_training_ph: False})
action = np.argmax(q_s)
# take step
next_observation, reward, done, _info = env.step(action)
if args.visualize:
env.render()
opt_reward = reward
            # Since the global MDP is not choosing between primitive actions and options, no extra
            # completion reward is needed to encourage selecting an option, so total and raw reward are identical.
            '''
            # if current option is completed and we move to the next, or if we've reached the goal with the goal option
            if (opt == goalOpt and done) or \
                (opt != globalMDP and opt != goalOpt and opt.parent.inInitiationSet(next_observation[:2])):
                print("Completed opt", opt.name, ", assigning reward.")
                opt_reward += opt_r
            '''
total_reward += opt_reward
raw_reward += reward
step_experience = (observation, action, opt_reward, next_observation, 0.0 if done else 1.0)
opt.updateDQN(step_experience, ep)
epi_experience.append(step_experience)
observation = next_observation
opt.total_steps += 1
steps_in_ep += 1
opt.updateEpsilon(done, ep)
if opt != globalMDP:
globalMDP.updateDQN(step_experience, ep)
globalMDP.updateEpsilon(done, ep)
if opt.inTerminationSet(observation, done):
print "Switching from option", opt.name, "to option", opt.parent.name
opt = opt.parent
            if new_opt is not None and not new_opt.classifierTrained() and new_opt.inTerminationSet(observation, done) and not newopt_episode_terminated:
# Only update once per episode, at what would be the transition
newopt_episode_terminated = True
for exp in epi_experience[-max_steps_opt:]:
new_opt.updateDQN(exp, ep)
new_opt.updateInit(epi_experience, ep)
if done:
# Increment episode counter
_ = opt.sess.run(episode_inc_op)
break
        # TODO: only write once; writeEpsilon currently writes summaries for every option except the global MDP, since nothing else is plotted
writeAllEpsilon(globalMDP, ep)
globalMDP.writeReward(raw_reward, ep)
print('Episode %2i, Reward: %7.3f, Steps: %i, Minutes: %7.3f'%\
(ep, raw_reward, steps_in_ep, (time.time() - start_time)/60))
        for option in optTreeToList(globalMDP):
            print(option.name)
            print()
            print(option.num_updates_per_ep)
            print()
            print(option.size_exp_buff_per_ep)
            print()
            print()
env.close()
if __name__ == '__main__':
main()