-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutils.py
105 lines (83 loc) · 3.06 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
def weight_init(shape):
'''Convenience function for randomly initializing weights'''
weights = np.random.uniform(-0.000001, 0.000001, size=shape)
return weights
def sensor(t):
'''Return current x,y coordinates of agent'''
data = np.zeros(25)
idx = 5 * (agent.x-1) + (agent.y -1)
data[idx] = 1
return data
def reward(t):
return agent.reward
def compute_position(action_idx):
if action_idx == 0:
# move up
x_pos = agent.x
y_pos = agent.y - 1
elif action_idx == 1:
# move right
x_pos = agent.x + 1
y_pos = agent.y
elif action_idx == 2:
# move down
x_pos = agent.x
y_pos = agent.y + 1
else:
# move left
x_pos = agent.x - 1
y_pos = agent.y
return x_pos, y_pos
def take_action(index, epsilon=0.1):
'''Pick an action to perform in the environment'''
do_random = np.random.choice([1, 0], p=[epsilon, 1-epsilon])
if do_random:
action_idx = np.random.choice(np.arange(4))
else:
action_idx = index
x_pos, y_pos = compute_position(action_idx)
agent.set_position(x_pos, y_pos)
return action_idx
class Values(object):
def __init__(self, dimensions=8):
self.output = np.zeros(dimensions)
self.last_action_index = 0
self.current_action_index = 0
self.terminal = False
self.terminal_clock = stepsize
def step(self, t, x):
'''Prepare Q value info for computing error signal'''
# if t is multiple of action step size, do step
if self.terminal:
self.terminal_clock -= 1
if self.terminal_clock == 0:
self.terminal = False
self.terminal_clock = stepsize
return np.zeros_like(self.output)
if int(t * 1000) == 1:
print('STARTING')
if int(t * 1000) % stepsize == 0:
if agent.reward > 0:
self.output = np.zeros_like(self.output)
x_pos = np.random.choice([1,5])
y_pos = np.random.choice([1,5])
agent.set_position(x_pos, y_pos)
self.terminal = True
return self.output
self.last_action_index = self.current_action_index
qs = self.output[8:]
idx = np.argmax(qs)
self.current_action_index = take_action(idx)
# then on next step store new qvalues
elif int(t * 1000) % stepsize == 1:
qvalues = x
qmax = qvalues[np.argmax(qvalues)]
# set output to be current state q, qmax, last state selection
c_output = np.zeros(4)
c_output[self.current_action_index] = qvalues[self.current_action_index]
f_output = np.zeros(4)
f_output[self.current_action_index] = 0.9 * qmax + agent.reward
self.output = np.concatenate(
(c_output, f_output, qvalues))
return self.output