environment.py
import dynet as dy
import math
import numpy as np
import random
# Epsilon-greedy exploration schedule: the exploration probability decays
# exponentially from EPS_START to EPS_END with a time constant of EPS_DECAY steps.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# Maximum number of items randomly placed on the pre-filled side of the scale.
MAX_NUM_ITEMS = 5
# Item types and their weights; the agent has one action per item type.
possible_items = { "PENNY": 1,
                   "NICKEL": 5,
                   "DIME": 10,
                   "QUARTER": 25 }
actions_ids = list(possible_items.keys())
### In a ScalesEnvironment, the agent should choose objects to place on the
### scale at each step. The agent is given a set of scales where one side is
### filled with items already, and the agent can place items on the other side
### of the scale, which is initially empty.
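### For example, if the pre-filled side is ["DIME", "NICKEL"] (total weight 15),
### the agent might place a DIME (its side now weighs 10) and then a NICKEL, at
### which point both sides weigh 15, the reward is 1.0, and the episode ends.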
class ScalesEnvironment():
    def __init__(self):
        # Randomly fill one side of the scale with 1..MAX_NUM_ITEMS items.
        self.full_side = [ ]
        self.full_weight = 0
        num_items = random.randint(1, MAX_NUM_ITEMS)
        for item in range(num_items):
            item_type = random.choice(actions_ids)
            self.full_side.append(item_type)
            self.full_weight += possible_items[item_type]
        # The other side starts empty; the agent adds items to it one at a time.
        self.empty_side = [ ]
        self.empty_weight = 0
        self.prev_empty_side = [ ]
        self.prev_empty_weight = 0
    def has_finished(self):
        # The episode ends once the agent's side weighs at least as much as the full side.
        return self.full_weight <= self.empty_weight
    def take_action(self, action):
        # Remember the previous state; copy the list so later appends do not alias it.
        self.prev_empty_side = list(self.empty_side)
        self.prev_empty_weight = self.empty_weight
        action_name = actions_ids[action]
        self.empty_side.append(action_name)
        self.empty_weight += possible_items[action_name]
    def reward(self):
        # Original sparse reward: 1 only for an exact match.
        # if self.empty_weight == self.full_weight:
        #     return 1.
        # else:
        #     return 0.
        # Shaped reward: rises linearly to 1 at an exact match, then falls back
        # to 0 as the agent's side overshoots up to twice the target weight.
        if self.empty_weight <= self.full_weight:
            return float(self.empty_weight) / self.full_weight
        elif self.empty_weight <= 2 * self.full_weight:
            return float(2 * self.full_weight - self.empty_weight) / self.full_weight
        else:
            return 0.
    def previous_state(self):
        return self.full_side, self.prev_empty_side
    def current_state(self):
        return self.full_side, self.empty_side
class ScalesModel():
    def __init__(self):
        self.model = dy.Model()
        # Architecture: l1 embeds every item type as a scalar, and l2 combines
        # the two per-side sums into one score per possible action.
        self.empty_state = self.model.add_parameters((1))
        self.l1_weights = self.model.add_lookup_parameters((len(possible_items), 1))
        self.l2_weights = self.model.add_parameters((2, len(possible_items)))
        self.trainer = dy.AdamTrainer(self.model, alpha = 0.01)
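    # Shape walkthrough (derived from the dimensions above): each item embeds
    # to a single scalar, so summing a side's embeddings gives one scalar per
    # side. Concatenating the two sums yields a length-2 vector, which
    # l2_weights (2 x num_actions) maps to one rectified score per action.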
    def forward(self, state):
        full_side = state[0]
        empty_side = state[1]
        # Look up the scalar embedding of every item on each side.
        full_embs = [self.l1_weights[actions_ids.index(item)] for item in full_side]
        empty_embs = [self.l1_weights[actions_ids.index(item)] for item in empty_side]
        full_sum = dy.esum(full_embs)
        if len(empty_embs) > 0:
            empty_sum = dy.esum(empty_embs)
        else:
            # A learned placeholder stands in for a side with no items yet.
            empty_sum = dy.parameter(self.empty_state)
        cat = dy.concatenate([full_sum, empty_sum])
        result = dy.transpose(dy.rectify(dy.reshape(cat, (1, 2)) * dy.parameter(self.l2_weights)))
        return result
    def select_action(self, state, total_step_num):
        # Epsilon-greedy: explore with a probability that decays from EPS_START
        # towards EPS_END as total_step_num grows; otherwise act greedily on the
        # network's action scores.
        sample = random.random()
        threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * total_step_num / EPS_DECAY)
        if sample > threshold:
            dist = self.forward(state).npvalue()
            return np.argmax(dist)
        else:
            return random.randint(0, len(possible_items) - 1)
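
# Minimal usage sketch (not part of the original module): rolls out a few
# episodes with an untrained ScalesModel to show how the pieces fit together.
# Any actual training step (building a loss from reward() and previous_state()
# and calling model.trainer.update()) is assumed to live elsewhere and is
# omitted here.
if __name__ == "__main__":
    model = ScalesModel()
    total_steps = 0
    for episode in range(3):
        dy.renew_cg()
        env = ScalesEnvironment()
        while not env.has_finished():
            action = model.select_action(env.current_state(), total_steps)
            env.take_action(action)
            total_steps += 1
        print("full side:", env.full_side)
        print("agent side:", env.empty_side, "reward:", env.reward())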