Commit
Refactoring: Move common methods and classes to separate package
adik993 committed Dec 14, 2017
1 parent 3fea3c1 commit c6a93a4
Showing 15 changed files with 78 additions and 92 deletions.
3 changes: 1 addition & 2 deletions cliffwalking.py
@@ -1,9 +1,8 @@
from gym import Env

from envs.CliffWalkingEnv import CliffWalking
-from windy_gridworld import Sarsa, generate_episode
import numpy as np
from log import make_logger
+from windy_gridworld import Sarsa, generate_episode

log = make_logger(__name__)

28 changes: 1 addition & 27 deletions double_q_learning.py
@@ -6,33 +6,7 @@
import plotly.offline as py
import plotly.graph_objs as go


-def randomargmax(d, key=None):
-    k_max = max(d, key=key)
-    return np.random.choice([k for k, v in d.items() if d[k_max] == v])
-
-
-def epsilon_prob(greedy, action, n_actions, epsilon):
-    if greedy == action:
-        return epsilon_greedy_prob(n_actions, epsilon)
-    else:
-        return epsilon_explore_prob(n_actions, epsilon)
-
-
-def epsilon_greedy_prob(n_actions, epsilon):
-    return 1 - epsilon + epsilon / n_actions
-
-
-def epsilon_explore_prob(n_actions, epsilon):
-    return epsilon / n_actions
-
-
-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
+from utils import epsilon_prob, randomargmax, Algorithm


class QLearning(Algorithm):
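
Side note (not part of the diff): epsilon_prob returns the probability an ε-greedy policy assigns to an action. With epsilon = 0.1 and n_actions = 4, the greedy action gets 1 - 0.1 + 0.1/4 = 0.925 and every other action gets 0.1/4 = 0.025, so the four probabilities sum to 1. A minimal check against the helper that now lives in utils (assumes the repository root is on the import path):

from utils import epsilon_prob

n_actions, epsilon = 4, 0.1
probs = [epsilon_prob(greedy=0, action=a, n_actions=n_actions, epsilon=epsilon)
         for a in range(n_actions)]
# greedy action: ~0.925, each of the other three: ~0.025, total: ~1.0
print(probs, sum(probs))
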
9 changes: 3 additions & 6 deletions dyna_q.py
@@ -1,15 +1,12 @@
import random

-from double_q_learning import epsilon_prob
-from envs.MazeEnv import BasicMaze, Maze, MazeShortLong, MazeLongShort
-from n_step_sarsa import Algorithm
import numpy as np
import plotly.graph_objs as go
import plotly.offline as py


-def randomargmax(a: np.ndarray):
-    return np.random.choice(np.flatnonzero(a == a.max()))
+from double_q_learning import epsilon_prob
+from envs.MazeEnv import Maze, MazeLongShort
+from utils import randomargmax, Algorithm


class DynaQ(Algorithm):
8 changes: 2 additions & 6 deletions envs/CliffWalkingEnv.py
@@ -3,10 +3,6 @@
import numpy as np


-def minmax(value, low, high):
-    return max(min(value, high), low)


class CliffWalking(Env):
    metadata = {'render.modes': ['human']}
    ACTION_UP = 0
@@ -55,8 +51,8 @@ def _reward(self, felt):
        return -100 if felt else -1

    def _move(self, by):
-        axis0 = minmax(self.position[0] + by[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + by[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + by[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + by[1], 0, self.world.shape[1] - 1)
        felt = False
        if self.world[axis0, axis1] == CliffWalking.CLIFF:
            felt = True
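
Aside (not part of the commit): the removed minmax helper and np.clip do the same scalar clamping, so this substitution is behavior-preserving apart from np.clip returning a NumPy scalar. A quick sanity check, reproducing the deleted helper for comparison:

import numpy as np

def minmax(value, low, high):
    # the helper this commit deletes from the env modules
    return max(min(value, high), low)

for value in (-3, 0, 7, 42):
    # np.clip returns a NumPy scalar, but it compares equal to the plain int result
    assert minmax(value, 0, 10) == np.clip(value, 0, 10)
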
6 changes: 2 additions & 4 deletions envs/GridWorldEnv.py
@@ -2,8 +2,6 @@
import numpy as np
from gym.spaces import Tuple, Discrete

-from envs.WindyGridWorldEnv import minmax


class GridWorld(Env):
    metadata = {'render.modes': ['human']}
@@ -48,8 +46,8 @@ def _step(self, action):
        return self._obs(), -1, done, self.world

    def _move(self, move):
-        axis0 = minmax(self.position[0] + move[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + move[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + move[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + move[1], 0, self.world.shape[1] - 1)
        self.position = (axis0, axis1)

    def _reset(self):
7 changes: 3 additions & 4 deletions envs/MazeEnv.py
@@ -1,7 +1,6 @@
from gym import Env
from gym.spaces import Tuple, Discrete

-from envs.CliffWalkingEnv import minmax
+import numpy as np
from envs.GridWorldEnv import GridWorld


@@ -18,8 +17,8 @@ def is_wall(self, position):
        return self.world[position] == Maze.WALL

    def _move(self, move):
-        axis0 = minmax(self.position[0] + move[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + move[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + move[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + move[1], 0, self.world.shape[1] - 1)
        if not self.is_wall((axis0, axis1)):
            self.position = (axis0, axis1)

4 changes: 1 addition & 3 deletions envs/RandomWalkEnv.py
@@ -2,8 +2,6 @@
from gym.spaces import Discrete
import numpy as np

-from envs.WindyGridWorldEnv import minmax


class RandomWalk(Env):
    metadata = {'render.modes': ['human']}
@@ -38,7 +36,7 @@ def _step(self, action):
            self.position -= step
        else:
            self.position += step
-        self.position = minmax(self.position, 0, len(self.states) - 1)
+        self.position = np.clip(self.position, 0, len(self.states) - 1)

        done = self.position == 0 or self.position == len(self.states) - 1
        reward = self.states[self.position]
8 changes: 2 additions & 6 deletions envs/WindyGridWorldEnv.py
@@ -7,10 +7,6 @@ def inc(tuple, val):
    return tuple[0] + val, tuple[1] + val


-def minmax(value, low, high):
-    return max(min(value, high), low)


class WindyGridWorld(Env):
    metadata = {'render.modes': ['human']}
    ACTION_UP = 0
@@ -71,9 +67,9 @@ def _step(self, action):

    def _move(self, by):
        wind = self._get_wind(self.position[1])
-        axis1 = minmax(self.position[1] + by[1], 0, self.size[1] - 1)
+        axis1 = np.clip(self.position[1] + by[1], 0, self.size[1] - 1)
        axis0 = self.position[0] + by[0] - wind
-        axis0 = minmax(axis0, 0, self.size[0] - 1)
+        axis0 = np.clip(axis0, 0, self.size[0] - 1)
        self.position = axis0, axis1

    def _get_wind(self, axis1):
14 changes: 3 additions & 11 deletions gradient_methods_random_walk.py
@@ -1,12 +1,12 @@
from gym import Env

-from double_q_learning import Algorithm
-from envs.CliffWalkingEnv import minmax
from envs.RandomWalkEnv import RandomWalk
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go

+from utils import EpisodeAlgorithm, Algorithm

N_AGGREGATE = 100
N_STATES = 1000
MAX_STEP = 100
@@ -21,7 +21,7 @@ def find_true_values():
for action in [-1, 1]:
for step in range(1, MAX_STEP + 1):
step *= action
-next_state = minmax(state + step, 0, N_STATES + 1)
+next_state = np.clip(state + step, 0, N_STATES + 1)
prob = 1 / (MAX_STEP * 2)
new[state] += prob * (0 + new[next_state])
error = np.abs(np.sum(old - new))
@@ -40,14 +40,6 @@ def __init__(self, state, reward):
        self.reward = reward


-class EpisodeAlgorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_episode(self, history):
-        raise NotImplementedError()
-
-
class ValueFunction:
    def __init__(self, shape, aggregation=N_AGGREGATE):
        self.value = np.zeros([s // aggregation for s in shape])
8 changes: 1 addition & 7 deletions n_step_sarsa.py
@@ -6,13 +6,7 @@
import plotly.offline as py
import plotly.graph_objs as go


-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
+from utils import Algorithm


class NStepSarsa(Algorithm):
15 changes: 4 additions & 11 deletions n_step_td_random_walk.py
@@ -1,22 +1,15 @@
import sys
from collections import deque

-from envs.RandomWalkEnv import RandomWalk, Env
-from randomwalk import rmse
import numpy as np
-import plotly.offline as py
import plotly.graph_objs as go
+import plotly.offline as py

+from envs.RandomWalkEnv import RandomWalk
+from randomwalk import rmse
+from utils import Algorithm

TRUE_VALUES = np.arange(-20, 22, 2) / 20.0

-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()


class NStepTD(Algorithm):
def __init__(self, env: RandomWalk, n, alpha=0.1, gamma=1):
3 changes: 2 additions & 1 deletion n_step_tree_backup.py
@@ -7,7 +7,8 @@

from double_q_learning import epsilon_prob
from envs.GridWorldEnv import GridWorld
-from n_step_sarsa import Algorithm, perform_algo_eval, NStepSarsa
+from n_step_sarsa import perform_algo_eval
+from utils import Algorithm


class Entry:
8 changes: 5 additions & 3 deletions random_walk_td_lambda.py
@@ -7,6 +7,8 @@
import plotly.offline as py
import plotly.graph_objs as go

+from utils import Algorithm


class RandomPolicy:
    def __init__(self, env: Env):
@@ -26,7 +28,7 @@ def __getitem__(self, item):
        return self.actions[self.index]


-class TD:
+class TD(Algorithm):
    def __init__(self, env: Env, policy, alpha=0.1, gamma=1, lam=0.9):
        self.alpha = alpha
        self.gamma = gamma
@@ -41,7 +43,7 @@ def trace(self, state):
    def action(self, state):
        return self.policy[state]

-    def on_new_state(self, state, reward, next_state, done):
+    def on_new_state(self, state, action, reward, next_state, done):
        v = self.values[state]
        v_next = self.values[next_state]
        delta = reward + self.gamma * v_next - v
@@ -61,7 +63,7 @@ def generate_episode(env: Env, algorithm: TD):
        prev_obs = obs
        action = algorithm.action(prev_obs)
        obs, reward, done, aux = env.step(action)
-        algorithm.on_new_state(prev_obs, reward, obs, done)
+        algorithm.on_new_state(prev_obs, action, reward, obs, done)


def perform_lam_test(env, lams, alphas, n_avg=1, n=10):
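
Aside (not part of the diff): passing the action through on_new_state above brings TD in line with the shared utils.Algorithm interface, so one episode loop can drive any algorithm. Roughly, as an illustrative sketch only (run_episode is a hypothetical name, not the repository's function):

def run_episode(env, algorithm):
    # generic loop: every utils.Algorithm sees (state, action, reward, next_state, done)
    obs = env.reset()
    done = False
    while not done:
        action = algorithm.action(obs)
        next_obs, reward, done, _ = env.step(action)
        algorithm.on_new_state(obs, action, reward, next_obs, done)
        obs = next_obs
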
45 changes: 45 additions & 0 deletions utils/__init__.py
@@ -0,0 +1,45 @@
import numpy as np
from math import ceil

def randomargmax(d, key=None):
    k_max = max(d, key=key)
    return np.random.choice([k for k, v in d.items() if d[k_max] == v])


def randargmax(b, **kw):
    """ a random tie-breaking argmax"""
    return np.argmax(np.random.random(b.shape) * (b == b.max()), **kw)


def epsilon_prob(greedy, action, n_actions, epsilon):
    if greedy == action:
        return epsilon_greedy_prob(n_actions, epsilon)
    else:
        return epsilon_explore_prob(n_actions, epsilon)


def epsilon_greedy_prob(n_actions, epsilon):
    return 1 - epsilon + epsilon / n_actions


def epsilon_explore_prob(n_actions, epsilon):
    return epsilon / n_actions


def calc_batch_size(size, n_batches, batch_idx):
    return max(0, min(size - batch_idx * ceil(size / n_batches), ceil(size / n_batches)))

class Algorithm:
    def action(self, state):
        raise NotImplementedError()

    def on_new_state(self, state, action, reward, next_state, done):
        raise NotImplementedError()


class EpisodeAlgorithm:
    def action(self, state):
        raise NotImplementedError()

    def on_new_episode(self, history):
        raise NotImplementedError()
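
Usage sketch (illustration only, not part of the diff), assuming the repository root is on the import path:

import numpy as np
from utils import Algorithm, randargmax, randomargmax

# randomargmax breaks ties between dictionary keys at random
q = {'left': 1.0, 'right': 1.0, 'down': 0.2}
print(randomargmax(q, key=q.get))             # 'left' or 'right', picked at random

# randargmax does the same for arrays
print(randargmax(np.array([0.5, 0.9, 0.9])))  # index 1 or 2

# concrete algorithms implement the Algorithm interface used by the training scripts
class GreedyPolicy(Algorithm):
    def __init__(self, q_table):
        self.q_table = q_table  # e.g. a dict mapping state -> np.ndarray of action values

    def action(self, state):
        return randargmax(self.q_table[state])

    def on_new_state(self, state, action, reward, next_state, done):
        pass  # this toy policy does not learn
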
4 changes: 3 additions & 1 deletion windy_gridworld.py
@@ -4,10 +4,12 @@
from log import make_logger
import numpy as np

+from utils import Algorithm

log = make_logger(__name__)


-class Sarsa:
+class Sarsa(Algorithm):
    def __init__(self, env: Env, alpha=0.5, gamma=1, epsilon=0.1):
        self.alpha = alpha
        self.gamma = gamma