diff --git a/cliffwalking.py b/cliffwalking.py
index d433522..d248a41 100644
--- a/cliffwalking.py
+++ b/cliffwalking.py
@@ -1,9 +1,8 @@
 from gym import Env
 from envs.CliffWalkingEnv import CliffWalking
-from windy_gridworld import Sarsa, generate_episode
-import numpy as np
 from log import make_logger
+from windy_gridworld import Sarsa, generate_episode
 
 
 log = make_logger(__name__)
 
diff --git a/double_q_learning.py b/double_q_learning.py
index d560aff..fccc99a 100644
--- a/double_q_learning.py
+++ b/double_q_learning.py
@@ -6,33 +6,7 @@
 import plotly.offline as py
 import plotly.graph_objs as go
 
-
-def randomargmax(d, key=None):
-    k_max = max(d, key=key)
-    return np.random.choice([k for k, v in d.items() if d[k_max] == v])
-
-
-def epsilon_prob(greedy, action, n_actions, epsilon):
-    if greedy == action:
-        return epsilon_greedy_prob(n_actions, epsilon)
-    else:
-        return epsilon_explore_prob(n_actions, epsilon)
-
-
-def epsilon_greedy_prob(n_actions, epsilon):
-    return 1 - epsilon + epsilon / n_actions
-
-
-def epsilon_explore_prob(n_actions, epsilon):
-    return epsilon / n_actions
-
-
-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
+from utils import epsilon_prob, randomargmax, Algorithm
 
 
 class QLearning(Algorithm):
diff --git a/dyna_q.py b/dyna_q.py
index 1555238..6b2dd09 100644
--- a/dyna_q.py
+++ b/dyna_q.py
@@ -1,15 +1,12 @@
 import random
 
-from double_q_learning import epsilon_prob
-from envs.MazeEnv import BasicMaze, Maze, MazeShortLong, MazeLongShort
-from n_step_sarsa import Algorithm
 import numpy as np
 import plotly.graph_objs as go
 import plotly.offline as py
 
-
-def randomargmax(a: np.ndarray):
-    return np.random.choice(np.flatnonzero(a == a.max()))
+from double_q_learning import epsilon_prob
+from envs.MazeEnv import Maze, MazeLongShort
+from utils import randomargmax, Algorithm
 
 
 class DynaQ(Algorithm):
diff --git a/envs/CliffWalkingEnv.py b/envs/CliffWalkingEnv.py
index 81bdfe9..e871521 100644
--- a/envs/CliffWalkingEnv.py
+++ b/envs/CliffWalkingEnv.py
@@ -3,10 +3,6 @@
 import numpy as np
 
 
-def minmax(value, low, high):
-    return max(min(value, high), low)
-
-
 class CliffWalking(Env):
     metadata = {'render.modes': ['human']}
     ACTION_UP = 0
@@ -55,8 +51,8 @@ def _reward(self, felt):
         return -100 if felt else -1
 
     def _move(self, by):
-        axis0 = minmax(self.position[0] + by[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + by[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + by[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + by[1], 0, self.world.shape[1] - 1)
         felt = False
         if self.world[axis0, axis1] == CliffWalking.CLIFF:
             felt = True
diff --git a/envs/GridWorldEnv.py b/envs/GridWorldEnv.py
index a2fad61..3e11ded 100644
--- a/envs/GridWorldEnv.py
+++ b/envs/GridWorldEnv.py
@@ -2,8 +2,6 @@
 import numpy as np
 from gym.spaces import Tuple, Discrete
 
-from envs.WindyGridWorldEnv import minmax
-
 
 class GridWorld(Env):
     metadata = {'render.modes': ['human']}
@@ -48,8 +46,8 @@ def _step(self, action):
         return self._obs(), -1, done, self.world
 
     def _move(self, move):
-        axis0 = minmax(self.position[0] + move[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + move[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + move[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + move[1], 0, self.world.shape[1] - 1)
         self.position = (axis0, axis1)
 
     def _reset(self):
diff --git a/envs/MazeEnv.py b/envs/MazeEnv.py
index 413ae04..d6529dd 100644
--- a/envs/MazeEnv.py
+++ b/envs/MazeEnv.py
@@ -1,7 +1,6 @@
 from gym import Env
 from gym.spaces import Tuple, Discrete
-
-from envs.CliffWalkingEnv import minmax
+import numpy as np
 
 from envs.GridWorldEnv import GridWorld
 
@@ -18,8 +17,8 @@ def is_wall(self, position):
         return self.world[position] == Maze.WALL
 
     def _move(self, move):
-        axis0 = minmax(self.position[0] + move[0], 0, self.world.shape[0] - 1)
-        axis1 = minmax(self.position[1] + move[1], 0, self.world.shape[1] - 1)
+        axis0 = np.clip(self.position[0] + move[0], 0, self.world.shape[0] - 1)
+        axis1 = np.clip(self.position[1] + move[1], 0, self.world.shape[1] - 1)
         if not self.is_wall((axis0, axis1)):
             self.position = (axis0, axis1)
 
diff --git a/envs/RandomWalkEnv.py b/envs/RandomWalkEnv.py
index b9565e2..08751cd 100644
--- a/envs/RandomWalkEnv.py
+++ b/envs/RandomWalkEnv.py
@@ -2,8 +2,6 @@
 from gym.spaces import Discrete
 import numpy as np
 
-from envs.WindyGridWorldEnv import minmax
-
 
 class RandomWalk(Env):
     metadata = {'render.modes': ['human']}
@@ -38,7 +36,7 @@ def _step(self, action):
             self.position -= step
         else:
            self.position += step
-        self.position = minmax(self.position, 0, len(self.states) - 1)
+        self.position = np.clip(self.position, 0, len(self.states) - 1)
 
         done = self.position == 0 or self.position == len(self.states) - 1
         reward = self.states[self.position]
diff --git a/envs/WindyGridWorldEnv.py b/envs/WindyGridWorldEnv.py
index e596b1f..cee9161 100644
--- a/envs/WindyGridWorldEnv.py
+++ b/envs/WindyGridWorldEnv.py
@@ -7,10 +7,6 @@ def inc(tuple, val):
     return tuple[0] + val, tuple[1] + val
 
 
-def minmax(value, low, high):
-    return max(min(value, high), low)
-
-
 class WindyGridWorld(Env):
     metadata = {'render.modes': ['human']}
     ACTION_UP = 0
@@ -71,9 +67,9 @@ def _step(self, action):
 
     def _move(self, by):
         wind = self._get_wind(self.position[1])
-        axis1 = minmax(self.position[1] + by[1], 0, self.size[1] - 1)
+        axis1 = np.clip(self.position[1] + by[1], 0, self.size[1] - 1)
         axis0 = self.position[0] + by[0] - wind
-        axis0 = minmax(axis0, 0, self.size[0] - 1)
+        axis0 = np.clip(axis0, 0, self.size[0] - 1)
         self.position = axis0, axis1
 
     def _get_wind(self, axis1):
diff --git a/gradient_methods_random_walk.py b/gradient_methods_random_walk.py
index 80f590f..8b1fdbc 100644
--- a/gradient_methods_random_walk.py
+++ b/gradient_methods_random_walk.py
@@ -1,12 +1,12 @@
 from gym import Env
-from double_q_learning import Algorithm
-from envs.CliffWalkingEnv import minmax
 from envs.RandomWalkEnv import RandomWalk
 import numpy as np
 import plotly.offline as py
 import plotly.graph_objs as go
+from utils import EpisodeAlgorithm, Algorithm
+
 
 N_AGGREGATE = 100
 N_STATES = 1000
 MAX_STEP = 100
 
@@ -21,7 +21,7 @@ def find_true_values():
             for action in [-1, 1]:
                 for step in range(1, MAX_STEP + 1):
                     step *= action
-                    next_state = minmax(state + step, 0, N_STATES + 1)
+                    next_state = np.clip(state + step, 0, N_STATES + 1)
                     prob = 1 / (MAX_STEP * 2)
                     new[state] += prob * (0 + new[next_state])
         error = np.abs(np.sum(old - new))
@@ -40,14 +40,6 @@ def __init__(self, state, reward):
         self.reward = reward
 
 
-class EpisodeAlgorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_episode(self, history):
-        raise NotImplementedError()
-
-
 class ValueFunction:
     def __init__(self, shape, aggregation=N_AGGREGATE):
         self.value = np.zeros([s // aggregation for s in shape])
diff --git a/n_step_sarsa.py b/n_step_sarsa.py
index 31d4e2c..d8bb68d 100644
--- a/n_step_sarsa.py
+++ b/n_step_sarsa.py
@@ -6,13 +6,7 @@
 import plotly.offline as py
 import plotly.graph_objs as go
 
-
-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
+from utils import Algorithm
 
 
 class NStepSarsa(Algorithm):
diff --git a/n_step_td_random_walk.py b/n_step_td_random_walk.py
index d26d2c4..e9a4145 100644
--- a/n_step_td_random_walk.py
+++ b/n_step_td_random_walk.py
@@ -1,22 +1,15 @@
 import sys
 
-from collections import deque
-from envs.RandomWalkEnv import RandomWalk, Env
-from randomwalk import rmse
 import numpy as np
-import plotly.offline as py
 import plotly.graph_objs as go
+import plotly.offline as py
+from envs.RandomWalkEnv import RandomWalk
+from randomwalk import rmse
+from utils import Algorithm
 
 TRUE_VALUES = np.arange(-20, 22, 2) / 20.0
 
 
-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
-
 
 class NStepTD(Algorithm):
     def __init__(self, env: RandomWalk, n, alpha=0.1, gamma=1):
diff --git a/n_step_tree_backup.py b/n_step_tree_backup.py
index 69a01d9..7464bd1 100644
--- a/n_step_tree_backup.py
+++ b/n_step_tree_backup.py
@@ -7,7 +7,8 @@
 
 from double_q_learning import epsilon_prob
 from envs.GridWorldEnv import GridWorld
-from n_step_sarsa import Algorithm, perform_algo_eval, NStepSarsa
+from n_step_sarsa import perform_algo_eval
+from utils import Algorithm
 
 
 class Entry:
diff --git a/random_walk_td_lambda.py b/random_walk_td_lambda.py
index f4cbacc..7a1eb67 100644
--- a/random_walk_td_lambda.py
+++ b/random_walk_td_lambda.py
@@ -7,6 +7,8 @@
 import plotly.offline as py
 import plotly.graph_objs as go
 
+from utils import Algorithm
+
 
 class RandomPolicy:
     def __init__(self, env: Env):
@@ -26,7 +28,7 @@ def __getitem__(self, item):
         return self.actions[self.index]
 
 
-class TD:
+class TD(Algorithm):
     def __init__(self, env: Env, policy, alpha=0.1, gamma=1, lam=0.9):
         self.alpha = alpha
         self.gamma = gamma
@@ -41,7 +43,7 @@ def trace(self, state):
     def action(self, state):
         return self.policy[state]
 
-    def on_new_state(self, state, reward, next_state, done):
+    def on_new_state(self, state, action, reward, next_state, done):
         v = self.values[state]
         v_next = self.values[next_state]
         delta = reward + self.gamma * v_next - v
@@ -61,7 +63,7 @@ def generate_episode(env: Env, algorithm: TD):
         prev_obs = obs
         action = algorithm.action(prev_obs)
         obs, reward, done, aux = env.step(action)
-        algorithm.on_new_state(prev_obs, reward, obs, done)
+        algorithm.on_new_state(prev_obs, action, reward, obs, done)
 
 
 def perform_lam_test(env, lams, alphas, n_avg=1, n=10):
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..f147db2
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,45 @@
+import numpy as np
+from math import ceil
+
+def randomargmax(d, key=None):
+    k_max = max(d, key=key)
+    return np.random.choice([k for k, v in d.items() if d[k_max] == v])
+
+
+def randargmax(b, **kw):
+    """ a random tie-breaking argmax"""
+    return np.argmax(np.random.random(b.shape) * (b == b.max()), **kw)
+
+
+def epsilon_prob(greedy, action, n_actions, epsilon):
+    if greedy == action:
+        return epsilon_greedy_prob(n_actions, epsilon)
+    else:
+        return epsilon_explore_prob(n_actions, epsilon)
+
+
+def epsilon_greedy_prob(n_actions, epsilon):
+    return 1 - epsilon + epsilon / n_actions
+
+
+def epsilon_explore_prob(n_actions, epsilon):
+    return epsilon / n_actions
+
+
+def calc_batch_size(size, n_batches, batch_idx):
+    return max(0, min(size - batch_idx * ceil(size / n_batches), ceil(size / n_batches)))
+
+class Algorithm:
+    def action(self, state):
+        raise NotImplementedError()
+
+    def on_new_state(self, state, action, reward, next_state, done):
+        raise NotImplementedError()
+
+
+class EpisodeAlgorithm:
+    def action(self, state):
+        raise NotImplementedError()
+
+    def on_new_episode(self, history):
+        raise NotImplementedError()
diff --git a/windy_gridworld.py b/windy_gridworld.py
index a609bd3..7f96685 100644
--- a/windy_gridworld.py
+++ b/windy_gridworld.py
@@ -4,10 +4,12 @@
 from log import make_logger
 import numpy as np
+from utils import Algorithm
+
 
 log = make_logger(__name__)
 
 
-class Sarsa:
+class Sarsa(Algorithm):
     def __init__(self, env: Env, alpha=0.5, gamma=1, epsilon=0.1):
         self.alpha = alpha
         self.gamma = gamma
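
A brief usage sketch of the consolidated helpers, assuming the utils package introduced above is on the import path; the Q-values and epsilon below are made up for illustration:

    import numpy as np
    from utils import randargmax, epsilon_prob

    # Hypothetical Q-values for one state with 4 actions; actions 1 and 2 tie.
    q = np.array([0.0, 1.0, 1.0, 0.5])
    n_actions, epsilon = len(q), 0.1

    greedy = randargmax(q)  # tie broken at random between actions 1 and 2
    probs = [epsilon_prob(greedy, a, n_actions, epsilon) for a in range(n_actions)]

    assert np.isclose(sum(probs), 1.0)             # epsilon-greedy distribution is normalized
    action = np.random.choice(n_actions, p=probs)  # sample the behaviour action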
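
On the minmax to np.clip swap in the environment files: for the scalar clamping done here the two agree, though np.clip returns a NumPy scalar rather than a plain int. A standalone check (minmax is copied from the removed helper; the test values are arbitrary):

    import numpy as np

    def minmax(value, low, high):
        # The helper this diff removes from the env modules.
        return max(min(value, high), low)

    for value in (-3, 0, 5, 11):
        assert np.clip(value, 0, 10) == minmax(value, 0, 10)  # same clamped result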