From b2820307ad708b9ec930ca81fdfe7120dc307cca Mon Sep 17 00:00:00 2001
From: Adrian Skrobacz
Date: Sat, 23 Dec 2017 20:36:37 +0100
Subject: [PATCH] Sarsa Lambda and True Online Sarsa Lambda done

---
 ...tial_semi_gradient_sarsa_access_control.py |   4 +-
 envs/__init__.py                              |   9 ++
 sarsa-lambda.py                               | 142 ++++++++++++++++++
 semi_gradient_sarsa_mountain_car.py           |  57 +++----
 utils/__init__.py                             | 113 +-------------
 utils/algo_utils.py                           | 129 ++++++++++++++++
 utils/averager.py                             |  85 +++++++++++
 7 files changed, 389 insertions(+), 150 deletions(-)
 create mode 100644 sarsa-lambda.py
 create mode 100644 utils/algo_utils.py
 create mode 100644 utils/averager.py

diff --git a/differential_semi_gradient_sarsa_access_control.py b/differential_semi_gradient_sarsa_access_control.py
index b4184c5..123f600 100644
--- a/differential_semi_gradient_sarsa_access_control.py
+++ b/differential_semi_gradient_sarsa_access_control.py
@@ -5,15 +5,15 @@
 from plotly import tools
 
 from envs.AcessControlQueueEnv import AccessControlQueueTimeLimit, AccessControlQueue
+from features.TileCoding import IHT
 from utils import Algorithm, randargmax, generate_episode, epsilon_probs, TilingValueFunction
 
 np.random.seed(7)
 
 
 class ValueFunction(TilingValueFunction):
-
     def __init__(self, n_tilings, max_size, n_priorities, n_servers):
-        super().__init__(n_tilings, max_size)
+        super().__init__(n_tilings, IHT(max_size))
         self.n_priorities = n_priorities - 1
         self.n_servers = n_servers
 
diff --git a/envs/__init__.py b/envs/__init__.py
index e69de29..506624e 100644
--- a/envs/__init__.py
+++ b/envs/__init__.py
@@ -0,0 +1,9 @@
+from envs.AcessControlQueueEnv import *
+from envs.BlackjackEnv import *
+from envs.CliffWalkingEnv import *
+from envs.DoubleQLearningEnv import *
+from envs.GridWorldEnv import *
+from envs.MazeEnv import *
+from envs.RaceCarEnv import *
+from envs.RandomWalkEnv import *
+from envs.WindyGridWorldEnv import *
\ No newline at end of file
diff --git a/sarsa-lambda.py b/sarsa-lambda.py
new file mode 100644
index 0000000..926c895
--- /dev/null
+++ b/sarsa-lambda.py
@@ -0,0 +1,142 @@
+from collections import defaultdict
+
+import numpy as np
+import gym
+from gym import Env
+
+from features.TileCoding import IHT
+from semi_gradient_sarsa_mountain_car import ValueFunction
+from utils import Algorithm, generate_episode, epsilon_probs, randargmax, TilingFunctionCreator, Averager, \
+    GymEpisodeTaskFactory, AlgorithmFactory, plot_scatters_from_dict
+
+N_TILINGS = 8
+
+
+class ValueFunctionCreator(TilingFunctionCreator):
+    def __init__(self, n_tilings: int, iht: IHT):
+        self.n_tilings = n_tilings
+        self.iht = iht
+
+    def create(self):
+        return ValueFunction(self.n_tilings, self.iht)
+
+
+class SarsaLambda(Algorithm):
+    def __init__(self, env: Env, creator: TilingFunctionCreator, alpha=0.5 / N_TILINGS, lam=0.92, epsilon=0.0,
+                 gamma=1.0):
+        self.env = env
+        self.value_func_creator = creator
+        self.value_function = creator.create()
+        self.alpha = alpha
+        self.lam = lam
+        self.epsilon = epsilon
+        self.gamma = gamma
+        self.actions = np.arange(env.action_space.n)
+        self._reset()
+
+    def action(self, state):
+        if self.next_action is None:
+            return self._action(state)
+        else:
+            return self.next_action
+
+    def _reset(self):
+        self.e_trace = self.value_func_creator.create()
+        self.next_action = None
+
+    def _action(self, state):
+        greedy = self.greedy_action(state)
+        probs = epsilon_probs(greedy, self.actions, self.epsilon)
+        return np.random.choice(self.actions, p=probs)
+
+    def greedy_action(self, state):
+        array = np.array([self.value_function.estimated(state, action) for action in self.actions])
+        return randargmax(array)
+
+    def on_new_state(self, state, action, reward, next_state, done):
+        if not done:
+            self.next_action = self._action(next_state)
+        q = self.value_function.estimated(state, action)
+        q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
+        delta = reward + self.gamma * q_next - q
+        self.e_trace[state, action] = 1
+        self.value_function[:, :] += self.alpha * delta * self.e_trace[:, :]
+        self.e_trace[:, :] *= self.gamma * self.lam
+        if done:
+            self._reset()
+
+
+class TrueOnlineSarsaLambda(SarsaLambda):
+    def _reset(self):
+        super()._reset()
+        self.q_old = 0
+
+    def on_new_state(self, state, action, reward, next_state, done):
+        # Note: value_function.x(...) and e_trace.x(...) return the same vector since they share the same IHT
+        if not done:
+            self.next_action = self._action(next_state)
+        q = self.value_function.estimated(state, action)
+        q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
+        x = self.value_function.x(state, action)
+        delta = reward + self.gamma * q_next - q
+        self.e_trace[:, :] *= self.gamma * self.lam
+        self.e_trace[state, action] += 1 - self.alpha * self.e_trace.estimated(state, action)  # estimated() already carries the gamma * lam decay applied above
+        q_delta = q - self.q_old
+        self.value_function[:, :] += self.alpha * (delta + q_delta) * self.e_trace[:, :] - self.alpha * q_delta * x
+        self.q_old = q_next
+        if done:
+            self._reset()
+
+
+class SarsaLambdaFactory(AlgorithmFactory):
+    def __init__(self, env: Env):
+        self.env = env
+
+    def create(self, lam, alpha) -> Algorithm:
+        return SarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)
+
+
+class TrueOnlineSarsaLambdaFactory(AlgorithmFactory):
+    def __init__(self, env: Env):
+        self.env = env
+
+    def create(self, lam, alpha) -> Algorithm:
+        return TrueOnlineSarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)
+
+
+def average_steps_per_episode(results, n_avg):
+    tmp = np.mean(results, axis=1)
+    return np.sum(tmp, axis=0) / n_avg
+
+
+def perform_lambda_test(n_episodes, n_avg):
+    averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
+    alphas = np.arange(1, 15) / N_TILINGS  # These are divided by N_TILINGS again in the factory to give the final alpha value
+    results = defaultdict(lambda: np.zeros(len(alphas)))
+    for lam in [0, .68, .84, .92, .96, .98, .99]:
+        for i, alpha in np.ndenumerate(alphas):
+            results[lam][i] = averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)
+    plot_scatters_from_dict(results, 'lambda={}', alphas)
+
+
+def perform_sarsa_lambda_comparison(n_episodes, n_avg):
+    alphas = np.arange(0.2, 2.2, 0.2)  # These are divided by N_TILINGS in the factory to give the final alpha value
+    lam = 0.84
+    results = defaultdict(lambda: np.zeros(len(alphas)))
+    averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
+    for i, alpha in np.ndenumerate(alphas):
+        results['Sarsa(Lam) with replacing'][i] = -averager.average((lam, alpha), n_avg,
+                                                                    merge=average_steps_per_episode)
+
+    averager = Averager(GymEpisodeTaskFactory(env, n_episodes, TrueOnlineSarsaLambdaFactory(env)))
+    for i, alpha in np.ndenumerate(alphas):
+        results['True Online Sarsa(Lam)'][i] = -averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)
+
+    plot_scatters_from_dict(results, '{}', alphas)
+
+
+if __name__ == '__main__':
+    env = gym.make('MountainCar-v0')
+    env._max_episode_steps = int(3e3)
+    # perform_lambda_test(n_episodes=50, n_avg=40)
+    perform_sarsa_lambda_comparison(n_episodes=20, n_avg=100)
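The SarsaLambda class above leans on the slice-indexing tricks that TilingValueFunction gains later in this patch (utils/algo_utils.py). As a plain-numpy reference, the sketch below mirrors the replacing-trace backup performed in SarsaLambda.on_new_state; the function name and the explicit index arrays are illustrative only and are not part of the patch.

# Standalone sketch, not part of the patch: the replacing-trace Sarsa(lambda)
# backup from SarsaLambda.on_new_state, written against raw numpy arrays.
import numpy as np


def sarsa_lambda_step(w, z, active_idx, next_active_idx, reward,
                      alpha=0.5 / 8, gamma=1.0, lam=0.92, done=False):
    """One backup over binary tile features; active_idx are the tiles of (s, a)."""
    q = w[active_idx].sum()
    q_next = 0.0 if done else w[next_active_idx].sum()
    delta = reward + gamma * q_next - q
    z[active_idx] = 1.0      # replacing traces: active tiles are reset to 1
    w += alpha * delta * z   # every tile is credited in proportion to its trace
    z *= gamma * lam         # all traces then decay towards zero
    return w, z

With done=True the q_next term drops out, matching the 0-if-done branch in the class; the class additionally rebuilds the trace between episodes, which is what _reset does by creating a fresh e_trace.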
diff --git a/semi_gradient_sarsa_mountain_car.py b/semi_gradient_sarsa_mountain_car.py
index d88c657..1bd3fb4 100644
--- a/semi_gradient_sarsa_mountain_car.py
+++ b/semi_gradient_sarsa_mountain_car.py
@@ -12,7 +12,7 @@
 from joblib import Parallel, delayed
 from multiprocessing import cpu_count
 
-from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size
+from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size, TilingValueFunction
 
 POSITION_MIN = -1.2
 POSITION_MAX = 0.6
@@ -25,32 +25,15 @@
 EPSILON = 0
 
 
-class TilingValueFunction:
-    def __init__(self, n_tilings=N_TILINGS, max_size=MAX_SIZE):
-        self.iht = IHT(MAX_SIZE)
-        self.n_tilings = n_tilings
-        self.weights = np.zeros((max_size,))
-        self.position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
-        self.velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)
-
-    def _idx(self, item):
-        position, velocity, action = item
-        return tiles(self.iht, self.n_tilings,
-                     [self.position_scale * position, self.velocity_scale * velocity],
-                     [action])
-
-    def __getitem__(self, item):
-        position, _, _ = item
-        if position >= POSITION_GOAL:
-            return np.zeros(1)
-        else:
-            return self.weights[self._idx(item)]
-
-    def estimated(self, item):
-        return self[item].sum()
+class ValueFunction(TilingValueFunction):
+    def __init__(self, n_tilings: int, iht: IHT):
+        super().__init__(n_tilings, iht)
 
-    def __setitem__(self, key, value):
-        self.weights[self._idx(key)] = value
+    def scaled_values(self, state):
+        position, velocity = state
+        position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
+        velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)
+        return [position * position_scale, velocity * velocity_scale]
 
 
 class SemiGradientSarsa(Algorithm):
@@ -81,16 +64,16 @@ def _action(self, state):
         return np.random.choice(self.actions, p=self._probs(state))
 
     def greedy_action(self, state):
-        array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
+        array = np.array([self.value_function.estimated(state, action) for action in self.actions])
         return np.argmax(array)
 
     def on_new_state(self, state, action, reward, next_state, done):
         self.next_action = self._action(next_state)
-        q_next = self.value_function.estimated((*next_state, self.next_action))
-        q = self.value_function.estimated((*state, action))
+        q_next = self.value_function.estimated(next_state, self.next_action)
+        q = self.value_function.estimated(state, action)
         delta = reward + self.gamma * q_next - q
         update = self.alpha * delta
-        self.value_function[(*state, action)] += update
+        self.value_function[state, action] += update
         if done:
             self.next_action = None
 
@@ -138,7 +121,7 @@ def get_entry(self, t):
 
     def _get_key(self, t):
         entry = self.get_entry(t)
-        return (*entry.state, entry.action)
+        return entry.state, entry.action
 
     def action(self, state):
         if self.t > 0:
@@ -158,7 +141,7 @@ def _prob(self, action, greedy):
         return epsilon_prob(greedy, action, len(self.actions), self.epsilon)
 
     def greedy_action(self, state):
-        array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
+        array = np.array([self.value_function.estimated(state, action) for action in self.actions])
         return randargmax(array)
 
     def calc_returns(self, update_time):
@@ -182,8 +165,8 @@ def on_new_state(self, state, action, reward, next_state, done):
             returns = self.calc_returns(update_time)
             not_last_state = update_time + self.n < self.T
             if not_last_state:
-                returns += pow(self.gamma, self.n) * self.value_function.estimated(key_t_plus_n)
-            self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(key_t))
+                returns += pow(self.gamma, self.n) * self.value_function.estimated(*key_t_plus_n)
+            self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(*key_t))
         self.t += 1
         if done and update_time != self.T - 1:
             self.on_new_state(state, action, reward, next_state, done)
@@ -259,7 +242,7 @@ def __init__(self, env):
         self.env = env
 
     def __call__(self, alpha):
-        return SemiGradientSarsa(self.env, TilingValueFunction(), alpha)
+        return SemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), alpha)
 
 
 class GimmeNStepSarsa:
@@ -267,7 +250,7 @@ def __init__(self, env):
         self.env = env
 
     def __call__(self, alpha, n):
-        return NStepSemiGradientSarsa(self.env, TilingValueFunction(), n, alpha)
+        return NStepSemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), n, alpha)
 
 
 def plot_value_function_using_plotly(value_function):
@@ -326,7 +309,7 @@ def plot_n_step_sarsa_by_alpha_and_n(env):
 
     plot_n_step_sarsa_by_alpha_and_n(env)
 
-    # value_function = TilingValueFunction(N_TILINGS)
+    # value_function = ValueFunction(N_TILINGS, IHT(MAX_SIZE))
     # for i in range(100):
     #     # steps = generate_episode(env, NStepSemiGradientSarsa(env, value_function, 8, 0.5 / N_TILINGS))
     #     steps = generate_episode(env, SemiGradientSarsa(env, value_function, 0.5 / N_TILINGS))
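With the Mountain Car value function reduced to a thin subclass of the shared base, a quick sanity check looks like the sketch below. It is illustrative only and assumes semi_gradient_sarsa_mountain_car keeps its experiment code behind the __main__ guard, as the import in sarsa-lambda.py already presumes.

# Illustrative check, not part of the patch.
from features.TileCoding import IHT
from semi_gradient_sarsa_mountain_car import ValueFunction, N_TILINGS, MAX_SIZE

vf = ValueFunction(N_TILINGS, IHT(MAX_SIZE))
state = (-0.5, 0.0)                  # (position, velocity) near the valley floor
print(vf.estimated(state, 2))        # 0.0 before any update
vf[state, 2] = vf[state, 2] + 0.1    # nudge the weights of the active tiles
print(vf.estimated(state, 2))        # N_TILINGS * 0.1 afterwards

Note that the goal-position special case from the old class (returning zeros at POSITION_GOAL) is gone; the new Sarsa(lambda) classes handle termination with their own 0-if-done branch instead.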
diff --git a/utils/__init__.py b/utils/__init__.py
index 48988cc..b8a8dc1 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -1,111 +1,2 @@
-import numpy as np
-from math import ceil
-import itertools
-
-from gym import Env
-
-from features.TileCoding import IHT, tiles
-
-
-def randomargmax(d, key=None):
-    k_max = max(d, key=key)
-    return np.random.choice([k for k, v in d.items() if d[k_max] == v])
-
-
-def randargmax(b, **kw):
-    """ a random tie-breaking argmax"""
-    return np.argmax(np.random.random(b.shape) * (b == b.max()), **kw)
-
-
-def epsilon_probs(greedy, actions, epsilon):
-    return [epsilon_prob(greedy, action, len(actions), epsilon) for action in actions]
-
-
-def epsilon_prob(greedy, action, n_actions, epsilon):
-    if greedy == action:
-        return epsilon_greedy_prob(n_actions, epsilon)
-    else:
-        return epsilon_explore_prob(n_actions, epsilon)
-
-
-def epsilon_greedy_prob(n_actions, epsilon):
-    return 1 - epsilon + epsilon / n_actions
-
-
-def epsilon_explore_prob(n_actions, epsilon):
-    return epsilon / n_actions
-
-
-def calc_batch_size(size, n_batches, batch_idx):
-    return max(0, min(size - batch_idx * ceil(size / n_batches), ceil(size / n_batches)))
-
-
-class Algorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_state(self, state, action, reward, next_state, done):
-        raise NotImplementedError()
-
-
-class EpisodeAlgorithm:
-    def action(self, state):
-        raise NotImplementedError()
-
-    def on_new_episode(self, history):
-        raise NotImplementedError()
-
-
-def generate_episode(env: Env, algorithm: Algorithm, render=False, print_step=False):
-    done = False
-    obs = env.reset()
-    counter = 0
-    while not done:
-        if print_step:
-            print('Step:', counter)
-        if render:
-            env.render()
-        prev_obs = obs
-        action = algorithm.action(obs)
-        obs, reward, done, _ = env.step(action)
-        algorithm.on_new_state(prev_obs, action, reward, obs, done)
-        counter += 1
-    return counter
-
-
-class TilingValueFunction:
-    def __init__(self, n_tilings, max_size):
-        self.iht = IHT(max_size)
-        self.n_tilings = n_tilings
-        self.weights = np.zeros(max_size)
-
-    def scaled_values(self, state):
-        raise NotImplementedError('Implement me and return scaled values from state')
-
-    def _idx(self, state, action):
-        return tiles(self.iht, self.n_tilings,
-                     self.scaled_values(state),
-                     [action])
-
-    def __getitem__(self, item):
-        state, action = item
-        return self.weights[self._idx(state, action)]
-
-    def estimated(self, state, action):
-        return self[state, action].sum()
-
-    def __setitem__(self, key, value):
-        state, action = key
-        self.weights[self._idx(state, action)] = value
-
-    def to_policy(self, actions, *args):
-        policy = np.zeros([len(arg) for arg in args])
-        for state in itertools.product(*[list(arg) for arg in args]):
-            policy[state] = np.argmax([self.estimated(state, action) for action in actions])
-        return policy
-
-    def to_value(self, actions, *args):
-        value = np.zeros([len(arg) for arg in args])
-        for state in itertools.product(*[list(arg) for arg in args]):
-            value[state] = np.max([self.estimated(state, action) for action in actions])
-        return value
+from utils.algo_utils import *
+from utils.averager import *
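utils/__init__.py now only re-exports the two new submodules, so the flat imports used across this patch (from utils import ...) keep resolving. A quick illustration, assuming both submodules import cleanly:

# Illustrative, not part of the patch: the old flat names still resolve via the package.
from utils import Algorithm, randargmax, TilingValueFunction   # re-exported from utils.algo_utils
from utils import Averager, GymEpisodeTaskFactory              # re-exported from utils.averager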
diff --git a/utils/algo_utils.py b/utils/algo_utils.py
new file mode 100644
index 0000000..a335458
--- /dev/null
+++ b/utils/algo_utils.py
@@ -0,0 +1,129 @@
+import itertools
+from math import ceil
+
+from gym import Env
+
+from features.TileCoding import tiles
+import numpy as np
+
+
+def randomargmax(d, key=None):
+    k_max = max(d, key=key)
+    return np.random.choice([k for k, v in d.items() if d[k_max] == v])
+
+
+def randargmax(b, **kw):
+    """ a random tie-breaking argmax"""
+    return np.argmax(np.random.random(b.shape) * (b == b.max()), **kw)
+
+
+def epsilon_probs(greedy, actions, epsilon):
+    return [epsilon_prob(greedy, action, len(actions), epsilon) for action in actions]
+
+
+def epsilon_prob(greedy, action, n_actions, epsilon):
+    if greedy == action:
+        return epsilon_greedy_prob(n_actions, epsilon)
+    else:
+        return epsilon_explore_prob(n_actions, epsilon)
+
+
+def epsilon_greedy_prob(n_actions, epsilon):
+    return 1 - epsilon + epsilon / n_actions
+
+
+def epsilon_explore_prob(n_actions, epsilon):
+    return epsilon / n_actions
+
+
+def calc_batch_size(size, n_batches, batch_idx):
+    return max(0, min(size - batch_idx * ceil(size / n_batches), ceil(size / n_batches)))
+
+
+class Algorithm:
+    def action(self, state):
+        raise NotImplementedError()
+
+    def on_new_state(self, state, action, reward, next_state, done):
+        raise NotImplementedError()
+
+
+class EpisodeAlgorithm:
+    def action(self, state):
+        raise NotImplementedError()
+
+    def on_new_episode(self, history):
+        raise NotImplementedError()
+
+
+def generate_episode(env: Env, algorithm: Algorithm, render=False, print_step=False):
+    done = False
+    obs = env.reset()
+    counter = 0
+    while not done:
+        if print_step:
+            print('Step:', counter)
+        if render:
+            env.render()
+        prev_obs = obs
+        action = algorithm.action(obs)
+        obs, reward, done, _ = env.step(action)
+        algorithm.on_new_state(prev_obs, action, reward, obs, done)
+        counter += 1
+    return counter
+
+
+class TilingValueFunction:
+    ALL = slice(None, None, None)
+
+    def __init__(self, n_tilings, iht):
+        self.iht = iht
+        self.n_tilings = n_tilings
+        self.weights = np.zeros(iht.size)
+
+    def scaled_values(self, state):
+        raise NotImplementedError('Implement me and return scaled values from state')
+
+    def _idx(self, state, action):
+        if self.is_all_slice(state) and self.is_all_slice(action):
+            return TilingValueFunction.ALL
+        else:
+            return tiles(self.iht, self.n_tilings,
+                         self.scaled_values(state),
+                         [action])
+
+    def is_all_slice(self, item):
+        return isinstance(item, slice) and item == TilingValueFunction.ALL
+
+    def x(self, state, action):
+        x = np.zeros(self.weights.shape)
+        x[self._idx(state, action)] = 1
+        return x
+
+    def __getitem__(self, item):
+        state, action = item
+        return self.weights[self._idx(state, action)]
+
+    def estimated(self, state, action):
+        return self[state, action].sum()
+
+    def __setitem__(self, key, value):
+        state, action = key
+        self.weights[self._idx(state, action)] = value
+
+    def to_policy(self, actions, *args):
+        policy = np.zeros([len(arg) for arg in args])
+        for state in itertools.product(*[list(arg) for arg in args]):
+            policy[state] = np.argmax([self.estimated(state, action) for action in actions])
+        return policy
+
+    def to_value(self, actions, *args):
+        value = np.zeros([len(arg) for arg in args])
+        for state in itertools.product(*[list(arg) for arg in args]):
+            value[state] = np.max([self.estimated(state, action) for action in actions])
+        return value
+
+
+class TilingFunctionCreator:
+    def create(self):
+        raise NotImplementedError('Implement this method and return subclass of TilingValueFunction')
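Two additions to the base class above do the heavy lifting for the Sarsa(lambda) code: the ALL slice, which lets callers address the whole weight vector with [:, :] for trace arithmetic, and x(state, action), which materialises the binary tile-coding feature vector. The sketch below shows the intended use; ToyValueFunction and its scaling are made up for the example and assume the IHT/tiles helpers from features.TileCoding behave as in the rest of the repository.

# Illustrative sketch, not part of the patch.
from features.TileCoding import IHT
from utils import TilingValueFunction


class ToyValueFunction(TilingValueFunction):
    def scaled_values(self, state):
        return [state * self.n_tilings / 10.0]   # toy 1-D state, roughly one unit per tiling


vf = ToyValueFunction(n_tilings=4, iht=IHT(512))
x = vf.x(3.7, 1)             # binary vector with exactly n_tilings ones
vf[:, :] += 0.25 * x         # whole-weight-vector update through the ALL slice
print(vf.estimated(3.7, 1))  # 4 * 0.25 = 1.0

Because e_trace and value_function in sarsa-lambda.py are built by the same creator, they index the same IHT and therefore agree on which components of x are active.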
diff --git a/utils/averager.py b/utils/averager.py
new file mode 100644
index 0000000..4883f1f
--- /dev/null
+++ b/utils/averager.py
@@ -0,0 +1,85 @@
+from multiprocessing import cpu_count
+from joblib import Parallel, delayed
+import numpy as np
+import plotly.graph_objs as go
+import plotly.offline as py
+
+from utils import calc_batch_size, Algorithm, generate_episode
+
+
+class AlgorithmFactory:
+    def create(self, *args, **kwargs) -> Algorithm:
+        raise NotImplementedError('Implement me')
+
+
+class AveragingTask:
+    def run(self, batch_size, batch_idx):
+        results = self.create_results()
+        for i in range(batch_size):
+            self.run_single(i, results)
+        return results
+
+    def run_single(self, i, results):
+        raise NotImplementedError('Implement me')
+
+    def create_results(self):
+        raise NotImplementedError('Implement me')
+
+
+class GymEpisodeTask(AveragingTask):
+    def __init__(self, env, n_episodes, algorithm_factory: AlgorithmFactory, algo_params):
+        self.env = env
+        self.n_episodes = n_episodes
+        self.algorithm_factory = algorithm_factory
+        self.algo_params = algo_params
+
+    def run_single(self, i, results):
+        algorithm = self.algorithm_factory.create(*self.algo_params)
+        for episode in range(self.n_episodes):
+            steps = generate_episode(self.env, algorithm, render=False)
+            results[episode] += steps
+            print('Run: {:2}, params: {}, ep: {:3}, steps: {:4}'.format(i, self.algo_params, episode, steps))
+
+    def create_results(self):
+        return np.zeros(self.n_episodes)
+
+
+class TaskFactory:
+    def create(self, params) -> AveragingTask:
+        raise NotImplementedError('Implement me')
+
+
+class GymEpisodeTaskFactory(TaskFactory):
+    def __init__(self, env, n_episodes, algorithm_factory: AlgorithmFactory):
+        self.env = env
+        self.n_episodes = n_episodes
+        self.algorithm_factory = algorithm_factory
+
+    def create(self, params) -> AveragingTask:
+        return GymEpisodeTask(self.env, self.n_episodes, self.algorithm_factory, params)
+
+
+def average(results, n_avg):
+    return np.sum(results, axis=0) / n_avg
+
+
+class Averager:
+    def __init__(self, task_factory: TaskFactory):
+        self.task_factory = task_factory
+
+    def average(self, algo_params, n_avg, n_jobs=cpu_count(), merge=average):
+        with Parallel(n_jobs=n_jobs) as parallel:
+            jobs = []
+            for batch_idx in range(n_jobs):
+                task = self.task_factory.create(algo_params)
+                batch_size = calc_batch_size(n_avg, n_jobs, batch_idx)
+                jobs.append(delayed(task.run)(batch_size, batch_idx))
+            results = parallel(jobs)
+            return merge(results, n_avg)
+
+
+def plot_scatters_from_dict(results, label_format: str, x=None):
+    data = []
+    for label, values in results.items():
+        data.append(go.Scatter(y=values, x=x, name=label_format.format(label)))
+    py.plot(data)
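Taken together, an experiment builds an AlgorithmFactory, wraps it in a GymEpisodeTaskFactory, and lets Averager fan the runs out across CPU cores, with calc_batch_size deciding how many runs each worker takes. The sketch below is illustrative only; MyAgentFactory is a made-up stand-in for the factories defined in sarsa-lambda.py.

# Illustrative sketch, not part of the patch.
from utils import Averager, GymEpisodeTaskFactory, AlgorithmFactory, Algorithm, calc_batch_size

# calc_batch_size splits n_avg runs across n_jobs workers, e.g. 10 runs over 4 jobs:
print([calc_batch_size(10, 4, i) for i in range(4)])   # -> [3, 3, 3, 1]


class MyAgentFactory(AlgorithmFactory):
    def __init__(self, env):
        self.env = env

    def create(self, lam, alpha) -> Algorithm:
        ...   # build a SarsaLambda or TrueOnlineSarsaLambda here, as sarsa-lambda.py does


# import gym; env = gym.make('MountainCar-v0')
# averager = Averager(GymEpisodeTaskFactory(env, n_episodes=20, algorithm_factory=MyAgentFactory(env)))
# steps = averager.average((0.84, 0.5), n_avg=10)   # default merge: per-episode step counts averaged over runs

sarsa-lambda.py passes its own merge function (average_steps_per_episode) instead of the default, collapsing the per-episode curve into a single number for each (lambda, alpha) pair.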