
Sarsa Lambda and True Online Sarsa Lambda done
adik993 committed Dec 29, 2017
1 parent 2e52714 commit b282030
Showing 7 changed files with 389 additions and 150 deletions.
4 changes: 2 additions & 2 deletions differential_semi_gradient_sarsa_access_control.py
@@ -5,15 +5,15 @@
from plotly import tools

from envs.AcessControlQueueEnv import AccessControlQueueTimeLimit, AccessControlQueue
from features.TileCoding import IHT
from utils import Algorithm, randargmax, generate_episode, epsilon_probs, TilingValueFunction

np.random.seed(7)


class ValueFunction(TilingValueFunction):

def __init__(self, n_tilings, max_size, n_priorities, n_servers):
super().__init__(n_tilings, max_size)
super().__init__(n_tilings, IHT(max_size))
self.n_priorities = n_priorities - 1
self.n_servers = n_servers

9 changes: 9 additions & 0 deletions envs/__init__.py
@@ -0,0 +1,9 @@
from envs.AcessControlQueueEnv import *
from envs.BlackjackEnv import *
from envs.CliffWalkingEnv import *
from envs.DoubleQLearningEnv import *
from envs.GridWorldEnv import *
from envs.MazeEnv import *
from envs.RaceCarEnv import *
from envs.RandomWalkEnv import *
from envs.WindyGridWorldEnv import *
142 changes: 142 additions & 0 deletions sarsa-lambda.py
@@ -0,0 +1,142 @@
from collections import defaultdict

import numpy as np
import gym
from gym import Env

from features.TileCoding import IHT
from semi_gradient_sarsa_mountain_car import ValueFunction
from utils import Algorithm, generate_episode, epsilon_probs, randargmax, TilingFunctionCreator, Averager, \
GymEpisodeTaskFactory, AlgorithmFactory, plot_scatters_from_dict

N_TILINGS = 8


class ValueFunctionCreator(TilingFunctionCreator):
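# The algorithm builds both its value function and its eligibility trace through this creator;
# since both share the same IHT instance, a given (state, action) pair maps to the same tiles in each.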
def __init__(self, n_tilings: int, iht: IHT):
self.n_tilings = n_tilings
self.iht = iht

def create(self):
return ValueFunction(self.n_tilings, self.iht)


class SarsaLambda(Algorithm):
def __init__(self, env: Env, creator: TilingFunctionCreator, alpha=0.5 / N_TILINGS, lam=0.92, epsilon=0.0,
gamma=1.0):
self.env = env
self.value_func_creator = creator
self.value_function = creator.create()
self.alpha = alpha
self.lam = lam
self.epsilon = epsilon
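# epsilon defaults to 0: with the tile-coding weights initialised to zero the estimates are
# optimistic for Mountain Car (true returns are negative), so the greedy policy still explores.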
self.gamma = gamma
self.actions = np.arange(env.action_space.n)
self._reset()

def action(self, state):
if self.next_action is None:
return self._action(state)
else:
return self.next_action

def _reset(self):
self.e_trace = self.value_func_creator.create()
self.next_action = None

def _action(self, state):
greedy = self.greedy_action(state)
probs = epsilon_probs(greedy, self.actions, self.epsilon)
return np.random.choice(self.actions, p=probs)

def greedy_action(self, state):
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return randargmax(array)

def on_new_state(self, state, action, reward, next_state, done):
if not done:
self.next_action = self._action(next_state)
q = self.value_function.estimated(state, action)
q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
delta = reward + self.gamma * q_next - q
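# Sarsa(lambda) with replacing traces: the tiles active for (state, action) get a trace of 1,
# every weight is then moved by alpha * delta along its trace, and finally the whole trace
# decays by gamma * lambda before the next step.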
self.e_trace[state, action] = 1
self.value_function[:, :] += self.alpha * delta * self.e_trace[:, :]
self.e_trace[:, :] *= self.gamma * self.lam
if done:
self._reset()


class TrueOnlineSarsaLambda(SarsaLambda):
def _reset(self):
super()._reset()
self.q_old = 0

def on_new_state(self, state, action, reward, next_state, done):
# Note: value_function.x(...) and e_trace.x(...) return the same values since they use the same IHT
if not done:
self.next_action = self._action(next_state)
q = self.value_function.estimated(state, action)
q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
x = self.value_function.x(state, action)
delta = reward + self.gamma * q_next - q
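# Dutch trace: decay by gamma * lambda, then bump the active tiles by
# 1 - alpha * gamma * lambda * e(state, action). The weight update is the usual
# alpha * delta * e step plus (q - q_old) correction terms that account for the
# weights having changed since the previous estimate of q.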
self.e_trace[:, :] *= self.gamma * self.lam
self.e_trace[state, action] += 1 - self.alpha * self.gamma * self.lam * self.e_trace.estimated(state, action)
q_delta = q - self.q_old
self.value_function[:, :] += self.alpha * (delta + q_delta) * self.e_trace[:, :] - self.alpha * q_delta * x
self.q_old = q_next
if done:
self._reset()


class SarsaLambdaFactory(AlgorithmFactory):
def __init__(self, env: Env):
self.env = env

def create(self, lam, alpha) -> Algorithm:
return SarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)


class TrueOnlineSarsaLambdaFactory(AlgorithmFactory):
def __init__(self, env: Env):
self.env = env

def create(self, lam, alpha) -> Algorithm:
return TrueOnlineSarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)


def average_steps_per_episode(results, n_avg):
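# Assuming results arrives as one row per run and one column per episode: take the mean
# number of steps per episode within each run, then average those means over the n_avg runs.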
tmp = np.mean(results, axis=1)
return np.sum(tmp, axis=0) / n_avg


def perform_lambda_test(n_episodes, n_avg):
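# Sweeps lambda and alpha for Sarsa(lambda) on Mountain Car, along the lines of the
# Sutton & Barto steps-per-episode comparison across several lambdas.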
averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
alphas = np.arange(1, 15) / N_TILINGS  # These are divided by N_TILINGS again in the factory to give the final alpha value
results = defaultdict(lambda: np.zeros(len(alphas)))
for lam in [0, .68, .84, .92, .96, .98, .99]:
for i, alpha in np.ndenumerate(alphas):
results[lam][i] = averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)
plot_scatters_from_dict(results, 'lambda={}', alphas)


def perform_sarsa_lambda_comparison(n_episodes, n_avg):
alphas = np.arange(0.2, 2.2, 0.2)  # These are divided by N_TILINGS in the factory to give the final alpha value
lam = 0.84
results = defaultdict(lambda: np.zeros(len(alphas)))
averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
for i, alpha in np.ndenumerate(alphas):
results['Sarsa(Lam) with replacing traces'][i] = -averager.average((lam, alpha), n_avg,
merge=average_steps_per_episode)

averager = Averager(GymEpisodeTaskFactory(env, n_episodes, TrueOnlineSarsaLambdaFactory(env)))
for i, alpha in np.ndenumerate(alphas):
results['True Online Sarsa(Lam)'][i] = -averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)

plot_scatters_from_dict(results, '{}', alphas)


if __name__ == '__main__':
env = gym.make('MountainCar-v0')
env._max_episode_steps = int(3e3)
# perform_lambda_test(n_episodes=50, n_avg=40)
perform_sarsa_lambda_comparison(n_episodes=20, n_avg=100)
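
For reference, the update in TrueOnlineSarsaLambda.on_new_state is easier to compare with the textbook form when written against a plain dense binary feature vector. The sketch below is not part of the commit; every name in it (n_features, true_online_update, and the constants) is illustrative only, and it assumes a linear value function q(s, a) = w . x(s, a).

import numpy as np

alpha, gamma, lam = 0.5 / 8, 1.0, 0.92   # illustrative constants
n_features = 4096

w = np.zeros(n_features)   # linear value-function weights
e = np.zeros(n_features)   # dutch-style eligibility trace
q_old = 0.0

def true_online_update(x, reward, x_next, done):
    """One true online Sarsa(lambda) step; x and x_next are binary feature vectors."""
    global w, e, q_old
    q = w @ x
    q_next = 0.0 if done else w @ x_next
    delta = reward + gamma * q_next - q
    # Dutch trace: decay, then add the current features with a correction term.
    e = gamma * lam * e + (1.0 - alpha * gamma * lam * (e @ x)) * x
    # Usual alpha * delta * e step plus (q - q_old) terms that account for the
    # weights having moved since q_old was computed.
    w += alpha * (delta + q - q_old) * e - alpha * (q - q_old) * x
    q_old = q_next
    if done:   # start the next episode with a clean trace
        e[:] = 0.0
        q_old = 0.0

The class version performs the same arithmetic, except that w, e and x live inside TilingValueFunction objects indexed through the shared IHT.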
57 changes: 20 additions & 37 deletions semi_gradient_sarsa_mountain_car.py
@@ -12,7 +12,7 @@
from joblib import Parallel, delayed
from multiprocessing import cpu_count

from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size
from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size, TilingValueFunction

POSITION_MIN = -1.2
POSITION_MAX = 0.6
@@ -25,32 +25,15 @@
EPSILON = 0


class TilingValueFunction:
def __init__(self, n_tilings=N_TILINGS, max_size=MAX_SIZE):
self.iht = IHT(MAX_SIZE)
self.n_tilings = n_tilings
self.weights = np.zeros((max_size,))
self.position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
self.velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)

def _idx(self, item):
position, velocity, action = item
return tiles(self.iht, self.n_tilings,
[self.position_scale * position, self.velocity_scale * velocity],
[action])

def __getitem__(self, item):
position, _, _ = item
if position >= POSITION_GOAL:
return np.zeros(1)
else:
return self.weights[self._idx(item)]

def estimated(self, item):
return self[item].sum()
class ValueFunction(TilingValueFunction):
def __init__(self, n_tilings: int, iht: IHT):
super().__init__(n_tilings, iht)

def __setitem__(self, key, value):
self.weights[self._idx(key)] = value
def scaled_values(self, state):
position, velocity = state
position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)
return [position * position_scale, velocity * velocity_scale]


class SemiGradientSarsa(Algorithm):
@@ -81,16 +81,16 @@ def _action(self, state):
return np.random.choice(self.actions, p=self._probs(state))

def greedy_action(self, state):
array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return np.argmax(array)

def on_new_state(self, state, action, reward, next_state, done):
self.next_action = self._action(next_state)
q_next = self.value_function.estimated((*next_state, self.next_action))
q = self.value_function.estimated((*state, action))
q_next = self.value_function.estimated(next_state, self.next_action)
q = self.value_function.estimated(state, action)
delta = reward + self.gamma * q_next - q
update = self.alpha * delta
self.value_function[(*state, action)] += update
self.value_function[state, action] += update
if done:
self.next_action = None

@@ -138,7 +121,7 @@ def get_entry(self, t):

def _get_key(self, t):
entry = self.get_entry(t)
return (*entry.state, entry.action)
return entry.state, entry.action

def action(self, state):
if self.t > 0:
@@ -158,7 +141,7 @@ def _prob(self, action, greedy):
return epsilon_prob(greedy, action, len(self.actions), self.epsilon)

def greedy_action(self, state):
array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return randargmax(array)

def calc_returns(self, update_time):
@@ -182,8 +165,8 @@ def on_new_state(self, state, action, reward, next_state, done):
returns = self.calc_returns(update_time)
not_last_state = update_time + self.n < self.T
if not_last_state:
returns += pow(self.gamma, self.n) * self.value_function.estimated(key_t_plus_n)
self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(key_t))
returns += pow(self.gamma, self.n) * self.value_function.estimated(*key_t_plus_n)
self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(*key_t))
self.t += 1
if done and update_time != self.T - 1:
self.on_new_state(state, action, reward, next_state, done)
@@ -259,15 +242,15 @@ def __init__(self, env):
self.env = env

def __call__(self, alpha):
return SemiGradientSarsa(self.env, TilingValueFunction(), alpha)
return SemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), alpha)


class GimmeNStepSarsa:
def __init__(self, env):
self.env = env

def __call__(self, alpha, n):
return NStepSemiGradientSarsa(self.env, TilingValueFunction(), n, alpha)
return NStepSemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), n, alpha)


def plot_value_function_using_plotly(value_function):
@@ -326,7 +309,7 @@ def plot_n_step_sarsa_by_alpha_and_n(env):

plot_n_step_sarsa_by_alpha_and_n(env)

# value_function = TilingValueFunction(N_TILINGS)
# value_function = ValueFunction(N_TILINGS, IHT(MAX_SIZE))
# for i in range(100):
# # steps = generate_episode(env, NStepSemiGradientSarsa(env, value_function, 8, 0.5 / N_TILINGS))
# steps = generate_episode(env, SemiGradientSarsa(env, value_function, 0.5 / N_TILINGS))
