
Sarsa Lambda and True Online Sarsa Lambda done
adik993 committed Dec 29, 2017
1 parent 2e52714 commit b282030
Showing 7 changed files with 389 additions and 150 deletions.
4 changes: 2 additions & 2 deletions differential_semi_gradient_sarsa_access_control.py
@@ -5,15 +5,15 @@
from plotly import tools

from envs.AcessControlQueueEnv import AccessControlQueueTimeLimit, AccessControlQueue
from features.TileCoding import IHT
from utils import Algorithm, randargmax, generate_episode, epsilon_probs, TilingValueFunction

np.random.seed(7)


class ValueFunction(TilingValueFunction):

def __init__(self, n_tilings, max_size, n_priorities, n_servers):
super().__init__(n_tilings, max_size)
super().__init__(n_tilings, IHT(max_size))
self.n_priorities = n_priorities - 1
self.n_servers = n_servers

9 changes: 9 additions & 0 deletions envs/__init__.py
@@ -0,0 +1,9 @@
from envs.AcessControlQueueEnv import *
from envs.BlackjackEnv import *
from envs.CliffWalkingEnv import *
from envs.DoubleQLearningEnv import *
from envs.GridWorldEnv import *
from envs.MazeEnv import *
from envs.RaceCarEnv import *
from envs.RandomWalkEnv import *
from envs.WindyGridWorldEnv import *
142 changes: 142 additions & 0 deletions sarsa-lambda.py
@@ -0,0 +1,142 @@
from collections import defaultdict

import numpy as np
import gym
from gym import Env

from features.TileCoding import IHT
from semi_gradient_sarsa_mountain_car import ValueFunction
from utils import Algorithm, generate_episode, epsilon_probs, randargmax, TilingFunctionCreator, Averager, \
GymEpisodeTaskFactory, AlgorithmFactory, plot_scatters_from_dict

N_TILINGS = 8


class ValueFunctionCreator(TilingFunctionCreator):
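# The algorithm builds both its value function and its eligibility trace through this creator;
# since both share the same IHT instance, a given (state, action) pair maps to the same tiles in each.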
def __init__(self, n_tilings: int, iht: IHT):
self.n_tilings = n_tilings
self.iht = iht

def create(self):
return ValueFunction(self.n_tilings, self.iht)


class SarsaLambda(Algorithm):
def __init__(self, env: Env, creator: TilingFunctionCreator, alpha=0.5 / N_TILINGS, lam=0.92, epsilon=0.0,
gamma=1.0):
self.env = env
self.value_func_creator = creator
self.value_function = creator.create()
self.alpha = alpha
self.lam = lam
self.epsilon = epsilon
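# epsilon defaults to 0: with the tile-coding weights initialised to zero the estimates are
# optimistic for Mountain Car (true returns are negative), so the greedy policy still explores.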
self.gamma = gamma
self.actions = np.arange(env.action_space.n)
self._reset()

def action(self, state):
if self.next_action is None:
return self._action(state)
else:
return self.next_action

def _reset(self):
self.e_trace = self.value_func_creator.create()
self.next_action = None

def _action(self, state):
greedy = self.greedy_action(state)
probs = epsilon_probs(greedy, self.actions, self.epsilon)
return np.random.choice(self.actions, p=probs)

def greedy_action(self, state):
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return randargmax(array)

def on_new_state(self, state, action, reward, next_state, done):
if not done:
self.next_action = self._action(next_state)
q = self.value_function.estimated(state, action)
q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
delta = reward + self.gamma * q_next - q
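# Sarsa(lambda) with replacing traces: the tiles active for (state, action) get a trace of 1,
# every weight is then moved by alpha * delta along its trace, and finally the whole trace
# decays by gamma * lambda before the next step.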
self.e_trace[state, action] = 1
self.value_function[:, :] += self.alpha * delta * self.e_trace[:, :]
self.e_trace[:, :] *= self.gamma * self.lam
if done:
self._reset()


class TrueOnlineSarsaLambda(SarsaLambda):
def _reset(self):
super()._reset()
self.q_old = 0

def on_new_state(self, state, action, reward, next_state, done):
# Note: value_function.x(...) and e_trace.x(...) return the same values since they use the same IHT
if not done:
self.next_action = self._action(next_state)
q = self.value_function.estimated(state, action)
q_next = 0 if done else self.value_function.estimated(next_state, self.next_action)
x = self.value_function.x(state, action)
delta = reward + self.gamma * q_next - q
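# Dutch trace: decay by gamma * lambda, then bump the active tiles by
# 1 - alpha * gamma * lambda * e(state, action). The weight update is the usual
# alpha * delta * e step plus (q - q_old) correction terms that account for the
# weights having changed since the previous estimate of q.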
self.e_trace[:, :] *= self.gamma * self.lam
self.e_trace[state, action] += 1 - self.alpha * self.gamma * self.lam * self.e_trace.estimated(state, action)
q_delta = q - self.q_old
self.value_function[:, :] += self.alpha * (delta + q_delta) * self.e_trace[:, :] - self.alpha * q_delta * x
self.q_old = q_next
if done:
self._reset()


class SarsaLambdaFactory(AlgorithmFactory):
def __init__(self, env: Env):
self.env = env

def create(self, lam, alpha) -> Algorithm:
return SarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)


class TrueOnlineSarsaLambdaFactory(AlgorithmFactory):
def __init__(self, env: Env):
self.env = env

def create(self, lam, alpha) -> Algorithm:
return TrueOnlineSarsaLambda(self.env, ValueFunctionCreator(N_TILINGS, IHT(4096)), lam=lam, alpha=alpha / N_TILINGS)


def average_steps_per_episode(results, n_avg):
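# Assuming results arrives as one row per run and one column per episode: take the mean
# number of steps per episode within each run, then average those means over the n_avg runs.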
tmp = np.mean(results, axis=1)
return np.sum(tmp, axis=0) / n_avg


def perform_lambda_test(n_episodes, n_avg):
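# Sweeps lambda and alpha for Sarsa(lambda) on Mountain Car, along the lines of the
# Sutton & Barto steps-per-episode comparison across several lambdas.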
averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
alphas = np.arange(1, 15) / N_TILINGS  # These are divided by N_TILINGS again in the factory to give the final alpha value
results = defaultdict(lambda: np.zeros(len(alphas)))
for lam in [0, .68, .84, .92, .96, .98, .99]:
for i, alpha in np.ndenumerate(alphas):
results[lam][i] = averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)
plot_scatters_from_dict(results, 'lambda={}', alphas)


def perform_sarsa_lambda_comparison(n_episodes, n_avg):
alphas = np.arange(0.2, 2.2, 0.2)  # These are divided by N_TILINGS in the factory to give the final alpha value
lam = 0.84
results = defaultdict(lambda: np.zeros(len(alphas)))
averager = Averager(GymEpisodeTaskFactory(env, n_episodes, SarsaLambdaFactory(env)))
for i, alpha in np.ndenumerate(alphas):
results['Sarsa(Lam) with replacing traces'][i] = -averager.average((lam, alpha), n_avg,
merge=average_steps_per_episode)

averager = Averager(GymEpisodeTaskFactory(env, n_episodes, TrueOnlineSarsaLambdaFactory(env)))
for i, alpha in np.ndenumerate(alphas):
results['True Online Sarsa(Lam)'][i] = -averager.average((lam, alpha), n_avg, merge=average_steps_per_episode)

plot_scatters_from_dict(results, '{}', alphas)


if __name__ == '__main__':
env = gym.make('MountainCar-v0')
env._max_episode_steps = int(3e3)
# perform_lambda_test(n_episodes=50, n_avg=40)
perform_sarsa_lambda_comparison(n_episodes=20, n_avg=100)
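
For reference, the update in TrueOnlineSarsaLambda.on_new_state is easier to compare with the textbook form when written against a plain dense binary feature vector. The sketch below is not part of the commit; every name in it (n_features, true_online_update, and the constants) is illustrative only, and it assumes a linear value function q(s, a) = w . x(s, a).

import numpy as np

alpha, gamma, lam = 0.5 / 8, 1.0, 0.92   # illustrative constants
n_features = 4096

w = np.zeros(n_features)   # linear value-function weights
e = np.zeros(n_features)   # dutch-style eligibility trace
q_old = 0.0

def true_online_update(x, reward, x_next, done):
    """One true online Sarsa(lambda) step; x and x_next are binary feature vectors."""
    global w, e, q_old
    q = w @ x
    q_next = 0.0 if done else w @ x_next
    delta = reward + gamma * q_next - q
    # Dutch trace: decay, then add the current features with a correction term.
    e = gamma * lam * e + (1.0 - alpha * gamma * lam * (e @ x)) * x
    # Usual alpha * delta * e step plus (q - q_old) terms that account for the
    # weights having moved since q_old was computed.
    w += alpha * (delta + q - q_old) * e - alpha * (q - q_old) * x
    q_old = q_next
    if done:   # start the next episode with a clean trace
        e[:] = 0.0
        q_old = 0.0

The class version performs the same arithmetic, except that w, e and x live inside TilingValueFunction objects indexed through the shared IHT.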
57 changes: 20 additions & 37 deletions semi_gradient_sarsa_mountain_car.py
@@ -12,7 +12,7 @@
from joblib import Parallel, delayed
from multiprocessing import cpu_count

from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size
from utils import epsilon_prob, randargmax, Algorithm, calc_batch_size, TilingValueFunction

POSITION_MIN = -1.2
POSITION_MAX = 0.6
@@ -25,32 +25,15 @@
EPSILON = 0


class TilingValueFunction:
def __init__(self, n_tilings=N_TILINGS, max_size=MAX_SIZE):
self.iht = IHT(MAX_SIZE)
self.n_tilings = n_tilings
self.weights = np.zeros((max_size,))
self.position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
self.velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)

def _idx(self, item):
position, velocity, action = item
return tiles(self.iht, self.n_tilings,
[self.position_scale * position, self.velocity_scale * velocity],
[action])

def __getitem__(self, item):
position, _, _ = item
if position >= POSITION_GOAL:
return np.zeros(1)
else:
return self.weights[self._idx(item)]

def estimated(self, item):
return self[item].sum()
class ValueFunction(TilingValueFunction):
def __init__(self, n_tilings: int, iht: IHT):
super().__init__(n_tilings, iht)

def __setitem__(self, key, value):
self.weights[self._idx(key)] = value
def scaled_values(self, state):
position, velocity = state
position_scale = self.n_tilings / (POSITION_MAX - POSITION_MIN)
velocity_scale = self.n_tilings / (VELOCITY_MAX - VELOCITY_MIN)
return [position * position_scale, velocity * velocity_scale]


class SemiGradientSarsa(Algorithm):
@@ -81,16 +81,16 @@ def _action(self, state):
return np.random.choice(self.actions, p=self._probs(state))

def greedy_action(self, state):
array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return np.argmax(array)

def on_new_state(self, state, action, reward, next_state, done):
self.next_action = self._action(next_state)
q_next = self.value_function.estimated((*next_state, self.next_action))
q = self.value_function.estimated((*state, action))
q_next = self.value_function.estimated(next_state, self.next_action)
q = self.value_function.estimated(state, action)
delta = reward + self.gamma * q_next - q
update = self.alpha * delta
self.value_function[(*state, action)] += update
self.value_function[state, action] += update
if done:
self.next_action = None

@@ -138,7 +121,7 @@ def get_entry(self, t):

def _get_key(self, t):
entry = self.get_entry(t)
return (*entry.state, entry.action)
return entry.state, entry.action

def action(self, state):
if self.t > 0:
@@ -158,7 +141,7 @@ def _prob(self, action, greedy):
return epsilon_prob(greedy, action, len(self.actions), self.epsilon)

def greedy_action(self, state):
array = np.array([self.value_function.estimated((*state, action)) for action in self.actions])
array = np.array([self.value_function.estimated(state, action) for action in self.actions])
return randargmax(array)

def calc_returns(self, update_time):
@@ -182,8 +165,8 @@ def on_new_state(self, state, action, reward, next_state, done):
returns = self.calc_returns(update_time)
not_last_state = update_time + self.n < self.T
if not_last_state:
returns += pow(self.gamma, self.n) * self.value_function.estimated(key_t_plus_n)
self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(key_t))
returns += pow(self.gamma, self.n) * self.value_function.estimated(*key_t_plus_n)
self.value_function[key_t] += self.alpha * (returns - self.value_function.estimated(*key_t))
self.t += 1
if done and update_time != self.T - 1:
self.on_new_state(state, action, reward, next_state, done)
@@ -259,15 +242,15 @@ def __init__(self, env):
self.env = env

def __call__(self, alpha):
return SemiGradientSarsa(self.env, TilingValueFunction(), alpha)
return SemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), alpha)


class GimmeNStepSarsa:
def __init__(self, env):
self.env = env

def __call__(self, alpha, n):
return NStepSemiGradientSarsa(self.env, TilingValueFunction(), n, alpha)
return NStepSemiGradientSarsa(self.env, ValueFunction(N_TILINGS, IHT(MAX_SIZE)), n, alpha)


def plot_value_function_using_plotly(value_function):
@@ -326,7 +309,7 @@ def plot_n_step_sarsa_by_alpha_and_n(env):

plot_n_step_sarsa_by_alpha_and_n(env)

# value_function = TilingValueFunction(N_TILINGS)
# value_function = ValueFunction(N_TILINGS, IHT(MAX_SIZE))
# for i in range(100):
# # steps = generate_episode(env, NStepSemiGradientSarsa(env, value_function, 8, 0.5 / N_TILINGS))
# steps = generate_episode(env, SemiGradientSarsa(env, value_function, 0.5 / N_TILINGS))
