-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Description
Question
I was running the Training an Agent tutorial from Gym's docs. The docs say that I should see the training error decreasing, but I instead got an increasing training error, so I wanted to know whether this is normal and the docs are just misworded, or whether I am having some issues.
I'll include everything I can to aid anyone who'd like to answer my question.
Agent Code:
from collections import defaultdict
import numpy as np
class BlackjackAgent:
    """Tabular Q-learning agent with epsilon-greedy exploration for Blackjack."""

    def __init__(
        self,
        env: "gym.Env",
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Store hyperparameters and start from an all-zero Q-table.

        Args:
            env: environment; its action space sizes each Q-value row.
            learning_rate: step size applied to the TD error in updates.
            initial_epsilon: starting exploration probability.
            epsilon_decay: amount subtracted from epsilon per decay step.
            final_epsilon: floor below which epsilon never falls.
            discount_factor: weight on bootstrapped future value (gamma).
        """
        self.env = env
        # Unseen observations lazily map to a zero vector, one entry per action.
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.lr = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon
        # Raw temporal-difference errors, one appended per update() call.
        self.training_error = []

    def get_action(self, obs: tuple[int, int, bool]) -> int:
        """Sample a random action with probability epsilon, else act greedily."""
        explore = np.random.random() < self.epsilon
        if explore:
            return self.env.action_space.sample()
        return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, bool],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, bool],
    ):
        """Apply one Q-learning step and record the TD error for diagnostics."""
        # Always index q_values[next_obs] (matches the defaultdict side effect),
        # but a terminal transition contributes no bootstrapped future value.
        best_next_value = np.max(self.q_values[next_obs])
        future_q_value = (not terminated) * best_next_value
        target = reward + self.discount_factor * future_q_value
        temporal_difference = target - self.q_values[obs][action]
        self.q_values[obs][action] += self.lr * temporal_difference
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Lower epsilon by one decay step, never dropping below final_epsilon."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

# Training Loop:
# Hyperparameters for tabular Q-learning on Blackjack.
learning_rate = 0.01
n_episodes = 100_000
start_epsilon = 1.0
# Linear schedule: exploration reaches its floor halfway through training.
epsilon_decay = start_epsilon / (n_episodes / 2)
final_epsilon = 0.1

env = gym.make("Blackjack-v1", sab=False)
# Wrapper records per-episode returns and lengths for plotting afterwards.
env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=n_episodes)
agent = BlackjackAgent(env, learning_rate, start_epsilon, epsilon_decay, final_epsilon)

from tqdm import tqdm

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False
    # Roll out one full episode, learning after every transition.
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        agent.update(obs, action, reward, terminated, next_obs)
        done = terminated or truncated
        obs = next_obs
    # One decay step per episode, not per transition.
    agent.decay_epsilon()

# And just in case the visualisation code:
from matplotlib import pyplot as plt
def get_moving_avgs(arr, window, convolution_mode):
    """Return the moving average of *arr* over *window* samples.

    Args:
        arr: array-like of values to smooth (flattened before convolving).
        window: number of samples per average.
        convolution_mode: forwarded to ``np.convolve`` ("valid", "same", "full").
    """
    values = np.array(arr).flatten()
    # Convolve with a ones kernel, then normalize — a sliding-window sum / window.
    windowed_sums = np.convolve(values, np.ones(window), mode=convolution_mode)
    return windowed_sums / window
# Smooth the noisy per-episode curves before plotting.
rolling_length = 500
fig, axs = plt.subplots(ncols=3, figsize=(12, 5))

# Panel 1: smoothed episodic return.
axs[0].set_title("Episode rewards")
avg_returns = get_moving_avgs(env.return_queue, rolling_length, "valid")
axs[0].plot(range(len(avg_returns)), avg_returns)
axs[0].set_ylabel("Average Reward")
axs[0].set_xlabel("Episode")

# Panel 2: smoothed episode length.
axs[1].set_title("Episode lengths")
avg_lengths = get_moving_avgs(env.length_queue, rolling_length, "valid")
axs[1].plot(range(len(avg_lengths)), avg_lengths)
axs[1].set_ylabel("Average Episode Length")
axs[1].set_xlabel("Episode")

# Panel 3: smoothed TD error, per update step rather than per episode.
axs[2].set_title("Training Error")
avg_td_error = get_moving_avgs(agent.training_error, rolling_length, "same")
axs[2].plot(range(len(avg_td_error)), avg_td_error)
axs[2].set_ylabel("Temporal Difference Error")
axs[2].set_xlabel("Step")

plt.tight_layout()
plt.show()

If you're a beginner and have basic questions, please ask on r/reinforcementlearning or in the RL Discord (if you're new please use the beginners channel). Basic questions that are not bugs or feature requests will be closed without reply, because GitHub issues are not an appropriate venue for these.
Advanced/nontrivial questions, especially in areas where documentation is lacking, are very much welcome.
