I am trying to implement the A2C algorithm on the CartPole environment using Stable-Baselines3. Training appears to succeed, with the required reward achieved, but when I run the trained model myself the reward is much lower. Here is my code. What am I doing wrong?
import os
import gymnasium as gym
import numpy as np
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
env_id = "CartPole-v1"
env = gym.make(env_id)
s_size = env.observation_space.shape
a_size = env.action_space.n  # number of discrete actions
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation
_____OBSERVATION SPACE_____

The State Space is:  (4,)
Sample observation [-2.3014314e+00 4.4097112e+37 -4.1089469e-01 2.7118910e+38]
envs = make_vec_env(env_id,seed=1, n_envs=4)
envs = VecNormalize(envs, norm_obs=True, norm_reward=True, clip_obs=10.)
model = A2C(policy = "MlpPolicy",env = envs, verbose=1)
model.learn(15_000)
After this step I save the model and the VecNormalize statistics, and reload them for evaluation:
model.save("a2c-"+env_id)
envs.save("vec_normalize.pkl")
When I load the saved model for evaluation, it produces a mean reward of 500:
# Load the saved statistics
eval_env = DummyVecEnv([lambda: gym.make(env_id)])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)
# We need to override the render_mode
eval_env.render_mode = "rgb_array"
# do not update them at test time
eval_env.training = False
# reward normalization is not needed at test time
eval_env.norm_reward = False
# Load the agent
model = A2C.load("a2c-"+env_id)
mean_reward, std_reward = evaluate_policy(model, eval_env)
print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
Mean reward = 500.00 +/- 0.00
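For reference, evaluate_policy is called here with its default settings; as far as I know these are 10 evaluation episodes and deterministic actions, so the call above should be equivalent to the more explicit form below (the parameter values are just the SB3 defaults made visible, not something I changed):

mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,   # default number of evaluation episodes
    deterministic=True,   # default: greedy actions instead of sampling
)
print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")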
However, when I try to test the model myself, the rewards are miserable
for epi in range(10):
    score = 0
    state = env.reset()[0]
    done = False
    while not done:
        a, _ = model.predict(state)
        state_, r, done, _, _ = env.step(a)
        score += r
        state = np.copy(state_)
        env.render()
    print(f"Episode {epi} score {score}")
env.close()
Episode 0 score 71.0
Episode 1 score 70.0
Episode 2 score 83.0
Episode 3 score 62.0
Episode 4 score 63.0
Episode 5 score 59.0
Episode 6 score 52.0
Episode 7 score 54.0
Episode 8 score 60.0
Episode 9 score 69.0
I was finally able to solve the problem. The model was trained on observations normalized by VecNormalize, so it has to be tested through the eval_env (which applies the saved normalization statistics) rather than through the raw environment:
images = []  # frames collected from render (optional)
for epi in range(10):
    obs = eval_env.reset()
    done = False
    score = 0
    img = eval_env.render(mode="rgb_array")
    images.append(img)
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        done = dones[0]
        score += rewards[0]
    print(score)
494.0
297.0
359.0
402.0
406.0
500.0
500.0
500.0
371.0
371.0
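For completeness: the raw-env loop from the question should also give sensible rewards if the observations are normalized with the saved statistics before being passed to the model. Below is a minimal sketch, assuming the a2c-CartPole-v1 model and vec_normalize.pkl saved above; it relies on VecNormalize.normalize_obs, which (to my understanding) applies the stored running mean/std without updating it:

import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

env_id = "CartPole-v1"
env = gym.make(env_id)

# Load the saved normalization statistics and the trained model
stats = VecNormalize.load("vec_normalize.pkl", DummyVecEnv([lambda: gym.make(env_id)]))
stats.training = False  # do not update the running statistics at test time
model = A2C.load("a2c-" + env_id)

obs, _ = env.reset()
done = False
score = 0
while not done:
    norm_obs = stats.normalize_obs(obs)          # apply the training-time normalization
    action, _ = model.predict(norm_obs, deterministic=True)
    obs, r, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    score += r
print(score)
env.close()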