In this scenario, I present a box observation with numbers 0, 1 or 2 and shape (1, 10). The odds for 0 and 2 are 2% each, and 96% for 1. I want the model to learn to pick the index of any 2 that comes. If it doesn't have a 2, just choose 0.
Below is my code:
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
# Number of cells in each observation row, and therefore the number of actions.
action_length = 10


class TestBot(gym.Env):
    """Toy environment: point at a cell that currently holds a 2.

    Each observation is a ``(1, action_length)`` float32 array whose cells
    are drawn independently: 0 with probability 0.02, 2 with probability
    0.02 and 1 with probability 0.96.  An action is a column index.
    Index 0 is the "pass" action and always yields a reward of 0; any other
    index yields +1 if that cell of the observation the agent last received
    was a 2, and -1 otherwise.  An episode lasts ``EPISODE_LENGTH`` steps.
    """

    # Number of steps after which ``step`` reports ``done``.
    EPISODE_LENGTH = 4096

    def __init__(self):
        super().__init__()
        self.total_rewards = 0
        # Defined here (not only in reset) so generate_next_obs() is safe on
        # a freshly constructed instance.
        self.best_reward = 0
        self.time = 0
        # Both are populated by reset().
        self.observation = None
        self.last_observation = None
        self.action_space = spaces.Discrete(action_length)
        self.observation_space = spaces.Box(
            low=0, high=2, shape=(1, action_length), dtype=np.float32
        )

    def generate_next_obs(self):
        """Draw a new observation and count it if it offers a reward."""
        values = [0, 2, 1]
        probabilities = [0.02, 0.02, 0.96]
        sample = np.random.choice(
            values, size=(1, action_length), p=probabilities
        )
        # np.random.choice yields an integer array; cast it to the dtype
        # declared in observation_space.
        self.observation = sample.astype(np.float32)
        # Index 0 never pays out, so only a 2 at index >= 1 is an opportunity.
        if 2 in self.observation[0][1:]:
            self.best_reward += 1

    def reset(self):
        """Start a new episode and return its first observation.

        When the previous episode made at least one step, its reward total
        and the number of observations that offered a reward are printed
        first.
        """
        if self.time != 0:
            print('Total rewards: ', self.total_rewards, 'Best possible rewards: ', self.best_reward)
        self.best_reward = 0
        self.time = 0
        self.generate_next_obs()
        self.total_rewards = 0
        self.last_observation = self.observation
        return self.observation

    def step(self, action):
        """Score ``action`` against the last observation and advance.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is 0 for
            action 0, +1 for pointing at a 2 and -1 for any other non-zero
            action, and ``done`` becomes True after ``EPISODE_LENGTH`` steps.
        """
        reward = 0
        if action != 0:
            last_value = self.last_observation[0][action]
            reward = 1 if last_value == 2 else -1
        self.time += 1
        self.generate_next_obs()
        done = self.time == self.EPISODE_LENGTH
        info = {}
        self.last_observation = self.observation
        self.total_rewards += reward
        return self.observation, reward, done, info
For training, I used the following:
# Hand make_vec_env the environment class rather than a lambda closing over
# the `env` name that is being rebound: each vectorized slot then gets its
# own freshly constructed TestBot.
env = make_vec_env(TestBot, n_envs=1)
model = PPO('MlpPolicy', env, verbose=0)
iters = 0
# Train indefinitely in chunks of 4096 timesteps (the same length as one
# TestBot episode); interrupt the process to stop.
while True:
    iters += 1
    model.learn(total_timesteps=4096, reset_num_timesteps=True)
PPO gave the best result, which wasn't so great. It learned to earn positive rewards, but it took a long time and got stuck at a point far from optimal.
How can I improve the learning of this scenario?
I managed to solve my problem by tuning the PPO parameters.
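I had to change the following parameters: gamma (set to 0.0), gae_lambda (set to 0.65), and clip_range (replaced with a schedule that steps down from 0.2 to 0.0 as the training iterations advance).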
I also made a small modification to the environment to penalize more heavily the missed opportunity of picking the index of a 2, but that was done just to accelerate the training.
The following is my final code:
# Hand make_vec_env the environment class rather than a lambda closing over
# the `env` name that is being rebound: each vectorized slot then gets its
# own freshly constructed TestBot.
env = make_vec_env(TestBot, n_envs=1)
# Count of training rounds started so far; the training loop increments it
# and the clip-range schedule reads it.
iters = 0
def clip_range_schedule():
    """Build a PPO clip-range schedule driven by the global ``iters`` counter.

    Returns:
        A callable taking one argument (ignored) and returning the clip
        range for the current value of the module-level ``iters``:
        0.2 while ``iters <= 6``, 0.1 for 7..12, 0.05 for 13..20 and
        0.0 once ``iters > 20``.
    """
    # (exclusive lower bound on iters, clip range), highest bound first.
    tiers = ((20, 0.0), (12, 0.05), (6, 0.1))

    def real_clip_range(progress):
        # `progress` is deliberately unused: the step-down depends on how
        # many training rounds have run, tracked in the global `iters`.
        for lower_bound, clip_range in tiers:
            if iters > lower_bound:
                return clip_range
        return 0.2

    return real_clip_range
# gamma=0.0 makes the agent value only the immediate reward.
# NOTE(review): with gamma=0.0 the GAE recursion multiplies gae_lambda by
# zero, so gae_lambda=0.65 should have no effect here -- confirm.
model = PPO(
    'MlpPolicy',
    env,
    verbose=0,
    gamma=0.0,
    gae_lambda=0.65,
    clip_range=clip_range_schedule(),
)
# Train indefinitely in chunks of 4096 timesteps; interrupt the process to
# stop. `iters` is bumped before each chunk because the clip-range schedule
# reads it to decide how tightly to clip.
while True:
    iters += 1
    model.learn(total_timesteps=4096, reset_num_timesteps=True)