I am trying to use a custom boid flocking environment with Gymnasium and Stable Baselines3. I have a custom policy and training loop.
My action and observation spaces are defined as follows:
```python
min_action = np.array([-5, -5] * len(self.agents), dtype=np.float32)
max_action = np.array([5, 5] * len(self.agents), dtype=np.float32)
min_obs = np.array([-np.inf, -np.inf, -2.5, -2.5] * len(self.agents), dtype=np.float32)
max_obs = np.array([np.inf, np.inf, 2.5, 2.5] * len(self.agents), dtype=np.float32)
```
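These bounds are wrapped into `gymnasium.spaces.Box` objects along these lines (a minimal sketch; `num_agents` is a stand-in for `len(self.agents)`, purely for illustration):

```python
import numpy as np
from gymnasium import spaces

num_agents = 3  # stand-in for len(self.agents) in the real environment

min_action = np.array([-5, -5] * num_agents, dtype=np.float32)
max_action = np.array([5, 5] * num_agents, dtype=np.float32)
min_obs = np.array([-np.inf, -np.inf, -2.5, -2.5] * num_agents, dtype=np.float32)
max_obs = np.array([np.inf, np.inf, 2.5, 2.5] * num_agents, dtype=np.float32)

# Flat Box spaces covering all agents' actions/observations at once
action_space = spaces.Box(low=min_action, high=max_action, dtype=np.float32)
observation_space = spaces.Box(low=min_obs, high=max_obs, dtype=np.float32)
```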
Training Code:
```python
import numpy as np
import torch as th
from Parameters import *
from stable_baselines3 import PPO
from main import FlockingEnv, CustomMultiAgentPolicy
from Callbacks import TQDMProgressCallback, LossCallback
import os
from stable_baselines3.common.vec_env import DummyVecEnv

# Remove stale reward files from previous runs
if os.path.exists(Results["Rewards"]):
    os.remove(Results["Rewards"])
    print(f"File {Results['Rewards']} has been deleted.")

if os.path.exists("training_rewards.json"):
    os.remove("training_rewards.json")
    print("File training_rewards.json has been deleted.")

def seed_everything(seed):
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    th.manual_seed(seed)
    th.cuda.manual_seed(seed)
    th.backends.cudnn.deterministic = True
    env.seed(seed)             # env is the module-level DummyVecEnv defined below
    env.action_space.seed(seed)

loss_callback = LossCallback()
env = DummyVecEnv([lambda: FlockingEnv()])
seed_everything(SimulationVariables["Seed"])

# Model training
model = PPO(CustomMultiAgentPolicy, env, tensorboard_log="./ppo_Agents_tensorboard/", verbose=1)
model.set_random_seed(SimulationVariables["ModelSeed"])

progress_callback = TQDMProgressCallback(total_timesteps=SimulationVariables["LearningTimeSteps"])

# Train the model
model.learn(total_timesteps=SimulationVariables["LearningTimeSteps"], callback=[progress_callback, loss_callback])
```
Error:
```
Using cuda device
Traceback (most recent call last):
  File "D:\Thesis_\FlockingFinal\MultiAgentFlocking\Training.py", line 45, in <module>
    model.learn(total_timesteps=SimulationVariables["LearningTimeSteps"], callback=[progress_callback, loss_callback])
  File "C:\Python312\Lib\site-packages\stable_baselines3\ppo\ppo.py", line 315, in learn
    return super().learn(
           ^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 287, in learn
    total_timesteps, callback = self._setup_learn(
                                ^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\stable_baselines3\common\base_class.py", line 423, in _setup_learn
    self._last_obs = self.env.reset()  # type: ignore[assignment]
                     ^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\stable_baselines3\common\vec_env\dummy_vec_env.py", line 77, in reset
    obs, self.reset_infos[env_idx] = self.envs[env_idx].reset(seed=self._seeds[env_idx], **maybe_options)
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 2)
```
I used a similar seeding function with gym without any error. I thought the seeding might be causing this error, but it persists even when I remove it.
Stable Baselines3 follows the Gymnasium environment interface, which expects `reset()` to return a tuple: `(observation, info)`.
Original code:

```python
def reset(self, seed=None):
    # If a seed is provided, set it here
    if seed is not None:
        self.seed(seed)

    self.agents = [Agent(position) for position in self.read_agent_locations()]

    for agent in self.agents:
        agent.acceleration = np.zeros(2)
        agent.velocity = np.round(np.random.uniform(-SimulationVariables["VelocityUpperLimit"], SimulationVariables["VelocityUpperLimit"], size=2), 2)

    observation = self.get_observation().flatten()

    ################################
    self.current_timestep = 0  # Reset time step count
    ################################

    return observation
```
Error: `reset()` was returning only the observation. `DummyVecEnv` unpacks the return value as `(obs, info)`, so a bare observation array causes the unpacking error.
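The failure can be reproduced in isolation: `DummyVecEnv` effectively runs `obs, info = env.reset(...)`, and when the right-hand side is a flat NumPy array, Python tries to unpack its individual elements instead of an `(obs, info)` pair (standalone illustration, not your environment code):

```python
import numpy as np

# Stand-in for get_observation().flatten() being returned directly from reset()
flat_observation = np.zeros(12, dtype=np.float32)

try:
    obs, info = flat_observation  # what DummyVecEnv effectively does with the return value
except ValueError as exc:
    print(exc)  # too many values to unpack (expected 2)
```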
Fix: modify the `reset` method of your custom environment to return both the observation and an (optionally empty) info dictionary.
Reset function:

```python
def reset(self, seed=None, options=None):
    # Seed gymnasium.Env's RNG (self.np_random) before any randomness is used
    super().reset(seed=seed)

    # If a seed is provided, set it here as well
    if seed is not None:
        self.seed(seed)

    self.agents = [Agent(position) for position in self.read_agent_locations()]

    for agent in self.agents:
        agent.acceleration = np.zeros(2)
        agent.velocity = np.round(np.random.uniform(-SimulationVariables["VelocityUpperLimit"], SimulationVariables["VelocityUpperLimit"], size=2), 2)

    observation = self.get_observation().flatten()

    ################################
    self.current_timestep = 0  # Reset time step count
    ################################

    info = {}  # Extra information dictionary; populate it with useful info if needed
    return observation, info
```
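Once `reset` (and `step`) follow the Gymnasium API, a quick sanity check with Stable Baselines3's environment checker can catch any remaining interface mismatches before training (a minimal usage sketch, assuming `FlockingEnv` is importable from `main` as in the training script):

```python
from stable_baselines3.common.env_checker import check_env

from main import FlockingEnv

# Warns about any deviation from the Gymnasium API that SB3 expects,
# including reset() not returning an (observation, info) tuple.
check_env(FlockingEnv(), warn=True)
```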