Tags: python, reinforcement-learning, ray, rllib, multi-agent-reinforcement-learning

Error Raised with SAC for Centralized Training, Decentralized Execution in Ray RLlib


I'm using a slight variant of the RockPaperScissors multi-agent environment from the Ray RLlib documentation as a test environment to verify that a custom RLModule for Centralized Training, Decentralized Execution (CTDE) works correctly.

To group the multi-agent environment into a single-agent environment, I'm using GroupAgentsWrapper. The custom RLModule for CTDE implements the required APIs for PPO, APPO, and SAC (i.e., ValueFunctionAPI, TargetNetworkAPI, QNetAPI).

The custom RLModule for CTDE works fine with the test environment for all three algorithms (PPO, APPO, and SAC) using ray.tune.Tuner.

However, when I use my real environment with the custom RLModule for CTDE, I get the following error:

ray/rllib/env/single_agent_episode.py, line 624, in concat_episode
    assert np.all(other.observations[0] == self.observations[-1])
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
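For what it's worth, the ambiguity error itself can be reproduced outside RLlib when the two observations being compared are tuples of multi-element arrays rather than plain arrays (a minimal standalone sketch, not RLlib code):

import numpy as np

# Two observations shaped like a Tuple(Box, Box) sample with multi-element Boxes.
prev_obs = (np.zeros(3, dtype=np.float32), np.ones(3, dtype=np.float32))
next_obs = (np.zeros(3, dtype=np.float32), np.ones(3, dtype=np.float32))

try:
    # Mirrors the failing check in concat_episode: the tuple comparison calls
    # bool() on an element-wise array comparison, which raises before np.all()
    # ever sees the result.
    assert np.all(next_obs == prev_obs)
except ValueError as err:
    print(err)  # The truth value of an array with more than one element is ambiguous...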

Just for debugging, I bypassed this assertion. The code then managed to train for a few iterations until the following error, associated with the replay buffer, appeared:

ray/rllib/utils/replay_buffers/prioritized_episode_buffer.py, line 471, in sample
    idx = self._sum_segment.find_prefixsum_idx(random_sum)
ray/rllib/execution/segment_tree.py, line 191, in find_prefixsum_idx
    assert 0 <= prefixsum <= self.sum() + 1e-5

I also tried modifying the replay_buffer_config in the algorithm configuration, as well as using a custom replay buffer, without success.
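For illustration, the kind of modification I mean looks like the following (the exact keys and values I tried varied; the capacity here is just a placeholder):

from ray.rllib.algorithms.sac import SACConfig

# Illustrative only: one variant of the replay_buffer_config override
# (the rest of the config with environment/module is shown further below).
config = SACConfig().training(
    replay_buffer_config={
        "type": "PrioritizedEpisodeReplayBuffer",  # also tried "EpisodeReplayBuffer"
        "capacity": 100_000,  # placeholder value
    }
)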

I think the error might be caused by the structure of the observation space. However, the structure is exactly the same in my real environment and in the test environment: the observation space of the multi-agent environment is gym.spaces.Dict({"agent1": Box, "agent2": Box}), and the observation space of the grouped single-agent environment is gym.spaces.Dict({"grouped": gym.spaces.Tuple(Box, Box)}). So I don't understand why the test environment passes the assertions but my real environment doesn't.
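To make the layout concrete, here is a sketch of the two space structures (the Box shapes and bounds are placeholders, not the real ones):

import gymnasium as gym
import numpy as np

# Per-agent observation space of the multi-agent environment (placeholder Boxes).
multi_agent_obs_space = gym.spaces.Dict({
    "agent1": gym.spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32),
    "agent2": gym.spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32),
})

# Observation space exposed by the grouped, single-agent environment.
grouped_obs_space = gym.spaces.Dict({
    "grouped": gym.spaces.Tuple((
        multi_agent_obs_space["agent1"],
        multi_agent_obs_space["agent2"],
    )),
})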

What could be happening, or what am I missing?

Thanks in advance; any help would be appreciated!

Here's the code for the RockPaperScissors multi-agent test environment and its grouped, single-agent version:

import gymnasium as gym
import numpy as np

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class RockPaperScissors(MultiAgentEnv):
    ROCK = 0
    PAPER = 1
    SCISSORS = 2

    WIN_MATRIX = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }

    def __init__(self, env_config=None):
        super().__init__()

        self.agents_id = ["player1", "player2"]
        self.agents = self.possible_agents = self.agents_id
        self.observation_spaces = self.action_spaces = gym.spaces.Dict({
            "player1": gym.spaces.Box(low=0, high=2, shape=(1,)),
            "player2": gym.spaces.Box(low=0, high=2, shape=(1,)),
        })
        self.num_moves = 0

    def reset(self, *, seed=None, options=None):
        self.num_moves = 0

        return {
            "player1": np.array([0.0], dtype=np.float32),
            "player2": np.array([0.0], dtype=np.float32),
        }, {}

    def step(self, action_dict):
        self.num_moves += 1

        move1 = int(action_dict["player1"].item())
        move2 = int(action_dict["player2"].item())

        observations = {
            "player1": np.array([move2], dtype=np.float32),
            "player2": np.array([move1], dtype=np.float32)
        }

        r1, r2 = self.WIN_MATRIX[move1, move2]

        rewards = {
            "player1": r1,
            "player2": r2
        }

        terminateds = {"__all__": bool(self.num_moves >= 10)}
        truncateds = {"__all__": bool(self.num_moves >= 10)}

        return observations, rewards, terminateds, truncateds, {}

class GroupedRockPaperScissors(MultiAgentEnv):
    def __init__(self, env_config=None):
        super().__init__()

        env = RockPaperScissors(env_config)

        _tuple_obs_space = self._dict_to_tuple_space(env.observation_spaces)
        _tuple_act_space = self._dict_to_tuple_space(env.action_spaces)

        self.env = env.with_agent_groups(
            groups={"grouped_agents": ["player1", "player2"]},
            obs_space=_tuple_obs_space, # spaces.Tuple(Box, Box)
            act_space=_tuple_act_space, # spaces.Tuple(Box, Box)
        )

        self.agents_id = ["grouped_agents"]
        self.agents = self.possible_agents = self.agents_id
        self.original_agents_id = env.agents_id

        self.observation_space = gym.spaces.Dict(
            {"grouped_agents": _tuple_obs_space} # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})
        )
        self.action_space = gym.spaces.Dict(
            {"grouped_agents": _tuple_act_space} # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})
        )

    def reset(self, *, seed=None, options=None):
        obs, infos = self.env.reset(seed=seed, options=options)
        grouped_obs = {k: tuple(v) for k, v in obs.items()}  # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})

        return grouped_obs, infos

    def step(self, action_dict):
        obs, rewards, terminateds, truncateds, infos = self.env.step(action_dict)
        grouped_obs = {k: tuple(v) for k, v in obs.items()}  # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})
        grouped_reward = sum(rewards.values())

        return grouped_obs, grouped_reward, terminateds["__all__"], truncateds["__all__"], infos

    @staticmethod
    def _dict_to_tuple_space(dict_space: gym.spaces.Dict) -> gym.spaces.Tuple:
        sorted_keys = sorted(dict_space.keys())
        tuple_of_spaces = tuple(dict_space[key] for key in sorted_keys)

        return gym.spaces.Tuple(tuple_of_spaces)
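
Not strictly needed (the SAC config below passes the environment class directly), but for reference the grouped test environment can also be registered under a string name; "grouped_rps" here is just an example name:

from ray.tune.registry import register_env

# Optional: register the grouped test env so it can be referenced as
# .environment("grouped_rps") instead of passing the class.
register_env("grouped_rps", lambda cfg: GroupedRockPaperScissors(cfg))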

Here's the SAC configuration:

from ray.rllib.algorithms.sac import SACConfig
from ray.rllib.core.rl_module.rl_module import RLModuleSpec

algo_config = (
    SACConfig()
    .environment(GroupedEnv, env_config={})
    .framework("torch")
    .rl_module(
        rl_module_spec=RLModuleSpec(
            module_class=CustomRLModuleCTDE,
            observation_space=GroupedEnv.observation_space,  # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})
            action_space=GroupedEnv.action_space,  # spaces.Dict({"grouped": spaces.Tuple(Box, Box)})
        )
    )
    .training(
        twin_q=True,
        replay_buffer_config={"type": "PrioritizedEpisodeReplayBuffer"},  # or "EpisodeReplayBuffer"
    )
    .evaluation(evaluation_config=SACConfig.overrides(explore=False))
)
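
As mentioned above, training is launched through ray.tune.Tuner; a minimal sketch of that call, with stopping criteria and checkpointing omitted:

from ray import tune

# Hand the SAC config to Tune and start training.
tuner = tune.Tuner(
    "SAC",
    param_space=algo_config,
)
results = tuner.fit()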

Solution

  • It seems the PrioritizedEpisodeReplayBuffer can't handle complex (Dict/Tuple) observations with the custom RLModule, in both the CTCE and CTDE setups. However, using EpisodeReplayBuffer (for CTCE/CTDE) or MultiAgentEpisodeReplayBuffer (for DTDE), together with replay_buffer_config={"replay_sequence_length": 1, "replay_burn_in": 0, "replay_zero_init_states": True} and env_runners(batch_mode="complete_episodes"), makes training work correctly.
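
For reference, the adjusted parts of the SAC configuration would look roughly like this (same GroupedEnv placeholder as above; only the changed settings are shown):

from ray.rllib.algorithms.sac import SACConfig

# Sketch of the adjustments described above.
algo_config = (
    SACConfig()
    .environment(GroupedEnv, env_config={})
    .env_runners(batch_mode="complete_episodes")
    .training(
        twin_q=True,
        replay_buffer_config={
            "type": "EpisodeReplayBuffer",  # MultiAgentEpisodeReplayBuffer for the DTDE setup
            "replay_sequence_length": 1,
            "replay_burn_in": 0,
            "replay_zero_init_states": True,
        },
    )
)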