Tags: python, pytorch, reinforcement-learning, gymnasium

Reinforcement Learning Gymnasium ValueError


I am testing out reinforcement learning for the first time with Gymnasium, following a YouTube tutorial.

I am getting the following error when I run the training loop:

ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 1.

Here is my training loop (I am getting the error at env.step):

import gymnasium as gym

env = gym.make('LunarLander-v2')

print(f"Action space: {env.action_space}")
print(f"Observation space: {env.observation_space}")

import numpy as np

num_games = 250
load_checkpoint = False

agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4,
                  input_dims=[8], n_actions=4, max_mem_size=100000, eps_min=0.01,
                  batch_size=64, eps_dec=1e-3)

if load_checkpoint:
      agent.load_models()

scores = []
eps_history = []
n_steps = 0

for i in range(num_games):
   done = False
   observation = env.reset()
   score = 0

   while not done:
      action = agent.choose_action(observation)
      print(env.step(action))
      observation_, reward, done, _, info = env.step(action)
      score += reward
      agent.store_transition(observation, action,
                              reward, observation_, int(done))
      agent.learn()

      observation = observation_

   scores.append(score)
   avg_score = np.mean(scores[max(0, i-100):(i+1)])
   print('episode: ', i,'score %.1f ' % score,
             ' average score %.1f' % avg_score,
            'epsilon %.2f' % agent.epsilon)
   if i > 0 and i % 10 == 0:
            agent.save_models()

   eps_history.append(agent.epsilon)

x = [i+1 for i in range(num_games)]

Here is my agent (where DeepQNetwork is the generic 3-layer FC network; a rough sketch of it is included after the class):

import torch as T

class Agent:
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_min=0.05, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0
        self.iter_cntr = 0
        self.replace_target = 100

        self.Q_eval = DeepQNetwork(lr, n_actions=n_actions,
                                   input_dims=input_dims,
                                   fc1_dims=256, fc2_dims=256)
        self.state_memory = np.zeros((self.mem_size, *input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation]).to(self.Q_eval.device)
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return

        self.Q_eval.optimizer.zero_grad()

        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(
                self.new_state_memory[batch]).to(self.Q_eval.device)
        action_batch = self.action_memory[batch]
        reward_batch = T.tensor(
                self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(
                self.terminal_memory[batch]).to(self.Q_eval.device)

        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
        q_next = self.Q_eval.forward(new_state_batch)
        q_next[terminal_batch] = 0.0

        q_target = reward_batch + self.gamma*T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min
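
For reference, DeepQNetwork is just the generic 3-layer fully connected network from the tutorial; roughly something like this (a sketch, not my exact code):

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DeepQNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()
        # three fully connected layers: state -> fc1 -> fc2 -> Q-values
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)  # raw Q-values, one per action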

Any and all help would be greatly appreciated.


Solution

  • Instead of doing:

    observation = env.reset()

    you should do:

    observation, _ = env.reset()

    In Gymnasium, env.reset() returns a tuple (observation, info). Assigning the whole tuple to observation means choose_action and store_transition receive a sequence instead of the 8-element state array, which is what raises the ValueError.
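
  • With that change, the training loop can look like this (a minimal sketch; it also calls env.step() only once per iteration and unpacks the terminated/truncated flags that step() returns):

    for i in range(num_games):
        done = False
        observation, _ = env.reset()   # reset() returns (observation, info)
        score = 0

        while not done:
            action = agent.choose_action(observation)
            # step() returns 5 values; calling it once per iteration avoids
            # advancing the environment twice for a single stored transition
            observation_, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            score += reward
            agent.store_transition(observation, action,
                                   reward, observation_, int(done))
            agent.learn()

            observation = observation_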