I'm trying to implement the Monte Carlo method for solving Blackjack, following the Sutton and Barto approach. Everything seems correct, but when I look at the Q-table the values suggest sticking in far too many cases. I can't tell whether something is wrong in my approach or in my implementation.
The agent class, implementing the epsilon-greedy policy:
import numpy as np
from collections import defaultdict


class MonteCarloAgentEpsilonGreedy:
    def __init__(self, env, discount_factor, epsilon):
        self.env = env
        self.gamma = discount_factor
        self.epsilon = epsilon  # Exploration rate
        # Dictionaries storing Q-values, summed returns and visit counts per state
        self.Q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.Returns = defaultdict(lambda: np.zeros(env.action_space.n))
        self.N = defaultdict(lambda: np.zeros(env.action_space.n))

    def get_action(self, obs):
        """
        Get the action for a given observation (epsilon-greedy policy).

        Args:
            obs: The observation for which the action is to be determined.
        Returns:
            action: The action with the highest Q-value with probability
                1 - epsilon, otherwise a random action.
        """
        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()  # Explore: random action
        else:
            action = int(np.argmax(self.Q_values[obs]))  # Exploit: highest Q-value
        return action

    def update_Q_values(self, episode):
        """
        Update Q-values from a completed episode (every-visit Monte Carlo).

        Args:
            episode: List of (state, action, reward) tuples.
        """
        G = 0
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            self.Returns[state][action] += G
            self.N[state][action] += 1
            # Q-value is the average of all returns observed for (state, action)
            self.Q_values[state][action] = self.Returns[state][action] / self.N[state][action]
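For reference, this update is the every-visit form; Sutton and Barto state the algorithm in first-visit form, where only the first occurrence of each (state, action) pair in an episode contributes to its average. A minimal sketch of that variant, as a method on the same class and assuming the same Returns, N and Q_values attributes (the name update_Q_values_first_visit is not in the original code):

    def update_Q_values_first_visit(self, episode):
        """
        First-visit Monte Carlo update: only the first occurrence of each
        (state, action) pair in the episode contributes to its average.
        """
        G = 0
        first_visit_returns = {}
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            # Iterating backwards, later assignments correspond to earlier time
            # steps, so the value that survives is the return from the first visit.
            first_visit_returns[(state, action)] = G
        for (state, action), G in first_visit_returns.items():
            self.Returns[state][action] += G
            self.N[state][action] += 1
            self.Q_values[state][action] = self.Returns[state][action] / self.N[state][action]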
This is the main function:
import gymnasium as gym  # assuming the Gymnasium API (step() returns 5 values)

if __name__ == "__main__":
    env = gym.make('Blackjack-v1', natural=False, sab=False)

    # Create an instance of the MonteCarloAgent class
    agent = MonteCarloAgentEpsilonGreedy(env, discount_factor=0.9, epsilon=0.1)

    num_episodes = 1000000
    for e in range(num_episodes):
        episode = []
        terminated = False
        truncated = False
        # Choose initial state randomly
        observation, info = env.reset()

        while not terminated and not truncated:  # Loop for each episode
            action = agent.get_action(observation)
            next_obs, reward, terminated, truncated, info = env.step(action)
            episode.append((observation, action, reward))

        agent.update_Q_values(episode)

    env.close()
These are examples of Q-values that I'm obtaining for different observations:
(15, 10, 0): array([-0.57322077, -0.57813051])
(19, 3, 0):  array([ 0.39937642, -0.67754011])
(17, 10, 0): array([-0.45902484, -0.68447894])
(11, 8, 0):  array([-0.47658631, -0.47728385])
(12, 10, 0): array([-0.54324405, -0.5438698 ])
(20, 10, 0): array([ 0.44418773, -0.84017038])
(11, 10, 0): array([-0.54170763, -0.54247852])
(15, 3, 0):  array([-0.24095023, -0.49996364])
(18, 6, 0):  array([ 0.28397257, -0.6047619 ])
(20, 4, 0):  array([ 0.65904186, -0.87462687])
(13, 8, 0):  array([-0.50007986, -0.50656757])
(13, 6, 0):  array([-0.14338235, -0.38048843])
(17, 5, 0):  array([-0.03217932, -0.57848101])
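To see which action the learned Q-values actually prefer, a small helper can turn the Q-table into a stick/hit grid (0 = stick, 1 = hit in Blackjack-v1). greedy_policy_grid below is a hypothetical name, and it assumes the observation keys are (player sum, dealer card, usable ace) as in the printout above:

import numpy as np

def greedy_policy_grid(Q_values, usable_ace=0):
    """
    Greedy action (0 = stick, 1 = hit) for player sums 12-21 (rows)
    and dealer showing cards 1-10 (columns), read from the learned Q-values.
    """
    grid = np.zeros((10, 10), dtype=int)
    for i, player_sum in enumerate(range(12, 22)):
        for j, dealer_card in enumerate(range(1, 11)):
            # Unvisited states default to zeros in the defaultdict, so argmax gives 0 (stick)
            grid[i, j] = int(np.argmax(Q_values[(player_sum, dealer_card, usable_ace)]))
    return grid

# e.g. print(greedy_policy_grid(agent.Q_values, usable_ace=0))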
And this is the final Q-table, with the stick cases in blue and the hit cases in red:
There was a trivial error in the main: the state was never updated to the next state. The corrected main is:
if __name__ == "__main__":
    env = gym.make('Blackjack-v1', natural=False, sab=False)

    # Create an instance of the agent
    agent = QlearningAgent(env, discount_factor=1, exploration_rate=0.1, learning_rate=0.1)

    num_episodes = 1000000
    for e in range(num_episodes):
        terminated = False
        truncated = False
        # Choose initial state randomly
        obs, info = env.reset()

        while not (truncated or terminated):  # Loop for each episode
            action = agent.get_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)
            agent.update_Q_values(obs, action, reward, next_obs)
            obs = next_obs

        if e % 10000 == 0:
            print(f'Episode {e}/{num_episodes}')

    agent.plot_Q_values()
    env.close()
With this fix, the result is similar to the one shown in the book.
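For reference, the same fix can be kept with the original MonteCarloAgentEpsilonGreedy instead of switching to a Q-learning agent: the only change is adding the observation = next_obs line inside the episode loop. A sketch of just that loop:

        while not terminated and not truncated:
            action = agent.get_action(observation)
            next_obs, reward, terminated, truncated, info = env.step(action)
            episode.append((observation, action, reward))
            observation = next_obs  # advance to the next state (the missing line)

        agent.update_Q_values(episode)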