I am trying to make a very simple DQN algorithm work with the FrozenLake-v0 environment, but I am getting errors. I understand that using a DQN instead of a Q-table is probably overkill, but I would nonetheless like it to work. Here is the code:
import gym
import numpy as np
import tensorflow as tf

env = gym.make("FrozenLake-v0")
n_actions = env.action_space.n
input_dim = env.observation_space.n

# small MLP that maps a state to one Q-value per action
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, input_dim=input_dim, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(n_actions, activation='linear'))
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')

def replay(replay_memory, minibatch_size=32):
    # sample a random minibatch of transitions and fit the model on the Q-learning targets
    minibatch = np.random.choice(replay_memory, minibatch_size, replace=True)
    s_l = np.array(list(map(lambda x: x['s'], minibatch)))
    a_l = np.array(list(map(lambda x: x['a'], minibatch)))
    r_l = np.array(list(map(lambda x: x['r'], minibatch)))
    sprime_l = np.array(list(map(lambda x: x['sprime'], minibatch)))
    done_l = np.array(list(map(lambda x: x['done'], minibatch)))
    qvals_sprime_l = model.predict(sprime_l)
    target_f = model.predict(s_l)
    for i, (s, a, r, qvals_sprime, done) in enumerate(zip(s_l, a_l, r_l, qvals_sprime_l, done_l)):
        if not done: target = r + gamma * np.max(qvals_sprime)
        else: target = r
        target_f[i][a] = target
    model.fit(s_l, target_f, epochs=1, verbose=0)
    return model

n_episodes = 500
gamma = 0.99
epsilon = 0.9
minibatch_size = 32
r_sums = []
replay_memory = []
mem_max_size = 100000

for n in range(n_episodes):
    s = env.reset()
    done = False
    r_sum = 0
    print(s)
    while not done:
        qvals_s = model.predict(s.reshape(16))
        # epsilon-greedy action selection
        if np.random.random() < epsilon: a = env.action_space.sample()
        else: a = np.argmax(qvals_s)
        sprime, r, done, info = env.step(a)
        r_sum += r
        if len(replay_memory) > mem_max_size:
            replay_memory.pop(0)
        replay_memory.append({"s": s, "a": a, "r": r, "sprime": sprime, "done": done})
        s = sprime
        model = replay(replay_memory, minibatch_size=minibatch_size)
    if epsilon > 0.1: epsilon -= 0.001
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
And the errors I am getting are:
Traceback (most recent call last):
  File "froz_versuch.py", line 48, in <module>
    qvals_s = model.predict(s.reshape(16))
ValueError: cannot reshape array of size 1 into shape (16,)
And when I then try to change qvals_s = model.predict(s.reshape(16)) to qvals_s = model.predict(s.reshape(1)), I get the error:

ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 16 but received input with shape [None, 1]
I'd appreciate any help!
The problem had to do with one-hot encoding. I had to encode s and sprime so that they have dimension 16. This change in the for loop made it work. The encode() function could be moved outside of the loop, but I'm just testing for now, so optimization comes afterward. Here is the solution:
for n in range(n_episodes):
    ss = env.reset()
    states_total = 16
    data = [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]]
    def encode(data, states_total):
        # one-hot encode the integer states into vectors of length states_total
        targets = np.array(data).reshape(-1)
        return np.eye(states_total)[targets]
    m = encode(data, states_total)
    s = m[ss]
    #print(s)
    #print(len(s))
    done = False
    r_sum = 0
    while not done:
        #env.render()
        qvals_s = model.predict(s.reshape(1, -1))
        if np.random.random() < epsilon: a = env.action_space.sample()
        else: a = np.argmax(qvals_s)
        sprime, r, done, info = env.step(a)
        r_sum += r
        q = encode(data, states_total)
        sprime = q[sprime]  # one-hot encode the next state as well
        if len(replay_memory) > mem_max_size:
            replay_memory.pop(0)
        replay_memory.append({"s": s, "a": a, "r": r, "sprime": sprime, "done": done})
        #s = n[sprime]
        s = sprime
        model = replay(replay_memory, minibatch_size=minibatch_size)
    if epsilon > 0.001: epsilon -= 0.001
    r_sums.append(r_sum)
    print(r_sum)
    print(epsilon)
    if n % 100 == 0: print(n)
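As mentioned above, the encode() function can be moved outside of the episode loop so the one-hot lookup isn't rebuilt every iteration. A minimal sketch of what that refactor might look like, assuming the same env and model as above (n_states is just my name for states_total):

n_states = env.observation_space.n          # 16 for FrozenLake-v0

def encode(state, n_states):
    # one-hot encode a single integer state as a vector of length n_states
    return np.eye(n_states)[state]

s = encode(env.reset(), n_states)           # e.g. state 0 -> array of shape (16,)
qvals_s = model.predict(s.reshape(1, -1))   # matches the network's 16-dimensional input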