tensorflow, keras, deep-learning, reinforcement-learning, dqn

ValueError: Layer "model_69" expects 3 input(s), but it received 96 input tensors


I am trying to avoid calling model.predict() and model.fit() inside a for loop, in order to speed up training, so I'm trying to apply this solution to my case, but I get an error. The model has three inputs.
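
The idea in that solution, as I understand it, is to replace the per-sample predict()/fit() calls with a single batched call per minibatch. A rough sketch of the difference, using a placeholder single-input Keras model m and a list of per-sample state arrays batch_states (not my actual model):

# slow: one forward pass per sample
q_values = [m.predict(s[None, ...]) for s in batch_states]

# fast: one forward pass over the whole minibatch
q_values = m.predict(np.stack(batch_states, axis=0))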

This is my code:

import random
import time
from collections import deque

import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# REPLAY_MEMORY_SIZE, MIN_REPLAY_MEMORY_SIZE, DISCOUNT and env are defined elsewhere
n_possible_movements = 9
MINIBATCH_SIZE = 32

class DQNAgent(object):
    def __init__(self):
        self.epsilon = 1.0
        self.epsilon_decay = 0.8
        self.epsilon_min = 0.1
        self.learning_rate = 10e-4
        self.tau = 1e-3
        
                
        # Main models
        self.model_uav_pos = self._build_pos_model()

        # Target networks
        self.target_model_uav_pos = self._build_pos_model()
        # Copy weights
        self.target_model_uav_pos.set_weights(self.model_uav_pos.get_weights())

        # An array with last n steps for training
        self.replay_memory_pos_nn = deque(maxlen=REPLAY_MEMORY_SIZE)
        
    def _build_pos_model(self): # compile the DNN
        # create the DNN model
        dnn = self.create_pos_dnn()
        
        opt = Adam(learning_rate=self.learning_rate) #, decay=self.epsilon_decay)
        dnn.compile(loss="mse", optimizer=opt)
        
        return dnn
    
    def create_pos_dnn(self): 
        # initialize the input shape
        pos_input_shape = (2,)
        requests_input_shape = (len(env.ues),)
        number_of_satisfied_ues_input_shape = (1,)
        # How many possible outputs we can have
        output_nodes = n_possible_movements
        
        # Initialize the inputs
        uav_current_position = Input(shape=pos_input_shape, name='pos')
        ues_requests = Input(shape=requests_input_shape, name='requests')
        number_of_satisfied_ues = Input(shape=number_of_satisfied_ues_input_shape, name='number_of_satisfied_ues')
        
        # Put them in a list
        list_inputs = [uav_current_position, ues_requests, number_of_satisfied_ues]
        
        # Merge all input features into a single large vector
        x = layers.concatenate(list_inputs)
        
        # Add a 1st Hidden (Dense) Layer
        dense_layer_1 = Dense(512, activation="relu")(x)
        
        # Add a 2nd Hidden (Dense) Layer
        dense_layer_2 = Dense(512, activation="relu")(dense_layer_1)
        
        # Add a 3rd Hidden (Dense) Layer
        dense_layer_3 = Dense(256, activation="relu")(dense_layer_2)
        
        # Output layer
        output_layer = Dense(output_nodes, activation="linear")(dense_layer_3)

        model = Model(inputs=list_inputs, outputs=output_layer)
                        
        # return the DNN
        return model
    
    def remember_pos_nn(self, state, action, reward, next_state, done):
        self.replay_memory_pos_nn.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later
        
    def act_upon_choosing_a_new_position(self, state): # state is a tuple (uav_position, requests_array, number_satisfaction)
        if np.random.rand() <= self.epsilon: # if acting randomly, take random action
            return random.randrange(n_possible_movements)
        pos =  np.array([state[0]])
        reqs =  np.array([state[1]])
        number_satisfaction = np.array([state[2]])
        act_values = self.model_uav_pos.predict([pos, reqs, number_satisfaction]) # if not acting randomly, predict reward value based on current state
        return np.argmax(act_values[0]) 

    def target_train(self):
        weights = self.model_uav_pos.get_weights()
        target_weights = self.target_model_uav_pos.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model_uav_pos.set_weights(target_weights)
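
Since the model is built with three separate Input layers, predict() expects a list of exactly three arrays, one per input, each with a leading batch dimension. For example (shapes taken from create_pos_dnn() above, assuming agent = DQNAgent() and a dummy batch of 32 states):

dummy_batch = [
    np.zeros((32, 2)),             # 'pos' input
    np.zeros((32, len(env.ues))),  # 'requests' input
    np.zeros((32, 1)),             # 'number_of_satisfied_ues' input
]
q_values = agent.model_uav_pos.predict(dummy_batch)  # -> shape (32, 9)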

This is the training function before I introduced the changes suggested in the link; it calls predict() and fit() once per sample inside the minibatch loop, which is what makes it slow:

def train_pos_nn(self):
        print("In Training..")

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return

        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        minibatch = random.sample(list_memory, MINIBATCH_SIZE)

        start_time = time.time()
        # Enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            print('...Starting Training...')
            target = 0
            pos =  np.array([current_state[0]])
            reqs =  np.array([current_state[1]])
            number_satisfaction = np.array([current_state[2]])
            pos_next = np.array([new_current_state[0]])
            reqs_next = np.array([new_current_state[1]])
            number_satisfaction_next = np.array([new_current_state[2]])
    
            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                print("Predict Next State")
                target = reward + DISCOUNT * np.amax(self.target_model_uav_pos.predict([pos_next, reqs_next, number_satisfaction_next]))
            else:
                target = reward

            # Update Q value for given state
            print("Predict State")
            target_f = self.model_uav_pos.predict([pos, reqs, number_satisfaction])
            target_f = np.array(target_f)
            target_f[0][action] = target

            self.model_uav_pos.fit([pos, reqs, number_satisfaction],
                                   target_f,
                                   verbose=2,
                                   shuffle=False,
                                   callbacks=None,
                                   epochs=1)
        end_time = time.time()
        print("Time", end_time - start_time)
        # Update target network counter every episode
        self.target_train()

This is the training function after I introduced the changes:

def train_pos_nn(self):
        print("In Training..")

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return

        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        
        # Draw a sample
        samples = random.sample(list_memory, MINIBATCH_SIZE)
        
        start_time = time.time()
        # Prepare the batch
        state, action, reward, new_state, done = zip(*samples)
        nstate = []
        cstate = []
        start_time_2 = time.time()
        for n_state in new_state:
            pos_next = np.array([n_state[0]])
            reqs_next = np.array([n_state[1]])
            number_satisfaction_next = np.array([n_state[2]])
            nstate.append([pos_next,reqs_next,number_satisfaction_next])
        for curr_state in state:
            pos =  np.array([curr_state[0]])
            reqs =  np.array([curr_state[1]])
            number_satisfaction = np.array([curr_state[2]])
            cstate.append([pos,reqs,number_satisfaction])
        end_time_2 = time.time()
        print("Time 2", end_time_2 - start_time_2)
        #next_state = np.concatenate(new_state)
        #next_state = np.concatenate(nstate)
        #print("next_state", nstate[0], "len", len(nstate))#np.asarray(nstate).shape)# np.shape(nstate))
        done = np.array(done)[:,None]
        #state = np.concatenate(state)
        reward = np.array(reward)[:,None]
        q_future = self.target_model_uav_pos.predict(nstate)#np.vstack(nstate))
        targets = reward + self.gamma*np.max(q_future, axis=1, keepdims=True)
        
        # Fit the model
        self.model.fit(cstate, targets, epochs=1, verbose=2)
        
        end_time = time.time()
        print("Time", end_time - start_time)
        self.target_train()

The line q_future = self.target_model_uav_pos.predict(nstate) throws ValueError: Layer "model_69" expects 3 input(s), but it received 96 input tensors (3 inputs for each of the 32 examples in nstate); I get the same error when I use predict_on_batch().
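
Presumably this happens because nstate at that point is a plain Python list of 32 elements, each itself a list of 3 arrays, so Keras sees 32 * 3 = 96 separate input tensors instead of 3 batched ones:

len(nstate)     # 32 (one entry per sample)
len(nstate[0])  # 3  (pos, reqs, number_of_satisfied_ues for that sample)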

I don't know how to do it correctly. Any help would be appreciated.

Edit

Using the answer below and the code here, I wrote the following code:

from collections import OrderedDict

def train_pos_nn(self):
        print("In Training..")

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return

        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        
        samples = random.sample(list_memory, MINIBATCH_SIZE)
        
        start_time = time.time()
        
        state = []
        new_state = []
        action, reward, done = [], [], []

        # do this before prediction
        # for speedup, this could be done on the tensor level
        # but easier to understand using a loop
        for i in range(MINIBATCH_SIZE):
            state.append(samples[i][0])
            action.append(samples[i][1])
            reward.append(samples[i][2])
            new_state.append(samples[i][3])
            done.append(samples[i][4])

        nstate = OrderedDict(pos=[], reqs=[], num_satisfaction=[])
        cstate = OrderedDict(pos=[], reqs=[], num_satisfaction=[])

        for n_state in new_state:
            pos_next = np.array([n_state[0]])
            reqs_next = np.array([n_state[1]])
            number_satisfaction_next = np.array([n_state[2]])

            # add elements
            for k, v in zip(nstate.keys(), [pos_next, reqs_next, number_satisfaction_next]):
                nstate[k].append(v)

        for curr_state in state:
            pos =  np.array([curr_state[0]])
            reqs =  np.array([curr_state[1]])
            number_satisfaction = np.array([curr_state[2]])

            # add elements
            for k, v in zip(cstate.keys(), [pos, reqs, number_satisfaction]):
                cstate[k].append(v)

        # now concat each list of values in nstate (and cstate), 
        # to get a list with 3 arrays each of MINIBATCH size
        nstate = [np.concatenate(v, axis=0) for v in nstate.values()]
        cstate = [np.concatenate(v, axis=0) for v in cstate.values()]
        
        # do batch prediction to save speed
        # predict Q-values for starting state using the main network
        target = self.model_uav_pos.predict(cstate)
        #q_target = target.copy()
        target_ = np.array(target)
        
        # predict Q-values for ending state using the target network
        target_val = self.target_model_uav_pos.predict(nstate)
        target_val_ = np.array(target_val)

        for i in range(len(samples)):
            # correction on the Q value for the action used
            if done[i]:
                target_[i][action[i]] = reward[i]
            else:   
                # Standard - DQN
                # DQN chooses the max Q value among next actions
                # selection and evaluation of action is on the target Q Network
                # Q_max = max_a' Q_target(s', a')
                #print("Target Val", target_val)
                target_[i][action[i]] = reward[i] + DISCOUNT * (np.amax(target_val_[i]))
                
        _ = self.model_uav_pos.fit(cstate, target_, verbose=2)
        
        end_time = time.time()
        print("Time", end_time - start_time)
        self.target_train()
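
As a side note, the two loops that build nstate and cstate above could be vectorized as well; a sketch, assuming each stored state is the tuple (pos, reqs, number_of_satisfied_ues) with pos and reqs already array-like:

cstate = [np.stack([np.ravel(s[0]) for s in state]),
          np.stack([np.ravel(s[1]) for s in state]),
          np.stack([np.atleast_1d(s[2]) for s in state])]
nstate = [np.stack([np.ravel(s[0]) for s in new_state]),
          np.stack([np.ravel(s[1]) for s in new_state]),
          np.stack([np.atleast_1d(s[2]) for s in new_state])]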

Solution

  • I guess the issue is in how you append the 3 inputs to nstate and cstate in train_pos_nn(): as the error suggests, that yields 96 tensors, because nstate (and likewise cstate) ends up as a list of 32 lists of 3 arrays instead of a list of 3 numpy arrays, each with a leading batch dimension of MINIBATCH_SIZE.

    Try this:

    from collections import OrderedDict
    
    def train_pos_nn(self):
        print("In Training..")
    
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return
    
        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        
        # Draw a sample
        samples = random.sample(list_memory, MINIBATCH_SIZE)
        
        start_time = time.time()
        
        # Prepare the batch
        state, action, reward, new_state, done = zip(*samples)
    
        nstate = OrderedDict(pos=[], reqs=[], num_satisfaction=[])
        cstate = OrderedDict(pos=[], reqs=[], num_satisfaction=[])
        
        start_time_2 = time.time()
        
        for n_state in new_state:
            pos_next = np.array([n_state[0]])
            reqs_next = np.array([n_state[1]])
            number_satisfaction_next = np.array([n_state[2]])
            # nstate.append([pos_next,reqs_next,number_satisfaction_next])
    
            # add elements
            for k, v in zip(nstate.keys(), [pos_next, reqs_next, number_satisfaction_next]):
                nstate[k].append(v)
        
        for curr_state in state:
            pos =  np.array([curr_state[0]])
            reqs =  np.array([curr_state[1]])
            number_satisfaction = np.array([curr_state[2]])
            # cstate.append([pos,reqs,number_satisfaction])
    
            # add elements
            for k, v in zip(cstate.keys(), [pos, reqs, number_satisfaction]):
                cstate[k].append(v)
    
        # now concat each list of values in nstate (and cstate), 
        # to get a list with 3 arrays each of MINIBATCH size
        nstate = [np.concatenate(v, axis=0) for v in nstate.values()]
        cstate = [np.concatenate(v, axis=0) for v in cstate.values()]
      
        end_time_2 = time.time()
        print("Time 2", end_time_2 - start_time_2)
        
        #print("next_state", nstate[0], "len", len(nstate))#np.asarray(nstate).shape)# np.shape(nstate))
        done = np.array(done)[:,None]
        reward = np.array(reward)[:,None]
        q_future = self.target_model_uav_pos.predict(nstate)
        targets = reward + DISCOUNT * np.max(q_future, axis=1, keepdims=True)
        
        # Fit the model
        self.model_uav_pos.fit(cstate, targets, epochs=1, verbose=2)
        
        end_time = time.time()
        print("Time", end_time - start_time)
        self.target_train()
    
    

    I haven't tested it, so it may not run on the first try; if it doesn't, check the shape of each element of nstate and cstate (a quick check is sketched below), adjust, and try again.

    Hope it helps.
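
    If the shapes do need checking, printing them just before the predict() call should be enough; each of the three arrays should have MINIBATCH_SIZE as its first dimension:

    for name, arr in zip(['pos', 'requests', 'number_of_satisfied_ues'], nstate):
        print(name, np.asarray(arr).shape)
    # expected: pos (32, 2), requests (32, len(env.ues)), number_of_satisfied_ues (32, 1)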