python · tensorflow · generator

TensorFlow validation generator doesn't calculate val_loss after the first epoch


I am trying to train a model using generators for the training and validation datasets. Each epoch should consume all the batches of the validation dataset (is that correct?). The first epoch runs fine, but from the second epoch onwards I get an error saying that `val_loss` cannot be calculated. I suspect this happens because the validation dataset is not being reset between epochs, but I'm not sure. How can I fix this?
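To illustrate what I suspect is happening (an assumption on my part, not something I have confirmed): a plain Python generator object can only be iterated once, so if the same exhausted object is reused for validation, the second epoch has nothing to compute `val_loss` from. The `make_gen` sketch below is only an illustration, not part of my code.

# Minimal sketch of the suspected behaviour (assumption, not confirmed):
# a Python generator object is exhausted after one full pass.
def make_gen():
    def gen():
        for i in range(3):
            yield i
    return gen

g = make_gen()()   # generator *object*, created once
print(list(g))     # [0, 1, 2] -- the first pass consumes it
print(list(g))     # []        -- nothing is left for a second pass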

The error:

Cell In[13], line 43
     41 #treinar modelo
     42 now = datetime.now()
---> 43 model.fit(train_dataset, epochs=5, verbose=1, validation_data=val_dataset,
     44           callbacks=[printbatch(), EarlyStopping(monitor='val_loss', patience=50, verbose=0),
     45                      ModelCheckpoint(os.path.join(output_path, "models", "dnn_model_" + str(tam) + "_" + str(now.day) + "." +
     46                                      str(now.month) + "." + str(now.year) + "_" + "_{epoch:02d}-{val_loss:.2f}.keras"),
     47                                      monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto'),
     48                      reset_val_callback])
     49 model.save(os.path.join(output_path,"dnn_model_" + str(tam) + "_" + str(now.day) + "." +str(now.month) + "." + str(now.year) + "_.keras"))
...
    325         f"Reason: {e}"
    326     )
    327 return file_path

KeyError: 'Failed to format this callback filepath: "modelos/models/dnn_model_150_11.12.2024__{epoch:02d}-{val_loss:.2f}.keras". Reason: \'val_loss\''

I also get this warning:

/home/igor/anaconda3/envs/tcc/lib/python3.9/site-packages/keras/src/callbacks/early_stopping.py:155: UserWarning: Early stopping conditioned on metric `val_loss` which is not available. Available metrics are: accuracy,loss
  current = self.get_monitor_value(logs)

My code:

import glob
import os

import tensorflow as tf
from sklearn.model_selection import train_test_split

def data_generator(file_paths, batch_size, files_per_batch, tam, value, val_split):
    # Collect all .dimacs files in the directory
    files = sorted([os.path.basename(ii) for ii in glob.glob(f"{file_paths}/*.dimacs")])
    cut_index = int(len(files) * 0.9)  # Index marking the first 90% of the files
    files = files[:cut_index]  # Keep only the first 90% for training
    # Randomly split the files: 80% for training, 20% for validation
    files = files[:1000]
    files_t, files_v = train_test_split(files, test_size=val_split, random_state=42)
    
    # Generator factory (used for both training and validation)
    def create_generator(file_list):
        def generator():
            idx = 0
            remaining_batch_input = remaining_batch_labels = []
            while idx < len(file_list):
                batch_files = file_list[idx: idx + files_per_batch]
                res, clique, labels = ler_arquivos(batch_files, file_paths, tam, value)
                combined_input, labels = combinar_entrada(res, clique, tam, labels, remaining_batch_input, remaining_batch_labels)
                remaining_batch_input = remaining_batch_labels = []
                num_samples = len(combined_input)
                num_batches = num_samples // batch_size
                for batch_idx in range(num_batches):
                    batch_input = combined_input[batch_idx * batch_size: (batch_idx + 1) * batch_size]
                    batch_labels = labels[batch_idx * batch_size: (batch_idx + 1) * batch_size]
                    x_batch, y_batch = estruturar_entrada(batch_input, batch_labels)
                    yield x_batch, y_batch
                idx += files_per_batch
                remaining_samples = num_samples % batch_size
                if remaining_samples > 0:
                    remaining_batch_input = combined_input[-remaining_samples:]
                    remaining_batch_labels = labels[-remaining_samples:]
        return generator
    
    return create_generator(files_t)(), create_generator(files_v)()

train_generator_func, val_generator_func = data_generator(file_paths, batch_size, files_per_batch, tam, False, 0.2)
train_dataset = tf.data.Dataset.from_generator(lambda: train_generator_func, output_signature=output_signature_type(batch_size, tam))
val_dataset = tf.data.Dataset.from_generator(lambda: val_generator_func, output_signature=output_signature_type(batch_size, tam))

# Prefetch to improve performance
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

# Instantiate the callback that restarts the validation generator
reset_val_callback = ResetValidationGenerator(val_generator_func=val_generator_func, batch_size=batch_size, tam=tam)

# Train the model
now = datetime.now()
model.fit(train_dataset, epochs=5, verbose=1, validation_data=val_dataset,
              callbacks=[printbatch(), EarlyStopping(monitor='val_loss', patience=50, verbose=0),
                         ModelCheckpoint(os.path.join(output_path, "models", "dnn_model_" + str(tam) + "_" + str(now.day) + "." +
                                         str(now.month) + "." + str(now.year) + "_" + "_{epoch:02d}-{val_loss:.2f}.keras"),
                                         monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto'),
                         reset_val_callback])

I created a callback to try to reset the validation dataset, but I still get the same error.

class ResetValidationGenerator(tf.keras.callbacks.Callback):
    def __init__(self, val_generator_func, batch_size, tam):
        super(ResetValidationGenerator, self).__init__()
        self.val_generator_func = val_generator_func
        self.batch_size = batch_size
        self.tam = tam
        self.val_dataset = None

    def on_epoch_end(self, epoch, logs=None):
        # Restart the validation generator here
        print(f"Restarting the validation generator after epoch {epoch + 1}")
        
        # Rebuild the validation dataset by calling the generator factory again
        self.val_dataset = tf.data.Dataset.from_generator(
            lambda: self.val_generator_func(), 
            output_signature=output_signature_type(self.batch_size, self.tam)
        ).prefetch(tf.data.AUTOTUNE)
Solution

  • I couldn't solve the problem directly, but I found a workaround that works. Since the validation dataset is a smaller portion of the total dataset, I loaded it entirely into memory instead of using a generator. Validation has worked in every epoch since.
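A minimal sketch of that workaround, reusing the helper functions from the question (ler_arquivos, combinar_entrada, estruturar_entrada) and assuming they return array-like inputs and labels; the build_val_dataset wrapper and exact shapes are my own illustration, not the exact code I used.

import numpy as np
import tensorflow as tf

def build_val_dataset(files_v, file_paths, tam, value, batch_size):
    # Materialize the whole (smaller) validation split in memory instead of
    # streaming it through a generator, so Keras can iterate it every epoch.
    res, clique, labels = ler_arquivos(files_v, file_paths, tam, value)
    combined_input, labels = combinar_entrada(res, clique, tam, labels, [], [])
    x_val, y_val = estruturar_entrada(combined_input, labels)

    # from_tensor_slices keeps the data in memory; val_loss is then available
    # on every epoch because the dataset is re-iterated, never exhausted.
    return (tf.data.Dataset
            .from_tensor_slices((np.asarray(x_val), np.asarray(y_val)))
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE))

# Usage (training dataset unchanged):
# val_dataset = build_val_dataset(files_v, file_paths, tam, False, batch_size)
# model.fit(train_dataset, epochs=5, validation_data=val_dataset, callbacks=[...])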