I am trying to train a model using generators for the training and validation datasets. Each epoch consumes all the batches of the validation dataset (is that correct?). The first epoch runs fine, but from the second epoch onwards it raises an error saying that val_loss cannot be computed. I suspect this is related to the validation dataset not being reset, but I'm not sure. How can I fix this?
The error:
Cell In[13], line 43
41 #treinar modelo
42 now = datetime.now()
---> 43 model.fit(train_dataset, epochs=5, verbose=1, validation_data=val_dataset,
44 callbacks=[printbatch(), EarlyStopping(monitor='val_loss', patience=50, verbose=0),
45 ModelCheckpoint(os.path.join(output_path, "models", "dnn_model_" + str(tam) + "_" + str(now.day) + "." +
46 str(now.month) + "." + str(now.year) + "_" + "_{epoch:02d}-{val_loss:.2f}.keras"),
47 monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto'),
48 reset_val_callback])
49 model.save(os.path.join(output_path,"dnn_model_" + str(tam) + "_" + str(now.day) + "." +str(now.month) + "." + str(now.year) + "_.keras"))
...
325 f"Reason: {e}"
326 )
327 return file_path
KeyError: 'Failed to format this callback filepath: "modelos/models/dnn_model_150_11.12.2024__{epoch:02d}-{val_loss:.2f}.keras". Reason: \'val_loss\''
and also this warning:
/home/igor/anaconda3/envs/tcc/lib/python3.9/site-packages/keras/src/callbacks/early_stopping.py:155: UserWarning: Early stopping conditioned on metric `val_loss` which is not available. Available metrics are: accuracy,loss
current = self.get_monitor_value(logs)
My code:
def data_generator(file_paths, batch_size, files_per_batch, tam, value, val_split):
    # Grab all files from the directory
    files = sorted([os.path.basename(ii) for ii in glob.glob(f"{file_paths}/*.dimacs")])
    cut_index = int(len(files) * 0.9)  # Index for the first 90% of the files
    files = files[:cut_index]  # Keep only the first 90% for training
    # Randomly split 20% of the files for validation and 80% for training
    files = files[:1000]
    files_t, files_v = train_test_split(files, test_size=val_split, random_state=42)

    # Training generator
    def create_generator(file_list):
        def generator():
            idx = 0
            remaining_batch_input = remaining_batch_labels = []
            while idx < len(file_list):
                batch_files = file_list[idx: idx + files_per_batch]
                res, clique, labels = ler_arquivos(batch_files, file_paths, tam, value)
                combined_input, labels = combinar_entrada(res, clique, tam, labels, remaining_batch_input, remaining_batch_labels)
                remaining_batch_input = remaining_batch_labels = []
                num_samples = len(combined_input)
                num_batches = num_samples // batch_size
                for batch_idx in range(num_batches):
                    batch_input = combined_input[batch_idx * batch_size: (batch_idx + 1) * batch_size]
                    batch_labels = labels[batch_idx * batch_size: (batch_idx + 1) * batch_size]
                    x_batch, y_batch = estruturar_entrada(batch_input, batch_labels)
                    yield x_batch, y_batch
                idx += files_per_batch
                remaining_samples = num_samples % batch_size
                if remaining_samples > 0:
                    remaining_batch_input = combined_input[-remaining_samples:]
                    remaining_batch_labels = labels[-remaining_samples:]
        return generator

    return create_generator(files_t)(), create_generator(files_v)()

train_generator_func, val_generator_func = data_generator(file_paths, batch_size, files_per_batch, tam, False, 0.2)
train_dataset = tf.data.Dataset.from_generator(lambda : train_generator_func, output_signature=output_signature_type(batch_size, tam))
val_dataset = tf.data.Dataset.from_generator(lambda : val_generator_func, output_signature=output_signature_type(batch_size, tam))
# Prefetch to improve performance
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
# Instantiate the callback that restarts the validation generator
reset_val_callback = ResetValidationGenerator(val_generator_func=val_generator_func, batch_size=batch_size, tam=tam)
# train the model
now = datetime.now()
model.fit(train_dataset, epochs=5, verbose=1, validation_data=val_dataset,
          callbacks=[printbatch(), EarlyStopping(monitor='val_loss', patience=50, verbose=0),
                     ModelCheckpoint(os.path.join(output_path, "models", "dnn_model_" + str(tam) + "_" + str(now.day) + "." +
                                                  str(now.month) + "." + str(now.year) + "_" + "_{epoch:02d}-{val_loss:.2f}.keras"),
                                     monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto'),
                     reset_val_callback])
I created a callback to try resetting the dataset, but I’m still getting the same error.
class ResetValidationGenerator(tf.keras.callbacks.Callback):
    def __init__(self, val_generator_func, batch_size, tam):
        super(ResetValidationGenerator, self).__init__()
        self.val_generator_func = val_generator_func
        self.batch_size = batch_size
        self.tam = tam
        self.val_dataset = None

    def on_epoch_end(self, epoch, logs=None):
        # Here you can restart the validation generator
        print(f"Restarting the validation generator after epoch {epoch + 1}")
        # Restart the validation generator by calling the function that creates it
        self.val_dataset = tf.data.Dataset.from_generator(
            lambda: self.val_generator_func(),
            output_signature=output_signature_type(self.batch_size, self.tam)
        ).prefetch(tf.data.AUTOTUNE)
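I haven't verified this, but my understanding is that tf.data.Dataset.from_generator only restarts the underlying Python generator when the callable it receives builds a fresh generator each time it is called; my lambdas return the same already-created generator objects, so after the first epoch they are exhausted and the validation pass produces no batches (hence no val_loss). A minimal sketch of that idea, assuming data_generator were changed to return the un-called generator functions (i.e. return create_generator(files_t), create_generator(files_v)):

# Hypothetical variant: data_generator returns the generator *functions* themselves.
train_gen_fn, val_gen_fn = data_generator(file_paths, batch_size, files_per_batch, tam, False, 0.2)

# from_generator calls the callable again whenever a new iterator is created,
# so every epoch (training and validation) starts from a fresh generator.
train_dataset = tf.data.Dataset.from_generator(
    train_gen_fn,
    output_signature=output_signature_type(batch_size, tam)
).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    val_gen_fn,
    output_signature=output_signature_type(batch_size, tam)
).prefetch(tf.data.AUTOTUNE)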
I couldn't solve this problem directly, but I found a workaround that works. The validation dataset is a small portion of the total dataset, so in this case I loaded it entirely into memory (without a generator). Validation has worked in every epoch since.
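Roughly what the workaround looks like, assuming each x_batch / y_batch yielded by the validation generator is a plain NumPy array (if estruturar_entrada returns tuples or dicts, the concatenation would change accordingly):

import numpy as np

# Consume the validation generator once, up front, and keep everything in memory.
x_batches, y_batches = [], []
for x_batch, y_batch in val_generator_func:
    x_batches.append(x_batch)
    y_batches.append(y_batch)

x_val = np.concatenate(x_batches, axis=0)
y_val = np.concatenate(y_batches, axis=0)

# In-memory arrays are reused by Keras on every epoch, so val_loss is always
# available for EarlyStopping and ModelCheckpoint.
model.fit(train_dataset, epochs=5, verbose=1,
          validation_data=(x_val, y_val))  # plus the same callbacks as above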