I'm trying to train Siamese neural networks for face recognition. Many resources use the following contrastive loss function:
def contrastive_loss(y_true, y_pred):
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
I am training several neural networks with different architectures, and for some of them this loss does not work correctly: it returns nan, so the network does not train at all.
My code:
# Models.py
from keras.models import Sequential, Model
from keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Lambda, BatchNormalization, Activation
from keras.optimizers import RMSprop
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def accuracy(y_true, y_pred):
    # A pair is counted as "similar" when the predicted distance is below 0.5.
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
def TestModel(input_shape):
    model = Sequential()
    model.add(Conv2D(filters=96, kernel_size=3, strides=3, activation='relu', input_shape=input_shape, padding='valid'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(.25))
    model.add(Conv2D(filters=256, kernel_size=3, strides=3, activation='relu', padding='valid'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(.25))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu'))
    return model

def Net_Definition(input_shape):
    model = Sequential()
    model.add(Conv2D(filters=96, kernel_size=7, strides=4, activation='relu', padding='valid', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=256, kernel_size=5, strides=1, activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters=384, kernel_size=3, strides=1, activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=3, strides=2, padding='valid'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(128, activation='softmax'))
    return model
def CreateModel(name, input_shape):
    # Build the shared embedding network, then wire it into a Siamese pair.
    if name == 'test':
        network = TestModel(input_shape)
    elif name == 'net_definition':
        network = Net_Definition(input_shape)
    else:
        print('Invalid model name!')
        exit(0)
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    processed_a = network(input_a)
    processed_b = network(input_b)
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)
    opt = RMSprop()
    model.compile(loss=contrastive_loss, optimizer=opt, metrics=[accuracy])
    return model
# Training script
from keras.utils import Sequence
import numpy as np
import Models
from keras.callbacks import CSVLogger

class MyGenerator(Sequence):
    def __init__(self, filenames, labels, batch_size):
        self.filenames = filenames
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.filenames) / float(self.batch_size)))

    def __getitem__(self, item):
        batch_x = self.filenames[item * self.batch_size:(item + 1) * self.batch_size]
        batch_y = self.labels[item * self.batch_size:(item + 1) * self.batch_size]
        x1 = []
        x2 = []
        for i, files in enumerate(batch_x):
            # Each .npy file holds one pair of images; scale pixels to [0, 1].
            pair = np.load(files).astype(np.float32)
            x1.append(pair[0] / 255)
            x2.append(pair[1] / 255)
        x1 = np.asarray(x1)
        x2 = np.asarray(x2)
        return (x1, x2), np.array(batch_y).astype(np.float32)

# path_to_folder = 'Datasets/test/pairs/224/'
path_to_folder = 'Datasets/6. Pairs/224/'
input_shape = (224, 224, 3)
batch_size = 128

x_train_file = open(path_to_folder + 'X_Train.txt', 'r')
y_train_file = open(path_to_folder + 'Y_Train.txt', 'r')
x_val_file = open(path_to_folder + 'X_Val.txt', 'r')
y_val_file = open(path_to_folder + 'Y_Val.txt', 'r')
x_train = x_train_file.read().splitlines()
y_train = y_train_file.read().splitlines()
x_val = x_val_file.read().splitlines()
y_val = y_val_file.read().splitlines()

csv_logger = CSVLogger('logs.log')
train_generator = MyGenerator(x_train, y_train, batch_size)
val_generator = MyGenerator(x_val, y_val, batch_size)

model = Models.CreateModel('test', input_shape)
history = model.fit(train_generator, epochs=10, verbose=1, validation_data=val_generator, callbacks=[csv_logger])
model.save_weights('my_checkpoint')
For TestModel everything works fine, but for Net_Definition the loss is nan. How can this problem be solved? Maybe there are other loss functions suitable for this?
I can see a couple of errors here. First, the y_true and (1 - y_true) terms in the contrastive loss function should be exchanged. You can draw inspiration from this implementation:
import tensorflow as tf

def loss(margin=1):
    """Provides 'contrastive_loss' an enclosing scope with variable 'margin'.

    Arguments:
        margin: Integer, defines the baseline for distance for which pairs
                should be classified as dissimilar (default is 1).

    Returns:
        'contrastive_loss' function with data ('margin') attached.
    """

    # Contrastive loss = mean( (1 - true_value) * square(prediction) +
    #                          true_value * square(max(margin - prediction, 0)) )
    def contrastive_loss(y_true, y_pred):
        """Calculates the contrastive loss.

        Arguments:
            y_true: List of labels, each label is of type float32.
            y_pred: List of predictions of same length as of y_true,
                    each label is of type float32.

        Returns:
            A tensor containing contrastive loss as floating point value.
        """
        square_pred = tf.math.square(y_pred)
        margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        return tf.math.reduce_mean(
            (1 - y_true) * square_pred + (y_true) * margin_square
        )

    return contrastive_loss
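Note that the closure is used by passing its return value to compile, not the outer function itself. A minimal usage sketch, assuming model is the Siamese model from the question:

from keras.optimizers import RMSprop

# Hypothetical usage: loss(margin=1) returns the actual loss function.
model.compile(loss=loss(margin=1), optimizer=RMSprop(), metrics=['accuracy'])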
Second, the CreateModel function builds a Siamese network whose output is the euclidean_distance between two embedding vectors, which is not a probability; a Euclidean distance can be greater than 1. It is better to add an activation such as sigmoid to the final layer of the Siamese model.
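As a minimal sketch of that suggestion, here is one way to wire a sigmoid head onto the distance output. The BatchNormalization-plus-Dense head follows the Keras example referenced above; the K.epsilon() guard inside the distance is an additional common stabilization against nan gradients, not something from the question's code, and the names stable_euclidean_distance and create_siamese are hypothetical.

from keras.models import Model
from keras.layers import Input, Lambda, Dense, BatchNormalization
from keras import backend as K

def stable_euclidean_distance(vects):
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # Guard against sqrt(0); its gradient is infinite, a common source of nan.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def create_siamese(network, input_shape):
    # 'network' is the shared embedding model (e.g. TestModel or Net_Definition).
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    distance = Lambda(stable_euclidean_distance)([network(input_a), network(input_b)])
    # Squash the unbounded distance into (0, 1) so the output behaves like a
    # probability and matches binary pair labels.
    output = Dense(1, activation='sigmoid')(BatchNormalization()(distance))
    return Model(inputs=[input_a, input_b], outputs=output)

With this head, the model can be compiled with loss=loss(margin=1) and trained exactly as before.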