I'm currently building a video classification model for engagement detection but I'm having some trouble training it. The model takes in two tensors as inputs: a 10x48x48x1 tensor which holds a stack of 10 sampled frames each of size 48x48 from the input video and a 10x392 tensor that contains high level facial features that are extracted from each second of the video using OpenFace which is a facial landmark detection tool (each video in the dataset I am training with is 10 seconds long).
The first input tensor is fed into an emotion recognition model that was pre-trained on the FER+ dataset. However, since emotion recognition model was trained on images instead of videos I wrapped the emotion classifier in an "aggregation layer" so that each 48x48x1 slice in the 1st tensor is fed into the emotion classifier and then all of those outputs are averaged together.
The second input tensor is fed into a feed forward neural network. And just like with the emotion classifier, the 329 OpenFace features for each second is fed into the network one at a time and then the output from the network is averaged together.
Finally, the aggregated outputs from these two models are stacked on top of one another and then passed through a final dense layer.
Unfortunately, the model ended up not being able to learn past a certain point. The training loss (sparse categorical cross entropy) decreased to around 1.06 and wouldn't go further. My dataset only has three classes so it looks like the model is essentially randomly guessing.
I couldn't figure out why it's unable to decrease further however. I checked to make sure that the gradients existed for the two sub models and the entire integrated model and they do as can be seen in the below code.
After some time, I figured out that the problem had to do with the generator. When I train on a normal dataset, it trains normally and overfits on a small datasets but not on the generator dataset. The code below demonstrates this:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Input, Dense
from keras import models
#Custom attention layer used in the patt lite model. Tensorflow's attention layer was giving us
#nan gradients so we decided to make our own attention layer
class Custom_Attention(tf.keras.layers.Layer):
def call(self, x):
size = tf.shape(x)
x = tf.reshape(x, [size[0], size[1], 1])
y = tf.einsum("...ij,...jk->...ik", x, tf.transpose(x, perm = [0, 2, 1]))/16.0
y = tf.matmul(tf.nn.softmax(y), x)
return tf.reshape(y, [size[0], size[1]])
#We implemented the Patt Lite model described in https://arxiv.org/pdf/2306.09626v1.pdf for our emotion recognition classifier
class Patt_Lite:
def __init__(self):
mobile_net = tf.keras.applications.mobilenet.MobileNet(
include_top = False)
mobile_net.trainable = False
inputs = tf.keras.Input(shape = (48, 48, 1), name = "image")
x = tf.keras.layers.Conv2D(3, (1, 1))(inputs)
x = tf.keras.layers.Resizing(224, 224)(x)
x = tf.keras.applications.mobilenet.preprocess_input(x)
for layer in mobile_net.layers[:56]:
x = layer(x)
x = tf.keras.layers.ZeroPadding2D(padding=(1, 1))(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.DepthwiseConv2D( (10,10), strides = 2)(x)
x = tf.keras.layers.Conv2D(256, (1,1), activation = "relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.DepthwiseConv2D((3,3))(x)
x = tf.keras.layers.Conv2D(16, (1,1), activation = "relu")(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dense(256, activation = "relu")(x)
x = Custom_Attention()(x)
outputs = tf.keras.layers.Dense(9, activation = "softmax")(x)
self.model = tf.keras.Model(inputs, outputs)
#we implement our focus and emotion classifiers as layers in our model
class Emotion_Classifier(Layer):
def __init__(self, **kwargs):
super(Emotion_Classifier, self).__init__(**kwargs)
emoti_model = Patt_Lite().model
tf.keras.utils.get_custom_objects()["Custom_Attention"] = Custom_Attention
#we pre-trained the Patt Lite model on the FER+ dataset but for an MRE this isn't necessary
#emoti_model.load_weights(os.path.join("..", "..", "Models", "Emotion_Rec", "PAtt_Lite_weights.h5"))
inputs = keras.Input(shape = (48, 48, 1))
y = emoti_model(inputs)
#we attach a dense layer to our emotion classifier so that our emotion classifier can be trained
#to detect engagement
y = keras.layers.Dense(3, activation = "relu")(y)
self.model = Model(inputs, y)
def __call__(self, x):
return self.model(x)
#for our focus classifier, we implemented it as an MLP. Since we're already working with high level features,
#our classifier doesn't need to be as complex as the emotion classifier
class Focus_Classifier(Layer):
def __init__(self, **kwargs):
super(Focus_Classifier, self).__init__(**kwargs)
inputs = keras.Input(shape = (329))
y = keras.layers.Dense(64, activation = "relu")(inputs)
y = keras.layers.Dense(64, activation = "relu")(y)
y = keras.layers.Dense(128, activation = "relu")(y)
y = keras.layers.Dense(3, activation = "relu")(y)
self.model = Model(inputs, y)
def __call__(self, x):
return self.model(x)
#Finally our aggregation layer goes through each of the 10 sampled / averaged frames
#and evaluates the focus and emotion classifiers on each of them
class AggregationLayer(Layer):
def __init__(self, emo_model, open_model, num_frames, **kwargs):
super(AggregationLayer, self).__init__(**kwargs)
self.emoti_model = emo_model
self.open_model = open_model
self.num_frames = num_frames
def __call__(self, inputs):
#we evaluate our two classifiers on each frame
emot_outputs = [self.emoti_model(frame) for frame in tf.unstack(inputs[0], axis=1)]
focus_outputs = [self.open_model(frame) for frame in tf.unstack(inputs[1], axis=1)]
#average the outputs
aver_emot_output = tf.reduce_mean(tf.stack(emot_outputs, axis = 1), axis=1)
aver_focus_output = tf.reduce_mean(tf.stack(focus_outputs, axis = 1), axis=1)
#and then our output from the aggregation layer are the two 3x1 averaged outputs
aggregate_output = tf.concat([aver_emot_output, aver_focus_output], axis = 1)
return aggregate_output
def test_trainable(model, input_size):
with tf.GradientTape() as tape:
a = [tf.random.normal(size) for size in input_size]
z = model(a)
loss = tf.keras.losses.SparseCategoricalCrossentropy()([1], z)
#loss = tf.math.reduce_mean()
#loss = tf.math.reduce_mean(z**2)
grads = tape.gradient(loss, model.trainable_variables)
for grad, var in zip(grads, model.trainable_variables):
#unfortunately all of the gradients are None and I'm not sure why
try:
tf.debugging.check_numerics(grad, message=f"{var.name}: ")
tf.debugging.assert_greater(tf.sum(tf.math.abs(grad)), 0.0)
except:
continue
inp_emo = keras.Input((10, 48, 48, 1))
inp_open = keras.Input((10, 329))
emoti_model = Emotion_Classifier()
focus_model = Focus_Classifier()
y = AggregationLayer(emoti_model, focus_model, 10)([inp_emo, inp_open])
outputs = Dense(3, activation = "softmax")(y)
model = Model([inp_emo, inp_open], outputs)
model.summary()
#Test 1: Check if gradients of model exist
test_trainable(Patt_Lite().model, [(1, 48, 48, 1)])
test_trainable(emoti_model, [(1, 48, 48, 1)])
test_trainable(focus_model, [(1, 329)])
test_trainable(model, [(1, 10, 48, 48, 1), (1, 10, 329)])
#Test 2: Check if model can overfit on small dataset
#I found that the model was unable to overfit on a small dataset made using
#a generator but did overfit on a normal small dataset as can be seen below
#A. Using Generator
def generator():
feats = (tf.random.normal((1, 10, 48, 48, 1)),
tf.random.normal((1, 10, 329)))
labels = tf.reshape(tf.convert_to_tensor( [1], dtype= tf.int16), [1, 1])
yield feats, labels
output_signature_train = ( (tf.TensorSpec(shape = (None, 10, 48, 48, 1), dtype = tf.float32),
tf.TensorSpec(shape = (None, 10, 329), dtype = tf.float32)),
tf.TensorSpec(shape = (None, 1), dtype = tf.int16))
train_ds = tf.data.Dataset.from_generator(generator,
output_signature = output_signature_train)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer = keras.optimizers.SGD(learning_rate = 1e-3),
metrics = ["acc"])
a = train_ds.take(1)
print("Test Using Generator:")
model.fit(a,
epochs = 5)
#B. Using Normal Dataset
#we create a new model
model = Model([inp_emo, inp_open], outputs)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer = keras.optimizers.AdamW(learning_rate = 1e-2),
metrics = ["acc"])
#and construct a new dataset without using a generator
feats = (tf.random.normal((1, 10, 48, 48, 1)),
tf.random.normal((1, 10, 329)))
labels = tf.reshape(tf.convert_to_tensor( [2], dtype= tf.int16), [1, 1])
print("Test Without Using Generator")
model.fit(feats, labels,
epochs = 5)
The image below shows the training loss per epoch when the model was trained on a generator dataset and on a "normal" dataset. As you can see, with five epochs, the model is able to overfit on the regular dataset but it's still randomly guessing on the regular dataset.
Training Loss of Model When Trained on Generator vs Normal Dataset
When I use your code with
optimizer = keras.optimizers.AdamW(learning_rate = 1e-2)
for the generator (as you did without the generator), I also achieve
Test Using Generator:
Epoch 1/5
1/1 [==============================] - 19s 19s/step - loss: 1.0785 - acc: 0.0000e+00
Epoch 2/5
1/1 [==============================] - 0s 80ms/step - loss: 0.6456 - acc: 1.0000
Epoch 3/5
1/1 [==============================] - 0s 80ms/step - loss: 0.1144 - acc: 1.0000
Epoch 4/5
1/1 [==============================] - 0s 75ms/step - loss: 0.0260 - acc: 1.0000
Epoch 5/5
1/1 [==============================] - 0s 71ms/step - loss: 0.0040 - acc: 1.0000