Good evening,
I want to implement a toy example of a simple regression problem with TF2 and GradientTape. With model.fit() the model learns properly, but the same setup with GradientTape does something, yet the loss barely moves compared to model.fit(). Here are my example code and the results; I can't find the problem.
model_opt = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.MeanSquaredError()
with tf.GradientTape() as tape:
    y = model(X, training=True)
    loss_value = loss_fn(y_true, y)
grads = tape.gradient(loss_value, model.trainable_variables)
model_opt.apply_gradients(zip(grads, model.trainable_variables))
#Results:
42.47433806265809
42.63973672226078
36.687397360178586
38.744844324717526
36.59080452300609
...
Here is the regular case with model.fit():
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.MSE, metrics=["mse"])
...
model.fit(X, y_true, verbose=0)
#Results
[40.97759069299212]
[28.04145720307729]
[17.643483147375473]
[7.575242056454791]
[5.83682193867299]
The loss should develop roughly the same way, but it looks like the GradientTape version doesn't learn at all. The input X is a tensor, and so is y_true.
Edit: full example for testing
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
dataset_path = keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
dataset = pd.read_csv(dataset_path, names=column_names,
                      na_values="?", comment='\t',
                      sep=" ", skipinitialspace=True)
dataset = dataset.dropna()
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, prefix='', prefix_sep='')
train_dataset = dataset.sample(frac=0.8,random_state=0)
test_dataset = dataset.drop(train_dataset.index)
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
def build_model_fit():
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)])
    optimizer = tf.keras.optimizers.RMSprop(0.001)
    model.compile(loss='mse', optimizer=optimizer)
    return model

def build_model_tape():
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)])
    opt = tf.keras.optimizers.RMSprop(0.001)
    return model, opt
model_f = build_model_fit()
model_g, opt_g = build_model_tape()
EPOCHS = 20
#Model.fit() - Test
history = model_f.fit(normed_train_data, train_labels, epochs=EPOCHS, verbose=2)
X = tf.convert_to_tensor(normed_train_data.to_numpy())
y_true = tf.convert_to_tensor(train_labels.to_numpy())
#GradientTape - Test
loss_fn = tf.keras.losses.MeanSquaredError()
for i in range(0, EPOCHS):
    with tf.GradientTape() as tape:
        y = model_g(X, training=True)
        loss_value = loss_fn(y_true, y)
    grads = tape.gradient(loss_value, model_g.trainable_variables)
    opt_g.apply_gradients(zip(grads, model_g.trainable_variables))
    print(loss_value)
The discrepancy the OP sees in the loss values is due to a different batch size being used in model.fit and in the tf.GradientTape training loop. If the batch_size keyword argument to model.fit is unspecified, a batch size of 32 is used. In the tf.GradientTape training loop, the batch size equals the number of samples in the training set (i.e., 314), so each epoch performs only a single gradient update. To fix this, implement batching in the training loop. One way to do this is with the tf.data API, as shown below.
loss_fn = tf.keras.losses.MeanSquaredError()
for i in range(0, EPOCHS):
    epoch_losses = []
    # Iterate in mini-batches of 32, matching model.fit's default batch size.
    for x_batch, y_batch in tf.data.Dataset.from_tensor_slices((X, y_true)).batch(32):
        with tf.GradientTape() as tape:
            y = model_g(x_batch, training=True)
            loss_value = loss_fn(y_batch, y)
        epoch_losses.append(loss_value.numpy())
        grads = tape.gradient(loss_value, model_g.trainable_variables)
        opt_g.apply_gradients(zip(grads, model_g.trainable_variables))
    # Report the mean loss over all mini-batches in the epoch.
    print(np.mean(epoch_losses))
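To double-check that batch size is the only difference, a quick sanity check (just a sketch, reusing build_model_fit, normed_train_data, train_labels, and EPOCHS from the code above; model_check is simply a fresh model instance) is to force model.fit to use one full-size batch, which should reproduce the slow loss decrease of the original full-batch tape loop:
# Sanity check (sketch): a single full-size batch per epoch should behave
# like the original GradientTape loop that used the whole training set at once.
model_check = build_model_fit()
model_check.fit(normed_train_data, train_labels,
                batch_size=len(normed_train_data),  # one gradient update per epoch
                epochs=EPOCHS, verbose=2)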
Also note that model.fit shuffles the data each epoch, whereas the custom training loop does not; shuffling has to be implemented by the developer (a sketch follows below).
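For completeness, here is a minimal sketch of adding that shuffling with the same tf.data pipeline (reusing X and y_true from above; a buffer_size equal to the dataset size gives a full shuffle, and the dataset is reshuffled on every pass by default):
# Sketch: shuffle before batching so each epoch sees the samples in a new order,
# which more closely mimics model.fit's behavior.
train_ds = (tf.data.Dataset.from_tensor_slices((X, y_true))
            .shuffle(buffer_size=X.shape[0])  # full shuffle, re-drawn each epoch
            .batch(32))
# The inner loop above can then iterate over train_ds instead of rebuilding the
# un-shuffled dataset every epoch.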