python tensorflow machine-learning keras sgd

How to calculate the maximum gradient for each layer given a mini-batch


I am trying to implement a fully-connected model for classification on the MNIST dataset. Part of the code is the following:

n = 5
act_func = 'relu'

classifier = tf.keras.models.Sequential()
classifier.add(layers.Flatten(input_shape=(28, 28, 1)))
for i in range(n):
  classifier.add(layers.Dense(32, activation=act_func))
classifier.add(layers.Dense(10, activation='softmax'))
opt = tf.keras.optimizers.SGD(learning_rate=0.01)
classifier.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

classifier.summary()

history = classifier.fit(x_train, y_train, batch_size=32, epochs=3, validation_data=(x_test,y_test))

Is there a way to print the maximum gradient for each layer for a given mini-batch?


Solution

  • You could start off with a custom training loop using tf.GradientTape:

    import tensorflow as tf
    import tensorflow_datasets as tfds

    # Load MNIST as (image, label) pairs.
    (ds_train, ds_test), ds_info = tfds.load(
        'mnist',
        split=['train', 'test'],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    n = 5
    act_func = 'relu'

    classifier = tf.keras.models.Sequential()
    classifier.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))
    for i in range(n):
        classifier.add(tf.keras.layers.Dense(32, activation=act_func))
    classifier.add(tf.keras.layers.Dense(10, activation='softmax'))
    opt = tf.keras.optimizers.SGD(learning_rate=0.01)
    loss = tf.keras.losses.CategoricalCrossentropy()

    classifier.summary()

    epochs = 1
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))
        # 50 samples in mini-batches of 10 -> 5 batches per epoch.
        for step, (x_batch_train, y_batch_train) in enumerate(ds_train.take(50).batch(10)):
            x_batch_train = tf.cast(x_batch_train, dtype=tf.float32)
            y_batch_train = tf.keras.utils.to_categorical(y_batch_train, 10)

            # Usual training step: gradient of the loss w.r.t. the weights.
            with tf.GradientTape() as tape:
                logits = classifier(x_batch_train, training=True)
                loss_value = loss(y_batch_train, logits)

            grads = tape.gradient(loss_value, classifier.trainable_weights)
            opt.apply_gradients(zip(grads, classifier.trainable_weights))

            # Second, persistent tape: run the forward pass layer by layer and
            # record each intermediate output so it can be differentiated
            # w.r.t. the input batch.
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(x_batch_train)
                x = classifier.layers[0](x_batch_train)   # Flatten layer
                outputs = []
                for layer in classifier.layers[1:]:       # Dense layers
                    x = layer(x)
                    outputs.append(x)

            # Maximum absolute gradient of each layer's output w.r.t. the input.
            for idx, output in enumerate(outputs):
                grad = tf.math.abs(tape.gradient(output, x_batch_train))
                print('Max gradient for layer {} is {}'.format(idx + 1, tf.reduce_max(grad)))
            print('End of batch {}'.format(step + 1))
    
    Model: "sequential_9"
    _________________________________________________________________
     Layer (type)                Output Shape              Param #   
    =================================================================
     flatten_9 (Flatten)         (None, 784)               0         
                                                                     
     dense_54 (Dense)            (None, 32)                25120     
                                                                     
     dense_55 (Dense)            (None, 32)                1056      
                                                                     
     dense_56 (Dense)            (None, 32)                1056      
                                                                     
     dense_57 (Dense)            (None, 32)                1056      
                                                                     
     dense_58 (Dense)            (None, 32)                1056      
                                                                     
     dense_59 (Dense)            (None, 10)                330       
                                                                     
    =================================================================
    Total params: 29,674
    Trainable params: 29,674
    Non-trainable params: 0
    _________________________________________________________________
    
    Start of epoch 0
    Max gradient for layer 1 is 0.7913536429405212
    Max gradient for layer 2 is 0.8477020859718323
    Max gradient for layer 3 is 0.7188305854797363
    Max gradient for layer 4 is 0.5108454823493958
    Max gradient for layer 5 is 0.3362882435321808
    Max gradient for layer 6 is 1.9748875867975357e-09
    End of batch 1
    Max gradient for layer 1 is 0.7535678148269653
    Max gradient for layer 2 is 0.6814548373222351
    Max gradient for layer 3 is 0.5748667120933533
    Max gradient for layer 4 is 0.5439972877502441
    Max gradient for layer 5 is 0.27793681621551514
    Max gradient for layer 6 is 1.9541412932255753e-09
    End of batch 2
    Max gradient for layer 1 is 0.8606255650520325
    Max gradient for layer 2 is 0.8506941795349121
    Max gradient for layer 3 is 0.8556670546531677
    Max gradient for layer 4 is 0.43756356835365295
    Max gradient for layer 5 is 0.2675274908542633
    Max gradient for layer 6 is 3.7072431791074223e-09
    End of batch 3
    Max gradient for layer 1 is 0.7640039324760437
    Max gradient for layer 2 is 0.6926062107086182
    Max gradient for layer 3 is 0.6164448857307434
    Max gradient for layer 4 is 0.43013691902160645
    Max gradient for layer 5 is 0.32356566190719604
    Max gradient for layer 6 is 3.2926392723453546e-09
    End of batch 4
    Max gradient for layer 1 is 0.7604862451553345
    Max gradient for layer 2 is 0.6908300518989563
    Max gradient for layer 3 is 0.6122230887413025
    Max gradient for layer 4 is 0.39982378482818604
    Max gradient for layer 5 is 0.3172021210193634
    Max gradient for layer 6 is 2.3238742041797877e-09
    End of batch 5
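
  • If by "maximum gradient for each layer" you mean the gradient of the loss with respect to each layer's weights (rather than the gradient of each layer's output with respect to the input, as above), you can read it straight off the grads list already computed in the training step. A minimal sketch, assuming the variable names from the loop above (print_max_weight_grads is just an illustrative helper; note that each Dense layer contributes two trainable variables, a kernel and a bias):

    import tensorflow as tf

    def print_max_weight_grads(model, grads):
        # grads is expected to line up with model.trainable_weights, e.g. the
        # result of tape.gradient(loss_value, model.trainable_weights).
        for var, grad in zip(model.trainable_weights, grads):
            print('Max |dL/d{}| = {}'.format(var.name, tf.reduce_max(tf.abs(grad))))

    Calling print_max_weight_grads(classifier, grads) right after opt.apply_gradients(...) inside the loop prints one maximum per trainable variable for the current mini-batch.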