tensorflow, keras, distributed-computing, keras-tuner

Cannot Restore Checkpoint Without Error in Keras Tuner


I'm using Keras Tuner to hyperparameter-tune my CNN for processing EMG data. The code runs on 4 NVIDIA GPUs and 5 CPUs with TensorFlow's MirroredStrategy. When I run the tuner, it gets through all the trials, each for two epochs. But when it tries to restore the models it saved in order to run more epochs for the next Hyperband iteration, it throws this error: "Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope."
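
As I understand it, the error refers to TensorFlow's general rule that a model and its optimizer (and therefore the optimizer's slot variables) have to be created or restored inside the same strategy.scope() that was active when the checkpoint was written. A minimal sketch of that rule, just for context (this is not my training code, and the path is only a placeholder):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

# Restoring a compiled model inside the scope keeps all of its variables,
# including the optimizer's slot variables, under this strategy.
with strategy.scope():
    restored_model = tf.keras.models.load_model("path/to/saved_model")  # placeholder path

# Calling load_model outside the scope is the situation the error warns about.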

This is the code:

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, Input, MaxPooling2D
from tensorflow.keras.models import Model
from keras_tuner import HyperModel, Hyperband


class CNNHyperModel(HyperModel):

  def __init__(self, input_shape, output_shape):
    self.input_shape = input_shape
    self.output_shape = output_shape

  def build(self, hp):
    visible = Input(shape = (self.input_shape[0], self.input_shape[1], 1))
    
    filter_number = hp.Int("filter_base_size", 5, 7)

    conv = Conv2D(filters=2**(filter_number), \
      kernel_size=(4,4), activation='relu', padding = 'same')(visible)
    pooling = MaxPooling2D(pool_size=(2, 2), strides=2, padding='valid')(conv)

    conv2 = Conv2D(filters=2**(filter_number + 1), \
      kernel_size=(3,3), activation='relu', padding = 'same')(pooling)
    pooling2 =  MaxPooling2D(pool_size=(2, 2), strides=2, padding='valid')(conv2)

    conv3 = Conv2D(filters=2**(filter_number + 2), \
      kernel_size=(2,2), activation='relu', padding = 'same')(pooling2)
    pooling3 =  MaxPooling2D(pool_size=(2, 2), strides=2, padding='valid')(conv3)

    flatten = Flatten()(pooling3)
    dropout = Dropout(0.5)(flatten)
    
    num_layers = hp.Int("num_layers", 3, 4)
    dense_units = [hp.Int("dense_units_1", 80, 240, 30), \
        hp.Int("dense_units_2", 80, 240, 30), hp.Int("dense_units_3", 80, 240, 30), \
        hp.Int("dense_units_4", 80, 240, 30)]
    #num_layers = 4
    #dense_units = [hp.Int("dense_units_2", 80, 240, 30), 120, 140, 200]
    hidden = dropout

    for i in range(num_layers):
      hidden = Dense(units=dense_units[i], activation = 'relu')(hidden)

    output = Dense(units=self.output_shape[0], activation = 'softmax')(hidden)

    model = Model(inputs=visible, outputs=output)

    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model
    
    
strategy = tf.distribute.MirroredStrategy()
batch_size = 800
epochs = 80
val_dataset = (np.array(testing_input), np.array(testing_output))


tuner = Hyperband(
  hypermodel = CNNHyperModel(
    input_shape = [len(training_input[0]), len(training_input[0][0])],
    output_shape = [len(training_output[0])],
  ),
  objective='accuracy',
  distribution_strategy=strategy,
  hyperband_iterations=5
)

early_stopping_callback = keras.callbacks.EarlyStopping(
  monitor='val_loss', 
  patience=5
)

tuner.search(
  x = np.array(training_input),
  y = np.array(training_output),
  validation_data=val_dataset,
  callbacks=[early_stopping_callback],
  batch_size=batch_size,
  epochs = epochs,
)

best_model = tuner.get_best_models(num_models=1)[0]

best_model.fit(
    x = np.array(training_input),
    y = np.array(training_output),
    epochs=epochs,
    batch_size=batch_size,
    validation_data=val_dataset,
)

Solution

• I have the same issue running keras-tuner==1.2.1 with tf==2.4.1 (and tf-gpu==2.4.1) on 2 GPUs (NVIDIA RTX), but not with keras-tuner==1.1.0 and tf==2.4.1 (and tf-gpu==2.4.1), using this code:

    tuner = kt.Hyperband(
       hypermodel=build_model,
       objective= kt.Objective("val_loss", direction="min"),
       max_epochs=20,
       factor=3,
       seed = 17,
       hyperband_iterations=1,
       distribution_strategy=tf.distribute.MirroredStrategy(),
       directory='models',
       project_name=EXPERIMENT,
       tuner_id = EXPERIMENT,
       overwrite=True,
       logger=NeptuneLogger()
       )
    
    tuner.search(
       train_data,
       steps_per_epoch=20,
       validation_data=val_data,
       validation_steps=5,
       callbacks=callbacks
       )
    

    So an answer could be to downgrade keras-tuner, although I'd rather have it working with the new version.
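
    Another workaround I've seen suggested (untested on this exact setup) is to avoid restoring the checkpointed models altogether: take the best hyperparameters from the search, rebuild the model inside the strategy scope, and retrain it. Roughly, reusing the same tuner, data, and callbacks as above:

    strategy = tf.distribute.MirroredStrategy()

    # Rebuild the winning configuration inside the strategy scope so all
    # variables (including optimizer slot variables) are created there,
    # instead of being restored from a trial checkpoint.
    best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
    with strategy.scope():
        model = tuner.hypermodel.build(best_hp)

    model.fit(
       train_data,
       steps_per_epoch=20,
       validation_data=val_data,
       validation_steps=5,
       callbacks=callbacks,
       epochs=20,
       )

    This never touches the checkpoints that Hyperband wrote during the search, so the restore path that raises the error is not taken.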