I'm using Keras Tuner to tune the hyperparameters of my CNN, which processes EMG data. The code runs on 4 NVIDIA GPUs and 5 CPUs with TensorFlow's MirroredStrategy. When I run the tuner, it gets through all the trials, training each for two epochs. When it then tries to restore the saved models to train them for more epochs in the next Hyperband iteration, it throws this error:
Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
This is the code:
class CNNHyperModel(HyperModel):
    """Keras Tuner hypermodel that builds a three-stage 2-D CNN classifier.

    The tuned hyperparameters are the base filter count of the convolutional
    stack ("filter_base_size"), the number of dense layers ("num_layers") and
    the width of each dense layer ("dense_units_1" .. "dense_units_4").
    """

    def __init__(self, input_shape, output_shape):
        """Store the data dimensions used to size the network.

        Args:
            input_shape: Two-element sequence; its entries become the first
                two dimensions of the model input (a single trailing channel
                dimension is appended in ``build``).
            output_shape: Sequence whose first element is the number of units
                in the softmax output layer.
        """
        # Initialise the HyperModel base class so any state it keeps for the
        # tuner is set up before the tuner uses this object.
        super().__init__()
        self.input_shape = input_shape
        self.output_shape = output_shape

    def build(self, hp):
        """Build and compile one candidate model for the given hyperparameters.

        Args:
            hp: Keras Tuner ``HyperParameters`` object used to sample values.

        Returns:
            A compiled Keras ``Model`` with Adam, categorical cross-entropy
            loss and an accuracy metric.
        """
        visible = Input(shape=(self.input_shape[0], self.input_shape[1], 1))

        # The filter count doubles at each stage: 2**n, 2**(n + 1), 2**(n + 2).
        filter_number = hp.Int("filter_base_size", 5, 7)
        hidden = visible
        for stage, kernel_size in enumerate([(4, 4), (3, 3), (2, 2)]):
            hidden = Conv2D(
                filters=2 ** (filter_number + stage),
                kernel_size=kernel_size,
                activation="relu",
                padding="same",
            )(hidden)
            hidden = MaxPooling2D(
                pool_size=(2, 2), strides=2, padding="valid"
            )(hidden)

        hidden = Flatten()(hidden)
        hidden = Dropout(0.5)(hidden)

        num_layers = hp.Int("num_layers", 3, 4)
        # All four widths are registered up front (as in the original search
        # space), even though only the first `num_layers` of them are used.
        dense_units = [
            hp.Int(f"dense_units_{index}", 80, 240, step=30)
            for index in range(1, 5)
        ]
        for units in dense_units[:num_layers]:
            hidden = Dense(units=units, activation="relu")(hidden)

        output = Dense(units=self.output_shape[0], activation="softmax")(hidden)

        model = Model(inputs=visible, outputs=output)
        model.compile(
            optimizer="adam",
            loss="categorical_crossentropy",
            # Keras documents `metrics` as a list of metrics.
            metrics=["accuracy"],
        )
        return model
# Distribution strategy handed to the tuner so every trial model is built
# and trained under the same strategy.
strategy = tf.distribute.MirroredStrategy()
batch_size = 800
epochs = 80

# Convert the datasets once and reuse the arrays for both the search and the
# final fit instead of rebuilding them for every call.
train_x = np.array(training_input)
train_y = np.array(training_output)
val_dataset = (np.array(testing_input), np.array(testing_output))

tuner = Hyperband(
    hypermodel=CNNHyperModel(
        input_shape=[len(training_input[0]), len(training_input[0][0])],
        output_shape=[len(training_output[0])],
    ),
    # Rank trials on held-out data: validation data is supplied and early
    # stopping already watches `val_loss`, so training accuracy would favour
    # models that merely memorise the training set.
    objective="val_accuracy",
    # Hyperband schedules the per-trial epoch count itself, bounded by
    # `max_epochs`, so the epoch budget belongs here rather than in search().
    max_epochs=epochs,
    distribution_strategy=strategy,
    hyperband_iterations=5,
)

early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
)

tuner.search(
    x=train_x,
    y=train_y,
    validation_data=val_dataset,
    callbacks=[early_stopping_callback],
    batch_size=batch_size,
)

# Continue training the best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.fit(
    x=train_x,
    y=train_y,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=val_dataset,
)
I have the same issue running keras-tuner=1.2.1 with tf=2.4.1 (and tf-gpu=2.4.1) on 2 GPUs (NVIDIA RTX), but not with keras-tuner=1.1.0 and tf=2.4.1 (and tf-gpu=2.4.1), using this code:
# Hyperband tuner minimising validation loss, distributed with
# MirroredStrategy; results are written under models/<EXPERIMENT>.
tuner = kt.Hyperband(
    hypermodel=build_model,
    objective=kt.Objective("val_loss", direction="min"),
    max_epochs=20,
    factor=3,
    seed=17,
    hyperband_iterations=1,
    distribution_strategy=tf.distribute.MirroredStrategy(),
    directory="models",
    project_name=EXPERIMENT,
    tuner_id=EXPERIMENT,
    overwrite=True,
    logger=NeptuneLogger(),
)

# search() runs the trials but returns nothing, so the best hyperparameters
# have to be read back from the tuner once it has finished.
tuner.search(
    train_data,
    steps_per_epoch=20,
    validation_data=val_data,
    validation_steps=5,
    callbacks=callbacks,
)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
So an answer could be to downgrade keras-tuner, although I'd rather have it working with the new version.