I am trying to tune CatBoost's hyperparameters using Optuna. I need to train my CatBoost model in batches, because the training data is too big to fit in memory.
Here is my code:
def expand_embeddings(df, embedding_col="embeddings"):
    """Turn the list-valued *embedding_col* into plain numeric columns.

    The remaining columns of *df* are cast to float32 and the per-row
    embedding vectors are appended to the right, yielding one 2-D
    float32 feature matrix.
    """
    plain = df.drop(columns=[embedding_col]).to_numpy(dtype=np.float32)
    vectors = np.asarray(list(df[embedding_col]), dtype=np.float32)
    return np.concatenate([plain, vectors], axis=1)
def batch_generator(df, target_col, batch_size):
    """Lazily yield (features, target) pairs of *batch_size* rows from *df*.

    Features are expanded via expand_embeddings; targets come from
    *target_col*, both as float32.
    """
    n_rows = len(df)
    start = 0
    while start < n_rows:
        window = df.iloc[start:start + batch_size]
        target = window[target_col].to_numpy(dtype=np.float32)
        features = expand_embeddings(window.drop(columns=[target_col]))
        yield features, target
        start += batch_size
# Split the full dataset (`result`) into 90% train / 10% validation.
train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)
num_batches = 1300
# Ceil so the last, possibly smaller, batch is still covered.
batch_size_train = math.ceil(train_data.shape[0] / num_batches)
batch_size_test = math.ceil(val_data.shape[0] / num_batches)
# NOTE(review): creating the generators here, at module level, is the root
# cause of the failure reported below — a Python generator can be consumed
# only once, so every Optuna trial after the first sees them already
# exhausted (the answer further down moves them inside the objective).
train_batches_regressor = batch_generator(train_data, target_col="weight", batch_size=batch_size_train)
val_batches_regressor = batch_generator(val_data, target_col="weight", batch_size=batch_size_test)
def objective_regressor(trial):
# Optuna objective (question's buggy version): trains CatBoost
# incrementally over the module-level batch generators and returns the
# mean validation score across batches.
params = {
'iterations': trial.suggest_int('iterations', 500, 2000),
'depth': trial.suggest_int('depth', 4, 10),
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
'random_strength': trial.suggest_float('random_strength', 0.1, 10),
'eval_metric': 'RMSE'}
model = CatBoostRegressor(
**params,
task_type = 'CPU',
random_seed=42,
verbose=0)
rmse = []
iterations = 0
# BUG: train_batches_regressor / val_batches_regressor are the module-level
# generators; after trial 0 consumes them they are exhausted, this loop body
# never runs again, rmse stays empty, and np.mean([]) returns NaN — which is
# exactly the "Mean of empty slice" warning and the NaN trial value below.
for X_batch, y_batch in tqdm(train_batches_regressor):
X_val_batch, y_val_batch = next(val_batches_regressor)
if iterations == 0:
model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch),
use_best_model=True, verbose=0)
else:
# Continue boosting from the model fitted on the previous batches.
model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch),
use_best_model=True, verbose=0, init_model=model)
y_pred = model.predict(X_val_batch)
# NOTE(review): mean_squared_error returns plain MSE, not RMSE,
# despite the list's name.
rmse.append(mean_squared_error(y_val_batch, y_pred))
iterations += 1
return np.mean(rmse)
# Minimize the objective (mean validation error) over 20 trials.
study_regressor = optuna.create_study(direction='minimize')
study_regressor.optimize(objective_regressor, n_trials=20)
The first (zero-indexed) trial goes fine, but on the next one I get the following error:
/opt/anaconda3/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.
return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.12/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide
ret = ret.dtype.type(ret / rcount)
[W 2024-11-29 05:47:52,256] Trial 1 failed with parameters: {'iterations': 1053, 'depth': 9, 'learning_rate': 0.03843036523508586, 'l2_leaf_reg': 7.883891260457, 'bagging_temperature': 0.5680668697003115, 'random_strength': 5.111730514165936} because of the following error: The value nan is not acceptable.
[W 2024-11-29 05:47:52,256] Trial 1 failed with value nan.
How can I tune the hyperparameters correctly?
It turned out that the problem was not in the Optuna library. I used a generator to produce batches and created the generator instances outside the trial function. A generator can only be consumed once, so after the first trial both generators are exhausted: the training loop never runs, the score list stays empty, and `np.mean([])` returns NaN — hence the "Mean of empty slice" warning and the failed trial.
The correct code is:
def expand_embeddings(df, embedding_col="embeddings"):
    """Expand the list-valued *embedding_col* of *df* into numeric columns.

    Returns a float32 matrix: the non-embedding columns first, followed by
    the stacked per-row embedding vectors.
    """
    embedding_part = np.array(df[embedding_col].to_list(), dtype=np.float32)
    feature_part = df.drop(columns=[embedding_col]).to_numpy(dtype=np.float32)
    return np.hstack([feature_part, embedding_part])
def batch_generator(df, target_col, batch_size):
    """Yield (X, y) batches of at most *batch_size* rows from *df*.

    X is the expanded float32 feature matrix (see expand_embeddings),
    y the float32 target vector taken from *target_col*.
    """
    for offset in range(0, len(df), batch_size):
        chunk = df.iloc[offset:offset + batch_size]
        labels = chunk[target_col].to_numpy(dtype=np.float32)
        matrix = expand_embeddings(chunk.drop(columns=[target_col]))
        yield matrix, labels
# 90/10 train/validation split of the full dataset (`result`).
train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)
# Fixed number of batches; ceil keeps every row covered by some batch.
num_batches = 1300
batch_size_train = math.ceil(train_data.shape[0] / num_batches)
batch_size_test = math.ceil(val_data.shape[0] / num_batches)
def objective_regressor(trial):
    """Optuna objective: incremental CatBoost training over batches.

    Fresh batch generators are created on every call — a generator can be
    consumed only once, so sharing generators across trials would leave
    every trial after the first with exhausted iterators and an empty
    score list (np.mean([]) is NaN).

    Returns the mean validation RMSE across batches (lower is better).
    """
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'eval_metric': 'RMSE'}
    model = CatBoostRegressor(
        **params,
        task_type='CPU',
        random_seed=42,
        verbose=0)
    rmse = []
    # Creating the generators inside the objective gives each trial its own
    # fresh iterators — this is the fix for the NaN failures.
    train_batches_regressor = batch_generator(train_data, target_col="weight",
                                              batch_size=batch_size_train)
    val_batches_regressor = batch_generator(val_data, target_col="weight",
                                            batch_size=batch_size_test)
    # zip stops cleanly when either generator is exhausted; the original
    # next() call would raise StopIteration if the val generator produced
    # fewer batches than the train generator.
    batch_pairs = zip(train_batches_regressor, val_batches_regressor)
    for batch_idx, (train_pair, val_pair) in enumerate(tqdm(batch_pairs)):
        X_batch, y_batch = train_pair
        X_val_batch, y_val_batch = val_pair
        # After the first batch, continue boosting from the existing model.
        extra = {} if batch_idx == 0 else {'init_model': model}
        model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch),
                  use_best_model=True, verbose=0, **extra)
        y_pred = model.predict(X_val_batch)
        # BUG FIX: the original appended plain MSE while calling it RMSE;
        # take the square root so the score matches eval_metric='RMSE'.
        rmse.append(np.sqrt(mean_squared_error(y_val_batch, y_pred)))
    return np.mean(rmse)
# Minimize the objective (mean validation RMSE) over 20 trials.
study_regressor = optuna.create_study(direction='minimize')
study_regressor.optimize(objective_regressor, n_trials=20)