catboost optuna catboostregressor

Using Optuna for CatBoost with batches: got nan on second trial


I am trying to tune CatBoost's hyperparameters using Optuna. I need to train my CatBoost model in batches, because the training data is too big.

Here is my code:

def expand_embeddings(df, embedding_col="embeddings"):
    """Flatten a frame with an embedding column into one float32 matrix.

    The values of ``embedding_col`` are stacked into a 2-D array (this assumes
    every cell holds a sequence of the same length) and appended to the right
    of the remaining columns.

    Args:
        df: DataFrame containing ``embedding_col`` plus numeric columns.
        embedding_col: Name of the column holding the per-row sequences.

    Returns:
        ``np.float32`` array: the other columns first, then the embedding values.
    """
    embedding_matrix = np.array(df[embedding_col].tolist(), dtype=np.float32)
    scalar_columns = df.drop(columns=[embedding_col])
    scalar_matrix = scalar_columns.to_numpy(dtype=np.float32)
    return np.hstack([scalar_matrix, embedding_matrix])

def batch_generator(df, target_col, batch_size):
    """Yield ``(X, y)`` arrays for consecutive row slices of ``df``.

    Args:
        df: DataFrame holding the target column, the embedding column and the
            remaining feature columns.
        target_col: Name of the column used as the regression target.
        batch_size: Maximum number of rows per slice.

    Yields:
        Tuple ``(X, y)``: ``X`` is the output of ``expand_embeddings`` for the
        slice without the target column, ``y`` the target as ``np.float32``.
    """
    for start in range(0, len(df), batch_size):
        chunk = df.iloc[start:start + batch_size]
        targets = chunk[target_col].to_numpy(dtype=np.float32)
        features = expand_embeddings(chunk.drop(columns=[target_col]))
        yield features, targets

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)

# Each split is cut into batches whose size is its row count divided by
# num_batches, rounded up.
num_batches = 1300
batch_size_train = math.ceil(len(train_data) / num_batches)
batch_size_test = math.ceil(len(val_data) / num_batches)

# These generator objects are created once, at module level, and are the ones
# objective_regressor iterates over on every call.
train_batches_regressor = batch_generator(train_data, "weight", batch_size_train)
val_batches_regressor = batch_generator(val_data, "weight", batch_size_test)

def objective_regressor(trial):
    """Optuna objective: fit CatBoost over the module-level batch generators.

    Samples hyperparameters from ``trial``, trains a single
    ``CatBoostRegressor`` incrementally on the batches produced by
    ``train_batches_regressor`` (each paired with the next batch taken from
    ``val_batches_regressor``) and scores every step on its validation batch.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``np.mean`` of the per-batch ``mean_squared_error`` values.
    """
    params = dict(
        iterations=trial.suggest_int('iterations', 500, 2000),
        depth=trial.suggest_int('depth', 4, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        random_strength=trial.suggest_float('random_strength', 0.1, 10),
        eval_metric='RMSE',
    )
    model = CatBoostRegressor(task_type='CPU', random_seed=42, verbose=0, **params)

    batch_scores = []
    for step, (X_batch, y_batch) in enumerate(tqdm(train_batches_regressor)):
        X_val_batch, y_val_batch = next(val_batches_regressor)

        fit_options = {
            'eval_set': (X_val_batch, y_val_batch),
            'use_best_model': True,
            'verbose': 0,
        }
        if step > 0:
            # Every batch after the first continues from the current model.
            fit_options['init_model'] = model
        model.fit(X_batch, y_batch, **fit_options)

        predictions = model.predict(X_val_batch)
        batch_scores.append(mean_squared_error(y_val_batch, predictions))

    return np.mean(batch_scores)

# Search for the hyperparameters that minimise the objective over 20 trials.
study_regressor = optuna.create_study(direction="minimize")
study_regressor.optimize(func=objective_regressor, n_trials=20)

The first trial (trial 0) runs fine, but on the next one I get the following error:

/opt/anaconda3/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.12/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide
  ret = ret.dtype.type(ret / rcount)
[W 2024-11-29 05:47:52,256] Trial 1 failed with parameters: {'iterations': 1053, 'depth': 9, 'learning_rate': 0.03843036523508586, 'l2_leaf_reg': 7.883891260457, 'bagging_temperature': 0.5680668697003115, 'random_strength': 5.111730514165936} because of the following error: The value nan is not acceptable.
[W 2024-11-29 05:47:52,256] Trial 1 failed with value nan.

How should I tune parameters correctly?


Solution

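  • It turned out that the problem was not in the Optuna library. I used generators to produce the batches and created them once, outside the objective function. A generator can be iterated only once, so after the first trial it was exhausted: in the second trial the training loop ran zero times, the `rmse` list stayed empty, and `np.mean` of an empty list returned NaN.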

    The corrected code is:

    def expand_embeddings(df, embedding_col="embeddings"):
        """Turn a frame with an embedding column into one float32 matrix.

        The values of ``embedding_col`` are stacked into a 2-D array (assumes
        each cell holds a sequence of the same length) and placed to the right
        of all other columns.

        Args:
            df: DataFrame containing ``embedding_col`` plus numeric columns.
            embedding_col: Name of the column holding the per-row sequences.

        Returns:
            ``np.float32`` array: other columns first, embedding values after.
        """
        vector_part = np.array(df[embedding_col].tolist(), dtype=np.float32)
        rest = df.drop(columns=[embedding_col])
        return np.hstack([rest.to_numpy(dtype=np.float32), vector_part])
    
    def batch_generator(df, target_col, batch_size):
        """Yield ``(X, y)`` arrays for consecutive row slices of ``df``.

        Args:
            df: DataFrame holding the target column, the embedding column and
                the remaining feature columns.
            target_col: Name of the column used as the regression target.
            batch_size: Maximum number of rows per slice.

        Yields:
            Tuple ``(X, y)``: ``X`` is the output of ``expand_embeddings`` for
            the slice without the target column, ``y`` the target as
            ``np.float32``.
        """
        for offset in range(0, len(df), batch_size):
            rows = df.iloc[offset:offset + batch_size]
            labels = rows[target_col].to_numpy(dtype=np.float32)
            inputs = expand_embeddings(rows.drop(columns=[target_col]))
            yield inputs, labels
    
    # Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
    train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)

    # Batch size per split: its row count divided by num_batches, rounded up.
    num_batches = 1300
    batch_size_train = math.ceil(len(train_data) / num_batches)
    batch_size_test = math.ceil(len(val_data) / num_batches)
    
    def objective_regressor(trial):
        """Optuna objective: train CatBoost batch by batch and score it.

        Hyperparameters are sampled from ``trial``; one ``CatBoostRegressor``
        is then trained incrementally, each batch continuing from the model
        fitted on the previous ones, and scored on the paired validation batch.

        The batch generators are created inside this function on purpose: a
        generator can be consumed only once, so generators shared between
        trials would leave every trial after the first without data.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            Mean of the per-batch validation RMSE values.

        Raises:
            ValueError: If no (train, validation) batch pair could be formed,
                e.g. because one of the splits is empty.
        """
        params = {
            'iterations': trial.suggest_int('iterations', 500, 2000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
            'random_strength': trial.suggest_float('random_strength', 0.1, 10),
            'eval_metric': 'RMSE'}

        model = CatBoostRegressor(
            **params,
            task_type='CPU',
            random_seed=42,
            verbose=0)

        # Fresh, single-use generators for this trial only.
        train_batches_regressor = batch_generator(
            train_data, target_col="weight", batch_size=batch_size_train)
        val_batches_regressor = batch_generator(
            val_data, target_col="weight", batch_size=batch_size_test)

        # Both batch sizes are rounded up independently, so the two splits need
        # not yield the same number of batches. zip() stops at the shorter
        # stream instead of letting next() raise StopIteration mid-trial.
        batch_pairs = zip(train_batches_regressor, val_batches_regressor)

        rmse = []
        for batch_idx, (train_batch, val_batch) in enumerate(tqdm(batch_pairs)):
            X_batch, y_batch = train_batch
            X_val_batch, y_val_batch = val_batch

            # Every batch after the first continues training from the current model.
            warm_start = {'init_model': model} if batch_idx > 0 else {}
            model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch),
                      use_best_model=True, verbose=0, **warm_start)

            y_pred = model.predict(X_val_batch)
            # mean_squared_error returns the MSE; take the root so the score is
            # really the RMSE that the variable name and eval_metric refer to.
            rmse.append(np.sqrt(mean_squared_error(y_val_batch, y_pred)))

        if not rmse:
            # np.mean([]) would silently return NaN, which Optuna rejects with
            # a much less helpful message.
            raise ValueError("No training/validation batches were produced; "
                             "check that train_data and val_data are not empty.")

        return np.mean(rmse)
    
    # Search for the hyperparameters that minimise the objective over 20 trials.
    study_regressor = optuna.create_study(direction="minimize")
    study_regressor.optimize(func=objective_regressor, n_trials=20)