I am trying to train a lightgbm ML model in Python using rmsle as the eval metric, but am encountering an issue when I try to include early stopping.
Here is my code:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Load the training data; every column except 'target' is used as a feature.
df_train = pd.read_csv('train_data.csv')
X_train = df_train.drop('target', axis=1)
# The model is fit on the natural log of the target.
y_train = np.log(df_train['target'])
# Parameter dictionary handed to lgb.train through train_lightgbm.
sample_params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'random_state': 42,
# NOTE(review): 'rmsle' does not appear to be a built-in LightGBM metric name;
# with this value early stopping reports that no eval metric is available,
# whereas 'rmse' works.
'metric': 'rmsle',
'lambda_l1': 5,
'lambda_l2': 5,
'num_leaves': 5,
'bagging_freq': 5,
'max_depth': 5,
'max_bin': 5,
'min_child_samples': 5,
'feature_fraction': 0.5,
'bagging_fraction': 0.5,
'learning_rate': 0.1,
}
# Hold out 20% of the training rows for validation, with a fixed seed.
X_train_tr, X_train_val, y_train_tr, y_train_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
def train_lightgbm(X_train_tr, y_train_tr, X_train_val, y_train_val, params, num_boost_round, early_stopping_rounds, verbose_eval, feval=None):
    """Train a LightGBM model on the training split, evaluating on the validation split.

    Args:
        X_train_tr: Features of the training split.
        y_train_tr: Labels of the training split.
        X_train_val: Features of the validation split.
        y_train_val: Labels of the validation split.
        params: Parameter dictionary forwarded to ``lgb.train``.
        num_boost_round: Forwarded to ``lgb.train`` unchanged.
        early_stopping_rounds: Forwarded to ``lgb.train`` unchanged.
        verbose_eval: Forwarded to ``lgb.train`` unchanged.
        feval: Optional custom evaluation function forwarded to
            ``lgb.train``. Defaults to ``None``, which keeps the original
            behaviour of relying on the metric named in ``params``.

    Returns:
        The object returned by ``lgb.train``.

    NOTE(review): newer LightGBM releases are reported to replace the
    ``early_stopping_rounds`` / ``verbose_eval`` keywords with callbacks;
    confirm against the installed version.
    """
    d_train = lgb.Dataset(X_train_tr, y_train_tr)
    d_val = lgb.Dataset(X_train_val, y_train_val)
    model = lgb.train(
        params=params,
        train_set=d_train,
        num_boost_round=num_boost_round,
        valid_sets=d_val,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        feval=feval,
    )
    return model
# Fit the model on the train/validation split created earlier.
model = train_lightgbm(
X_train_tr,
y_train_tr,
X_train_val,
y_train_val,
params=sample_params,
num_boost_round=500,
# NOTE(review): a boolean is passed here although the name suggests a number
# of rounds; confirm the intended value.
early_stopping_rounds=True,
verbose_eval=1
)
# Load the test data and build features/labels the same way as for training.
df_test = pd.read_csv('test_data.csv')
X_test = df_test.drop('target', axis=1)
y_test = np.log(df_test['target'])
# The model was fit on log(target), so np.exp maps its predictions back to
# the original target scale.
df_train['prediction'] = np.exp(model.predict(X_train))
df_test['prediction'] = np.exp(model.predict(X_test))
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of *y_pred* against *y_true*.

    Both inputs must be on the original (non-log) scale, because the log
    transform is applied here.

    Args:
        y_true: Sequence of observed values.
        y_pred: Sequence of predicted values, same length as ``y_true``.

    Returns:
        sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)).

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Explicit check instead of ``assert``, which is stripped under ``python -O``.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # np.log1p(x) already computes log(1 + x); adding another 1 to the
    # argument would evaluate log(x + 2) instead.
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(log_diff ** 2))
# Score on the original target scale: rmsle applies the log transform itself,
# and the predictions were already mapped back with np.exp, so passing the
# log-transformed y_test would take the logarithm of the true values twice.
metric = rmsle(df_test['target'], df_test['prediction'])
print('Test Metric Value:', round(metric, 4))
If I set early_stopping_rounds=False in the call to train_lightgbm, the code runs without a problem.
However, if I set early_stopping_rounds=True, it raises the following error:
ValueError: For early stopping, at least one dataset and eval metric is required for evaluation.
If I run a similar script but with 'metric': 'rmse' instead of 'rmsle' in sample_params, it runs without error even when early_stopping_rounds=True.
What do I need to add for lightgbm to recognize my dataset and eval metric? Thank you!
RMSLE is not one of the metrics LightGBM supports by default (see the list of available metrics in the LightGBM parameters documentation).
To use it as a custom metric, you have to define a custom evaluation function:
def rmsle_lgbm(y_pred, data):
    """Custom evaluation function computing RMSLE, for use as ``feval`` in ``lgb.train``.

    The log transform is applied here, so labels and predictions are
    expected on the original (non-log) scale.

    Args:
        y_pred: Predicted values.
        data: Object exposing ``get_label()`` that returns the true labels
            (the dataset being evaluated).

    Returns:
        A ``(name, value, is_higher_better)`` tuple: ``('rmsle', score, False)``.
        The final ``False`` marks the metric as one to minimise.
    """
    y_true = np.asarray(data.get_label())
    log_diff = np.log1p(y_true) - np.log1p(y_pred)
    score = np.sqrt(np.mean(log_diff ** 2))
    return 'rmsle', score, False
Then redefine your parameter dictionary like this:
params = {
    # ... other parameters unchanged ...
    'objective': 'regression',
    # 'custom' means no built-in metric is registered; the evaluation metric
    # then comes from the function passed as feval.
    'metric': 'custom',
    # ... other parameters unchanged ...
}
and then, for training, pass the custom function to lgb.train through the feval argument:
# Same lgb.train call as in train_lightgbm, with the custom evaluation
# function added so that early stopping has a metric to monitor.
model = lgb.train(
params=params,
train_set=d_train,
num_boost_round=num_boost_round,
valid_sets=d_val,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=verbose_eval,
feval=rmsle_lgbm # <============= the custom rmsle_lgbm evaluation function
)
PS: np.log1p(y) already equals np.log(y + 1), so np.log1p(y + 1) looks like a mistake: it computes np.log(y + 2).