I'm using hyperopt to optimize the hyperparameters of a LightGBM model. The code I use is shown below. I'm trying to log the hyperparameters of each evaluation by calling mlflow.log_params() inside the objective function.
from sklearn.metrics import f1_score
import lightgbm as lgbm
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
from hyperopt.pyll.base import scope
import mlflow
def _int_grid_choice(label, start, stop, step):
    """Return an ``hp.choice`` over ``np.arange(start, stop, step, dtype=int)``."""
    return hp.choice(label, np.arange(start, stop, step, dtype=int))


# Search space handed to hyperopt; every label equals the LightGBM
# keyword argument it feeds, so a sampled dict can be splatted directly
# into ``LGBMClassifier(**params)``.
lgbm_space = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    "n_estimators": _int_grid_choice("n_estimators", 400, 1000, 50),
    "learning_rate": hp.quniform("learning_rate", 0.02, 0.5, 0.02),
    "max_depth": scope.int(hp.quniform("max_depth", 2, 16, 1)),
    "num_leaves": _int_grid_choice("num_leaves", 10, 80, 5),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.7, 1.0),
    "subsample": hp.uniform("subsample", 0.7, 1.0),
    "min_child_samples": _int_grid_choice("min_child_samples", 10, 50, 5),
}

search_space = lgbm_space
run_name = "run_optimization"  # name given to the MLflow run
max_eval = 100  # passed to fmin as max_evals
#define objective function
def objective(search_space):
    """Fit one LightGBM classifier on the sampled parameters and score it.

    Args:
        search_space: dict of sampled hyperparameters, forwarded as keyword
            arguments to ``lgbm.LGBMClassifier``.

    Returns:
        A hyperopt result dict with ``loss`` set to ``1 - f1`` plus the
        fitted model and the parameters that produced it.
    """
    classifier = lgbm.LGBMClassifier(
        **search_space,
        class_weight='balanced',
        n_jobs=-1,
        random_state=123,
    )
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    # Threshold column 1 of predict_proba at 0.5 to obtain 0/1 labels.
    proba_col1 = classifier.predict_proba(X_val)[:, 1]
    predicted_labels = (proba_col1 > 0.5).astype(int)
    f1 = f1_score(y_val, predicted_labels)

    # Both calls write to whichever MLflow run is active when this runs.
    mlflow.log_metric('f1 score', f1)
    mlflow.log_params(search_space)

    return {
        'loss': 1 - f1,
        'status': STATUS_OK,
        'model': classifier,
        'params': search_space,
    }
# Despite the name, this is a plain (non-Spark) hyperopt Trials store.
spark_trials = Trials()

# Run the whole search inside a single named MLflow run.
with mlflow.start_run(run_name=run_name):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_eval,
        trials=spark_trials,
    )
I get error messages like the one below:
INVALID_PARAMETER_VALUE: Parameter with key colsample_bytree was already logged with a value of 0.9523828639856076. The attempted new value was 0.7640043300157543
I'm not sure what I did wrong.
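The error occurs because every evaluation logs into the same MLflow run, and a parameter that has already been logged in a run cannot be logged again with a different value. To fix it, I added with mlflow.start_run(nested=True):
within the objective function, so that each evaluation logs to its own nested run. There was also an issue raised for this here. Now the code creates a separate folder for each evaluation, containing that evaluation's params and metric.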
import numpy as np
from sklearn.metrics import f1_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
from hyperopt.pyll.base import scope
import mlflow
def _int_grid_choice(label, start, stop, step):
    """Return an ``hp.choice`` over ``np.arange(start, stop, step, dtype=int)``."""
    return hp.choice(label, np.arange(start, stop, step, dtype=int))


# Small example dataset, split into train/validation with the class
# proportions preserved in both parts.
iris = load_iris()
X_train, X_val, y_train, y_val = train_test_split(
    iris.data,
    iris.target,
    stratify=iris.target,
)

# Search space handed to hyperopt; every label equals the LightGBM
# keyword argument it feeds.
lgbm_space = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
    "n_estimators": _int_grid_choice("n_estimators", 400, 1000, 50),
    "learning_rate": hp.quniform("learning_rate", 0.02, 0.5, 0.02),
    "max_depth": scope.int(hp.quniform("max_depth", 2, 16, 1)),
    "num_leaves": _int_grid_choice("num_leaves", 10, 80, 5),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.7, 1.0),
    "subsample": hp.uniform("subsample", 0.7, 1.0),
    "min_child_samples": _int_grid_choice("min_child_samples", 10, 50, 5),
}

search_space = lgbm_space
run_name = "run_optimization"  # name given to the parent MLflow run
max_eval = 2  # passed to fmin as max_evals
#define objective function
def objective(search_space):
    """Fit one LightGBM classifier and log the trial to its own MLflow run.

    Each call opens a nested MLflow run, so the parameters of different
    trials never collide inside a single run.

    Args:
        search_space: dict of sampled hyperparameters, forwarded as keyword
            arguments to ``lgbm.LGBMClassifier``.

    Returns:
        A hyperopt result dict: ``loss`` is ``1 - weighted F1`` on the
        validation set, ``status`` is ``STATUS_OK`` and ``model`` is the
        fitted classifier.
    """
    model = lgbm.LGBMClassifier(
        **search_space,
        class_weight='balanced',
        n_jobs=-1,
        random_state=123,
    )
    # Callbacks take the place of the ``early_stopping_rounds`` / ``verbose``
    # arguments to ``fit``. ``verbose`` is a boolean flag, so pass False
    # explicitly to silence the early-stopping messages.
    callbacks = [
        lgbm.early_stopping(2, verbose=False),
        lgbm.log_evaluation(period=0),
    ]

    with mlflow.start_run(nested=True):
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            callbacks=callbacks,
        )
        # Use the predicted class labels directly: thresholding a single
        # probability column only works for binary targets and can never
        # predict the remaining classes of a multiclass problem.
        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='weighted')
        mlflow.log_metric('f1 score', f1)
        mlflow.log_params(search_space)

    # hyperopt minimises the loss, so turn the score into a loss.
    score = 1 - f1
    return {'loss': score, 'status': STATUS_OK, 'model': model}
# Despite the name, this is a plain (non-Spark) hyperopt Trials store.
spark_trials = Trials()

# Parent run for the search; every objective call opens its own nested
# run inside it.
with mlflow.start_run(run_name=run_name, nested=True):
    best_params = hyperopt.fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_eval,
        trials=spark_trials,
    )

# fmin reports hp.choice parameters as indices into their option lists;
# space_eval maps the result back to the actual hyperparameter values.
best_values = space_eval(search_space, best_params)
print("Best value found: ", best_values)