python mlflow lightgbm hyperopt

Cannot log LightGBM parameters using log_params in mlflow/hyperopt


I'm using hyperopt to optimize the hyperparameters of a LightGBM model. The code I use is shown below. I'm trying to log the hyperparameters of each trial using mlflow.log_params() in the objective function.

from sklearn.metrics import f1_score
import lightgbm as lgbm
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
from hyperopt.pyll.base import scope 
import mlflow


# Hyperopt search space for the LightGBM classifier. Each key is an
# LGBMClassifier keyword argument: the dict sampled from this space is
# unpacked into the constructor by objective().
# NOTE(review): `np` is used here but numpy is not imported in the lines shown.
lgbm_space = {
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
        # one of 400, 450, ..., 950
        'n_estimators': hp.choice('n_estimators', np.arange(400, 1000, 50, dtype=int)),
        # 0.02 .. 0.5, quantized to multiples of 0.02
        'learning_rate' : hp.quniform('learning_rate', 0.02, 0.5, 0.02),
        # quniform yields floats; scope.int casts the sampled value to int
        'max_depth': scope.int(hp.quniform('max_depth', 2, 16, 1)),
        # one of 10, 15, ..., 75
        'num_leaves': hp.choice("num_leaves", np.arange(10, 80, 5, dtype=int)),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
        'subsample': hp.uniform('subsample', 0.7, 1.0),
        # one of 10, 15, ..., 45
        'min_child_samples': hp.choice('min_child_samples', np.arange(10, 50, 5, dtype=int))

}

search_space = lgbm_space         # space handed to fmin
run_name = "run_optimization"     # name of the MLflow run wrapping the search
max_eval = 100                    # number of trials fmin will evaluate

# Objective function passed to hyperopt.fmin as `fn`.
def objective (search_space):
    """Fit a LightGBM classifier with the given hyperparameters and score it.

    Args:
        search_space: dict of hyperparameter values for this trial; it is
            unpacked directly into ``lgbm.LGBMClassifier``.

    Returns:
        dict with ``loss`` (``1 - f1``), ``status`` (``STATUS_OK``), the
        fitted ``model`` and the ``params`` that were used.

    Relies on ``X_train``, ``y_train``, ``X_val`` and ``y_val`` existing at
    module level; they are not defined in the lines shown.
    """
    model = lgbm.LGBMClassifier( **search_space, class_weight='balanced', n_jobs=-1, random_state=123 )
    # The validation set is passed as eval_set and drives early stopping
    # (10 rounds).
    model.fit(X_train, y_train,
           eval_set= [ ( X_val, y_val) ],
           early_stopping_rounds= 10,
           verbose=False)
    # Second column of predict_proba, thresholded at 0.5 below to obtain 0/1
    # labels -- assumes a binary target (TODO confirm).
    y_pred = model.predict_proba(X_val)[:,1]
    f1 = f1_score(y_val, (y_pred>0.5).astype(int) )
    mlflow.log_metric('f1 score', f1)
    # Logged to the currently active MLflow run. All trials execute inside the
    # single run opened around fmin, so from the second trial on the same keys
    # are logged again with different values -- this is the call that triggers
    # the INVALID_PARAMETER_VALUE error reported for this code.
    mlflow.log_params(search_space)
    # hyperopt minimizes the loss, so use 1 - f1 to maximize f1.
    score = 1 - f1

    return {'loss': score, 'status': STATUS_OK, 'model': model, 'params': search_space}

# Plain hyperopt Trials object; despite the variable name, SparkTrials is
# imported but not used here.
spark_trials = Trials()
# A single MLflow run wraps the whole search, so every call to objective()
# logs its metric and params into this same run.
with mlflow.start_run(run_name = run_name):
    best_params = hyperopt.fmin(
                    fn = objective,
                    space = search_space,
                    algo = tpe.suggest,
                    max_evals = max_eval,
                    trials = spark_trials )

I get error messages like the one below:

INVALID_PARAMETER_VALUE: Parameter with key colsample_bytree was already logged with a value of 0.9523828639856076. The attempted new value was 0.7640043300157543

I'm not sure what I did wrong.


Solution

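  • Wrap the body of the objective function in with mlflow.start_run(nested=True):. In the original code every trial logs into the single run opened around fmin, and MLflow does not allow a parameter that has already been logged in a run to be logged again with a different value, hence the INVALID_PARAMETER_VALUE error. With a nested run per trial, each evaluation gets its own child run. There was also an issue raised for this here. Now the code creates separate folders for each evaluation containing the params and metric.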

    import numpy as np
    from sklearn.metrics import f1_score
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    import lightgbm as lgbm
    import hyperopt
    from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
    from hyperopt.pyll.base import scope 
    import mlflow
    
    # Small built-in dataset so the example is self-contained; the split is
    # stratified on the target so both parts keep the class proportions.
    iris = load_iris()
    X_train, X_val, y_train, y_val = train_test_split(
        iris.data,
        iris.target,
        stratify=iris.target,
    )

    # Hyperopt search space. Every key is an LGBMClassifier keyword argument:
    # the dict sampled for a trial is unpacked into the constructor.
    lgbm_space = {
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
        # one of 400, 450, ..., 950
        'n_estimators': hp.choice(
            'n_estimators', np.arange(400, 1000, 50, dtype=int)
        ),
        # 0.02 .. 0.5, quantized to multiples of 0.02
        'learning_rate': hp.quniform('learning_rate', 0.02, 0.5, 0.02),
        # quniform yields floats; scope.int casts the sampled value to int
        'max_depth': scope.int(hp.quniform('max_depth', 2, 16, 1)),
        # one of 10, 15, ..., 75
        'num_leaves': hp.choice(
            "num_leaves", np.arange(10, 80, 5, dtype=int)
        ),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
        'subsample': hp.uniform('subsample', 0.7, 1.0),
        # one of 10, 15, ..., 45
        'min_child_samples': hp.choice(
            'min_child_samples', np.arange(10, 50, 5, dtype=int)
        ),
    }

    # Settings for the optimization run.
    search_space = lgbm_space        # space handed to fmin
    run_name = "run_optimization"    # name of the parent MLflow run
    max_eval = 2                     # number of trials to evaluate
    
    # Objective function minimized by hyperopt (passed to fmin as `fn`).
    def objective(search_space):
        """Train and score one LightGBM model inside its own nested MLflow run.

        Args:
            search_space: dict of hyperparameter values for this trial; it is
                unpacked directly into ``lgbm.LGBMClassifier``.

        Returns:
            dict with ``loss`` (``1 - f1``, so that minimizing the loss
            maximizes the weighted F1 score), ``status`` (``STATUS_OK``) and
            the fitted ``model``.
        """
        model = lgbm.LGBMClassifier(
            **search_space, class_weight='balanced', n_jobs=-1, random_state=123
        )
        callbacks = [
            # `verbose` is a boolean flag: False silences the early-stopping
            # messages (any non-zero number would be truthy and enable them).
            lgbm.early_stopping(2, verbose=False),
            # period=0 turns off the per-iteration evaluation log.
            lgbm.log_evaluation(period=0),
        ]
        # One nested run per trial: MLflow rejects logging a parameter again
        # with a different value in the same run, so trials must not share one.
        with mlflow.start_run(nested=True):
            # Log the parameters first so they are recorded even if fit fails.
            mlflow.log_params(search_space)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                callbacks=callbacks,
            )
            # predict() returns class labels for binary and multiclass targets
            # alike. Thresholding predict_proba()[:, 1] at 0.5 can only ever
            # produce labels 0/1, which is wrong for a target with more than
            # two classes.
            y_pred = model.predict(X_val)
            f1 = f1_score(y_val, y_pred, average='weighted')
            mlflow.log_metric('f1 score', f1)

        return {'loss': 1 - f1, 'status': STATUS_OK, 'model': model}
    
    # Plain in-memory hyperopt Trials object; despite the variable name,
    # SparkTrials is imported but not used here.
    spark_trials = Trials()
    # Parent run for the whole search; objective() opens a nested child run
    # for each trial inside it.
    # NOTE(review): nested=True on the parent presumably only matters when
    # another run is already active (e.g. in a notebook session) -- confirm.
    with mlflow.start_run(run_name=run_name, nested=True):
        best_params = hyperopt.fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_eval,
            trials=spark_trials,
        )
    # fmin reports hp.choice parameters as indices into their option lists;
    # space_eval maps the result back to the actual hyperparameter values.
    best_config = hyperopt.space_eval(search_space, best_params)
    print("Best value found: ", best_config)