python, scikit-learn, hyperparameters, hyperopt

How to tune hyperparameters over a hyperparameter space using Bayesian Optimization (in Python)?


I am trying to tune the hyperparameters of a random forest regression over a hyperparameter space using Bayesian optimization (hyperopt) with the code below, but I get an error that says

TypeError: __init__() got an unexpected keyword argument 'min_samples'

I got this error when I tried the following code:

# Import packages
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

# Create datasets
reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
x_train = reg_prob[0][0:50]
y_train = reg_prob[1][0:50]
x_test = reg_prob[0][50:100]
y_test = reg_prob[1][50:100]


#Create Hyperparameter space
space= {'n_estimators':hp.choice('n_estimators', range(2, 150, 1)),
        'min_samples':hp.choice('min_samples', range(2, 100, 1)),
        'max_features':hp.choice('max_features', range(2, 100, 1)),
        'max_samples':hp.choice('max_samples', range(2, 100, 1)),
       }


#Define Objective Function
def objective(space):
    
    rf = RandomForestRegressor(**space)

    
    # fit Training model
    rf.fit(x_train, y_train)
    
    # Making predictions and find RMSE
    y_pred = rf.predict(x_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    
    
    # Return RMSE
    return rmse


# Run the optimization (TPE surrogate)
trials = Trials()
best = fmin(objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials)

print(best)
print(trials.results)

I have also tried listing the hyperparameters as separate arguments of the objective function, using the code below, but then I get the following error:

TypeError: objective() missing 3 required positional arguments: 'min_samples', 'max_features', and 'max_samples'

#Define Objective Function
def objective(n_estimators,min_samples,max_features,max_samples):
    
    rf = RandomForestRegressor(n_estimators, min_samples, max_features, max_samples)

    
    # fit Training model
    rf.fit(x_train, y_train)
    
    # Making predictions and find RMSE
    y_pred = rf.predict(x_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    
    
    # Return RMSE
    return rmse

Can you please advise on what I can do to fix my code?

I was able to tune a single hyperparameter using the code below:

# Import packages
import numpy as np
import time
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from collections import OrderedDict

reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
x_train = reg_prob[0][0:50]
y_train = reg_prob[1][0:50]
x_test = reg_prob[0][50:100]
y_test = reg_prob[1][50:100]

space= hp.choice('num_leaves', range(2, 100, 1))


def objective(num_leaves):
    
    rf = RandomForestRegressor(num_leaves)
    

    rf.fit(x_train, y_train)
    

    y_pred = rf.predict(x_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    

    
    # Return RMSE
    return rmse

trials = Trials()
best = fmin(objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials)

print(best)
print(trials.results)

Solution

  • The problem is that there is no parameter called min_samples in RandomForestRegressor; see the scikit-learn documentation. You probably meant min_samples_leaf.

    Just keep the upper bound of min_samples_leaf within the number of samples in your training set (50 here). Likewise, max_samples must not exceed the number of training samples, and max_features must not exceed the number of features (10 here), otherwise scikit-learn raises an error.

    Your second attempt fails for a different reason: fmin calls the objective with a single argument, the sampled point from the space, so the objective should accept one dict and unpack it (as with **space below) rather than take separate positional arguments. Otherwise there is no problem with your code.
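
    If you want to double-check which constructor arguments an estimator actually accepts, get_params() lists them. This is just a quick sanity check, not part of the fix itself:

    # List the parameter names RandomForestRegressor actually accepts
    from sklearn.ensemble import RandomForestRegressor
    print(sorted(RandomForestRegressor().get_params().keys()))
    # 'min_samples' is absent; 'min_samples_leaf' and 'min_samples_split' exist

    With min_samples_leaf in place and the bounds tightened, the full script runs: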

    # Import packages
    import numpy as np
    from sklearn.metrics import mean_squared_error
    from sklearn import datasets
    from sklearn.ensemble import RandomForestRegressor
    from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
    
    # Create datasets
    reg_prob = datasets.make_friedman1(n_samples=100, n_features=10, noise=1.0, random_state=None)
    x_train = reg_prob[0][0:50]
    y_train = reg_prob[1][0:50]
    x_test = reg_prob[0][50:100]
    y_test = reg_prob[1][50:100]
    
    
    #Create Hyperparameter space
    space= {'n_estimators':hp.choice('n_estimators', range(2, 150, 1)),
            'min_samples_leaf':hp.choice('min_samples_leaf', range(2, 50, 1)),
            'max_features':hp.choice('max_features', range(2, 10, 1)),
            'max_samples':hp.choice('max_samples', range(2, 50, 1)),
           }
    
    
    #Define Objective Function
    def objective(space):
        
        rf = RandomForestRegressor(**space)
    
        
        # fit Training model
        rf.fit(x_train, y_train)
        
        # Making predictions and find RMSE
        y_pred = rf.predict(x_test)
        mse = mean_squared_error(y_test,y_pred)
        rmse = np.sqrt(mse)
        
        
        # Return RMSE
        return rmse
    
    
    # Run the optimization (TPE surrogate)
    trials = Trials()
    best = fmin(objective,
        space=space,
        algo=tpe.suggest,
        max_evals=2,  # kept small for a quick demo; increase for a real search
        trials=trials)
    
    print(best)
    print(trials.results)
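
    One more note: because the space is built from hp.choice, best contains indices into each choice list rather than the parameter values themselves. hyperopt's space_eval maps them back; a small usage sketch:

    from hyperopt import space_eval

    # Convert the index-based result of fmin back into actual parameter values
    print(space_eval(space, best))

    Also, if you want trials.results to carry per-trial metadata, the objective can return a dict such as {'loss': rmse, 'status': STATUS_OK} instead of a bare float, which is what the STATUS_OK import is for.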