python-3.xmachine-learningscikit-learngridsearchcvscikit-optimize

TypeError: __init__() got an unexpected keyword argument 'iid' while running GridSearchCV for time series data


I have a data Gemini_ETHUSD_d.csv which you can download from this link

I try to re-run the code below from this link:

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns 

import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 5
import pandas as pd
# from pandas_datareader import data as web
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv('./Gemini_ETHUSD_d.csv', skiprows=1, parse_dates=True, index_col='Date',dtype=str)
df = df.sort_index().drop('Symbol', axis=1)
df.head()

def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object."""

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=None)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
        
    # Plot the data classes and groups at the end
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=cmap_data)

    # Formatting
    yticklabels = list(range(n_splits)) + ['class']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+1.2, -.1], xlim=[0, 100])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.5 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]
            
            
STEPS = 9
for i in np.arange(1, STEPS):
    col_name = '{}d_Fwd_Close'.format(i)
    df[col_name] = df['Close'].shift(-i)
    
df = df.dropna()

Features = 6

X = df.iloc[:, :Features]
y = df.iloc[:, Features:]

split = int(len(df) * 0.7)

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]
X.head()


def build_model(_alpha, _l1_ratio):
    estimator = ElasticNet(
        alpha=_alpha,
        l1_ratio=_l1_ratio,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        max_iter=16,
        copy_X=True,
        tol=0.1,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='random'
    )

    return MultiOutputRegressor(estimator, n_jobs=4)
sklearn.metrics.SCORERS.keys()


model = build_model(_alpha=1.0, _l1_ratio=0.3)
tscv = TimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"\nR2: {R2.mean()} (+/- {R2.std()}")

# Blocking time series splitter

btscv = BlockingTimeSeriesSplit(n_splits=5)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"\nR2: {R2.mean()} (+/- {R2.std()}")


def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, best_params):
    
    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1,1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))

    ax.set_title(f"Grid Search Best Params: {best_params}", fontsize=12, fontweight='medium')
    ax.set_xlabel(name_param_1, fontsize=12)
    ax.set_ylabel('CV Average Score', fontsize=12)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')
    ax.legend(bbox_to_anchor=(1.02, 1.02))
# Time series splitter

model.get_params().keys()

params = {
    'estimator__alpha':(0.1, 0.3, 0.5, 0.7, 0.9),
    'estimator__l1_ratio':(0.1, 0.3, 0.5, 0.7, 0.9)
}

scores = []
for i in range(30):
    model = build_model(_alpha=1.0, _l1_ratio=0.3)

    finder = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='r2',
        n_jobs=4,
        iid=False,
        refit=True,
        cv=tscv,  # change this to the splitter subject to test
        verbose=1,
        pre_dispatch=8,
        error_score=-999,
        return_train_score=True
        )

    finder.fit(X_train, y_train)

    best_params = finder.best_params_
    best_score = round(finder.best_score_,4)
    scores.append(best_score)

But it raises a type error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-43-df2fef72cef2> in <module>
    168     model = build_model(_alpha=1.0, _l1_ratio=0.3)
    169 
--> 170     finder = GridSearchCV(
    171         estimator=model,
    172         param_grid=params,

~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

TypeError: __init__() got an unexpected keyword argument 'iid'

​ Based on solution from here, I use command pip install --upgrade scikit-optimize==0.23.3 but it raises another issue:

ERROR: Could not find a version that satisfies the requirement scikit-optimize==0.23.3
ERROR: No matching distribution found for scikit-optimize==0.23.3

Someone could help me to fix this issue? Thanks a lot.


Solution

  • 1. Reason

    scikit-optimize 0.8.1 has parameter iid which is not accepted by scikit-learn 0.24.2

    2. Solution

    Downgrade scikit-learn version to 0.22.2 and scikit-optimize to 0.8.1 by:

    pip install ​​​scikit-optimize==0.8.1
    pip install scikit-learn==0.22.2