python, machine-learning, scikit-learn, mlxtend

SequentialFeatureSelector ValueError: continuous format is not supported


I am new to machine learning and am trying to understand the SequentialFeatureSelector concept from mlxtend. I am using Anaconda and a Jupyter notebook for a proof of concept. I have imported

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Since mlxtend is not part of Anaconda by default, I installed it with pip install mlxtend.

I am using the sklearn Boston housing dataset for this proof of concept and wrote the code below. When fitting the selector, I get the error shown further down.

How can I fix this error?

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline

data = load_boston()
print(data.keys())
X = pd.DataFrame(data.data)
X.columns = data.feature_names
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sfs1 = sfs(RandomForestRegressor(n_jobs=1),
           k_features=7,
           forward=True,
           floating=False,
           verbose=3,
           scoring='roc_auc',
           cv=3)
sfs1 = sfs1.fit(X_train, y_train)

Error

ValueError                                Traceback (most recent call last)
<ipython-input-77-96b29660189d> in <module>
      1 #sfs1.fit(X_train,y_train)
      2 X_train.shape
----> 3 sfs2=sfs1.fit(X_train,y_train)

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in fit(self, X, y, custom_feature_names, **fit_params)
    371                         X=X_,
    372                         y=y,
--> 373                         **fit_params
    374                     )
    375                 else:

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y, ignore_feature, **fit_params)
    528                              tuple(subset | {feature}),
    529                              **fit_params)
--> 530                             for feature in remaining
    531                             if feature != ignore_feature)
    532 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _calc_score(selector, X, y, indices, **fit_params)
     32                                  n_jobs=1,
     33                                  pre_dispatch=selector.pre_dispatch,
---> 34                                  fit_params=fit_params)
     35     else:
     36         selector.est_.fit(X[:, indices], y, **fit_params)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
    400                                 fit_params=fit_params,
    401                                 pre_dispatch=pre_dispatch,
--> 402                                 error_score=error_score)
    403     return cv_results['test_score']
    404 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    238             return_times=True, return_estimator=return_estimator,
    239             error_score=error_score)
--> 240         for train, test in cv.split(X, y, groups))
    241 
    242     zipped_scores = list(zip(*scores))

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    915             # remaining jobs.
    916             self._iterating = False
--> 917             if self.dispatch_one_batch(iterator):
    918                 self._iterating = self._original_iterator is not None
    919 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    566         fit_time = time.time() - start_time
    567         # _score will return dict if is_multimetric is True
--> 568         test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
    569         score_time = time.time() - start_time - fit_time
    570         if return_train_score:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
    603     """
    604     if is_multimetric:
--> 605         return _multimetric_score(estimator, X_test, y_test, scorer)
    606     else:
    607         if y_test is None:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
    633             score = scorer(estimator, X_test)
    634         else:
--> 635             score = scorer(estimator, X_test, y_test)
    636 
    637         if hasattr(score, 'item'):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
    174         y_type = type_of_target(y)
    175         if y_type not in ("binary", "multilabel-indicator"):
--> 176             raise ValueError("{0} format is not supported".format(y_type))
    177 
    178         if is_regressor(clf):

ValueError: continuous format is not supported

Solution

  • Looking closely at the traceback, you will see that the error is not raised by mlxtend; it is raised by the scorer.py module of scikit-learn. The roc_auc_score you are using is suitable for classification problems only, and for a regression problem such as yours it is meaningless.

    From the docs (emphasis added):

    sklearn.metrics.roc_auc_score(y_true, y_score, average='macro', sample_weight=None, max_fpr=None)

    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.

    Note: this implementation is restricted to the binary classification task or multilabel classification task in label indicator format.

    See also the scikit-learn list of metrics per kind of problem, where you can confirm that roc_auc is not suitable for regression.

    So, change it in your sfs definition to something like

    scoring='neg_mean_squared_error'

    as in the docs example of SequentialFeatureSelector, or any other metric suitable for regression, and you will be fine; a corrected definition is sketched below.
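
    For reference, here is a minimal sketch of the corrected call. It assumes the same imports, data, and train/test split as in the question's snippet; only the scoring argument changes.

    from mlxtend.feature_selection import SequentialFeatureSelector as sfs
    from sklearn.ensemble import RandomForestRegressor

    # Same selector as in the question, but scored with a regression metric
    # ('neg_mean_squared_error') instead of the classification-only 'roc_auc'.
    sfs1 = sfs(RandomForestRegressor(n_jobs=1),
               k_features=7,
               forward=True,
               floating=False,
               verbose=3,
               scoring='neg_mean_squared_error',
               cv=3)

    sfs1 = sfs1.fit(X_train, y_train)   # X_train, y_train from the question's train_test_split
    print(sfs1.k_feature_idx_)          # indices of the 7 selected features
    print(sfs1.k_score_)                # cross-validated score of that feature subset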