I am using imbalanced dataset(54:38:7%) with RFECV for feature selection like this:
# making a multi logloss metric
from sklearn.metrics import log_loss, make_scorer
log_loss_rfe = make_scorer(score_func=log_loss, greater_is_better=False)
# initiating Light GBM classifier
lgb_rfe = LGBMClassifier(objective='multiclass', learning_rate=0.01, verbose=0, force_col_wise=True,
random_state=100, n_estimators=5_000, n_jobs=7)
# initiating RFECV
rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
# fitting it
rfe.fit(X=X_train, y=y_train)
And I got an error, presumably because the subsamples sklearn's RFECV has made doesn't have all of the classes from my data. I had no issues fitting the very same data outside of RFECV.
Here's the complete error:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
r = call_item()
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 37, in _rfe_single_fit
return rfe._fit(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 259, in _fit
self.scores_.append(step_score(estimator, features))
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 39, in <lambda>
lambda estimator, features: _score(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
scores = scorer(estimator, X_test, y_test)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
return self._score(partial(_cached_call, None), estimator, X, y_true,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
return self._sign * self._score_func(y_true, y_pred,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 2265, in log_loss
raise ValueError("y_true and y_pred contain different number of "
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-9-5feb62a6f457> in <module>
1 rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
----> 2 rfe.fit(X=X_train, y=y_train)
~/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in fit(self, X, y, groups)
603 func = delayed(_rfe_single_fit)
604
--> 605 scores = parallel(
606 func(rfe, self.estimator, X, y, train, test, scorer)
607 for train, test in cv.split(X, y, groups))
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
1 frames
/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
How to fix this to be able to select features recursively?
Log-loss needs the probability predictions, not the class predictions, so you should add
log_loss_rfe = make_scorer(score_func=log_loss, needs_proba=True, greater_is_better=False)
The error is because without that, the passed y_pred
is one-dimensional (classes 0,1,2) and sklearn
assumes it's a binary classification problem and those predictions are probability of the positive class. To deal with that, it adds on the probability of the negative class, but then there are only two columns compared to your three classes.