Tags: python, machine-learning, scikit-learn, sklearn-pandas, rfe

How do I resolve the "RFECV object has no support_ attribute" AttributeError?


I am trying to put an sklearn RFECV object in a pipeline, cross-validate it, and report the model performance together with the selected features and their rankings.

However, I get the "RFECV object has no support_ attribute" error most likely because I am not fitting it to the data. I need some help in identifying where to fit the data and how to make sure there is no data leakage to the test data set.

The original dataset is time-series data, so I split it using TimeSeriesSplit.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE, RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn import metrics
from sklearn.metrics import balanced_accuracy_score, make_scorer

# Synthetic 10-feature classification data used to reproduce the problem.
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)

# create pipeline: RFECV feature selection (step 's') followed by the final
# classifier (step 'm')
# NOTE(review): Pipeline is not imported in this snippet -- presumably
# `from sklearn.pipeline import Pipeline` is intended; confirm.
rfecv_model = RFECV(estimator=DecisionTreeClassifier())
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('s',rfecv_model),('m',model)])

# make balanced scorer
scorer = make_scorer(balanced_accuracy_score)

# evaluate model: score the whole pipeline on 3 time-series splits
cv = TimeSeriesSplit(n_splits=3)
n_scores = cross_val_score(pipeline, X, y, scoring=scorer, cv=cv)
# report performance
# NOTE(review): mean and std are not imported in this snippet -- presumably
# numpy's mean/std are intended; confirm.
print('Balanced_Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# NOTE: rfecv_model.fit() is never called directly in this script, and only
# the scores come back from cross_val_score, so reading support_ / ranking_
# on rfecv_model here is the access that raises the AttributeError the
# question is about (presumably the cross-validation fits copies -- confirm).
for i in range(X.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfecv_model.support_[i], rfecv_model.ranking_[i]))

This code is derived from the RFE tutorial here


Solution

  • I would recommend using cross_validate with return_estimator=True when you need the fitted models from each cross-validation split.

    from sklearn import set_config
    
    set_config(print_changed_only=True)
    
    
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE, RFECV
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import TimeSeriesSplit, cross_validate
    from sklearn import metrics
    from sklearn.metrics import balanced_accuracy_score, make_scorer
    from sklearn.pipeline import Pipeline
    
    # Synthetic data: 1000 samples, 10 features (5 informative, 5 redundant).
    X, y = make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=5,
        n_redundant=5,
        random_state=1,
    )

    # Outer evaluation: 3 time-series splits scored with balanced accuracy.
    cv = TimeSeriesSplit(n_splits=3)
    scorer = make_scorer(balanced_accuracy_score)

    # Pipeline: RFECV feature selection ('s') feeding the final classifier ('m').
    # NOTE: RFECV is given no cv argument here, so only the outer evaluation
    # uses the TimeSeriesSplit defined above.
    rfecv_model = RFECV(estimator=DecisionTreeClassifier())
    model = DecisionTreeClassifier()
    pipeline = Pipeline(steps=[('s', rfecv_model), ('m', model)])

    # return_estimator=True keeps the pipeline fitted on each split, available
    # afterwards under result['estimator'].
    result = cross_validate(
        pipeline,
        X,
        y,
        scoring=scorer,
        cv=cv,
        return_estimator=True,
    )
    

    result

    {'fit_time': array([0.07009673, 0.09101987, 0.11680794]),
     'score_time': array([0.00072193, 0.00065613, 0.00060487]),
     'estimator': (Pipeline(steps=[('s', RFECV(estimator=DecisionTreeClassifier())),
                      ('m', DecisionTreeClassifier())]),
      Pipeline(steps=[('s', RFECV(estimator=DecisionTreeClassifier())),
                      ('m', DecisionTreeClassifier())]),
      Pipeline(steps=[('s', RFECV(estimator=DecisionTreeClassifier())),
                      ('m', DecisionTreeClassifier())])),
     'test_score': array([0.812     , 0.83170092, 0.8510502 ])}
    

    Now let's go through the fitted feature selector from each CV iteration.

    # Each entry of result['estimator'] is the pipeline fitted on one CV split;
    # its 's' step is the RFECV selector fitted on that split.
    # The loop variable is named fold_no rather than iter so the builtin
    # iter() is not shadowed at module scope.
    for fold_no, pipe in enumerate(result['estimator']):
        print(f'Iteration no: {fold_no}')
        selector = pipe['s']
        # Walk the per-feature selection mask and ranking together instead of
        # indexing both arrays by column number.
        per_feature = zip(selector.support_, selector.ranking_)
        for col, (selected, rank) in enumerate(per_feature):
            print('Column: %d, Selected %s, Rank: %d' % (col, selected, rank))
    
    # output
    Iteration no: 0
    Column: 0, Selected False, Rank: 4
    Column: 1, Selected True, Rank: 1
    Column: 2, Selected True, Rank: 1
    Column: 3, Selected True, Rank: 1
    Column: 4, Selected False, Rank: 3
    Column: 5, Selected False, Rank: 5
    Column: 6, Selected True, Rank: 1
    Column: 7, Selected True, Rank: 1
    Column: 8, Selected True, Rank: 1
    Column: 9, Selected False, Rank: 2
    Iteration no: 1
    Column: 0, Selected False, Rank: 2
    Column: 1, Selected False, Rank: 4
    Column: 2, Selected True, Rank: 1
    Column: 3, Selected True, Rank: 1
    Column: 4, Selected True, Rank: 1
    Column: 5, Selected False, Rank: 6
    Column: 6, Selected True, Rank: 1
    Column: 7, Selected False, Rank: 5
    Column: 8, Selected True, Rank: 1
    Column: 9, Selected False, Rank: 3
    Iteration no: 2
    Column: 0, Selected True, Rank: 1
    Column: 1, Selected False, Rank: 4
    Column: 2, Selected True, Rank: 1
    Column: 3, Selected True, Rank: 1
    Column: 4, Selected True, Rank: 1
    Column: 5, Selected False, Rank: 3
    Column: 6, Selected False, Rank: 2
    Column: 7, Selected False, Rank: 5
    Column: 8, Selected True, Rank: 1
    Column: 9, Selected True, Rank: 1