python-3.x scikit-learn pipeline rfe

Get support and ranking attributes for RFE using Pipeline in Python 3


The code I have so far is below and it works perfectly. However, for each number of features tested I would like to print the RFE attributes "rfe.support_[i]" and "rfe.ranking_[i]", together with the names of the selected features: "i" refers to the column index, the first attribute tells whether a column was selected (True or False), and the second one returns its ranking.

In other words, I would like to print the actual columns considered in each RFE, so they do not remain abstract.

# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Get the dataset
def get_dataset(df, target):
    X, y = df.drop(columns = target), df[[target]].values.flatten()
    return X, y

# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
    num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
    cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
                                        ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
                                                     ('cat', cat_transformer, list_cat_cols)])
    models = dict()    
    for i in range(2, 4):
        rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
                                                                     ('s_dtr', rfe_dtr), 
                                                                     ('m_dtr', model_dtr)])
    return models

# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
    scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv, 
                             n_jobs = -1, error_score = 'raise')
    return scores


# Define the dataset
X, y = get_dataset(my_df, 'my_target')   # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include = 'number').columns.tolist(), 
                    X.select_dtypes(include = 'object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

The following lines return errors:

models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'
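
The error occurs because cross_val_score fits clones of the pipeline on each fold; the pipeline objects stored in "models" are never fitted themselves, so their RFE step has no fitted attributes. As a quick illustration of this (not the eventual fix below), fitting one pipeline directly on the data makes the attributes available:

# Illustration only: cross_val_score fits clones of the pipeline, so the
# originals in `models` stay unfitted; fitting one directly populates the
# RFE attributes on this object
pipe = models['DecisionTreeRegressor_2']
pipe.fit(X, y)
print(pipe.named_steps['s_dtr'].support_)   # boolean mask of selected columns
print(pipe.named_steps['s_dtr'].ranking_)   # ranking per column (1 = selected)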

Solution

  • I found the answer myself; I'm posting it in case it helps someone. The trick is to use "cross_validate" instead of "cross_val_score", with the option "return_estimator = True", so that the pipelines fitted on the different folds are returned and can be accessed by index. "named_steps" then gives access to the fitted RFE step.

    # Explore the number of selected features for RFE
    from numpy import mean
    from numpy import std
    from sklearn.model_selection import RepeatedKFold, cross_validate
    from sklearn.feature_selection import RFE
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from category_encoders import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    
    # Get the dataset
    def get_dataset(df, target):
        X, y = df.drop(columns = target), df[[target]].values.flatten()
        return X, y
    
    # Get a list of models to evaluate
    def get_models(list_num_cols, list_cat_cols):
        num_transformer = Pipeline(steps = [('num_imputer', SimpleImputer(strategy = 'median'))])
        cat_transformer = Pipeline(steps = [('cat_imputer', SimpleImputer(strategy = 'most_frequent')),
                                            ('one-hot-encoder', OneHotEncoder())])
        preprocessor = ColumnTransformer(transformers = [('num', num_transformer, list_num_cols),
                                                         ('cat', cat_transformer, list_cat_cols)])
        models = dict()    
        for i in range(2, 4):
            rfe_dtr = RFE(estimator = DecisionTreeRegressor(), n_features_to_select = i)
            model_dtr = DecisionTreeRegressor()
            models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps = [('preprocessor', preprocessor),
                                                                         ('s_dtr', rfe_dtr), 
                                                                         ('m_dtr', model_dtr)])
        return models
    
    # Evaluate a given model using cross-validation
    def evaluate_model(model, X, y):
        cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 7)
        output = cross_validate(model, X, y, scoring = 'neg_mean_absolute_error', cv = cv, 
                                 n_jobs = -1, error_score = 'raise', return_estimator = True)
        return output
    
    
    # Define the dataset
    X, y = get_dataset(my_df, 'my_target')   # It begins here
    # Get the models to evaluate
    models = get_models(X.select_dtypes(include = 'number').columns.tolist(), 
                        X.select_dtypes(include = 'object').columns.tolist())
    # Evaluate the models and store results
    results, names = list(), list()
    for name, model in models.items():
        output = evaluate_model(model, X, y)
        results.append(output['test_score'])
        names.append(name)
        print('%s %.3f (%.3f)' % (name, mean(output['test_score']), std(output['test_score'])))
        print(output)                                                    # full cross_validate output, including the fitted estimators
        print(output['estimator'][0].named_steps['s_dtr'].support_)     # boolean selection mask of the RFE fitted on fold 0
        print(output['estimator'][0].named_steps['s_dtr'].ranking_)     # per-column ranking of the RFE fitted on fold 0
        print(output['estimator'][0].named_steps['s_dtr'].support_[2])  # was the third column selected?
        print(output['estimator'][0].named_steps['s_dtr'].ranking_[2])  # ranking of the third column
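
    Note that "return_estimator = True" yields one fitted pipeline per fold, so the selected columns can differ from fold to fold; index output['estimator'] accordingly. To also print the names of the selected features, the boolean support_ mask can be mapped back to column names through the fitted preprocessor. A minimal sketch, reusing the "output" from the last loop iteration and assuming every step of the preprocessor implements get_feature_names_out() (true for recent scikit-learn; check your category_encoders version for its OneHotEncoder):

    import numpy as np

    fitted_pipe = output['estimator'][0]          # pipeline fitted on the first fold
    rfe_step = fitted_pipe.named_steps['s_dtr']
    # Names of the columns leaving the preprocessor, in the order RFE sees them
    feature_names = np.asarray(fitted_pipe.named_steps['preprocessor'].get_feature_names_out())
    print(feature_names[rfe_step.support_])       # names of the columns kept by RFE
    for name, rank in zip(feature_names, rfe_step.ranking_):
        print(name, rank)                         # ranking per column (1 = selected)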