[Link to SampleFile][1] [1]: https://www.dropbox.com/s/vk0ht1bowdhz85n/StackoverFlow_Example.csv?dl=0
Code below is in 2 parts Function and main code that calls function. There are a bunch of print statements along the way to help troubleshoot. I believe the issue has to do with the "mean_feature_importances" variable. This procedure works and does the comparison of binary classifiers with no issues. I have tried to change it to evaluate multi-class classifiers so I compare there performance. It makes sense why it expects only 2 labels because that is what it was for but this model has 5 different labels to choice from. I have changed every single value I think should be changed to accommodate 5 different labels instead of 2. Please advise if I missed something the issue happens on the return after print(19)
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron # linear classifiers
from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics
Here is the function used to process classifier ensemble cross validation
def train_MultiClass_classifier_ensemble_CV(classifiers, X_data, y_data, clf_params=None, cv_splits=10,
random_state=21, return_trained_classifiers=True, verbose=0, prtParam=0):
"""
Trains a list of classifiers on the input training data and returns cross-validated accuracy and f1 scores
as well as feature_importances (where available). The list of trained classifier objects is also returned
upon request.
: param classifiers : List of classifier objects; expects each has a scikit-learn wrapper.
: param X_data : Pandas dataframe containing our training features.
: param y_data : Pandas dataframe containing our training class labels.
: param clf_params : (Optional) List of dictionaries containing parameters for each classifier object
in the list 'classifiers'. If not provided, the already-initialized parameters of
each classifier object will be used.
: param cv_splits : Integer number of cross-validation splits.
: param random_state : Seed for reproducibility between executions.
: param return_trained_classifiers : Boolean; if True, function will also return a list containing thefit classifier objects.
: param verbose : The amount of status text displayed during execution; 0 for less, 1 for more.
: return clf_comparison : A pandas dataframe tabulating the cross-validated performance of each classifier.
: return mean_feature_importances : An array containing the ranked feature importances for each classifier having the feature_importances_ attribute.
: return trained_classifiers : (if return_trained_classifiers=True) A list of trained classifier objects.
"""
# initialization
kfold = StratifiedKFold(n_splits=cv_splits, random_state=random_state)
train_accuracy_mean = []
train_accuracy_std = []
test_accuracy_mean = []
test_accuracy_std = []
f1_score_mean = []
f1_score_std = []
mean_feature_importances = []
trained_classifiers = []
classifier_name = []
if clf_params is None: # construct using classifier's existing parameter assignment
clf_params = []
for clf in classifiers:
#print(clf)
params = clf.get_params()
if 'random_state' in params.keys(): # assign random state / seed
params['random_state'] = random_state
elif 'seed' in params.keys():
params['seed'] = random_state
clf_params.append(params)
# step through the classifiers for training and scoring with cross-validation
for clf, params in zip(classifiers, clf_params):
#print(clf)
#print(params)
# automatically obtain the name of the classifier
name = get_clf_name(clf)
classifier_name.append(name)
if prtParam == 1:
print(clf)
if verbose == 1: # print status
print('\nPerforming Cross-Validation on Classifier %s of %s:'
% (len(classifier_name), len(classifiers)))
print(name)
# perform k-fold cross validation for this classifier and calculate scores for each split
kth_train_accuracy = []
kth_test_accuracy = []
kth_test_f1_score = []
kth_feature_importances = []
for (train, test) in kfold.split(X_data, y_data):
clf.set_params(**params)
print(clf)
print(params)
OneVsOneClassifier(clf.fit(X_data.iloc[train], y_data.iloc[train]))
kth_train_accuracy.append(clf.score(X_data.iloc[train], y_data.iloc[train]))
print('1.1')
kth_test_accuracy.append(clf.score(X_data.iloc[test], y_data.iloc[test]))
print('2.2')
kth_test_f1_score.append(f1_score(y_true=y_data.iloc[test], y_pred=clf.predict(X_data.iloc[test]), average='weighted'))
print('3.3')
if hasattr(clf, 'feature_importances_'): # some classifiers (like linReg) lack this attribute
print(clf.feature_importances_)
kth_feature_importances.append(clf.feature_importances_)
# populate scoring statistics for this classifier (over all cross-validation splits)
train_accuracy_mean.append(np.mean(kth_train_accuracy))
print('4')
train_accuracy_std.append(np.std(kth_train_accuracy))
print('5')
test_accuracy_mean.append(np.mean(kth_test_accuracy))
print('6')
test_accuracy_std.append(np.std(kth_test_accuracy))
print('7')
f1_score_mean.append(np.mean(kth_test_f1_score))
print('8')
print('8-1')
f1_score_std.append(np.std(kth_test_f1_score))
print('9')
print(kth_test_f1_score)
# obtain array of mean feature importances, if this classifier had that attribute
print('9-1')
print(kth_feature_importances)
if len(kth_feature_importances) == 0:
print('10')
print(mean_feature_importances)
mean_feature_importances.append(False)
else:
print('10.1')
mean_feature_importances.append(np.mean(kth_feature_importances, axis=0))
# if requested, also export classifier after fitting on the complete training set
if return_trained_classifiers is not False:
print('12')
clf.fit(X_data, y_data)
print('13')
trained_classifiers.append(clf)
print('14')
# remove AdaBoost feature importances (we won't discuss their interpretation)
if type(clf) == type(AdaBoostClassifier()):
print('15')
mean_feature_importances[-1] = False
print('16')
# construct dataframe for comparison of classifiers
clf_comparison = pd.DataFrame({'Classifier Name' : classifier_name,
'Mean Train Accuracy' : train_accuracy_mean,
'Train Accuracy Standard Deviation' : train_accuracy_std,
'Mean Test Accuracy' : test_accuracy_mean,
'Test Accuracy Standard Deviation' : test_accuracy_std,
'Mean Test F1-Score' : f1_score_mean,
'F1-Score Standard Deviation' : f1_score_std})
print('17')
# enforce the desired column order
clf_comparison = clf_comparison[['Classifier Name', 'Mean Train Accuracy',
'Train Accuracy Standard Deviation', 'Mean Test Accuracy',
'Test Accuracy Standard Deviation', 'Mean Test F1-Score',
'F1-Score Standard Deviation']]
print('18')
# add return_trained_classifiers to the function return, if requested, otherwise omit
if return_trained_classifiers is not False:
print('19')
print(clf_comparison)
print(mean_feature_importances)
print(trained_classifiers)
return clf_comparison, mean_feature_importances, trained_classifiers
else:
print('20')
return clf_comparison, mean_feature_importances
This is the code and attachment should help you reproduce error. The Dataframe can be downloaded above and placed here to run the code. I believe I included every package necessary to run code if not please import
dfage_train = pd.read_csv('StackoverFlow_Example.csv')
y1 = dfage_train['AgeBin']
X1 = dfage_train
X1 = X1.drop(['AgeBin'], axis=1)
num_jobs=-1 # I'll use all available CPUs when possible
Ageclassifier_list = [LogisticRegression(n_jobs=num_jobs, solver='lbfgs'),
RandomForestClassifier(criterion = 'entropy',n_estimators=100, n_jobs=num_jobs),
LinearSVC(class_weight=None,random_state=27,multi_class='ovr')]
X1['Pclass'] = X1['Pclass'].astype(int)
X1['isMale'] = X1['isMale'].astype(bool)
X1['Embarked'] = X1['Embarked'].astype(int)
clf_comp_Full_FeatureSet, mean_feature_importances = train_MultiClass_classifier_ensemble_CV(classifiers=Ageclassifier_list, prtParam = 1,
verbose=1,
X_data=X1,
y_data=y1)
Error output
ValueError: too many values to unpack (expected 2)
Depending on a condition, your function train_MultiClass_classifier_ensemble_CV
returns either 2 or 3 arguments. Don't do that. Because when you want to assign the returned variables, there can be a mismatch. Now, it's returning 3 values but you want to assign that to only two values. Here's the problematic part:
if return_trained_classifiers is not False:
print('19')
print(clf_comparison)
print(mean_feature_importances)
print(trained_classifiers)
return clf_comparison, mean_feature_importances, trained_classifiers # three here
else:
print('20')
return clf_comparison, mean_feature_importances # two here