I'm trying to make a classifier with XGBoost, I fit it with RandomizedSearchCV.
Here is the code of my function:
def xgboost_classifier_rscv(x,y):
from scipy import stats
from xgboost import XGBClassifier
from sklearn.metrics import fbeta_score, make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
#splitting the dataset into training and test parts
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
#bag of words implmentation
cv = CountVectorizer()
x_train = cv.fit_transform(x_train).toarray()
#TF-IDF implementation
vector = TfidfTransformer()
x_train = vector.fit_transform(x_train).toarray()
x_test = cv.transform(x_test)
scorers = {
'f1_score':make_scorer(f1_score),
'precision_score': make_scorer(precision_score),
'recall_score': make_scorer(recall_score),
'accuracy_score': make_scorer(accuracy_score)
}
param_dist = {'n_estimators': stats.randint(150, 1000),
'learning_rate': stats.uniform(0.01, 0.59),
'subsample': stats.uniform(0.3, 0.6),
'max_depth': [3, 4, 5, 6, 7, 8, 9],
'colsample_bytree': stats.uniform(0.5, 0.4),
'min_child_weight': [1, 2, 3, 4]
}
n_folds = numFolds)
skf = StratifiedKFold(n_splits=3, shuffle = True)
gridCV = RandomizedSearchCV(xgb_model,
param_distributions = param_dist,
cv = skf,
n_iter = 5,
scoring = scorers,
verbose = 3,
n_jobs = -1,
return_train_score=True,
refit = precision_score)
gridCV.fit(x_train,y_train)
best_pars = gridCV.best_params_
print("best params : ", best_pars)
xgb_predict = gridCV.predict(x_test)
xgb_pred_prob = gridCV.predict_proba(x_test)
print('best scores : ', gridCV.grid_scores_)
scores = [x[1] for x in gridCV.grid_scores_]
print("best scores : ", scores)
return y_test, xgb_predict, xgb_pred_prob
When I run the code, I get an error, reported below:
TypeError Traceback (most recent call last)
<ipython-input-30-9adf84d48e5c> in <module>
1 print("********** Xgboost classifier *************")
2 start_time = time.monotonic()
----> 3 y_test, xgb_predict, xgb_pred_prob = xgboost_classifier_rscv(x,y)
4 end_time = time.monotonic()
5 print("the time consumed is : ", timedelta(seconds=end_time - start_time))
<ipython-input-29-e0c6ae026076> in xgboost_classifier_rscv(x, y)
70 # verbose=3, random_state=1001, refit='precision_score' )
71
---> 72 gridCV.fit(x_train,y_train)
73 best_pars = gridCV.best_params_
74 print("best params : ", best_pars)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
858 # parameter set.
859 if callable(self.refit):
--> 860 self.best_index_ = self.refit(results)
861 if not isinstance(self.best_index_, numbers.Integral):
862 raise TypeError('best_index_ returned is not an integer')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
TypeError: precision_score() missing 1 required positional argument: 'y_pred'
When I do the same thing but with GridSearchCV instead of RandomizedSearchCV, the code runs without any problems!
It's not precision_score
it's 'precision_score'
(with ' '), like this-
gridCV = RandomizedSearchCV(xgb_model,
param_distributions = param_dist,
cv = skf,
n_iter = 5,
scoring = scorers,
verbose = 3,
n_jobs = -1,
return_train_score=True,
refit = 'precision_score')
Another error:
grid_scores_
has been removed, so changed it to cv_results_
(in the last 3rd and 4th line)
print('best scores : ', gridCV.cv_results_)
scores = [x[1] for x in gridCV.cv_results_]
One more error:
You have not defined that xgb_model
, so add that.
xgb_model = XGBClassifier(n_jobs = -1, random_state = 42)