I'm having trouble fitting a logistic regression model. I get a much higher score with the best parameters on the training data, but against the test data the scores are much lower and the confusion matrix doesn't look good either. The same issue persists with other models. I've used a 30% test split from the full data set (roughly 900 rows).
# importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
# creating train/test splits
print(df['high_traffic'].value_counts())
y = df['high_traffic']
X = df.drop(["recipe", "high_traffic"], axis=1)
X = pd.get_dummies(X, columns=['category'])
le = LabelEncoder()
X["servings"] = le.fit_transform(X["servings"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3, stratify=y)
# scaling variables
scaler = StandardScaler()
scaled_train_X = scaler.fit_transform(X_train)
# PCA
pca = PCA()
pca.fit(scaled_train_X)
exp_variance = pca.explained_variance_ratio_
cum_exp_variance = np.cumsum(exp_variance)
print(cum_exp_variance)
pca = PCA(n_components=12, random_state=7)
train_pca = pca.fit_transform(scaled_train_X)
test_pca = pca.fit_transform(X_test)
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=3)
parameters = [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
               'C': np.logspace(-4, 4, 20),
               'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
               'max_iter': [100, 1000, 2500, 5000]}]
grid_search = GridSearchCV(estimator=logreg, param_grid=parameters, scoring='accuracy',
                           cv=5, verbose=1, refit=True)
grid_search.fit(train_pca, y_train)
#logreg.fit(train_pca,y_train)
#pred_y_log = logreg.predict(test_pca)
pred_y_log = grid_search.best_estimator_.predict(test_pca)
print(grid_search.best_params_)
print(grid_search.best_score_)
from sklearn.metrics import classification_report
class_log = classification_report(y_test, pred_y_log)
print("Logistic Regression: \n", class_log)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_y_log))
The results:

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits
{'C': 0.23357214690901212, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
0.7551049197672368
Logistic Regression: accuracy 0.49
[[54 37]
 [80 57]]
You forgot to scale your test set, and you're re-fitting both the scaler and the PCA on it. Fit the preprocessing on the training data only, then use transform (not fit_transform) so the test set is projected into the same feature space the model was trained on:

scaled_test_X = scaler.transform(X_test)
# ...
test_pca = pca.transform(scaled_test_X)

With fit_transform, PCA learns a fresh set of components from the test data, so the columns of test_pca no longer correspond to the features the model saw during training, which is why the test scores collapse.
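
More generally, you can rule out this whole class of leakage by putting the scaler, PCA, and classifier in a Pipeline: GridSearchCV then re-fits the preprocessing on the training folds only, and predict() applies the fitted scaler and PCA to the test set automatically. A minimal sketch against your existing X_train/X_test split (the trimmed penalty/solver grid is my simplification to keep every combination valid, not something from your post):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# scaler and PCA are pipeline steps, so cross-validation re-fits them
# on the training folds only -- nothing from X_test leaks in
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=12, random_state=7)),
    ('clf', LogisticRegression(random_state=3, max_iter=5000)),
])

# liblinear and saga both support l1 and l2, so no candidate fails
param_grid = {
    'clf__C': np.logspace(-4, 4, 20),
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
}

grid_search = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, refit=True)
grid_search.fit(X_train, y_train)   # raw, unscaled training features

# predict() runs X_test through the fitted scaler and PCA automatically
pred_y_log = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, pred_y_log))

With the pipeline in place, best_score_ and the held-out test accuracy become directly comparable; any gap that remains points to genuine overfitting or class imbalance rather than a preprocessing bug.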