python scikit-learn logistic-regression model-fitting

Logistic regression test score is much lower than train score


I'm having trouble fitting a logistic regression model. The best-parameter score on the training data is much higher than the score on the test data, and the confusion matrix doesn't look good either. The same issue persists with other models. I've used a 30% test split from the full data set (roughly 900 rows).

#importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

#creating test train splits
print(df['high_traffic'].value_counts())

y = df['high_traffic']
X = df.drop(["recipe", "high_traffic"], axis=1)
X = pd.get_dummies(X, columns=['category'])

le = LabelEncoder()
X["servings"] = le.fit_transform(X["servings"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3,stratify=y)


#scaling variables
scaler = StandardScaler()
scaled_train_X = scaler.fit_transform(X_train)


#PCA
pca=PCA()
pca.fit(scaled_train_X)
exp_variance = pca.explained_variance_ratio_
cum_exp_variance = np.cumsum(exp_variance)
print(cum_exp_variance)

pca = PCA(n_components=12,random_state=7)

train_pca = pca.fit_transform(scaled_train_X)
test_pca = pca.fit_transform(X_test)



#logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=3)
parameters = [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
               'C': np.logspace(-4, 4, 20),
               'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
               'max_iter': [100, 1000, 2500, 5000]}]
grid_search = GridSearchCV(estimator=logreg, param_grid=parameters, scoring='accuracy',
                           cv=5, verbose=1, refit=True)
grid_search.fit(train_pca, y_train)

#logreg.fit(train_pca,y_train)
#pred_y_log = logreg.predict(test_pca)
pred_y_log = grid_search.best_estimator_.predict(test_pca)
print(grid_search.best_params_)
print(grid_search.best_score_)

from sklearn.metrics import classification_report
class_log = classification_report(y_test, pred_y_log)
print("Logistic Regression: \n", class_log)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_y_log))

The results:

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits

{'C': 0.23357214690901212, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
0.7551049197672368

Logistic Regression: accuracy 0.49

[[54 37]
 [80 57]]


Solution

  • You forgot to scale your test set. And when you do transform it, reuse the scaler and PCA that were already fitted on the training data (transform, not fit_transform). Refitting them on the test set projects the test features onto a different scale and basis than the one the model was trained on, which is why your 0.755 cross-validation score collapses to 0.49 accuracy on the test set.

    scaled_test_X = scaler.transform(X_test)
    # ...
    test_pca = pca.transform(scaled_test_X)
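
    A tidier way to rule out this whole class of bug is to chain the scaler, PCA, and classifier in a single Pipeline, so the fit/transform bookkeeping is handled for you. Below is a minimal sketch, assuming the same X_train/X_test/y_train/y_test split as above; the reduced solver/penalty grid is my own simplification, since not every solver in your original grid supports every penalty:

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # All preprocessing lives inside the pipeline, so each step is fitted
    # on the training folds only and merely applied to held-out data.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=12, random_state=7)),
        ('clf', LogisticRegression(random_state=3, max_iter=5000)),
    ])

    # Prefix hyperparameters with the step name; liblinear and saga both
    # support l1 and l2 penalties, so this grid has no invalid combinations.
    param_grid = {
        'clf__C': np.logspace(-4, 4, 20),
        'clf__penalty': ['l1', 'l2'],
        'clf__solver': ['liblinear', 'saga'],
    }

    grid_search = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, refit=True)
    grid_search.fit(X_train, y_train)

    # predict() runs scaler.transform and pca.transform with the
    # training-set fit before the classifier sees the data.
    pred_y_log = grid_search.best_estimator_.predict(X_test)

    With the pipeline, GridSearchCV also refits the scaler and PCA on each cross-validation fold, so the CV score itself becomes a more honest estimate of test performance.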