I am trying to logistic Regression Model, and run some test but I keep getting this error. Not really sure what I have done differently to everyone else
from sklearn import preprocessing
X = df.iloc[:,:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]ere
This is how I am separating my columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
TTS
logReg = LogisticRegression(n_jobs=-1)
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_train)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:" , mae)
ValueError Traceback (most recent call last)
Cell In [112], line 1
----> 1 mae = mean_absolute_error(y_test, y_pred)
2 print("MAE:" , mae)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:196, in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
141 def mean_absolute_error(
142 y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
143 ):
144 """Mean absolute error regression loss.
145
146 Read more in the :ref:`User Guide <mean_absolute_error>`.
(...)
194 0.85...
195 """
--> 196 y_type, y_true, y_pred, multioutput = _check_reg_targets(
197 y_true, y_pred, multioutput
198 )
199 check_consistent_length(y_true, y_pred, sample_weight)
200 output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py:100, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
66 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):
67 """Check that y_true and y_pred belong to the same regression task.
68
69 Parameters
(...)
98 correct keyword.
99 """
--> 100 check_consistent_length(y_true, y_pred)
101 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
102 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:387, in check_consistent_length(*arrays)
385 uniques = np.unique(lengths)
386 if len(uniques) > 1:
--> 387 raise ValueError(
388 "Found input variables with inconsistent numbers of samples: %r"
389 % [int(l) for l in lengths]
390 )
ValueError: Found input variables with inconsistent numbers of samples: [25404, 101612]
I thought it was the way I split the columns but that doesn't seem to be the issue It works when the test size is 50/50 but no other test size works
You are comparing the predicted labels for the train set with the labels for the test set, which are of different sizes, hence the error.
Replace
y_pred = logReg.predict(X_train)
with
y_pred = logReg.predict(X_test)