I am using linear regression to predict 'Value' over the years. I have the following data, which I imported from a CSV file:
**df.head()**:
LOCATION INDICATOR SUBJECT MEASURE FREQUENCY TIME Value Flag Codes
0 IRL LTINT TOT PC_PA M 1.167610e+18 4.04 NaN
1 IRL LTINT TOT PC_PA M 1.170288e+18 4.07 NaN
2 IRL LTINT TOT PC_PA M 1.172707e+18 3.97 NaN
3 IRL LTINT TOT PC_PA M 1.175386e+18 4.19 NaN
4 IRL LTINT TOT PC_PA M 1.177978e+18 4.32 NaN
The format of the TIME column was YYYY-DD-MM before I applied the following conversion:
***df['TIME'] = pd.to_datetime(df['TIME'])
df['TIME'] = pd.to_numeric(df['TIME'])
df['TIME'] = df['TIME'].astype(float)***
To fit the data, I used the following code:
***X=df['TIME']
Y=df['Value']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=101)
lm= LinearRegression()***
When executing the fit call `lm.fit(X_train, Y_train)`, I got the following error:
***ValueError Traceback (most recent call last)
Cell In[12], line 1
----> 1 lm.fit(X_train, Y_train)
File ~\anaconda3\Lib\site-packages\sklearn\base.py:1151, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1144 estimator._validate_params()
1146 with config_context(
1147 skip_parameter_validation=(
1148 prefer_skip_nested_validation or global_skip_validation
1149 )
1150 ):
-> 1151 return fit_method(estimator, *args, **kwargs)
File ~\anaconda3\Lib\site-packages\sklearn\linear_model\_base.py:678, in LinearRegression.fit(self, X, y, sample_weight)
674 n_jobs_ = self.n_jobs
676 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 678 X, y = self._validate_data(
679 X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
680 )
682 has_sw = sample_weight is not None
683 if has_sw:
File ~\anaconda3\Lib\site-packages\sklearn\base.py:621, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
619 y = check_array(y, input_name="y", **check_y_params)
620 else:
--> 621 X, y = check_X_y(X, y, **check_params)
622 out = X, y
624 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:1147, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1142 estimator_name = _check_estimator_name(estimator)
1143 raise ValueError(
1144 f"{estimator_name} requires y to be passed, but the target y is None"
1145 )
-> 1147 X = check_array(
1148 X,
1149 accept_sparse=accept_sparse,
1150 accept_large_sparse=accept_large_sparse,
1151 dtype=dtype,
1152 order=order,
1153 copy=copy,
1154 force_all_finite=force_all_finite,
1155 ensure_2d=ensure_2d,
1156 allow_nd=allow_nd,
1157 ensure_min_samples=ensure_min_samples,
1158 ensure_min_features=ensure_min_features,
1159 estimator=estimator,
1160 input_name="X",
1161 )
1163 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1165 check_consistent_length(X, y)
File ~\anaconda3\Lib\site-packages\sklearn\utils\validation.py:940, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
938 # If input is 1D raise error
939 if array.ndim == 1:
--> 940 raise ValueError(
941 "Expected 2D array, got 1D array instead:\narray={}.\n"
942 "Reshape your data either using array.reshape(-1, 1) if "
943 "your data has a single feature or array.reshape(1, -1) "
944 "if it contains a single sample.".format(array)
945 )
947 if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
948 raise ValueError(
949 "dtype='numeric' is not compatible with arrays of bytes/strings."
950 "Convert your data to numeric values explicitly instead."
951 )
ValueError: Expected 2D array, got 1D array instead:
array=[1.3437792e+18 1.6198272e+18 1.3596768e+18 ... 1.3596768e+18 1.5805152e+18
1.5751584e+18].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.***
Do you have any idea how I can resolve this error? Thank you in advance.
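Change `X = df['TIME']` to `X = df[['TIME']]`.
Double brackets return a DataFrame (2D), whereas single brackets return a Series (1D). scikit-learn estimators expect `X` to be a 2D array of shape (n_samples, n_features), which is why the 1D Series triggers the "Expected 2D array, got 1D array" error.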