I am having trouble creating a preprocessor for my data. My preprocessor consists of a pipeline for numerical features that imputes NaN and scales values. It also has a pipeline for categorical data that imputes NaN and target encodes. The final transformer is a selector that retains features that meet a certain criteria. When I fit my preprocessor to the data I get this error:
X_40 = preprocessor_40.fit_transform(X, y)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_33/4000599259.py in ?()
----> 1 X_40 = preprocessor_40.fit_transform(X, y)
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y)
723 self._validate_transformers()
724 self._validate_column_callables(X)
725 self._validate_remainder(X)
726
--> 727 result = self._fit_transform(X, y, _fit_transform_one)
728
729 if not result:
730 self._update_fitted_transformers([])
/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y, func, fitted, column_as_strings)
669 except ValueError as e:
670 if "Expected 2D array, got 1D array instead" in str(e):
671 raise ValueError(_ERR_MSG_1DCOLUMN) from e
672 else:
--> 673 raise
/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, iterable)
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
1859 # If n_jobs==1, run the computation sequentially and return
1860 # immediatly to avoid overheads.
1861 output = self._get_sequential_output(iterable)
1862 next(output)
-> 1863 return output if self.return_generator else list(output)
1864
1865 # Let's create an ID that uniquely identifies the current call. If the
1866 # call is interrupted early and that the same instance is immediately
/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
1802 finally:
1803 self.print_progress()
1804 self._running = False
1805 self._iterating = False
-> 1806 self._original_iterator = None
/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, *args, **kwargs)
119 UserWarning,
120 )
121 config = {}
122 with config_context(**config):
--> 123 return self.function(*args, **kwargs)
/opt/conda/lib/python3.10/site-packages/sklearn/pipeline.py in ?(transformer, X, y, weight, message_clsname, message, **fit_params)
889 be multiplied by ``weight``.
890 """
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
896
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, **fit_params)
877 # fit method of arity 1 (unsupervised transformation)
878 return self.fit(X, **fit_params).transform(X)
879 else:
880 # fit method of arity 2 (supervised transformation)
--> 881 return self.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.10/site-packages/sklearn/feature_selection/_univariate_selection.py in ?(self, X, y)
463 Returns the instance itself.
464 """
465 self._validate_params()
466
--> 467 X, y = self._validate_data(
468 X, y, accept_sparse=["csr", "csc"], multi_output=True
469 )
470
/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, **check_params)
580 if "estimator" not in check_y_params:
581 check_y_params = {**default_check_params, **check_y_params}
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
586
587 if not no_val_X and check_params.get("ensure_2d", True):
/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
1105
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
876 )
877 array = xp.astype(array, dtype, copy=False)
878 else:
879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 880 except ComplexWarning as complex_warning:
881 raise ValueError(
882 "Complex data not supported\n{}\n".format(array)
883 ) from complex_warning
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
181 if xp is None:
182 xp, _ = get_namespace(array)
183 if xp.__name__ in {"numpy", "numpy.array_api"}:
184 # Use NumPy API to support order
--> 185 array = numpy.asarray(array, order=order, dtype=dtype)
186 return xp.asarray(array, copy=copy)
187 else:
188 return xp.asarray(array, dtype=dtype, copy=copy)
/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
1996 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
1997 values = self._values
-> 1998 arr = np.asarray(values, dtype=dtype)
1999 if (
2000 astype_is_view(values.dtype, arr.dtype)
2001 and using_copy_on_write()
ValueError: could not convert string to float: 'RL'
I suspect it is an issue with my column transformer rather than the selector as I have been having problems with it and this is a new approach to selecting features I am trying out. However, I am not sure. The follow code is how I defined my preprocessors:
def get_preprocessor(percent, categorical, numerical):
#Pipeline to impute missing values and scale numerical variables
numerical_processes = Pipeline(steps = [('imputer_num', SimpleImputer(strategy = 'constant', fill_value = 0)),
('scaler', StandardScaler())])
#Pipeline to impute missing values and encode categorical variables
categorical_processes = Pipeline(steps = [('imputer_cat', SimpleImputer(strategy = 'constant', fill_value = 'None')),
('encoder', ce.TargetEncoder())])
#Selector to retain only features that meet a certain threshold in an f_regression
selector = SelectPercentile(f_regression, percentile = percent)
#create a preprocessor that wraps up processes for both numerical and categorical variables
Preprocessor = ColumnTransformer(
transformers = [('numeric', numerical_processes, numerical),
('categorical', categorical_processes, categorical),
('selector', selector, numerical + categorical)])
return Preprocessor
preprocessor_full = get_preprocessor(0, categorical, numerical)
preprocessor_40 = get_preprocessor(40, categorical, numerical)
preprocessor_70 = get_preprocessor(70, categorical, numerical)
I have checked how I define categorical and numerical columns many times and I am finding no issues. When it refers to RL in the error that is one of the values in a column called MSZoning and it is listed in categorical, but not numerical. So I don't know why I am getting this error if it's in the right place. It is also the first categorical feature in my column indices for reference.
I tried specifying two different selectors to work in my pipelines rather than having it as a step in the column transformer and I got the same error. However, when I had the selector in both the numerical and categorical pipelines I fit and transformed the dataframe with those rather than combining it all into a column transformer and it worked. I don't know why the column transformer is giving me issues. I know you might say that I should just do without it, but I just want to make this work.
ColumnTransformer
applies its transformers in parallel, horizontally stacking the outputs (not in sequence, replacing columns in place). See e.g. Consistent ColumnTransformer for intersecting lists of columns
So your feature selection transformer is getting the original categorical columns, not the target-encoded ones, and balks at the string values.
Instead, you should use a Pipeline
whose first step is a column transformer, and whose second step is a feature selection.