python · scikit-learn · feature-selection · data-preprocessing

Why am I getting a data type error from my preprocessor?


I am having trouble creating a preprocessor for my data. My preprocessor consists of a pipeline for numerical features that imputes NaNs and scales values, and a pipeline for categorical features that imputes NaNs and target-encodes them. The final transformer is a selector that retains only the features that meet a certain criterion. When I fit my preprocessor to the data I get this error:

X_40 = preprocessor_40.fit_transform(X, y)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_33/4000599259.py in ?()
----> 1 X_40 = preprocessor_40.fit_transform(X, y)

/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
    138     @wraps(f)
    139     def wrapped(self, X, *args, **kwargs):
--> 140         data_to_wrap = f(self, X, *args, **kwargs)
    141         if isinstance(data_to_wrap, tuple):
    142             # only wrap the first output for cross decomposition
    143             return (

/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y)
    723         self._validate_transformers()
    724         self._validate_column_callables(X)
    725         self._validate_remainder(X)
    726 
--> 727         result = self._fit_transform(X, y, _fit_transform_one)
    728 
    729         if not result:
    730             self._update_fitted_transformers([])

/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y, func, fitted, column_as_strings)
    669         except ValueError as e:
    670             if "Expected 2D array, got 1D array instead" in str(e):
    671                 raise ValueError(_ERR_MSG_1DCOLUMN) from e
    672             else:
--> 673                 raise

/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, iterable)
     59         iterable_with_config = (
     60             (_with_config(delayed_func, config), args, kwargs)
     61             for delayed_func, args, kwargs in iterable
     62         )
---> 63         return super().__call__(iterable_with_config)

/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
   1859             # If n_jobs==1, run the computation sequentially and return
   1860             # immediatly to avoid overheads.
   1861             output = self._get_sequential_output(iterable)
   1862             next(output)
-> 1863             return output if self.return_generator else list(output)
   1864 
   1865         # Let's create an ID that uniquely identifies the current call. If the
   1866         # call is interrupted early and that the same instance is immediately

/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
   1802         finally:
   1803             self.print_progress()
   1804             self._running = False
   1805             self._iterating = False
-> 1806             self._original_iterator = None

/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, *args, **kwargs)
    119                 UserWarning,
    120             )
    121             config = {}
    122         with config_context(**config):
--> 123             return self.function(*args, **kwargs)

/opt/conda/lib/python3.10/site-packages/sklearn/pipeline.py in ?(transformer, X, y, weight, message_clsname, message, **fit_params)
    889     be multiplied by ``weight``.
    890     """
    891     with _print_elapsed_time(message_clsname, message):
    892         if hasattr(transformer, "fit_transform"):
--> 893             res = transformer.fit_transform(X, y, **fit_params)
    894         else:
    895             res = transformer.fit(X, y, **fit_params).transform(X)
    896 

/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
    138     @wraps(f)
    139     def wrapped(self, X, *args, **kwargs):
--> 140         data_to_wrap = f(self, X, *args, **kwargs)
    141         if isinstance(data_to_wrap, tuple):
    142             # only wrap the first output for cross decomposition
    143             return (

/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, **fit_params)
    877             # fit method of arity 1 (unsupervised transformation)
    878             return self.fit(X, **fit_params).transform(X)
    879         else:
    880             # fit method of arity 2 (supervised transformation)
--> 881             return self.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.10/site-packages/sklearn/feature_selection/_univariate_selection.py in ?(self, X, y)
    463             Returns the instance itself.
    464         """
    465         self._validate_params()
    466 
--> 467         X, y = self._validate_data(
    468             X, y, accept_sparse=["csr", "csc"], multi_output=True
    469         )
    470 

/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, **check_params)
    580                 if "estimator" not in check_y_params:
    581                     check_y_params = {**default_check_params, **check_y_params}
    582                 y = check_array(y, input_name="y", **check_y_params)
    583             else:
--> 584                 X, y = check_X_y(X, y, **check_params)
    585             out = X, y
    586 
    587         if not no_val_X and check_params.get("ensure_2d", True):

/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1102         raise ValueError(
   1103             f"{estimator_name} requires y to be passed, but the target y is None"
   1104         )
   1105 
-> 1106     X = check_array(
   1107         X,
   1108         accept_sparse=accept_sparse,
   1109         accept_large_sparse=accept_large_sparse,

/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    876                         )
    877                     array = xp.astype(array, dtype, copy=False)
    878                 else:
    879                     array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 880             except ComplexWarning as complex_warning:
    881                 raise ValueError(
    882                     "Complex data not supported\n{}\n".format(array)
    883                 ) from complex_warning

/opt/conda/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
    181     if xp is None:
    182         xp, _ = get_namespace(array)
    183     if xp.__name__ in {"numpy", "numpy.array_api"}:
    184         # Use NumPy API to support order
--> 185         array = numpy.asarray(array, order=order, dtype=dtype)
    186         return xp.asarray(array, copy=copy)
    187     else:
    188         return xp.asarray(array, dtype=dtype, copy=copy)

/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
   1996     def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
   1997         values = self._values
-> 1998         arr = np.asarray(values, dtype=dtype)
   1999         if (
   2000             astype_is_view(values.dtype, arr.dtype)
   2001             and using_copy_on_write()

ValueError: could not convert string to float: 'RL'

I suspect it is an issue with my column transformer rather than the selector, as I have been having problems with the column transformer, while the selector is a new approach to feature selection that I am trying out. However, I am not sure. The following code is how I defined my preprocessors:

import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def get_preprocessor(percent, categorical, numerical):

    # Pipeline to impute missing values and scale numerical variables
    numerical_processes = Pipeline(steps=[
        ('imputer_num', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler()),
    ])

    # Pipeline to impute missing values and encode categorical variables
    categorical_processes = Pipeline(steps=[
        ('imputer_cat', SimpleImputer(strategy='constant', fill_value='None')),
        ('encoder', ce.TargetEncoder()),
    ])

    # Selector to retain only features that meet a certain threshold in an f_regression
    selector = SelectPercentile(f_regression, percentile=percent)

    # Create a preprocessor that wraps up processes for both numerical and categorical variables
    preprocessor = ColumnTransformer(
        transformers=[('numeric', numerical_processes, numerical),
                      ('categorical', categorical_processes, categorical),
                      ('selector', selector, numerical + categorical)])
    return preprocessor

preprocessor_full = get_preprocessor(0, categorical, numerical)
preprocessor_40 = get_preprocessor(40, categorical, numerical)
preprocessor_70 = get_preprocessor(70, categorical, numerical)

I have checked how I define the categorical and numerical columns many times and found no issues. The 'RL' referred to in the error is one of the values in a column called MSZoning, which is listed in categorical but not in numerical, so I don't know why I am getting this error if the column is in the right place. For reference, it is also the first categorical feature in my column indices.

I tried specifying two separate selectors inside the pipelines, rather than having one selector as a step in the column transformer, and I got the same error. However, when I put a selector in both the numerical and categorical pipelines and fit and transformed the dataframe with those two pipelines directly, rather than combining everything into a column transformer, it worked (roughly as in the sketch below). I don't know why the column transformer is giving me issues. I know you might say I should just do without it, but I want to make this work.
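
Roughly, the version that worked looked like this (a sketch, assuming numerical and categorical are lists of column names; not my exact code):

from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Selector as the last step of each pipeline, applied to the
# dataframe subsets directly, with no ColumnTransformer involved
numerical_with_selector = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('selector_num', SelectPercentile(f_regression, percentile=40)),
])
categorical_with_selector = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', ce.TargetEncoder()),
    ('selector_cat', SelectPercentile(f_regression, percentile=40)),
])

X_num = numerical_with_selector.fit_transform(X[numerical], y)
X_cat = categorical_with_selector.fit_transform(X[categorical], y)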


Solution

  • ColumnTransformer applies its transformers in parallel, horizontally stacking the outputs (not in sequence, replacing columns in place). See e.g. Consistent ColumnTransformer for intersecting lists of columns.

    So your feature selection transformer is getting the original categorical columns, not the target-encoded ones, and balks at the string values.

    Instead, you should use a Pipeline whose first step is the column transformer (with just the numeric and categorical branches) and whose second step is the feature selector.
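
    A minimal sketch of that arrangement, reusing the pipelines from the question (assuming ce is category_encoders and that numerical and categorical are the lists of column names):

    import category_encoders as ce
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_selection import SelectPercentile, f_regression
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    def get_preprocessor(percent, categorical, numerical):
        numerical_processes = Pipeline(steps=[
            ('imputer_num', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', StandardScaler()),
        ])
        categorical_processes = Pipeline(steps=[
            ('imputer_cat', SimpleImputer(strategy='constant', fill_value='None')),
            ('encoder', ce.TargetEncoder()),
        ])

        # Step 1: encode and scale; the stacked output is entirely numeric
        column_transform = ColumnTransformer(transformers=[
            ('numeric', numerical_processes, numerical),
            ('categorical', categorical_processes, categorical),
        ])

        # Step 2: select features from the already-transformed matrix
        return Pipeline(steps=[
            ('transform', column_transform),
            ('selector', SelectPercentile(f_regression, percentile=percent)),
        ])

    preprocessor_40 = get_preprocessor(40, categorical, numerical)
    X_40 = preprocessor_40.fit_transform(X, y)  # no string values reach the selector

    Because the selector is now a pipeline step rather than a ColumnTransformer branch, it receives the horizontally stacked output of the two branches (all numeric) instead of the raw input columns.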