python scikit-learn boosting

HistGradientBoostingClassifier - ValueError: could not convert string to float


I'm fitting a HistGradientBoostingClassifier on data with some categorical features, and I'm getting ValueError: could not convert string to float, even though I declared the categorical features through the categorical_features parameter.

The documentation for HistGradientBoostingClassifier says that it supports categorical features. Why am I getting this error?

I tried different inputs for categorical_features (a boolean mask, column indices), thinking I was doing something wrong there, but I still can't make it work.

Here's a sample of my code:

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier

X_train = pd.DataFrame({'person_age': {24716: 33.0,
  37121: 28.0,
  34325: 24.0,
  7068: 24.0,
  11680: 23.0,
  17900: 34.0,
  16108: 22.0,
  26879: 27.0,
  37408: 23.0,
  10782: 26.0,
  40871: 26.0,
  16929: 23.0,
  21868: 28.0,
  34622: 31.0,
  14948: 24.0,
  22929: 33.0,
  15295: 26.0,
  16620: 23.0,
  42191: 24.0,
  13442: 26.0},
 'person_gender': {24716: 'female',
  37121: 'male',
  34325: 'male',
  7068: 'female',
  11680: 'male',
  17900: 'female',
  16108: 'male',
  26879: 'female',
  37408: 'male',
  10782: 'male',
  40871: 'male',
  16929: 'male',
  21868: 'male',
  34622: 'male',
  14948: 'male',
  22929: 'female',
  15295: 'female',
  16620: 'female',
  42191: 'male',
  13442: 'female'}})
y_train = np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])
X_train['person_gender'] = X_train['person_gender'].astype('category')

clf_hgb = HistGradientBoostingClassifier(categorical_features=['person_gender'])
clf_hgb.fit(X_train, y_train)

The full traceback is below:

ValueError                                Traceback (most recent call last)
/tmp/ipykernel_168/499751463.py in ?()
      1 clf_hgb = HistGradientBoostingClassifier(categorical_features=['person_gender'])
----> 2 clf_hgb.fit(X_train, y_train)

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
   1147                 skip_parameter_validation=(
   1148                     prefer_skip_nested_validation or global_skip_validation
   1149                 )
   1150             ):
-> 1151                 return fit_method(estimator, *args, **kwargs)

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py in ?(self, X, y, sample_weight)
    367         acc_apply_split_time = 0.0  # time spent splitting nodes
    368         acc_compute_hist_time = 0.0  # time spent computing histograms
    369         # time spent predicting X for gradient and hessians update
    370         acc_prediction_time = 0.0
--> 371         X, y = self._validate_data(X, y, dtype=[X_DTYPE], force_all_finite=False)
    372         y = self._encode_y(y)
    373         check_consistent_length(X, y)
    374         # Do not create unit sample weights by default to later skip some

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    617                 if "estimator" not in check_y_params:
    618                     check_y_params = {**default_check_params, **check_y_params}
    619                 y = check_array(y, input_name="y", **check_y_params)
    620             else:
--> 621                 X, y = check_X_y(X, y, **check_params)
    622             out = X, y
    623 
    624         if not no_val_X and check_params.get("ensure_2d", True):

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1143         raise ValueError(
   1144             f"{estimator_name} requires y to be passed, but the target y is None"
   1145         )
   1146 
-> 1147     X = check_array(
   1148         X,
   1149         accept_sparse=accept_sparse,
   1150         accept_large_sparse=accept_large_sparse,

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    914                         )
    915                     array = xp.astype(array, dtype, copy=False)
    916                 else:
    917                     array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 918             except ComplexWarning as complex_warning:
    919                 raise ValueError(
    920                     "Complex data not supported\n{}\n".format(array)
    921                 ) from complex_warning

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
    376         # Use NumPy API to support order
    377         if copy is True:
    378             array = numpy.array(array, order=order, dtype=dtype)
    379         else:
--> 380             array = numpy.asarray(array, order=order, dtype=dtype)
    381 
    382         # At this point array is a NumPy ndarray. We convert it to an array
    383         # container that is consistent with the input's namespace.

/opt/conda/envs/anaconda-ai-2024.04-py310/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
   2082     def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
   2083         values = self._values
-> 2084         arr = np.asarray(values, dtype=dtype)
   2085         if (
   2086             astype_is_view(values.dtype, arr.dtype)
   2087             and using_copy_on_write()

ValueError: could not convert string to float: 'female'

Solution

  • In scikit-learn versions < 1.4, the histogram GBMs accept only numeric input: categorical features must be pre-encoded as nonnegative integers before fitting, and passing raw strings fails during input validation exactly as in the traceback above, no matter how categorical_features is specified. A sketch of the encoding workaround follows this bullet.

    From the User Guide for v1.3.2:

    The cardinality of each categorical feature must be less than the max_bins parameter, and each categorical feature is expected to be encoded in [0, max_bins - 1]. To that end, it might be useful to pre-process the data with an OrdinalEncoder as done in Categorical Feature Support in Gradient Boosting.
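
A minimal sketch of that workaround for scikit-learn 1.3.x (the version the conda paths in the traceback suggest), reusing X_train and y_train from the question; OrdinalEncoder is the encoder the User Guide points to, and the column handling here is just one way to apply it:

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the string column into {0.0, 1.0}, which satisfies the
# "encoded in [0, max_bins - 1]" requirement quoted above.
enc = OrdinalEncoder()
X_train_enc = X_train.copy()
X_train_enc[['person_gender']] = enc.fit_transform(X_train[['person_gender']])

# categorical_features can still be passed by column name
# (supported since scikit-learn 1.2 when X is a DataFrame).
clf_hgb = HistGradientBoostingClassifier(categorical_features=['person_gender'])
clf_hgb.fit(X_train_enc, y_train)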

  • From scikit-learn 1.4 onward, no pre-encoding is needed, so upgrading is the simpler fix. From the v1.4 changelog:

    Categorical features no longer need to be encoded with numbers.
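
On scikit-learn >= 1.4 the original code then works essentially unchanged. Since 'person_gender' was already cast to the category dtype, the "from_dtype" option (added in 1.4, and the default as of 1.6) detects it automatically:

from sklearn.ensemble import HistGradientBoostingClassifier

# scikit-learn >= 1.4: no manual encoding needed
clf_hgb = HistGradientBoostingClassifier(categorical_features="from_dtype")
clf_hgb.fit(X_train, y_train)  # 'person_gender' is picked up via its category dtype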