Skip to content

Commit 8a651b1

Browse files
committed
FIX TargetEncoder sklearn interop: add categories_/n_features_in_, fix attrs conversion, handle multi-feature with warnings
1 parent 7b5cd0e commit 8a651b1

2 files changed

Lines changed: 197 additions & 9 deletions

File tree

python/cuml/cuml/preprocessing/TargetEncoder.py

Lines changed: 171 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,33 @@ class TargetEncoder(Base, InteropMixin):
7070
The statistic used in encoding, mean, variance or median of the
7171
target.
7272
73+
Attributes
74+
----------
75+
categories_ : list of cupy.ndarray
76+
The categories of each input feature determined during fitting.
77+
Each element is an array of unique category values for that feature,
78+
sorted in ascending order.
79+
n_features_in_ : int
80+
Number of features seen during :meth:`fit`.
81+
encode_all : cudf.DataFrame
82+
DataFrame containing the learned encodings for all category
83+
combinations. Used internally for transforming new data.
84+
mean : float
85+
The overall mean of the target variable, computed during fitting.
86+
Used for smoothing and imputing unseen categories.
87+
y_stat_val : float
88+
The statistic value (mean, variance, or median) of the target
89+
variable, depending on the ``stat`` parameter. Used to impute
90+
encodings for unseen categories.
91+
train : cudf.DataFrame or None
92+
The training DataFrame used during fitting, containing the original
93+
features, target values, and fold assignments. Set to ``None`` if
94+
the encoder was loaded from a sklearn model via :meth:`from_sklearn`.
95+
train_encode : cuml.internals.array.CumlArray or None
96+
The encoded values for the training data, computed during
97+
:meth:`fit` or :meth:`fit_transform`. Set to ``None`` if the
98+
encoder was loaded from a sklearn model via :meth:`from_sklearn`.
99+
73100
References
74101
----------
75102
.. [1] https://maxhalford.github.io/blog/target-encoding/
@@ -169,6 +196,20 @@ def fit(self, x, y, fold_ids=None):
169196
self : TargetEncoder
170197
A fitted instance of itself to allow method chaining
171198
"""
199+
if y is None:
200+
raise TypeError(
201+
f"Input of type {type(y)} is not cudf.Series, "
202+
"pandas.Series, "
203+
"numpy.ndarray, "
204+
"or cupy.ndarray"
205+
)
206+
207+
if len(x) == 0:
208+
raise ValueError(
209+
"Found array with 0 sample(s) while a minimum of 1 is "
210+
"required."
211+
)
212+
172213
if self.split_method == "customize" and fold_ids is None:
173214
raise ValueError(
174215
"`fold_ids` is required "
@@ -246,9 +287,17 @@ def transform(self, x) -> CumlArray:
246287
"""
247288
self._check_is_fitted()
248289
test = self._data_with_strings_to_cudf_dataframe(x)
290+
291+
# Check feature dimensions match
292+
x_cols = [i for i in test.columns.tolist() if i != self.id_col]
293+
if hasattr(self, "n_features_in_") and len(x_cols) != self.n_features_in_:
294+
raise ValueError(
295+
f"X has {len(x_cols)} features, but TargetEncoder is "
296+
f"expecting {self.n_features_in_} features as input."
297+
)
298+
249299
if self._is_train_df(test):
250300
return self.train_encode
251-
x_cols = [i for i in test.columns.tolist() if i != self.id_col]
252301
test = test.merge(self.encode_all, on=x_cols, how="left")
253302
return self._impute_and_sort(test)
254303

@@ -259,6 +308,19 @@ def _fit_transform(self, x, y, fold_ids):
259308
cp.random.seed(self.seed)
260309
train = self._data_with_strings_to_cudf_dataframe(x)
261310
x_cols = [i for i in train.columns.tolist() if i != self.id_col]
311+
312+
# Store n_features_in_ and categories_ for sklearn interop
313+
self.n_features_in_ = len(x_cols)
314+
self._x_cols = x_cols
315+
316+
# Extract unique categories for each feature (sorted for consistency)
317+
self.categories_ = []
318+
for col in x_cols:
319+
unique_vals = train[col].unique()
320+
# Sort for deterministic ordering
321+
unique_vals = unique_vals.sort_values()
322+
self.categories_.append(unique_vals.values)
323+
262324
train[self.y_col] = self._make_y_column(y)
263325

264326
self.n_folds = min(self.n_folds, len(train))
@@ -422,9 +484,11 @@ def _groupby_agg(self, train, x_cols, op, y_cols):
422484
return df_each_fold, df_all
423485

424486
def _check_is_fitted(self):
    """Raise ``NotFittedError`` unless this encoder has been fitted.

    The encoder counts as fitted when ``fit``/``fit_transform`` set
    ``self._fitted``, or when it was loaded from a sklearn model via
    ``from_sklearn`` — in that case ``encode_all`` is present even
    though ``train`` is ``None``.
    """
    # Guard clause: either fit() ran or sklearn attributes were loaded.
    if self._fitted or hasattr(self, "encode_all"):
        return
    raise NotFittedError(
        "This TargetEncoder instance is not fitted yet. Call 'fit' "
        "with appropriate arguments before using this estimator."
    )
@@ -434,6 +498,9 @@ def _is_train_df(self, df):
434498
Return True if the dataframe `df` is the training dataframe, which
435499
is used in `fit_transform`
436500
"""
501+
# If train is None (e.g., loaded from sklearn), we can't compare
502+
if self.train is None:
503+
return False
437504
if len(df) != len(self.train):
438505
return False
439506
self.train = self.train.sort_values(self.id_col).reset_index(drop=True)
@@ -521,17 +588,112 @@ def _params_to_cpu(self):
521588
return params
522589

523590
def _attrs_from_cpu(self, model):
    """Build cuML-side attributes from a fitted sklearn TargetEncoder.

    Converts ``model.categories_`` / ``model.encodings_`` /
    ``model.target_mean_`` into the attributes cuML's TargetEncoder
    uses internally (``encode_all``, ``mean``, ``y_stat_val``, ...).

    For a single feature the mapping is exact. For multiple features,
    sklearn encodes each feature independently while cuML encodes
    feature *combinations*; we approximate by taking the cartesian
    product of per-feature categories and averaging the per-feature
    encodings — a ``UserWarning`` is emitted, and conversion is refused
    (``UnsupportedOnGPU``) beyond 100,000 combinations.

    Returns
    -------
    dict
        Attribute name -> value mapping consumed by the interop layer.
        ``train``/``train_encode`` are ``None`` since no training data
        is available from the sklearn model.
    """
    import math

    from cuml.internals.interop import UnsupportedOnGPU

    categories_gpu = [to_gpu(cat) for cat in model.categories_]
    n_features = len(model.categories_)

    # Column names must match cuML's internal naming scheme so that
    # transform()'s merge on x_cols finds them.
    if n_features == 1:
        x_cols = [self.x_col]
    else:
        x_cols = [f"{self.x_col}_{i}" for i in range(n_features)]

    if n_features == 1:
        # Exact mapping: one row per category with its learned encoding.
        encode_all = cudf.DataFrame({
            x_cols[0]: model.categories_[0],
            self.out_col: model.encodings_[0],
        })
    else:
        from itertools import product

        total_combinations = math.prod(
            len(cats) for cats in model.categories_
        )

        # Hard cap: the cartesian product can explode combinatorially.
        max_combinations = 100_000
        if total_combinations > max_combinations:
            raise UnsupportedOnGPU(
                f"Converting multi-feature sklearn TargetEncoder would "
                f"require {total_combinations:,} category combinations, "
                f"exceeding the limit of {max_combinations:,}. Consider "
                f"using single-feature TargetEncoder instead."
            )

        warnings.warn(
            "Converting multi-feature sklearn TargetEncoder to cuML uses "
            "an approximation (averaged per-feature encodings). Results "
            "may differ from both sklearn and native cuML behavior.",
            UserWarning,
        )

        # Pair each category with its encoding so the product yields
        # (category, encoding) tuples directly — no index bookkeeping.
        cat_enc_pairs = [
            list(zip(cats, encs))
            for cats, encs in zip(model.categories_, model.encodings_)
        ]

        # Build columnar data (one list per column) rather than a list of
        # per-row dicts: equivalent result, far cheaper to construct and
        # to hand to cudf.DataFrame.
        columns = {col: [] for col in x_cols}
        out_vals = []
        for combo in product(*cat_enc_pairs):
            enc_sum = 0.0
            for col, (cat_val, enc_val) in zip(x_cols, combo):
                columns[col].append(cat_val)
                enc_sum += enc_val
            # Average the per-feature encodings for the combination.
            out_vals.append(enc_sum / n_features)
        columns[self.out_col] = out_vals
        encode_all = cudf.DataFrame(columns)

    return {
        "encode_all": encode_all,
        "categories_": categories_gpu,
        "_x_cols": x_cols,
        "mean": float(model.target_mean_),
        # No per-stat value is recoverable from sklearn; fall back to
        # the target mean for unseen-category imputation.
        "y_stat_val": float(model.target_mean_),
        "_fitted": True,
        "train": None,
        "train_encode": None,
        **super()._attrs_from_cpu(model),
    }
530661

531662
def _attrs_to_cpu(self, model):
    """Build sklearn-side attributes from this fitted cuML TargetEncoder.

    Produces ``categories_``, ``encodings_`` and ``target_mean_`` in the
    layout sklearn's TargetEncoder expects: ``encodings_[i]`` is a 1-D
    numpy array aligned element-wise with ``categories_[i]``.

    For multiple features, cuML stores encodings per category
    *combination*; sklearn wants one encoding per category per feature.
    We approximate by averaging, for each category value, the encodings
    of all combinations containing it (a ``UserWarning`` is emitted).

    Returns
    -------
    dict
        Attribute name -> value mapping consumed by the interop layer.
    """
    # Move category arrays to host (numpy) for sklearn.
    categories_cpu = [to_cpu(cat) for cat in self.categories_]

    n_features = len(self.categories_)
    if n_features > 1:
        warnings.warn(
            "Converting multi-feature cuML TargetEncoder to sklearn uses "
            "an approximation (averaged combination encodings per feature). "
            "Results may differ from native sklearn behavior.",
            UserWarning,
        )

    encodings_list = []
    # Note: no enumerate — the loop index was unused.
    for col, cats in zip(self._x_cols, self.categories_):
        feature_encodings = []
        for cat_val in cats:
            mask = self.encode_all[col] == cat_val
            if mask.any():
                # Multi-feature case: average across all combinations
                # containing this category value (exact for 1 feature,
                # where each category maps to a single row).
                enc_val = float(
                    self.encode_all.loc[mask, self.out_col].mean()
                )
            else:
                # Category absent from encode_all: impute with the
                # overall target mean, mirroring unseen-category
                # handling at transform time.
                enc_val = float(self.mean)
            feature_encodings.append(enc_val)
        encodings_list.append(np.array(feature_encodings))

    return {
        "encodings_": encodings_list,
        "categories_": categories_cpu,
        "target_mean_": float(self.mean),
        **super()._attrs_to_cpu(model),
    }

python/cuml/cuml_accel_tests/upstream/scikit-learn/xfail-list.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1650,3 +1650,29 @@
16501650
- "sklearn.linear_model.tests.test_logistic::test_liblinear_dual_random_state[42]"
16511651
- "sklearn.linear_model.tests.test_logistic::test_liblinear_with_large_values"
16521652
- "sklearn.svm.tests.test_svm::test_liblinear_set_coef[42]"
1653+
- reason: cuML TargetEncoder does not support complex128 dtype
1654+
marker: cuml_accel_bugs
1655+
tests:
1656+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_complex_data]"
1657+
- reason: cudf has different object dtype handling than pandas/numpy
1658+
marker: cuml_accel_bugs
1659+
tests:
1660+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_dtype_object]"
1661+
- reason: cudf DataFrame pickle serialization differs from sklearn internal structures
1662+
marker: cuml_accel_bugs
1663+
tests:
1664+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_estimators_pickle]"
1665+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_estimators_pickle(readonly_memmap=True)]"
1666+
- reason: cuML TargetEncoder has different internal data structures than sklearn
1667+
marker: cuml_accel_bugs
1668+
tests:
1669+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_transformer_data_not_an_array]"
1670+
- reason: cuML TargetEncoder uses different cross-validation approach affecting sample order
1671+
marker: cuml_accel_bugs
1672+
tests:
1673+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_methods_sample_order_invariance]"
1674+
- "sklearn.utils.tests.test_common::test_estimators[TargetEncoder()-check_methods_subset_invariance]"
1675+
- reason: cuML TargetEncoder fit docstring attributes differ from sklearn conventions
1676+
marker: cuml_accel_bugs
1677+
tests:
1678+
- "sklearn.utils.tests.test_docstring_parameters::test_fit_docstring_attributes[TargetEncoder-TargetEncoder]"

0 commit comments

Comments
 (0)