@@ -70,6 +70,33 @@ class TargetEncoder(Base, InteropMixin):
7070 The statistic used in encoding, mean, variance or median of the
7171 target.
7272
73+ Attributes
74+ ----------
75+ categories_ : list of cupy.ndarray
76+ The categories of each input feature determined during fitting.
77+ Each element is an array of unique category values for that feature,
78+ sorted in ascending order.
79+ n_features_in_ : int
80+ Number of features seen during :meth:`fit`.
81+ encode_all : cudf.DataFrame
82+ DataFrame containing the learned encodings for all category
83+ combinations. Used internally for transforming new data.
84+ mean : float
85+ The overall mean of the target variable, computed during fitting.
86+ Used for smoothing and imputing unseen categories.
87+ y_stat_val : float
88+ The statistic value (mean, variance, or median) of the target
89+ variable, depending on the ``stat`` parameter. Used to impute
90+ encodings for unseen categories.
91+ train : cudf.DataFrame or None
92+ The training DataFrame used during fitting, containing the original
93+ features, target values, and fold assignments. Set to ``None`` if
94+ the encoder was loaded from a sklearn model via :meth:`from_sklearn`.
95+ train_encode : cuml.internals.array.CumlArray or None
96+ The encoded values for the training data, computed during
97+ :meth:`fit` or :meth:`fit_transform`. Set to ``None`` if the
98+ encoder was loaded from a sklearn model via :meth:`from_sklearn`.
99+
73100 References
74101 ----------
75102 .. [1] https://maxhalford.github.io/blog/target-encoding/
@@ -169,6 +196,20 @@ def fit(self, x, y, fold_ids=None):
169196 self : TargetEncoder
170197 A fitted instance of itself to allow method chaining
171198 """
199+ if y is None :
200+ raise TypeError (
201+ f"Input of type { type (y )} is not cudf.Series, "
202+ "pandas.Series, "
203+ "numpy.ndarray, "
204+ "or cupy.ndarray"
205+ )
206+
207+ if len (x ) == 0 :
208+ raise ValueError (
209+ "Found array with 0 sample(s) while a minimum of 1 is "
210+ "required."
211+ )
212+
172213 if self .split_method == "customize" and fold_ids is None :
173214 raise ValueError (
174215 "`fold_ids` is required "
@@ -246,9 +287,17 @@ def transform(self, x) -> CumlArray:
246287 """
247288 self ._check_is_fitted ()
248289 test = self ._data_with_strings_to_cudf_dataframe (x )
290+
291+ # Check feature dimensions match
292+ x_cols = [i for i in test .columns .tolist () if i != self .id_col ]
293+ if hasattr (self , "n_features_in_" ) and len (x_cols ) != self .n_features_in_ :
294+ raise ValueError (
295+ f"X has { len (x_cols )} features, but TargetEncoder is "
296+ f"expecting { self .n_features_in_ } features as input."
297+ )
298+
249299 if self ._is_train_df (test ):
250300 return self .train_encode
251- x_cols = [i for i in test .columns .tolist () if i != self .id_col ]
252301 test = test .merge (self .encode_all , on = x_cols , how = "left" )
253302 return self ._impute_and_sort (test )
254303
@@ -259,6 +308,19 @@ def _fit_transform(self, x, y, fold_ids):
259308 cp .random .seed (self .seed )
260309 train = self ._data_with_strings_to_cudf_dataframe (x )
261310 x_cols = [i for i in train .columns .tolist () if i != self .id_col ]
311+
312+ # Store n_features_in_ and categories_ for sklearn interop
313+ self .n_features_in_ = len (x_cols )
314+ self ._x_cols = x_cols
315+
316+ # Extract unique categories for each feature (sorted for consistency)
317+ self .categories_ = []
318+ for col in x_cols :
319+ unique_vals = train [col ].unique ()
320+ # Sort for deterministic ordering
321+ unique_vals = unique_vals .sort_values ()
322+ self .categories_ .append (unique_vals .values )
323+
262324 train [self .y_col ] = self ._make_y_column (y )
263325
264326 self .n_folds = min (self .n_folds , len (train ))
@@ -422,9 +484,11 @@ def _groupby_agg(self, train, x_cols, op, y_cols):
422484 return df_each_fold , df_all
423485
424486 def _check_is_fitted (self ):
425- if not self ._fitted or self .train is None :
487+ # Check if fitted - either via fit() or from_sklearn()
488+ # When loaded from sklearn, train may be None but encode_all exists
489+ if not self ._fitted and not hasattr (self , "encode_all" ):
426490 msg = (
427- "This LabelEncoder instance is not fitted yet. Call 'fit' "
491+ "This TargetEncoder instance is not fitted yet. Call 'fit' "
428492 "with appropriate arguments before using this estimator."
429493 )
430494 raise NotFittedError (msg )
@@ -434,6 +498,9 @@ def _is_train_df(self, df):
434498 Return True if the dataframe `df` is the training dataframe, which
435499 is used in `fit_transform`
436500 """
501+ # If train is None (e.g., loaded from sklearn), we can't compare
502+ if self .train is None :
503+ return False
437504 if len (df ) != len (self .train ):
438505 return False
439506 self .train = self .train .sort_values (self .id_col ).reset_index (drop = True )
@@ -521,17 +588,112 @@ def _params_to_cpu(self):
521588 return params
522589
523590 def _attrs_from_cpu (self , model ):
591+ from cuml .internals .interop import UnsupportedOnGPU
592+
593+ categories_gpu = [to_gpu (cat ) for cat in model .categories_ ]
594+ n_features = len (model .categories_ )
595+
596+ # Generate column names matching cuML's internal format
597+ if n_features == 1 :
598+ x_cols = [self .x_col ]
599+ else :
600+ x_cols = [f"{ self .x_col } _{ i } " for i in range (n_features )]
601+
602+ # Build the encode_all DataFrame
603+ if n_features == 1 :
604+ encode_all = cudf .DataFrame ({
605+ x_cols [0 ]: model .categories_ [0 ],
606+ self .out_col : model .encodings_ [0 ],
607+ })
608+ else :
609+ # Multi-feature case: sklearn encodes each feature independently
610+ # while cuML encodes feature combinations. We approximate by
611+ # creating the cartesian product and averaging encodings.
612+ from itertools import product
613+
614+ total_combinations = 1
615+ for cats in model .categories_ :
616+ total_combinations *= len (cats )
617+
618+ max_combinations = 100_000
619+ if total_combinations > max_combinations :
620+ raise UnsupportedOnGPU (
621+ f"Converting multi-feature sklearn TargetEncoder would "
622+ f"require { total_combinations :,} category combinations, "
623+ f"exceeding the limit of { max_combinations :,} . Consider "
624+ f"using single-feature TargetEncoder instead."
625+ )
626+
627+ warnings .warn (
628+ "Converting multi-feature sklearn TargetEncoder to cuML uses "
629+ "an approximation (averaged per-feature encodings). Results "
630+ "may differ from both sklearn and native cuML behavior." ,
631+ UserWarning ,
632+ )
633+
634+ all_cats = [list (cat ) for cat in model .categories_ ]
635+ all_encs = [list (enc ) for enc in model .encodings_ ]
636+
637+ # Create cartesian product of all categories
638+ rows = []
639+ for combo in product (* [range (len (c )) for c in all_cats ]):
640+ row = {}
641+ enc_sum = 0.0
642+ for i , idx in enumerate (combo ):
643+ row [x_cols [i ]] = all_cats [i ][idx ]
644+ enc_sum += all_encs [i ][idx ]
645+ # Average the encodings for combined categories
646+ row [self .out_col ] = enc_sum / n_features
647+ rows .append (row )
648+ encode_all = cudf .DataFrame (rows )
649+
524650 return {
525- "encode_all" : to_gpu (model .encodings_ ),
526- "categories_" : to_gpu (model .categories_ ),
527- "mean" : to_gpu (model .target_mean_ ),
651+ "encode_all" : encode_all ,
652+ "categories_" : categories_gpu ,
653+ "_x_cols" : x_cols ,
654+ "mean" : float (model .target_mean_ ),
655+ "y_stat_val" : float (model .target_mean_ ),
656+ "_fitted" : True ,
657+ "train" : None ,
658+ "train_encode" : None ,
528659 ** super ()._attrs_from_cpu (model ),
529660 }
530661
531662 def _attrs_to_cpu (self , model ):
663+ # Convert categories_ to list of numpy arrays
664+ categories_cpu = [to_cpu (cat ) for cat in self .categories_ ]
665+
666+ n_features = len (self .categories_ )
667+ if n_features > 1 :
668+ warnings .warn (
669+ "Converting multi-feature cuML TargetEncoder to sklearn uses "
670+ "an approximation (averaged combination encodings per feature). "
671+ "Results may differ from native sklearn behavior." ,
672+ UserWarning ,
673+ )
674+
675+ # Convert encode_all DataFrame to list of encodings per feature
676+ # sklearn expects encodings_[i] to have shape (n_categories_i,)
677+ # with encoding values in the same order as categories_[i]
678+ encodings_list = []
679+ for i , (col , cats ) in enumerate (zip (self ._x_cols , self .categories_ )):
680+ feature_encodings = []
681+ for cat_val in cats :
682+ mask = self .encode_all [col ] == cat_val
683+ if mask .any ():
684+ # For multi-feature, average across all combinations
685+ # containing this category value
686+ enc_val = float (
687+ self .encode_all .loc [mask , self .out_col ].mean ()
688+ )
689+ else :
690+ enc_val = float (self .mean )
691+ feature_encodings .append (enc_val )
692+ encodings_list .append (np .array (feature_encodings ))
693+
532694 return {
533- "encodings_" : to_cpu ( self . encode_all ) ,
534- "categories_" : to_cpu ( self . categories_ ) ,
535- "target_mean_" : to_cpu (self .mean ),
695+ "encodings_" : encodings_list ,
696+ "categories_" : categories_cpu ,
697+ "target_mean_" : float (self .mean ),
536698 ** super ()._attrs_to_cpu (model ),
537699 }
0 commit comments