11# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
22# SPDX-License-Identifier: Apache-2.0
3+ import warnings
4+
5+ import cudf
36import cupy as cp
47import numpy as np
8+ import pandas as pd
59
610from cuml .internals .array import CumlArray
7- from cuml .internals .input_utils import input_to_cupy_array
11+ from cuml .internals .input_utils import input_to_cuml_array , input_to_cupy_array
812from cuml .internals .memory_utils import cuda_ptr
13+ from cuml .internals .output_utils import cudf_to_pandas
914
1015is_integral = cp .ReductionKernel (
1116 "T x" ,
2025
2126def check_classification_targets (y ):
2227 """Check if `y` is composed of valid class labels"""
23- # TODO: improve this check. This is just a stopgap for now since otherwise
24- # regression targets will be handled as normal, which may possibly be very
25- # expensive. We'll roll this into a common preprocessing routine in a
26- # followup.
2728 if y .dtype .kind == "f" and not is_integral (y ):
2829 raise ValueError (
2930 "Unknown label type: continuous. Maybe you are trying to fit a "
@@ -32,6 +33,198 @@ def check_classification_targets(y):
3233 )
3334
3435
def preprocess_labels(
    y, dtype=None, order="C", n_samples=None, allow_multitarget=False
):
    """Validate classifier targets and encode them as integer codes.

    Parameters
    ----------
    y : array-like
        Target labels, in any input format cuml accepts.
    dtype : dtype, optional
        Dtype of the returned codes. When omitted, an integral dtype
        determined by the data is used.
    order : {"C", "F"}, optional
        Memory layout used when allocating the encoded multi-target output.
    n_samples : int, optional
        Expected number of rows in `y`; a mismatch raises a ``ValueError``.
    allow_multitarget : bool, optional
        Whether 2d (multi-target) labels are accepted.

    Returns
    -------
    y_encoded : cp.ndarray
        Integer codes in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The classes as a numpy array; for multi-target `y`, a list holding
        one array per target column.
    """
    numeric_kinds = "iufb"
    came_from_numpy = isinstance(y, np.ndarray)
    # Conversion through cudf can change the dtype, so remember what a numpy
    # input looked like in order to cast the classes back afterwards.
    numpy_dtype = y.dtype if came_from_numpy else None

    def _encode(column):
        """Return ``(codes, classes)`` for a single 1d target."""
        check_classification_targets(column)
        if not isinstance(column, cudf.Series):
            uniques, inverse = cp.unique(column, return_inverse=True)
            return inverse, uniques.get()

        categorical = column.astype("category")
        codes = cp.asarray(categorical.cat.codes)
        classes = categorical.cat.categories.to_numpy()
        if numpy_dtype is not None:
            # Undo any dtype translation cudf applied to a numpy input.
            classes = classes.astype(numpy_dtype, copy=False)
        return codes, classes

    # Move `y` into a device container. Numeric data becomes a cupy array;
    # everything else is held in a cudf Series/DataFrame since no single
    # container supports every dtype.
    if came_from_numpy and y.dtype.kind in numeric_kinds:
        y = cp.asarray(y)
    elif isinstance(y, pd.DataFrame):
        y = cudf.DataFrame(y)
    elif isinstance(y, pd.Series):
        y = cudf.Series(y)
    elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)):
        # Any other input type: normalize first, then choose by dtype.
        y = input_to_cuml_array(y, convert_to_mem_type=False).array
        if y.dtype.kind in numeric_kinds:
            y = y.to_output("cupy")
        else:
            container = cudf.DataFrame if y.ndim == 2 else cudf.Series
            forced_dtype = np.dtype("O") if y.dtype.kind in "U" else None
            y = container(y, dtype=forced_dtype)

    # Check the dimensionality, flattening a single-column 2d input.
    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was expected. Please "
            "change the shape of y to (n_samples,), for example using ravel()."
        )
        if isinstance(y, cudf.DataFrame):
            y = y.iloc[:, 0]
        else:
            y = y.ravel()
    elif allow_multitarget:
        if y.ndim not in (1, 2):
            raise ValueError(
                f"y should be a 1d or 2d array, got an array of shape {y.shape} instead."
            )
    elif y.ndim != 1:
        raise ValueError(
            f"y should be a 1d array, got an array of shape {y.shape} instead."
        )

    # Check the number of samples when the caller supplied one.
    if n_samples is not None and y.shape[0] != n_samples:
        raise ValueError(
            f"Expected `y` with {n_samples} samples, got {y.shape[0]} "
        )

    # Single-target labels: encode once and optionally cast.
    if y.ndim == 1:
        y_encoded, classes = _encode(y)
        if dtype is not None:
            y_encoded = y_encoded.astype(dtype, copy=False)
        return y_encoded, classes

    # Multi-target labels: encode each column independently, then pack the
    # per-column codes into one 2d array.
    columns = y.iloc if isinstance(y, cudf.DataFrame) else y
    encoded_pairs = [_encode(columns[:, j]) for j in range(y.shape[1])]
    per_target_codes, per_target_classes = zip(*encoded_pairs)
    classes = list(per_target_classes)
    if dtype is None:
        dtype = cp.result_type(*[codes.dtype for codes in per_target_codes])
    y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order)
    for j, codes in enumerate(per_target_codes):
        y_encoded[:, j] = codes

    return y_encoded, classes
140+
141+
def decode_labels(y_encoded, classes, output_type="cupy"):
    """Convert encoded labels back into their original classes.

    Parameters
    ----------
    y_encoded : cp.ndarray
        The labels, encoded as integers in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The array of classes, or a list of arrays if multi-target.
    output_type : str, optional
        The type to output. May be any of the output types cuml supports.

    Returns
    -------
    labels
        The decoded labels, as output type ``output_type``.

    Raises
    ------
    TypeError
        If the decoded labels had to be held in a cudf object (non-numeric
        or mixed-dtype classes) and ``output_type`` is not one of the
        cudf, pandas or numpy outputs handled for that case.
    """
    if isinstance(classes, list):
        # Multi-target output. A common dtype only exists when every target
        # shares it; otherwise `dtype` stays None.
        dtypes = {c.dtype for c in classes}
        dtype = classes[0].dtype if len(dtypes) == 1 else None
        if dtype is not None and dtype.kind in "iufb":
            # All dtypes are identical and numeric, we can use cupy here
            if all((c == np.arange(len(c))).all() for c in classes):
                # Fast path: classes are exactly 0..n-1 for every target, so
                # the codes already are the labels (up to dtype).
                labels = y_encoded.astype(dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.empty(shape=y_encoded.shape, dtype=dtype)
                for i, c in enumerate(classes):
                    labels[:, i] = cp.asarray(c).take(y_encoded[:, i])

            out = CumlArray(labels)
        else:
            # At least one target is non-numeric (or dtypes differ), we need
            # to use cudf
            out = cudf.DataFrame(
                {
                    i: cudf.Series(c)
                    .take(y_encoded[:, i])
                    .reset_index(drop=True)
                    for i, c in enumerate(classes)
                }
            )
    else:
        # Single-target output
        dtype = classes.dtype
        if dtype.kind in "iufb":
            # Numeric dtype, we can use cupy here
            if (classes == np.arange(len(classes))).all():
                # Fast path: classes are exactly 0..n-1, so the codes already
                # are the labels (up to dtype).
                labels = y_encoded.astype(dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.asarray(classes).take(y_encoded)

            out = CumlArray(labels)
        else:
            # Non-numeric classes. We use cudf since it supports all types,
            # and error below for outputs like `cupy` that can't hold them.
            out = cudf.Series(classes).take(y_encoded).reset_index(drop=True)

    # Coerce result to requested output_type
    if isinstance(out, CumlArray):
        # Common numeric case, can just rely on CumlArray here
        return out.to_output(output_type)
    if (
        output_type in ("cudf", "df_obj")
        or (output_type == "dataframe" and isinstance(out, cudf.DataFrame))
        or (output_type == "series" and isinstance(out, cudf.Series))
    ):
        return out
    if output_type == "pandas":
        return cudf_to_pandas(out)
    if output_type in ("numpy", "array"):
        return out.to_numpy(dtype=dtype)

    # Test `dtype` against None explicitly: a plain NumPy dtype has
    # `len(dtype) == 0` and can therefore be falsy, so `dtype or "object"`
    # would misreport e.g. `<U5` as "object".
    dtype_name = "object" if dtype is None else dtype
    raise TypeError(
        f"{output_type=!r} doesn't support outputs of dtype "
        f"{dtype_name} and shape {y_encoded.shape}"
    )
226+
227+
35228def process_class_weight (
36229 classes ,
37230 y_ind ,
@@ -48,7 +241,7 @@ def process_class_weight(
48241 An array of classes for this classifier.
49242 y_ind : cp.ndarray
50243 An integral array of the transformed labels, where values (in [0,
51- n_classes - 1]) Are indices into `classes` mapping `y_ind` back to the
244+ n_classes - 1]) are indices into `classes` mapping `y_ind` back to the
52245 original `y`.
53246 class_weight : dict, 'balanced', or None
54247 If `"balanced"`, classes are weighted by the inverse of their
0 commit comments