Skip to content

Commit e39cbd3

Browse files
authored
Support non-numeric class labels everywhere (#7480)
Fixes #6267 Fixes #4169 Fixes #5684
1 parent ab72076 commit e39cbd3

17 files changed

Lines changed: 508 additions & 365 deletions

File tree

python/cuml/cuml/accel/_wrappers/sklearn/ensemble.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ class RandomForestRegressor(ProxyBase):
1515
_gpu_class = cuml.ensemble.RandomForestRegressor
1616

1717
def _gpu_fit(self, X, y, sample_weight=None):
18+
if sample_weight is not None:
19+
raise UnsupportedOnGPU("`sample_weight` is not supported")
20+
1821
try:
1922
y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
2023
except ValueError:
@@ -24,9 +27,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
2427
raise UnsupportedOnGPU(
2528
"Multi-output targets are not supported"
2629
)
27-
28-
if sample_weight is not None:
29-
raise UnsupportedOnGPU("`sample_weight` is not supported")
3030
return self._gpu.fit(X, y)
3131

3232
def _gpu_score(self, X, y, sample_weight=None):
@@ -48,6 +48,9 @@ class RandomForestClassifier(ProxyBase):
4848
_gpu_class = cuml.ensemble.RandomForestClassifier
4949

5050
def _gpu_fit(self, X, y, sample_weight=None):
51+
if sample_weight is not None:
52+
raise UnsupportedOnGPU("`sample_weight` is not supported")
53+
5154
try:
5255
y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
5356
except ValueError:
@@ -62,9 +65,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
6265
raise UnsupportedOnGPU(
6366
"Multi-output targets are not supported"
6467
)
65-
66-
if sample_weight is not None:
67-
raise UnsupportedOnGPU("`sample_weight` is not supported")
6868
return self._gpu.fit(X, y)
6969

7070
def _gpu_score(self, X, y, sample_weight=None):

python/cuml/cuml/common/classification.py

Lines changed: 199 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
3+
import warnings
4+
5+
import cudf
36
import cupy as cp
47
import numpy as np
8+
import pandas as pd
59

610
from cuml.internals.array import CumlArray
7-
from cuml.internals.input_utils import input_to_cupy_array
11+
from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
812
from cuml.internals.memory_utils import cuda_ptr
13+
from cuml.internals.output_utils import cudf_to_pandas
914

1015
is_integral = cp.ReductionKernel(
1116
"T x",
@@ -20,10 +25,6 @@
2025

2126
def check_classification_targets(y):
2227
"""Check if `y` is composed of valid class labels"""
23-
# TODO: improve this check. This is just a stopgap for now since otherwise
24-
# regression targets will be handled as normal, which may possibly be very
25-
# expensive. We'll roll this into a common preprocessing routine in a
26-
# followup.
2728
if y.dtype.kind == "f" and not is_integral(y):
2829
raise ValueError(
2930
"Unknown label type: continuous. Maybe you are trying to fit a "
@@ -32,6 +33,198 @@ def check_classification_targets(y):
3233
)
3334

3435

36+
def preprocess_labels(
    y, dtype=None, order="C", n_samples=None, allow_multitarget=False
):
    """Validate classifier targets and encode them as integer codes.

    Parameters
    ----------
    y : array-like
        The training labels, in any input type cuml accepts.
    dtype : dtype, optional
        Dtype of the returned codes. When omitted, the dtype produced by the
        encoding step is kept (for multi-target input, the common result type
        of the per-column codes).
    order : {"C", "F"}, optional
        Memory order of the encoded array. Only used when allocating the 2D
        output for multi-target input.
    n_samples : int, optional
        When given, a ``ValueError`` is raised if ``y.shape[0]`` differs.
    allow_multitarget : bool, optional
        Whether 2D (multi-target) labels are accepted.

    Returns
    -------
    y_encoded : cp.ndarray
        The labels as integer codes indexing into ``classes``.
    classes : np.ndarray or list[np.ndarray]
        The classes as a numpy array, or one array per column when ``y`` is
        multi-target.

    Raises
    ------
    ValueError
        If ``y`` has an unsupported number of dimensions, or its number of
        samples doesn't match ``n_samples``.
    """
    # Remember the dtype of numpy inputs: cudf may translate it, and the
    # classes are cast back to it once encoding is done.
    numpy_dtype = y.dtype if isinstance(y, np.ndarray) else None

    # No single device container handles every dtype: numeric data goes to
    # cupy, everything else is held in a cudf Series/DataFrame.
    if isinstance(y, np.ndarray) and y.dtype.kind in "iufb":
        y = cp.asarray(y)
    elif isinstance(y, pd.DataFrame):
        y = cudf.DataFrame(y)
    elif isinstance(y, pd.Series):
        y = cudf.Series(y)
    elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)):
        # Anything not already a recognised device container is normalised
        # through the generic cuml input path first.
        y = input_to_cuml_array(y, convert_to_mem_type=False).array
        if y.dtype.kind in "iufb":
            y = y.to_output("cupy")
        else:
            container = cudf.DataFrame if y.ndim == 2 else cudf.Series
            # Unicode string data is handed to cudf with an object dtype
            cudf_dtype = np.dtype("O") if y.dtype.kind in "U" else None
            y = container(y, dtype=cudf_dtype)

    # Dimensionality checks; a single-column 2D input is flattened to 1D
    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was expected. Please "
            "change the shape of y to (n_samples,), for example using ravel()."
        )
        if isinstance(y, cudf.DataFrame):
            y = y.iloc[:, 0]
        else:
            y = y.ravel()
    elif allow_multitarget:
        if y.ndim not in (1, 2):
            raise ValueError(
                f"y should be a 1d or 2d array, got an array of shape {y.shape} instead."
            )
    elif y.ndim != 1:
        raise ValueError(
            f"y should be a 1d array, got an array of shape {y.shape} instead."
        )

    # Sample-count check
    if n_samples is not None and y.shape[0] != n_samples:
        raise ValueError(
            f"Expected `y` with {n_samples} samples, got {y.shape[0]}"
        )

    def _factorize(column):
        """Split one 1D target into ``(codes, classes)``"""
        check_classification_targets(column)
        if not isinstance(column, cudf.Series):
            # cupy path
            uniques, inverse = cp.unique(column, return_inverse=True)
            return inverse, uniques.get()

        # cudf path: the categorical dtype supplies both codes and categories
        categorical = column.astype("category")
        inverse = cp.asarray(categorical.cat.codes)
        uniques = categorical.cat.categories.to_numpy()
        if numpy_dtype is not None:
            # Undo any dtype translation done by cudf for numpy inputs
            uniques = uniques.astype(numpy_dtype, copy=False)
        return inverse, uniques

    if y.ndim == 1:
        y_encoded, classes = _factorize(y)
        if dtype is not None:
            y_encoded = y_encoded.astype(dtype, copy=False)
        return y_encoded, classes

    # Multi-target: encode every column separately, then assemble the codes
    # into a single 2D array with the requested order.
    columns = y.iloc if isinstance(y, cudf.DataFrame) else y
    per_column_codes, per_column_classes = zip(
        *(_factorize(columns[:, j]) for j in range(y.shape[1]))
    )
    classes = list(per_column_classes)
    if dtype is None:
        dtype = cp.result_type(*(codes.dtype for codes in per_column_codes))
    y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order)
    for j, codes in enumerate(per_column_codes):
        y_encoded[:, j] = codes

    return y_encoded, classes
140+
141+
142+
def decode_labels(y_encoded, classes, output_type="cupy"):
    """Convert encoded labels back into their original classes.

    Parameters
    ----------
    y_encoded : cp.ndarray
        The labels, encoded as integers in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The array of classes, or a list of arrays if multi-target.
    output_type : str, optional
        The type to output. May be any of the output types cuml supports.

    Returns
    -------
    labels
        The decoded labels, as output type ``output_type``.

    Raises
    ------
    TypeError
        If the decoded labels are non-numeric and ``output_type`` is not one
        of the cudf, pandas or numpy output types handled below.
    """
    if isinstance(classes, list):
        # Multi-target output. `dtype` is only set when every target shares
        # one dtype; otherwise it stays None.
        dtype = (
            classes[0].dtype
            if len({c.dtype for c in classes}) == 1
            else None
        )
        if dtype is not None and dtype.kind in "iufb":
            # All dtypes are identical and numeric, we can use cupy here
            if all((c == np.arange(len(c))).all() for c in classes):
                # Fast path: every target's classes are exactly
                # 0..n_classes-1, so the codes already are the labels.
                labels = y_encoded.astype(dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.empty(shape=y_encoded.shape, dtype=dtype)
                for i, c in enumerate(classes):
                    labels[:, i] = cp.asarray(c).take(y_encoded[:, i])

            out = CumlArray(labels)
        else:
            # At least one class is non-numeric, we need to use cudf
            out = cudf.DataFrame(
                {
                    i: cudf.Series(c)
                    .take(y_encoded[:, i])
                    .reset_index(drop=True)
                    for i, c in enumerate(classes)
                }
            )
    else:
        # Single-target output
        dtype = classes.dtype
        if dtype.kind in "iufb":
            # Numeric dtype, we can use cupy here
            if (classes == np.arange(len(classes))).all():
                # Fast path: classes are exactly 0..n_classes-1, so the
                # codes already are the labels.
                labels = y_encoded.astype(dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.asarray(classes).take(y_encoded)

            out = CumlArray(labels)
        else:
            # Non-numeric classes. We use cudf since it supports all types, and will
            # error appropriately later on when converting to outputs like `cupy`
            # that don't support strings.
            out = cudf.Series(classes).take(y_encoded).reset_index(drop=True)

    # Coerce result to requested output_type
    if isinstance(out, CumlArray):
        # Common numeric case, can just rely on CumlArray here
        return out.to_output(output_type)
    elif (
        output_type in ("cudf", "df_obj")
        or (output_type == "dataframe" and isinstance(out, cudf.DataFrame))
        or (output_type == "series" and isinstance(out, cudf.Series))
    ):
        return out
    elif output_type == "pandas":
        return cudf_to_pandas(out)
    elif output_type in ("numpy", "array"):
        return out.to_numpy(dtype=dtype)
    else:
        # Test against None explicitly: a non-structured np.dtype has
        # len() == 0 and is therefore falsy, so `dtype or 'object'` would
        # report 'object' even when a concrete dtype is known.
        shown_dtype = "object" if dtype is None else dtype
        raise TypeError(
            f"{output_type=!r} doesn't support outputs of dtype "
            f"{shown_dtype} and shape {y_encoded.shape}"
        )
226+
227+
35228
def process_class_weight(
36229
classes,
37230
y_ind,
@@ -48,7 +241,7 @@ def process_class_weight(
48241
An array of classes for this classifier.
49242
y_ind : cp.ndarray
50243
An integral array of the transformed labels, where values (in [0,
51-
n_classes - 1]) Are indices into `classes` mapping `y_ind` back to the
244+
n_classes - 1]) are indices into `classes` mapping `y_ind` back to the
52245
original `y`.
53246
class_weight : dict, 'balanced', or None
54247
If `"balanced"`, classes are weighted by the inverse of their

python/cuml/cuml/ensemble/randomforestclassifier.py

Lines changed: 19 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77
import cuml.internals
88
import cuml.internals.nvtx as nvtx
99
from cuml.common.array_descriptor import CumlArrayDescriptor
10-
from cuml.common.classification import check_classification_targets
10+
from cuml.common.classification import decode_labels, preprocess_labels
1111
from cuml.common.doc_utils import generate_docstring, insert_into_docstring
1212
from cuml.ensemble.randomforest_common import BaseRandomForestModel
1313
from cuml.internals.array import CumlArray
14-
from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
15-
from cuml.internals.interop import UnsupportedOnGPU, to_cpu, to_gpu
14+
from cuml.internals.input_utils import input_to_cuml_array
15+
from cuml.internals.interop import UnsupportedOnGPU
1616
from cuml.internals.mixins import ClassifierMixin
1717
from cuml.metrics import accuracy_score
1818

@@ -141,17 +141,17 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
141141
142142
Attributes
143143
----------
144+
classes_ : np.ndarray, shape=(n_classes,)
145+
A sorted array of the class labels.
144146
oob_score_ : float
145147
Score of the training dataset obtained using an out-of-bag estimate.
146148
This attribute exists only when ``oob_score`` is True.
147-
148149
oob_decision_function_ : ndarray of shape (n_samples, n_classes)
149150
Decision function computed with out-of-bag estimate on the training
150151
set. If n_estimators is small it might be possible that a data point
151152
was never left out during the bootstrap. In this case,
152153
``oob_decision_function_`` might contain NaN. This attribute exists
153154
only when ``oob_score`` is True.
154-
155155
feature_importances_ : ndarray of shape (n_features,)
156156
The impurity-based feature importances.
157157
@@ -168,8 +168,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
168168
`importances = cuml_model.feature_importances_`
169169
"""
170170

171-
classes_ = CumlArrayDescriptor()
172-
173171
oob_decision_function_ = CumlArrayDescriptor(order="C")
174172

175173
_cpu_class_path = "sklearn.ensemble.RandomForestClassifier"
@@ -182,14 +180,14 @@ def _params_from_cpu(cls, model):
182180

183181
def _attrs_from_cpu(self, model):
184182
return {
185-
"classes_": to_gpu(model.classes_),
183+
"classes_": model.classes_,
186184
"n_classes_": model.n_classes_,
187185
**super()._attrs_from_cpu(model),
188186
}
189187

190188
def _attrs_to_cpu(self, model):
191189
return {
192-
"classes_": to_cpu(self.classes_),
190+
"classes_": self.classes_,
193191
"n_classes_": self.n_classes_,
194192
**super()._attrs_to_cpu(model),
195193
}
@@ -237,13 +235,12 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
237235
check_dtype=[np.float32, np.float64],
238236
order="F",
239237
).array
240-
y = input_to_cupy_array(y, check_rows=X_m.shape[0], check_cols=1).array
241-
check_classification_targets(y)
242-
243-
classes, y = cp.unique(y, return_inverse=True)
244-
self.classes_ = CumlArray(data=classes)
245-
self.n_classes_ = len(self.classes_)
246-
y_m = CumlArray(data=y.astype(cp.int32, copy=False))
238+
y, classes = preprocess_labels(
239+
y, n_samples=X_m.shape[0], dtype=cp.int32
240+
)
241+
self.classes_ = classes
242+
self.n_classes_ = len(classes)
243+
y_m = CumlArray(data=y)
247244

248245
return self._fit_forest(X_m, y_m)
249246

@@ -255,7 +252,7 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
255252
parameters=[("dense", "(n_samples, n_features)")],
256253
return_values=[("dense", "(n_samples, 1)")],
257254
)
258-
@cuml.internals.api_base_return_array(get_output_dtype=True)
255+
@cuml.internals.api_base_return_any_skipall
259256
def predict(
260257
self,
261258
X,
@@ -265,7 +262,7 @@ def predict(
265262
layout="depth_first",
266263
default_chunk_size=None,
267264
align_bytes=None,
268-
) -> CumlArray:
265+
):
269266
"""
270267
Predicts the labels for X.
271268
@@ -296,16 +293,10 @@ def predict(
296293
default_chunk_size=default_chunk_size,
297294
align_bytes=align_bytes,
298295
)
299-
preds = fil.predict(X, threshold=threshold)
300-
301-
if not (
302-
self.classes_.dtype.kind == "i"
303-
and (self.classes_ == cp.arange(self.n_classes_)).all()
304-
):
305-
preds = CumlArray(
306-
self.classes_.to_output("cupy").take(preds.to_output("cupy"))
307-
)
308-
return preds
296+
inds = fil.predict(X, threshold=threshold).to_output("cupy")
297+
with cuml.internals.exit_internal_api():
298+
output_type = self._get_output_type(X)
299+
return decode_labels(inds, self.classes_, output_type=output_type)
309300

310301
@insert_into_docstring(
311302
parameters=[("dense", "(n_samples, n_features)")],

0 commit comments

Comments
 (0)