Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions python/cuml/cuml/accel/_wrappers/sklearn/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ class RandomForestRegressor(ProxyBase):
_gpu_class = cuml.ensemble.RandomForestRegressor

def _gpu_fit(self, X, y, sample_weight=None):
if sample_weight is not None:
raise UnsupportedOnGPU("`sample_weight` is not supported")

try:
y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
except ValueError:
Expand All @@ -24,9 +27,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
raise UnsupportedOnGPU(
"Multi-output targets are not supported"
)

if sample_weight is not None:
raise UnsupportedOnGPU("`sample_weight` is not supported")
return self._gpu.fit(X, y)

def _gpu_score(self, X, y, sample_weight=None):
Expand All @@ -48,6 +48,9 @@ class RandomForestClassifier(ProxyBase):
_gpu_class = cuml.ensemble.RandomForestClassifier

def _gpu_fit(self, X, y, sample_weight=None):
if sample_weight is not None:
raise UnsupportedOnGPU("`sample_weight` is not supported")

try:
y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
except ValueError:
Expand All @@ -62,9 +65,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
raise UnsupportedOnGPU(
"Multi-output targets are not supported"
)

if sample_weight is not None:
raise UnsupportedOnGPU("`sample_weight` is not supported")
return self._gpu.fit(X, y)

def _gpu_score(self, X, y, sample_weight=None):
Expand Down
205 changes: 199 additions & 6 deletions python/cuml/cuml/common/classification.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
import warnings

import cudf
import cupy as cp
import numpy as np
import pandas as pd

from cuml.internals.array import CumlArray
from cuml.internals.input_utils import input_to_cupy_array
from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
from cuml.internals.memory_utils import cuda_ptr
from cuml.internals.output_utils import cudf_to_pandas

is_integral = cp.ReductionKernel(
"T x",
Expand All @@ -20,10 +25,6 @@

def check_classification_targets(y):
"""Check if `y` is composed of valid class labels"""
# TODO: improve this check. This is just a stopgap for now since otherwise
# regression targets will be handled as normal, which may possibly be very
# expensive. We'll roll this into a common preprocessing routine in a
# followup.
if y.dtype.kind == "f" and not is_integral(y):
raise ValueError(
"Unknown label type: continuous. Maybe you are trying to fit a "
Expand All @@ -32,6 +33,198 @@ def check_classification_targets(y):
)


def preprocess_labels(
    y, dtype=None, order="C", n_samples=None, allow_multitarget=False
):
    """Preprocess the `y` input to a classifier.

    Encodes arbitrary class labels as integer codes in ``[0, n_classes - 1]``
    and returns both the codes and the original class values, handling numpy,
    pandas, cupy, and cudf inputs (including non-numeric label dtypes).

    Parameters
    ----------
    y : array-like
        The labels for fitting, may be any type cuml supports as input.
    dtype : dtype, optional
        The output dtype to use for the encoded labels. If not provided,
        a data-dependent integral type will be used.
    order : {"C", "F"}, optional
        The array order to use for the encoded labels.
    n_samples : int, optional
        If provided, will raise an error if the number of samples in `y`
        doesn't match.
    allow_multitarget : bool, optional
        Whether to allow multi-target labels.

    Returns
    -------
    y_encoded : cp.ndarray
        The labels, encoded as integers in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The classes as a numpy array, or a list of numpy arrays if
        y is multi-target.
    """
    # cudf may coerce the dtype, store the original so we can cast back later
    y_dtype = y.dtype if isinstance(y, np.ndarray) else None

    # No cuda container supports all dtypes. Here we coerce to cupy when
    # possible, falling back to cudf Series/DataFrame otherwise.
    if isinstance(y, np.ndarray) and y.dtype.kind in "iufb":
        y = cp.asarray(y)
    elif isinstance(y, pd.DataFrame):
        y = cudf.DataFrame(y)
    elif isinstance(y, pd.Series):
        y = cudf.Series(y)
    elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)):
        # Non-numeric dtype, always go through cudf
        y = input_to_cuml_array(y, convert_to_mem_type=False).array
        if y.dtype.kind in "iufb":
            y = y.to_output("cupy")
        else:
            # Fixed-width unicode ("U") isn't supported by cudf; coerce to
            # object dtype so the string values round-trip correctly.
            y = (cudf.DataFrame if y.ndim == 2 else cudf.Series)(
                y, dtype=(np.dtype("O") if y.dtype.kind in "U" else None)
            )

    # Validate dimensionality, ensuring 1D/2D y is as expected
    if y.ndim == 2 and y.shape[1] == 1:
        warnings.warn(
            "A column-vector y was passed when a 1d array was expected. Please "
            "change the shape of y to (n_samples,), for example using ravel()."
        )
        y = y.iloc[:, 0] if isinstance(y, cudf.DataFrame) else y.ravel()
    elif allow_multitarget and y.ndim not in (1, 2):
        raise ValueError(
            f"y should be a 1d or 2d array, got an array of shape {y.shape} instead."
        )
    elif not allow_multitarget and y.ndim != 1:
        raise ValueError(
            f"y should be a 1d array, got an array of shape {y.shape} instead."
        )

    # Validate correct number of samples
    if n_samples is not None and y.shape[0] != n_samples:
        raise ValueError(
            f"Expected `y` with {n_samples} samples, got {y.shape[0]}"
        )

    def _encode(y):
        """Encode `y` to codes and classes"""
        check_classification_targets(y)
        if isinstance(y, cudf.Series):
            y = y.astype("category")
            codes = cp.asarray(y.cat.codes)
            classes = y.cat.categories.to_numpy()
            # cudf will sometimes translate non-numeric dtypes. Coerce back to
            # the input dtype if the input was originally a numpy array.
            if y_dtype is not None:
                classes = classes.astype(y_dtype, copy=False)
        else:
            classes, codes = cp.unique(y, return_inverse=True)
            classes = classes.get()
        return codes, classes

    if y.ndim == 1:
        y_encoded, classes = _encode(y)
        if dtype is not None:
            y_encoded = y_encoded.astype(dtype, copy=False)
    else:
        # Multi-target: encode each column independently, then assemble the
        # codes into a single 2D array with a common dtype.
        getter = y.iloc if isinstance(y, cudf.DataFrame) else y
        encoded_cols, classes = zip(
            *(_encode(getter[:, i]) for i in range(y.shape[1]))
        )
        classes = list(classes)
        if dtype is None:
            dtype = cp.result_type(*(c.dtype for c in encoded_cols))
        y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order)
        for i, col in enumerate(encoded_cols):
            y_encoded[:, i] = col

    return y_encoded, classes


def decode_labels(y_encoded, classes, output_type="cupy"):
    """Convert encoded labels back into their original classes.

    Inverse of ``preprocess_labels``: maps integer codes back to the original
    class values, then coerces the result to the requested output container.

    Parameters
    ----------
    y_encoded : cp.ndarray
        The labels, encoded as integers in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The array of classes, or a list of arrays if multi-target.
    output_type : str, optional
        The type to output. May be any of the output types cuml supports.

    Returns
    -------
    labels
        The decoded labels, as output type ``output_type``.
    """
    if isinstance(classes, list):
        # Multi-target output
        dtype = (
            classes[0].dtype
            if len(set(c.dtype for c in classes)) == 1
            else None
        )
        if dtype is not None and dtype.kind in "iufb":
            # All dtypes are identical and numeric, we can use cupy here
            if all((c == np.arange(len(c))).all() for c in classes):
                # Fast path for common case of monotonically increasing numeric classes
                labels = y_encoded.astype(dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.empty(shape=y_encoded.shape, dtype=dtype)
                for i, c in enumerate(classes):
                    labels[:, i] = cp.asarray(c).take(y_encoded[:, i])

            out = CumlArray(labels)
        else:
            # At least one class is non-numeric, we need to use cudf
            out = cudf.DataFrame(
                {
                    i: cudf.Series(c)
                    .take(y_encoded[:, i])
                    .reset_index(drop=True)
                    for i, c in enumerate(classes)
                }
            )
    else:
        # Single-target output
        dtype = classes.dtype
        if classes.dtype.kind in "iufb":
            # Numeric dtype, we can use cupy here
            if (classes == np.arange(len(classes))).all():
                # Fast path for common case of monotonically increasing numeric classes
                labels = y_encoded.astype(classes.dtype, copy=False)
            else:
                # Need to transform y_encoded back to classes
                labels = cp.asarray(classes).take(y_encoded)

            out = CumlArray(labels)
        else:
            # Non-numeric classes. We use cudf since it supports all types, and will
            # error appropriately later on when converting to outputs like `cupy`
            # that don't support strings.
            out = cudf.Series(classes).take(y_encoded).reset_index(drop=True)

    # Coerce result to requested output_type
    if isinstance(out, CumlArray):
        # Common numeric case, can just rely on CumlArray here
        return out.to_output(output_type)
    elif (
        output_type in ("cudf", "df_obj")
        or (output_type == "dataframe" and isinstance(out, cudf.DataFrame))
        or (output_type == "series" and isinstance(out, cudf.Series))
    ):
        return out
    elif output_type == "pandas":
        return cudf_to_pandas(out)
    elif output_type in ("numpy", "array"):
        return out.to_numpy(dtype=dtype)
    else:
        # Remaining output types (e.g. "cupy") can't represent non-numeric
        # labels; fail with an informative message.
        raise TypeError(
            f"{output_type=!r} doesn't support outputs of dtype "
            f"{dtype or 'object'} and shape {y_encoded.shape}"
        )


def process_class_weight(
classes,
y_ind,
Expand All @@ -48,7 +241,7 @@ def process_class_weight(
An array of classes for this classifier.
y_ind : cp.ndarray
An integral array of the transformed labels, where values (in [0,
n_classes - 1]) Are indices into `classes` mapping `y_ind` back to the
n_classes - 1]) are indices into `classes` mapping `y_ind` back to the
original `y`.
class_weight : dict, 'balanced', or None
If `"balanced"`, classes are weighted by the inverse of their
Expand Down
47 changes: 19 additions & 28 deletions python/cuml/cuml/ensemble/randomforestclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
import cuml.internals
import cuml.internals.nvtx as nvtx
from cuml.common.array_descriptor import CumlArrayDescriptor
from cuml.common.classification import check_classification_targets
from cuml.common.classification import decode_labels, preprocess_labels
from cuml.common.doc_utils import generate_docstring, insert_into_docstring
from cuml.ensemble.randomforest_common import BaseRandomForestModel
from cuml.internals.array import CumlArray
from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
from cuml.internals.interop import UnsupportedOnGPU, to_cpu, to_gpu
from cuml.internals.input_utils import input_to_cuml_array
from cuml.internals.interop import UnsupportedOnGPU
from cuml.internals.mixins import ClassifierMixin
from cuml.metrics import accuracy_score

Expand Down Expand Up @@ -141,17 +141,17 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):

Attributes
----------
classes_ : np.ndarray, shape=(n_classes,)
A sorted array of the class labels.
oob_score_ : float
Score of the training dataset obtained using an out-of-bag estimate.
This attribute exists only when ``oob_score`` is True.

oob_decision_function_ : ndarray of shape (n_samples, n_classes)
Decision function computed with out-of-bag estimate on the training
set. If n_estimators is small it might be possible that a data point
was never left out during the bootstrap. In this case,
``oob_decision_function_`` might contain NaN. This attribute exists
only when ``oob_score`` is True.

feature_importances_ : ndarray of shape (n_features,)
The impurity-based feature importances.

Expand All @@ -168,8 +168,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
`importances = cuml_model.feature_importances_`
"""

classes_ = CumlArrayDescriptor()

oob_decision_function_ = CumlArrayDescriptor(order="C")

_cpu_class_path = "sklearn.ensemble.RandomForestClassifier"
Expand All @@ -182,14 +180,14 @@ def _params_from_cpu(cls, model):

def _attrs_from_cpu(self, model):
return {
"classes_": to_gpu(model.classes_),
"classes_": model.classes_,
"n_classes_": model.n_classes_,
**super()._attrs_from_cpu(model),
}

def _attrs_to_cpu(self, model):
return {
"classes_": to_cpu(self.classes_),
"classes_": self.classes_,
"n_classes_": self.n_classes_,
**super()._attrs_to_cpu(model),
}
Expand Down Expand Up @@ -237,13 +235,12 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
check_dtype=[np.float32, np.float64],
order="F",
).array
y = input_to_cupy_array(y, check_rows=X_m.shape[0], check_cols=1).array
check_classification_targets(y)

classes, y = cp.unique(y, return_inverse=True)
self.classes_ = CumlArray(data=classes)
self.n_classes_ = len(self.classes_)
y_m = CumlArray(data=y.astype(cp.int32, copy=False))
y, classes = preprocess_labels(
y, n_samples=X_m.shape[0], dtype=cp.int32
)
self.classes_ = classes
self.n_classes_ = len(classes)
y_m = CumlArray(data=y)

return self._fit_forest(X_m, y_m)

Expand All @@ -255,7 +252,7 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
parameters=[("dense", "(n_samples, n_features)")],
return_values=[("dense", "(n_samples, 1)")],
)
@cuml.internals.api_base_return_array(get_output_dtype=True)
@cuml.internals.api_base_return_any_skipall
def predict(
self,
X,
Expand All @@ -265,7 +262,7 @@ def predict(
layout="depth_first",
default_chunk_size=None,
align_bytes=None,
) -> CumlArray:
):
"""
Predicts the labels for X.

Expand Down Expand Up @@ -296,16 +293,10 @@ def predict(
default_chunk_size=default_chunk_size,
align_bytes=align_bytes,
)
preds = fil.predict(X, threshold=threshold)

if not (
self.classes_.dtype.kind == "i"
and (self.classes_ == cp.arange(self.n_classes_)).all()
):
preds = CumlArray(
self.classes_.to_output("cupy").take(preds.to_output("cupy"))
)
return preds
inds = fil.predict(X, threshold=threshold).to_output("cupy")
with cuml.internals.exit_internal_api():
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is gross, but necessary so with cuml.using_output_type(...) actually works on the predict method. This was broken before in LogisticRegression, but is now fixed (and tested in the generic test). I hope this is not a long lived hack with upcoming refactors we're thinking about to type reflection.

output_type = self._get_output_type(X)
return decode_labels(inds, self.classes_, output_type=output_type)

@insert_into_docstring(
parameters=[("dense", "(n_samples, n_features)")],
Expand Down
Loading