rapidsai
diff --git a/python/cuml/cuml/accel/_wrappers/sklearn/ensemble.py b/python/cuml/cuml/accel/_wrappers/sklearn/ensemble.py
Lines changed: 6 additions & 6 deletions
diff --git a/python/cuml/cuml/common/classification.py b/python/cuml/cuml/common/classification.py
Lines changed: 199 additions & 6 deletions
diff --git a/python/cuml/cuml/ensemble/randomforestclassifier.py b/python/cuml/cuml/ensemble/randomforestclassifier.py
Lines changed: 19 additions & 28 deletions
@@ -15,6 +15,9 @@ class RandomForestRegressor(ProxyBase):
     _gpu_class = cuml.ensemble.RandomForestRegressor
 
     def _gpu_fit(self, X, y, sample_weight=None):
+        if sample_weight is not None:
+            raise UnsupportedOnGPU("`sample_weight` is not supported")
+
         try:
             y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
         except ValueError:
@@ -24,9 +27,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
                 raise UnsupportedOnGPU(
                     "Multi-output targets are not supported"
                 )
-
-        if sample_weight is not None:
-            raise UnsupportedOnGPU("`sample_weight` is not supported")
         return self._gpu.fit(X, y)
 
     def _gpu_score(self, X, y, sample_weight=None):
@@ -48,6 +48,9 @@ class RandomForestClassifier(ProxyBase):
     _gpu_class = cuml.ensemble.RandomForestClassifier
 
     def _gpu_fit(self, X, y, sample_weight=None):
+        if sample_weight is not None:
+            raise UnsupportedOnGPU("`sample_weight` is not supported")
+
         try:
             y = input_to_cuml_array(y, convert_to_mem_type=False)[0]
         except ValueError:
@@ -62,9 +65,6 @@ def _gpu_fit(self, X, y, sample_weight=None):
                     raise UnsupportedOnGPU(
                         "Multi-output targets are not supported"
                     )
-
-        if sample_weight is not None:
-            raise UnsupportedOnGPU("`sample_weight` is not supported")
         return self._gpu.fit(X, y)
 
     def _gpu_score(self, X, y, sample_weight=None):
 
@@ -1,11 +1,16 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
+import warnings
+
+import cudf
 import cupy as cp
 import numpy as np
+import pandas as pd
 
 from cuml.internals.array import CumlArray
-from cuml.internals.input_utils import input_to_cupy_array
+from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
 from cuml.internals.memory_utils import cuda_ptr
+from cuml.internals.output_utils import cudf_to_pandas
 
 is_integral = cp.ReductionKernel(
     "T x",
@@ -20,10 +25,6 @@
 
 def check_classification_targets(y):
     """Check if `y` is composed of valid class labels"""
-    # TODO: improve this check. This is just a stopgap for now since otherwise
-    # regression targets will be handled as normal, which may possibly be very
-    # expensive. We'll roll this into a common preprocessing routine in a
-    # followup.
     if y.dtype.kind == "f" and not is_integral(y):
         raise ValueError(
             "Unknown label type: continuous. Maybe you are trying to fit a "
@@ -32,6 +33,198 @@ def check_classification_targets(y):
         )
 
 
+def preprocess_labels(
+    y, dtype=None, order="C", n_samples=None, allow_multitarget=False
+):
+    """Validate classifier labels `y` and encode them as integer codes.
+
+    Parameters
+    ----------
+    y : array-like
+        The labels for fitting, may be any type cuml supports as input.
+    dtype : dtype, optional
+        The output dtype to use for the encoded labels. If not provided,
+        a data-dependent integral type will be used.
+    order : {"C", "F"}, optional
+        The array order of the encoded labels (applied to 2D output only).
+    n_samples : int, optional
+        If provided, a ``ValueError`` is raised when `y` does not have
+        exactly this many samples.
+    allow_multitarget : bool, optional
+        Whether 2D labels are allowed. An (n, 1) `y` is flattened either way.
+
+    Returns
+    -------
+    y_encoded : cp.ndarray
+        The labels, encoded as integers in [0, n_classes - 1].
+    classes : np.ndarray or list[np.ndarray]
+        The classes as a numpy array, or a list of numpy arrays (one per
+        column) if y is multi-target.
+    """
+    # cudf may coerce the dtype, store the original so we can cast back later
+    y_dtype = y.dtype if isinstance(y, np.ndarray) else None
+
+    # No cuda container supports all dtypes. Here we coerce to cupy when
+    # possible, falling back to cudf Series/DataFrame otherwise.
+    if isinstance(y, np.ndarray) and y.dtype.kind in "iufb":
+        y = cp.asarray(y)
+    elif isinstance(y, pd.DataFrame):
+        y = cudf.DataFrame(y)
+    elif isinstance(y, pd.Series):
+        y = cudf.Series(y)
+    elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)):
+        # Any other input: convert via cuml; numeric -> cupy, else -> cudf
+        y = input_to_cuml_array(y, convert_to_mem_type=False).array
+        if y.dtype.kind in "iufb":
+            y = y.to_output("cupy")
+        else:
+            y = (cudf.DataFrame if y.ndim == 2 else cudf.Series)(
+                y, dtype=(np.dtype("O") if y.dtype.kind in "U" else None)
+            )
+
+    # Validate dimensionality; a single-column 2D y is flattened with a warning
+    if y.ndim == 2 and y.shape[1] == 1:
+        warnings.warn(
+            "A column-vector y was passed when a 1d array was expected. Please "
+            "change the shape of y to (n_samples,), for example using ravel()."
+        )
+        y = y.iloc[:, 0] if isinstance(y, cudf.DataFrame) else y.ravel()
+    elif allow_multitarget and y.ndim not in (1, 2):
+        raise ValueError(
+            f"y should be a 1d or 2d array, got an array of shape {y.shape} instead."
+        )
+    elif not allow_multitarget and y.ndim != 1:
+        raise ValueError(
+            f"y should be a 1d array, got an array of shape {y.shape} instead."
+        )
+
+    # Validate correct number of samples
+    if n_samples is not None and y.shape[0] != n_samples:
+        raise ValueError(
+            f"Expected `y` with {n_samples} samples, got {y.shape[0]}"
+        )
+
+    def _encode(y):
+        """Encode a single 1D target `y` into (codes, classes)"""
+        check_classification_targets(y)
+        if isinstance(y, cudf.Series):
+            y = y.astype("category")
+            codes = cp.asarray(y.cat.codes)
+            classes = y.cat.categories.to_numpy()
+            # cudf will sometimes translate non-numeric dtypes. Coerce back to
+            # the input dtype if the input was originally a numpy array.
+            if y_dtype is not None:
+                classes = classes.astype(y_dtype, copy=False)
+        else:
+            classes, codes = cp.unique(y, return_inverse=True)
+            classes = classes.get()
+        return codes, classes
+
+    if y.ndim == 1:
+        y_encoded, classes = _encode(y)
+        if dtype is not None:
+            y_encoded = y_encoded.astype(dtype, copy=False)
+    else:
+        getter = y.iloc if isinstance(y, cudf.DataFrame) else y
+        encoded_cols, classes = zip(
+            *(_encode(getter[:, i]) for i in range(y.shape[1]))
+        )
+        classes = list(classes)
+        if dtype is None:
+            dtype = cp.result_type(*(c.dtype for c in encoded_cols))
+        y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order)
+        for i, col in enumerate(encoded_cols):
+            y_encoded[:, i] = col
+
+    return y_encoded, classes
+
+
+def decode_labels(y_encoded, classes, output_type="cupy"):
+    """Convert encoded labels back into their original classes.
+
+    Parameters
+    ----------
+    y_encoded : cp.ndarray
+        The labels, encoded as integers in [0, n_classes - 1].
+    classes : np.ndarray or list[np.ndarray]
+        The array of classes, or a list of arrays if multi-target.
+    output_type : str, optional
+        The type to output. May be any of the output types cuml supports.
+
+    Returns
+    -------
+    labels
+        The decoded labels, as output type ``output_type``.
+
+    Raises
+    ------
+    TypeError
+        If the labels are non-numeric and ``output_type`` can't hold them.
+    """
+    if isinstance(classes, list):
+        # Multi-target output; a common dtype exists only if all columns agree
+        dtypes = {c.dtype for c in classes}
+        dtype = dtypes.pop() if len(dtypes) == 1 else None
+        if dtype is not None and dtype.kind in "iufb":
+            # All dtypes are identical and numeric, we can use cupy here
+            if all((c == np.arange(len(c))).all() for c in classes):
+                # Fast path for common case of monotonically increasing numeric classes
+                labels = y_encoded.astype(dtype, copy=False)
+            else:
+                # Need to transform y_encoded back to classes
+                labels = cp.empty(shape=y_encoded.shape, dtype=dtype)
+                for i, c in enumerate(classes):
+                    labels[:, i] = cp.asarray(c).take(y_encoded[:, i])
+            out = CumlArray(labels)
+        else:
+            # At least one class is non-numeric, we need to use cudf
+            out = cudf.DataFrame(
+                {
+                    i: cudf.Series(c)
+                    .take(y_encoded[:, i])
+                    .reset_index(drop=True)
+                    for i, c in enumerate(classes)
+                }
+            )
+    else:
+        # Single-target output
+        dtype = classes.dtype
+        if dtype.kind in "iufb":
+            # Numeric dtype, we can use cupy here
+            if (classes == np.arange(len(classes))).all():
+                # Fast path for common case of monotonically increasing numeric classes
+                labels = y_encoded.astype(dtype, copy=False)
+            else:
+                # Need to transform y_encoded back to classes
+                labels = cp.asarray(classes).take(y_encoded)
+            out = CumlArray(labels)
+        else:
+            # Non-numeric classes. cudf supports all types; conversion to
+            # outputs that can't hold strings (e.g. `cupy`) is rejected below.
+            out = cudf.Series(classes).take(y_encoded).reset_index(drop=True)
+
+    # Coerce result to requested output_type; numeric results are CumlArrays
+    if isinstance(out, CumlArray):
+        return out.to_output(output_type)
+    elif (
+        output_type in ("cudf", "df_obj")
+        or (output_type == "dataframe" and isinstance(out, cudf.DataFrame))
+        or (output_type == "series" and isinstance(out, cudf.Series))
+    ):
+        return out
+    elif output_type == "pandas":
+        return cudf_to_pandas(out)
+    elif output_type in ("numpy", "array"):
+        return out.to_numpy(dtype=dtype)
+    else:
+        # Unstructured np.dtype has len() 0 (falsy), so test None explicitly
+        dtype_name = "object" if dtype is None else dtype
+        raise TypeError(
+            f"{output_type=!r} doesn't support outputs of dtype "
+            f"{dtype_name} and shape {y_encoded.shape}"
+        )
+
+
 def process_class_weight(
     classes,
     y_ind,
@@ -48,7 +241,7 @@ def process_class_weight(
         An array of classes for this classifier.
     y_ind : cp.ndarray
         An integral array of the transformed labels, where values (in [0,
-        n_classes - 1]) Are indices into `classes` mapping `y_ind` back to the
+        n_classes - 1]) are indices into `classes` mapping `y_ind` back to the
         original `y`.
     class_weight : dict, 'balanced', or None
         If `"balanced"`, classes are weighted by the inverse of their
 
@@ -7,12 +7,12 @@
 import cuml.internals
 import cuml.internals.nvtx as nvtx
 from cuml.common.array_descriptor import CumlArrayDescriptor
-from cuml.common.classification import check_classification_targets
+from cuml.common.classification import decode_labels, preprocess_labels
 from cuml.common.doc_utils import generate_docstring, insert_into_docstring
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
 from cuml.internals.array import CumlArray
-from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array
-from cuml.internals.interop import UnsupportedOnGPU, to_cpu, to_gpu
+from cuml.internals.input_utils import input_to_cuml_array
+from cuml.internals.interop import UnsupportedOnGPU
 from cuml.internals.mixins import ClassifierMixin
 from cuml.metrics import accuracy_score
 
@@ -141,17 +141,17 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
 
     Attributes
     ----------
+    classes_ : np.ndarray, shape=(n_classes,)
+        A sorted array of the class labels.
     oob_score_ : float
         Score of the training dataset obtained using an out-of-bag estimate.
         This attribute exists only when ``oob_score`` is True.
-
     oob_decision_function_ : ndarray of shape (n_samples, n_classes)
         Decision function computed with out-of-bag estimate on the training
         set. If n_estimators is small it might be possible that a data point
         was never left out during the bootstrap. In this case,
         ``oob_decision_function_`` might contain NaN. This attribute exists
         only when ``oob_score`` is True.
-
     feature_importances_ : ndarray of shape (n_features,)
         The impurity-based feature importances.
 
@@ -168,8 +168,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin):
     `importances = cuml_model.feature_importances_`
     """
 
-    classes_ = CumlArrayDescriptor()
-
     oob_decision_function_ = CumlArrayDescriptor(order="C")
 
     _cpu_class_path = "sklearn.ensemble.RandomForestClassifier"
@@ -182,14 +180,14 @@ def _params_from_cpu(cls, model):
 
     def _attrs_from_cpu(self, model):
         return {
-            "classes_": to_gpu(model.classes_),
+            "classes_": model.classes_,
             "n_classes_": model.n_classes_,
             **super()._attrs_from_cpu(model),
         }
 
     def _attrs_to_cpu(self, model):
         return {
-            "classes_": to_cpu(self.classes_),
+            "classes_": self.classes_,
             "n_classes_": self.n_classes_,
             **super()._attrs_to_cpu(model),
         }
@@ -237,13 +235,12 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
             check_dtype=[np.float32, np.float64],
             order="F",
         ).array
-        y = input_to_cupy_array(y, check_rows=X_m.shape[0], check_cols=1).array
-        check_classification_targets(y)
-
-        classes, y = cp.unique(y, return_inverse=True)
-        self.classes_ = CumlArray(data=classes)
-        self.n_classes_ = len(self.classes_)
-        y_m = CumlArray(data=y.astype(cp.int32, copy=False))
+        y, classes = preprocess_labels(
+            y, n_samples=X_m.shape[0], dtype=cp.int32
+        )
+        self.classes_ = classes
+        self.n_classes_ = len(classes)
+        y_m = CumlArray(data=y)
 
         return self._fit_forest(X_m, y_m)
 
@@ -255,7 +252,7 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier":
         parameters=[("dense", "(n_samples, n_features)")],
         return_values=[("dense", "(n_samples, 1)")],
     )
-    @cuml.internals.api_base_return_array(get_output_dtype=True)
+    @cuml.internals.api_base_return_any_skipall
     def predict(
         self,
         X,
@@ -265,7 +262,7 @@ def predict(
         layout="depth_first",
         default_chunk_size=None,
         align_bytes=None,
-    ) -> CumlArray:
+    ):
         """
         Predicts the labels for X.
 
@@ -296,16 +293,10 @@ def predict(
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
         )
-        preds = fil.predict(X, threshold=threshold)
-
-        if not (
-            self.classes_.dtype.kind == "i"
-            and (self.classes_ == cp.arange(self.n_classes_)).all()
-        ):
-            preds = CumlArray(
-                self.classes_.to_output("cupy").take(preds.to_output("cupy"))
-            )
-        return preds
+        inds = fil.predict(X, threshold=threshold).to_output("cupy")
+        with cuml.internals.exit_internal_api():
+            output_type = self._get_output_type(X)
+        return decode_labels(inds, self.classes_, output_type=output_type)
 
     @insert_into_docstring(
         parameters=[("dense", "(n_samples, n_features)")],