-
Notifications
You must be signed in to change notification settings - Fork 623
Support non-numeric class labels everywhere #7480
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a46f6b0
356f6b3
1965f73
4be47ba
f755555
1b3d1d7
bb1d7d1
1edb6f2
3fcb14f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,16 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| import warnings | ||
|
|
||
| import cudf | ||
| import cupy as cp | ||
| import numpy as np | ||
| import pandas as pd | ||
|
|
||
| from cuml.internals.array import CumlArray | ||
| from cuml.internals.input_utils import input_to_cupy_array | ||
| from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array | ||
| from cuml.internals.memory_utils import cuda_ptr | ||
| from cuml.internals.output_utils import cudf_to_pandas | ||
|
|
||
| is_integral = cp.ReductionKernel( | ||
| "T x", | ||
|
|
@@ -20,10 +25,6 @@ | |
|
|
||
| def check_classification_targets(y): | ||
| """Check if `y` is composed of valid class labels""" | ||
| # TODO: improve this check. This is just a stopgap for now since otherwise | ||
| # regression targets will be handled as normal, which may possibly be very | ||
| # expensive. We'll roll this into a common preprocessing routine in a | ||
| # followup. | ||
| if y.dtype.kind == "f" and not is_integral(y): | ||
| raise ValueError( | ||
| "Unknown label type: continuous. Maybe you are trying to fit a " | ||
|
|
@@ -32,6 +33,198 @@ def check_classification_targets(y): | |
| ) | ||
|
|
||
|
|
||
| def preprocess_labels( | ||
| y, dtype=None, order="C", n_samples=None, allow_multitarget=False | ||
| ): | ||
| """Preprocess the `y` input to a classifier. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| y : array-like | ||
| The labels for fitting, may be any type cuml supports as input. | ||
| dtype : dtype, optional | ||
| The output dtype to use for the encoded labels. If not provided, | ||
| a data-dependent integral type will be used. | ||
| order : {"C", "F"}, optional | ||
| The array order to use for the encoded labels. | ||
| n_samples : int, optional | ||
| If provided, will raise an error if the number of samples in `y` | ||
| doesn't match. | ||
| allow_multitarget : bool, optional | ||
| Whether to allow multi-target labels. | ||
|
|
||
| Returns | ||
| ------- | ||
| y_encoded : cp.ndarray | ||
| The labels, encoded as integers in [0, n_classes - 1]. | ||
| classes : np.ndarray or list[np.ndarray] | ||
| The classes as a numpy array, or a list of numpy arrays if | ||
| y is multi-target. | ||
| """ | ||
| # cudf may coerce the dtype, store the original so we can cast back later | ||
| y_dtype = y.dtype if isinstance(y, np.ndarray) else None | ||
|
|
||
| # No cuda container supports all dtypes. Here we coerce to cupy when | ||
| # possible, falling back to cudf Series/DataFrame otherwise. | ||
| if isinstance(y, np.ndarray) and y.dtype.kind in "iufb": | ||
| y = cp.asarray(y) | ||
| elif isinstance(y, pd.DataFrame): | ||
| y = cudf.DataFrame(y) | ||
| elif isinstance(y, pd.Series): | ||
| y = cudf.Series(y) | ||
| elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)): | ||
| # Non-numeric dtype, always go through cudf | ||
| y = input_to_cuml_array(y, convert_to_mem_type=False).array | ||
| if y.dtype.kind in "iufb": | ||
| y = y.to_output("cupy") | ||
| else: | ||
| y = (cudf.DataFrame if y.ndim == 2 else cudf.Series)( | ||
| y, dtype=(np.dtype("O") if y.dtype.kind in "U" else None) | ||
| ) | ||
|
|
||
| # Validate dimensionality, ensuring 1D/2D y is as expected | ||
| if y.ndim == 2 and y.shape[1] == 1: | ||
| warnings.warn( | ||
| "A column-vector y was passed when a 1d array was expected. Please " | ||
| "change the shape of y to (n_samples,), for example using ravel()." | ||
| ) | ||
| y = y.iloc[:, 0] if isinstance(y, cudf.DataFrame) else y.ravel() | ||
| elif allow_multitarget and y.ndim not in (1, 2): | ||
| raise ValueError( | ||
| f"y should be a 1d or 2d array, got an array of shape {y.shape} instead." | ||
| ) | ||
| elif not allow_multitarget and y.ndim != 1: | ||
| raise ValueError( | ||
| f"y should be a 1d array, got an array of shape {y.shape} instead." | ||
| ) | ||
|
|
||
| # Validate correct number of samples | ||
| if n_samples is not None and y.shape[0] != n_samples: | ||
| raise ValueError( | ||
| f"Expected `y` with {n_samples} samples, got {y.shape[0]}" | ||
| ) | ||
|
|
||
| def _encode(y): | ||
| """Encode `y` to codes and classes""" | ||
| check_classification_targets(y) | ||
| if isinstance(y, cudf.Series): | ||
| y = y.astype("category") | ||
| codes = cp.asarray(y.cat.codes) | ||
| classes = y.cat.categories.to_numpy() | ||
| # cudf will sometimes translate non-numeric dtypes. Coerce back to | ||
| # the input dtype if the input was originally a numpy array. | ||
| if y_dtype is not None: | ||
| classes = classes.astype(y_dtype, copy=False) | ||
| else: | ||
| classes, codes = cp.unique(y, return_inverse=True) | ||
| classes = classes.get() | ||
| return codes, classes | ||
|
|
||
| if y.ndim == 1: | ||
| y_encoded, classes = _encode(y) | ||
| if dtype is not None: | ||
| y_encoded = y_encoded.astype(dtype, copy=False) | ||
| else: | ||
| getter = y.iloc if isinstance(y, cudf.DataFrame) else y | ||
| encoded_cols, classes = zip( | ||
| *(_encode(getter[:, i]) for i in range(y.shape[1])) | ||
| ) | ||
| classes = list(classes) | ||
| if dtype is None: | ||
| dtype = cp.result_type(*(c.dtype for c in encoded_cols)) | ||
| y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order) | ||
| for i, col in enumerate(encoded_cols): | ||
| y_encoded[:, i] = col | ||
|
|
||
| return y_encoded, classes | ||
|
|
||
|
|
||
| def decode_labels(y_encoded, classes, output_type="cupy"): | ||
| """Convert encoded labels back into their original classes. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| y_encoded : cp.ndarray | ||
| The labels, encoded as integers in [0, n_classes - 1]. | ||
| classes : np.ndarray or list[np.ndarray] | ||
| The array of classes, or a list of arrays if multi-target. | ||
| output_type : str, optional | ||
| The type to output. May be any of the output types cuml supports. | ||
|
|
||
| Returns | ||
| ------- | ||
| labels | ||
| The decoded labels, as output type ``output_type``. | ||
| """ | ||
| if isinstance(classes, list): | ||
| # Multi-target output | ||
| dtype = ( | ||
| classes[0].dtype | ||
| if len(set(c.dtype for c in classes)) == 1 | ||
| else None | ||
| ) | ||
| if dtype is not None and dtype.kind in "iufb": | ||
| # All dtypes are identical and numeric, we can use cupy here | ||
| if all((c == np.arange(len(c))).all() for c in classes): | ||
| # Fast path for common case of monotonically increasing numeric classes | ||
| labels = y_encoded.astype(dtype, copy=False) | ||
| else: | ||
| # Need to transform y_encoded back to classes | ||
| labels = cp.empty(shape=y_encoded.shape, dtype=dtype) | ||
| for i, c in enumerate(classes): | ||
| labels[:, i] = cp.asarray(c).take(y_encoded[:, i]) | ||
|
|
||
| out = CumlArray(labels) | ||
| else: | ||
| # At least one class is non-numeric, we need to use cudf | ||
| out = cudf.DataFrame( | ||
| { | ||
| i: cudf.Series(c) | ||
| .take(y_encoded[:, i]) | ||
| .reset_index(drop=True) | ||
| for i, c in enumerate(classes) | ||
| } | ||
| ) | ||
| else: | ||
| # Single-target output | ||
| dtype = classes.dtype | ||
| if classes.dtype.kind in "iufb": | ||
| # Numeric dtype, we can use cupy here | ||
| if (classes == np.arange(len(classes))).all(): | ||
| # Fast path for common case of monotonically increasing numeric classes | ||
| labels = y_encoded.astype(classes.dtype, copy=False) | ||
| else: | ||
| # Need to transform y_encoded back to classes | ||
| labels = cp.asarray(classes).take(y_encoded) | ||
|
|
||
| out = CumlArray(labels) | ||
| else: | ||
| # Non-numeric classes. We use cudf since it supports all types, and will | ||
| # error appropriately later on when converting to outputs like `cupy` | ||
| # that don't support strings. | ||
| out = cudf.Series(classes).take(y_encoded).reset_index(drop=True) | ||
|
|
||
| # Coerce result to requested output_type | ||
| if isinstance(out, CumlArray): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is messy, but is basically a generalization of what we already added to `LogisticRegression`. |
||
| # Common numeric case, can just rely on CumlArray here | ||
| return out.to_output(output_type) | ||
| elif ( | ||
| output_type in ("cudf", "df_obj") | ||
| or (output_type == "dataframe" and isinstance(out, cudf.DataFrame)) | ||
| or (output_type == "series" and isinstance(out, cudf.Series)) | ||
| ): | ||
| return out | ||
| elif output_type == "pandas": | ||
| return cudf_to_pandas(out) | ||
| elif output_type in ("numpy", "array"): | ||
| return out.to_numpy(dtype=dtype) | ||
| else: | ||
| raise TypeError( | ||
| f"{output_type=!r} doesn't support outputs of dtype " | ||
| f"{dtype or 'object'} and shape {y_encoded.shape}" | ||
| ) | ||
|
|
||
|
|
||
| def process_class_weight( | ||
| classes, | ||
| y_ind, | ||
|
|
@@ -48,7 +241,7 @@ def process_class_weight( | |
| An array of classes for this classifier. | ||
| y_ind : cp.ndarray | ||
| An integral array of the transformed labels, where values (in [0, | ||
| n_classes - 1]) Are indices into `classes` mapping `y_ind` back to the | ||
| n_classes - 1]) are indices into `classes` mapping `y_ind` back to the | ||
| original `y`. | ||
| class_weight : dict, 'balanced', or None | ||
| If `"balanced"`, classes are weighted by the inverse of their | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,12 +7,12 @@ | |
| import cuml.internals | ||
| import cuml.internals.nvtx as nvtx | ||
| from cuml.common.array_descriptor import CumlArrayDescriptor | ||
| from cuml.common.classification import check_classification_targets | ||
| from cuml.common.classification import decode_labels, preprocess_labels | ||
| from cuml.common.doc_utils import generate_docstring, insert_into_docstring | ||
| from cuml.ensemble.randomforest_common import BaseRandomForestModel | ||
| from cuml.internals.array import CumlArray | ||
| from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array | ||
| from cuml.internals.interop import UnsupportedOnGPU, to_cpu, to_gpu | ||
| from cuml.internals.input_utils import input_to_cuml_array | ||
| from cuml.internals.interop import UnsupportedOnGPU | ||
| from cuml.internals.mixins import ClassifierMixin | ||
| from cuml.metrics import accuracy_score | ||
|
|
||
|
|
@@ -141,17 +141,17 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): | |
|
|
||
| Attributes | ||
| ---------- | ||
| classes_ : np.ndarray, shape=(n_classes,) | ||
| A sorted array of the class labels. | ||
| oob_score_ : float | ||
| Score of the training dataset obtained using an out-of-bag estimate. | ||
| This attribute exists only when ``oob_score`` is True. | ||
|
|
||
| oob_decision_function_ : ndarray of shape (n_samples, n_classes) | ||
| Decision function computed with out-of-bag estimate on the training | ||
| set. If n_estimators is small it might be possible that a data point | ||
| was never left out during the bootstrap. In this case, | ||
| ``oob_decision_function_`` might contain NaN. This attribute exists | ||
| only when ``oob_score`` is True. | ||
|
|
||
| feature_importances_ : ndarray of shape (n_features,) | ||
| The impurity-based feature importances. | ||
|
|
||
|
|
@@ -168,8 +168,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): | |
| `importances = cuml_model.feature_importances_` | ||
| """ | ||
|
|
||
| classes_ = CumlArrayDescriptor() | ||
|
|
||
| oob_decision_function_ = CumlArrayDescriptor(order="C") | ||
|
|
||
| _cpu_class_path = "sklearn.ensemble.RandomForestClassifier" | ||
|
|
@@ -182,14 +180,14 @@ def _params_from_cpu(cls, model): | |
|
|
||
| def _attrs_from_cpu(self, model): | ||
| return { | ||
| "classes_": to_gpu(model.classes_), | ||
| "classes_": model.classes_, | ||
| "n_classes_": model.n_classes_, | ||
| **super()._attrs_from_cpu(model), | ||
| } | ||
|
|
||
| def _attrs_to_cpu(self, model): | ||
| return { | ||
| "classes_": to_cpu(self.classes_), | ||
| "classes_": self.classes_, | ||
| "n_classes_": self.n_classes_, | ||
| **super()._attrs_to_cpu(model), | ||
| } | ||
|
|
@@ -237,13 +235,12 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier": | |
| check_dtype=[np.float32, np.float64], | ||
| order="F", | ||
| ).array | ||
| y = input_to_cupy_array(y, check_rows=X_m.shape[0], check_cols=1).array | ||
| check_classification_targets(y) | ||
|
|
||
| classes, y = cp.unique(y, return_inverse=True) | ||
| self.classes_ = CumlArray(data=classes) | ||
| self.n_classes_ = len(self.classes_) | ||
| y_m = CumlArray(data=y.astype(cp.int32, copy=False)) | ||
| y, classes = preprocess_labels( | ||
| y, n_samples=X_m.shape[0], dtype=cp.int32 | ||
| ) | ||
| self.classes_ = classes | ||
| self.n_classes_ = len(classes) | ||
| y_m = CumlArray(data=y) | ||
|
|
||
| return self._fit_forest(X_m, y_m) | ||
|
|
||
|
|
@@ -255,7 +252,7 @@ def fit(self, X, y, *, convert_dtype=True) -> "RandomForestClassifier": | |
| parameters=[("dense", "(n_samples, n_features)")], | ||
| return_values=[("dense", "(n_samples, 1)")], | ||
| ) | ||
| @cuml.internals.api_base_return_array(get_output_dtype=True) | ||
| @cuml.internals.api_base_return_any_skipall | ||
| def predict( | ||
| self, | ||
| X, | ||
|
|
@@ -265,7 +262,7 @@ def predict( | |
| layout="depth_first", | ||
| default_chunk_size=None, | ||
| align_bytes=None, | ||
| ) -> CumlArray: | ||
| ): | ||
| """ | ||
| Predicts the labels for X. | ||
|
|
||
|
|
@@ -296,16 +293,10 @@ def predict( | |
| default_chunk_size=default_chunk_size, | ||
| align_bytes=align_bytes, | ||
| ) | ||
| preds = fil.predict(X, threshold=threshold) | ||
|
|
||
| if not ( | ||
| self.classes_.dtype.kind == "i" | ||
| and (self.classes_ == cp.arange(self.n_classes_)).all() | ||
| ): | ||
| preds = CumlArray( | ||
| self.classes_.to_output("cupy").take(preds.to_output("cupy")) | ||
| ) | ||
| return preds | ||
| inds = fil.predict(X, threshold=threshold).to_output("cupy") | ||
| with cuml.internals.exit_internal_api(): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is gross, but necessary so … |
||
| output_type = self._get_output_type(X) | ||
| return decode_labels(inds, self.classes_, output_type=output_type) | ||
|
|
||
| @insert_into_docstring( | ||
| parameters=[("dense", "(n_samples, n_features)")], | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The logic in `preprocess_labels` and `decode_labels` is mostly the same logic we already had in `LogisticRegression`, just extracted and generalized so it can apply to all our classifiers.