|
1 | | -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. |
| 1 | +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | | -import warnings |
4 | | - |
5 | 3 | import cudf |
6 | 4 | import cupy as cp |
7 | 5 | import numpy as np |
8 | | -import pandas as pd |
9 | 6 |
|
10 | 7 | from cuml.internals.array import CumlArray, cuda_ptr |
11 | | -from cuml.internals.input_utils import input_to_cuml_array, input_to_cupy_array |
| 8 | +from cuml.internals.input_utils import input_to_cupy_array |
12 | 9 | from cuml.internals.output_utils import cudf_to_pandas |
13 | 10 |
|
# Reduction kernel that evaluates to True iff every element of the input
# equals its own ceiling, i.e. the array holds only integer-valued numbers.
#
# The comparison is carried out in double precision. The single-precision
# `ceilf` would narrow float64 elements to float32 first, so integer-valued
# doubles that float32 cannot represent exactly (e.g. 2**24 + 1) would
# compare unequal to themselves and be misreported as non-integral.
is_integral = cp.ReductionKernel(
    "T x",  # in_params: one input array of any element type T
    "bool out",  # out_params: a single boolean result
    "ceil((double)x) == (double)x",  # map_expr: is this element integral?
    "a && b",  # reduce_expr: all elements must satisfy the predicate
    "out = a",  # post_map_expr: emit the reduced value unchanged
    "true",  # identity: an empty input is treated as integral
    "is_integral",  # kernel name
)
23 | | - |
24 | | - |
def check_classification_targets(y):
    """Validate that `y` can be interpreted as discrete class labels.

    Non-floating-point inputs are accepted as-is. Floating-point inputs are
    accepted only when `is_integral` reports that every value is
    integer-valued.

    Parameters
    ----------
    y : array-like
        The labels to validate. Must expose a ``dtype`` attribute.

    Raises
    ------
    ValueError
        If `y` has a floating-point dtype and contains non-integral values.
    """
    # Only floating-point data can encode a continuous target.
    if y.dtype.kind != "f":
        return

    # Floats that are all whole numbers are still valid class labels.
    if is_integral(y):
        return

    raise ValueError(
        "Unknown label type: continuous. Maybe you are trying to fit a "
        "classifier, which expects discrete classes on a regression target "
        "with continuous values."
    )
33 | | - |
34 | | - |
def preprocess_labels(
    y, dtype=None, order="C", n_samples=None, allow_multitarget=False
):
    """Validate classifier targets and encode them as integer codes.

    Parameters
    ----------
    y : array-like
        The labels for fitting, may be any type cuml supports as input.
    dtype : dtype, optional
        Dtype of the returned codes. When omitted, whatever integral dtype
        the encoding step produces is used.
    order : {"C", "F"}, optional
        Memory layout of the returned codes. Only consulted when allocating
        the 2d output for multi-target `y`.
    n_samples : int, optional
        Expected length of `y` along its first axis. A mismatch raises.
    allow_multitarget : bool, optional
        Whether a 2d `y` (one column per target) is accepted.

    Returns
    -------
    y_encoded : cp.ndarray
        The labels, encoded as integers in [0, n_classes - 1].
    classes : np.ndarray or list[np.ndarray]
        The classes as a numpy array, or a list with one numpy array per
        column if `y` is multi-target.

    Raises
    ------
    ValueError
        If `y` has an unsupported number of dimensions, if its length does
        not match `n_samples`, or if a floating-point `y` is rejected by
        `check_classification_targets`.

    Warns
    -----
    UserWarning
        If `y` is a column vector of shape (n_samples, 1); it is flattened.
    """
    # Remember the dtype of numpy inputs: cudf may coerce it, and the
    # resulting classes are cast back to it further down.
    y_dtype = y.dtype if isinstance(y, np.ndarray) else None

    # No single cuda container handles every dtype, so prefer cupy whenever
    # the data is numeric/boolean and use cudf Series/DataFrame otherwise.
    if isinstance(y, np.ndarray) and y.dtype.kind in "iufb":
        y = cp.asarray(y)
    elif isinstance(y, pd.DataFrame):
        y = cudf.DataFrame(y)
    elif isinstance(y, pd.Series):
        y = cudf.Series(y)
    elif not isinstance(y, (cp.ndarray, cudf.DataFrame, cudf.Series)):
        y = input_to_cuml_array(y, convert_to_mem_type=False).array
        if y.dtype.kind in "iufb":
            y = y.to_output("cupy")
        else:
            # Non-numeric dtype, always go through cudf
            container = cudf.DataFrame if y.ndim == 2 else cudf.Series
            target_dtype = np.dtype("O") if y.dtype.kind == "U" else None
            y = container(y, dtype=target_dtype)

    # Dimensionality checks: squeeze column vectors, reject anything that
    # is neither 1d nor (when permitted) 2d.
    is_column_vector = y.ndim == 2 and y.shape[1] == 1
    if is_column_vector:
        warnings.warn(
            "A column-vector y was passed when a 1d array was expected. Please "
            "change the shape of y to (n_samples,), for example using ravel()."
        )
        if isinstance(y, cudf.DataFrame):
            y = y.iloc[:, 0]
        else:
            y = y.ravel()
    elif allow_multitarget:
        if y.ndim not in (1, 2):
            raise ValueError(
                f"y should be a 1d or 2d array, got an array of shape {y.shape} instead."
            )
    elif y.ndim != 1:
        raise ValueError(
            f"y should be a 1d array, got an array of shape {y.shape} instead."
        )

    # Sample-count check
    if n_samples is not None and y.shape[0] != n_samples:
        raise ValueError(
            f"Expected `y` with {n_samples} samples, got {y.shape[0]}"
        )

    def _encode(column):
        """Return ``(codes, classes)`` for a single 1d target column."""
        check_classification_targets(column)

        if not isinstance(column, cudf.Series):
            uniques, inverse = cp.unique(column, return_inverse=True)
            return inverse, uniques.get()

        categorical = column.astype("category")
        codes = cp.asarray(categorical.cat.codes)
        labels = categorical.cat.categories.to_numpy()
        # cudf sometimes translates non-numeric dtypes; restore the dtype of
        # the original numpy input when there was one.
        if y_dtype is not None:
            labels = labels.astype(y_dtype, copy=False)
        return codes, labels

    # Single-target: encode directly and optionally cast.
    if y.ndim == 1:
        y_encoded, classes = _encode(y)
        if dtype is not None:
            y_encoded = y_encoded.astype(dtype, copy=False)
        return y_encoded, classes

    # Multi-target: encode each column independently, then pack the codes
    # into one 2d array with the requested layout.
    column_source = y.iloc if isinstance(y, cudf.DataFrame) else y
    per_column = [_encode(column_source[:, j]) for j in range(y.shape[1])]
    encoded_cols, classes = zip(*per_column)
    classes = list(classes)

    if dtype is None:
        dtype = cp.result_type(*(codes.dtype for codes in encoded_cols))

    y_encoded = cp.empty(shape=y.shape, dtype=dtype, order=order)
    for j, codes in enumerate(encoded_cols):
        y_encoded[:, j] = codes

    return y_encoded, classes
139 | | - |
140 | 11 |
|
141 | 12 | def decode_labels(y_encoded, classes, output_type="cupy"): |
142 | 13 | """Convert encoded labels back into their original classes. |
|
0 commit comments