Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
71142e8
Support StandardScaler in cuml.accel.
csadorf Feb 3, 2026
215505d
partial_fit is not supported
csadorf Feb 4, 2026
2f448a1
fixup n_sample_seen sync to cpu
csadorf Feb 4, 2026
960d9f9
Fix error message
csadorf Feb 4, 2026
9fa8622
do not support complex input data
csadorf Feb 4, 2026
35dd633
Do not support object dtype
csadorf Feb 4, 2026
308b5e2
xfail check_transformer_data_not_an_array test
csadorf Feb 4, 2026
1c39bb6
Document support and limitations.
csadorf Feb 4, 2026
a1065d9
convert n_samples_seen_ when synced from cpu
csadorf Feb 4, 2026
e0ad693
document lack of support for sample_weight argument
csadorf Feb 4, 2026
a3944a7
update xfail list
csadorf Feb 4, 2026
c78da5e
address sklearn upstream failures
csadorf Feb 4, 2026
944b631
remove passing tests from xfail list
csadorf Feb 4, 2026
71613fb
more precise handling of sparse matrix related limitations
csadorf Feb 5, 2026
6e5b8c8
refactor preprocessing.py to deduplicate
csadorf Feb 5, 2026
3a8ae02
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
f32090e
be more precise about detection and conversion of list-like inputs
csadorf Feb 5, 2026
10f6821
Handle sample_weight parameter
csadorf Feb 5, 2026
661012b
Do not sync for get_feature_names_out
csadorf Feb 5, 2026
5ce80ca
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
e7a52e4
fall back for float16
csadorf Feb 5, 2026
4884e3e
fine-tune support matrix and limitation docs
csadorf Feb 5, 2026
51bc0fd
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
b0542c1
fixup xfail list version conditions
csadorf Feb 5, 2026
13869b0
Merge branch 'main' into add-cuml.accel-support-for-standardscaler
csadorf Feb 6, 2026
68db908
Restore erroneously removed entries from xfail list.
csadorf Feb 6, 2026
b8635bc
use scikit-learn approach for is-array-like check
csadorf Feb 9, 2026
c14de81
simplify sample_weight kwargs handling
csadorf Feb 9, 2026
f58ad9a
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 9, 2026
2740a65
Revert change to xfail list from b8635bc83076c45c07631862dbd5d5c147e5…
csadorf Feb 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/cuml-accel/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ the following estimators are mostly or entirely accelerated when run with
* ``sklearn.neighbors.KNeighborsClassifier``
* ``sklearn.neighbors.KNeighborsRegressor``
* ``sklearn.neighbors.KernelDensity``
* ``sklearn.preprocessing.StandardScaler``
* ``sklearn.preprocessing.TargetEncoder``
* ``sklearn.svm.SVC``
* ``sklearn.svm.SVR``
Expand Down
10 changes: 10 additions & 0 deletions docs/source/cuml-accel/limitations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,16 @@ Additional notes:
sklearn.preprocessing
---------------------

StandardScaler
^^^^^^^^^^^^^^

``StandardScaler`` will fall back to CPU in the following cases:

- If ``partial_fit`` is called (incremental learning not supported on GPU).
- If ``sample_weight`` is provided (weighted statistics not supported on GPU).
- If ``X`` has object dtype, half precision (float16) dtype, or complex dtype (``complex64``, ``complex128``).
- If ``X`` is a sparse matrix with integer dtype or in a format other than CSR or CSC.

Comment thread
coderabbitai[bot] marked this conversation as resolved.
TargetEncoder
^^^^^^^^^^^^^

Expand Down
45 changes: 44 additions & 1 deletion python/cuml/cuml/_thirdparty/sklearn/preprocessing/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# SPDX-FileCopyrightText: Eric Martin <eric@ericmart.in>
# SPDX-FileCopyrightText: Giorgio Patrini <giorgio.patrini@anu.edu.au>
# SPDX-FileCopyrightText: Eric Chang <ericchang2017@u.northwestern.edu>
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

# Original authors from Scikit-Learn:
Expand Down Expand Up @@ -40,6 +40,7 @@
SparseInputTagMixin,
StatelessTagMixin,
)
from cuml.internals.interop import InteropMixin, to_cpu, to_gpu

from ....common.array_descriptor import CumlArrayDescriptor
from ....internals.array import CumlArray
Expand Down Expand Up @@ -519,6 +520,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):

class StandardScaler(TransformerMixin,
BaseEstimator,
InteropMixin,
AllowNaNTagMixin,
SparseInputTagMixin):
"""Standardize features by removing the mean and scaling to unit variance
Expand Down Expand Up @@ -658,6 +660,47 @@ def _get_param_names(cls):
"copy"
]

# InteropMixin requirements
_cpu_class_path = "sklearn.preprocessing.StandardScaler"

@classmethod
def _params_from_cpu(cls, model):
"""Convert sklearn StandardScaler hyperparameters to cuML format."""
return {
"copy": model.copy,
"with_mean": model.with_mean,
"with_std": model.with_std,
}

def _params_to_cpu(self):
"""Convert cuML StandardScaler hyperparameters to sklearn format."""
return {
"copy": self.copy,
"with_mean": self.with_mean,
"with_std": self.with_std,
}

def _attrs_from_cpu(self, model):
    """Convert sklearn StandardScaler fitted attributes to cuML format."""
    # Each fitted attribute is optional (absent before fit, or None when the
    # corresponding statistic is disabled); only present values move to GPU.
    attrs = {}
    for name in ("mean_", "var_", "scale_", "n_samples_seen_"):
        value = getattr(model, name, None)
        attrs[name] = None if value is None else to_gpu(value)
    return {**attrs, **super()._attrs_from_cpu(model)}

def _attrs_to_cpu(self, model):
    """Convert cuML StandardScaler fitted attributes to sklearn format."""
    attrs = {}
    for name in ("mean_", "var_", "scale_"):
        value = getattr(self, name, None)
        attrs[name] = None if value is None else to_cpu(value)

    # n_samples_seen_ may be a scalar (dense fit) or an array (per-feature
    # counts); sklearn stores the scalar case as a numpy integer.
    seen = getattr(self, "n_samples_seen_", None)
    if seen is None:
        attrs["n_samples_seen_"] = None
    elif cpu_np.isscalar(seen):
        attrs["n_samples_seen_"] = cpu_np.int64(seen)
    else:
        attrs["n_samples_seen_"] = to_cpu(seen)

    return {**attrs, **super()._attrs_to_cpu(model)}

@reflect(reset=True)
def fit(self, X, y=None) -> "StandardScaler":
"""Compute the mean and std to be used for later scaling.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _check_n_features(self, X, reset):
)
if n_features != self.n_features_in_:
raise ValueError(
'X has {} features, but this {} is expecting {} features '
'X has {} features, but {} is expecting {} features '
'as input.'.format(n_features, self.__class__.__name__,
self.n_features_in_)
)
Expand Down
66 changes: 65 additions & 1 deletion python/cuml/cuml/accel/_wrappers/sklearn/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,77 @@
# SPDX-License-Identifier: Apache-2.0
#

import cupyx.scipy.sparse as cupy_sparse
import numpy as np
from scipy import sparse as sp_sparse

import cuml.preprocessing
from cuml.accel.estimator_proxy import ProxyBase
from cuml.internals.interop import UnsupportedOnGPU

__all__ = ("TargetEncoder",)
__all__ = ("StandardScaler", "TargetEncoder")


def _check_standardscaler_unsupported_inputs(X, **kwargs):
    """Check if inputs are supported by cuML's StandardScaler on GPU.

    Raises UnsupportedOnGPU for unsupported cases to trigger CPU fallback.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input data to validate.
    **kwargs
        Fit keyword arguments; the presence of ``sample_weight`` (even
        ``None``) triggers CPU fallback, since the GPU ``fit`` signature
        does not accept it.

    Raises
    ------
    UnsupportedOnGPU
        If the input cannot be handled by the GPU implementation.
    """
    if "sample_weight" in kwargs:
        raise UnsupportedOnGPU("sample_weight parameter not supported")

    # Reject complex, object, and float16 dtypes
    if hasattr(X, "dtype"):
        if np.issubdtype(X.dtype, np.complexfloating):
            raise UnsupportedOnGPU("Complex data types not supported")
        if X.dtype == np.object_:
            raise UnsupportedOnGPU("Object dtype not supported")
        if X.dtype == np.float16:
            raise UnsupportedOnGPU(
                "float16 dtype not supported on GPU (output would not preserve dtype)"
            )

    # Check for sparse matrices with unsupported properties
    if sp_sparse.issparse(X):
        # cupy sparse supports only float32/float64/complex64/complex128/bool,
        # so any integer dtype (not just int64) must fall back to CPU. This
        # matches the documented limitation ("sparse matrix with integer dtype").
        if np.issubdtype(X.dtype, np.integer):
            raise UnsupportedOnGPU(
                "Sparse matrices with integer dtype not supported on GPU "
                "(cupy sparse only supports float32, float64, complex64, complex128, bool)"
            )
        # cuML's StandardScaler algorithm only supports CSR/CSC formats.
        if X.format not in ("csr", "csc"):
            raise UnsupportedOnGPU(
                f"Sparse matrix format '{X.format}' not supported on GPU "
                "(only CSR and CSC formats are supported)"
            )
    elif cupy_sparse.issparse(X):
        # Check CuPy sparse matrices (not caught by scipy.sparse.issparse)
        # cuML's StandardScaler algorithm only supports CSR/CSC formats.
        if X.format not in ("csr", "csc"):
            raise UnsupportedOnGPU(
                f"CuPy sparse matrix format '{X.format}' not supported "
                "(only CSR and CSC formats are supported)"
            )
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class StandardScaler(ProxyBase):
    # Dispatches sklearn's StandardScaler to cuML's GPU implementation,
    # falling back to CPU for inputs the GPU path cannot handle.

    _gpu_class = cuml.preprocessing.StandardScaler

    def _gpu_fit(self, X, y=None, sample_weight=None):
        # Forward sample_weight to the validation helper only when set, so
        # an omitted weight does not trigger an unnecessary CPU fallback.
        extra = {}
        if sample_weight is not None:
            extra["sample_weight"] = sample_weight
        _check_standardscaler_unsupported_inputs(X, **extra)
        return self._gpu.fit(X, y)

    def _gpu_fit_transform(self, X, y=None, **fit_params):
        _check_standardscaler_unsupported_inputs(X, **fit_params)
        return self._gpu.fit_transform(X, y, **fit_params)

    def _gpu_partial_fit(self, X, y=None, sample_weight=None):
        """partial_fit has no GPU implementation; always fall back to CPU."""
        raise UnsupportedOnGPU("partial_fit not supported on GPU")


def _check_unsupported_inputs(X, y, cpu_model):
Expand Down
14 changes: 11 additions & 3 deletions python/cuml/cuml/accel/estimator_proxy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

Expand All @@ -8,7 +8,11 @@

import sklearn
from packaging.version import Version
from sklearn.base import BaseEstimator, ClassNamePrefixFeaturesOutMixin
from sklearn.base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
OneToOneFeatureMixin,
)
from sklearn.utils._set_output import _wrap_data_with_container

from cuml.accel import profilers
Expand Down Expand Up @@ -335,12 +339,16 @@ def _gpu_set_output(self, *, transform=None):

def _gpu_get_feature_names_out(self, input_features=None):
    # `get_feature_names_out` usually needs no fitted CPU attributes. Detect
    # the common sklearn mixin implementations and run them directly against
    # the GPU estimator, avoiding an unnecessary device -> host transfer.
    cpu_method = self._cpu_class.get_feature_names_out

    # ClassNamePrefixFeaturesOutMixin only reads `_n_features_out`;
    # OneToOneFeatureMixin reads `n_features_in_` (and, when present,
    # `feature_names_in_`). cuML models set `n_features_in_` on fit.
    reusable = (
        ClassNamePrefixFeaturesOutMixin.get_feature_names_out,
        OneToOneFeatureMixin.get_feature_names_out,
    )
    if cpu_method in reusable:
        return cpu_method(self._gpu, input_features=input_features)

    # Anything else may require fitted CPU state, so fall back to CPU.
    raise UnsupportedOnGPU
Expand Down
37 changes: 35 additions & 2 deletions python/cuml/cuml/thirdparty_adapters/adapters.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#
import cudf
Expand All @@ -9,6 +9,7 @@
import pandas as pd
from scipy import sparse as cpu_sparse

from cuml.accel import enabled as cuml_accel_enabled
from cuml.internals.input_utils import input_to_cupy_array

numeric_types = [
Expand Down Expand Up @@ -227,6 +228,27 @@ def check_array(
array_converted : object
The converted and validated array.
"""
# Convert list-like inputs to numpy arrays early for compatibility with cuml.accel
# This ensures downstream functions can safely access .dtype and other array attributes
if (
cuml_accel_enabled()
and not isinstance(
array,
(
np.ndarray,
pd.DataFrame,
cudf.DataFrame,
pd.Series,
cudf.Series,
cp.ndarray,
),
)
and not (cpu_sparse.issparse(array) or gpu_sparse.issparse(array))
and not hasattr(array, "__cuda_array_interface__")
):
# Check if it's array-like (list, tuple, etc.) by checking for common sequence methods
if hasattr(array, "__len__") and hasattr(array, "__getitem__"):
Comment thread
csadorf marked this conversation as resolved.
Outdated
array = np.asarray(array)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if dtype == "numeric":
dtype = numeric_types
Expand All @@ -250,7 +272,18 @@ def check_array(
hasshape = hasattr(array, "shape")
if ensure_2d and hasshape:
if len(array.shape) != 2:
raise ValueError("Not 2D")
if len(array.shape) == 1:
raise ValueError(
f"Expected 2D array, got 1D array instead:\narray={array!r}.\n"
"Reshape your data either using array.reshape(-1, 1) if "
"your data has a single feature or array.reshape(1, -1) "
"if it contains a single sample."
)
else:
raise ValueError(
f"Expected 2D array, got {len(array.shape)}D array instead:\n"
f"array shape: {array.shape}.\n"
)

if not allow_nd and hasshape:
if len(array.shape) > 2:
Expand Down
30 changes: 29 additions & 1 deletion python/cuml/cuml_accel_tests/test_basic_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
KNeighborsRegressor,
NearestNeighbors,
)
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler, TargetEncoder


def test_kmeans():
Expand All @@ -44,6 +44,34 @@ def test_truncated_svd():
svd.transform(X)


def test_standard_scaler():
    import numpy as np

    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    scaler = StandardScaler().fit(X)

    # Fitted statistics must exist, one entry per feature.
    n_features = X.shape[1]
    for attr in ("mean_", "var_", "scale_"):
        assert hasattr(scaler, attr)
        assert getattr(scaler, attr).shape == (n_features,)

    # Standardized output keeps the input shape and has per-feature
    # mean close to 0 and standard deviation close to 1.
    X_scaled = scaler.transform(X)
    assert X_scaled.shape == X.shape
    assert np.allclose(X_scaled.mean(axis=0), 0, atol=1e-7)
    assert np.allclose(X_scaled.std(axis=0), 1, atol=1e-7)

    # Round-tripping through inverse_transform recovers the original data.
    X_restored = scaler.inverse_transform(X_scaled)
    assert X_restored.shape == X.shape
    assert np.allclose(X_restored, X, atol=1e-6)


def test_linear_regression():
X, y = make_regression(
n_samples=100, n_features=20, noise=0.1, random_state=42
Expand Down
Loading
Loading