Merged

Commits (30 total; the diff below shows changes from 13 commits)
71142e8
Support StandardScaler in cuml.accel.
csadorf Feb 3, 2026
215505d
partial_fit is not supported
csadorf Feb 4, 2026
2f448a1
fixup n_sample_seen sync to cpu
csadorf Feb 4, 2026
960d9f9
Fix error message
csadorf Feb 4, 2026
9fa8622
do not support complext input data
csadorf Feb 4, 2026
35dd633
Do not support object dtype
csadorf Feb 4, 2026
308b5e2
xfail check_transformer_data_not_an_array test
csadorf Feb 4, 2026
1c39bb6
Document support and limitations.
csadorf Feb 4, 2026
a1065d9
convert n_samples_seen_ when synced from cpu
csadorf Feb 4, 2026
e0ad693
document lack of support for sample_weight argument
csadorf Feb 4, 2026
a3944a7
update xfail list
csadorf Feb 4, 2026
c78da5e
address sklearn upstream failures
csadorf Feb 4, 2026
944b631
remove passing tests from xfail list
csadorf Feb 4, 2026
71613fb
more precise handling of sparse matrix related limitations
csadorf Feb 5, 2026
6e5b8c8
refactor preprocessing.py to deduplicate
csadorf Feb 5, 2026
3a8ae02
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
f32090e
be more precise about detection and conversion of list-like inputs
csadorf Feb 5, 2026
10f6821
Handle sample_weight parameter
csadorf Feb 5, 2026
661012b
Do not sync for get_feature_names_out
csadorf Feb 5, 2026
5ce80ca
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
e7a52e4
fall back for float16
csadorf Feb 5, 2026
4884e3e
fine-tune support matrix and limitation docs
csadorf Feb 5, 2026
51bc0fd
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 5, 2026
b0542c1
fixup xfail list version conditions
csadorf Feb 5, 2026
13869b0
Merge branch 'main' into add-cuml.accel-support-for-standardscaler
csadorf Feb 6, 2026
68db908
Restore erroneously removed entries from xfail list.
csadorf Feb 6, 2026
b8635bc
use scikit-learn approach for is-array-like check
csadorf Feb 9, 2026
c14de81
simplify sample_weight kwargs handling
csadorf Feb 9, 2026
f58ad9a
Merge remote-tracking branch 'origin/main' into add-cuml.accel-suppor…
csadorf Feb 9, 2026
2740a65
Revert change to xfail list from b8635bc83076c45c07631862dbd5d5c147e5…
csadorf Feb 9, 2026
1 change: 1 addition & 0 deletions docs/source/cuml-accel/faq.rst
@@ -68,6 +68,7 @@ the following estimators are mostly or entirely accelerated when run with
* ``sklearn.neighbors.KNeighborsClassifier``
* ``sklearn.neighbors.KNeighborsRegressor``
* ``sklearn.neighbors.KernelDensity``
* ``sklearn.preprocessing.StandardScaler``
* ``sklearn.preprocessing.TargetEncoder``
* ``sklearn.svm.SVC``
* ``sklearn.svm.SVR``
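As an illustrative aside (not part of this diff), a minimal sketch of how the newly listed ``StandardScaler`` is exercised under the accelerator: the script is launched via ``python -m cuml.accel`` (or ``%load_ext cuml.accel`` in a notebook), and the unmodified scikit-learn import is then dispatched to the GPU implementation where supported. The file name and data below are made up.

# scale_example.py -- hypothetical script; run as: python -m cuml.accel scale_example.py
import numpy as np
from sklearn.preprocessing import StandardScaler  # unmodified sklearn import

X = np.random.default_rng(0).normal(loc=3.0, scale=2.0, size=(1000, 5))

scaler = StandardScaler().fit(X)   # handled on GPU when supported
X_scaled = scaler.transform(X)

print(X_scaled.mean(axis=0))       # close to 0 per feature
print(X_scaled.std(axis=0))        # close to 1 per feature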
9 changes: 9 additions & 0 deletions docs/source/cuml-accel/limitations.rst
@@ -402,6 +402,15 @@ Additional notes:
sklearn.preprocessing
---------------------

StandardScaler
^^^^^^^^^^^^^^

``StandardScaler`` will fall back to CPU in the following cases:

- If ``partial_fit`` is called (incremental learning not supported on GPU).
- If ``sample_weight`` is provided (weighted statistics not supported on GPU).
- If ``X`` has object or complex dtype (``complex64``, ``complex128``).

TargetEncoder
^^^^^^^^^^^^^

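The fallback conditions documented above can be sketched with scikit-learn calls that are valid on CPU; under ``cuml.accel`` these are expected to be handled by the CPU path rather than the GPU implementation. The data is illustrative.

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.default_rng(0).normal(size=(50, 3))

# sample_weight is not supported on GPU -> CPU fallback
StandardScaler().fit(X, sample_weight=np.ones(50))

# partial_fit (incremental learning) is not supported on GPU -> CPU fallback
StandardScaler().partial_fit(X)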
45 changes: 44 additions & 1 deletion python/cuml/cuml/_thirdparty/sklearn/preprocessing/_data.py
@@ -5,7 +5,7 @@
# SPDX-FileCopyrightText: Eric Martin <eric@ericmart.in>
# SPDX-FileCopyrightText: Giorgio Patrini <giorgio.patrini@anu.edu.au>
# SPDX-FileCopyrightText: Eric Chang <ericchang2017@u.northwestern.edu>
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

# Original authors from Scikit-Learn:
@@ -40,6 +40,7 @@
SparseInputTagMixin,
StatelessTagMixin,
)
from cuml.internals.interop import InteropMixin, to_cpu, to_gpu

from ....common.array_descriptor import CumlArrayDescriptor
from ....internals.array import CumlArray
@@ -519,6 +520,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):

class StandardScaler(TransformerMixin,
BaseEstimator,
InteropMixin,
AllowNaNTagMixin,
SparseInputTagMixin):
"""Standardize features by removing the mean and scaling to unit variance
@@ -658,6 +660,47 @@ def _get_param_names(cls):
"copy"
]

# InteropMixin requirements
_cpu_class_path = "sklearn.preprocessing.StandardScaler"

@classmethod
def _params_from_cpu(cls, model):
"""Convert sklearn StandardScaler hyperparameters to cuML format."""
return {
"copy": model.copy,
"with_mean": model.with_mean,
"with_std": model.with_std,
}

def _params_to_cpu(self):
"""Convert cuML StandardScaler hyperparameters to sklearn format."""
return {
"copy": self.copy,
"with_mean": self.with_mean,
"with_std": self.with_std,
}

def _attrs_from_cpu(self, model):
"""Convert sklearn StandardScaler fitted attributes to cuML format."""
attrs = {
"mean_": to_gpu(mean) if (mean := getattr(model, "mean_", None)) is not None else None,
"var_": to_gpu(var) if (var := getattr(model, "var_", None)) is not None else None,
"scale_": to_gpu(scale) if (scale := getattr(model, "scale_", None)) is not None else None,
"n_samples_seen_": to_gpu(nss) if (nss := getattr(model, "n_samples_seen_", None)) is not None else None,
}
return {**attrs, **super()._attrs_from_cpu(model)}

def _attrs_to_cpu(self, model):
"""Convert cuML StandardScaler fitted attributes to sklearn format."""

attrs = {
"mean_": to_cpu(mean) if (mean := getattr(self, "mean_", None)) is not None else None,
"var_": to_cpu(var) if (var := getattr(self, "var_", None)) is not None else None,
"scale_": to_cpu(scale) if (scale := getattr(self, "scale_", None)) is not None else None,
"n_samples_seen_": None if (nss := getattr(self, "n_samples_seen_", None)) is None else cpu_np.int64(nss) if cpu_np.isscalar(nss) else to_cpu(nss),
}
return {**attrs, **super()._attrs_to_cpu(model)}

@reflect(reset=True)
def fit(self, X, y=None) -> "StandardScaler":
"""Compute the mean and std to be used for later scaling.
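For reference, a CPU-only sketch of the fitted state that the ``_attrs_from_cpu``/``_attrs_to_cpu`` hooks above shuttle between libraries; it uses plain scikit-learn and numpy and makes no cuML calls.

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.default_rng(0).normal(size=(100, 4))
cpu_model = StandardScaler().fit(X)

print(cpu_model.mean_.shape)      # (4,) per-feature mean
print(cpu_model.var_.shape)       # (4,) per-feature variance
print(cpu_model.scale_.shape)     # (4,) per-feature scale factor
print(cpu_model.n_samples_seen_)  # 100, a scalar here

scikit-learn stores ``n_samples_seen_`` as a plain integer when no values are missing and as a per-feature array otherwise, which is why ``_attrs_to_cpu`` special-cases ``cpu_np.isscalar(nss)``.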
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: Gael Varoquaux <gael.varoquaux@normalesup.org>
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: BSD-3-Clause

# Original authors from Scikit-Learn:
@@ -71,7 +71,7 @@ def _check_n_features(self, X, reset):
)
if n_features != self.n_features_in_:
raise ValueError(
'X has {} features, but this {} is expecting {} features '
'X has {} features, but {} is expecting {} features '
'as input.'.format(n_features, self.__class__.__name__,
self.n_features_in_)
)
61 changes: 60 additions & 1 deletion python/cuml/cuml/accel/_wrappers/sklearn/preprocessing.py
@@ -4,12 +4,71 @@
#

import numpy as np
from scipy import sparse as sp_sparse

import cuml.preprocessing
from cuml.accel.estimator_proxy import ProxyBase
from cuml.internals.interop import UnsupportedOnGPU

__all__ = ("TargetEncoder",)
__all__ = ("StandardScaler", "TargetEncoder")


class StandardScaler(ProxyBase):
_gpu_class = cuml.preprocessing.StandardScaler

def _gpu_fit(self, X, y=None, **kwargs):
if "sample_weight" in kwargs:
raise UnsupportedOnGPU("sample_weight parameter not supported")
# Reject complex and object dtypes
if hasattr(X, "dtype"):
if np.issubdtype(X.dtype, np.complexfloating):
raise UnsupportedOnGPU("Complex data types not supported")
if X.dtype == np.object_:
raise UnsupportedOnGPU("Object dtype not supported")
# Check for sparse matrices with unsupported properties
if sp_sparse.issparse(X):
# cupy sparse doesn't support int64 dtype
if X.dtype == np.int64:
raise UnsupportedOnGPU(
"Sparse matrices with int64 dtype not supported on GPU "
"(cupy sparse only supports float32, float64, complex64, complex128, bool)"
)
# cuML only supports CSR/CSC formats, not COO, DOK, LIL, etc.
if X.format not in ("csr", "csc"):
raise UnsupportedOnGPU(
f"Sparse matrix format '{X.format}' not supported on GPU "
"(only CSR and CSC formats are supported)"
)
return self._gpu.fit(X, y, **kwargs)

def _gpu_fit_transform(self, X, y=None, **kwargs):
if "sample_weight" in kwargs:
raise UnsupportedOnGPU("sample_weight parameter not supported")
# Reject complex and object dtypes
if hasattr(X, "dtype"):
if np.issubdtype(X.dtype, np.complexfloating):
raise UnsupportedOnGPU("Complex data types not supported")
if X.dtype == np.object_:
raise UnsupportedOnGPU("Object dtype not supported")
# Check for sparse matrices with unsupported properties
if sp_sparse.issparse(X):
# cupy sparse doesn't support int64 dtype
if X.dtype == np.int64:
raise UnsupportedOnGPU(
"Sparse matrices with int64 dtype not supported on GPU "
"(cupy sparse only supports float32, float64, complex64, complex128, bool)"
)
# cuML only supports CSR/CSC formats, not COO, DOK, LIL, etc.
if X.format not in ("csr", "csc"):
raise UnsupportedOnGPU(
f"Sparse matrix format '{X.format}' not supported on GPU "
"(only CSR and CSC formats are supported)"
)
return self._gpu.fit_transform(X, y, **kwargs)

def _gpu_partial_fit(self, X, y=None, **kwargs):
"""partial_fit not supported on GPU - always fall back to CPU."""
raise UnsupportedOnGPU("partial_fit not supported on GPU")


def _check_unsupported_inputs(X, y, cpu_model):
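The dtype and sparse-format checks are duplicated verbatim between ``_gpu_fit`` and ``_gpu_fit_transform``; a possible follow-up (not part of this diff) is to factor them into a shared helper. The helper name below is hypothetical.

import numpy as np
from scipy import sparse as sp_sparse

from cuml.internals.interop import UnsupportedOnGPU


def _raise_if_unsupported_on_gpu(X, kwargs):
    # Hypothetical shared helper mirroring the checks in _gpu_fit / _gpu_fit_transform.
    if "sample_weight" in kwargs:
        raise UnsupportedOnGPU("sample_weight parameter not supported")
    if hasattr(X, "dtype"):
        if np.issubdtype(X.dtype, np.complexfloating):
            raise UnsupportedOnGPU("Complex data types not supported")
        if X.dtype == np.object_:
            raise UnsupportedOnGPU("Object dtype not supported")
    if sp_sparse.issparse(X):
        if X.dtype == np.int64:
            raise UnsupportedOnGPU(
                "Sparse matrices with int64 dtype not supported on GPU "
                "(cupy sparse only supports float32, float64, complex64, complex128, bool)"
            )
        if X.format not in ("csr", "csc"):
            raise UnsupportedOnGPU(
                f"Sparse matrix format '{X.format}' not supported on GPU "
                "(only CSR and CSC formats are supported)"
            )

With such a helper, both ``_gpu_fit`` and ``_gpu_fit_transform`` would reduce to the check followed by delegation to ``self._gpu``.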
27 changes: 25 additions & 2 deletions python/cuml/cuml/thirdparty_adapters/adapters.py
@@ -1,5 +1,5 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#
import cudf
@@ -227,6 +227,18 @@ def check_array(
array_converted : object
The converted and validated array.
"""
# Convert list-like inputs to numpy arrays early for compatibility with cuml.accel
# This ensures downstream functions can safely access .dtype and other array attributes
from cuml.accel import enabled as cuml_accel_enabled

if (
cuml_accel_enabled()
and not isinstance(array, (np.ndarray, pd.DataFrame, cudf.DataFrame))
and not (cpu_sparse.issparse(array) or gpu_sparse.issparse(array))
):
# Check if it's array-like (list, tuple, etc.) by checking for common sequence methods
if hasattr(array, "__len__") and hasattr(array, "__getitem__"):
array = np.asarray(array)

if dtype == "numeric":
dtype = numeric_types
@@ -250,7 +262,18 @@
hasshape = hasattr(array, "shape")
if ensure_2d and hasshape:
if len(array.shape) != 2:
raise ValueError("Not 2D")
if len(array.shape) == 1:
raise ValueError(
f"Expected 2D array, got 1D array instead:\narray={array!r}.\n"
"Reshape your data either using array.reshape(-1, 1) if "
"your data has a single feature or array.reshape(1, -1) "
"if it contains a single sample."
)
else:
raise ValueError(
f"Expected 2D array, got {len(array.shape)}D array instead:\n"
f"array shape: {array.shape}.\n"
)

if not allow_nd and hasshape:
if len(array.shape) > 2:
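A short CPU-side sketch of the two behaviors this hunk targets: list-like inputs are converted to numpy arrays before any ``.dtype`` access, and 1D inputs get the scikit-learn-style reshape hint. Plain numpy is used here to mimic the logic rather than calling ``check_array`` itself.

import numpy as np

# List-like input: an early conversion gives downstream code a .dtype to inspect.
X = np.asarray([[0.0, 1.0], [2.0, 3.0]])
print(X.dtype, X.shape)          # float64 (2, 2)

# 1D input: the new error message points to the usual remedies.
x = np.array([1.0, 2.0, 3.0])
print(x.reshape(-1, 1).shape)    # (3, 1) -- a single feature
print(x.reshape(1, -1).shape)    # (1, 3) -- a single sample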
30 changes: 29 additions & 1 deletion python/cuml/cuml_accel_tests/test_basic_estimators.py
@@ -17,7 +17,7 @@
KNeighborsRegressor,
NearestNeighbors,
)
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler, TargetEncoder


def test_kmeans():
@@ -44,6 +44,34 @@ def test_truncated_svd():
svd.transform(X)


def test_standard_scaler():
import numpy as np

X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
scaler = StandardScaler().fit(X)

# Check fitted attributes exist
assert hasattr(scaler, "mean_")
assert hasattr(scaler, "var_")
assert hasattr(scaler, "scale_")
assert scaler.mean_.shape == (X.shape[1],)
assert scaler.var_.shape == (X.shape[1],)
assert scaler.scale_.shape == (X.shape[1],)

# Transform and check shape
X_transformed = scaler.transform(X)
assert X_transformed.shape == X.shape

# Check that transformed data has mean ≈ 0 and std ≈ 1
assert np.allclose(X_transformed.mean(axis=0), 0, atol=1e-7)
assert np.allclose(X_transformed.std(axis=0), 1, atol=1e-7)

# Check inverse transform
X_inverse = scaler.inverse_transform(X_transformed)
assert X_inverse.shape == X.shape
assert np.allclose(X_inverse, X, atol=1e-6)


def test_linear_regression():
X, y = make_regression(
n_samples=100, n_features=20, noise=0.1, random_state=42