Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/source/cuml-accel/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ the following estimators are mostly or entirely accelerated when run with
* ``sklearn.neighbors.KNeighborsRegressor``
* ``sklearn.neighbors.KernelDensity``
* ``sklearn.preprocessing.StandardScaler``
* ``sklearn.preprocessing.MinMaxScaler``
* ``sklearn.preprocessing.MaxAbsScaler``
* ``sklearn.preprocessing.PolynomialFeatures``
* ``sklearn.preprocessing.TargetEncoder``
* ``sklearn.svm.SVC``
* ``sklearn.svm.SVR``
Expand Down
28 changes: 26 additions & 2 deletions docs/source/cuml-accel/limitations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,32 @@ StandardScaler

``StandardScaler`` will fall back to CPU in the following cases:

- If ``X`` is sparse.
- When run on scikit-learn < 1.8.

MinMaxScaler
^^^^^^^^^^^^

``MinMaxScaler`` will fall back to CPU in the following cases:

- When run on scikit-learn < 1.8.

MaxAbsScaler
^^^^^^^^^^^^

``MaxAbsScaler`` will fall back to CPU in the following cases:

- If ``X`` is sparse.
.. (review note) Potential follow-up: delegate to cuML's implementation for sparse inputs.
- When run on scikit-learn < 1.8.

PolynomialFeatures
^^^^^^^^^^^^^^^^^^

``PolynomialFeatures`` will fall back to CPU in the following cases:

- If ``X`` is sparse.
- If ``order`` is ``"F"``.
- When run on scikit-learn < 1.8.

TargetEncoder
^^^^^^^^^^^^^
Expand Down
42 changes: 40 additions & 2 deletions python/cuml/cuml/accel/_overrides/sklearn/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,54 @@
import numpy as np

import cuml.preprocessing
from cuml.accel.estimator_proxy import ArrayAPIProxyBase, ProxyBase
from cuml.accel.estimator_proxy import (
ArrayAPIProxyBase,
ProxyBase,
classproperty,
)
from cuml.internals.interop import UnsupportedOnGPU

__all__ = ("StandardScaler", "TargetEncoder")
__all__ = (
"StandardScaler",
"MinMaxScaler",
"MaxAbsScaler",
"PolynomialFeatures",
"TargetEncoder",
)


class StandardScaler(ArrayAPIProxyBase):
    """Accelerator proxy for ``sklearn.preprocessing.StandardScaler``.

    Only declares the public import path of the wrapped sklearn class;
    all proxying behavior comes from ``ArrayAPIProxyBase``.
    """

    _cpu_class_path = "sklearn.preprocessing.StandardScaler"


class MinMaxScaler(ArrayAPIProxyBase):
    """Accelerator proxy for ``sklearn.preprocessing.MinMaxScaler``.

    Only declares the public import path of the wrapped sklearn class;
    all proxying behavior comes from ``ArrayAPIProxyBase``.
    """

    _cpu_class_path = "sklearn.preprocessing.MinMaxScaler"


class MaxAbsScaler(ArrayAPIProxyBase):
    """Accelerator proxy for ``sklearn.preprocessing.MaxAbsScaler``.

    Only declares the public import path of the wrapped sklearn class;
    all proxying behavior comes from ``ArrayAPIProxyBase``.
    """

    _cpu_class_path = "sklearn.preprocessing.MaxAbsScaler"


class PolynomialFeatures(ArrayAPIProxyBase):
    """Accelerator proxy for ``sklearn.preprocessing.PolynomialFeatures``."""

    # Public import path of the wrapped sklearn class.
    _cpu_class_path = "sklearn.preprocessing.PolynomialFeatures"

    # These are staticmethods on the class that sklearn uses in the tests,
    # we can just re-export them here.
    @classproperty
    def _combinations(cls):
        # Forward attribute access to the wrapped sklearn class.
        return cls._cpu_class._combinations

    @classproperty
    def _num_combinations(cls):
        # Forward attribute access to the wrapped sklearn class.
        return cls._cpu_class._num_combinations

    @staticmethod
    def _params_from_cpu(model):
        """Extract hyperparameters from the CPU model for GPU use.

        Raises
        ------
        UnsupportedOnGPU
            If the model was configured with ``order="F"``, which is not
            supported on GPU (triggers CPU fallback).
        """
        if model.order == "F":
            raise UnsupportedOnGPU("order='F' is not supported")
        return model.get_params(deep=False)


def _check_unsupported_inputs(X, y, cpu_model):
"""Check if inputs are supported on GPU.

Expand Down
45 changes: 33 additions & 12 deletions python/cuml/cuml/accel/estimator_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@
SKLEARN_18 = Version(sklearn.__version__) >= Version("1.8.0.dev0")


class classproperty:
    """Descriptor exposing a function as a read-only attribute of the class.

    Unlike the builtin ``property``, the wrapped function is invoked with
    the *owning class* rather than an instance, so the value is available
    on the class itself as well as on its instances.
    """

    def __init__(self, f):
        # Function to call with the owning class on attribute access.
        self.f = f

    def __get__(self, instance, owner):
        # The instance (if any) is deliberately ignored; the attribute is
        # always computed from the owning class.
        return self.f(owner)


def is_proxy(instance_or_class) -> bool:
"""Check if an instance or class is a proxy object created by the accelerator."""
if isinstance(instance_or_class, type):
Expand Down Expand Up @@ -564,9 +574,9 @@ def _metadata_request(self):
def _estimator_type(self):
return self._cpu._estimator_type

@property
def _parameter_constraints(self):
return self._cpu._parameter_constraints
@classproperty
def _parameter_constraints(cls):
return cls._cpu_class._parameter_constraints

@classmethod
def _get_param_names(cls):
Expand Down Expand Up @@ -710,6 +720,8 @@ def _params_from_cpu(cls, model):
"scikit-learn >= 1.8 is required to run on GPU"
)

if cls._params_from_cpu_override is not None:
return cls._params_from_cpu_override(model)
return model.get_params(deep=False)

def _params_to_cpu(self):
Expand Down Expand Up @@ -738,21 +750,23 @@ def _sync_attrs_from_cpu(self, model) -> None:

def _attrs_from_cpu(self, model):
attrs = super()._attrs_from_cpu(model)
exclude = {
"feature_names_in_",
"n_features_in_",
*self._get_param_names(),
}
for name, value in vars(model).items():
if (
name.endswith("_")
and not name.startswith("_")
and name != "feature_names_in_"
):
if name not in exclude:
if isinstance(value, np.ndarray):
value = cp.asarray(value)
attrs[name] = value
return attrs

def _attrs_to_cpu(self, model):
attrs = super()._attrs_to_cpu(model)
exclude = set(self._get_param_names())
for name, value in vars(self._internal_model).items():
if name.endswith("_") and not name.startswith("_"):
if name not in exclude:
if isinstance(value, cp.ndarray):
value = cp.asnumpy(value)
attrs[name] = value
Expand All @@ -779,15 +793,22 @@ def __getattr__(self, name):
class ArrayAPIProxyBase(ProxyBase):
"""A ProxyBase subclass for proxying array-api-enabled sklearn models.

Subclasses should define ``_cpu_class_path`` as the public import
path of the sklearn class."""
Subclasses should define ``_cpu_class_path`` as the public import path of
the sklearn class. They also may optionally define `_params_from_cpu` to
handle filtering any unsupported hyperparameters.
"""

def __init_subclass__(cls, **kwargs):
# Programmatically create a new private cuml.Base class that wraps the
# sklearn array-api-enabled model in a cuml consistent API.
cls._gpu_class = type(
cls.__name__,
(_ArrayAPIWrapper,),
{"_cpu_class_path": cls._cpu_class_path},
{
"_cpu_class_path": cls._cpu_class_path,
"_params_from_cpu_override": getattr(
cls, "_params_from_cpu", None
),
},
)
super().__init_subclass__(**kwargs)
73 changes: 69 additions & 4 deletions python/cuml/cuml_accel_tests/integration/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import (
MaxAbsScaler,
MinMaxScaler,
PolynomialFeatures,
StandardScaler,
)


def test_standard_scaler():
Expand All @@ -28,12 +35,57 @@ def test_standard_scaler():
np.testing.assert_allclose(X_inverse, X, atol=1e-6)


def test_standard_scaler_partial_fit():
def test_min_max_scaler():
    """MinMaxScaler: fitted attribute shapes, [0, 1] scaling, round-trip."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    # Fix: a stray leftover `model = StandardScaler().fit(X)` previously
    # clobbered the fitted MinMaxScaler before its attributes were checked
    # (StandardScaler has no `min_`/`data_min_`/... attributes).
    model = MinMaxScaler().fit(X)

    # Each fitted attribute has one entry per feature column.
    assert model.min_.shape == (X.shape[1],)
    assert model.scale_.shape == (X.shape[1],)
    assert model.data_min_.shape == (X.shape[1],)
    assert model.data_max_.shape == (X.shape[1],)
    assert model.data_range_.shape == (X.shape[1],)

    # Transform and check shape
    X_transformed = model.transform(X)
    assert X_transformed.shape == X.shape

    # Check that transformed data is scaled appropriately
    assert (X_transformed.min(axis=0) >= 0).all()
    assert (X_transformed.max(axis=0) <= 1).all()

    # Check inverse transform
    X_inverse = model.inverse_transform(X_transformed)
    assert X_inverse.shape == X.shape
    np.testing.assert_allclose(X_inverse, X, atol=1e-6)


def test_max_abs_scaler():
    """MaxAbsScaler: fitted attribute shapes, [-1, 1] scaling, round-trip."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    scaler = MaxAbsScaler().fit(X)

    n_features = X.shape[1]
    # One fitted value per feature column.
    assert scaler.scale_.shape == (n_features,)
    assert scaler.max_abs_.shape == (n_features,)

    # Forward transform preserves shape and bounds magnitudes by 1.
    Xt = scaler.transform(X)
    assert Xt.shape == X.shape
    assert (np.abs(Xt) <= 1).all()

    # Inverse transform recovers the original data.
    Xr = scaler.inverse_transform(Xt)
    assert Xr.shape == X.shape
    np.testing.assert_allclose(Xr, X, atol=1e-6)

model2 = StandardScaler()

@pytest.mark.parametrize("cls", [StandardScaler, MinMaxScaler, MaxAbsScaler])
def test_scaler_partial_fit(cls):
X, _ = make_blobs(n_samples=100, centers=3, random_state=42)

model = cls().fit(X)

model2 = cls()
model2.partial_fit(X[:25])
assert model2.n_samples_seen_ == 25
model2.partial_fit(X[25:])
Expand All @@ -42,3 +94,16 @@ def test_standard_scaler_partial_fit():
sol = model.transform(X)
res = model2.transform(X)
np.testing.assert_allclose(sol, res)


def test_polynomial_features():
    """PolynomialFeatures: numpy output by default, pandas via set_output."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    transformer = PolynomialFeatures().fit(X)

    # Fitted exponent matrix and the default (numpy) transform output.
    assert isinstance(transformer.powers_, np.ndarray)
    assert isinstance(transformer.transform(X), np.ndarray)

    # Switching the output container yields a pandas DataFrame.
    transformer.set_output(transform="pandas")
    assert isinstance(transformer.transform(X), pd.DataFrame)
5 changes: 5 additions & 0 deletions python/cuml/cuml_accel_tests/test_estimator_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ def test_sklearn_introspect_estimator_type():
assert is_regressor(LinearRegression())


def test_sklearn_introspect_parameter_constraints():
    """_parameter_constraints is a dict on both the class and its instances."""
    for accessor in (LogisticRegression, LogisticRegression()):
        assert isinstance(accessor._parameter_constraints, dict)


@pytest.mark.skipif(not SKLEARN_16, reason="sklearn >= 1.6 only")
def test_sklearn_utils_get_tags():
"""sklearn.utils.get_tags was added in sklearn 1.6"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -426,10 +426,6 @@
- "sklearn.tests.test_multioutput::test_multi_target_sparse_regression[dok_matrix]"
- "sklearn.tests.test_multioutput::test_multi_target_sparse_regression[lil_array]"
- "sklearn.tests.test_multioutput::test_multi_target_sparse_regression[lil_matrix]"
- "sklearn.tests.test_public_functions::test_class_wrapper_param_validation[sklearn.cluster.dbscan-sklearn.cluster.DBSCAN]"
- "sklearn.tests.test_public_functions::test_class_wrapper_param_validation[sklearn.cluster.k_means-sklearn.cluster.KMeans]"
- "sklearn.tests.test_public_functions::test_class_wrapper_param_validation[sklearn.cluster.spectral_clustering-sklearn.cluster.SpectralClustering]"
- "sklearn.tests.test_public_functions::test_class_wrapper_param_validation[sklearn.covariance.ledoit_wolf-sklearn.covariance.LedoitWolf]"
- "sklearn.utils.tests.test_estimator_checks::test_check_dataframe_column_names_consistency"
- "sklearn.utils.tests.test_estimator_checks::test_check_estimator"
- "sklearn.utils.tests.test_estimator_checks::test_check_estimator_clones"
Expand Down
Loading