Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
3d0760e
FIX fix cloning
dantegd Jan 14, 2025
ca66732
Merge branch 'branch-25.02' into fix-interop-fixes
dantegd Feb 7, 2025
497d181
FIX changes from PR review to not use internal sklearn APIs and mate …
dantegd Feb 14, 2025
dce1539
Merge cuML branch-25.04
dantegd Feb 14, 2025
0d797aa
Merge branch 'branch-25.04' into fix-interop-fixes
dantegd Feb 14, 2025
0ef4895
ENH Keep list of original hyperparams that user passed
dantegd Feb 14, 2025
a00af9d
Merge branch 'fix-interop-fixes' of github.com:dantegd/cuml into fix-…
dantegd Feb 14, 2025
4e70f4c
FIX remove unused imported function
dantegd Feb 14, 2025
de3e234
Check that get_params and cloning work
betatim Feb 14, 2025
f14a14b
Typo fix
betatim Feb 20, 2025
5a02fd0
ENH multiple improvements by using the cpu_model as the reference tru…
dantegd Feb 21, 2025
e0cd0d5
FIX style fixes
dantegd Feb 21, 2025
d206bb8
Merge cuML branch-25.04
dantegd Feb 21, 2025
e70c3fb
Merge branch 'branch-25.04' into fix-interop-fixes
dantegd Feb 21, 2025
3785c4e
DOC correct docstrings
dantegd Feb 21, 2025
fd17f09
Merge branch 'fix-interop-fixes' of github.com:dantegd/cuml into fix-…
dantegd Feb 21, 2025
e7a35a1
Move imports to the top
betatim Feb 21, 2025
742404e
Fix style
betatim Feb 21, 2025
fa41369
FIX multiple fixes from issues raised in PR review
dantegd Feb 24, 2025
4fb4982
FIX skip pytest for cuml.accel output type when accel is not active
dantegd Feb 24, 2025
b2c31b2
FIX small test fixes
dantegd Feb 24, 2025
5a5bf59
Merge cuML branch-25.04
dantegd Feb 24, 2025
8eeb5b7
FIX fix from a bad merge
dantegd Feb 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
"""
Fit HDBSCAN model from features.
"""

self._all_finite = True
X_m, n_rows, n_cols, self.dtype = \
input_to_cuml_array(X, order='C',
check_dtype=[np.float32],
Expand Down Expand Up @@ -1163,7 +1163,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
def get_attr_names(self):
attr_names = ['labels_', 'probabilities_', 'cluster_persistence_',
'condensed_tree_', 'single_linkage_tree_',
'outlier_scores_']
'outlier_scores_', '_all_finite']
if self.gen_min_span_tree:
attr_names = attr_names + ['minimum_spanning_tree_']
if self.prediction_data:
Expand Down
2 changes: 1 addition & 1 deletion python/cuml/cuml/experimental/accel/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def main(module, convert_to_sklearn, format, output, args):
(module,) = module
# run the module passing the remaining arguments
# as if it were run with python -m <module> <args>
sys.argv[:] = [module] + args # not thread safe?
sys.argv[:] = [module, *args.args] # not thread safe?
runpy.run_module(module, run_name="__main__")
elif len(args) >= 1:
# Remove ourself from argv and continue
Expand Down
14 changes: 10 additions & 4 deletions python/cuml/cuml/experimental/accel/estimator_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,21 @@ def __init__(self, *args, **kwargs):
self._cpu_model_class = (
original_class_a # Store a reference to the original class
)
kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs)
super().__init__(*args, **kwargs)

translated_kwargs, self._gpuaccel = self._hyperparam_translator(
**kwargs
)
super().__init__(*args, **translated_kwargs)

self._cpu_hyperparams = list(
inspect.signature(
self._cpu_model_class.__init__
).parameters.keys()
)

self.import_cpu_model()
self.build_cpu_model(**kwargs)

def __repr__(self):
"""
Return a formal string representation of the object.
Expand All @@ -226,7 +232,7 @@ def __repr__(self):
A string representation indicating that this is a wrapped
version of the original CPU-based estimator.
"""
return f"wrapped {self._cpu_model_class}"
return self._cpu_model.__repr__()

def __str__(self):
"""
Expand All @@ -238,7 +244,7 @@ def __str__(self):
A string representation indicating that this is a wrapped
version of the original CPU-based estimator.
"""
return f"ProxyEstimator of {self._cpu_model_class}"
return self._cpu_model.__str__()

def __getstate__(self):
"""
Expand Down
70 changes: 61 additions & 9 deletions python/cuml/cuml/internals/base.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -640,17 +640,20 @@ class UniversalBase(Base):
inspect.signature(self._cpu_model_class.__init__).parameters.keys()
)

def build_cpu_model(self):
def build_cpu_model(self, **kwargs):
if hasattr(self, '_cpu_model'):
return
filtered_kwargs = {}
for keyword, arg in self._full_kwargs.items():
if keyword in self._cpu_hyperparams:
filtered_kwargs[keyword] = arg
else:
logger.info("Unused keyword parameter: {} "
"during CPU estimator "
"initialization".format(keyword))
if kwargs:
filtered_kwargs = kwargs
else:
filtered_kwargs = {}
for keyword, arg in self._full_kwargs.items():
if keyword in self._cpu_hyperparams:
filtered_kwargs[keyword] = arg
else:
logger.info("Unused keyword parameter: {} "
"during CPU estimator "
"initialization".format(keyword))

# initialize model
self._cpu_model = self._cpu_model_class(**filtered_kwargs)
Expand Down Expand Up @@ -889,6 +892,12 @@ class UniversalBase(Base):
# that are not in the cuML estimator in the host estimator
if GlobalSettings().accelerator_active or self._experimental_dispatching:

# we don't want to special sklearn dispatch cloning function
# so that cloning works with this class as a regular estimator
# without __sklearn_clone__
if attr == "__sklearn_clone__":
raise ex

self.import_cpu_model()
if hasattr(self._cpu_model_class, attr):

Expand Down Expand Up @@ -978,6 +987,8 @@ class UniversalBase(Base):
estimator = cls()
estimator.import_cpu_model()
estimator._cpu_model = model
params, gpuaccel = cls._hyperparam_translator(**model.get_params())
estimator.set_params(**params)
estimator.cpu_to_gpu()

# we need to set an output type here since
Expand All @@ -988,3 +999,44 @@ class UniversalBase(Base):
estimator.output_mem_type = MemoryType.host

return estimator

def get_params(self, deep=True):
"""
Get parameters for this estimator.

Parameters
----------
deep : bool, default=True
If True, will return the parameters for this estimator and
contained subobjects that are estimators.

Returns
-------
params : dict
Parameter names mapped to their values.
"""
Comment thread
dantegd marked this conversation as resolved.

if GlobalSettings().accelerator_active or self._experimental_dispatching:
return self._cpu_model.get_params(deep=deep)
else:
return super().get_params(deep=deep)

def set_params(self, **params):
"""
Set parameters for this estimator.

Parameters
----------
**params : dict
Estimator parameters

Returns
-------
self : estimator instance
The estimnator instance
"""
self._cpu_model.set_params(**params)
params, gpuaccel = self._hyperparam_translator(**params)
params = {key: params[key] for key in self._get_param_names() if key in params}
super().set_params(**params)
return self
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import pytest
import numpy as np
import cupy as cp
from sklearn import clone
Comment thread
betatim marked this conversation as resolved.
Outdated
from sklearn.datasets import make_classification, make_regression, make_blobs
from sklearn.linear_model import (
LinearRegression,
Expand Down Expand Up @@ -172,6 +172,59 @@ def test_proxy_facade():

assert original_value == proxy_value


def test_proxy_clone():
# Test that cloning a proxy estimator preserves parameters, even those we
# translate for the cuml class
pca = PCA(n_components=42, svd_solver="arpack")
pca_clone = clone(pca)

assert pca.get_params() == pca_clone.get_params()


def test_proxy_params():
# Test that parameters match between constructor and get_params()
# Mix of default and non-default values
pca = PCA(
n_components=5,
copy=False,
# Pass in an argument and set it to its default value
whiten=False,
)

params = pca.get_params()
assert params["n_components"] == 5
assert params["copy"] is False
assert params["whiten"] is False
# A parameter we never touched, should be the default
assert params["tol"] == 0.0

# Check that get_params doesn't return any unexpected parameters
expected_params = set(
[
"n_components",
"copy",
"whiten",
"tol",
"svd_solver",
"n_oversamples",
"random_state",
"iterated_power",
"power_iteration_normalizer",
]
)
assert set(params.keys()) == expected_params


def test_roundtrip():
import cuml
from sklearn import cluster
Comment thread
betatim marked this conversation as resolved.
Outdated

km = cluster.KMeans(n_clusters=13)
ckm = cuml.KMeans.from_sklearn(km)

assert ckm.n_clusters == 13
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lucky number 13 :D

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should merge this PR. It improves things and fixes several things.

We can keep improving the from_/as_sklearn round tripping. I think the test from https://github.com/rapidsai/cuml/pull/6342/files#r1963552769 still doesn't pass (even if you exclude the raft handle). But lets look at that in a new PR



def test_defaults_args_only_methods():
# Check that estimator methods that take no arguments work
Expand All @@ -186,6 +239,7 @@ def test_defaults_args_only_methods():


def test_kernel_ridge():
import cupy as cp
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why move this here? Maybe we should leave a comment for people from the future to explain why it can't be imported at the top of the file (or move it back if this was just for debugging)

rng = np.random.RandomState(42)

X = 5 * rng.rand(10000, 1)
Expand Down