Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions docs/source/health_checks.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
.. SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
.. SPDX-License-Identifier: Apache-2.0

Health Checks
=============

cuML provides a small set of health checks (smoke tests) to verify that cuML is
working correctly after installation or as part of automated processes such as
CI. These checks are also used by the RAPIDS CLI's ``rapids doctor`` command
when the CLI is installed.

Run standalone
--------------

You can run all cuML health checks from the command line:

.. code-block:: console

   python -m cuml.health_checks

Use ``--verbose`` or ``-v`` for extra output when a check passes. The command
exits with 0 if all checks pass, or 1 if any check fails.

Run via RAPIDS CLI
------------------

When `rapids-cli <https://github.com/rapidsai/rapids-cli>`_ is installed, the
same cuML checks are registered as plugins and run as part of:

.. code-block:: console

   rapids doctor

See the `rapids-cli documentation
<https://github.com/rapidsai/rapids-cli#check-plugins>`_ for how checks are
discovered and how to run with ``--verbose`` or filter by name.
1 change: 1 addition & 0 deletions docs/source/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ GitHub repository instead.
estimator_intro.ipynb
pickling_cuml_models.ipynb
dask_multigpu_guide.ipynb
health_checks.rst
supported_versions.rst
20 changes: 20 additions & 0 deletions python/cuml/cuml/health_checks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#

"""Health checks for cuML, used by ``rapids doctor`` and runnable via ``python -m cuml.health_checks``."""

from cuml.health_checks._checks import (
accel_basic_check,
accel_cli_check,
functional_check,
import_check,
)

# Public API of this package: the four check functions re-exported from
# ``cuml.health_checks._checks``.
__all__ = (
"accel_basic_check",
"accel_cli_check",
"functional_check",
"import_check",
)
70 changes: 70 additions & 0 deletions python/cuml/cuml/health_checks/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#

"""Run cuML health checks when invoked as ``python -m cuml.health_checks``."""

import argparse
import sys

from cuml.health_checks import (
accel_basic_check,
accel_cli_check,
functional_check,
import_check,
)

# Registry of (command-line name, check function) pairs. ``main`` iterates
# over this list, so checks run in the order listed here.
_CHECKS = [
("import", import_check),
("functional", functional_check),
("accel-basic", accel_basic_check),
("accel-cli", accel_cli_check),
]


# Names accepted for the positional ``CHECK`` arguments on the command line.
_CHECK_NAMES = [name for name, _ in _CHECKS]


def main(argv=None):
    """Run the selected cuML health checks and report each outcome.

    Every check is called with ``verbose=<--verbose flag>``. A check that
    returns normally is reported as ``<name>: OK``; one that raises any
    ``Exception`` is reported as ``<name>: FAIL - <message>`` and the
    remaining checks still run.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments, handed to ``ArgumentParser.parse_args``.
        ``None`` lets argparse read the process arguments.

    Returns
    -------
    int
        ``1`` if at least one executed check raised, otherwise ``0``.
    """
    cli = argparse.ArgumentParser(
        prog="python -m cuml.health_checks",
        description="Run cuML health checks.",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="Print extra output when a check passes.",
    )
    available = ", ".join(_CHECK_NAMES)
    cli.add_argument(
        "checks",
        nargs="*",
        metavar="CHECK",
        choices=_CHECK_NAMES,
        help=f"Names of checks to run (default: all). Available: {available}",
    )
    options = cli.parse_args(argv)

    # An empty selection means "run everything".
    requested = set(options.checks)
    to_run = [
        (label, check)
        for label, check in _CHECKS
        if not requested or label in requested
    ]

    failures = 0
    for label, check in to_run:
        try:
            message = check(verbose=options.verbose)
            print(f"{label}: OK")
            if options.verbose and message:
                print(f" {message}")
        except Exception as err:
            # Report the failure and keep going so every check gets a verdict.
            print(f"{label}: FAIL - {err}")
            failures += 1
    return 1 if failures else 0


# Script entry point: use the value returned by ``main`` as the process exit
# status.
if __name__ == "__main__":
    sys.exit(main())
147 changes: 147 additions & 0 deletions python/cuml/cuml/health_checks/_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#

"""Implementation of cuML health checks for rapids doctor and standalone use."""


def import_check(verbose=False, **kwargs):
    """Verify that the ``cuml`` package can be imported.

    Parameters
    ----------
    verbose : bool, default False
        When true, return a short message that includes the cuML version.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    str or None
        ``"cuML <version> is available"`` when ``verbose`` is true,
        otherwise ``None``.

    Raises
    ------
    ImportError
        If ``import cuml`` fails. The message points at the RAPIDS install
        documentation and the original error is attached as the cause.
    """
    try:
        import cuml
    except ImportError as import_error:
        install_hint = (
            "cuML could not be imported. Install cuML with conda or pip as "
            "described at https://docs.rapids.ai/install/"
        )
        raise ImportError(install_hint) from import_error
    else:
        if not verbose:
            return None
        return f"cuML {cuml.__version__} is available"


def functional_check(verbose=False, **kwargs):
    """Fit cuML's ``LinearRegression`` on four points and validate predictions.

    The training data is the line ``y = x`` for ``x`` in 1..4 (float32), so
    predicting on the training inputs should reproduce the targets.

    Parameters
    ----------
    verbose : bool, default False
        When true, return a short success message.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    str or None
        A success message when ``verbose`` is true, otherwise ``None``.

    Raises
    ------
    AssertionError
        If the predictions do not have shape ``(4,)`` or differ from the
        targets by more than an absolute tolerance of 0.1.
    """
    import numpy as np

    from cuml.linear_model import LinearRegression

    features = np.array([[1], [2], [3], [4]], dtype=np.float32)
    targets = np.array([1, 2, 3, 4], dtype=np.float32)

    estimator = LinearRegression()
    estimator.fit(features, targets)
    predictions = estimator.predict(features)

    # Validate the shape first, on whatever array type ``predict`` returned.
    if predictions.shape != (4,):
        raise AssertionError(
            f"Expected predictions of shape (4,), got {predictions.shape}"
        )

    predictions = np.asarray(predictions, dtype=np.float32)
    if np.allclose(predictions, targets, atol=0.1):
        return "LinearRegression fit/predict succeeded" if verbose else None

    raise AssertionError(
        f"LinearRegression predictions differ from expected: "
        f"got {predictions.tolist()}, expected {targets.tolist()}"
    )


# Timeout, in seconds, passed to ``subprocess.run`` by the subprocess-based
# checks below.
_SUBPROCESS_TIMEOUT = 120


def accel_basic_check(verbose=False, **kwargs):
    """Check that cuml.accel can be installed and intercepts sklearn.

    A one-line script is executed with ``sys.executable -c`` in a separate
    interpreter. The script calls ``cuml.accel.install()``, asserts that
    ``sklearn.ensemble.RandomForestClassifier`` is a cuml.accel proxy, and
    fits it on a small synthetic classification dataset.

    Parameters
    ----------
    verbose : bool, default False
        When true, return a short success message.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    str or None
        A success message when ``verbose`` is true, otherwise ``None``.

    Raises
    ------
    RuntimeError
        If the subprocess exceeds ``_SUBPROCESS_TIMEOUT`` seconds (the
        ``TimeoutExpired`` is attached as the cause), or if it exits with a
        non-zero status (the message carries the last five lines of stderr).
    """
    import subprocess
    import sys

    script = (
        "import cuml.accel; cuml.accel.install(); "
        "from sklearn.ensemble import RandomForestClassifier; "
        "assert cuml.accel.is_proxy(RandomForestClassifier), "
        "'RandomForestClassifier is not a cuml.accel proxy'; "
        "from sklearn.datasets import make_classification; "
        "X, y = make_classification(n_samples=100, random_state=0); "
        "RandomForestClassifier(n_estimators=10).fit(X, y)"
    )
    try:
        result = subprocess.run(
            [sys.executable, "-c", script],
            capture_output=True,
            text=True,
            timeout=_SUBPROCESS_TIMEOUT,
        )
    except subprocess.TimeoutExpired as exc:
        # Chain explicitly so the traceback shows the timeout as the direct
        # cause, matching how import_check re-raises.
        raise RuntimeError(
            f"cuml.accel subprocess check timed out after "
            f"{_SUBPROCESS_TIMEOUT}s"
        ) from exc
    if result.returncode != 0:
        stderr = result.stderr.strip()
        # Only the tail of stderr is reported, to keep the message short.
        detail = (
            "\n".join(stderr.splitlines()[-5:]) if stderr else "unknown error"
        )
        raise RuntimeError(f"cuml.accel subprocess check failed:\n{detail}")
    if verbose:
        return (
            "cuml.accel intercepted sklearn and fit a RandomForestClassifier"
        )


def accel_cli_check(verbose=False, **kwargs):
    """Check that python -m cuml.accel runs sklearn code on the GPU.

    A small scikit-learn script (fit and predict with a
    ``RandomForestClassifier``) is written to a temporary file and executed
    with ``python -m cuml.accel --verbose``. The temporary file is removed
    afterwards. The captured stdout must contain ``"ran on GPU"`` and must
    not contain ``"falling back to CPU"`` or ``"ran on CPU"``.

    Parameters
    ----------
    verbose : bool, default False
        When true, return a short success message.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    str or None
        A success message when ``verbose`` is true, otherwise ``None``.

    Raises
    ------
    RuntimeError
        If the subprocess exceeds ``_SUBPROCESS_TIMEOUT`` seconds (the
        ``TimeoutExpired`` is attached as the cause), or if it exits with a
        non-zero status (the message carries the last five lines of stderr).
    AssertionError
        If stdout lacks the GPU marker or reports a CPU fallback.
    """
    import os
    import subprocess
    import sys
    import tempfile

    script_content = (
        "from sklearn.datasets import make_classification\n"
        "from sklearn.ensemble import RandomForestClassifier\n"
        "X, y = make_classification(n_samples=200, random_state=0)\n"
        "clf = RandomForestClassifier(n_estimators=10, random_state=0)\n"
        "clf.fit(X, y)\n"
        "clf.predict(X)\n"
    )
    fd, script_path = tempfile.mkstemp(suffix=".py")
    try:
        with os.fdopen(fd, "w") as f:
            f.write(script_content)

        try:
            result = subprocess.run(
                [sys.executable, "-m", "cuml.accel", "--verbose", script_path],
                capture_output=True,
                text=True,
                timeout=_SUBPROCESS_TIMEOUT,
            )
        except subprocess.TimeoutExpired as exc:
            # Chain explicitly so the traceback shows the timeout as the
            # direct cause, matching how import_check re-raises.
            raise RuntimeError(
                f"python -m cuml.accel --verbose timed out after "
                f"{_SUBPROCESS_TIMEOUT}s"
            ) from exc
    finally:
        # Remove the temporary script whether or not the run succeeded.
        os.unlink(script_path)

    if result.returncode != 0:
        stderr = result.stderr.strip()
        # Only the tail of stderr is reported, to keep the message short.
        detail = (
            "\n".join(stderr.splitlines()[-5:]) if stderr else "unknown error"
        )
        raise RuntimeError(f"python -m cuml.accel --verbose failed:\n{detail}")

    output = result.stdout
    if "ran on GPU" not in output:
        raise AssertionError(
            "cuml.accel --verbose output missing 'ran on GPU':\n" + output
        )
    if "falling back to CPU" in output or "ran on CPU" in output:
        raise AssertionError(
            "cuml.accel --verbose reported CPU fallbacks:\n" + output
        )
    if verbose:
        return (
            "python -m cuml.accel --verbose ran sklearn code on GPU "
            "with no fallbacks"
        )
Comment thread
csadorf marked this conversation as resolved.
62 changes: 62 additions & 0 deletions python/cuml/tests/test_health_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

import inspect

import pytest

from cuml.health_checks import _checks
from cuml.health_checks.__main__ import _CHECKS


def _get_public_check_functions():
    """Collect the public functions that are defined in ``_checks`` itself.

    Returns
    -------
    dict
        Maps function name to function object for every function member of
        ``_checks`` whose name does not start with an underscore and whose
        ``__module__`` equals ``_checks.__name__``.
    """
    public = {}
    for attr_name, member in inspect.getmembers(_checks, inspect.isfunction):
        if attr_name.startswith("_"):
            continue
        # Skip functions that were defined in some other module.
        if member.__module__ != _checks.__name__:
            continue
        public[attr_name] = member
    return public


# Use each check's registered name as its pytest parameter id.
_CHECK_IDS = [name for name, _ in _CHECKS]


@pytest.mark.parametrize("name,check_fn", _CHECKS, ids=_CHECK_IDS)
def test_health_check(name, check_fn):
    """Each registered health check should pass.

    The check is called with ``verbose=True``; any exception it raises
    fails the test.
    """
    check_fn(verbose=True)


def test_all_checks_registered():
    """Fail if ``_checks`` defines a public function that ``_CHECKS`` omits."""
    registered = {entry[1] for entry in _CHECKS}

    unregistered = set()
    for fn_name, fn in _get_public_check_functions().items():
        if fn not in registered:
            unregistered.add(fn_name)

    assert not unregistered, (
        f"Public check functions not registered in _CHECKS: {unregistered}"
    )


def test_check_function_signatures():
    """Every registered check must have the signature ``(verbose=False, ..., **kwargs)``.

    This is the calling convention the rapids doctor contract expects:
    ``verbose`` first with a default of ``False``, and a trailing ``**kwargs``.
    """
    for name, check_fn in _CHECKS:
        parameters = list(inspect.signature(check_fn).parameters.values())
        count = len(parameters)

        # Checked first so the indexing below cannot fail.
        assert count >= 2, (
            f"{name}: expected at least 2 parameters (verbose, **kwargs), "
            f"got {count}"
        )

        first = parameters[0]
        last = parameters[-1]
        assert first.name == "verbose", (
            f"{name}: first parameter should be 'verbose', "
            f"got '{first.name}'"
        )
        assert first.default is False, (
            f"{name}: 'verbose' should default to False, "
            f"got {first.default!r}"
        )
        assert last.kind == inspect.Parameter.VAR_KEYWORD, (
            f"{name}: last parameter should be **kwargs, got {last}"
        )