Merged
Commits
38 commits
b8a588b
Backport PR https://github.com/rapidsai/cuml/pull/6234
dantegd Feb 9, 2025
51640c1
Backport PR https://github.com/rapidsai/cuml/pull/6306
dantegd Feb 14, 2025
8d254da
Backport PR https://github.com/rapidsai/cuml/pull/6320
dantegd Feb 17, 2025
f76e8f6
Backport PR https://github.com/rapidsai/cuml/pull/6319
dantegd Feb 20, 2025
fba1b09
Backport PR https://github.com/rapidsai/cuml/pull/6327
dantegd Feb 20, 2025
5dc22ac
Backport PR https://github.com/rapidsai/cuml/pull/6326
dantegd Feb 21, 2025
f1d6e9d
Backport PR https://github.com/rapidsai/cuml/pull/6335
dantegd Feb 21, 2025
4a99c26
Backport PR https://github.com/rapidsai/cuml/pull/6333
dantegd Feb 21, 2025
c757d4f
Backport PR https://github.com/rapidsai/cuml/pull/6336
dantegd Feb 21, 2025
c31decf
Backport PR https://github.com/rapidsai/cuml/pull/6347
dantegd Feb 21, 2025
87e8a9f
Backport PR https://github.com/rapidsai/cuml/pull/6332
dantegd Feb 23, 2025
0094ca9
Backport PR https://github.com/rapidsai/cuml/pull/6348
dantegd Feb 23, 2025
63db155
Backport PR https://github.com/rapidsai/cuml/pull/6337
dantegd Feb 23, 2025
c848c5a
Backport PR https://github.com/rapidsai/cuml/pull/6355
dantegd Feb 23, 2025
ead48ba
Backport PR https://github.com/rapidsai/cuml/pull/6354
dantegd Feb 23, 2025
523d440
FIX remove diffs in preparation of merging
dantegd Feb 23, 2025
4802c61
Backport PR https://github.com/rapidsai/cuml/pull/6223
dantegd Feb 24, 2025
893bae3
Backport PR https://github.com/rapidsai/cuml/pull/6361
dantegd Feb 24, 2025
b3d6f59
Backport PR https://github.com/rapidsai/cuml/pull/6331
dantegd Feb 24, 2025
33a6cf9
Backport PR https://github.com/rapidsai/cuml/pull/6353
dantegd Feb 25, 2025
2e85c91
Backport PR https://github.com/rapidsai/cuml/pull/6142
dantegd Feb 25, 2025
7c0b2b4
Backport PR https://github.com/rapidsai/cuml/pull/6359
dantegd Feb 25, 2025
88b4b0a
Backport PR https://github.com/rapidsai/cuml/pull/6364
dantegd Feb 25, 2025
6dac1c4
Backport PR https://github.com/rapidsai/cuml/pull/6363
dantegd Feb 25, 2025
b31df19
Backport of FIL BATCH_TREE_REORG fix for SM90, 100 and 120
dantegd Feb 25, 2025
51fef8d
Backport PR https://github.com/rapidsai/cuml/pull/6363
dantegd Feb 25, 2025
4656481
Backport PR https://github.com/rapidsai/cuml/pull/6358
dantegd Feb 25, 2025
1754029
Backport PR https://github.com/rapidsai/cuml/pull/6365
dantegd Feb 25, 2025
249f49c
Backport PR https://github.com/rapidsai/cuml/pull/6363 redux
dantegd Feb 25, 2025
4f2cae4
Backport PR https://github.com/rapidsai/cuml/pull/6369 insteaf of 6365
dantegd Feb 25, 2025
7fbeb56
Backport PR https://github.com/rapidsai/cuml/pull/6322
dantegd Feb 26, 2025
df26dfa
Backport PR https://github.com/rapidsai/cuml/pull/6352
dantegd Feb 26, 2025
37ba735
FIX Remove straggling diff files
dantegd Feb 26, 2025
308819a
DOC fix warnings on docs
dantegd Feb 26, 2025
cf34885
Add matplotlib as docs dependency
wphicks Feb 26, 2025
3c06774
DOC fix warnings on docs
dantegd Feb 26, 2025
791f3c1
DOC fix warnings on docs
dantegd Feb 26, 2025
c04669f
Backport PR https://github.com/rapidsai/cuml/pull/6376
dantegd Feb 26, 2025
4 changes: 2 additions & 2 deletions ci/run_cuml_singlegpu_accel_pytests.sh
@@ -1,7 +1,7 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

# Support invoking run_cuml_singlegpu_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/cuml/tests/experimental/accel

python -m pytest -p cuml.experimental.accel --cache-clear "$@" .
python -m pytest -p cuml.accel --cache-clear "$@" .
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -44,6 +44,7 @@ dependencies:
- libcuvs==25.2.*,>=0.0.0a0
- libraft==25.2.*,>=0.0.0a0
- librmm==25.2.*,>=0.0.0a0
- matplotlib
- nbsphinx
- ninja
- nltk
@@ -71,9 +72,9 @@ dependencies:
- scipy>=1.8.0
- seaborn
- spdlog>=1.14.1,<1.15
- sphinx
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- statsmodels
- sysroot_linux-64==2.28
- treelite==4.4.1
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -41,6 +41,7 @@ dependencies:
- libcuvs==25.2.*,>=0.0.0a0
- libraft==25.2.*,>=0.0.0a0
- librmm==25.2.*,>=0.0.0a0
- matplotlib
- nbsphinx
- ninja
- nltk
@@ -67,9 +68,9 @@ dependencies:
- scipy>=1.8.0
- seaborn
- spdlog>=1.14.1,<1.15
- sphinx
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- statsmodels
- sysroot_linux-64==2.28
- treelite==4.4.1
4 changes: 3 additions & 1 deletion cpp/src/fil/fil.cu
@@ -116,7 +116,9 @@ struct forest {
// if n_items was not provided, try from 1 to MAX_N_ITEMS. Otherwise, use as-is.
int min_n_items = ssp.n_items == 0 ? 1 : ssp.n_items;
int max_n_items =
ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? MAX_N_ITEMS : 1) : ssp.n_items;
// we force this to 1 to avoid running into the BATCH_TREE_REORG issue
// ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? MAX_N_ITEMS : 1) : ssp.n_items;
ssp.n_items == 0 ? 1 : ssp.n_items;
for (bool cols_in_shmem : {false, true}) {
ssp.cols_in_shmem = cols_in_shmem;
for (ssp.n_items = min_n_items; ssp.n_items <= max_n_items; ++ssp.n_items) {
19 changes: 17 additions & 2 deletions cpp/src/pca/pca.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,6 +41,7 @@ void truncCompExpVars(const raft::handle_t& handle,
math_t* components,
math_t* explained_var,
math_t* explained_var_ratio,
math_t* noise_vars,
const paramsTSVDTemplate<enum_solver>& prms,
cudaStream_t stream)
{
@@ -67,6 +68,20 @@
prms.n_components,
std::size_t(1),
stream);

// Compute the scalar noise_vars defined as (pseudocode)
// (n_components < min(n_cols, n_rows)) ? explained_var_all[n_components:].mean() : 0
if (prms.n_components < prms.n_cols && prms.n_components < prms.n_rows) {
raft::stats::mean(noise_vars,
explained_var_all.data() + prms.n_components,
std::size_t{1},
prms.n_cols - prms.n_components,
false,
true,
stream);
} else {
raft::matrix::setValue(noise_vars, noise_vars, math_t{0}, 1, stream);
}
}

/**
@@ -116,7 +131,7 @@ void pcaFit(const raft::handle_t& handle,
raft::stats::cov(
handle, cov.data(), input, mu, prms.n_cols, prms.n_rows, true, false, true, stream);
truncCompExpVars(
handle, cov.data(), components, explained_var, explained_var_ratio, prms, stream);
handle, cov.data(), components, explained_var, explained_var_ratio, noise_vars, prms, stream);

math_t scalar = (prms.n_rows - 1);
raft::matrix::seqRoot(explained_var, singular_vals, scalar, n_components, stream, true);
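
In NumPy terms, the ``noise_vars`` value introduced by this hunk is the mean of the discarded covariance eigenvalues (zero when all components are kept), matching the pseudocode comment in the diff and scikit-learn's ``PCA.noise_variance_`` semantics. A minimal sketch with a toy eigenvalue spectrum (the helper name is hypothetical):

```python
import numpy as np

def noise_variance(explained_var_all, n_components, n_rows, n_cols):
    # Mean of the eigenvalues dropped by truncation, as in the diff's
    # pseudocode; zero when no components are discarded.
    if n_components < min(n_rows, n_cols):
        return explained_var_all[n_components:].mean()
    return 0.0

# Toy covariance spectrum: keep 2 of 4 components.
ev = np.array([4.0, 2.0, 1.0, 0.5])
print(noise_variance(ev, 2, n_rows=100, n_cols=4))  # (1.0 + 0.5) / 2 = 0.75
```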
21 changes: 16 additions & 5 deletions cpp/src/pca/pca_mg.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ void fit_impl(raft::handle_t& handle,
Stats::opg::cov(handle, cov, input_data, input_desc, mu_data, true, streams, n_streams);

ML::truncCompExpVars<T, mg_solver>(
handle, cov.ptr, components, explained_var, explained_var_ratio, prms, streams[0]);
handle, cov.ptr, components, explained_var, explained_var_ratio, noise_vars, prms, streams[0]);

T scalar = (prms.n_rows - 1);
raft::matrix::seqRoot(explained_var, singular_vals, scalar, prms.n_components, streams[0], true);
@@ -128,9 +128,6 @@ void fit_impl(raft::handle_t& handle,
streams,
n_streams,
verbose);
for (std::uint32_t i = 0; i < n_streams; i++) {
handle.sync_stream(streams[i]);
}
} else if (prms.algorithm == mg_solver::QR) {
const raft::handle_t& h = handle;
cudaStream_t stream = h.get_stream();
@@ -194,6 +191,20 @@ void fit_impl(raft::handle_t& handle,
std::size_t(1),
stream);

// Compute the scalar noise_vars defined as (pseudocode)
// (n_components < min(n_cols, n_rows)) ? explained_var_all[n_components:].mean() : 0
if (prms.n_components < prms.n_cols && prms.n_components < prms.n_rows) {
raft::stats::mean(noise_vars,
explained_var_all.data() + prms.n_components,
std::size_t{1},
prms.n_cols - prms.n_components,
false,
true,
stream);
} else {
raft::matrix::setValue(noise_vars, noise_vars, T{0}, 1, stream);
}

raft::linalg::transpose(vMatrix.data(), prms.n_cols, stream);
raft::matrix::truncZeroOrigin(
vMatrix.data(), prms.n_cols, components, prms.n_components, prms.n_cols, stream);
8 changes: 6 additions & 2 deletions cpp/src/umap/fuzzy_simpl_set/naive.cuh
@@ -313,14 +313,16 @@ void launcher(int n,

raft::sparse::COO<value_t> in(stream, n * n_neighbors, n, n);

CUML_LOG_DEBUG("Smooth kNN Distances");
/*
// check for logging in order to avoid the potentially costly `arr2Str` call!
if (ML::default_logger().should_log(ML::level_enum::trace)) {
CUML_LOG_DEBUG("Smooth kNN Distances");
auto str = raft::arr2Str(sigmas.data(), 25, "sigmas", stream);
CUML_LOG_TRACE("%s", str.c_str());
str = raft::arr2Str(rhos.data(), 25, "rhos", stream);
CUML_LOG_TRACE("%s", str.c_str());
}
*/

RAFT_CUDA_TRY(cudaPeekAtLastError());

Expand All @@ -342,12 +344,14 @@ void launcher(int n,
n_neighbors);
RAFT_CUDA_TRY(cudaPeekAtLastError());

CUML_LOG_DEBUG("Compute Membership Strength");
/*
if (ML::default_logger().should_log(ML::level_enum::trace)) {
CUML_LOG_DEBUG("Compute Membership Strength");
std::stringstream ss;
ss << in;
CUML_LOG_TRACE(ss.str().c_str());
}
*/

/**
* Combines all the fuzzy simplicial sets into a global
4 changes: 3 additions & 1 deletion cpp/src/umap/simpl_set_embed/algo.cuh
@@ -341,11 +341,13 @@ void launcher(

make_epochs_per_sample(out.vals(), out.nnz, n_epochs, epochs_per_sample.data(), stream);

if (ML::default_logger().should_log(ML::level_enum::trace)) {
/*
if (ML::default_logger().should_log(ML::level_enum::debug)) {
std::stringstream ss;
ss << raft::arr2Str(epochs_per_sample.data(), out.nnz, "epochs_per_sample", stream);
CUML_LOG_TRACE(ss.str().c_str());
}
*/

optimize_layout<TPB_X, T>(embedding,
m,
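
For context on the values being logged here: the ``epochs_per_sample`` array follows the standard UMAP edge-sampling scheme, where stronger edges are optimized every few epochs and edges too weak to be sampled even once are marked with -1. A sketch after the reference umap-learn implementation (an assumption — this is not this file's CUDA code):

```python
import numpy as np

def make_epochs_per_sample(weights, n_epochs):
    # Each edge i is sampled roughly every epochs_per_sample[i] epochs,
    # proportionally to its membership weight; -1 means "never sampled".
    result = -1.0 * np.ones(weights.shape[0])
    n_samples = n_epochs * (weights / weights.max())
    result[n_samples > 0] = float(n_epochs) / n_samples[n_samples > 0]
    return result

print(make_epochs_per_sample(np.array([1.0, 0.5, 0.25]), 200))  # [1. 2. 4.]
```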
8 changes: 6 additions & 2 deletions cpp/src/umap/supervised.cuh
@@ -301,8 +301,9 @@ void perform_general_intersection(const raft::handle_t& handle,
handle, y_inputs, y_inputs, knn_graph, params->target_n_neighbors, params, stream);
RAFT_CUDA_TRY(cudaPeekAtLastError());

CUML_LOG_DEBUG("Target kNN Graph");
if (ML::default_logger().should_log(ML::level_enum::trace)) {
/*
if (ML::default_logger().should_log(ML::level_enum::debug)) {
CUML_LOG_DEBUG("Target kNN Graph");
std::stringstream ss1, ss2;
ss1 << raft::arr2Str(
y_knn_indices.data(), rgraph_coo->n_rows * params->target_n_neighbors, "knn_indices", stream);
@@ -311,6 +312,7 @@
y_knn_dists.data(), rgraph_coo->n_rows * params->target_n_neighbors, "knn_dists", stream);
CUML_LOG_TRACE("%s", ss2.str().c_str());
}
*/

/**
* Compute fuzzy simplicial set
Expand All @@ -326,12 +328,14 @@ void perform_general_intersection(const raft::handle_t& handle,
stream);
RAFT_CUDA_TRY(cudaPeekAtLastError());

/*
if (ML::default_logger().should_log(ML::level_enum::debug)) {
CUML_LOG_DEBUG("Target Fuzzy Simplicial Set");
std::stringstream ss;
ss << ygraph_coo;
CUML_LOG_DEBUG(ss.str().c_str());
}
*/

/**
* Compute general simplicial set intersection.
3 changes: 2 additions & 1 deletion dependencies.yaml
@@ -432,11 +432,12 @@ dependencies:
- ipykernel
- nbsphinx
- numpydoc
- matplotlib
# https://github.com/pydata/pydata-sphinx-theme/issues/1539
- pydata-sphinx-theme!=0.14.2
- recommonmark
- &scikit_learn scikit-learn==1.5.*
- sphinx
- sphinx<8.2.0
- sphinx-copybutton
- sphinx-markdown-tables
- output_types: conda
Binary file added docs/source/img/inference_overhead.png
Binary file added docs/source/img/inference_speedup.png
Binary file added docs/source/img/overall_overhead.png
Binary file added docs/source/img/overall_speedup.png
Binary file added docs/source/img/skinny_speedup.png
Binary file added docs/source/img/wide_speedup.png
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -24,6 +24,7 @@ Support for Windows is possible in the near future.
cuml_intro.rst
api.rst
user_guide.rst
zero-code-change.rst
cuml_blogs.rst


59 changes: 59 additions & 0 deletions docs/source/zero-code-change-benchmarks.rst
@@ -0,0 +1,59 @@
cuml.accel: Zero Code Change Acceleration Benchmarks
====================================================

cuML offers accelerated inference and training for classical ML models using NVIDIA GPUs. With ``cuml.accel``, you can get the benefit of similar acceleration in existing Scikit-Learn, UMAP, and HDBSCAN scripts without changing a line of code. While the exact speedup depends on the model, dataset size, and hyperparameters, the following benchmarks should give a general sense of the benefit you're likely to observe when using ``cuml.accel``.
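
As a minimal sketch of the zero-code-change workflow (the script below is ordinary scikit-learn with a hypothetical filename; the ``python -m cuml.accel`` launcher is how the accelerator is documented to be invoked):

```python
# score_sketch.py -- plain scikit-learn, no cuML imports anywhere.
#   CPU run:             python score_sketch.py
#   GPU-accelerated run: python -m cuml.accel score_sketch.py
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = 2.0 * X.ravel() + 1.0  # exact line: slope 2, intercept 1

model = LinearRegression().fit(X, y)
print(round(float(model.coef_[0]), 3), round(float(model.intercept_), 3))  # 2.0 1.0
```

The same file runs under both commands; the accelerator transparently dispatches supported estimators to the GPU.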

Training
--------

While both training and inference benefit from GPU acceleration, cuML tends to offer even more significant gains for training than for inference. Training speedups of 2x to 80x are typical with cuML, especially on large datasets. ``cuml.accel`` offers similar speedups without requiring any cuML-specific code changes.

Relatively complex manifold algorithms like HDBSCAN, t-SNE, and UMAP tend to see the largest benefit from ``cuml.accel``. Speedups of 60x to 300x are typical for realistic workloads. Simpler algorithms like KMeans and Random Forest can also see speedups of 15x to 80x. Even the simplest ML algorithms, like Logistic Regression, Lasso, PCA, and Ridge, will typically see speedups of 2x to 10x.

In the following chart, we see the relative speedup obtained by running the same code with and without ``cuml.accel``. The datasets for these workloads range in width from 8 to 512 features. As we can see, ``cuml.accel`` offered the most benefit for HDBSCAN, with a 179x speedup, but even KNeighborsRegressor saw a 2x speedup.

.. image:: img/overall_speedup.png
:alt: Overall speedup


What’s the overhead compared to invoking cuML directly?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

While ``cuml.accel`` aims to provide as much acceleration as cuML-specific scripts, there is some overhead relative to direct cuML invocations. So one might reasonably wonder at what point it makes sense to rewrite code and invoke cuML directly to squeeze out every bit of performance. While the exact amount of overhead depends on the estimator, parameters, and data size, the overhead is typically quite low for model training, though some algorithms have a bit more overhead than others:

.. image:: img/overall_overhead.png
:alt: Overall overhead

The differences can be attributed to one main factor: training is typically quite computationally expensive, so the cost of transferring data from CPU to GPU and the machinery of the cuML Accelerator do not affect the runtime significantly. But even here, one can immediately notice that the overhead is more significant for simpler tasks, for example training ``KNeighbors`` models. There, using cuML directly can be significantly faster if one wants to get the maximum performance out of a GPU, though it's important to note that the difference in execution time is a matter of seconds versus milliseconds.

It’s also important to note how dataset shape influences these gains. For skinny datasets — where you have relatively few features but many rows — GPU acceleration still provides a great performance boost, although the relative advantage may be more modest for simpler algorithms that are already quite fast on CPU. The following benchmark shows speedups for datasets with 8 and 16 features:

.. image:: img/skinny_speedup.png
:alt: Skinny speedup

Wide datasets, on the other hand, truly showcase the accelerator's strengths. High-dimensional tasks often require intense computation and can bog down CPU-based workflows. In these cases, the cuML Accelerator steps in to deliver some of its most dramatic speedups, especially for dimension reduction methods (t-SNE, UMAP) and other math-heavy operations. It's not uncommon that a task that was previously infeasible, like incorporating UMAP and HDBSCAN in complex, high-dimensional workflows, can now easily be achieved thanks to cuML and ``cuml.accel``. The following benchmark shows those speedups for datasets with 128, 256 and 512 features:

.. image:: img/wide_speedup.png
:alt: Wide speedup


Inference
----------


While the accelerator also speeds up inference, the gains tend to be smaller in absolute terms because inference is usually much faster than training to begin with. Still, a 2×–7× improvement (as with KNeighbors or RandomForest) can be critical for running large-scale or real-time predictions. Especially for large-batch or repeated inference scenarios, the GPU acceleration can provide significant value.


.. image:: img/inference_speedup.png
:alt: Inference Speedup


For smaller datasets, the data transfer becomes a bigger slice of the total runtime, which means that, especially for many tiny batches, the overhead might eat up most (or all!) of the benefit from running an accelerated algorithm on the GPU. In those cases it becomes especially important to avoid unnecessary data transfers, e.g., by explicitly keeping inputs and outputs on the GPU, for instance in the form of CuPy arrays. This is not possible in accelerator mode, which is why, for these workflows, it may be more advisable to invoke cuML directly with GPU-native data types to preserve those speedups.

.. image:: img/inference_overhead.png
:alt: Inference overhead
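
To make the per-batch overhead point concrete, here is a pure-NumPy stand-in (not a cuML or GPU benchmark — purely an illustration that a fixed per-call cost, which on a GPU would include the host-to-device copy, is amortized far better by one large batch than by many tiny ones):

```python
import time
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((100_000, 64))
W = rng.standard_normal((64, 1))

# One large batch: the fixed call overhead is paid once.
t0 = time.perf_counter()
_ = X @ W
one_large = time.perf_counter() - t0

# Many tiny batches: identical math, but the fixed overhead is paid
# 1000 times over.
t0 = time.perf_counter()
for chunk in np.split(X, 1000):
    _ = chunk @ W
many_small = time.perf_counter() - t0

print(f"one large batch:   {one_large:.4f}s")
print(f"many tiny batches: {many_small:.4f}s")
```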


Overall, these benchmarks underscore how the cuML Accelerator can radically cut down training times across an extensive range of machine learning tasks while still offering meaningful inference improvements, all without requiring changes to existing code, making it a compelling choice for end-to-end ML pipelines and tools.

As this is the first beta release, performance optimization and algorithm coverage are active areas of development that will see improvements in upcoming RAPIDS releases.