41 changes: 40 additions & 1 deletion docs/source/conf.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
#
# This file is execfile()d with the current directory set to its
# containing dir.
@@ -16,6 +16,7 @@
#
import os
import sys
import textwrap

from packaging.version import Version

@@ -192,10 +193,48 @@
numpydoc_class_members_toctree = False


# Redirects
REDIRECTS = {
"zero-code-change.html": "cuml-accel/index.html",
"zero-code-change-benchmarks.html": "cuml-accel/benchmarks.html",
"zero-code-change-limitations.html": "cuml-accel/limitations.html",
"zero-code-change-logging.html": "cuml-accel/logging-and-profiling.html",
"zero_code_change_examples/plot_kmeans_digits/index.html": (
"cuml-accel/examples/plot_kmeans_digits.html"
),
}


def setup_redirects(app, exception):
"""Generates redirect pages for moved content when building html docs"""
template = textwrap.dedent(
"""
<html>
<head>
<meta http-equiv="refresh" content="1; url={new_path}" />
<script>
window.location.href = "{new_path}"
</script>
</head>
</html>
"""
).strip()

if app.builder.name == "html":
for old_path, new_path in REDIRECTS.items():
# make new_path relative to old_path
new_path = f"{'../' * old_path.count('/')}{new_path}"
redirect_path = os.path.join(app.outdir, old_path)
os.makedirs(os.path.dirname(redirect_path), exist_ok=True)
with open(redirect_path, "w") as f:
f.write(template.format(new_path=new_path))


def setup(app):
app.add_css_file("references.css")
app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer")
app.connect("build-finished", setup_redirects)


# The following is used by sphinx.ext.linkcode to provide links to github
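The relative-path rewrite inside `setup_redirects` above is compact enough to miss. Below is a hedged standalone sketch (not part of the committed `conf.py`; the helper name `relative_target` is invented for illustration) showing how prefixing one `../` per path separator in the old location makes the redirect target resolve correctly from wherever the stub page lives:

```python
# Standalone sketch of the relative-path logic used by setup_redirects.
# The REDIRECTS entries mirror the ones added to conf.py.
REDIRECTS = {
    "zero-code-change.html": "cuml-accel/index.html",
    "zero_code_change_examples/plot_kmeans_digits/index.html": (
        "cuml-accel/examples/plot_kmeans_digits.html"
    ),
}


def relative_target(old_path, new_path):
    # Each '/' in old_path means the old page sits one directory deeper,
    # so the redirect needs one more '../' to climb back to the docs root.
    return f"{'../' * old_path.count('/')}{new_path}"


for old, new in REDIRECTS.items():
    print(f"{old} -> {relative_target(old, new)}")
```

A top-level page redirects with no prefix, while `zero_code_change_examples/plot_kmeans_digits/index.html` (two separators) gets `../../` prepended so the browser climbs out of its two directories before descending into `cuml-accel/`.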
@@ -1,5 +1,5 @@
cuml.accel: Zero Code Change Acceleration Benchmarks
====================================================
Benchmarks
==========

cuML offers accelerated inference and training for classical ML models using NVIDIA GPUs. With cuml.accel, you can get the benefit of similar acceleration in existing Scikit-Learn, UMAP, and HDBSCAN scripts without changing a line of code. While the exact speedup depends on the model, dataset size, and hyperparameters, the following benchmarks should give a general sense of the benefit you're likely to observe when using `cuml.accel`.

@@ -12,7 +12,7 @@ Relatively complex manifold algorithms like HDBSCAN, t-SNE, and UMAP tend to see

In the following chart, we see the relative speedup obtained by running the same code with and without `cuml.accel`. The datasets for these workloads range in width from 8 to 512 features. As we can see, `cuml.accel` offered the most benefit for HDBSCAN, with a 179x speedup, but even KNeighborsRegressor saw a 2x speedup.

.. image:: img/overall_speedup.png
.. image:: ../img/overall_speedup.png
:alt: Overall speedup


@@ -21,19 +21,19 @@ What’s the overhead compared to invoking cuML directly?

While cuml.accel tries to provide as much acceleration as cuML-specific scripts, there is some overhead relative to direct cuML invocations. So one might reasonably wonder at what point it makes sense to rewrite code and invoke cuML directly to squeeze out every bit of performance. While the exact amount of overhead depends on the estimator, parameters, and data size, the overhead is typically quite low for model training, though some algorithms carry a bit more overhead than others:

.. image:: img/overall_overhead.png
.. image:: ../img/overall_overhead.png
:alt: Overall overhead

The differences can be attributed to one main factor: training is typically quite computationally expensive, so the cost of transferring data from CPU to GPU and the overhead of the cuML Accelerator machinery don't affect the runtime significantly. Even here, one can immediately notice that the overhead is more significant for simpler tasks, for example training ``KNeighbors`` models. There, using cuML directly can be significantly faster if one wants the maximum performance of a GPU, though it's important to note that the difference in execution time is one of seconds versus milliseconds.

It’s also important to note how dataset shape influences these gains. For skinny datasets (relatively few features but many rows), GPU acceleration still provides a great performance boost, although the relative advantage may be more modest for simpler algorithms that are already quite fast on CPU. The following benchmark shows speedups for datasets with 8 and 16 features:

.. image:: img/skinny_speedup.png
.. image:: ../img/skinny_speedup.png
:alt: Skinny speedup

Wide datasets, on the other hand, truly showcase the accelerator’s strengths. High-dimensional tasks often require intense computation and can bog down CPU-based workflows. In these cases, the cuML Accelerator steps in to deliver some of its most dramatic speedups, especially for dimension reduction methods (t-SNE, UMAP) and other math-heavy operations. It's not uncommon that a task that was previously infeasible, like incorporating UMAP and HDBSCAN in complex, high-dimensional workflows, can now easily be achieved thanks to cuML and ``cuml.accel``. The following benchmark shows those speedups for datasets with 128, 256 and 512 features:

.. image:: img/wide_speedup.png
.. image:: ../img/wide_speedup.png
:alt: Wide speedup


@@ -44,13 +44,13 @@ Inference
While the accelerator also speeds up inference, the gains tend to be smaller in absolute terms because inference is usually much faster than training to begin with. Still, a 2×–7× improvement (as with KNeighbors or RandomForest) can be critical for running large-scale or real-time predictions. Especially for large-batch or repeated inference scenarios, the GPU acceleration can provide significant value.


.. image:: img/inference_speedup.png
.. image:: ../img/inference_speedup.png
:alt: Inference Speedup


For smaller datasets, the data transfer becomes a bigger slice of the total runtime, which means that especially for many tiny batches, the overhead might eat up most (or all!) of the benefit of running an accelerated algorithm on the GPU. In those cases it becomes especially important to avoid unnecessary data transfers, for example by explicitly keeping inputs and outputs on the GPU as CuPy arrays. This is not possible in accelerator mode, which is why, for these workflows, it may be more advisable to invoke cuML directly with GPU-native data types to preserve those speedups.

.. image:: img/inference_overhead.png
.. image:: ../img/inference_overhead.png
:alt: Inference overhead
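
The direct-invocation pattern described above can be sketched as follows. This is a hedged example, not taken from the docs being changed: it assumes a CUDA-capable GPU with cuML and CuPy installed, and the dataset shapes are arbitrary. The point is simply that inputs allocated as CuPy arrays stay GPU-resident across repeated `predict` calls, avoiding the per-batch transfer overhead discussed in the benchmark:

```python
# Sketch: keep data on the GPU with CuPy and call cuML directly,
# so repeated predict() calls avoid CPU<->GPU transfer overhead.
import cupy as cp
from cuml.neighbors import KNeighborsRegressor

# Data is allocated directly on the GPU; no host copy is involved.
X = cp.random.rand(100_000, 16, dtype=cp.float32)
y = cp.random.rand(100_000, dtype=cp.float32)

model = KNeighborsRegressor(n_neighbors=5)
model.fit(X, y)

# Inputs and outputs stay GPU-resident across many small batches.
for batch in cp.split(X[:1_000], 10):
    preds = model.predict(batch)
```

Under ``cuml.accel``, by contrast, inputs arrive as NumPy arrays and must be copied to the device on every call, which is exactly the overhead shown in the chart above.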


9 changes: 9 additions & 0 deletions docs/source/cuml-accel/examples/index.rst
@@ -0,0 +1,9 @@
Examples
========

Here we provide a few examples of using ``cuml.accel``:

.. toctree::
:maxdepth: 1

plot_kmeans_digits.ipynb
@@ -4,10 +4,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# cuml.accel: Zero-Code Change Acceleration with NVIDIA GPUs.\n",
"`cuml.accel` is a new tool in cuML which allows you to accelerate [Scikit-Learn](https://scikit-learn.org/1.5/index.html) estimators on NVIDIA GPUs without having to make any changes to your scripts or notebooks. The following notebook is taken directly from the [Scikit-Learn example gallery](https://scikit-learn.org/1.5/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py) to demonstrate how unaltered Scikit-Learn code can be accelerated with `cuml.accel`.\n",
"# A demo of K-Means clustering on the handwritten digits data\n",
"\n",
"*cuml.accel is a new tool in cuML which allows you to accelerate [Scikit-Learn](https://scikit-learn.org/1.5/index.html) estimators on NVIDIA GPUs without having to make any changes to your scripts or notebooks. The following notebook is taken directly from the [Scikit-Learn example gallery](https://scikit-learn.org/1.5/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py) to demonstrate how unaltered Scikit-Learn code can be accelerated with cuml.accel.*\n",
"\n",
"As always, please remember to [cite](https://scikit-learn.org/1.5/about.html#citing-scikit-learn) Scikit-Learn in any publication based on the fantastic work of the Scikit-Learn community."
"*As always, please remember to [cite](https://scikit-learn.org/1.5/about.html#citing-scikit-learn) Scikit-Learn in any publication based on the fantastic work of the Scikit-Learn community.*"
]
},
{
@@ -16,17 +17,17 @@
"metadata": {},
"outputs": [],
"source": [
"# The following magic is the only change required to enable GPU acceleration with cuml.accel\n",
"%load_ext cuml.accel\n",
"# If you wish to see results WITHOUT cuml.accel, be sure to comment out the above AND restart the notebook kernel"
"# The following magic is the only change required to enable GPU acceleration\n",
"# with cuml.accel. If you wish to see results WITHOUT cuml.accel, be sure\n",
"# to comment out this cell AND restart the notebook kernel\n",
"%load_ext cuml.accel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# A demo of K-Means clustering on the handwritten digits data\n",
"---\n",
"\n",
"In this example we compare the various initialization strategies for K-means in\n",
"terms of runtime and quality of the results.\n",
@@ -37,6 +38,7 @@
"Cluster quality metrics evaluated (see `clustering_evaluation` for\n",
"definitions and discussions of the metrics):\n",
"\n",
"```\n",
"=========== ========================================================\n",
"Shorthand full name\n",
"=========== ========================================================\n",
@@ -46,7 +48,8 @@
"ARI adjusted Rand index\n",
"AMI adjusted mutual information\n",
"silhouette silhouette coefficient\n",
"=========== ========================================================\n"
"=========== ========================================================\n",
"```"
]
},
{
@@ -319,7 +322,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
"version": "3.13.5"
}
},
"nbformat": 4,
@@ -1,54 +1,5 @@
cuml.accel: Zero Code Change Acceleration for Scikit-Learn, UMAP and HDBSCAN
============================================================================

Starting in RAPIDS 25.02.01, cuML offers a new way to accelerate existing code
based on Scikit-Learn, UMAP-Learn, and HDBSCAN. Instead of rewriting that code
to import equivalent cuML functionality, simply invoke your existing,
unaltered Python script as follows, and cuML will accelerate as much of the
code as possible with NVIDIA GPUs, falling back to CPU where necessary:

.. code-block:: console

python -m cuml.accel unchanged_script.py

The same functionality is available in Jupyter notebooks using the
following magic at the beginning of the notebook (before other imports):

.. code-block::

%load_ext cuml.accel
import sklearn

You can see an example of this in
`KMeans Digits Notebook <zero_code_change_examples/plot_kmeans_digits.ipynb>`_, where an unmodified
Scikit-Learn example notebook is used to demonstrate how ``cuml.accel`` can be
used in Jupyter.

In any Python environment, the following code snippet can also be used to
activate ``cuml.accel`` if it is run prior to importing the module you wish to
accelerate:

.. code-block:: python

from cuml.accel import install
install()
import sklearn

**``cuml.accel`` is currently a beta feature and will continue to improve over
time.**

.. toctree::
:maxdepth: 2
:caption: Contents:

zero-code-change-limitations.rst
zero-code-change-benchmarks.rst
zero-code-change-logging.rst
zero_code_change_examples/plot_kmeans_digits.ipynb


FAQs
----
FAQ
---

1. Why use cuml.accel instead of using cuML directly?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -253,4 +204,4 @@ accelerated on GPU and which fall back to CPU execution.
``install(log_level="info")``

For detailed information about logging and troubleshooting, see
:doc:`zero-code-change-logging`.
:doc:`logging-and-profiling`.
81 changes: 81 additions & 0 deletions docs/source/cuml-accel/index.rst
@@ -0,0 +1,81 @@
cuml.accel
==========

The ``cuml.accel`` zero code change accelerator provides a mechanism to
accelerate existing Python machine learning code on the GPU, *without requiring
any changes to that code*. Depending on the data size and algorithms chosen,
this may result in :doc:`major speedups <benchmarks>`.

.. code-block:: python

%load_ext cuml.accel
# Certain operations in common ML libraries (sklearn, umap, hdbscan)
# are now GPU accelerated

from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet

X, y = make_regression(n_samples=1_000_000)

model = ElasticNet()
model.fit(X, y) # runs on GPU!
model.predict(X) # runs on GPU!

Currently ``cuml.accel`` targets ``sklearn``, ``umap``, and ``hdbscan`` as
libraries to accelerate. Functionality that isn't yet supported will fall back
to CPU execution. See :doc:`limitations` for more information on what's
currently accelerated and what requires a CPU fallback.

Usage
-----

``cuml.accel`` comes standard with ``cuml``; no additional installation is
required. It's designed to be used with existing code that makes use of
``sklearn``, ``umap``, or ``hdbscan``, with the only change being something
to enable the accelerator.

Command Line Interface (CLI)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When executing from the command line, you can use ``python -m cuml.accel`` in
place of ``python`` to execute python code with the accelerator enabled.

.. code-block:: console

python -m cuml.accel script.py


Jupyter/IPython
~~~~~~~~~~~~~~~

The same functionality is available in Jupyter notebooks or IPython by
executing the following magic at the top of the notebook (before other imports):

.. code-block::

%load_ext cuml.accel

You can see an example of this in :doc:`this example
<examples/plot_kmeans_digits>`.

Enabling Programmatically
~~~~~~~~~~~~~~~~~~~~~~~~~

When needed, the accelerator may also be enabled programmatically by calling
``cuml.accel.install``. Note that you'll want to call this early in your code,
before importing functionality from ``sklearn``/``umap``/``hdbscan``.

.. code-block:: python

import cuml
cuml.accel.install()

.. toctree::
:hidden:

self
logging-and-profiling.rst
limitations.rst
faq.rst
benchmarks.rst
examples/index.rst
@@ -18,7 +18,7 @@ These limitations fall into a few categories:
- Estimators that are only partially accelerated. ``cuml.accel`` will fall back
to using the CPU implementations for some algorithms in the presence of
certain hyperparameters or input types. These cases are documented below in
estimator-specific sections. See :doc:`zero-code-change-logging` for how to
estimator-specific sections. See :doc:`logging-and-profiling` for how to
enable logging to gain insight into when ``cuml.accel`` needs to fall back to
CPU.

15 changes: 3 additions & 12 deletions docs/source/index.rst
@@ -16,20 +16,11 @@ contributors, users and hobbyists! Thank you for your wonderful support!
cuML is only supported on Linux operating systems.

.. toctree::
:maxdepth: 2
:caption: Contents:
:hidden:

cuml_intro.rst
api.rst
user_guide.rst
zero-code-change.rst
cuml-accel/index.rst
api.rst
FIL.rst
cuml_blogs.rst


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`