/*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
#include <thrust/reverse.h>
+#include <thrust/transform.h>

#include <cuvs/distance/distance.hpp>
#include <cuvs/distance/grammian.hpp>
@@ -40,6 +42,50 @@ namespace SVM {

namespace { // unnamed namespace to avoid multiple definition error

+/**
+ * @brief Extract columns from a matrix for precomputed kernels
+ *
+ * Given a matrix src of shape (n_rows_src, n_cols_src), extract columns
+ * specified by col_indices and store in dst of shape (n_rows_src, n_cols_dst).
+ *
+ * @param [out] dst destination matrix, size [n_rows_src x n_cols_dst]
+ * @param [in] src source matrix, size [n_rows_src x n_cols_src]
+ * @param [in] n_rows_src number of rows in source matrix
+ * @param [in] col_indices column indices to extract, size [n_cols_dst]
+ * @param [in] n_cols_dst number of columns to extract
+ */
+template <typename math_t>
+CUML_KERNEL void extractColumnsKernel(
+  math_t* dst, const math_t* src, int n_rows_src, const int* col_indices, int n_cols_dst)
+{
+  int64_t tid = static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * blockDim.x;
+  int64_t total = static_cast<int64_t>(n_rows_src) * n_cols_dst;
+  if (tid < total) {
+    int64_t row = tid % n_rows_src;
+    int64_t col = tid / n_rows_src;
+    int src_col = col_indices[col];
+    // Both source and destination are column-major:
+    // src[row, col] = src[row + col * n_rows_src]
+    // dst[row, col] = dst[row + col * n_rows_src] = dst[tid]
+    dst[tid] = src[row + static_cast<int64_t>(src_col) * n_rows_src];
+  }
+}
+
+template <typename math_t>
+void extractColumnsForPrecomputed(math_t* dst,
+                                  const math_t* src,
+                                  int n_rows_src,
+                                  const int* col_indices,
+                                  int n_cols_dst,
+                                  cudaStream_t stream)
+{
+  int total = n_rows_src * n_cols_dst;
+  int TPB = 256;
+  int n_blocks = raft::ceildiv(total, TPB);
+  extractColumnsKernel<math_t>
+    <<<n_blocks, TPB, 0, stream>>>(dst, src, n_rows_src, col_indices, n_cols_dst);
+}
+
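+// Illustrative example of the column extraction above (not part of the helper's API):
+// for a column-major 3 x 4 src and col_indices = {2, 0}, the kernel copies
+//   dst[row + 0 * 3] = src[row + 2 * 3]  (dst column 0 <- src column 2)
+//   dst[row + 1 * 3] = src[row + 0 * 3]  (dst column 1 <- src column 0)
+// for row = 0, 1, 2, so dst is the 3 x 2 matrix made of the selected columns.
+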
/**
 * @brief Re-raise working set indexes to SVR scope [0..2*n_rows)
 *
@@ -322,6 +368,8 @@ class KernelCache {
   * @param dense_extract_byte_limit sparse rows will be extracted as dense
   *        up to this limit to speed up kernel computation. Only valid
   *        for sparse input. (default 1GB)
+   * @param is_precomputed if true, the matrix is a precomputed kernel matrix
+   *        and no kernel computation is performed
   */
  KernelCache(const raft::handle_t& handle,
              MatrixViewType matrix,
@@ -333,7 +381,8 @@ class KernelCache {
              float cache_size = 200,
              SvmType svmType = C_SVC,
              size_t kernel_tile_byte_limit = 1 << 30,
-              size_t dense_extract_byte_limit = 1 << 30)
+              size_t dense_extract_byte_limit = 1 << 30,
+              bool is_precomputed = false)
    : batch_cache(n_rows, cache_size, handle.get_stream()),
      handle(handle),
      kernel(kernel),
@@ -343,6 +392,7 @@ class KernelCache {
      n_cols(n_cols),
      n_ws(n_ws),
      svmType(svmType),
+      is_precomputed(is_precomputed),
      kernel_tile(0, handle.get_stream()),
      matrix_l2(0, handle.get_stream()),
      matrix_l2_ws(0, handle.get_stream()),
@@ -353,7 +403,7 @@ class KernelCache {
      indptr_batched(0, handle.get_stream()),
      ws_cache_idx(n_ws * 2, handle.get_stream())
  {
-    ASSERT(kernel != nullptr, "Kernel pointer required for KernelCache!");
+    ASSERT(kernel != nullptr || is_precomputed, "Kernel pointer required for KernelCache!");
    stream = handle.get_stream();

    batching_enabled = false;
@@ -386,8 +436,8 @@ class KernelCache {
      x_ws_dense.resize(n_ws * static_cast<size_t>(n_cols), stream);
    }

-    // store matrix l2 norm for RBF kernels
-    if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
+    // store matrix l2 norm for RBF kernels (not needed for precomputed)
+    if (!is_precomputed && kernel_type == cuvs::distance::kernels::KernelType::RBF) {
      matrix_l2.resize(n_rows, stream);
      matrix_l2_ws.resize(n_ws, stream);
      ML::SVM::matrixRowNorm(handle, matrix, matrix_l2.data(), raft::linalg::NormType::L2Norm);
@@ -507,33 +557,41 @@ class KernelCache {
      ML::SVM::extractRows<math_t>(matrix, x_ws_dense.data(), ws_idx_mod.data(), n_ws, handle);
    }

-    // extract dot array for RBF
-    if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
-      selectValueSubset(matrix_l2_ws.data(), matrix_l2.data(), ws_idx_mod.data(), n_ws);
-    }
+    if (is_precomputed) {
+      // For precomputed kernels, x_ws_dense contains K[ws, :] (shape n_ws x n_cols)
+      // We need to extract columns ws to get K[ws, ws]
+      // Since n_cols == n_rows for precomputed, we extract columns using ws_idx_mod
+      extractColumnsForPrecomputed(
+        kernel_tile.data(), x_ws_dense.data(), n_ws, ws_idx_mod.data(), n_ws, stream);
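+      // kernel_tile now holds the n_ws x n_ws working-set tile K[ws, ws] in column-major order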
+    } else {
+      // extract dot array for RBF
+      if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
+        selectValueSubset(matrix_l2_ws.data(), matrix_l2.data(), ws_idx_mod.data(), n_ws);
+      }

-    // compute kernel
-    {
-      if (sparse_extract) {
-        auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_ws, n_cols);
-        KernelOp(handle,
-                 kernel,
-                 ws_view,
-                 ws_view,
-                 kernel_tile.data(),
-                 matrix_l2_ws.data(),
-                 matrix_l2_ws.data());
-      } else {
-        KernelOp(handle,
-                 kernel,
-                 x_ws_dense.data(),
-                 n_ws,
-                 n_cols,
-                 x_ws_dense.data(),
-                 n_ws,
-                 kernel_tile.data(),
-                 matrix_l2_ws.data(),
-                 matrix_l2_ws.data());
+      // compute kernel
+      {
+        if (sparse_extract) {
+          auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_ws, n_cols);
+          KernelOp(handle,
+                   kernel,
+                   ws_view,
+                   ws_view,
+                   kernel_tile.data(),
+                   matrix_l2_ws.data(),
+                   matrix_l2_ws.data());
+        } else {
+          KernelOp(handle,
+                   kernel,
+                   x_ws_dense.data(),
+                   n_ws,
+                   n_cols,
+                   x_ws_dense.data(),
+                   n_ws,
+                   kernel_tile.data(),
+                   matrix_l2_ws.data(),
+                   matrix_l2_ws.data());
+        }
      }
    }
    return kernel_tile.data();
@@ -641,24 +699,51 @@ class KernelCache {
    int* ws_idx_new = batch_descriptor.nz_da_idx + n_cached;
    math_t* tile_new = kernel_tile.data() + (size_t)n_cached * batch_size;

-    auto batch_matrix = getMatrixBatch(
-      matrix, batch_size, offset, host_indptr.data(), indptr_batched.data(), stream);
-
-    // compute kernel
-    math_t* norm_with_offset = matrix_l2.data() != nullptr ? matrix_l2.data() + offset : nullptr;
-    if (sparse_extract) {
-      auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_uncached, n_cols);
-      KernelOp(
-        handle, kernel, batch_matrix, ws_view, tile_new, norm_with_offset, matrix_l2_ws.data());
+    if (is_precomputed) {
+      // For precomputed kernels, extract K[offset:offset+batch_size, ws_idx_new]
+      // Input matrix is column-major: K[row, col] = K[row + col * n_rows]
+      // Output tile_new is column-major: tile_new[i, j] = tile_new[i + j * batch_size]
+      if constexpr (isDenseType<MatrixViewType>()) {
+        const math_t* matrix_data = getDenseData(matrix);
+        thrust::counting_iterator<int> iter(0);
+        int n_elems = batch_size * n_uncached;
+        int matrix_rows = n_rows; // Copy member to local for lambda capture
+        thrust::transform(
+          thrust::cuda::par.on(stream),
+          iter,
+          iter + n_elems,
+          tile_new,
+          [matrix_data, ws_idx_new, matrix_rows, offset, batch_size] __device__(int tid) {
+            // Column-major output: tile_new[row, col] = tile_new[row + col * batch_size]
+            int row = tid % batch_size;
+            int col = tid / batch_size;
+            int src_row = offset + row;
+            int src_col = ws_idx_new[col];
+            // Column-major input: K[row, col] = K[row + col * matrix_rows]
+            // 64-bit index to avoid overflow for large precomputed matrices
+            return matrix_data[src_row + static_cast<int64_t>(src_col) * matrix_rows];
+          });
+      }
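+      // Note: only dense input is handled here; a precomputed kernel matrix is
+      // expected to be a dense, column-major n_rows x n_rows matrix.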
    } else {
-      KernelOp(handle,
-               kernel,
-               batch_matrix,
-               x_ws_dense.data(),
-               n_uncached,
-               tile_new,
-               norm_with_offset,
-               matrix_l2_ws.data());
+      auto batch_matrix = getMatrixBatch(
+        matrix, batch_size, offset, host_indptr.data(), indptr_batched.data(), stream);
+
+      // compute kernel
+      math_t* norm_with_offset =
+        matrix_l2.data() != nullptr ? matrix_l2.data() + offset : nullptr;
+      if (sparse_extract) {
+        auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_uncached, n_cols);
+        KernelOp(
+          handle, kernel, batch_matrix, ws_view, tile_new, norm_with_offset, matrix_l2_ws.data());
+      } else {
+        KernelOp(handle,
+                 kernel,
+                 batch_matrix,
+                 x_ws_dense.data(),
+                 n_uncached,
+                 tile_new,
+                 norm_with_offset,
+                 matrix_l2_ws.data());
+      }
    }

    RAFT_CUDA_TRY(cudaPeekAtLastError());
@@ -757,6 +842,7 @@ class KernelCache {

  cuvs::distance::kernels::GramMatrixBase<math_t>* kernel;
  cuvs::distance::kernels::KernelType kernel_type;
+  bool is_precomputed; //!< if true, the matrix is a precomputed kernel matrix

  int n_rows; //!< number of rows in x
  int n_cols; //!< number of columns in x