Skip to content

Commit ee46427

Browse files
committed
Merge release/26.02 into main
2 parents 5be7a4a + a931a3b commit ee46427

37 files changed

Lines changed: 2351 additions & 369 deletions

.github/workflows/build.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ on:
44
push:
55
branches:
66
- "main"
7+
- "release/*"
78
tags:
89
- v[0-9][0-9].[0-9][0-9].[0-9][0-9]
910
workflow_dispatch:

.github/workflows/test.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -159,3 +159,4 @@ jobs:
159159
# Test all CUDA major versions with latest dependencies and respective latest Python version
160160
matrix_filter: map(select(.DEPENDENCIES == "latest")) | group_by(.CUDA_VER|split(".")|.[0]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
161161
sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN
162+
continue-on-error: true

ci/test_wheel_integrations.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ rapids-pip-retry install \
3030
# Step 2: Install BERTopic
3131
rapids-logger "Installing BERTopic"
3232
rapids-pip-retry install bertopic
33+
rapids-pip-retry install requests # TODO remove once sentence-transformers#3617 is fixed
3334

3435
# Test 1: Verify imports
3536
rapids-logger "Testing imports"

cpp/include/cuml/manifold/umap.hpp

Lines changed: 40 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -111,6 +111,8 @@ void init_and_refine(const raft::handle_t& handle,
111111
* @param[out] embeddings: unique_ptr to device_buffer that will be allocated and filled with
112112
* embeddings
113113
* @param[out] graph: pointer to fuzzy simplicial set graph
114+
* @param[out] sigmas: optional output array for per-point sigma values (size n, device memory)
115+
* @param[out] rhos: optional output array for per-point rho values (size n, device memory)
114116
*/
115117
void fit(const raft::handle_t& handle,
116118
float* X,
@@ -121,7 +123,9 @@ void fit(const raft::handle_t& handle,
121123
float* knn_dists,
122124
UMAPParams* params,
123125
std::unique_ptr<rmm::device_buffer>& embeddings,
124-
raft::host_coo_matrix<float, int, int, uint64_t>& graph);
126+
raft::host_coo_matrix<float, int, int, uint64_t>& graph,
127+
float* sigmas = nullptr,
128+
float* rhos = nullptr);
125129

126130
/**
127131
* Sparse fit
@@ -217,5 +221,39 @@ void transform_sparse(const raft::handle_t& handle,
217221
UMAPParams* params,
218222
float* transformed);
219223

224+
/**
225+
* Inverse transform - optimize layout in original space
226+
*
227+
* @param[in] handle: raft::handle_t
228+
* @param[in,out] inv_transformed: pointer to initial inverse-transformed positions (will be
229+
* optimized in-place)
230+
* @param[in] n: number of points to inverse transform
231+
* @param[in] n_features: number of features in original space
232+
* @param[in] orig_X: pointer to original training data
233+
* @param[in] orig_n: number of rows in original training data
234+
* @param[in] graph_rows: row indices of the inverse transform graph (COO format)
235+
* @param[in] graph_cols: column indices of the inverse transform graph (COO format)
236+
* @param[in] graph_vals: edge weights of the inverse transform graph
237+
* @param[in] nnz: number of edges in the graph
238+
* @param[in] sigmas: per-point sigma values from fuzzy simplicial set
239+
* @param[in] rhos: per-point rho values from fuzzy simplicial set
240+
* @param[in] params: pointer to ML::UMAPParams object
241+
* @param[in] n_epochs: number of optimization epochs
242+
*/
243+
void inverse_transform(const raft::handle_t& handle,
244+
float* inv_transformed,
245+
int n,
246+
int n_features,
247+
float* orig_X,
248+
int orig_n,
249+
int* graph_rows,
250+
int* graph_cols,
251+
float* graph_vals,
252+
int nnz,
253+
float* sigmas,
254+
float* rhos,
255+
UMAPParams* params,
256+
int n_epochs);
257+
220258
} // namespace UMAP
221259
} // namespace ML

cpp/include/cuml/matrix/kernel_params.hpp

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -13,14 +13,21 @@ struct KernelParams;
1313

1414
namespace ML::matrix {
1515

16-
enum class KernelType { LINEAR, POLYNOMIAL, RBF, TANH };
16+
enum class KernelType { LINEAR, POLYNOMIAL, RBF, TANH, PRECOMPUTED };
1717

1818
struct KernelParams {
1919
KernelType kernel;
2020
int degree;
2121
double gamma;
2222
double coef0;
2323

24+
/**
25+
* @brief Convert to cuvs KernelParams.
26+
*
27+
* @note For PRECOMPUTED kernels, the returned cuvs params will have kernel_type
28+
* set to LINEAR as a placeholder, since cuvs doesn't have a PRECOMPUTED type.
29+
* The kernel value won't be used in this case.
30+
*/
2431
cuvs::distance::kernels::KernelParams to_cuvs() const;
2532
};
2633

cpp/src/matrix/kernel_params.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -13,7 +13,13 @@ cuvs::distance::kernels::KernelParams KernelParams::to_cuvs() const
1313
{
1414
cuvs::distance::kernels::KernelParams params;
1515

16-
params.kernel = static_cast<cuvs::distance::kernels::KernelType>(this->kernel);
16+
// For precomputed kernels, we use LINEAR as a placeholder since cuvs
17+
// doesn't have PRECOMPUTED. The actual kernel won't be used.
18+
if (this->kernel == KernelType::PRECOMPUTED) {
19+
params.kernel = cuvs::distance::kernels::KernelType::LINEAR;
20+
} else {
21+
params.kernel = static_cast<cuvs::distance::kernels::KernelType>(this->kernel);
22+
}
1723
params.degree = this->degree;
1824
params.gamma = this->gamma;
1925
params.coef0 = this->coef0;

cpp/src/svm/kernelcache.cuh

Lines changed: 134 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -27,7 +27,9 @@
2727
#include <thrust/device_ptr.h>
2828
#include <thrust/execution_policy.h>
2929
#include <thrust/iterator/constant_iterator.h>
30+
#include <thrust/iterator/counting_iterator.h>
3031
#include <thrust/reverse.h>
32+
#include <thrust/transform.h>
3133

3234
#include <cuvs/distance/distance.hpp>
3335
#include <cuvs/distance/grammian.hpp>
@@ -40,6 +42,50 @@ namespace SVM {
4042

4143
namespace { // unnamed namespace to avoid multiple definition error
4244

45+
/**
46+
* @brief Extract columns from a matrix for precomputed kernels
47+
*
48+
* Given a matrix src of shape (n_rows_src, n_cols_src), extract columns
49+
* specified by col_indices and store in dst of shape (n_rows_src, n_cols_dst).
50+
*
51+
* @param [out] dst destination matrix, size [n_rows_src x n_cols_dst]
52+
* @param [in] src source matrix, size [n_rows_src x n_cols_src]
53+
* @param [in] n_rows_src number of rows in source matrix
54+
* @param [in] col_indices column indices to extract, size [n_cols_dst]
55+
* @param [in] n_cols_dst number of columns to extract
56+
*/
57+
template <typename math_t>
58+
CUML_KERNEL void extractColumnsKernel(
59+
math_t* dst, const math_t* src, int n_rows_src, const int* col_indices, int n_cols_dst)
60+
{
61+
int64_t tid = static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * blockDim.x;
62+
int64_t total = static_cast<int64_t>(n_rows_src) * n_cols_dst;
63+
if (tid < total) {
64+
int64_t row = tid % n_rows_src;
65+
int64_t col = tid / n_rows_src;
66+
int src_col = col_indices[col];
67+
// Both source and destination are column-major:
68+
// src[row, col] = src[row + col * n_rows_src]
69+
// dst[row, col] = dst[row + col * n_rows_src] = dst[tid]
70+
dst[tid] = src[row + static_cast<int64_t>(src_col) * n_rows_src];
71+
}
72+
}
73+
74+
template <typename math_t>
75+
void extractColumnsForPrecomputed(math_t* dst,
76+
const math_t* src,
77+
int n_rows_src,
78+
const int* col_indices,
79+
int n_cols_dst,
80+
cudaStream_t stream)
81+
{
82+
int total = n_rows_src * n_cols_dst;
83+
int TPB = 256;
84+
int n_blocks = raft::ceildiv(total, TPB);
85+
extractColumnsKernel<math_t>
86+
<<<n_blocks, TPB, 0, stream>>>(dst, src, n_rows_src, col_indices, n_cols_dst);
87+
}
88+
4389
/**
4490
* @brief Re-raise working set indexes to SVR scope [0..2*n_rows)
4591
*
@@ -322,6 +368,8 @@ class KernelCache {
322368
* @param dense_extract_byte_limit sparse rows will be extracted as dense
323369
* up to this limit to speed up kernel computation. Only valid
324370
* for sparse input. (default 1GB)
371+
* @param is_precomputed if true, the matrix is a precomputed kernel matrix
372+
* and no kernel computation is performed
325373
*/
326374
KernelCache(const raft::handle_t& handle,
327375
MatrixViewType matrix,
@@ -333,7 +381,8 @@ class KernelCache {
333381
float cache_size = 200,
334382
SvmType svmType = C_SVC,
335383
size_t kernel_tile_byte_limit = 1 << 30,
336-
size_t dense_extract_byte_limit = 1 << 30)
384+
size_t dense_extract_byte_limit = 1 << 30,
385+
bool is_precomputed = false)
337386
: batch_cache(n_rows, cache_size, handle.get_stream()),
338387
handle(handle),
339388
kernel(kernel),
@@ -343,6 +392,7 @@ class KernelCache {
343392
n_cols(n_cols),
344393
n_ws(n_ws),
345394
svmType(svmType),
395+
is_precomputed(is_precomputed),
346396
kernel_tile(0, handle.get_stream()),
347397
matrix_l2(0, handle.get_stream()),
348398
matrix_l2_ws(0, handle.get_stream()),
@@ -353,7 +403,7 @@ class KernelCache {
353403
indptr_batched(0, handle.get_stream()),
354404
ws_cache_idx(n_ws * 2, handle.get_stream())
355405
{
356-
ASSERT(kernel != nullptr, "Kernel pointer required for KernelCache!");
406+
ASSERT(kernel != nullptr || is_precomputed, "Kernel pointer required for KernelCache!");
357407
stream = handle.get_stream();
358408

359409
batching_enabled = false;
@@ -386,8 +436,8 @@ class KernelCache {
386436
x_ws_dense.resize(n_ws * static_cast<size_t>(n_cols), stream);
387437
}
388438

389-
// store matrix l2 norm for RBF kernels
390-
if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
439+
// store matrix l2 norm for RBF kernels (not needed for precomputed)
440+
if (!is_precomputed && kernel_type == cuvs::distance::kernels::KernelType::RBF) {
391441
matrix_l2.resize(n_rows, stream);
392442
matrix_l2_ws.resize(n_ws, stream);
393443
ML::SVM::matrixRowNorm(handle, matrix, matrix_l2.data(), raft::linalg::NormType::L2Norm);
@@ -507,33 +557,41 @@ class KernelCache {
507557
ML::SVM::extractRows<math_t>(matrix, x_ws_dense.data(), ws_idx_mod.data(), n_ws, handle);
508558
}
509559

510-
// extract dot array for RBF
511-
if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
512-
selectValueSubset(matrix_l2_ws.data(), matrix_l2.data(), ws_idx_mod.data(), n_ws);
513-
}
560+
if (is_precomputed) {
561+
// For precomputed kernels, x_ws_dense contains K[ws, :] (shape n_ws x n_cols)
562+
// We need to extract columns ws to get K[ws, ws]
563+
// Since n_cols == n_rows for precomputed, we extract columns using ws_idx_mod
564+
extractColumnsForPrecomputed(
565+
kernel_tile.data(), x_ws_dense.data(), n_ws, ws_idx_mod.data(), n_ws, stream);
566+
} else {
567+
// extract dot array for RBF
568+
if (kernel_type == cuvs::distance::kernels::KernelType::RBF) {
569+
selectValueSubset(matrix_l2_ws.data(), matrix_l2.data(), ws_idx_mod.data(), n_ws);
570+
}
514571

515-
// compute kernel
516-
{
517-
if (sparse_extract) {
518-
auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_ws, n_cols);
519-
KernelOp(handle,
520-
kernel,
521-
ws_view,
522-
ws_view,
523-
kernel_tile.data(),
524-
matrix_l2_ws.data(),
525-
matrix_l2_ws.data());
526-
} else {
527-
KernelOp(handle,
528-
kernel,
529-
x_ws_dense.data(),
530-
n_ws,
531-
n_cols,
532-
x_ws_dense.data(),
533-
n_ws,
534-
kernel_tile.data(),
535-
matrix_l2_ws.data(),
536-
matrix_l2_ws.data());
572+
// compute kernel
573+
{
574+
if (sparse_extract) {
575+
auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_ws, n_cols);
576+
KernelOp(handle,
577+
kernel,
578+
ws_view,
579+
ws_view,
580+
kernel_tile.data(),
581+
matrix_l2_ws.data(),
582+
matrix_l2_ws.data());
583+
} else {
584+
KernelOp(handle,
585+
kernel,
586+
x_ws_dense.data(),
587+
n_ws,
588+
n_cols,
589+
x_ws_dense.data(),
590+
n_ws,
591+
kernel_tile.data(),
592+
matrix_l2_ws.data(),
593+
matrix_l2_ws.data());
594+
}
537595
}
538596
}
539597
return kernel_tile.data();
@@ -641,24 +699,51 @@ class KernelCache {
641699
int* ws_idx_new = batch_descriptor.nz_da_idx + n_cached;
642700
math_t* tile_new = kernel_tile.data() + (size_t)n_cached * batch_size;
643701

644-
auto batch_matrix = getMatrixBatch(
645-
matrix, batch_size, offset, host_indptr.data(), indptr_batched.data(), stream);
646-
647-
// compute kernel
648-
math_t* norm_with_offset = matrix_l2.data() != nullptr ? matrix_l2.data() + offset : nullptr;
649-
if (sparse_extract) {
650-
auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_uncached, n_cols);
651-
KernelOp(
652-
handle, kernel, batch_matrix, ws_view, tile_new, norm_with_offset, matrix_l2_ws.data());
702+
if (is_precomputed) {
703+
// For precomputed kernels, extract K[offset:offset+batch_size, ws_idx_new]
704+
// Input matrix is column-major: K[row, col] = K[row + col * n_rows]
705+
// Output tile_new is column-major: tile_new[i, j] = tile_new[i + j * batch_size]
706+
if constexpr (isDenseType<MatrixViewType>()) {
707+
const math_t* matrix_data = getDenseData(matrix);
708+
thrust::counting_iterator<int> iter(0);
709+
int n_elems = batch_size * n_uncached;
710+
int matrix_rows = n_rows; // Copy member to local for lambda capture
711+
thrust::transform(
712+
thrust::cuda::par.on(stream),
713+
iter,
714+
iter + n_elems,
715+
tile_new,
716+
[matrix_data, ws_idx_new, matrix_rows, offset, batch_size] __device__(int tid) {
717+
// Column-major output: tile_new[row, col] = tile_new[row + col * batch_size]
718+
int row = tid % batch_size;
719+
int col = tid / batch_size;
720+
int src_row = offset + row;
721+
int src_col = ws_idx_new[col];
722+
// Column-major input: K[row, col] = K[row + col * matrix_rows]
723+
return matrix_data[src_row + src_col * matrix_rows];
724+
});
725+
}
653726
} else {
654-
KernelOp(handle,
655-
kernel,
656-
batch_matrix,
657-
x_ws_dense.data(),
658-
n_uncached,
659-
tile_new,
660-
norm_with_offset,
661-
matrix_l2_ws.data());
727+
auto batch_matrix = getMatrixBatch(
728+
matrix, batch_size, offset, host_indptr.data(), indptr_batched.data(), stream);
729+
730+
// compute kernel
731+
math_t* norm_with_offset =
732+
matrix_l2.data() != nullptr ? matrix_l2.data() + offset : nullptr;
733+
if (sparse_extract) {
734+
auto ws_view = getViewWithFixedDimension(*x_ws_csr, n_uncached, n_cols);
735+
KernelOp(
736+
handle, kernel, batch_matrix, ws_view, tile_new, norm_with_offset, matrix_l2_ws.data());
737+
} else {
738+
KernelOp(handle,
739+
kernel,
740+
batch_matrix,
741+
x_ws_dense.data(),
742+
n_uncached,
743+
tile_new,
744+
norm_with_offset,
745+
matrix_l2_ws.data());
746+
}
662747
}
663748

664749
RAFT_CUDA_TRY(cudaPeekAtLastError());
@@ -757,6 +842,7 @@ class KernelCache {
757842

758843
cuvs::distance::kernels::GramMatrixBase<math_t>* kernel;
759844
cuvs::distance::kernels::KernelType kernel_type;
845+
bool is_precomputed; //!< if true, matrix is precomputed kernel
760846

761847
int n_rows; //!< number of rows in x
762848
int n_cols; //!< number of columns in x

0 commit comments

Comments (0)