Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,17 @@
#include <memory>

struct AlgorithmLauncher {
AlgorithmLauncher() = default;
AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}

AlgorithmLauncher(cudaKernel_t k);
AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);

~AlgorithmLauncher();

AlgorithmLauncher(const AlgorithmLauncher&) = delete;
AlgorithmLauncher& operator=(const AlgorithmLauncher&) = delete;

AlgorithmLauncher(AlgorithmLauncher&& other) noexcept;
AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;

template <typename... Args>
void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
Expand All @@ -31,6 +39,7 @@ struct AlgorithmLauncher {
private:
void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
cudaKernel_t kernel;
cudaLibrary_t library;
};

std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();
5 changes: 4 additions & 1 deletion cpp/src/cluster/detail/kmeans_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cuda.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
Expand Down
27 changes: 26 additions & 1 deletion cpp/src/detail/jit_lto/AlgorithmLauncher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,32 @@

#include <raft/util/cuda_rt_essentials.hpp>

AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k) : kernel{k} {}
AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}

AlgorithmLauncher::~AlgorithmLauncher()
{
if (library != nullptr) { (void)cudaLibraryUnload(library); }
}

AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
: kernel{other.kernel}, library{other.library}
{
other.kernel = nullptr;
other.library = nullptr;
}

AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
{
if (this != &other) {
// Unload current library if it exists
if (library != nullptr) { cudaLibraryUnload(library); }
kernel = other.kernel;
library = other.library;
other.kernel = nullptr;
other.library = nullptr;
}
return *this;
}

void AlgorithmLauncher::call(
cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
Expand Down
39 changes: 34 additions & 5 deletions cpp/src/detail/jit_lto/AlgorithmPlanner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,44 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");

// cubin is linked, so now load it
// NOTE: cudaLibrary_t does not need to be freed explicitly
cudaLibrary_t library;
RAFT_CUDA_TRY(
cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));

constexpr unsigned int count = 1;
unsigned int kernel_count = 0;
RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library));

// NOTE: cudaKernel_t does not need to be freed explicitly
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[count]};
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), count, library));
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]};
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library));

// Filter out EmptyKernel by checking kernel names using cudaFuncGetName
const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv";
std::vector<cudaKernel_t> valid_kernels;
valid_kernels.reserve(kernel_count);

for (unsigned int i = 0; i < kernel_count; ++i) {
// cudaFuncGetName can be used with cudaKernel_t by casting to void*
const void* func_ptr = reinterpret_cast<const void*>(kernels[i]);
const char* func_name = nullptr;
RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr));

bool is_empty_kernel = false;
if (func_name != nullptr) {
std::string kernel_name(func_name);
// Check if this is EmptyKernel
if (kernel_name.find(empty_kernel_name) != std::string::npos ||
kernel_name == empty_kernel_name) {
is_empty_kernel = true;
}
}

// Only keep the kernel if it's not EmptyKernel
if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }
}

RAFT_EXPECTS(
valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size());

return std::make_shared<AlgorithmLauncher>(kernels.release()[0]);
return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library);
}
2 changes: 1 addition & 1 deletion cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cub/warp/warp_reduce.cuh>

namespace cuvs {
namespace distance {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/distance/detail/sparse/utils.cuh
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <raft/core/math.hpp>

#include <cub/cub.cuh>
#include <cub/warp/warp_reduce.cuh>
#include <cuda_fp16.h>
#include <cuda_pipeline.h>

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/distance/fused_distance_nn-inl.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -14,7 +14,7 @@
#include <raft/linalg/contractions.cuh>
#include <raft/util/cuda_utils.cuh>

#include <cub/cub.cuh>
#include <cub/util_type.cuh>

#include <stdint.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "../utils.hpp"
#include "topk.h"

#include <cub/cub.cuh>
#include <cub/block/block_scan.cuh>

#include <raft/core/detail/macros.hpp>
#include <raft/core/error.hpp>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/neighbors/detail/fused_l2_knn.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand All @@ -8,7 +8,7 @@
#include <raft/linalg/norm.cuh>
#include <raft/util/cuda_utils.cuh>

#include <cub/cub.cuh>
#include <cub/util_type.cuh>

#include <limits>

Expand Down
3 changes: 1 addition & 2 deletions cpp/src/neighbors/detail/knn_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -9,7 +9,6 @@

#include <cuvs/distance/distance.hpp>

#include <cub/cub.cuh>
#include <cuda_fp16.h>
#include <cuda_pipeline.h>

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/neighbors/detail/vamana/greedy_search.cuh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <cub/cub.cuh>
#include <cub/block/block_merge_sort.cuh>

#include "macros.cuh"
#include "priority_queue.cuh"
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/neighbors/detail/vamana/robust_prune.cuh
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <cub/cub.cuh>
#include <thrust/sort.h>

#include <raft/util/cuda_dev_essentials.cuh>
Expand Down
21 changes: 20 additions & 1 deletion cpp/src/neighbors/ivf_common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
#include <raft/util/cudart_utils.hpp>
#include <raft/util/pow2_utils.cuh>

#include <cub/cub.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/device/device_radix_sort.cuh>

namespace cuvs::neighbors::ivf::detail {

Expand Down Expand Up @@ -72,4 +73,22 @@ void calc_chunk_indices::configured::operator()(const uint32_t* cluster_sizes,
RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream));
}

// Helper function to sort cluster sizes using CUB, extracted from template to avoid
// including cub/device/* in the header file
void sort_cluster_sizes_descending(uint32_t* input,
uint32_t* output,
uint32_t n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* tmp_res)
{
int begin_bit = 0;
int end_bit = sizeof(uint32_t) * 8;
size_t cub_workspace_size = 0;
cub::DeviceRadixSort::SortKeysDescending(
nullptr, cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
cub::DeviceRadixSort::SortKeysDescending(
cub_workspace.data(), cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
}

} // namespace cuvs::neighbors::ivf::detail
32 changes: 10 additions & 22 deletions cpp/src/neighbors/ivf_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/detail/select_warpsort.cuh> // matrix::detail::select::warpsort::warp_sort_distributed

#include <cub/cub.cuh>

namespace cuvs::neighbors::ivf::detail {

// Forward declaration of helper function to avoid including cub/device/* in header
void sort_cluster_sizes_descending(uint32_t* input,
uint32_t* output,
uint32_t n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* tmp_res);

/**
* Default value returned by `search` when the `n_probes` is too small and top-k is too large.
* One may encounter it if the combined size of probed clusters is smaller than the requested
Expand Down Expand Up @@ -247,26 +252,9 @@ void recompute_internal_state(const raft::resources& res, Index& index)
}

// Sort the cluster sizes in the descending order.
int begin_bit = 0;
int end_bit = sizeof(uint32_t) * 8;
size_t cub_workspace_size = 0;
cub::DeviceRadixSort::SortKeysDescending(nullptr,
cub_workspace_size,
index.list_sizes().data_handle(),
sorted_sizes.data(),
index.n_lists(),
begin_bit,
end_bit,
stream);
rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
cub::DeviceRadixSort::SortKeysDescending(cub_workspace.data(),
cub_workspace_size,
index.list_sizes().data_handle(),
sorted_sizes.data(),
index.n_lists(),
begin_bit,
end_bit,
stream);
// Use helper function to avoid including cub/device/* in this header
sort_cluster_sizes_descending(
index.list_sizes().data_handle(), sorted_sizes.data(), index.n_lists(), stream, tmp_res);
// copy the results to CPU
std::vector<uint32_t> sorted_sizes_host(index.n_lists());
raft::copy(sorted_sizes_host.data(), sorted_sizes.data(), index.n_lists(), stream);
Expand Down
1 change: 0 additions & 1 deletion cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/per_device_resource.hpp>

#include <cub/cub.cuh>
#include <cuda_fp16.h>

namespace cuvs::neighbors::ivf_pq::detail {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/per_device_resource.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cuda_fp16.h>

#include <optional>
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/neighbors/scann/detail/scann_avq.cuh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/error.hpp>
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/sparse/neighbors/detail/cross_component_nn.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand All @@ -26,7 +26,6 @@

#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/gather.h>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/stats/detail/silhouette_score.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -17,7 +17,7 @@

#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>

#include <math.h>

Expand Down
4 changes: 3 additions & 1 deletion cpp/tests/cluster/linkage.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -13,6 +13,8 @@
#include <raft/sparse/coo.hpp>
#include <raft/util/cudart_utils.hpp>

#include <cub/block/block_reduce.cuh>

#include <rmm/device_uvector.hpp>

#include <gtest/gtest.h>
Expand Down
2 changes: 0 additions & 2 deletions cpp/tests/sparse/neighbors/cross_component_nn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@

#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>

#include <gtest/gtest.h>

#include <vector>
Expand Down