Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,17 @@
#include <memory>

struct AlgorithmLauncher {
AlgorithmLauncher() = default;
AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}

AlgorithmLauncher(cudaKernel_t k);
AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);

~AlgorithmLauncher();

AlgorithmLauncher(const AlgorithmLauncher&) = delete;
AlgorithmLauncher& operator=(const AlgorithmLauncher&) = delete;

AlgorithmLauncher(AlgorithmLauncher&& other) noexcept;
AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;

template <typename... Args>
void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
Expand All @@ -31,6 +39,7 @@ struct AlgorithmLauncher {
private:
void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
cudaKernel_t kernel;
cudaLibrary_t library;
};

std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();
5 changes: 4 additions & 1 deletion cpp/src/cluster/detail/kmeans_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cuda.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
Expand Down
24 changes: 23 additions & 1 deletion cpp/src/detail/jit_lto/AlgorithmLauncher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,29 @@

#include <raft/util/cuda_rt_essentials.hpp>

AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k) : kernel{k} {}
AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}

AlgorithmLauncher::~AlgorithmLauncher() { RAFT_CUDA_TRY(cudaLibraryUnload(library)); }
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
AlgorithmLauncher::~AlgorithmLauncher() { RAFT_CUDA_TRY(cudaLibraryUnload(library)); }
AlgorithmLauncher::~AlgorithmLauncher() {
if (library != nullptr) {
(void)cudaLibraryUnload(library); // ignore errors in destructor
}
}

This might be one of those places where I'm rusty about C++ destructors, isn't throwing from destructors during stack unwinding undefined behavior?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ugh yeah. I'm surprised the compiler did not complain.


AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
: kernel{other.kernel}, library{other.library}
{
other.kernel = nullptr;
other.library = nullptr;
}

AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
{
if (this != &other) {
// Unload current library if it exists
if (library != nullptr) { cudaLibraryUnload(library); }
kernel = other.kernel;
library = other.library;
other.kernel = nullptr;
other.library = nullptr;
}
return *this;
}

void AlgorithmLauncher::call(
cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)
Expand Down
39 changes: 34 additions & 5 deletions cpp/src/detail/jit_lto/AlgorithmPlanner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,44 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");

// cubin is linked, so now load it
// NOTE: cudaLibrary_t does not need to be freed explicitly
cudaLibrary_t library;
RAFT_CUDA_TRY(
cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));

constexpr unsigned int count = 1;
unsigned int kernel_count = 0;
RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library));

// NOTE: cudaKernel_t does not need to be freed explicitly
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[count]};
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), count, library));
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]};
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library));

// Filter out EmptyKernel by checking kernel names using cudaFuncGetName
const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv";
std::vector<cudaKernel_t> valid_kernels;
valid_kernels.reserve(kernel_count);

for (unsigned int i = 0; i < kernel_count; ++i) {
// cudaFuncGetName can be used with cudaKernel_t by casting to void*
const void* func_ptr = reinterpret_cast<const void*>(kernels[i]);
const char* func_name = nullptr;
RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr));

bool is_empty_kernel = false;
if (func_name != nullptr) {
std::string kernel_name(func_name);
// Check if this is EmptyKernel
if (kernel_name.find(empty_kernel_name) != std::string::npos ||
kernel_name == empty_kernel_name) {
is_empty_kernel = true;
}
}

// Only keep the kernel if it's not EmptyKernel
if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); }
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); }
if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }

In the first iteration (e.g. i == 0) this transfers ownership and sets kernels to nullptr. On the next iteration, kernels.release() returns nullptr, and nullptr[i] is undefined behavior. The unique_ptr can keep owning the array until the function returns; you only need to copy the cudaKernel_t handles into valid_kernels, no?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, probably why the tests are failing.

}

RAFT_EXPECTS(
valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size());

return std::make_shared<AlgorithmLauncher>(kernels.release()[0]);
return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library);
}
2 changes: 1 addition & 1 deletion cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <cub/block/block_load.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_store.cuh>
#include <cub/cub.cuh>
#include <cub/warp/warp_reduce.cuh>

namespace cuvs {
namespace distance {
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/distance/detail/sparse/utils.cuh
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <raft/core/math.hpp>

#include <cub/cub.cuh>
#include <cub/warp/warp_reduce.cuh>
#include <cuda_fp16.h>
#include <cuda_pipeline.h>

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/distance/fused_distance_nn-inl.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -14,7 +14,7 @@
#include <raft/linalg/contractions.cuh>
#include <raft/util/cuda_utils.cuh>

#include <cub/cub.cuh>
#include <cub/util_type.cuh>

#include <stdint.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "../utils.hpp"
#include "topk.h"

#include <cub/cub.cuh>
#include <cub/block/block_scan.cuh>

#include <raft/core/detail/macros.hpp>
#include <raft/core/error.hpp>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/neighbors/detail/fused_l2_knn.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand All @@ -8,7 +8,7 @@
#include <raft/linalg/norm.cuh>
#include <raft/util/cuda_utils.cuh>

#include <cub/cub.cuh>
#include <cub/util_type.cuh>

#include <limits>

Expand Down
3 changes: 1 addition & 2 deletions cpp/src/neighbors/detail/knn_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -9,7 +9,6 @@

#include <cuvs/distance/distance.hpp>

#include <cub/cub.cuh>
#include <cuda_fp16.h>
#include <cuda_pipeline.h>

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/neighbors/detail/vamana/greedy_search.cuh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <cub/cub.cuh>
#include <cub/block/block_merge_sort.cuh>

#include "macros.cuh"
#include "priority_queue.cuh"
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/neighbors/detail/vamana/robust_prune.cuh
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <cub/cub.cuh>
#include <thrust/sort.h>

#include <raft/util/cuda_dev_essentials.cuh>
Expand Down
21 changes: 20 additions & 1 deletion cpp/src/neighbors/ivf_common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
#include <raft/util/cudart_utils.hpp>
#include <raft/util/pow2_utils.cuh>

#include <cub/cub.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/device/device_radix_sort.cuh>

namespace cuvs::neighbors::ivf::detail {

Expand Down Expand Up @@ -72,4 +73,22 @@ void calc_chunk_indices::configured::operator()(const uint32_t* cluster_sizes,
RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream));
}

// Helper function to sort cluster sizes using CUB, extracted from template to avoid
// including cub/device/* in the header file
void sort_cluster_sizes_descending(uint32_t* input,
uint32_t* output,
uint32_t n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* tmp_res)
{
int begin_bit = 0;
int end_bit = sizeof(uint32_t) * 8;
size_t cub_workspace_size = 0;
cub::DeviceRadixSort::SortKeysDescending(
nullptr, cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
cub::DeviceRadixSort::SortKeysDescending(
cub_workspace.data(), cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
}

} // namespace cuvs::neighbors::ivf::detail
32 changes: 10 additions & 22 deletions cpp/src/neighbors/ivf_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/detail/select_warpsort.cuh> // matrix::detail::select::warpsort::warp_sort_distributed

#include <cub/cub.cuh>

namespace cuvs::neighbors::ivf::detail {

// Forward declaration of helper function to avoid including cub/device/* in header
void sort_cluster_sizes_descending(uint32_t* input,
uint32_t* output,
uint32_t n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* tmp_res);

/**
* Default value returned by `search` when the `n_probes` is too small and top-k is too large.
* One may encounter it if the combined size of probed clusters is smaller than the requested
Expand Down Expand Up @@ -247,26 +252,9 @@ void recompute_internal_state(const raft::resources& res, Index& index)
}

// Sort the cluster sizes in the descending order.
int begin_bit = 0;
int end_bit = sizeof(uint32_t) * 8;
size_t cub_workspace_size = 0;
cub::DeviceRadixSort::SortKeysDescending(nullptr,
cub_workspace_size,
index.list_sizes().data_handle(),
sorted_sizes.data(),
index.n_lists(),
begin_bit,
end_bit,
stream);
rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
cub::DeviceRadixSort::SortKeysDescending(cub_workspace.data(),
cub_workspace_size,
index.list_sizes().data_handle(),
sorted_sizes.data(),
index.n_lists(),
begin_bit,
end_bit,
stream);
// Use helper function to avoid including cub/device/* in this header
sort_cluster_sizes_descending(
index.list_sizes().data_handle(), sorted_sizes.data(), index.n_lists(), stream, tmp_res);
// copy the results to CPU
std::vector<uint32_t> sorted_sizes_host(index.n_lists());
raft::copy(sorted_sizes_host.data(), sorted_sizes.data(), index.n_lists(), stream);
Expand Down
1 change: 0 additions & 1 deletion cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/per_device_resource.hpp>

#include <cub/cub.cuh>
#include <cuda_fp16.h>

namespace cuvs::neighbors::ivf_pq::detail {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/per_device_resource.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cuda_fp16.h>

#include <optional>
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/neighbors/scann/detail/scann_avq.cuh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/error.hpp>
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/sparse/neighbors/detail/cross_component_nn.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand All @@ -26,7 +26,6 @@

#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/gather.h>
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/stats/detail/silhouette_score.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -17,7 +17,7 @@

#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>
#include <cub/device/device_histogram.cuh>

#include <math.h>

Expand Down
4 changes: 3 additions & 1 deletion cpp/tests/cluster/linkage.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -13,6 +13,8 @@
#include <raft/sparse/coo.hpp>
#include <raft/util/cudart_utils.hpp>

#include <cub/block/block_reduce.cuh>

#include <rmm/device_uvector.hpp>

#include <gtest/gtest.h>
Expand Down
2 changes: 0 additions & 2 deletions cpp/tests/sparse/neighbors/cross_component_nn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@

#include <rmm/device_uvector.hpp>

#include <cub/cub.cuh>

#include <gtest/gtest.h>

#include <vector>
Expand Down
Loading