Skip to content

Commit c18ccfa

Browse files
authored
Use Specific CCCL Includes (#1806)
This is needed downstream because cub injects an EmptyKernel symbol into every TU that includes either cub/cub.cuh or cub/device/* includes, and this causes an issue for cubins created using JIT-LTO. This PR also fixes 2 bugs in JIT kernels: 1. Filters out `cub::EmptyKernel` from JIT TUs specifically because they only expect 1 kernel but are finding 2 instead 2. Keeps a reference to `cudaLibrary_t` that loads the JIT kernel and unloading it at destruction Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: #1806
1 parent 8fab5b3 commit c18ccfa

21 files changed

Lines changed: 128 additions & 57 deletions

File tree

cpp/include/cuvs/detail/jit_lto/AlgorithmLauncher.hpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,17 @@
1515
#include <memory>
1616

1717
struct AlgorithmLauncher {
18-
AlgorithmLauncher() = default;
18+
AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}
1919

20-
AlgorithmLauncher(cudaKernel_t k);
20+
AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);
21+
22+
~AlgorithmLauncher();
23+
24+
AlgorithmLauncher(const AlgorithmLauncher&) = delete;
25+
AlgorithmLauncher& operator=(const AlgorithmLauncher&) = delete;
26+
27+
AlgorithmLauncher(AlgorithmLauncher&& other) noexcept;
28+
AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;
2129

2230
template <typename... Args>
2331
void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
@@ -31,6 +39,7 @@ struct AlgorithmLauncher {
3139
private:
3240
void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
3341
cudaKernel_t kernel;
42+
cudaLibrary_t library;
3443
};
3544

3645
std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();

cpp/src/cluster/detail/kmeans_common.cuh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,10 @@
3131
#include <rmm/device_scalar.hpp>
3232
#include <rmm/device_uvector.hpp>
3333

34-
#include <cub/cub.cuh>
34+
#include <cub/device/device_histogram.cuh>
35+
#include <cub/device/device_reduce.cuh>
36+
#include <cub/device/device_select.cuh>
37+
#include <cub/iterator/arg_index_input_iterator.cuh>
3538
#include <cuda.h>
3639
#include <thrust/fill.h>
3740
#include <thrust/for_each.h>

cpp/src/detail/jit_lto/AlgorithmLauncher.cu

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,32 @@
77

88
#include <raft/util/cuda_rt_essentials.hpp>
99

10-
AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k) : kernel{k} {}
10+
AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}
11+
12+
AlgorithmLauncher::~AlgorithmLauncher()
13+
{
14+
if (library != nullptr) { (void)cudaLibraryUnload(library); }
15+
}
16+
17+
AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
18+
: kernel{other.kernel}, library{other.library}
19+
{
20+
other.kernel = nullptr;
21+
other.library = nullptr;
22+
}
23+
24+
AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
25+
{
26+
if (this != &other) {
27+
// Unload current library if it exists
28+
if (library != nullptr) { cudaLibraryUnload(library); }
29+
kernel = other.kernel;
30+
library = other.library;
31+
other.kernel = nullptr;
32+
other.library = nullptr;
33+
}
34+
return *this;
35+
}
1136

1237
void AlgorithmLauncher::call(
1338
cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)

cpp/src/detail/jit_lto/AlgorithmPlanner.cu

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,15 +106,44 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
106106
RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");
107107

108108
// cubin is linked, so now load it
109-
// NOTE: cudaLibrary_t does not need to be freed explicitly
110109
cudaLibrary_t library;
111110
RAFT_CUDA_TRY(
112111
cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
113112

114-
constexpr unsigned int count = 1;
113+
unsigned int kernel_count = 0;
114+
RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library));
115+
115116
// NOTE: cudaKernel_t does not need to be freed explicitly
116-
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[count]};
117-
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), count, library));
117+
std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]};
118+
RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library));
119+
120+
// Filter out EmptyKernel by checking kernel names using cudaFuncGetName
121+
const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv";
122+
std::vector<cudaKernel_t> valid_kernels;
123+
valid_kernels.reserve(kernel_count);
124+
125+
for (unsigned int i = 0; i < kernel_count; ++i) {
126+
// cudaFuncGetName can be used with cudaKernel_t by casting to void*
127+
const void* func_ptr = reinterpret_cast<const void*>(kernels[i]);
128+
const char* func_name = nullptr;
129+
RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr));
130+
131+
bool is_empty_kernel = false;
132+
if (func_name != nullptr) {
133+
std::string kernel_name(func_name);
134+
// Check if this is EmptyKernel
135+
if (kernel_name.find(empty_kernel_name) != std::string::npos ||
136+
kernel_name == empty_kernel_name) {
137+
is_empty_kernel = true;
138+
}
139+
}
140+
141+
// Only keep the kernel if it's not EmptyKernel
142+
if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }
143+
}
144+
145+
RAFT_EXPECTS(
146+
valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size());
118147

119-
return std::make_shared<AlgorithmLauncher>(kernels.release()[0]);
148+
return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library);
120149
}

cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#include <cub/block/block_load.cuh>
1212
#include <cub/block/block_radix_sort.cuh>
1313
#include <cub/block/block_store.cuh>
14-
#include <cub/cub.cuh>
14+
#include <cub/warp/warp_reduce.cuh>
1515

1616
namespace cuvs {
1717
namespace distance {

cpp/src/distance/detail/sparse/utils.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

66
#pragma once
77

88
#include <raft/core/math.hpp>
99

10-
#include <cub/cub.cuh>
10+
#include <cub/warp/warp_reduce.cuh>
1111
#include <cuda_fp16.h>
1212
#include <cuda_pipeline.h>
1313

cpp/src/distance/fused_distance_nn-inl.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -14,7 +14,7 @@
1414
#include <raft/linalg/contractions.cuh>
1515
#include <raft/util/cuda_utils.cuh>
1616

17-
#include <cub/cub.cuh>
17+
#include <cub/util_type.cuh>
1818

1919
#include <stdint.h>
2020

cpp/src/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "../utils.hpp"
88
#include "topk.h"
99

10-
#include <cub/cub.cuh>
10+
#include <cub/block/block_scan.cuh>
1111

1212
#include <raft/core/detail/macros.hpp>
1313
#include <raft/core/error.hpp>

cpp/src/neighbors/detail/fused_l2_knn.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#pragma once
@@ -8,7 +8,7 @@
88
#include <raft/linalg/norm.cuh>
99
#include <raft/util/cuda_utils.cuh>
1010

11-
#include <cub/cub.cuh>
11+
#include <cub/util_type.cuh>
1212

1313
#include <limits>
1414

cpp/src/neighbors/detail/knn_utils.cuh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -9,7 +9,6 @@
99

1010
#include <cuvs/distance/distance.hpp>
1111

12-
#include <cub/cub.cuh>
1312
#include <cuda_fp16.h>
1413
#include <cuda_pipeline.h>
1514

0 commit comments

Comments
 (0)