-
Notifications
You must be signed in to change notification settings - Fork 184
Use Specific CCCL Includes #1806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
c419173
04cc166
66ccedf
f20abac
0c3b589
e4c87b5
949700d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -106,15 +106,44 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build() | |||||
| RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed"); | ||||||
|
|
||||||
| // cubin is linked, so now load it | ||||||
| // NOTE: cudaLibrary_t does not need to be freed explicitly | ||||||
| cudaLibrary_t library; | ||||||
| RAFT_CUDA_TRY( | ||||||
| cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0)); | ||||||
|
|
||||||
| constexpr unsigned int count = 1; | ||||||
| unsigned int kernel_count = 0; | ||||||
| RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library)); | ||||||
|
|
||||||
| // NOTE: cudaKernel_t does not need to be freed explicitly | ||||||
| std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[count]}; | ||||||
| RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), count, library)); | ||||||
| std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]}; | ||||||
| RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library)); | ||||||
|
|
||||||
| // Filter out EmptyKernel by checking kernel names using cudaFuncGetName | ||||||
| const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv"; | ||||||
| std::vector<cudaKernel_t> valid_kernels; | ||||||
| valid_kernels.reserve(kernel_count); | ||||||
|
|
||||||
| for (unsigned int i = 0; i < kernel_count; ++i) { | ||||||
| // cudaFuncGetName can be used with cudaKernel_t by casting to void* | ||||||
| const void* func_ptr = reinterpret_cast<const void*>(kernels[i]); | ||||||
| const char* func_name = nullptr; | ||||||
| RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr)); | ||||||
|
|
||||||
| bool is_empty_kernel = false; | ||||||
| if (func_name != nullptr) { | ||||||
| std::string kernel_name(func_name); | ||||||
| // Check if this is EmptyKernel | ||||||
| if (kernel_name.find(empty_kernel_name) != std::string::npos || | ||||||
| kernel_name == empty_kernel_name) { | ||||||
| is_empty_kernel = true; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Only keep the kernel if it's not EmptyKernel | ||||||
| if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); } | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
In the first iteration (e.g. i == 0) this transfers ownership and sets kernels to nullptr. On the next iteration, kernels.release() returns nullptr, and nullptr[i] is undefined behavior. The unique_ptr can keep owning the array until the function returns; you only need to copy the cudaKernel_t handles into valid_kernels, no?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, probably why the tests are failing. |
||||||
| } | ||||||
|
|
||||||
| RAFT_EXPECTS( | ||||||
| valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size()); | ||||||
|
|
||||||
| return std::make_shared<AlgorithmLauncher>(kernels.release()[0]); | ||||||
| return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library); | ||||||
| } | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,8 +29,6 @@ | |
|
|
||
| #include <rmm/device_uvector.hpp> | ||
|
|
||
| #include <cub/cub.cuh> | ||
|
|
||
| #include <gtest/gtest.h> | ||
|
|
||
| #include <vector> | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This might be one of those places where I'm rusty about C++ destructors, isn't throwing from destructors during stack unwinding undefined behavior?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ugh yeah. I'm surprised the compiler did not complain.