Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
96af4dd
Set max-dynamic-shared-mem with thread-safety
mythrocks Feb 3, 2026
305cb55
Copyright date. Formatting.
mythrocks Feb 4, 2026
4463666
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 4, 2026
677f6fe
Moved other call to set cudaFuncAttributeMaxDynamicSharedMemorySize.
mythrocks Feb 4, 2026
5e9e410
Moved call-sites in other files.
mythrocks Feb 4, 2026
f55cbd3
Copyright date.
mythrocks Feb 5, 2026
b394b19
Resolved merge conflicts. Moved all call-sites to use the new `optio…
mythrocks Feb 5, 2026
f3efb37
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 6, 2026
8792c40
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 10, 2026
e55eb23
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 10, 2026
2a5a00f
Invoke kernel within critical section.
mythrocks Feb 10, 2026
b1b97cc
Removed old function.
mythrocks Feb 10, 2026
f50d4d4
Tie the kernel to its launcher.
mythrocks Feb 11, 2026
c2b1c43
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 11, 2026
8d66233
Better error reporting.
mythrocks Feb 11, 2026
1acb864
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 12, 2026
1d41373
Remove the safety fix for persistent kernel (only one kernel must run…
achirkin Feb 12, 2026
f6cf7d3
Add a reproducer
achirkin Feb 12, 2026
9b13500
Fix style
achirkin Feb 12, 2026
8d6fb1a
Apply suggestion from @achirkin
mythrocks Feb 13, 2026
1cd8b2c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
9ed3400
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 13, 2026
1955dce
Fixed formatting again.
mythrocks Feb 13, 2026
3ba77d5
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 13, 2026
5a8771c
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 16, 2026
45dff11
Apply suggestion from @achirkin
achirkin Feb 18, 2026
8c107e1
Merge branch 'main' into cuda-invalid-argument-kernel-error
achirkin Feb 18, 2026
a6f3071
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 19, 2026
d3f6ecd
Fixed format string.
mythrocks Feb 19, 2026
088fcc8
Fixed thrust header.
mythrocks Feb 19, 2026
856e695
Fix compile error for thrust make_counting_iterator
mythrocks Feb 19, 2026
eff47c8
Revert "Fix compile error for thrust make_counting_iterator"
mythrocks Feb 20, 2026
b59b8c3
Cherrypick fix from #1825.
mythrocks Feb 20, 2026
3581c01
Merge remote-tracking branch 'origin/main' into cuda-invalid-argument…
mythrocks Feb 20, 2026
80825ab
Merge branch 'main' into cuda-invalid-argument-kernel-error
mythrocks Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/cudart_utils.hpp> // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
Expand Down Expand Up @@ -589,8 +590,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
}

RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
// Initialize hash table
const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
set_value_batch(traversed_hashmap_ptr,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/integer_utils.hpp>
Expand Down Expand Up @@ -1980,8 +1981,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b
auto* dd_dev_ptr = dd_host.dev_ptr(stream);

// set kernel attributes same as in normal kernel
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);

// set kernel launch parameters
Comment thread
achirkin marked this conversation as resolved.
dim3 gs = calc_coop_grid_size(block_size, smem_size, persistent_device_usage);
Expand Down Expand Up @@ -2312,8 +2312,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
dim3 thread_dims(block_size, 1, 1);
dim3 block_dims(1, num_queries, 1);
RAFT_LOG_DEBUG(
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/util/cuda_rt_essentials.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>

namespace cuvs::neighbors::detail {

/**
* @brief Optionally set the larger max dynamic shared memory size for the kernel.
* This is required because `cudaFuncSetAttribute` is not thread-safe.
* In the event of concurrent calls, we'd like to accommodate the largest requested size.
* @tparam KernelT The type of the kernel.
* @param smem_size The size of the dynamic shared memory to be set.
* @param kernel The kernel to be set.
*/
/**
 * @brief Raise the kernel's max dynamic shared memory attribute if `smem_size`
 * exceeds the largest value previously requested through this function.
 *
 * `cudaFuncSetAttribute` is not safe to call concurrently for the same kernel,
 * and concurrent callers may request different sizes; this function serializes
 * the calls and only ever grows the attribute, so the largest requested size
 * wins.
 *
 * @tparam KernelT The type of the kernel.
 * @param smem_size The dynamic shared memory size (bytes) the caller needs.
 * @param kernel The kernel whose attribute is to be set.
 */
template <typename KernelT>
void optionally_set_larger_max_smem_size(uint32_t smem_size, KernelT& kernel)
{
  // Function-template statics: one mutex/max pair per KernelT instantiation.
  // NOTE(review): if several distinct kernels share the same KernelT (e.g. a
  // raw function-pointer type), they also share this cached maximum, and a
  // kernel whose request is below the cached value would never have its own
  // attribute set — confirm each kernel maps to a unique KernelT.
  static std::mutex mutex;
  // Atomic, so the unlocked fast-path read below is not a data race: the
  // original double-checked read of a plain uint32_t was UB under concurrent
  // stores (thread-safe *initialization* of a static does not cover later
  // unsynchronized accesses).
  static std::atomic<uint32_t> running_max_smem_size{0};

  // Fast path: skip the lock when the attribute is already large enough.
  if (smem_size > running_max_smem_size.load(std::memory_order_acquire)) {
    std::lock_guard<std::mutex> guard{mutex};
    // Re-check under the lock: another thread may have raised it meanwhile.
    if (smem_size > running_max_smem_size.load(std::memory_order_relaxed)) {
      RAFT_CUDA_TRY(
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
      // Publish only after the attribute is successfully set; storing first
      // (as the original did) would leave the cached maximum inflated if
      // RAFT_CUDA_TRY throws, causing later retries to be skipped.
      running_max_smem_size.store(smem_size, std::memory_order_release);
    }
  }
}

} // namespace cuvs::neighbors::detail
Comment thread
mythrocks marked this conversation as resolved.
Loading