rapidsai · rapids-bot · Mar 16, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/cpp/cmake/modules/generate_jit_lto_kernels.cmake b/cpp/cmake/modules/generate_jit_lto_kernels.cmake
@@ -104,6 +104,9 @@ function(process_matrix_entry source_list_var)
     EMBEDDED_HEADER_FILE "${embedded_header_file}"
     LINK_LIBRARIES ${_JIT_LTO_KERNEL_LINK_LIBRARIES}
   )
+
+  target_compile_definitions(${kernel_target} PRIVATE "C_SYMBOL_NAME=${kernel_name}")
+
   list(APPEND ${source_list_var} "${embedded_header_file}" "${embedded_file}")
   set(${source_list_var}
       "${${source_list_var}}"

@@ -13,7 +13,10 @@
 struct FragmentEntry;
 
 struct AlgorithmPlanner {
-  AlgorithmPlanner(std::string const& n, std::string const& p) : entrypoint(n + "_" + p) {}
+  AlgorithmPlanner(std::string const& n, std::string const& p)
+    : entrypoint(p.empty() ? n : (n + "_" + p))
+  {
+  }
 
   std::shared_ptr<AlgorithmLauncher> get_launcher();
 

diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentDatabase.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentDatabase.hpp
@@ -22,6 +22,8 @@ class FragmentDatabase {
 
   FragmentEntry* get_fragment(std::string const& key);
 
+  std::unordered_map<std::string, std::unique_ptr<FragmentEntry>> cache;
+
  private:
   FragmentDatabase();
 
@@ -33,8 +35,6 @@ class FragmentDatabase {
                                      std::string const& params,
                                      unsigned char const* blob,
                                      std::size_t size);
-
-  std::unordered_map<std::string, std::unique_ptr<FragmentEntry>> cache;
 };
 
 FragmentDatabase& fragment_database();

@@ -22,6 +22,45 @@ struct tag_acc_ui {};
 // Tag types for index types
 struct tag_idx_l {};
 
+template <typename T>
+struct tag_abbrev;
+template <>
+struct tag_abbrev<tag_f> {
+  static constexpr char const* value = "f";
+};
+template <>
+struct tag_abbrev<tag_h> {
+  static constexpr char const* value = "h";
+};
+template <>
+struct tag_abbrev<tag_sc> {
+  static constexpr char const* value = "sc";
+};
+template <>
+struct tag_abbrev<tag_uc> {
+  static constexpr char const* value = "uc";
+};
+template <>
+struct tag_abbrev<tag_acc_f> {
+  static constexpr char const* value = "f";
+};
+template <>
+struct tag_abbrev<tag_acc_h> {
+  static constexpr char const* value = "h";
+};
+template <>
+struct tag_abbrev<tag_acc_i> {
+  static constexpr char const* value = "i";
+};
+template <>
+struct tag_abbrev<tag_acc_ui> {
+  static constexpr char const* value = "ui";
+};
+template <>
+struct tag_abbrev<tag_idx_l> {
+  static constexpr char const* value = "l";
+};
+
 // Tag types for filter subtypes
 struct tag_filter_bitset_impl {};
 struct tag_filter_none_impl {};

@@ -110,40 +110,8 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_CUDA_TRY(
     cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
 
-  unsigned int kernel_count = 0;
-  RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library));
-
-  // NOTE: cudaKernel_t does not need to be freed explicitly
-  std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]};
-  RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library));
-
-  // Filter out EmptyKernel by checking kernel names using cudaFuncGetName
-  const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv";
-  std::vector<cudaKernel_t> valid_kernels;
-  valid_kernels.reserve(kernel_count);
-
-  for (unsigned int i = 0; i < kernel_count; ++i) {
-    // cudaFuncGetName can be used with cudaKernel_t by casting to void*
-    const void* func_ptr  = reinterpret_cast<const void*>(kernels[i]);
-    const char* func_name = nullptr;
-    RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr));
-
-    bool is_empty_kernel = false;
-    if (func_name != nullptr) {
-      std::string kernel_name(func_name);
-      // Check if this is EmptyKernel
-      if (kernel_name.find(empty_kernel_name) != std::string::npos ||
-          kernel_name == empty_kernel_name) {
-        is_empty_kernel = true;
-      }
-    }
-
-    // Only keep the kernel if it's not EmptyKernel
-    if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }
-  }
-
-  RAFT_EXPECTS(
-    valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size());
+  cudaKernel_t kernel;
+  RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str()));
 
-  return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library);
+  return std::make_shared<AlgorithmLauncher>(kernel, library);
 }
@@ -6,21 +6,13 @@
 // This file is auto-generated. Do not edit manually.
 
 #include <cuvs/detail/jit_lto/RegisterKernelFragment.hpp>
-#include <cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>
 #include "@embedded_header_file@"
 
-using namespace cuvs::neighbors::ivf_flat::detail;
-
 namespace {
 
 __attribute__((__constructor__)) void register_kernel()
 {
-  registerAlgorithm<tag_@type_abbrev@,
-    tag_acc_@acc_abbrev@,
-    tag_idx_@idx_abbrev@>(
-    "interleaved_scan_kernel_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@",
-    embedded_fatbin,
-    sizeof(embedded_fatbin));
+  registerAlgorithm("@kernel_name@", embedded_fatbin, sizeof(embedded_fatbin));
 }
 
 }
@@ -9,10 +9,31 @@
 
 namespace cuvs::neighbors::ivf_flat::detail {
 
-// Instantiate the kernel template
-template __global__ void interleaved_scan_kernel<@capacity@, @veclen@, @ascending_value@, @compute_norm_value@, @data_type@, @acc_type@, @idx_type@>(
-    const uint32_t, const @data_type@*, const uint32_t*, const @data_type@* const*, const uint32_t*,
-    const uint32_t, const uint32_t, const uint32_t, const uint32_t, const uint32_t*, const uint32_t,
-    @idx_type@* const* const, uint32_t*, @idx_type@, @idx_type@, uint32_t*, float*);
+extern "C" {
+__global__ __launch_bounds__(kThreadsPerBlock) void C_SYMBOL_NAME(
+    const uint32_t query_smem_elems,
+    const @data_type@* query,
+    const uint32_t* coarse_index,
+    const @data_type@* const* list_data_ptrs,
+    const uint32_t* list_sizes,
+    const uint32_t queries_offset,
+    const uint32_t n_probes,
+    const uint32_t k,
+    const uint32_t max_samples,
+    const uint32_t* chunk_indices,
+    const uint32_t dim,
+    @idx_type@* const* const inds_ptrs,
+    uint32_t* bitset_ptr,
+    @idx_type@ bitset_len,
+    @idx_type@ original_nbits,
+    uint32_t* neighbors,
+    float* distances)
+{
+  interleaved_scan_kernel_impl<@capacity@, @veclen@, @ascending_value@, @compute_norm_value@, @data_type@, @acc_type@, @idx_type@>(
+      query_smem_elems, query, coarse_index, list_data_ptrs, list_sizes, queries_offset, n_probes,
+      k, max_samples, chunk_indices, dim, inds_ptrs, bitset_ptr, bitset_len, original_nbits,
+      neighbors, distances);
+}
+}  // extern "C"
 
 }  // namespace cuvs::neighbors::ivf_flat::detail
@@ -8,16 +8,21 @@
 #include <cuvs/detail/jit_lto/AlgorithmPlanner.hpp>
 #include <cuvs/detail/jit_lto/FragmentDatabase.hpp>
 #include <cuvs/detail/jit_lto/MakeFragmentKey.hpp>
+#include <cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp>
 #include <iostream>
 #include <string>
 
-template <typename... Args>
+template <typename DataTypeTag, typename AccTypeTag, typename IdxTypeTag>
 struct InterleavedScanPlanner : AlgorithmPlanner {
   InterleavedScanPlanner(int Capacity, int Veclen, bool Ascending, bool ComputeNorm)
-    : AlgorithmPlanner("interleaved_scan_kernel_capacity_" + std::to_string(Capacity) + "_veclen_" +
+    : AlgorithmPlanner("interleaved_scan_capacity_" + std::to_string(Capacity) + "_veclen_" +
                          std::to_string(Veclen) + "_" + (Ascending ? "ascending" : "descending") +
-                         "_" + (ComputeNorm ? "compute_norm" : "no_compute_norm"),
-                       make_fragment_key<Args...>())
+                         "_" + (ComputeNorm ? "compute_norm" : "no_compute_norm") + "_data_" +
+                         cuvs::neighbors::ivf_flat::detail::tag_abbrev<DataTypeTag>::value +
+                         "_acc_" +
+                         cuvs::neighbors::ivf_flat::detail::tag_abbrev<AccTypeTag>::value +
+                         "_idx_" + cuvs::neighbors::ivf_flat::detail::tag_abbrev<IdxTypeTag>::value,
+                       "")
   {
   }
 

@@ -733,55 +733,30 @@ struct flat_block_sort<0, Ascending, T, IdxT>
 template <int Capacity, bool Ascending, typename T, typename IdxT>
 using block_sort_t = typename flat_block_sort<Capacity, Ascending, T, IdxT>::type;
 
-/**
- * Scan clusters for nearest neighbors of the query vectors.
- * See `ivfflat_interleaved_scan` for more information.
- *
- * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp.
- * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is
- * calculated, and the top-k nearest neighbors are selected.
- *
- * @param compute_dist distance function
- * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a
- * block; this number must be a multiple of `WarpSize * Veclen`.
- * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
- * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes]
- * @param[in] list_indices index<T, IdxT>.indices
- * @param[in] list_data index<T, IdxT>.data
- * @param[in] list_sizes index<T, IdxT>.list_sizes
- * @param[in] list_offsets index<T, IdxT>.list_offsets
- * @param n_probes
- * @param k
- * @param dim
- * @param sample_filter
- * @param[out] neighbors
- * @param[out] distances
- */
 template <int Capacity,
           int Veclen,
           bool Ascending,
           bool ComputeNorm,
           typename T,
           typename AccT,
           typename IdxT>
-RAFT_KERNEL __launch_bounds__(kThreadsPerBlock)
-  interleaved_scan_kernel(const uint32_t query_smem_elems,
-                          const T* query,
-                          const uint32_t* coarse_index,
-                          const T* const* list_data_ptrs,
-                          const uint32_t* list_sizes,
-                          const uint32_t queries_offset,
-                          const uint32_t n_probes,
-                          const uint32_t k,
-                          const uint32_t max_samples,
-                          const uint32_t* chunk_indices,
-                          const uint32_t dim,
-                          IdxT* const* const inds_ptrs,
-                          uint32_t* bitset_ptr,
-                          IdxT bitset_len,
-                          IdxT original_nbits,
-                          uint32_t* neighbors,
-                          float* distances)
+__device__ __forceinline__ void interleaved_scan_kernel_impl(const uint32_t query_smem_elems,
+                                                             const T* query,
+                                                             const uint32_t* coarse_index,
+                                                             const T* const* list_data_ptrs,
+                                                             const uint32_t* list_sizes,
+                                                             const uint32_t queries_offset,
+                                                             const uint32_t n_probes,
+                                                             const uint32_t k,
+                                                             const uint32_t max_samples,
+                                                             const uint32_t* chunk_indices,
+                                                             const uint32_t dim,
+                                                             IdxT* const* const inds_ptrs,
+                                                             uint32_t* bitset_ptr,
+                                                             IdxT bitset_len,
+                                                             IdxT original_nbits,
+                                                             uint32_t* neighbors,
+                                                             float* distances)
 {
   extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
   constexpr bool kManageLocalTopK = Capacity > 0;