rapidsai · rapids-bot · Feb 18, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 17, 2026
@@ -15,9 +15,17 @@
 #include <memory>
 
 struct AlgorithmLauncher {
-  AlgorithmLauncher() = default;
+  AlgorithmLauncher() : kernel{nullptr}, library{nullptr} {}
 
-  AlgorithmLauncher(cudaKernel_t k);
+  AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib);
+
+  ~AlgorithmLauncher();
+
+  AlgorithmLauncher(const AlgorithmLauncher&)            = delete;
+  AlgorithmLauncher& operator=(const AlgorithmLauncher&) = delete;
+
+  AlgorithmLauncher(AlgorithmLauncher&& other) noexcept;
+  AlgorithmLauncher& operator=(AlgorithmLauncher&& other) noexcept;
 
   template <typename... Args>
   void dispatch(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, Args&&... args)
@@ -31,6 +39,7 @@ struct AlgorithmLauncher {
  private:
   void call(cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** args);
   cudaKernel_t kernel;
+  cudaLibrary_t library;
 };
 
 std::unordered_map<std::string, std::shared_ptr<AlgorithmLauncher>>& get_cached_launchers();
@@ -31,7 +31,10 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <cub/cub.cuh>
+#include <cub/device/device_histogram.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>
 #include <cuda.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>

@@ -7,7 +7,29 @@
 
 #include <raft/util/cuda_rt_essentials.hpp>
 
-AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k) : kernel{k} {}
+AlgorithmLauncher::AlgorithmLauncher(cudaKernel_t k, cudaLibrary_t lib) : kernel{k}, library{lib} {}
+
+AlgorithmLauncher::~AlgorithmLauncher() { RAFT_CUDA_TRY(cudaLibraryUnload(library)); }
-AlgorithmLauncher::~AlgorithmLauncher() { RAFT_CUDA_TRY(cudaLibraryUnload(library)); }
+AlgorithmLauncher::~AlgorithmLauncher() {
+  if (library != nullptr) {
+    (void)cudaLibraryUnload(library);  // ignore errors in destructor
+  }
+}
-AlgorithmLauncher::~AlgorithmLauncher() { RAFT_CUDA_TRY(cudaLibraryUnload(library)); }
+AlgorithmLauncher::~AlgorithmLauncher() {
+  if (library != nullptr) {
+    (void)cudaLibraryUnload(library);  // ignore errors in destructor
+  }
+}
+
+AlgorithmLauncher::AlgorithmLauncher(AlgorithmLauncher&& other) noexcept
+  : kernel{other.kernel}, library{other.library}
+{
+  other.kernel  = nullptr;
+  other.library = nullptr;
+}
+
+AlgorithmLauncher& AlgorithmLauncher::operator=(AlgorithmLauncher&& other) noexcept
+{
+  if (this != &other) {
+    // Unload current library if it exists
+    if (library != nullptr) { cudaLibraryUnload(library); }
+    kernel        = other.kernel;
+    library       = other.library;
+    other.kernel  = nullptr;
+    other.library = nullptr;
+  }
+  return *this;
+}
 
 void AlgorithmLauncher::call(
   cudaStream_t stream, dim3 grid, dim3 block, std::size_t shared_mem, void** kernel_args)

@@ -106,15 +106,44 @@ std::shared_ptr<AlgorithmLauncher> AlgorithmPlanner::build()
   RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed");
 
   // cubin is linked, so now load it
-  // NOTE: cudaLibrary_t does not need to be freed explicitly
   cudaLibrary_t library;
   RAFT_CUDA_TRY(
     cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
 
-  constexpr unsigned int count = 1;
+  unsigned int kernel_count = 0;
+  RAFT_CUDA_TRY(cudaLibraryGetKernelCount(&kernel_count, library));
+
   // NOTE: cudaKernel_t does not need to be freed explicitly
-  std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[count]};
-  RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), count, library));
+  std::unique_ptr<cudaKernel_t[]> kernels{new cudaKernel_t[kernel_count]};
+  RAFT_CUDA_TRY(cudaLibraryEnumerateKernels(kernels.get(), kernel_count, library));
+
+  // Filter out EmptyKernel by checking kernel names using cudaFuncGetName
+  const char* empty_kernel_name = "_ZN3cub6detail11EmptyKernelIvEEvv";
+  std::vector<cudaKernel_t> valid_kernels;
+  valid_kernels.reserve(kernel_count);
+
+  for (unsigned int i = 0; i < kernel_count; ++i) {
+    // cudaFuncGetName can be used with cudaKernel_t by casting to void*
+    const void* func_ptr  = reinterpret_cast<const void*>(kernels[i]);
+    const char* func_name = nullptr;
+    RAFT_CUDA_TRY(cudaFuncGetName(&func_name, func_ptr));
+
+    bool is_empty_kernel = false;
+    if (func_name != nullptr) {
+      std::string kernel_name(func_name);
+      // Check if this is EmptyKernel
+      if (kernel_name.find(empty_kernel_name) != std::string::npos ||
+          kernel_name == empty_kernel_name) {
+        is_empty_kernel = true;
+      }
+    }
+
+    // Only keep the kernel if it's not EmptyKernel
+    if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); }
-    if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); }
+    if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }
-    if (!is_empty_kernel) { valid_kernels.push_back(kernels.release()[i]); }
+    if (!is_empty_kernel) { valid_kernels.push_back(kernels[i]); }
+  }
+
+  RAFT_EXPECTS(
+    valid_kernels.size() == 1, "Expected 1 valid JIT kernel, got %zu", valid_kernels.size());
 
-  return std::make_shared<AlgorithmLauncher>(kernels.release()[0]);
+  return std::make_shared<AlgorithmLauncher>(valid_kernels[0], library);
 }
@@ -11,7 +11,7 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_radix_sort.cuh>
 #include <cub/block/block_store.cuh>
-#include <cub/cub.cuh>
+#include <cub/warp/warp_reduce.cuh>
 
 namespace cuvs {
 namespace distance {

@@ -1,13 +1,13 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #pragma once
 
 #include <raft/core/math.hpp>
 
-#include <cub/cub.cuh>
+#include <cub/warp/warp_reduce.cuh>
 #include <cuda_fp16.h>
 #include <cuda_pipeline.h>
 

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -14,7 +14,7 @@
 #include <raft/linalg/contractions.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#include <cub/cub.cuh>
+#include <cub/util_type.cuh>
 
 #include <stdint.h>
 

@@ -7,7 +7,7 @@
 #include "../utils.hpp"
 #include "topk.h"
 
-#include <cub/cub.cuh>
+#include <cub/block/block_scan.cuh>
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/error.hpp>

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -8,7 +8,7 @@
 #include <raft/linalg/norm.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#include <cub/cub.cuh>
+#include <cub/util_type.cuh>
 
 #include <limits>
 

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -9,7 +9,6 @@
 
 #include <cuvs/distance/distance.hpp>
 
-#include <cub/cub.cuh>
 #include <cuda_fp16.h>
 #include <cuda_pipeline.h>
 

@@ -1,11 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #pragma once
 
-#include <cub/cub.cuh>
+#include <cub/block/block_merge_sort.cuh>
 
 #include "macros.cuh"
 #include "priority_queue.cuh"

@@ -1,11 +1,10 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #pragma once
 
-#include <cub/cub.cuh>
 #include <thrust/sort.h>
 
 #include <raft/util/cuda_dev_essentials.cuh>

@@ -8,7 +8,8 @@
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
-#include <cub/cub.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/device/device_radix_sort.cuh>
 
 namespace cuvs::neighbors::ivf::detail {
 
@@ -72,4 +73,22 @@ void calc_chunk_indices::configured::operator()(const uint32_t* cluster_sizes,
   RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream));
 }
 
+// Helper function to sort cluster sizes using CUB, extracted from template to avoid
+// including cub/device/* in the header file
+void sort_cluster_sizes_descending(uint32_t* input,
+                                   uint32_t* output,
+                                   uint32_t n_lists,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* tmp_res)
+{
+  int begin_bit             = 0;
+  int end_bit               = sizeof(uint32_t) * 8;
+  size_t cub_workspace_size = 0;
+  cub::DeviceRadixSort::SortKeysDescending(
+    nullptr, cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
+  rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
+  cub::DeviceRadixSort::SortKeysDescending(
+    cub_workspace.data(), cub_workspace_size, input, output, n_lists, begin_bit, end_bit, stream);
+}
+
 }  // namespace cuvs::neighbors::ivf::detail
@@ -9,10 +9,15 @@
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>  // matrix::detail::select::warpsort::warp_sort_distributed
 
-#include <cub/cub.cuh>
-
 namespace cuvs::neighbors::ivf::detail {
 
+// Forward declaration of helper function to avoid including cub/device/* in header
+void sort_cluster_sizes_descending(uint32_t* input,
+                                   uint32_t* output,
+                                   uint32_t n_lists,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* tmp_res);
+
 /**
  * Default value returned by `search` when the `n_probes` is too small and top-k is too large.
  * One may encounter it if the combined size of probed clusters is smaller than the requested
@@ -247,26 +252,9 @@ void recompute_internal_state(const raft::resources& res, Index& index)
   }
 
   // Sort the cluster sizes in the descending order.
-  int begin_bit             = 0;
-  int end_bit               = sizeof(uint32_t) * 8;
-  size_t cub_workspace_size = 0;
-  cub::DeviceRadixSort::SortKeysDescending(nullptr,
-                                           cub_workspace_size,
-                                           index.list_sizes().data_handle(),
-                                           sorted_sizes.data(),
-                                           index.n_lists(),
-                                           begin_bit,
-                                           end_bit,
-                                           stream);
-  rmm::device_buffer cub_workspace(cub_workspace_size, stream, tmp_res);
-  cub::DeviceRadixSort::SortKeysDescending(cub_workspace.data(),
-                                           cub_workspace_size,
-                                           index.list_sizes().data_handle(),
-                                           sorted_sizes.data(),
-                                           index.n_lists(),
-                                           begin_bit,
-                                           end_bit,
-                                           stream);
+  // Use helper function to avoid including cub/device/* in this header
+  sort_cluster_sizes_descending(
+    index.list_sizes().data_handle(), sorted_sizes.data(), index.n_lists(), stream, tmp_res);
   // copy the results to CPU
   std::vector<uint32_t> sorted_sizes_host(index.n_lists());
   raft::copy(sorted_sizes_host.data(), sorted_sizes.data(), index.n_lists(), stream);

@@ -19,7 +19,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/per_device_resource.hpp>
 
-#include <cub/cub.cuh>
 #include <cuda_fp16.h>
 
 namespace cuvs::neighbors::ivf_pq::detail {

@@ -42,7 +42,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/per_device_resource.hpp>
 
-#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
 #include <cuda_fp16.h>
 
 #include <optional>

@@ -1,9 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <cub/cub.cuh>
+#include <cub/device/device_histogram.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_scan.cuh>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/error.hpp>

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -26,7 +26,6 @@
 
 #include <rmm/device_uvector.hpp>
 
-#include <cub/cub.cuh>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include <thrust/gather.h>

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -17,7 +17,7 @@
 
 #include <rmm/device_scalar.hpp>
 
-#include <cub/cub.cuh>
+#include <cub/device/device_histogram.cuh>
 
 #include <math.h>
 

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -13,6 +13,8 @@
 #include <raft/sparse/coo.hpp>
 #include <raft/util/cudart_utils.hpp>
 
+#include <cub/block/block_reduce.cuh>
+
 #include <rmm/device_uvector.hpp>
 
 #include <gtest/gtest.h>

@@ -29,8 +29,6 @@
 
 #include <rmm/device_uvector.hpp>
 
-#include <cub/cub.cuh>
-
 #include <gtest/gtest.h>
 
 #include <vector>