rapidsai
diff --git a/‎cpp/src/neighbors/ball_cover/registers.cuh‎
Lines changed: 3 additions & 3 deletions b/‎cpp/src/neighbors/ball_cover/registers.cuh‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎cpp/src/neighbors/detail/faiss_select/Comparators.cuh‎
Lines changed: 34 additions & 0 deletions b/‎cpp/src/neighbors/detail/faiss_select/Comparators.cuh‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎cpp/src/neighbors/detail/faiss_select/DistanceUtils.h‎
Lines changed: 57 additions & 0 deletions b/‎cpp/src/neighbors/detail/faiss_select/DistanceUtils.h‎
Lines changed: 57 additions & 0 deletions
@@ -11,10 +11,10 @@
 #include "registers_types.cuh"  // DistFunc
 #include <cuvs/neighbors/ball_cover.hpp>
 
+#include "../detail/faiss_select/key_value_block_select.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/linalg/unary_op.cuh>
-#include <raft/neighbors/detail/faiss_select/key_value_block_select.cuh>
 #include <raft/util/cuda_utils.cuh>
 
 #include <thrust/count.h>
@@ -166,7 +166,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_reordered,
     local_x_ptr[j] = x_ptr[j];
   }
 
-  using namespace raft::neighbors::detail::faiss_select;
+  using namespace cuvs::neighbors::detail::faiss_select;
   KeyValueBlockSelect<value_t, value_idx, false, Comparator<value_t>, warp_q, thread_q, tpb> heap(
     std::numeric_limits<value_t>::max(),
     std::numeric_limits<value_t>::max(),
@@ -326,7 +326,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_reordered,
   }
 
   // Each warp works on 1 R
-  using namespace raft::neighbors::detail::faiss_select;
+  using namespace cuvs::neighbors::detail::faiss_select;
   KeyValueBlockSelect<value_t, value_idx, false, Comparator<value_t>, warp_q, thread_q, tpb> heap(
     std::numeric_limits<value_t>::max(),
     std::numeric_limits<value_t>::max(),
 
@@ -0,0 +1,34 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) Facebook, Inc. and its affiliates.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+namespace cuvs::neighbors::detail::faiss_select {
+
+/**
+ * @brief Device-side strict-ordering predicates for values of type T.
+ *
+ * Both predicates forward directly to T's `<` and `>` operators.
+ *
+ * @tparam T value type being compared
+ */
+template <typename T>
+struct Comparator {
+  /** @return the result of `lhs < rhs` */
+  static __device__ inline bool lt(T lhs, T rhs)
+  {
+    return lhs < rhs;
+  }
+
+  /** @return the result of `lhs > rhs` */
+  static __device__ inline bool gt(T lhs, T rhs)
+  {
+    return lhs > rhs;
+  }
+};
+
+/**
+ * @brief Specialization of Comparator for `half`.
+ *
+ * Uses the `__hlt` / `__hgt` half-precision comparison intrinsics rather
+ * than the `<` / `>` operators used by the primary template.
+ */
+template <>
+struct Comparator<half> {
+  /** @return the result of `__hlt(lhs, rhs)` */
+  static __device__ inline bool lt(half lhs, half rhs)
+  {
+    return __hlt(lhs, rhs);
+  }
+
+  /** @return the result of `__hgt(lhs, rhs)` */
+  static __device__ inline bool gt(half lhs, half rhs)
+  {
+    return __hgt(lhs, rhs);
+  }
+};
+
+}  // namespace cuvs::neighbors::detail::faiss_select
@@ -0,0 +1,57 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) Facebook, Inc. and its affiliates.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file thirdparty/LICENSES/LICENSE.faiss
+ */
+
+#pragma once
+
+#include <algorithm>  // std::min
+#include <cstddef>    // size_t
+
+namespace cuvs::neighbors::detail::faiss_select {
+/**
+ * @brief Choose the tile dimensions (rows of queries x columns of centroids).
+ *
+ * If the inner size (dim) of the vectors is small, a larger query tile
+ * (1024 rows instead of 512) is preferred.
+ *
+ * @param[in]  numQueries   number of queries; upper bound for `tileRows`
+ * @param[in]  numCentroids number of centroids; upper bound for `tileCols`
+ * @param[in]  dim          vector dimension; `dim <= 32` selects 1024 preferred
+ *                          rows, anything larger selects 512
+ * @param[in]  elementSize  size of one element, used to convert the byte budget
+ *                          into an element count (presumably in bytes, e.g.
+ *                          sizeof(float) -- confirm against callers). A value of
+ *                          0 is treated as 1 so the division below is defined.
+ * @param[in]  totalMem     total GPU memory in bytes; selects the usage budget
+ * @param[out] tileRows     min(preferred rows, numQueries)
+ * @param[out] tileCols     min(budget / preferred rows, numCentroids)
+ */
+inline void chooseTileSize(size_t numQueries,
+                           size_t numCentroids,
+                           size_t dim,
+                           size_t elementSize,
+                           size_t totalMem,
+                           size_t& tileRows,
+                           size_t& tileCols)
+{
+  // Keep all size arithmetic in size_t rather than int.
+  constexpr size_t kMiB = size_t{1024} * 1024;
+  constexpr size_t kGiB = kMiB * 1024;
+
+  // The matrix multiplication should be large enough to be efficient, but if
+  // it is too large, we seem to lose efficiency as opposed to
+  // double-streaming. Each tile size here defines 1/2 of the memory use due
+  // to double streaming. We ignore available temporary memory, as that is
+  // adjusted independently by the user and can thus meet these requirements
+  // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs,
+  // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage.
+  size_t targetUsage = 1024 * kMiB;
+  if (totalMem <= 4 * kGiB) {
+    targetUsage = 512 * kMiB;
+  } else if (totalMem <= 8 * kGiB) {
+    targetUsage = 768 * kMiB;
+  }
+
+  // Convert the byte budget into elements per tile. The factor of 2 accounts
+  // for double streaming. Clamp the divisor so elementSize == 0 cannot cause
+  // an integer division by zero.
+  const size_t safeElementSize = (elementSize == 0) ? size_t{1} : elementSize;
+  targetUsage /= 2 * safeElementSize;
+
+  // 512 rows is the preferred batch size regardless of element type (there is
+  // no separate float16 case). If the k size (vec dim) of the matrix
+  // multiplication is small (<= 32), increase to 1024.
+  const size_t preferredTileRows = (dim <= 32) ? 1024 : 512;
+
+  tileRows = std::min(preferredTileRows, numQueries);
+
+  // tileCols is the remainder size
+  tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
+}
+}  // namespace cuvs::neighbors::detail::faiss_select