ReachabilityPostProcess distance epilogue for NN Descent (#1073)

jinsolp · web-flow · commit 23c6dc006e0e · 2025-07-08T00:27:07.000Z
NN Descent now supports distance epilogues. The currently supported epilogues are `ReachabilityPostProcess` and `identity_op`; adding a new distance epilogue requires adding new explicit template instantiations. Mutual reachability computation will eventually be hidden behind the `all_neighbors` API, but basic tests are still added in this PR to ensure the correctness of this feature. Authors: - Jinsol Park (https://github.com/jinsolp) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #1073
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -488,7 +488,7 @@ __device__ __forceinline__ void remove_duplicates(
 // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048
 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM
 // is 1024 and 1536 respectively, which means the bounds don't work anymore
-template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
+template <typename Index_t, typename ID_t = InternalID_t<Index_t>, typename DistEpilogue_t>
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
 // Use minBlocksPerMultiprocessor = 4 on specific arches
@@ -513,7 +513,8 @@ __launch_bounds__(BLOCK_SIZE)
                     int graph_width,
                     int* locks,
                     DistData_t* l2_norms,
-                    cuvs::distance::DistanceType metric)
+                    cuvs::distance::DistanceType metric,
+                    DistEpilogue_t dist_epilogue)
 {
 #if (__CUDA_ARCH__ >= 700)
   using namespace nvcuda;
@@ -623,20 +624,22 @@ __launch_bounds__(BLOCK_SIZE)
   __syncthreads();
 
   for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
-    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size &&
-        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+    int row_id = i % SKEWED_MAX_NUM_BI_SAMPLES;
+    int col_id = i / SKEWED_MAX_NUM_BI_SAMPLES;
+
+    if (row_id < list_new_size && col_id < list_new_size) {
       if (metric == cuvs::distance::DistanceType::InnerProduct) {
         s_distances[i] = -s_distances[i];
       } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
         s_distances[i] = 1.0 - s_distances[i];
       } else {  // L2Expanded or L2SqrtExpanded
-        s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
-                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
-                         2.0 * s_distances[i];
+        s_distances[i] =
+          l2_norms[new_neighbors[row_id]] + l2_norms[new_neighbors[col_id]] - 2.0 * s_distances[i];
         // for fp32 vs fp16 precision differences resulting in negative distances when distance
         // should be 0 related issue: https://github.com/rapidsai/cuvs/issues/991
         s_distances[i] = s_distances[i] < 0.0f ? 0.0f : s_distances[i];
       }
+      s_distances[i] = dist_epilogue(s_distances[i], new_neighbors[row_id], new_neighbors[col_id]);
     } else {
       s_distances[i] = std::numeric_limits<float>::max();
     }
@@ -707,20 +710,21 @@ __launch_bounds__(BLOCK_SIZE)
   __syncthreads();
 
   for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
-    if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size &&
-        i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
+    int row_id = i % SKEWED_MAX_NUM_BI_SAMPLES;
+    int col_id = i / SKEWED_MAX_NUM_BI_SAMPLES;
+    if (row_id < list_old_size && col_id < list_new_size) {
       if (metric == cuvs::distance::DistanceType::InnerProduct) {
         s_distances[i] = -s_distances[i];
       } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
         s_distances[i] = 1.0 - s_distances[i];
       } else {  // L2Expanded or L2SqrtExpanded
-        s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
-                         l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
-                         2.0 * s_distances[i];
+        s_distances[i] =
+          l2_norms[old_neighbors[row_id]] + l2_norms[new_neighbors[col_id]] - 2.0 * s_distances[i];
         // for fp32 vs fp16 precision differences resulting in negative distances when distance
         // should be 0 related issue: https://github.com/rapidsai/cuvs/issues/991
         s_distances[i] = s_distances[i] < 0.0f ? 0.0f : s_distances[i];
       }
+      s_distances[i] = dist_epilogue(s_distances[i], old_neighbors[row_id], new_neighbors[col_id]);
     } else {
       s_distances[i] = std::numeric_limits<float>::max();
     }
@@ -1034,7 +1038,8 @@ void GNND<Data_t, Index_t>::add_reverse_edges(Index_t* graph_ptr,
 }
 
 template <typename Data_t, typename Index_t>
-void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
+template <typename DistEpilogue_t>
+void GNND<Data_t, Index_t>::local_join(cudaStream_t stream, DistEpilogue_t dist_epilogue)
 {
   raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
   local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
@@ -1051,15 +1056,18 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
                                                       DEGREE_ON_DEVICE,
                                                       d_locks_.data_handle(),
                                                       l2_norms_.data_handle(),
-                                                      build_config_.metric);
+                                                      build_config_.metric,
+                                                      dist_epilogue);
 }
 
 template <typename Data_t, typename Index_t>
+template <typename DistEpilogue_t>
 void GNND<Data_t, Index_t>::build(Data_t* data,
                                   const Index_t nrow,
                                   Index_t* output_graph,
                                   bool return_distances,
-                                  DistData_t* output_distances)
+                                  DistData_t* output_distances,
+                                  DistEpilogue_t dist_epilogue)
 {
   using input_t = typename std::remove_const<Data_t>::type;
 
@@ -1154,7 +1162,7 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
       raft::util::arch::SM_range(raft::util::arch::SM_70(), raft::util::arch::SM_future());
 
     if (wmma_range.contains(runtime_arch)) {
-      local_join(stream);
+      local_join(stream, dist_epilogue);
     } else {
       THROW("NN_DESCENT cannot be run for __CUDA_ARCH__ < 700");
     }
diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp
@@ -207,11 +207,13 @@ class GNND {
   GNND(const GNND&)            = delete;
   GNND& operator=(const GNND&) = delete;
 
+  template <typename DistEpilogue_t = raft::identity_op>
   void build(Data_t* data,
              const Index_t nrow,
              Index_t* output_graph,
              bool return_distances,
-             DistData_t* output_distances);
+             DistData_t* output_distances,
+             DistEpilogue_t dist_epilogue = DistEpilogue_t{});
   ~GNND()    = default;
   using ID_t = InternalID_t<Index_t>;
   void reset(raft::resources const& res);
@@ -222,7 +224,9 @@ class GNND {
                          Index_t* d_rev_graph_ptr,
                          int2* list_sizes,
                          cudaStream_t stream = 0);
-  void local_join(cudaStream_t stream = 0);
+
+  template <typename DistEpilogue_t = raft::identity_op>
+  void local_join(cudaStream_t stream = 0, DistEpilogue_t dist_epilogue = DistEpilogue_t{});
 
   raft::resources const& res;
 
diff --git a/cpp/src/neighbors/detail/reachability.cuh b/cpp/src/neighbors/detail/reachability.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -131,6 +131,7 @@ struct ReachabilityPostProcess {
 
   const value_t* core_dists;
   value_t alpha;
+  size_t n;  // size of core_dists array
 };
 
 /**
@@ -163,7 +164,7 @@ void mutual_reachability_knn_l2(const raft::resources& handle,
   // `A type local to a function cannot be used in the template argument of the
   // enclosing parent function (and any parent classes) of an extended __device__
   // or __host__ __device__ lambda`
-  auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha};
+  auto epilogue = ReachabilityPostProcess<value_idx, value_t>{core_dists, alpha, m};
 
   cuvs::neighbors::detail::
     tiled_brute_force_knn<value_t, value_idx, value_t, ReachabilityPostProcess<value_idx, value_t>>(
diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "./detail/nn_descent_gnnd.hpp"
+#include "./detail/reachability.cuh"
 #include "nn_descent.cuh"
 #include <cuvs/neighbors/nn_descent.hpp>
 
@@ -54,7 +56,30 @@ namespace cuvs::neighbors::nn_descent {
       return idx;                                                                             \
     }                                                                                         \
   };                                                                                          \
-  template class detail::GNND<const T, int>;
+  template class detail::GNND<const T, int>;                                                  \
+                                                                                              \
+  template void detail::GNND<const T, int>::build<                                            \
+    cuvs::neighbors::detail::reachability::ReachabilityPostProcess<int, T>>(                  \
+    const T* data,                                                                            \
+    const int nrow,                                                                           \
+    int* output_graph,                                                                        \
+    bool return_distances,                                                                    \
+    float* output_distances,                                                                  \
+    cuvs::neighbors::detail::reachability::ReachabilityPostProcess<int, T> dist_epilogue);    \
+  template void detail::GNND<const T, int>::local_join<                                       \
+    cuvs::neighbors::detail::reachability::ReachabilityPostProcess<int, T>>(                  \
+    cudaStream_t stream,                                                                      \
+    cuvs::neighbors::detail::reachability::ReachabilityPostProcess<int, T> dist_epilogue);    \
+                                                                                              \
+  template void detail::GNND<const T, int>::build<raft::identity_op>(                         \
+    const T* data,                                                                            \
+    const int nrow,                                                                           \
+    int* output_graph,                                                                        \
+    bool return_distances,                                                                    \
+    float* output_distances,                                                                  \
+    raft::identity_op dist_epilogue);                                                         \
+  template void detail::GNND<const T, int>::local_join<raft::identity_op>(                    \
+    cudaStream_t stream, raft::identity_op dist_epilogue);
 
 CUVS_INST_NN_DESCENT_BUILD(float, uint32_t);
 
diff --git a/cpp/tests/neighbors/ann_nn_descent.cuh b/cpp/tests/neighbors/ann_nn_descent.cuh
diff --git a/cpp/tests/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_nn_descent/test_float_uint32_t.cu