cuvs/cpp/src/neighbors/detail/vamana/greedy_search.cuh at 949700d6f927fceca976355ee5811594a09cfc50 · divyegala/cuvs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <cub/block/block_merge_sort.cuh>

#include "macros.cuh"
#include "priority_queue.cuh"
#include "vamana_structs.cuh"
#include <cuvs/neighbors/vamana.hpp>

#include <cuvs/distance/distance.hpp>
#include <raft/util/warp_primitives.cuh>
#include <rmm/resource_ref.hpp>

#include <chrono>
#include <cstdio>
#include <vector>

namespace cuvs::neighbors::vamana::detail {

/** @defgroup greedy_search_detail greedy search
 * @{
 */

/* Sorts the first CANDS (id, distance) entries of a query's candidate list in place,
 * ordered by the CmpDist comparator.
 *
 * Uses cub::BlockMergeSort instantiated for 32 threads with CANDS / 32 items per thread,
 * so CANDS must be a compile-time constant (hence the template parameter).
 * Each thread owns a contiguous slice of CANDS / 32 entries starting at
 * (CANDS / 32) * threadIdx.x.
 *
 * NOTE(review): the indexing and the CUB instantiation both assume the block has exactly
 * 32 threads in x - confirm against the launch configuration.
 *
 * query    - candidate list whose ids[] / dists[] arrays are sorted in place
 * sort_mem - temporary storage required by the CUB block sort
 */
template <typename accT, typename IdxT, int CANDS>
__forceinline__ __device__ void sort_visited(
  QueryCandidates<IdxT, accT>* query,
  typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, (CANDS / 32)>::TempStorage* sort_mem)
{
  constexpr int kItemsPerThread = CANDS / 32;
  using PairT                   = DistPair<IdxT, accT>;
  using SorterT                 = cub::BlockMergeSort<PairT, 32, kItemsPerThread>;

  // First list position owned by this thread.
  const unsigned int base = kItemsPerThread * threadIdx.x;

  // Gather this thread's slice of the list into registers.
  PairT local[kItemsPerThread];
  for (int k = 0; k < kItemsPerThread; ++k) {
    local[k].idx  = query->ids[base + k];
    local[k].dist = query->dists[base + k];
  }

  __syncthreads();
  SorterT(*sort_mem).Sort(local, CmpDist<IdxT, accT>());
  __syncthreads();

  // Scatter the sorted items back to the same slice of the list.
  for (int k = 0; k < kItemsPerThread; ++k) {
    query->ids[base + k]   = local[k].idx;
    query->dists[base + k] = local[k].dist;
  }
  __syncthreads();
}

namespace {

/* Kernel that runs the SEARCH_SELECT_SORT macro on every query in a batch.
 *
 * Block b handles queries b, b + gridDim.x, b + 2 * gridDim.x, ... below num_queries.
 *
 * query_list_ptr - type-erased pointer to an array of QueryCandidates<IdxT, accT>
 * num_queries    - number of entries in that array
 * topk           - value handed to SEARCH_SELECT_SORT
 *
 * NOTE(review): SEARCH_SELECT_SORT is defined outside this file (presumably in macros.cuh).
 * It presumably dispatches on topk to a sort_visited<> instantiation and refers to the local
 * names `query_list`, `i` and `smem` - confirm before renaming any of them.
 * NOTE(review): template parameter T is not referenced in the visible body.
 */
template <typename T, typename accT, typename IdxT = uint32_t>
__global__ void SortPairsKernel(void* query_list_ptr, int num_queries, int topk)
{
  // Only used to give the dynamic shared-memory buffer the alignment of the CUB temp storage.
  union ShmemLayout {
    typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, 1>::TempStorage sort_mem;
  };
  // Dynamic shared memory; its size is supplied at kernel launch.
  extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

  // Recover the typed view of the type-erased query list.
  QueryCandidates<IdxT, accT>* query_list =
    static_cast<QueryCandidates<IdxT, accT>*>(query_list_ptr);

  for (int i = blockIdx.x; i < num_queries; i += gridDim.x) {
    // Block-wide barrier before each query is processed.
    __syncthreads();
    SEARCH_SELECT_SORT(topk);
  }
}

/********************************************************************************************
  GPU kernel to perform a batched GreedySearch on a graph. Since this is used for
  Vamana construction, the entire visited list is kept and stored within the query_list.

  Input - graph with edge lists, dataset vectors, query_list_ptr with the ids of dataset
          vectors to be searched. All inputs, including dataset, must be device accessible.

  Output - the id and dist lists in query_list_ptr will be updated with the nodes visited
           during the GreedySearch.

  Parameters
    graph          - adjacency matrix; row r holds the neighbor ids of node r, unused slots
                     are marked with raft::upper_bound<IdxT>()
    dataset        - row-major matrix of vectors (one row per node)
    query_list_ptr - type-erased pointer to an array of QueryCandidates<IdxT, accT>
    num_queries    - number of entries in the query list
    medoid_id      - row of `dataset` used as the starting point of every search
    topk           - capacity of the per-block top-k queue
    metric         - distance metric forwarded to dist<>()
    max_queue_size - capacity handed to the candidate PriorityQueue
    topk_pq_mem    - scratch memory for the top-k queues; indexed as blockIdx.x * topk, so it
                     must hold at least gridDim.x * topk Node<accT> entries

  Launch notes
    - Block b handles queries b, b + gridDim.x, ... below num_queries.
    - Dynamic shared memory is carved up, in order, into: the query coordinates
      ((dim + align_padding) * sizeof(T) bytes), the neighbor id array (degree * sizeof(int)
      bytes), and the storage passed to the candidate PriorityQueue.
    - NOTE(review): values are broadcast with raft::shfl from lane 0, which only reaches the
      threads of one warp - this presumably requires a block of exactly one warp; confirm
      against the launch configuration.
**********************************************************************************************/
template <typename T,
          typename accT,
          typename IdxT = uint32_t,
          typename Accessor =
            raft::host_device_accessor<cuda::std::default_accessor<T>, raft::memory_type::host>>
__global__ void GreedySearchKernel(
  raft::device_matrix_view<IdxT, int64_t> graph,
  raft::mdspan<const T, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
  void* query_list_ptr,
  int num_queries,
  int medoid_id,
  int topk,
  cuvs::distance::DistanceType metric,
  int max_queue_size,
  Node<accT>* topk_pq_mem)
{
  int dim    = dataset.extent(1);
  int degree = graph.extent(1);

  QueryCandidates<IdxT, accT>* query_list =
    static_cast<QueryCandidates<IdxT, accT>*>(query_list_ptr);

  // Block-shared bookkeeping for the top-k queue and the candidate queue.
  static __shared__ int topk_q_size;
  static __shared__ int cand_q_size;
  static __shared__ accT cur_k_max;
  static __shared__ int k_max_idx;

  static __shared__ Point<T, accT> s_query;

  union ShmemLayout {
    // All blocksort sizes have same alignment (16)
    T coords;
    int neighborhood_arr;
    DistPair<IdxT, accT> candidate_queue;
  };

  // Number of extra elements needed to round dim up to a multiple of alignof(ShmemLayout).
  int align_padding = (((dim - 1) / alignof(ShmemLayout)) + 1) * alignof(ShmemLayout) - dim;

  // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list
  extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

  size_t smem_offset = 0;

  // Region 1: coordinates of the current query vector.
  T* s_coords = reinterpret_cast<T*>(&smem[smem_offset]);
  smem_offset += (dim + align_padding) * sizeof(T);

  // The top-k queue lives in caller-provided memory, one slice of topk entries per block.
  Node<accT>* topk_pq = &topk_pq_mem[blockIdx.x * topk];

  // Region 2: neighbor ids of the node currently being expanded.
  int* neighbor_array = reinterpret_cast<int*>(&smem[smem_offset]);
  smem_offset += degree * sizeof(int);

  // Region 3: backing storage for the candidate priority queue.
  DistPair<IdxT, accT>* candidate_queue_smem =
    reinterpret_cast<DistPair<IdxT, accT>*>(&smem[smem_offset]);

  s_query.coords = s_coords;
  s_query.Dim    = dim;

  PriorityQueue<IdxT, accT> heap_queue;

  if (threadIdx.x == 0) {
    heap_queue.initialize(candidate_queue_smem, max_queue_size, &cand_q_size);
  }

  static __shared__ int num_neighbors;

  for (int i = blockIdx.x; i < num_queries; i += gridDim.x) {
    __syncthreads();

    // resetting visited list
    query_list[i].reset();

    // storing the current query vector into shared memory
    update_shared_point<T, accT>(&s_query, &dataset(0, 0), query_list[i].queryId, dim);

    if (threadIdx.x == 0) {
      topk_q_size = 0;
      cand_q_size = 0;
      s_query.id  = query_list[i].queryId;
      cur_k_max   = 0;
      k_max_idx   = 0;
      heap_queue.reset();
    }

    __syncthreads();

    Point<T, accT>* query_vec;

    // Just start from medoid every time, rather than multiple set_ups
    query_vec        = &s_query;
    query_vec->Dim   = dim;
    const T* medoid  = &dataset((size_t)medoid_id, 0);
    accT medoid_dist = dist<T, accT>(query_vec->coords, medoid, dim, metric);

    if (threadIdx.x == 0) { heap_queue.insert_back(medoid_dist, medoid_id); }
    __syncthreads();

    while (cand_q_size != 0) {
      __syncthreads();

      int cand_num;
      accT cur_distance;
      // Only thread 0 touches the candidate queue; the popped entry is broadcast below.
      if (threadIdx.x == 0) {
        Node<accT> test_cand;
        DistPair<IdxT, accT> test_cand_out = heap_queue.pop();
        test_cand.distance                 = test_cand_out.dist;
        test_cand.nodeid                   = test_cand_out.idx;
        cand_num                           = test_cand.nodeid;
        cur_distance                       = test_cand_out.dist;
      }
      __syncthreads();

      // Broadcast both the id and the distance from lane 0 before anything consumes them, so
      // that no lane passes an indeterminate distance to check_visited().
      cand_num     = raft::shfl(cand_num, 0);
      cur_distance = raft::shfl(cur_distance, 0);

      __syncthreads();

      if (query_list[i].check_visited(cand_num, cur_distance)) { continue; }

      // stop condition for the graph traversal process
      bool done      = false;
      bool pass_flag = false;

      if (topk_q_size == topk) {
        // Check the current node with the worst candidate in top-k queue
        if (threadIdx.x == 0) {
          if (cur_k_max <= cur_distance) { done = true; }
        }

        done = raft::shfl(done, 0);
        if (done) {
          // Keep expanding (without touching the top-k queue) until the visited list holds at
          // least topk entries; otherwise the search for this query is finished.
          if (query_list[i].size < topk) {
            pass_flag = true;
          }

          else if (query_list[i].size >= topk) {
            break;
          }
        }
      }

      // The current node is closer to the query vector than the worst candidate in top-K queue, so
      // enqueue the current node in top-k queue
      Node<accT> new_cand;
      new_cand.distance = cur_distance;
      new_cand.nodeid   = cand_num;

      if (check_duplicate(topk_pq, topk_q_size, new_cand) == false) {
        if (!pass_flag) {
          parallel_pq_max_enqueue<accT>(
            topk_pq, &topk_q_size, topk, new_cand, &cur_k_max, &k_max_idx);

          __syncthreads();
        }
      } else {
        // already visited
        continue;
      }

      num_neighbors = degree;
      __syncthreads();

      for (size_t j = threadIdx.x; j < degree; j += blockDim.x) {
        // Load neighbors from the graph array and store them in neighbor array (shared memory)
        neighbor_array[j] = graph(cand_num, j);
        if (neighbor_array[j] == raft::upper_bound<IdxT>())
          atomicMin(&num_neighbors, (int)j);  // block-wide min: index of the first unused slot
      }

      // neighbor_array[] and num_neighbors were just written by different threads; every thread
      // must observe the final values before num_neighbors is read as an argument below.
      __syncthreads();

      // computing distances between the query vector and neighbor vectors then enqueue in priority
      // queue.
      enqueue_all_neighbors<T, accT, IdxT>(
        num_neighbors, query_vec, &dataset(0, 0), neighbor_array, heap_queue, dim, metric);

      __syncthreads();

    }  // End cand_q_size != 0 loop

    // Per-thread flag: set only on the thread(s) that found the query's own id in the list.
    bool self_found = false;
    // Remove self edges
    for (int j = threadIdx.x; j < query_list[i].size; j += blockDim.x) {
      if (query_list[i].ids[j] == query_vec->id) {
        query_list[i].dists[j] = raft::upper_bound<accT>();
        query_list[i].ids[j]   = raft::upper_bound<IdxT>();
        self_found             = true;  // Flag to reduce size by 1
      }
    }

    // Fill the unused tail of the list with sentinel values.
    for (int j = query_list[i].size + threadIdx.x; j < query_list[i].maxSize; j += blockDim.x) {
      query_list[i].ids[j]   = raft::upper_bound<IdxT>();
      query_list[i].dists[j] = raft::upper_bound<accT>();
    }

    // All reads of size above must complete before it is decremented.
    __syncthreads();
    if (self_found) query_list[i].size--;
  }

  return;
}

}  // namespace

/**
 * @}
 */

}  // namespace cuvs::neighbors::vamana::detail