From 4379bfd438b529221733ff6fd1dbfbd98086bbba Mon Sep 17 00:00:00 2001 From: bkarsin Date: Thu, 19 Dec 2024 07:21:09 +0000 Subject: [PATCH 01/13] Add support for any dataset dimension and fix bug with alignment --- .../neighbors/detail/vamana/greedy_search.cuh | 7 +- .../neighbors/detail/vamana/robust_prune.cuh | 4 +- .../neighbors/detail/vamana/vamana_build.cuh | 11 +-- .../detail/vamana/vamana_structs.cuh | 5 +- cpp/test/neighbors/ann_vamana.cuh | 83 +++++++++---------- 5 files changed, 58 insertions(+), 52 deletions(-) diff --git a/cpp/src/neighbors/detail/vamana/greedy_search.cuh b/cpp/src/neighbors/detail/vamana/greedy_search.cuh index f51c6c91bf..f7dff7a259 100644 --- a/cpp/src/neighbors/detail/vamana/greedy_search.cuh +++ b/cpp/src/neighbors/detail/vamana/greedy_search.cuh @@ -103,6 +103,7 @@ __global__ void GreedySearchKernel( static __shared__ Point s_query; + union ShmemLayout { // All blocksort sizes have same alignment (16) typename cub::BlockMergeSort, 32, 1>::TempStorage sort_mem; @@ -112,13 +113,15 @@ __global__ void GreedySearchKernel( DistPair candidate_queue; }; + int align_padding = (((dim-1)/alignof(ShmemLayout))+1)*alignof(ShmemLayout) - dim; + // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; size_t smem_offset = sort_smem_size; // temp sorting memory takes first chunk T* s_coords = reinterpret_cast(&smem[smem_offset]); - smem_offset += dim * sizeof(T); + smem_offset += (dim+align_padding) * sizeof(T); Node* topk_pq = reinterpret_cast*>(&smem[smem_offset]); smem_offset += topk * sizeof(Node); @@ -170,7 +173,7 @@ __global__ void GreedySearchKernel( if (threadIdx.x == 0) { heap_queue.insert_back(medoid_dist, medoid_id); } __syncthreads(); - + while (cand_q_size != 0) { __syncthreads(); diff --git a/cpp/src/neighbors/detail/vamana/robust_prune.cuh b/cpp/src/neighbors/detail/vamana/robust_prune.cuh index 8446ac136d..fe96dc0360 100644 --- a/cpp/src/neighbors/detail/vamana/robust_prune.cuh +++ b/cpp/src/neighbors/detail/vamana/robust_prune.cuh @@ -145,9 +145,11 @@ __global__ void RobustPruneKernel( // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; + int align_padding = (((dim-1)/alignof(ShmemLayout))+1)*alignof(ShmemLayout) - dim; + T* s_coords = reinterpret_cast(&smem[sort_smem_size]); DistPair* new_nbh_list = - reinterpret_cast*>(&smem[dim * sizeof(T) + sort_smem_size]); + reinterpret_cast*>(&smem[(dim+align_padding) * sizeof(T) + sort_smem_size]); static __shared__ Point s_query; s_query.coords = s_coords; diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index ec75c99c11..7d961b6ebc 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -175,13 +175,16 @@ void batched_insert_vamana( SELECT_SMEM_SIZES(degree, visited_size); // Sets above 2 variables to appropriate sizes // Total dynamic shared memory used by GreedySearch + int align_padding = ((((dim-1)/16)+1)*16) - dim; int search_smem_total_size = - static_cast(search_smem_sort_size + dim * sizeof(T) + visited_size * sizeof(Node) + + static_cast(search_smem_sort_size + (dim+align_padding) * sizeof(T) + + visited_size * sizeof(Node) + degree * sizeof(int) + queue_size * sizeof(DistPair)); // Total dynamic shared memory size needed by both RobustPrune calls int prune_smem_total_size = - prune_smem_sort_size + dim * sizeof(T) + (degree + visited_size) * sizeof(DistPair); + prune_smem_sort_size + (dim+align_padding) * sizeof(T) + + (degree + visited_size) * sizeof(DistPair); RAFT_LOG_DEBUG("Dynamic shared memory usage (bytes): GreedySearch: %d, RobustPrune: %d", search_smem_total_size, @@ -228,7 +231,6 @@ void batched_insert_vamana( metric, queue_size, search_smem_sort_size); - // Run on candidates of vectors being inserted RobustPruneKernel <<>>(d_graph.view(), @@ -279,6 +281,7 @@ void batched_insert_vamana( edge_dest.data_handle() + total_edges); auto unique_indices = raft::make_device_vector(res, total_edges); raft::linalg::map_offset(res, unique_indices.view(), raft::identity_op{}); + thrust::unique_by_key( edge_dest_vec.begin(), edge_dest_vec.end(), unique_indices.data_handle()); @@ -371,8 +374,6 @@ index build( RAFT_EXPECTS(params.visited_size > graph_degree, "visited_size must be > graph_degree"); int dim = dataset.extent(1); - // TODO - Fix issue with alignment when dataset dimension is odd - RAFT_EXPECTS(dim % 2 == 0, "Datasets with an odd number of dimensions not currently supported"); RAFT_LOG_DEBUG("Creating empty graph structure"); auto vamana_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); diff --git a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh index f6f0279f7c..5485d5e94d 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh @@ -170,7 +170,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve temp_dst[0] = dst_vec->coords[i]; if (i + 32 < src_vec->Dim) temp_dst[1] = dst_vec->coords[i + 32]; if (i + 64 < src_vec->Dim) temp_dst[2] = dst_vec->coords[i + 64]; - if (i + 92 < src_vec->Dim) temp_dst[3] = dst_vec->coords[i + 96]; + if (i + 96 < src_vec->Dim) temp_dst[3] = dst_vec->coords[i + 96]; partial_sum[0] = fmaf( (src_vec[0].coords[i] - temp_dst[0]), (src_vec[0].coords[i] - temp_dst[0]), partial_sum[0]); @@ -182,7 +182,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve partial_sum[2] = fmaf((src_vec[0].coords[i + 64] - temp_dst[2]), (src_vec[0].coords[i + 64] - temp_dst[2]), partial_sum[2]); - if (i + 92 < src_vec->Dim) + if (i + 96 < src_vec->Dim) partial_sum[3] = fmaf((src_vec[0].coords[i + 96] - temp_dst[3]), (src_vec[0].coords[i + 96] - temp_dst[3]), partial_sum[3]); @@ -192,6 +192,7 @@ __device__ SUMTYPE l2_ILP4(Point* src_vec, Point* dst_ve for (int offset = 16; offset > 0; offset /= 2) { partial_sum[0] += __shfl_down_sync(FULL_BITMASK, partial_sum[0], offset); } + return partial_sum[0]; } diff --git a/cpp/test/neighbors/ann_vamana.cuh b/cpp/test/neighbors/ann_vamana.cuh index 9d9df44703..8572571244 100644 --- a/cpp/test/neighbors/ann_vamana.cuh +++ b/cpp/test/neighbors/ann_vamana.cuh @@ -264,9 +264,7 @@ inline std::vector generate_inputs() { std::vector inputs = raft::util::itertools::product( {1000}, - // {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // TODO - fix alignment - // issue for odd dims - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {32}, // graph degree {64, 128, 256}, // visited_size {0.06, 0.1}, @@ -282,54 +280,55 @@ inline std::vector generate_inputs() std::vector inputs2 = raft::util::itertools::product({1000}, - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {64}, // graph degree - {128, 256, 512}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {32}, - {1}, - {0.2}); + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {64}, // graph degree + {128, 256, 512}, // visited_size + {0.06, 0.1}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {32}, + {1}, + {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); inputs2 = raft::util::itertools::product({1000}, - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {128}, // graph degree - {256, 512}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {64}, - {1}, - {0.2}); + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {128}, // graph degree + {256, 512}, // visited_size + {0.06, 0.1}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {64}, + {1}, + {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); inputs2 = raft::util::itertools::product({1000}, - {16, 32, 64, 128, 192, 256, 512, 1024}, // dim - {256}, // graph degree - {512, 1024}, // visited_size - {0.06, 0.1}, - {cuvs::distance::DistanceType::L2Expanded}, - {false}, - {100}, - {10}, - {cagra::search_algo::AUTO}, - {10}, - {64}, - {1}, - {0.2}); + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {256}, // graph degree + {512, 1024}, // visited_size + {0.06, 0.1}, + {cuvs::distance::DistanceType::L2Expanded}, + {false}, + {100}, + {10}, + {cagra::search_algo::AUTO}, + {10}, + {64}, + {1}, + {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + return inputs; } From 82dfb2b464f07848bb2bb447d805e7b57831aeb8 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Mon, 23 Dec 2024 21:27:10 +0000 Subject: [PATCH 02/13] Remove experimental namespace from vamana --- cpp/include/cuvs/neighbors/vamana.hpp | 34 +++++++++---------- .../neighbors/detail/vamana/greedy_search.cuh | 4 +-- cpp/src/neighbors/detail/vamana/macros.cuh | 4 +-- .../detail/vamana/priority_queue.cuh | 4 +-- .../neighbors/detail/vamana/robust_prune.cuh | 4 +-- .../neighbors/detail/vamana/vamana_build.cuh | 4 +-- .../detail/vamana/vamana_serialize.cuh | 4 +-- .../detail/vamana/vamana_structs.cuh | 4 +-- cpp/src/neighbors/vamana.cuh | 8 ++--- cpp/src/neighbors/vamana_build_float.cu | 16 ++++----- cpp/src/neighbors/vamana_build_int8.cu | 16 ++++----- cpp/src/neighbors/vamana_build_uint8.cu | 16 ++++----- cpp/src/neighbors/vamana_serialize.cuh | 8 ++--- cpp/src/neighbors/vamana_serialize_float.cu | 4 +-- cpp/src/neighbors/vamana_serialize_int8.cu | 4 +-- cpp/src/neighbors/vamana_serialize_uint8.cu | 4 +-- cpp/test/neighbors/ann_vamana.cuh | 4 +-- .../ann_vamana/test_float_uint32_t.cu | 4 +-- .../ann_vamana/test_int8_t_uint32_t.cu | 4 +-- .../ann_vamana/test_uint8_t_uint32_t.cu | 4 +-- docs/source/cpp_api/neighbors.rst | 1 + 21 files changed, 78 insertions(+), 77 deletions(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index bec17937f9..4b4a8a8d3e 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -31,7 +31,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { /** * @defgroup vamana_cpp_index_params Vamana index build parameters * @{ @@ -219,34 +219,34 @@ struct index : cuvs::neighbors::index { * */ auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; auto build(raft::resources const& handle, - const cuvs::neighbors::experimental::vamana::index_params& params, + const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) - -> cuvs::neighbors::experimental::vamana::index; + -> cuvs::neighbors::vamana::index; /** * @defgroup vamana_cpp_serialize Vamana serialize functions @@ -258,18 +258,18 @@ auto build(raft::resources const& handle, void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); void serialize(raft::resources const& handle, const std::string& file_prefix, - const cuvs::neighbors::experimental::vamana::index& index); + const cuvs::neighbors::vamana::index& index); /** * @} */ -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/detail/vamana/greedy_search.cuh b/cpp/src/neighbors/detail/vamana/greedy_search.cuh index f7dff7a259..6a79383213 100644 --- a/cpp/src/neighbors/detail/vamana/greedy_search.cuh +++ b/cpp/src/neighbors/detail/vamana/greedy_search.cuh @@ -30,7 +30,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* @defgroup greedy_search_detail greedy search * @{ @@ -286,4 +286,4 @@ __global__ void GreedySearchKernel( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/macros.cuh b/cpp/src/neighbors/detail/vamana/macros.cuh index 5692650a0e..c290413a20 100644 --- a/cpp/src/neighbors/detail/vamana/macros.cuh +++ b/cpp/src/neighbors/detail/vamana/macros.cuh @@ -16,7 +16,7 @@ #pragma once -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* Macros to compute the shared memory requirements for CUB primitives used by search and prune */ #define COMPUTE_SMEM_SIZES(degree, visited_size, DEG, CANDS) \ @@ -79,4 +79,4 @@ namespace cuvs::neighbors::experimental::vamana::detail { SEARCH_CALL_SORT(topk, 512); \ SEARCH_CALL_SORT(topk, 1024); -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/priority_queue.cuh b/cpp/src/neighbors/detail/vamana/priority_queue.cuh index 4b3bd84661..6dc1dc94a1 100644 --- a/cpp/src/neighbors/detail/vamana/priority_queue.cuh +++ b/cpp/src/neighbors/detail/vamana/priority_queue.cuh @@ -20,7 +20,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /*************************************************************************************** ***************************************************************************************/ @@ -326,4 +326,4 @@ __forceinline__ __device__ void enqueue_all_neighbors(int num_neighbors, } } -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/robust_prune.cuh b/cpp/src/neighbors/detail/vamana/robust_prune.cuh index fe96dc0360..ee51776cf9 100644 --- a/cpp/src/neighbors/detail/vamana/robust_prune.cuh +++ b/cpp/src/neighbors/detail/vamana/robust_prune.cuh @@ -22,7 +22,7 @@ #include "macros.cuh" #include "vamana_structs.cuh" -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { // Load candidates (from query) and previous edges (from nbh_list) into registers (tmp) spanning // warp @@ -247,4 +247,4 @@ __global__ void RobustPruneKernel( } // namespace -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index 7d961b6ebc..65f683a080 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -46,7 +46,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* @defgroup vamana_build_detail vamana build * @{ @@ -406,4 +406,4 @@ index build( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh index c360ae19a5..27a17205e6 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh @@ -34,7 +34,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /** * Save the index to file. @@ -117,4 +117,4 @@ void serialize(raft::resources const& res, if (!index_of) { RAFT_FAIL("Error writing output %s", file_name.c_str()); } } -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh index 5485d5e94d..fade674dcf 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh @@ -34,7 +34,7 @@ #include -namespace cuvs::neighbors::experimental::vamana::detail { +namespace cuvs::neighbors::vamana::detail { /* @defgroup vamana_structures vamana structures * @{ @@ -476,4 +476,4 @@ __global__ void recompute_reverse_dists( * @} */ -} // namespace cuvs::neighbors::experimental::vamana::detail +} // namespace cuvs::neighbors::vamana::detail diff --git a/cpp/src/neighbors/vamana.cuh b/cpp/src/neighbors/vamana.cuh index 9b9e8d271d..81b63c52f4 100644 --- a/cpp/src/neighbors/vamana.cuh +++ b/cpp/src/neighbors/vamana.cuh @@ -31,7 +31,7 @@ #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { /** * @defgroup VAMANA ANN Graph-based nearest neighbor search @@ -85,7 +85,7 @@ index build( const index_params& params, raft::mdspan, raft::row_major, Accessor> dataset) { - return cuvs::neighbors::experimental::vamana::detail::build( + return cuvs::neighbors::vamana::detail::build( res, params, dataset); } @@ -94,9 +94,9 @@ void serialize(raft::resources const& res, const std::string& file_prefix, const index& index_) { - cuvs::neighbors::experimental::vamana::detail::build(res, file_prefix, index_); + cuvs::neighbors::vamana::detail::build(res, file_prefix, index_); } /** @} */ // end group vamana -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_float.cu b/cpp/src/neighbors/vamana_build_float.cu index b83af61220..33a5381ff5 100644 --- a/cpp/src/neighbors/vamana_build_float.cu +++ b/cpp/src/neighbors/vamana_build_float.cu @@ -17,27 +17,27 @@ #include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { #define RAFT_INST_VAMANA_BUILD(T, IdxT) \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } \ \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(float, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_int8.cu b/cpp/src/neighbors/vamana_build_int8.cu index 91d2cf0280..a942bfbe22 100644 --- a/cpp/src/neighbors/vamana_build_int8.cu +++ b/cpp/src/neighbors/vamana_build_int8.cu @@ -17,27 +17,27 @@ #include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { #define RAFT_INST_VAMANA_BUILD(T, IdxT) \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } \ \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(int8_t, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_build_uint8.cu b/cpp/src/neighbors/vamana_build_uint8.cu index bba93e7f45..e1f8f1d219 100644 --- a/cpp/src/neighbors/vamana_build_uint8.cu +++ b/cpp/src/neighbors/vamana_build_uint8.cu @@ -17,27 +17,27 @@ #include "vamana.cuh" #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { #define RAFT_INST_VAMANA_BUILD(T, IdxT) \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::device_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } \ \ auto build(raft::resources const& handle, \ - const cuvs::neighbors::experimental::vamana::index_params& params, \ + const cuvs::neighbors::vamana::index_params& params, \ raft::host_matrix_view dataset) \ - ->cuvs::neighbors::experimental::vamana::index \ + ->cuvs::neighbors::vamana::index \ { \ - return cuvs::neighbors::experimental::vamana::build(handle, params, dataset); \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(uint8_t, uint32_t); #undef RAFT_INST_VAMANA_BUILD -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize.cuh b/cpp/src/neighbors/vamana_serialize.cuh index a49d267b3a..a800d497a0 100644 --- a/cpp/src/neighbors/vamana_serialize.cuh +++ b/cpp/src/neighbors/vamana_serialize.cuh @@ -18,7 +18,7 @@ #include "detail/vamana/vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { /** * @defgroup VAMANA graph serialize/derserialize @@ -28,12 +28,12 @@ namespace cuvs::neighbors::experimental::vamana { #define CUVS_INST_VAMANA_SERIALIZE(DTYPE) \ void serialize(raft::resources const& handle, \ const std::string& file_prefix, \ - const cuvs::neighbors::experimental::vamana::index& index_) \ + const cuvs::neighbors::vamana::index& index_) \ { \ - cuvs::neighbors::experimental::vamana::detail::serialize( \ + cuvs::neighbors::vamana::detail::serialize( \ handle, file_prefix, index_); \ }; /** @} */ // end group vamana -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_float.cu b/cpp/src/neighbors/vamana_serialize_float.cu index f253693687..8bf7ceb1e1 100644 --- a/cpp/src/neighbors/vamana_serialize_float.cu +++ b/cpp/src/neighbors/vamana_serialize_float.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(float); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_int8.cu b/cpp/src/neighbors/vamana_serialize_int8.cu index 1cd54b1983..0f87f67ce1 100644 --- a/cpp/src/neighbors/vamana_serialize_int8.cu +++ b/cpp/src/neighbors/vamana_serialize_int8.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(int8_t); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/src/neighbors/vamana_serialize_uint8.cu b/cpp/src/neighbors/vamana_serialize_uint8.cu index 3e6d945b80..871c305069 100644 --- a/cpp/src/neighbors/vamana_serialize_uint8.cu +++ b/cpp/src/neighbors/vamana_serialize_uint8.cu @@ -16,8 +16,8 @@ #include "vamana_serialize.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { CUVS_INST_VAMANA_SERIALIZE(uint8_t); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/test/neighbors/ann_vamana.cuh b/cpp/test/neighbors/ann_vamana.cuh index 8572571244..3d4f88639f 100644 --- a/cpp/test/neighbors/ann_vamana.cuh +++ b/cpp/test/neighbors/ann_vamana.cuh @@ -46,7 +46,7 @@ #include #include -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { struct edge_op { template @@ -335,4 +335,4 @@ inline std::vector generate_inputs() const std::vector inputs = generate_inputs(); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/test/neighbors/ann_vamana/test_float_uint32_t.cu b/cpp/test/neighbors/ann_vamana/test_float_uint32_t.cu index 9aa9da1b8c..7b89b65448 100644 --- a/cpp/test/neighbors/ann_vamana/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_vamana/test_float_uint32_t.cu @@ -18,11 +18,11 @@ #include "../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestF_U32; TEST_P(AnnVamanaTestF_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestF_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/test/neighbors/ann_vamana/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_vamana/test_int8_t_uint32_t.cu index 0a6b563b2c..843d2274a8 100644 --- a/cpp/test/neighbors/ann_vamana/test_int8_t_uint32_t.cu +++ b/cpp/test/neighbors/ann_vamana/test_int8_t_uint32_t.cu @@ -18,11 +18,11 @@ #include "../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestI8_U32; TEST_P(AnnVamanaTestI8_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestI8_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/cpp/test/neighbors/ann_vamana/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_vamana/test_uint8_t_uint32_t.cu index c0680dc188..f08db0c49f 100644 --- a/cpp/test/neighbors/ann_vamana/test_uint8_t_uint32_t.cu +++ b/cpp/test/neighbors/ann_vamana/test_uint8_t_uint32_t.cu @@ -18,11 +18,11 @@ #include "../ann_vamana.cuh" -namespace cuvs::neighbors::experimental::vamana { +namespace cuvs::neighbors::vamana { typedef AnnVamanaTest AnnVamanaTestU8_U32; TEST_P(AnnVamanaTestU8_U32, AnnVamana) { this->testVamana(); } INSTANTIATE_TEST_CASE_P(AnnVamanaTest, AnnVamanaTestU8_U32, ::testing::ValuesIn(inputs)); -} // namespace cuvs::neighbors::experimental::vamana +} // namespace cuvs::neighbors::vamana diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index ab810ab531..95359558f5 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -18,3 +18,4 @@ Nearest Neighbors neighbors_nn_descent.rst neighbors_refine.rst neighbors_mg.rst + neighbors_vamana.rst From 734d5ef1c28350f54ed40b68f41f91178f8cc1cb Mon Sep 17 00:00:00 2001 From: bkarsin Date: Tue, 7 Jan 2025 08:40:19 +0000 Subject: [PATCH 03/13] Update docs and remove cleanup other files --- cpp/include/cuvs/neighbors/vamana.hpp | 230 ++++++++++++++++++++++- docs/source/cpp_api/neighbors_vamana.rst | 44 +++++ docs/source/indexes/vamana.rst | 76 ++++++++ examples/cpp/src/vamana_example.cu | 2 +- 4 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 docs/source/cpp_api/neighbors_vamana.rst create mode 100644 docs/source/indexes/vamana.rst diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index 4b4a8a8d3e..fca1b8ce44 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -215,34 +215,198 @@ struct index : cuvs::neighbors::index { * @{ */ /** - * @brief Build the index from the dataset for efficient search. + * @brief Build the index from the dataset for efficient DiskANN search. * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) -> cuvs::neighbors::vamana::index; +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) -> cuvs::neighbors::vamana::index; +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (device) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; +/** + * @brief Build the index from the dataset for efficient DiskANN search. + * + * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves + * performing a greedy search for each vector to be inserted, and inserting it with edges to + * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied + * to improve graph quality. The index_params struct controls the degree of the final graph. + * + * The following distance metrics are supported: + * - L2 + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) + * vamana::serialize(res, filename, index); + * + * @param[in] res + * @param[in] params parameters for building the index + * @param[in] dataset a matrix view (host) to a row-major matrix [n_rows, dim] + * + * @return the constructed vamana index + */ auto build(raft::resources const& handle, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) @@ -252,18 +416,82 @@ auto build(raft::resources const& handle, * @defgroup vamana_cpp_serialize Vamana serialize functions * @{ */ + /** * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * */ void serialize(raft::resources const& handle, const std::string& file_prefix, const cuvs::neighbors::vamana::index& index); +/** + * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * + */ void serialize(raft::resources const& handle, const std::string& file_prefix, const cuvs::neighbors::vamana::index& index); +/** + * Save the index to file. + * + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] file_prefix prefix of path and name of index files + * @param[in] index Vamana index + * + */ void serialize(raft::resources const& handle, const std::string& file_prefix, const cuvs::neighbors::vamana::index& index); diff --git a/docs/source/cpp_api/neighbors_vamana.rst b/docs/source/cpp_api/neighbors_vamana.rst new file mode 100644 index 0000000000..08377c72a0 --- /dev/null +++ b/docs/source/cpp_api/neighbors_vamana.rst @@ -0,0 +1,44 @@ +Vamana +===== + +Vamana is the graph construction algorithm behind the well-known DiskANN vector search solution. The cuVS implementation of Vamana/DiskANN is a custom GPU-acceleration version of the algorithm that aims to reduce index construction time using NVIDIA GPUs. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::vamana* + +Index build parameters +---------------------- + +.. doxygengroup:: vamana_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. doxygengroup:: vamana_cpp_index + :project: cuvs + :members: + :content-only: + +Index build +----------- + +.. doxygengroup:: vamana_cpp_index_build + :project: cuvs + :members: + :content-only: + +Index serialize +--------------- + +.. doxygengroup:: vamana_cpp_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/indexes/vamana.rst b/docs/source/indexes/vamana.rst new file mode 100644 index 0000000000..8a14cf0f6b --- /dev/null +++ b/docs/source/indexes/vamana.rst @@ -0,0 +1,76 @@ +CAGRA +===== + +VAMANA is the underlying graph construction algorithm used to construct indexes for the DiskANN vector search solution. DiskANN and the Vamana algortihm are described in detail in the `published paper `, and a highly optimized `open-source repository ` includes many features for index construction and search. In cuVS, we provide a version of the Vamana algorithm optimized for GPU architectures to accelreate graph construction to build DiskANN idnexes. At a high level, the Vamana algorithm operates as follows: + +* 1. Starting with an empty graph, select a medoid vector from the D-dimension vector dataset and insert it into the graph. +* 2. Iteratively insert batches of dataset vectors into the graph, connecting each inserted vector to neighbors based on a graph traversal. +* 3. For each batch, create reverse edges and prune unnecessary edges. + +There are many algorithmic details that are outlined in the `paper `, and many GPU-specific optimizations are included in this implementation. + +The current implementation of DiskANN in cuVS only includes the 'in-memory' graph construction and a serialization step that writes the index to a file. This index file can be then used by the `open-source DiskANN ` library to perform efficient search. Additional DiskANN functionality, including GPU-accelerated search and 'ssd' index build are planned for future cuVS releases. + +[ :doc:`C++ API <../cpp_api/neighbors_vamana>` | :doc:`Python API <../python_api/neighbors_vamana>` ] + +Interoperability with CPU DiskANN +-------------------------- + +The 'vamana::serialize' API calls writes the index to a file with a format that is compatable with the `open-source DiskANN repositoriy `. This allows cuVS to be used to accelerate index construction while leveraging the efficient CPU-based search currently available. + +Configuration parameters +------------------------ + +Build parameters +~~~~~~~~~~~~~~~~ + +.. list-table:: + :widths: 25 25 50 + :header-rows: 1 + + * - Name + - Default + - Description + * - graph_degree + - 32 + - The maximum degre of the final Vamana graph. The internal representation of the graph includes this many edges for every node, but serialize will compress the graph into a 'CSR' format with, potentially, fewer edges. + * - visited_size + - 64 + - Maximum number of visited nodes saved during each traversal to insert a new node. This corresponds to the 'L' parameter in the paper. + * - vamana_iters + - 1 + - Number of iterations ran to improve the graph. Each iteration involves inserting every vector in the dataset. + * - alpha + - 1.2 + - Alpha parameter that defines how aggressively to prune edges. + * - max_fraction + - 0.06 + - Maximum fraction of the dataset that will be inserted as a single batch. Larger max batch size decreases graph quality but improves speed. + * - batch_base + - 2 + - Base of growth rate of batch sizes. Insertion batch sizes increase exponentially based on this parameter until max_fraction is reached. + * - queue_size + - 127 + - Size of the candidate queue structure used during graph traversal. Must be (2^x)-1 for some x, and must be > visited_size. + +Tuning Considerations +--------------------- + +The 2 hyper-parameters that are most often tuned are `graph_degree` and `visited_size`. The time needed to create a graph increases dramatically when increasing `graph_degree`, in particular. However, larger graphs may be needed to achieve very high recall search, especially for large datasets. + +Memory footprint +---------------- + +Vamana builds a graph that is stored in device memory. However, in order to serialize the index and write it to a file for later use, it must be moved into host memory. If the `include_dataset` parameter is also set, then the dataset must be resident in host memory when calling serialize as well. + +Device memory usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The built index represents the graph as fixed degree, storing a total of :math:`graph_degree * n_index_vectors` edges. Graph construction also requires the dataset be in device memory (or it copies it to device during build). In addition, device memory is used during construction to sort and create the reverse edges. Thus, the amount of device memory needed depends on the dataset itself, but it is bounded by a maximum sum of: + +- vector dataset: :math:`n_index_vectors * n__dims * sizeof(T)` +- output graph: :math:`graph_degree * n_index_vectors * sizeof(IdxT)` +- scratch memory: :math:`n_index_vectors * max_fraction * (2 + graph_degree) * sizeof(IdxT)` + +Reduction in scratch device memory requirements are planned for upcoming releases of cuVS. + diff --git a/examples/cpp/src/vamana_example.cu b/examples/cpp/src/vamana_example.cu index 60bf14d56a..e85ae0e5b1 100644 --- a/examples/cpp/src/vamana_example.cu +++ b/examples/cpp/src/vamana_example.cu @@ -33,7 +33,7 @@ void vamana_build_and_write(raft::device_resources const &dev_resources, raft::device_matrix_view dataset, std::string out_fname, int degree, int visited_size, float max_fraction, int iters) { - using namespace cuvs::neighbors::experimental; + using namespace cuvs::neighbors; // use default index parameters vamana::index_params index_params; From 2b41a0e05eb397700e4e8ebba039084973a713fa Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 15 Jan 2025 11:30:12 +0000 Subject: [PATCH 04/13] Reduce memory footprint by batching reverse edge computation --- cpp/include/cuvs/neighbors/vamana.hpp | 2 + .../neighbors/detail/vamana/vamana_build.cuh | 190 ++++++++++-------- .../detail/vamana/vamana_structs.cuh | 16 +- cpp/test/neighbors/ann_vamana.cuh | 39 ++-- 4 files changed, 138 insertions(+), 109 deletions(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index fca1b8ce44..53d59f4362 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -59,6 +59,8 @@ struct index_params : cuvs::neighbors::index_params { float batch_base = 2; /** Size of candidate queue structure - should be (2^x)-1 */ uint32_t queue_size = 127; + /** Max batchsize of reverse edge processing (reduces memory footprint) */ + uint32_t reverse_batchsize = 1000000; }; /** diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index 65f683a080..9f68d31413 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -104,11 +104,12 @@ void batched_insert_vamana( "to 1.0"); max_batchsize = (int)dataset.extent(0); } - int insert_iters = (int)(params.vamana_iters); - double base = (double)(params.batch_base); - float alpha = (float)(params.alpha); - int visited_size = params.visited_size; - int queue_size = params.queue_size; + int insert_iters = (int)(params.vamana_iters); + double base = (double)(params.batch_base); + float alpha = (float)(params.alpha); + int visited_size = params.visited_size; + int queue_size = params.queue_size; + int reverse_batch = params.reverse_batchsize; if ((visited_size & (visited_size - 1)) != 0) { RAFT_LOG_WARN("visited_size must be a power of 2, rounding up."); @@ -152,39 +153,20 @@ void batched_insert_vamana( std::vector insert_order; create_insert_permutation(insert_order, (uint32_t)N); - // Memory needed to sort reverse edges - potentially large memory footprint - auto edge_dest = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(max_batchsize, degree)); - auto edge_src = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(max_batchsize, degree)); - - size_t temp_storage_bytes = max_batchsize * degree * (2 * sizeof(IdxT)); - RAFT_LOG_DEBUG("Temp storage needed for sorting (bytes): %lu", temp_storage_bytes); - auto temp_sort_storage = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(2 * max_batchsize, degree)); - // Calculate the shared memory sizes of each kernel int search_smem_sort_size = 0; int prune_smem_sort_size = 0; SELECT_SMEM_SIZES(degree, visited_size); // Sets above 2 variables to appropriate sizes // Total dynamic shared memory used by GreedySearch - int align_padding = ((((dim-1)/16)+1)*16) - dim; - int search_smem_total_size = - static_cast(search_smem_sort_size + (dim+align_padding) * sizeof(T) + - visited_size * sizeof(Node) + - degree * sizeof(int) + queue_size * sizeof(DistPair)); + int align_padding = ((((dim - 1) / 16) + 1) * 16) - dim; + int search_smem_total_size = static_cast( + search_smem_sort_size + (dim + align_padding) * sizeof(T) + visited_size * sizeof(Node) + + degree * sizeof(int) + queue_size * sizeof(DistPair)); // Total dynamic shared memory size needed by both RobustPrune calls - int prune_smem_total_size = - prune_smem_sort_size + (dim+align_padding) * sizeof(T) - + (degree + visited_size) * sizeof(DistPair); + int prune_smem_total_size = prune_smem_sort_size + (dim + align_padding) * sizeof(T) + + (degree + visited_size) * sizeof(DistPair); RAFT_LOG_DEBUG("Dynamic shared memory usage (bytes): GreedySearch: %d, RobustPrune: %d", search_smem_total_size, @@ -255,6 +237,15 @@ void batched_insert_vamana( int total_edges; raft::copy(&total_edges, d_total_edges.data_handle(), 1, stream); + auto edge_dest = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(total_edges)); + auto edge_src = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(total_edges)); + // Create reverse edge list create_reverse_edge_list <<>>(query_list_ptr.data_handle(), @@ -263,6 +254,24 @@ void batched_insert_vamana( edge_src.data_handle(), edge_dest.data_handle()); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + + cub::DeviceMergeSort::SortPairs(d_temp_storage, + temp_storage_bytes, + edge_dest.data_handle(), + edge_src.data_handle(), + total_edges, + CmpEdge(), + stream); + + RAFT_LOG_DEBUG("Temp storage needed for sorting (bytes): %lu", temp_storage_bytes); + + auto temp_sort_storage = raft::make_device_mdarray( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(temp_storage_bytes / sizeof(IdxT))); + // Sort to group reverse edges by destination cub::DeviceMergeSort::SortPairs(temp_sort_storage.data_handle(), temp_storage_bytes, @@ -285,61 +294,72 @@ void batched_insert_vamana( thrust::unique_by_key( edge_dest_vec.begin(), edge_dest_vec.end(), unique_indices.data_handle()); - // Allocate reverse QueryCandidate list based on number of unique destinations - // TODO - Do this in batches to reduce memory footprint / support larger datasets - auto reverse_list_ptr = raft::make_device_mdarray>( - res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests)); - auto rev_ids = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests, visited_size)); - auto rev_dists = - raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), - raft::make_extents(unique_dests, visited_size)); - - QueryCandidates* reverse_list = - static_cast*>(reverse_list_ptr.data_handle()); - - init_query_candidate_list<<<256, blockD, 0, stream>>>(reverse_list, - rev_ids.data_handle(), - rev_dists.data_handle(), - (int)unique_dests, - visited_size); - - // May need more blocks for reverse list - num_blocks = min(maxBlocks, unique_dests); - - // Populate reverse list ids and candidate lists from edge_src and edge_dest - populate_reverse_list_struct - <<>>(reverse_list, - edge_src.data_handle(), - edge_dest.data_handle(), - unique_indices.data_handle(), - unique_dests, - total_edges, - dataset.extent(0)); - - // Recompute distances (avoided keeping it during sorting) - recompute_reverse_dists - <<>>(reverse_list, dataset, unique_dests, metric); - - // Call 2nd RobustPrune on reverse query_list - RobustPruneKernel - <<>>(d_graph.view(), - raft::make_const_mdspan(dataset), - reverse_list_ptr.data_handle(), - unique_dests, - visited_size, - metric, - alpha, - prune_smem_sort_size); - - // Write new edge lists to graph - write_graph_edges_kernel<<>>( - d_graph.view(), reverse_list_ptr.data_handle(), degree, unique_dests); + edge_dest_vec.clear(); + edge_dest_vec.shrink_to_fit(); + + // Batch execution of reverse edge creation/application + for (int rev_start = 0; rev_start < (int)unique_dests; rev_start += reverse_batch) { + if (rev_start + reverse_batch > (int)unique_dests) { + reverse_batch = (int)unique_dests - rev_start; + } + + // Allocate reverse QueryCandidate list based on number of unique destinations + auto reverse_list_ptr = raft::make_device_mdarray>( + res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch)); + auto rev_ids = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch, visited_size)); + auto rev_dists = + raft::make_device_mdarray(res, + raft::resource::get_large_workspace_resource(res), + raft::make_extents(reverse_batch, visited_size)); + + QueryCandidates* reverse_list = + static_cast*>(reverse_list_ptr.data_handle()); + + init_query_candidate_list<<<256, blockD, 0, stream>>>(reverse_list, + rev_ids.data_handle(), + rev_dists.data_handle(), + (int)reverse_batch, + visited_size); + + // May need more blocks for reverse list + num_blocks = min(maxBlocks, reverse_batch); + + // Populate reverse list ids and candidate lists from edge_src and edge_dest + populate_reverse_list_struct + <<>>(reverse_list, + edge_src.data_handle(), + edge_dest.data_handle(), + unique_indices.data_handle(), + unique_dests, + total_edges, + dataset.extent(0), + rev_start, + reverse_batch); + + // Recompute distances (avoided keeping it during sorting) + recompute_reverse_dists + <<>>(reverse_list, dataset, reverse_batch, metric); + + // Call 2nd RobustPrune on reverse query_list + RobustPruneKernel + <<>>(d_graph.view(), + raft::make_const_mdspan(dataset), + reverse_list_ptr.data_handle(), + reverse_batch, + visited_size, + metric, + alpha, + prune_smem_sort_size); + + // Write new edge lists to graph + write_graph_edges_kernel<<>>( + d_graph.view(), reverse_list_ptr.data_handle(), degree, reverse_batch); + } start += step_size; step_size *= base; diff --git a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh index fade674dcf..22678c1962 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh @@ -420,22 +420,24 @@ __global__ void populate_reverse_list_struct(QueryCandidates* revers int* unique_indices, int unique_dests, int total_edges, - int N) + int N, + int rev_start, + int reverse_batch) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < unique_dests; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < reverse_batch; i += blockDim.x * gridDim.x) { - reverse_list[i].queryId = edge_dest[unique_indices[i]]; - if (i == unique_dests - 1) { - reverse_list[i].size = total_edges - unique_indices[i]; + reverse_list[i].queryId = edge_dest[unique_indices[i + rev_start]]; + if (rev_start + i == unique_dests - 1) { + reverse_list[i].size = total_edges - unique_indices[i + rev_start]; } else { - reverse_list[i].size = unique_indices[i + 1] - unique_indices[i]; + reverse_list[i].size = unique_indices[i + rev_start + 1] - unique_indices[i + rev_start]; } if (reverse_list[i].size > reverse_list[i].maxSize) { reverse_list[i].size = reverse_list[i].maxSize; } for (int j = 0; j < reverse_list[i].size; j++) { - reverse_list[i].ids[j] = edge_src[unique_indices[i] + j]; + reverse_list[i].ids[j] = edge_src[unique_indices[i + rev_start] + j]; } for (int j = reverse_list[i].size; j < reverse_list[i].maxSize; j++) { reverse_list[i].ids[j] = raft::upper_bound(); diff --git a/cpp/test/neighbors/ann_vamana.cuh b/cpp/test/neighbors/ann_vamana.cuh index 3d4f88639f..9e941b51af 100644 --- a/cpp/test/neighbors/ann_vamana.cuh +++ b/cpp/test/neighbors/ann_vamana.cuh @@ -64,6 +64,7 @@ struct AnnVamanaInputs { double max_fraction; cuvs::distance::DistanceType metric; bool host_dataset; + int reverse_batchsize; // cagra search params int n_queries; @@ -131,10 +132,11 @@ class AnnVamanaTest : public ::testing::TestWithParam { void testVamana() { vamana::index_params index_params; - index_params.metric = ps.metric; - index_params.graph_degree = ps.graph_degree; - index_params.visited_size = ps.visited_size; - index_params.max_fraction = ps.max_fraction; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.visited_size = ps.visited_size; + index_params.max_fraction = ps.max_fraction; + index_params.reverse_batchsize = ps.reverse_batchsize; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -264,12 +266,13 @@ inline std::vector generate_inputs() { std::vector inputs = raft::util::itertools::product( {1000}, - {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, - {32}, // graph degree - {64, 128, 256}, // visited_size + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + {32}, // graph degree + {64, 128, 256}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, + {100, 1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -278,14 +281,15 @@ inline std::vector generate_inputs() {1}, {0.2}); - std::vector inputs2 = - raft::util::itertools::product({1000}, - {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + std::vector inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {64}, // graph degree {128, 256, 512}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, + {100, 1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -295,14 +299,15 @@ inline std::vector generate_inputs() {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - inputs2 = - raft::util::itertools::product({1000}, - {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {128}, // graph degree {256, 512}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, + {100, 1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -312,14 +317,15 @@ inline std::vector generate_inputs() {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - inputs2 = - raft::util::itertools::product({1000}, - {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, + inputs2 = raft::util::itertools::product( + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {256}, // graph degree {512, 1024}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, + {100, 1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -328,7 +334,6 @@ inline std::vector generate_inputs() {1}, {0.2}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - return inputs; } From 61819c536b4f7223ccb4de502c52401e1e1cd374 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 15 Jan 2025 11:33:50 +0000 Subject: [PATCH 05/13] clang-format --- .../neighbors/detail/vamana/greedy_search.cuh | 7 +-- .../neighbors/detail/vamana/robust_prune.cuh | 8 +-- cpp/src/neighbors/vamana.cuh | 3 +- cpp/src/neighbors/vamana_build_float.cu | 30 +++++----- cpp/src/neighbors/vamana_build_int8.cu | 30 +++++----- cpp/src/neighbors/vamana_build_uint8.cu | 30 +++++----- cpp/src/neighbors/vamana_serialize.cuh | 5 +- examples/cpp/src/vamana_example.cu | 58 +++++++++++-------- 8 files changed, 88 insertions(+), 83 deletions(-) diff --git a/cpp/src/neighbors/detail/vamana/greedy_search.cuh b/cpp/src/neighbors/detail/vamana/greedy_search.cuh index 6a79383213..4d94bbaa77 100644 --- a/cpp/src/neighbors/detail/vamana/greedy_search.cuh +++ b/cpp/src/neighbors/detail/vamana/greedy_search.cuh @@ -103,7 +103,6 @@ __global__ void GreedySearchKernel( static __shared__ Point s_query; - union ShmemLayout { // All blocksort sizes have same alignment (16) typename cub::BlockMergeSort, 32, 1>::TempStorage sort_mem; @@ -113,7 +112,7 @@ __global__ void GreedySearchKernel( DistPair candidate_queue; }; - int align_padding = (((dim-1)/alignof(ShmemLayout))+1)*alignof(ShmemLayout) - dim; + int align_padding = (((dim - 1) / alignof(ShmemLayout)) + 1) * alignof(ShmemLayout) - dim; // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; @@ -121,7 +120,7 @@ __global__ void GreedySearchKernel( size_t smem_offset = sort_smem_size; // temp sorting memory takes first chunk T* s_coords = reinterpret_cast(&smem[smem_offset]); - smem_offset += (dim+align_padding) * sizeof(T); + smem_offset += (dim + align_padding) * sizeof(T); Node* topk_pq = reinterpret_cast*>(&smem[smem_offset]); smem_offset += topk * sizeof(Node); @@ -173,7 +172,7 @@ __global__ void GreedySearchKernel( if (threadIdx.x == 0) { heap_queue.insert_back(medoid_dist, medoid_id); } __syncthreads(); - + while (cand_q_size != 0) { __syncthreads(); diff --git a/cpp/src/neighbors/detail/vamana/robust_prune.cuh b/cpp/src/neighbors/detail/vamana/robust_prune.cuh index ee51776cf9..8fbbb974f3 100644 --- a/cpp/src/neighbors/detail/vamana/robust_prune.cuh +++ b/cpp/src/neighbors/detail/vamana/robust_prune.cuh @@ -145,11 +145,11 @@ __global__ void RobustPruneKernel( // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; - int align_padding = (((dim-1)/alignof(ShmemLayout))+1)*alignof(ShmemLayout) - dim; + int align_padding = (((dim - 1) / alignof(ShmemLayout)) + 1) * alignof(ShmemLayout) - dim; - T* s_coords = reinterpret_cast(&smem[sort_smem_size]); - DistPair* new_nbh_list = - reinterpret_cast*>(&smem[(dim+align_padding) * sizeof(T) + sort_smem_size]); + T* s_coords = reinterpret_cast(&smem[sort_smem_size]); + DistPair* new_nbh_list = reinterpret_cast*>( + &smem[(dim + align_padding) * sizeof(T) + sort_smem_size]); static __shared__ Point s_query; s_query.coords = s_coords; diff --git a/cpp/src/neighbors/vamana.cuh b/cpp/src/neighbors/vamana.cuh index 81b63c52f4..964d7a9a09 100644 --- a/cpp/src/neighbors/vamana.cuh +++ b/cpp/src/neighbors/vamana.cuh @@ -85,8 +85,7 @@ index build( const index_params& params, raft::mdspan, raft::row_major, Accessor> dataset) { - return cuvs::neighbors::vamana::detail::build( - res, params, dataset); + return cuvs::neighbors::vamana::detail::build(res, params, dataset); } template diff --git a/cpp/src/neighbors/vamana_build_float.cu b/cpp/src/neighbors/vamana_build_float.cu index 33a5381ff5..0e09d63994 100644 --- a/cpp/src/neighbors/vamana_build_float.cu +++ b/cpp/src/neighbors/vamana_build_float.cu @@ -19,21 +19,21 @@ namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/vamana_build_int8.cu b/cpp/src/neighbors/vamana_build_int8.cu index a942bfbe22..f70b9ea276 100644 --- a/cpp/src/neighbors/vamana_build_int8.cu +++ b/cpp/src/neighbors/vamana_build_int8.cu @@ -19,21 +19,21 @@ namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/vamana_build_uint8.cu b/cpp/src/neighbors/vamana_build_uint8.cu index e1f8f1d219..8daf0c065c 100644 --- a/cpp/src/neighbors/vamana_build_uint8.cu +++ b/cpp/src/neighbors/vamana_build_uint8.cu @@ -19,21 +19,21 @@ namespace cuvs::neighbors::vamana { -#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::vamana::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::vamana::index \ - { \ - return cuvs::neighbors::vamana::build(handle, params, dataset); \ +#define RAFT_INST_VAMANA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::vamana::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::vamana::index \ + { \ + return cuvs::neighbors::vamana::build(handle, params, dataset); \ } RAFT_INST_VAMANA_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/neighbors/vamana_serialize.cuh b/cpp/src/neighbors/vamana_serialize.cuh index a800d497a0..b8cb580a82 100644 --- a/cpp/src/neighbors/vamana_serialize.cuh +++ b/cpp/src/neighbors/vamana_serialize.cuh @@ -28,10 +28,9 @@ namespace cuvs::neighbors::vamana { #define CUVS_INST_VAMANA_SERIALIZE(DTYPE) \ void serialize(raft::resources const& handle, \ const std::string& file_prefix, \ - const cuvs::neighbors::vamana::index& index_) \ + const cuvs::neighbors::vamana::index& index_) \ { \ - cuvs::neighbors::vamana::detail::serialize( \ - handle, file_prefix, index_); \ + cuvs::neighbors::vamana::detail::serialize(handle, file_prefix, index_); \ }; /** @} */ // end group vamana diff --git a/examples/cpp/src/vamana_example.cu b/examples/cpp/src/vamana_example.cu index e85ae0e5b1..9e5201d31c 100644 --- a/examples/cpp/src/vamana_example.cu +++ b/examples/cpp/src/vamana_example.cu @@ -29,10 +29,14 @@ #include "common.cuh" template -void vamana_build_and_write(raft::device_resources const &dev_resources, +void vamana_build_and_write(raft::device_resources const& dev_resources, raft::device_matrix_view dataset, - std::string out_fname, int degree, int visited_size, - float max_fraction, int iters) { + std::string out_fname, + int degree, + int visited_size, + float max_fraction, + int iters) +{ using namespace cuvs::neighbors; // use default index parameters @@ -46,13 +50,12 @@ void vamana_build_and_write(raft::device_resources const &dev_resources, auto start = std::chrono::system_clock::now(); auto index = vamana::build(dev_resources, index_params, dataset); - auto end = std::chrono::system_clock::now(); + auto end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end - start; std::cout << "Vamana index has " << index.size() << " vectors" << std::endl; - std::cout << "Vamana graph has degree " << index.graph_degree() - << ", graph size [" << index.graph().extent(0) << ", " - << index.graph().extent(1) << "]" << std::endl; + std::cout << "Vamana graph has degree " << index.graph_degree() << ", graph size [" + << index.graph().extent(0) << ", " << index.graph().extent(1) << "]" << std::endl; std::cout << "Time to build index: " << elapsed_seconds.count() << "s\n"; @@ -60,9 +63,11 @@ void vamana_build_and_write(raft::device_resources const &dev_resources, serialize(dev_resources, out_fname, index); } -void usage() { - printf("Usage: ./vamana_example \n"); +void usage() +{ + printf( + "Usage: ./vamana_example \n"); printf("Input file expected to be binary file of fp32 vectors.\n"); printf("Graph degree sizes supported: 32, 64, 128, 256\n"); printf("Visited_size must be > degree and a power of 2.\n"); @@ -71,13 +76,14 @@ void usage() { exit(1); } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) +{ raft::device_resources dev_resources; // Set pool memory resource with 1 GiB initial pool size. All allocations use // the same pool. rmm::mr::pool_memory_resource pool_mr( - rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull); + rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull); rmm::mr::set_current_device_resource(&pool_mr); // Alternatively, one could define a pool allocator for temporary arrays (used @@ -87,22 +93,24 @@ int main(int argc, char *argv[]) { // limit. raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * // 1024 * 1024 * 1024ull); - if (argc != 7) - usage(); + if (argc != 7) usage(); - std::string data_fname = (std::string)(argv[1]); // Input filename - std::string out_fname = (std::string)argv[2]; // Output index filename - int degree = atoi(argv[3]); - int max_visited = atoi(argv[4]); - float max_fraction = atof(argv[5]); - int iters = atoi(argv[6]); + std::string data_fname = (std::string)(argv[1]); // Input filename + std::string out_fname = (std::string)argv[2]; // Output index filename + int degree = atoi(argv[3]); + int max_visited = atoi(argv[4]); + float max_fraction = atof(argv[5]); + int iters = atoi(argv[6]); // Read in binary dataset file - auto dataset = - read_bin_dataset(dev_resources, data_fname, INT_MAX); + auto dataset = read_bin_dataset(dev_resources, data_fname, INT_MAX); // Simple build example to create graph and write to a file - vamana_build_and_write( - dev_resources, raft::make_const_mdspan(dataset.view()), out_fname, degree, - max_visited, max_fraction, iters); + vamana_build_and_write(dev_resources, + raft::make_const_mdspan(dataset.view()), + out_fname, + degree, + max_visited, + max_fraction, + iters); } From e27fea27c7a10f7282150432251a27be9e310ecf Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 15 Jan 2025 20:24:22 +0000 Subject: [PATCH 06/13] codespell --- cpp/include/cuvs/neighbors/vamana.hpp | 20 ++++++++++---------- docs/source/indexes/vamana.rst | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index 53d59f4362..6123b47261 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -55,7 +55,7 @@ struct index_params : cuvs::neighbors::index_params { /** Maximum fraction of dataset inserted per batch. * * Larger max batch decreases graph quality, but improves speed */ float max_fraction = 0.06; - /** Base of growth rate of batch sies **/ + /** Base of growth rate of batch sizes **/ float batch_base = 2; /** Size of candidate queue structure - should be (2^x)-1 */ uint32_t queue_size = 127; @@ -219,7 +219,7 @@ struct index : cuvs::neighbors::index { /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -252,7 +252,7 @@ auto build(raft::resources const& handle, /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -285,7 +285,7 @@ auto build(raft::resources const& handle, /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -318,7 +318,7 @@ auto build(raft::resources const& handle, /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -351,7 +351,7 @@ auto build(raft::resources const& handle, /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -384,7 +384,7 @@ auto build(raft::resources const& handle, /** * @brief Build the index from the dataset for efficient DiskANN search. * - * The build utilies the Vamana insertion-based algorithm to create the graph. The algorithm + * The build utilities the Vamana insertion-based algorithm to create the graph. The algorithm * starts with an empty graph and iteratively iserts batches of nodes. Each batch involves * performing a greedy search for each vector to be inserted, and inserting it with edges to * all nodes traversed during the search. Reverse edges are also inserted and robustPrune is applied @@ -422,7 +422,7 @@ auto build(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. * * @code{.cpp} * #include @@ -449,7 +449,7 @@ void serialize(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. * * @code{.cpp} * #include @@ -475,7 +475,7 @@ void serialize(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatabilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. * * @code{.cpp} * #include diff --git a/docs/source/indexes/vamana.rst b/docs/source/indexes/vamana.rst index 8a14cf0f6b..a0dd45cdb8 100644 --- a/docs/source/indexes/vamana.rst +++ b/docs/source/indexes/vamana.rst @@ -1,7 +1,7 @@ CAGRA ===== -VAMANA is the underlying graph construction algorithm used to construct indexes for the DiskANN vector search solution. DiskANN and the Vamana algortihm are described in detail in the `published paper `, and a highly optimized `open-source repository ` includes many features for index construction and search. In cuVS, we provide a version of the Vamana algorithm optimized for GPU architectures to accelreate graph construction to build DiskANN idnexes. At a high level, the Vamana algorithm operates as follows: +VAMANA is the underlying graph construction algorithm used to construct indexes for the DiskANN vector search solution. DiskANN and the Vamana algorithm are described in detail in the `published paper `, and a highly optimized `open-source repository ` includes many features for index construction and search. In cuVS, we provide a version of the Vamana algorithm optimized for GPU architectures to accelreate graph construction to build DiskANN idnexes. At a high level, the Vamana algorithm operates as follows: * 1. Starting with an empty graph, select a medoid vector from the D-dimension vector dataset and insert it into the graph. * 2. Iteratively insert batches of dataset vectors into the graph, connecting each inserted vector to neighbors based on a graph traversal. @@ -16,7 +16,7 @@ The current implementation of DiskANN in cuVS only includes the 'in-memory' grap Interoperability with CPU DiskANN -------------------------- -The 'vamana::serialize' API calls writes the index to a file with a format that is compatable with the `open-source DiskANN repositoriy `. This allows cuVS to be used to accelerate index construction while leveraging the efficient CPU-based search currently available. +The 'vamana::serialize' API calls writes the index to a file with a format that is compatible with the `open-source DiskANN repositoriy `. This allows cuVS to be used to accelerate index construction while leveraging the efficient CPU-based search currently available. Configuration parameters ------------------------ From 0b8085c716b91e57df395beb53d5fc6ad510903f Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 15 Jan 2025 20:28:45 +0000 Subject: [PATCH 07/13] Fix spelling mistake --- cpp/include/cuvs/neighbors/vamana.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index 6123b47261..eb0caaa8c0 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -422,7 +422,7 @@ auto build(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} * #include @@ -449,7 +449,7 @@ void serialize(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} * #include @@ -475,7 +475,7 @@ void serialize(raft::resources const& handle, /** * Save the index to file. * - * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibilty. + * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} * #include From 0bc808fc975a85cfe356e62a0e96c4077fb6cab4 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Tue, 21 Jan 2025 23:45:47 +0000 Subject: [PATCH 08/13] Fixed batched reverse list bug and removed excessive tests --- .../neighbors/detail/vamana/vamana_build.cuh | 2 ++ cpp/test/neighbors/ann_vamana.cuh | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index 9f68d31413..f67d4e9c76 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -298,6 +298,7 @@ void batched_insert_vamana( edge_dest_vec.shrink_to_fit(); // Batch execution of reverse edge creation/application + reverse_batch = params.reverse_batchsize; for (int rev_start = 0; rev_start < (int)unique_dests; rev_start += reverse_batch) { if (rev_start + reverse_batch > (int)unique_dests) { reverse_batch = (int)unique_dests - rev_start; @@ -359,6 +360,7 @@ void batched_insert_vamana( // Write new edge lists to graph write_graph_edges_kernel<<>>( d_graph.view(), reverse_list_ptr.data_handle(), degree, reverse_batch); + } start += step_size; diff --git a/cpp/test/neighbors/ann_vamana.cuh b/cpp/test/neighbors/ann_vamana.cuh index 9e941b51af..92e4d96d79 100644 --- a/cpp/test/neighbors/ann_vamana.cuh +++ b/cpp/test/neighbors/ann_vamana.cuh @@ -268,7 +268,7 @@ inline std::vector generate_inputs() {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {32}, // graph degree - {64, 128, 256}, // visited_size + {64, 256}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, {false}, @@ -285,11 +285,11 @@ inline std::vector generate_inputs() {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {64}, // graph degree - {128, 256, 512}, // visited_size - {0.06, 0.1}, + {128, 512}, // visited_size + {0.06}, {cuvs::distance::DistanceType::L2Expanded}, {false}, - {100, 1000000}, + {1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -303,11 +303,11 @@ inline std::vector generate_inputs() {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {128}, // graph degree - {256, 512}, // visited_size - {0.06, 0.1}, + {256}, // visited_size + {0.06}, {cuvs::distance::DistanceType::L2Expanded}, {false}, - {100, 1000000}, + {1000000}, {100}, {10}, {cagra::search_algo::AUTO}, @@ -322,10 +322,10 @@ inline std::vector generate_inputs() {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, {256}, // graph degree {512, 1024}, // visited_size - {0.06, 0.1}, + {0.06}, {cuvs::distance::DistanceType::L2Expanded}, {false}, - {100, 1000000}, + {1000000}, {100}, {10}, {cagra::search_algo::AUTO}, From 04d8b5311c5aee1c3bbc3319f3b47c31a26f344f Mon Sep 17 00:00:00 2001 From: bkarsin Date: Tue, 21 Jan 2025 23:49:42 +0000 Subject: [PATCH 09/13] clang-format --- cpp/src/neighbors/detail/vamana/vamana_build.cuh | 1 - cpp/test/neighbors/ann_vamana.cuh | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index f67d4e9c76..c788dd4515 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -360,7 +360,6 @@ void batched_insert_vamana( // Write new edge lists to graph write_graph_edges_kernel<<>>( d_graph.view(), reverse_list_ptr.data_handle(), degree, reverse_batch); - } start += step_size; diff --git a/cpp/test/neighbors/ann_vamana.cuh b/cpp/test/neighbors/ann_vamana.cuh index 92e4d96d79..9fe0324d78 100644 --- a/cpp/test/neighbors/ann_vamana.cuh +++ b/cpp/test/neighbors/ann_vamana.cuh @@ -267,7 +267,7 @@ inline std::vector generate_inputs() std::vector inputs = raft::util::itertools::product( {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, - {32}, // graph degree + {32}, // graph degree {64, 256}, // visited_size {0.06, 0.1}, {cuvs::distance::DistanceType::L2Expanded}, @@ -284,7 +284,7 @@ inline std::vector generate_inputs() std::vector inputs2 = raft::util::itertools::product( {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, - {64}, // graph degree + {64}, // graph degree {128, 512}, // visited_size {0.06}, {cuvs::distance::DistanceType::L2Expanded}, @@ -302,7 +302,7 @@ inline std::vector generate_inputs() inputs2 = raft::util::itertools::product( {1000}, {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, - {128}, // graph degree + {128}, // graph degree {256}, // visited_size {0.06}, {cuvs::distance::DistanceType::L2Expanded}, From ba226820413b7bd4bc960c23546698ff8647b8d0 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 29 Jan 2025 21:42:27 +0000 Subject: [PATCH 10/13] Address reviewer comments and some doxygen issues --- cpp/include/cuvs/neighbors/vamana.hpp | 147 ++++++++++-------- .../neighbors/detail/vamana/robust_prune.cuh | 4 +- .../neighbors/detail/vamana/vamana_build.cuh | 3 +- 3 files changed, 87 insertions(+), 67 deletions(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index eb0caaa8c0..88d7ec8bfc 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -32,15 +32,26 @@ #include namespace cuvs::neighbors::vamana { + /** * @defgroup vamana_cpp_index_params Vamana index build parameters * @{ */ /** - * @brief ANN parameters used by VAMANA to build index - * + * @brief Parameters used to build DiskANN index + * + * `graph_degree`: Maximum degree of graph; correspods to the R parameter of + * Vamana algorithm in the literature. + * `visited_size`: Maximum number of visited nodes per search during Vamana algorithm. + * Loosely corresponds to the L parameter in the literature. + * `vamana_iters`: The number of times all vectors are inserted into the graph. If > 1, + * all vectors are re-inserted to improve graph quality. + * `max_fraction`: The maximum batch size is this fraction of the total dataset size. Larger + * gives faster build but lower graph quality. + * `alpha`: Used to determine how aggresive the pruning will be. */ + struct index_params : cuvs::neighbors::index_params { /** Maximum degree of output graph corresponds to the R parameter in the original Vamana * literature. */ @@ -231,12 +242,13 @@ struct index : cuvs::neighbors::index { * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -244,7 +256,7 @@ struct index : cuvs::neighbors::index { * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -264,12 +276,13 @@ auto build(raft::resources const& handle, * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -277,7 +290,7 @@ auto build(raft::resources const& handle, * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -297,12 +310,13 @@ auto build(raft::resources const& handle, * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -310,7 +324,7 @@ auto build(raft::resources const& handle, * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -330,12 +344,13 @@ auto build(raft::resources const& handle, * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -343,7 +358,7 @@ auto build(raft::resources const& handle, * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -363,12 +378,13 @@ auto build(raft::resources const& handle, * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -376,7 +392,7 @@ auto build(raft::resources const& handle, * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::device_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -396,12 +412,13 @@ auto build(raft::resources const& handle, * Usage example: * @code{.cpp} * using namespace cuvs::neighbors; - * // use default index parameters; - * vamana::index_params index_params; - * // create and fill index from a [N, D] dataset; - * auto index = vamana::build(res, index_params, dataset); - * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support search) - * vamana::serialize(res, filename, index); + * // use default index parameters; + * vamana::index_params index_params; + * // create and fill index from a [N, D] dataset; + * auto index = vamana::build(res, index_params, dataset); + * // write index to file to be used by CPU-based DiskANN search (cuVS does not yet support + * search) vamana::serialize(res, filename, index); + * @endcode * * @param[in] res * @param[in] params parameters for building the index @@ -409,7 +426,7 @@ auto build(raft::resources const& handle, * * @return the constructed vamana index */ -auto build(raft::resources const& handle, +auto build(raft::resources const& res, const cuvs::neighbors::vamana::index_params& params, raft::host_matrix_view dataset) -> cuvs::neighbors::vamana::index; @@ -425,15 +442,15 @@ auto build(raft::resources const& handle, * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} - * #include - * #include + * #include + * #include * - * raft::resources handle; + * raft::resources handle; * - * // create a string with a filepath - * std::string file_prefix("/path/to/index/prefix"); - * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` - * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); * @endcode * * @param[in] handle the raft handle @@ -452,15 +469,15 @@ void serialize(raft::resources const& handle, * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} - * #include - * #include + * #include + * #include * - * raft::resources handle; + * raft::resources handle; * - * // create a string with a filepath - * std::string file_prefix("/path/to/index/prefix"); - * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` - * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); * @endcode * * @param[in] handle the raft handle @@ -478,15 +495,15 @@ void serialize(raft::resources const& handle, * Matches the file format used by the DiskANN open-source repository, allowing cross-compatibility. * * @code{.cpp} - * #include - * #include + * #include + * #include * - * raft::resources handle; + * raft::resources handle; * - * // create a string with a filepath - * std::string file_prefix("/path/to/index/prefix"); - * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` - * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); + * // create a string with a filepath + * std::string file_prefix("/path/to/index/prefix"); + * // create an index with `auto index = cuvs::neighbors::vamana::build(...);` + * cuvs::neighbors::vamana::serialize(handle, file_prefix, index); * @endcode * * @param[in] handle the raft handle diff --git a/cpp/src/neighbors/detail/vamana/robust_prune.cuh b/cpp/src/neighbors/detail/vamana/robust_prune.cuh index 8fbbb974f3..182d20c887 100644 --- a/cpp/src/neighbors/detail/vamana/robust_prune.cuh +++ b/cpp/src/neighbors/detail/vamana/robust_prune.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include "macros.cuh" #include "vamana_structs.cuh" @@ -145,7 +147,7 @@ __global__ void RobustPruneKernel( // Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; - int align_padding = (((dim - 1) / alignof(ShmemLayout)) + 1) * alignof(ShmemLayout) - dim; + int align_padding = raft::alignTo(dim, alignof(ShmemLayout)) - dim; T* s_coords = reinterpret_cast(&smem[sort_smem_size]); DistPair* new_nbh_list = reinterpret_cast*>( diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index c788dd4515..184b024f86 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -159,7 +159,7 @@ void batched_insert_vamana( SELECT_SMEM_SIZES(degree, visited_size); // Sets above 2 variables to appropriate sizes // Total dynamic shared memory used by GreedySearch - int align_padding = ((((dim - 1) / 16) + 1) * 16) - dim; + int align_padding = raft::alignTo(dim, 16) - dim; int search_smem_total_size = static_cast( search_smem_sort_size + (dim + align_padding) * sizeof(T) + visited_size * sizeof(Node) + degree * sizeof(int) + queue_size * sizeof(DistPair)); @@ -236,6 +236,7 @@ void batched_insert_vamana( int total_edges; raft::copy(&total_edges, d_total_edges.data_handle(), 1, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); auto edge_dest = raft::make_device_mdarray(res, From 5831bd5c6b4df1e95ce809401ba47b89ff682117 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 29 Jan 2025 21:51:06 +0000 Subject: [PATCH 11/13] Fix docs for DiskANN/Vamana --- docs/source/cpp_api/neighbors_vamana.rst | 2 +- docs/source/indexes/indexes.rst | 3 ++- docs/source/indexes/vamana.rst | 17 ++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/cpp_api/neighbors_vamana.rst b/docs/source/cpp_api/neighbors_vamana.rst index 08377c72a0..25447efce1 100644 --- a/docs/source/cpp_api/neighbors_vamana.rst +++ b/docs/source/cpp_api/neighbors_vamana.rst @@ -1,5 +1,5 @@ Vamana -===== +====== Vamana is the graph construction algorithm behind the well-known DiskANN vector search solution. The cuVS implementation of Vamana/DiskANN is a custom GPU-acceleration version of the algorithm that aims to reduce index construction time using NVIDIA GPUs. diff --git a/docs/source/indexes/indexes.rst b/docs/source/indexes/indexes.rst index a2fb1434a5..8746b84acc 100644 --- a/docs/source/indexes/indexes.rst +++ b/docs/source/indexes/indexes.rst @@ -9,6 +9,7 @@ Nearest Neighbor Indexes cagra.rst ivfflat.rst ivfpq.rst + vamana.rst Indices and tables @@ -16,4 +17,4 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` -* :ref:`search` \ No newline at end of file +* :ref:`search` diff --git a/docs/source/indexes/vamana.rst b/docs/source/indexes/vamana.rst index a0dd45cdb8..43e98b0967 100644 --- a/docs/source/indexes/vamana.rst +++ b/docs/source/indexes/vamana.rst @@ -1,5 +1,5 @@ -CAGRA -===== +Vamana +====== VAMANA is the underlying graph construction algorithm used to construct indexes for the DiskANN vector search solution. DiskANN and the Vamana algorithm are described in detail in the `published paper `, and a highly optimized `open-source repository ` includes many features for index construction and search. In cuVS, we provide a version of the Vamana algorithm optimized for GPU architectures to accelreate graph construction to build DiskANN idnexes. At a high level, the Vamana algorithm operates as follows: @@ -14,7 +14,7 @@ The current implementation of DiskANN in cuVS only includes the 'in-memory' grap [ :doc:`C++ API <../cpp_api/neighbors_vamana>` | :doc:`Python API <../python_api/neighbors_vamana>` ] Interoperability with CPU DiskANN --------------------------- +--------------------------------- The 'vamana::serialize' API calls writes the index to a file with a format that is compatible with the `open-source DiskANN repositoriy `. This allows cuVS to be used to accelerate index construction while leveraging the efficient CPU-based search currently available. @@ -64,13 +64,12 @@ Memory footprint Vamana builds a graph that is stored in device memory. However, in order to serialize the index and write it to a file for later use, it must be moved into host memory. If the `include_dataset` parameter is also set, then the dataset must be resident in host memory when calling serialize as well. Device memory usage -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~ -The built index represents the graph as fixed degree, storing a total of :math:`graph_degree * n_index_vectors` edges. Graph construction also requires the dataset be in device memory (or it copies it to device during build). In addition, device memory is used during construction to sort and create the reverse edges. Thus, the amount of device memory needed depends on the dataset itself, but it is bounded by a maximum sum of: +The built index represents the graph as fixed degree, storing a total of :math:`graph\_degree * n\_index\_vectors` edges. Graph construction also requires the dataset be in device memory (or it copies it to device during build). In addition, device memory is used during construction to sort and create the reverse edges. Thus, the amount of device memory needed depends on the dataset itself, but it is bounded by a maximum sum of: -- vector dataset: :math:`n_index_vectors * n__dims * sizeof(T)` -- output graph: :math:`graph_degree * n_index_vectors * sizeof(IdxT)` -- scratch memory: :math:`n_index_vectors * max_fraction * (2 + graph_degree) * sizeof(IdxT)` +- vector dataset: :math:`n\_index\_vectors * n\_dims * sizeof(T)` +- output graph: :math:`graph\_degree * n\_index\_vectors * sizeof(IdxT)` +- scratch memory: :math:`n\_index\_vectors * max\_fraction * (2 + graph\_degree) * sizeof(IdxT)` Reduction in scratch device memory requirements are planned for upcoming releases of cuVS. - From cf24142d92d8b63efa3183148f9029a2fec2954b Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 29 Jan 2025 21:54:08 +0000 Subject: [PATCH 12/13] Fix spelling error. --- cpp/include/cuvs/neighbors/vamana.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index 88d7ec8bfc..bc205a6f4f 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -49,7 +49,7 @@ namespace cuvs::neighbors::vamana { * all vectors are re-inserted to improve graph quality. * `max_fraction`: The maximum batch size is this fraction of the total dataset size. Larger * gives faster build but lower graph quality. - * `alpha`: Used to determine how aggresive the pruning will be. + * `alpha`: Used to determine how aggressive the pruning will be. */ struct index_params : cuvs::neighbors::index_params { From 1427a5686bf3fac106f79a6c7aa0165d0281cad4 Mon Sep 17 00:00:00 2001 From: bkarsin Date: Wed, 29 Jan 2025 23:15:16 +0000 Subject: [PATCH 13/13] Removed extra whitespace --- docs/source/indexes/vamana.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/indexes/vamana.rst b/docs/source/indexes/vamana.rst index 43e98b0967..7e0a79fb84 100644 --- a/docs/source/indexes/vamana.rst +++ b/docs/source/indexes/vamana.rst @@ -7,9 +7,9 @@ VAMANA is the underlying graph construction algorithm used to construct indexes * 2. Iteratively insert batches of dataset vectors into the graph, connecting each inserted vector to neighbors based on a graph traversal. * 3. For each batch, create reverse edges and prune unnecessary edges. -There are many algorithmic details that are outlined in the `paper `, and many GPU-specific optimizations are included in this implementation. +There are many algorithmic details that are outlined in the `paper `, and many GPU-specific optimizations are included in this implementation. -The current implementation of DiskANN in cuVS only includes the 'in-memory' graph construction and a serialization step that writes the index to a file. This index file can be then used by the `open-source DiskANN ` library to perform efficient search. Additional DiskANN functionality, including GPU-accelerated search and 'ssd' index build are planned for future cuVS releases. +The current implementation of DiskANN in cuVS only includes the 'in-memory' graph construction and a serialization step that writes the index to a file. This index file can be then used by the `open-source DiskANN ` library to perform efficient search. Additional DiskANN functionality, including GPU-accelerated search and 'ssd' index build are planned for future cuVS releases. [ :doc:`C++ API <../cpp_api/neighbors_vamana>` | :doc:`Python API <../python_api/neighbors_vamana>` ] @@ -61,7 +61,7 @@ The 2 hyper-parameters that are most often tuned are `graph_degree` and `visited Memory footprint ---------------- -Vamana builds a graph that is stored in device memory. However, in order to serialize the index and write it to a file for later use, it must be moved into host memory. If the `include_dataset` parameter is also set, then the dataset must be resident in host memory when calling serialize as well. +Vamana builds a graph that is stored in device memory. However, in order to serialize the index and write it to a file for later use, it must be moved into host memory. If the `include_dataset` parameter is also set, then the dataset must be resident in host memory when calling serialize as well. Device memory usage ~~~~~~~~~~~~~~~~~~~