Skip to content

Commit e04c73a

Browse files
anarusemythrocks
authored and committed
Reduce device memory usage for CAGRA's graph optimization process (2-hop detour counting) (rapidsai#822)
CAGRA takes the initial knn graph as input and optimizes it to create a search graph. Several types of processing are performed in the graph optimization, the most memory-intensive of which is the counting of 2-hop detours. Currently, the counting of 2-hop detours is performed on the GPU to speed up processing, and this requires that the entire initial knn graph be placed in device memory. In general, the size of the initial knn graph is 2x the size of the search graph. In other words, in the current implementation, roughly half the device memory size is the upper limit of the search graph that can be created. As it is, creating search graphs for huge datasets requires a GPU with a large amount of device memory, which is not practical. To address this issue, this PR adds a CPU implementation of 2-hop detour counting and uses this CPU implementation to count 2-hop detours when device memory is insufficient. The CPU implementation supports thread parallelism and is optimized to reduce conditional branches and is sufficiently fast. Of course, it is slower than the GPU implementation, but it can count 2-hop detours in about 3 to 4 times the time of the GPU implementation. Since the time for counting 2-hop detours on GPU is approximately 10% of the total indexing time, the overall time will increase by 20-30% when using the CPU implementation, but this is well within the practical range. Authors: - Akira Naruse (https://github.com/anaruse) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: rapidsai#822
1 parent bff5e51 commit e04c73a

2 files changed

Lines changed: 245 additions & 87 deletions

File tree

cpp/src/neighbors/detail/cagra/cagra_build.cuh

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
#include <cstdio>
4545
#include <vector>
4646

47+
#include <sys/mman.h>
48+
4749
namespace cuvs::neighbors::cagra::detail {
4850

4951
template <typename IdxT>
@@ -410,6 +412,52 @@ void optimize(
410412
res, knn_graph_internal, new_graph_internal, guarantee_connectivity);
411413
}
412414

415+
// RAII wrapper for allocating memory with Transparent HugePage
struct mmap_owner {
  // Allocate a new anonymous mapping (not backed by a file) and advise the
  // kernel to back it with transparent huge pages.
  //
  // @param size number of bytes to map; callers should round it up to a
  //             multiple of the huge-page size (2 MiB) for the hint to apply
  //             cleanly.
  // @throws std::runtime_error if mmap or madvise fails. NOTE(review):
  //         madvise(MADV_HUGEPAGE) returns EINVAL on kernels built without
  //         CONFIG_TRANSPARENT_HUGEPAGE; in that case this throws rather than
  //         silently falling back to regular pages — confirm this is intended.
  mmap_owner(size_t size) : size_{size}
  {
    int flags = MAP_ANONYMOUS | MAP_PRIVATE;
    ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (ptr_ == MAP_FAILED) {
      ptr_ = nullptr;
      throw std::runtime_error("cuvs::mmap_owner error: mmap failed");
    }
    if (madvise(ptr_, size, MADV_HUGEPAGE) != 0) {
      // Release the mapping before throwing so the failed object owns nothing.
      munmap(ptr_, size);
      ptr_ = nullptr;
      throw std::runtime_error("cuvs::mmap_owner error: madvise(MADV_HUGEPAGE) failed");
    }
  }

  ~mmap_owner() noexcept
  {
    if (ptr_ != nullptr) { munmap(ptr_, size_); }
  }

  // No copies for owning struct
  mmap_owner(const mmap_owner& res) = delete;
  auto operator=(const mmap_owner& other) -> mmap_owner& = delete;
  // Moving is fine. Moves are noexcept: a throwing move would make generic
  // code fall back to copying, which is deleted for this owning type.
  mmap_owner(mmap_owner&& other) noexcept
    : ptr_{std::exchange(other.ptr_, nullptr)}, size_{std::exchange(other.size_, 0)}
  {
  }
  auto operator=(mmap_owner&& other) noexcept -> mmap_owner&
  {
    // Swapping (rather than unmapping here) is self-move safe; the previous
    // mapping, if any, is released when `other` is destroyed.
    std::swap(this->ptr_, other.ptr_);
    std::swap(this->size_, other.size_);
    return *this;
  }

  // Pointer to the mapped region (nullptr after being moved from).
  [[nodiscard]] auto data() const -> void* { return ptr_; }
  // Size in bytes of the mapped region (0 after being moved from).
  [[nodiscard]] auto size() const -> size_t { return size_; }

 private:
  void* ptr_;
  size_t size_;
};
460+
413461
template <typename T,
414462
typename IdxT = uint32_t,
415463
typename Accessor = raft::host_device_accessor<std::experimental::default_accessor<T>,
@@ -493,6 +541,14 @@ auto iterative_build_graph(
493541
}
494542
}
495543

544+
// Allocate memory for neighbors list using Transparent HugePage
545+
constexpr size_t thp_size = 2 * 1024 * 1024;
546+
size_t byte_size = sizeof(IdxT) * final_graph_size * topk;
547+
if (byte_size % thp_size) { byte_size += thp_size - (byte_size % thp_size); }
548+
mmap_owner neighbors_list(byte_size);
549+
IdxT* neighbors_ptr = (IdxT*)neighbors_list.data();
550+
memset(neighbors_ptr, 0, byte_size);
551+
496552
auto curr_graph_size = initial_graph_size;
497553
while (true) {
498554
RAFT_LOG_DEBUG("# graph_size = %lu (%.3lf)",
@@ -524,7 +580,9 @@ auto iterative_build_graph(
524580

525581
auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
526582
dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
527-
auto neighbors = raft::make_host_matrix<IdxT, int64_t>(curr_query_size, curr_topk);
583+
584+
auto neighbors_view =
585+
raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);
528586

529587
// Search.
530588
// Since there are many queries, divide them into batches and search them.
@@ -551,7 +609,7 @@ auto iterative_build_graph(
551609
batch_dev_distances_view);
552610

553611
auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
554-
neighbors.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
612+
neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
555613
raft::copy(batch_neighbors_view.data_handle(),
556614
batch_dev_neighbors_view.data_handle(),
557615
batch_neighbors_view.size(),
@@ -564,7 +622,7 @@ auto iterative_build_graph(
564622
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(0, 0); // delete existing graph
565623
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(curr_graph_size, curr_graph_degree);
566624
optimize<IdxT>(
567-
res, neighbors.view(), cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
625+
res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
568626
if (flag_last) { break; }
569627
}
570628

0 commit comments

Comments
 (0)