Skip to content
Merged
28 changes: 25 additions & 3 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#include <cstdio>
#include <vector>

#include <sys/mman.h>

namespace cuvs::neighbors::cagra::detail {

template <typename IdxT>
Expand Down Expand Up @@ -485,6 +487,19 @@ auto iterative_build_graph(
}
}

// Allocate memory for neighbors list using Transparent HugePage
constexpr size_t thp_size = 2 * 1024 * 1024;
size_t byte_size = sizeof(IdxT) * final_graph_size * topk;
if (byte_size % thp_size) { byte_size += thp_size - (byte_size % thp_size); }
IdxT* neighbors_ptr =
(IdxT*)mmap(NULL, byte_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (neighbors_ptr == MAP_FAILED) {
perror("mmap");
exit(-1);
}
if (madvise(neighbors_ptr, byte_size, MADV_HUGEPAGE) != 0) { perror("madvise"); }
memset(neighbors_ptr, 0, byte_size);

Comment thread
anaruse marked this conversation as resolved.
auto curr_graph_size = initial_graph_size;
while (true) {
RAFT_LOG_DEBUG("# graph_size = %lu (%.3lf)",
Expand Down Expand Up @@ -516,7 +531,9 @@ auto iterative_build_graph(

auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
auto neighbors = raft::make_host_matrix<IdxT, int64_t>(curr_query_size, curr_topk);

auto neighbors_view =
raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);

// Search.
// Since there are many queries, divide them into batches and search them.
Expand All @@ -543,7 +560,7 @@ auto iterative_build_graph(
batch_dev_distances_view);

auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
neighbors.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
raft::copy(batch_neighbors_view.data_handle(),
batch_dev_neighbors_view.data_handle(),
batch_neighbors_view.size(),
Expand All @@ -556,10 +573,15 @@ auto iterative_build_graph(
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(0, 0); // delete existing grahp
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(curr_graph_size, curr_graph_degree);
optimize<IdxT>(
res, neighbors.view(), cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
if (flag_last) { break; }
}

if (munmap(neighbors_ptr, byte_size) != 0) {
perror("munmap");
exit(-1);
}

return cagra_graph;
}

Expand Down
210 changes: 144 additions & 66 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1157,26 +1157,6 @@ void optimize(
{
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune");
//
// Prune kNN graph
//
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dev_stats = raft::make_device_vector<uint64_t>(res, 2);
auto host_stats = raft::make_host_vector<uint64_t>(2);

//
// Prune unimportant edges.
Expand All @@ -1191,57 +1171,155 @@ void optimize(
// specified number of edges are picked up for each node, starting with the
// edge with the lowest number of 2-hop detours.
//
const double time_prune_start = cur_time();
RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

// Copy input_graph_ptr over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (input_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%zu). "
"It must be equal to or smaller than %d.",
input_graph_degree,
1024);
auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);

//
// If the available device memory is insufficient, do not use the GPU to count
// the number of 2-hop detours, but use the CPU.
//
bool use_gpu = true;
try {
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_input_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
} catch (std::bad_alloc& e) {
RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU");
use_gpu = false;
} catch (raft::logic_error& e) {
RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU (logic error)");
use_gpu = false;
}
const uint32_t batch_size =
std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
const dim3 threads_prune(32, 1, 1);
const dim3 blocks_prune(batch_size, 1, 1);

RAFT_CUDA_TRY(cudaMemsetAsync(
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
uint64_t num_keep __attribute__((unused)) = 0;
uint64_t num_full __attribute__((unused)) = 0;
if (use_gpu) {
Comment thread
anaruse marked this conversation as resolved.
Outdated
// Count 2-hop detours on GPU
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune/2-hop-counting-by-GPU");

auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dev_stats = raft::make_device_vector<uint64_t>(res, 2);
auto host_stats = raft::make_host_vector<uint64_t>(2);

const double time_prune_start = cur_time();
RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

// Copy input_graph_ptr over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(
input_graph_ptr, graph_size, input_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (input_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%zu). "
"It must be equal to or smaller than %d.",
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
1024);
Comment thread
anaruse marked this conversation as resolved.
Outdated
}
const uint32_t batch_size =
std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
const dim3 threads_prune(32, 1, 1);
const dim3 blocks_prune(batch_size, 1, 1);

RAFT_CUDA_TRY(cudaMemsetAsync(
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
(double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
}
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
(double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
}
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");
RAFT_LOG_DEBUG("\n");

host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
raft::copy(detour_count.data_handle(),
d_detour_count.data_handle(),
detour_count.size(),
raft::resource::get_cuda_stream(res));

raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
const auto num_keep = host_stats.data_handle()[0];
const auto num_full = host_stats.data_handle()[1];
raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
num_keep = host_stats.data_handle()[0];
num_full = host_stats.data_handle()[1];
} else {
// Count 2-hop detours on CPU
Comment thread
anaruse marked this conversation as resolved.
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune/2-hop-counting-by-CPU");

#pragma omp parallel for reduction(+ : num_keep, num_full)
for (IdxT iA = 0; iA < graph_size; iA++) {
uint32_t num_edges_no_detour = 0;
// Create a list of nodes, iB_candidates, that can be reached in 2-hops from node A.
auto iB_candidates = raft::make_host_vector<IdxT, int64_t>((input_graph_degree - 1) *
Comment thread
tfeher marked this conversation as resolved.
Outdated
(input_graph_degree - 1));
for (uint64_t kAC = 0; kAC < input_graph_degree - 1; kAC++) {
IdxT iC = input_graph_ptr[kAC + (input_graph_degree * iA)];
Comment thread
anaruse marked this conversation as resolved.
Outdated
for (uint64_t kCB = 0; kCB < input_graph_degree - 1; kCB++) {
IdxT iB_candidate;
if (iC == iA || iC >= graph_size) {
iB_candidate = graph_size;
} else {
iB_candidate = input_graph_ptr[kCB + (input_graph_degree * iC)];
Comment thread
anaruse marked this conversation as resolved.
Outdated
if (iB_candidate == iA || iB_candidate == iC) { iB_candidate = graph_size; }
}
uint64_t idx;
if (kAC < kCB) {
idx = (kCB * kCB) + kAC;
} else {
idx = (kAC * (kAC + 1)) + kCB;
}
iB_candidates(idx) = iB_candidate;
}
}
// Count how many 2-hop detours are on each edge of node A.
for (uint64_t kAB = 0; kAB < input_graph_degree; kAB++) {
constexpr uint32_t max_count = 255;
uint32_t count = 0;
IdxT iB = input_graph_ptr[kAB + (input_graph_degree * iA)];
Comment thread
anaruse marked this conversation as resolved.
Outdated
if (iB == iA) {
count = max_count;
} else {
for (uint64_t idx = 0; idx < kAB * kAB; idx++) {
if (iB_candidates(idx) == iB) { count += 1; }
}
}
detour_count(iA, kAB) = std::min(count, max_count);
if (count == 0) { num_edges_no_detour += 1; }
}
num_keep += num_edges_no_detour;
if (num_edges_no_detour > input_graph_degree) { num_full += 1; }
}
}

// Create pruned kNN graph
bool invalid_neighbor_list = false;
Expand Down