Skip to content
Merged
28 changes: 25 additions & 3 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#include <cstdio>
#include <vector>

#include <sys/mman.h>

namespace cuvs::neighbors::cagra::detail {

template <typename IdxT>
Expand Down Expand Up @@ -485,6 +487,19 @@ auto iterative_build_graph(
}
}

// Allocate memory for neighbors list using Transparent HugePage
constexpr size_t thp_size = 2 * 1024 * 1024;
size_t byte_size = sizeof(IdxT) * final_graph_size * topk;
if (byte_size % thp_size) { byte_size += thp_size - (byte_size % thp_size); }
IdxT* neighbors_ptr =
(IdxT*)mmap(NULL, byte_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (neighbors_ptr == MAP_FAILED) {
perror("mmap");
exit(-1);
}
if (madvise(neighbors_ptr, byte_size, MADV_HUGEPAGE) != 0) { perror("madvise"); }
memset(neighbors_ptr, 0, byte_size);

Comment thread
anaruse marked this conversation as resolved.
auto curr_graph_size = initial_graph_size;
while (true) {
RAFT_LOG_DEBUG("# graph_size = %lu (%.3lf)",
Expand Down Expand Up @@ -516,7 +531,9 @@ auto iterative_build_graph(

auto dev_query_view = raft::make_device_matrix_view<const T, int64_t>(
dev_dataset.data_handle(), (int64_t)curr_query_size, dev_dataset.extent(1));
auto neighbors = raft::make_host_matrix<IdxT, int64_t>(curr_query_size, curr_topk);

auto neighbors_view =
raft::make_host_matrix_view<IdxT, int64_t>(neighbors_ptr, curr_query_size, curr_topk);

// Search.
// Since there are many queries, divide them into batches and search them.
Expand All @@ -543,7 +560,7 @@ auto iterative_build_graph(
batch_dev_distances_view);

auto batch_neighbors_view = raft::make_host_matrix_view<IdxT, int64_t>(
neighbors.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
neighbors_view.data_handle() + batch.offset() * curr_topk, batch.size(), curr_topk);
raft::copy(batch_neighbors_view.data_handle(),
batch_dev_neighbors_view.data_handle(),
batch_neighbors_view.size(),
Expand All @@ -556,10 +573,15 @@ auto iterative_build_graph(
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(0, 0); // delete existing grahp
cagra_graph = raft::make_host_matrix<IdxT, int64_t>(curr_graph_size, curr_graph_degree);
optimize<IdxT>(
res, neighbors.view(), cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
res, neighbors_view, cagra_graph.view(), flag_last ? params.guarantee_connectivity : 0);
if (flag_last) { break; }
}

if (munmap(neighbors_ptr, byte_size) != 0) {
perror("munmap");
exit(-1);
}

return cagra_graph;
}

Expand Down
266 changes: 184 additions & 82 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,58 @@ void mst_optimization(raft::resources const& res,
RAFT_LOG_DEBUG("# MST optimization time: %.1lf sec", time_mst_opt_end - time_mst_opt_start);
}

// Count, for every directed edge A->B of the kNN graph, how many 2-hop
// "detour" paths A->C->B exist that use only edges ranked strictly lower
// (i.e. closer neighbors) than the edge A->B itself. This is the CPU
// counterpart used when the GPU path is unavailable; results are written
// into detour_count, saturated to 255 (it is a uint8_t matrix).
//
// knn_graph:    [graph_size x graph_degree] neighbor lists, row i = node i.
// detour_count: [graph_size x graph_degree] output, same shape as knn_graph.
template <typename IdxT = uint32_t>
void count_2hop_detours(raft::host_matrix_view<IdxT, int64_t, raft::row_major> knn_graph,
                        raft::host_matrix_view<uint8_t, int64_t, raft::row_major> detour_count)
{
  RAFT_EXPECTS(knn_graph.extent(0) == detour_count.extent(0),
               "knn_graph and detour_count are expected to have the same number of rows");
  RAFT_EXPECTS(knn_graph.extent(1) == detour_count.extent(1),
               "knn_graph and detour_count are expected to have the same number of cols");
  const uint64_t graph_size   = knn_graph.extent(0);
  const uint64_t graph_degree = knn_graph.extent(1);

  // Each row (node A) is independent, so rows are processed in parallel.
  // NOTE(review): IdxT is typically unsigned; an unsigned OpenMP loop variable
  // requires OpenMP 3.0+ — confirm against the project's minimum toolchain.
  // NOTE(review): iB_candidates is heap-allocated once per row inside the
  // parallel loop; hoisting it per-thread could reduce allocator pressure.
#pragma omp parallel for
  for (IdxT iA = 0; iA < graph_size; iA++) {
    // Create a list of nodes, iB_candidates, that can be reached in 2-hops from node A.
    // Only the first (graph_degree - 1) neighbors matter on each hop, because an
    // edge of rank kAB can only be detoured via edges of rank < kAB (see the
    // idx < kAB*kAB scan below), and kAB is at most graph_degree - 1.
    auto iB_candidates =
      raft::make_host_vector<IdxT, int64_t>((graph_degree - 1) * (graph_degree - 1));
    for (uint64_t kAC = 0; kAC < graph_degree - 1; kAC++) {
      IdxT iC = knn_graph(iA, kAC);
      for (uint64_t kCB = 0; kCB < graph_degree - 1; kCB++) {
        IdxT iB_candidate;
        if (iC == iA || iC >= graph_size) {
          // Self-loop or out-of-range intermediate node: store the sentinel
          // value graph_size (never a valid node id) so it matches nothing.
          iB_candidate = graph_size;
        } else {
          iB_candidate = knn_graph(iC, kCB);
          // A 2-hop path that returns to A, or stalls at C, is not a detour.
          if (iB_candidate == iA || iB_candidate == iC) { iB_candidate = graph_size; }
        }
        uint64_t idx;
        // Pack the rank pair (kAC, kCB) so that every pair with
        // max(kAC, kCB) < k lands at an index < k*k:
        //   kAC <  kCB: idx in [kCB^2,       kCB^2 + kCB)     -- below-diagonal band
        //   kAC >= kCB: idx in [kAC^2 + kAC, kAC^2 + 2*kAC]   -- on/above-diagonal band
        // Both bands stay strictly below (max(kAC,kCB)+1)^2, and the two cases
        // never collide, so the prefix scan "idx < kAB*kAB" below enumerates
        // exactly the 2-hop paths whose edges are both ranked lower than kAB.
        if (kAC < kCB) {
          idx = (kCB * kCB) + kAC;
        } else {
          idx = (kAC * (kAC + 1)) + kCB;
        }
        iB_candidates(idx) = iB_candidate;
      }
    }
    // Count how many 2-hop detours are on each edge of node A.
    for (uint64_t kAB = 0; kAB < graph_degree; kAB++) {
      // Saturate at 255: detour_count stores uint8_t per edge.
      constexpr uint32_t max_count = 255;
      uint32_t count = 0;
      IdxT iB = knn_graph(iA, kAB);
      if (iB == iA) {
        // Self-edge: mark as maximally detourable so it is pruned first.
        count = max_count;
      } else {
        // Thanks to the packing above, indices [0, kAB^2) hold exactly the
        // candidates reachable via two edges both ranked below kAB.
        for (uint64_t idx = 0; idx < kAB * kAB; idx++) {
          if (iB_candidates(idx) == iB) { count += 1; }
        }
      }
      detour_count(iA, kAB) = std::min(count, max_count);
    }
  }
}

template <
typename IdxT = uint32_t,
typename g_accessor =
Expand All @@ -1122,13 +1174,14 @@ void optimize(
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint64_t input_graph_degree = knn_graph.extent(1);
// const uint64_t input_graph_degree = knn_graph.extent(1);
const uint64_t knn_graph_degree = knn_graph.extent(1);
const uint64_t output_graph_degree = new_graph.extent(1);
const uint64_t graph_size = new_graph.extent(0);
auto input_graph_ptr = knn_graph.data_handle();
auto output_graph_ptr = new_graph.data_handle();
// auto input_graph_ptr = knn_graph.data_handle();
auto output_graph_ptr = new_graph.data_handle();
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
"cagra::graph::optimize(%zu, %zu, %u)", graph_size, input_graph_degree, output_graph_degree);
"cagra::graph::optimize(%zu, %zu, %u)", graph_size, knn_graph_degree, output_graph_degree);

// MST optimization
auto mst_graph = raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0);
Expand Down Expand Up @@ -1157,26 +1210,7 @@ void optimize(
{
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune");
//
// Prune kNN graph
//
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dev_stats = raft::make_device_vector<uint64_t>(res, 2);
auto host_stats = raft::make_host_vector<uint64_t>(2);
const double time_prune_start = cur_time();

//
// Prune unimportant edges.
Expand All @@ -1191,57 +1225,132 @@ void optimize(
// specified number of edges are picked up for each node, starting with the
// edge with the lowest number of 2-hop detours.
//
const double time_prune_start = cur_time();
RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

// Copy input_graph_ptr over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (input_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%zu). "
"It must be equal to or smaller than %d.",
input_graph_degree,
1024);
auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, knn_graph_degree);

//
// If the available device memory is insufficient, do not use the GPU to count
// the number of 2-hop detours, but use the CPU.
//
// bool use_gpu = true;
bool use_gpu = false;
Comment thread
anaruse marked this conversation as resolved.
Outdated
if (use_gpu) {
try {
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, knn_graph_degree);
auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_input_graph =
raft::make_device_matrix<IdxT, int64_t>(res, graph_size, knn_graph_degree);
} catch (std::bad_alloc& e) {
RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU");
use_gpu = false;
} catch (raft::logic_error& e) {
RAFT_LOG_DEBUG("Insufficient memory for 2-hop node counting on GPU (logic error)");
use_gpu = false;
}
}
const uint32_t batch_size =
std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
const dim3 threads_prune(32, 1, 1);
const dim3 blocks_prune(batch_size, 1, 1);

RAFT_CUDA_TRY(cudaMemsetAsync(
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
input_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
if (use_gpu) {
Comment thread
anaruse marked this conversation as resolved.
Outdated
// Count 2-hop detours on GPU
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune/2-hop-counting-by-GPU");
const double time_2hop_count_start = cur_time();

uint64_t num_keep __attribute__((unused)) = 0;
uint64_t num_full __attribute__((unused)) = 0;
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, knn_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * knn_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dev_stats = raft::make_device_vector<uint64_t>(res, 2);
auto host_stats = raft::make_host_vector<uint64_t>(2);

RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");

// Copy knn_graph over to device if necessary
device_matrix_view_from_host d_input_graph(
res,
raft::make_host_matrix_view<IdxT, int64_t>(
knn_graph.data_handle(), graph_size, knn_graph_degree));

constexpr int MAX_DEGREE = 1024;
if (knn_graph_degree > MAX_DEGREE) {
RAFT_FAIL(
"The degree of input knn graph is too large (%zu). "
"It must be equal to or smaller than %d.",
knn_graph_degree,
MAX_DEGREE);
}
const uint32_t batch_size =
std::min(static_cast<uint32_t>(graph_size), static_cast<uint32_t>(256 * 1024));
const uint32_t num_batch = (graph_size + batch_size - 1) / batch_size;
const dim3 threads_prune(32, 1, 1);
const dim3 blocks_prune(batch_size, 1, 1);

RAFT_CUDA_TRY(cudaMemsetAsync(
dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, raft::resource::get_cuda_stream(res)));

for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
kern_prune<MAX_DEGREE, IdxT>
<<<blocks_prune, threads_prune, 0, raft::resource::get_cuda_stream(res)>>>(
d_input_graph.data_handle(),
graph_size,
knn_graph_degree,
output_graph_degree,
batch_size,
i_batch,
d_detour_count.data_handle(),
d_num_no_detour_edges.data_handle(),
dev_stats.data_handle());
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
(double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
}
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");

raft::copy(detour_count.data_handle(),
d_detour_count.data_handle(),
detour_count.size(),
raft::resource::get_cuda_stream(res));

raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
num_keep = host_stats.data_handle()[0];
num_full = host_stats.data_handle()[1];

const double time_2hop_count_end = cur_time();
RAFT_LOG_DEBUG(
"# Pruning kNN Graph on GPUs (%.1lf %%)\r",
(double)std::min<IdxT>((i_batch + 1) * batch_size, graph_size) / graph_size * 100);
}
raft::resource::sync_stream(res);
RAFT_LOG_DEBUG("\n");
"# Time for 2-hop detour counting on GPU: %.1lf sec, "
"avg_no_detour_edges_per_node: %.2lf/%u, "
"nodes_with_no_detour_at_all_edges: %.1lf%%",
time_2hop_count_end - time_2hop_count_start,
(double)num_keep / graph_size,
output_graph_degree,
(double)num_full / graph_size * 100);
} else {
// Count 2-hop detours on CPU
Comment thread
anaruse marked this conversation as resolved.
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> block_scope(
"cagra::graph::optimize/prune/2-hop-counting-by-CPU");
const double time_2hop_count_start = cur_time();

host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
count_2hop_detours(knn_graph, detour_count.view());

raft::copy(
host_stats.data_handle(), dev_stats.data_handle(), 2, raft::resource::get_cuda_stream(res));
const auto num_keep = host_stats.data_handle()[0];
const auto num_full = host_stats.data_handle()[1];
const double time_2hop_count_end = cur_time();
RAFT_LOG_DEBUG(/* TODO: change to DEBUG */
"# Time for 2-hop detour counting on CPU: %.1lf sec",
time_2hop_count_end - time_2hop_count_start);
}

// Create pruned kNN graph
bool invalid_neighbor_list = false;
Expand All @@ -1251,10 +1360,10 @@ void optimize(
// count of the neighbors while increasing the target detourable count from zero.
uint64_t pk = 0;
uint32_t num_detour = 0;
for (uint32_t l = 0; l < input_graph_degree && pk < output_graph_degree; l++) {
for (uint32_t l = 0; l < knn_graph_degree && pk < output_graph_degree; l++) {
uint32_t next_num_detour = std::numeric_limits<uint32_t>::max();
for (uint64_t k = 0; k < input_graph_degree; k++) {
const auto num_detour_k = detour_count.data_handle()[k + (input_graph_degree * i)];
for (uint64_t k = 0; k < knn_graph_degree; k++) {
const auto num_detour_k = detour_count.data_handle()[k + (knn_graph_degree * i)];
Comment thread
anaruse marked this conversation as resolved.
Outdated
// Find the detourable count to check in the next iteration
if (num_detour_k > num_detour) {
next_num_detour = std::min(static_cast<uint32_t>(num_detour_k), next_num_detour);
Expand All @@ -1264,7 +1373,7 @@ void optimize(
if (num_detour_k != num_detour) { continue; }

// Check duplication and append
const auto candidate_node = input_graph_ptr[k + (input_graph_degree * i)];
const auto candidate_node = knn_graph(i, k);
bool dup = false;
for (uint32_t dk = 0; dk < pk; dk++) {
if (candidate_node == output_graph_ptr[i * output_graph_degree + dk]) {
Expand Down Expand Up @@ -1303,14 +1412,7 @@ void optimize(
"overflows occur during the norm computation between the dataset vectors.");

const double time_prune_end = cur_time();
RAFT_LOG_DEBUG(
"# Pruning time: %.1lf sec, "
"avg_no_detour_edges_per_node: %.2lf/%u, "
"nodes_with_no_detour_at_all_edges: %.1lf%%\n",
time_prune_end - time_prune_start,
(double)num_keep / graph_size,
output_graph_degree,
(double)num_full / graph_size * 100);
RAFT_LOG_DEBUG("# Pruning time: %.1lf sec", time_prune_end - time_prune_start);
}

auto rev_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
Expand Down