Skip to content

Commit eb4d38e

Browse files
authored
Enable kernel & memcpy overlapping in IVF index building (#230)
Currently, in IVF index building (both IVF-Flat and IVF-PQ), a large dataset is usually in pageable host memory or an mmap-ed file. In both cases, after the cluster centers are trained, the entire dataset needs to be copied twice to the GPU -- once for assigning vectors to clusters, and once for copying vectors to the corresponding clusters. Both copies are done using `batch_load_iterator` in a chunk-by-chunk fashion. Since the source buffer is in pageable memory, the current `batch_load_iterator` implementation doesn't support kernel and memcpy overlapping. This PR adds support for prefetching with `cudaMemcpyAsync` on pageable memory. We achieve kernel/copy overlapping by launching the kernel first, followed by the prefetching of the next chunk. We benchmarked the change on L40S. The results show a 3%-21% speedup on index building, without impacting the search recall (recall differences are about 1-2%, similar to run-to-run variance).

| algo | dataset | model | with prefetching (s) | without prefetching (s) | speedup |
| -- | -- | -- | -- | -- | -- |
| IVF-PQ | deep-100M | d64b5n50K | 97.3547 | 100.36 | 1.03 |
| IVF-PQ | wiki-all-10M | d64-nlist16K | 14.9763 | 18.1602 | 1.21 |
| IVF-Flat | deep-100M | nlist50K | 78.8188 | 81.4461 | 1.03 |

This PR is related to the issue submitted to RAFT: rapidsai/raft#2106

Authors:
- Rui Lan (https://github.com/abc99lr)
- Corey J. Nolet (https://github.com/cjnolet)

Approvers:
- Tamas Bela Feher (https://github.com/tfeher)

URL: #230
1 parent e67caa5 commit eb4d38e

12 files changed

Lines changed: 382 additions & 31 deletions

File tree

cpp/bench/ann/src/cuvs/cuvs_ivf_flat_wrapper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <raft/core/resource/cuda_stream.hpp>
2727
#include <raft/linalg/unary_op.cuh>
2828
#include <raft/util/cudart_utils.hpp>
29+
#include <rmm/cuda_stream_pool.hpp>
2930

3031
#include <cassert>
3132
#include <fstream>
@@ -96,6 +97,9 @@ class cuvs_ivf_flat : public algo<T>, public algo_gpu {
9697
template <typename T, typename IdxT>
9798
void cuvs_ivf_flat<T, IdxT>::build(const T* dataset, size_t nrow)
9899
{
100+
// Create a CUDA stream pool with 1 stream (besides main stream) for kernel/copy overlapping.
101+
size_t n_streams = 1;
102+
raft::resource::set_cuda_stream_pool(handle_, std::make_shared<rmm::cuda_stream_pool>(n_streams));
99103
index_ = std::make_shared<cuvs::neighbors::ivf_flat::index<T, IdxT>>(
100104
std::move(cuvs::neighbors::ivf_flat::build(
101105
handle_,

cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <raft/linalg/unary_op.cuh>
3131
#include <raft/neighbors/refine.cuh>
3232
#include <raft/util/cudart_utils.hpp>
33+
#include <rmm/cuda_stream_pool.hpp>
3334

3435
#include <type_traits>
3536

@@ -115,6 +116,9 @@ void cuvs_ivf_pq<T, IdxT>::load(const std::string& file)
115116
template <typename T, typename IdxT>
116117
void cuvs_ivf_pq<T, IdxT>::build(const T* dataset, size_t nrow)
117118
{
119+
// Create a CUDA stream pool with 1 stream (besides main stream) for kernel/copy overlapping.
120+
size_t n_streams = 1;
121+
raft::resource::set_cuda_stream_pool(handle_, std::make_shared<rmm::cuda_stream_pool>(n_streams));
118122
auto dataset_v = raft::make_device_matrix_view<const T, IdxT>(dataset, IdxT(nrow), dim_);
119123
std::make_shared<cuvs::neighbors::ivf_pq::index<IdxT>>(
120124
std::move(cuvs::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)))

cpp/include/cuvs/neighbors/ivf_flat.hpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,11 +445,18 @@ void build(raft::resources const& handle,
445445
/**
446446
* @brief Build the index from the dataset for efficient search.
447447
*
448+
* Note, if index_params.add_data_on_build is set to true, the user can set a
449+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
450+
* overlapping.
451+
*
448452
* Usage example:
449453
* @code{.cpp}
450454
* using namespace cuvs::neighbors;
451455
* // use default index parameters
452456
* ivf_flat::index_params index_params;
457+
* // optional: create a stream pool with at least one stream to enable kernel and copy
458+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
459+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
453460
* // create and fill the index from a [N, D] dataset
454461
* auto index = ivf_flat::build(handle, dataset, index_params);
455462
* @endcode
@@ -468,11 +475,18 @@ auto build(raft::resources const& handle,
468475
/**
469476
* @brief Build the index from the dataset for efficient search.
470477
*
478+
* Note, if index_params.add_data_on_build is set to true, the user can set a
479+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
480+
* overlapping.
481+
*
471482
* Usage example:
472483
* @code{.cpp}
473484
* using namespace cuvs::neighbors;
474485
* // use default index parameters
475486
* ivf_flat::index_params index_params;
487+
* // optional: create a stream pool with at least one stream to enable kernel and copy
488+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
489+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
476490
* // create and fill the index from a [N, D] dataset
477491
* ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
478492
* ivf_flat::build(handle, dataset, index_params, index);
@@ -492,11 +506,18 @@ void build(raft::resources const& handle,
492506
/**
493507
* @brief Build the index from the dataset for efficient search.
494508
*
509+
* Note, if index_params.add_data_on_build is set to true, the user can set a
510+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
511+
* overlapping.
512+
*
495513
* Usage example:
496514
* @code{.cpp}
497515
* using namespace cuvs::neighbors;
498516
* // use default index parameters
499517
* ivf_flat::index_params index_params;
518+
* // optional: create a stream pool with at least one stream to enable kernel and copy
519+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
520+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
500521
* // create and fill the index from a [N, D] dataset
501522
* auto index = ivf_flat::build(handle, dataset, index_params);
502523
* @endcode
@@ -515,11 +536,18 @@ auto build(raft::resources const& handle,
515536
/**
516537
* @brief Build the index from the dataset for efficient search.
517538
*
539+
* Note, if index_params.add_data_on_build is set to true, the user can set a
540+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
541+
* overlapping.
542+
*
518543
* Usage example:
519544
* @code{.cpp}
520545
* using namespace cuvs::neighbors;
521546
* // use default index parameters
522547
* ivf_flat::index_params index_params;
548+
* // optional: create a stream pool with at least one stream to enable kernel and copy
549+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
550+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
523551
* // create and fill the index from a [N, D] dataset
524552
* ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
525553
* ivf_flat::build(handle, dataset, index_params, index);
@@ -539,11 +567,18 @@ void build(raft::resources const& handle,
539567
/**
540568
* @brief Build the index from the dataset for efficient search.
541569
*
570+
* Note, if index_params.add_data_on_build is set to true, the user can set a
571+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
572+
* overlapping.
573+
*
542574
* Usage example:
543575
* @code{.cpp}
544576
* using namespace cuvs::neighbors;
545577
* // use default index parameters
546578
* ivf_flat::index_params index_params;
579+
* // optional: create a stream pool with at least one stream to enable kernel and copy
580+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
581+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
547582
* // create and fill the index from a [N, D] dataset
548583
* auto index = ivf_flat::build(handle, dataset, index_params);
549584
* @endcode
@@ -562,11 +597,18 @@ auto build(raft::resources const& handle,
562597
/**
563598
* @brief Build the index from the dataset for efficient search.
564599
*
600+
* Note, if index_params.add_data_on_build is set to true, the user can set a
601+
* stream pool in the input raft::resource with at least one stream to enable kernel and copy
602+
* overlapping.
603+
*
565604
* Usage example:
566605
* @code{.cpp}
567606
* using namespace cuvs::neighbors;
568607
* // use default index parameters
569608
* ivf_flat::index_params index_params;
609+
* // optional: create a stream pool with at least one stream to enable kernel and copy
610+
* // overlapping. This is only applicable if index_params.add_data_on_build is set to true
611+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
570612
* // create and fill the index from a [N, D] dataset
571613
* ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
572614
* ivf_flat::build(handle, dataset, index_params, index);
@@ -710,6 +752,7 @@ auto extend(raft::resources const& handle,
710752
* @param[in] handle
711753
* @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
712754
* @param[in] new_indices optional raft::device_vector_view to a vector of indices [n_rows].
755+
*
713756
* If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
714757
* here to imply a continuous range `[0...n_rows)`.
715758
* @param[inout] idx pointer to index, to be overwritten in-place
@@ -786,6 +829,9 @@ void extend(raft::resources const& handle,
786829
/**
787830
* @brief Build a new index containing the data of the original plus new extra vectors.
788831
*
832+
* Note, the user can set a stream pool in the input raft::resource with
833+
* at least one stream to enable kernel and copy overlapping.
834+
*
789835
* Implementation note:
790836
* The new data is clustered according to existing kmeans clusters, then the cluster
791837
* centers are adjusted to match the newly labeled data.
@@ -798,6 +844,9 @@ void extend(raft::resources const& handle,
798844
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
799845
* // train the index from a [N, D] dataset
800846
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
847+
* // optional: create a stream pool with at least one stream to enable kernel and copy
848+
* // overlapping
849+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
801850
* // fill the index with the data
802851
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
803852
* auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty);
@@ -821,6 +870,9 @@ auto extend(raft::resources const& handle,
821870
/**
822871
* @brief Extend the index in-place with the new data.
823872
*
873+
* Note, the user can set a stream pool in the input raft::resource with
874+
* at least one stream to enable kernel and copy overlapping.
875+
*
824876
* Usage example:
825877
* @code{.cpp}
826878
* using namespace cuvs::neighbors;
@@ -829,6 +881,9 @@ auto extend(raft::resources const& handle,
829881
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
830882
* // train the index from a [N, D] dataset
831883
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
884+
* // optional: create a stream pool with at least one stream to enable kernel and copy
885+
* // overlapping
886+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
832887
* // fill the index with the data
833888
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
834889
* ivf_flat::extend(handle, dataset, no_op, &index_empty);
@@ -850,6 +905,9 @@ void extend(raft::resources const& handle,
850905
/**
851906
* @brief Build a new index containing the data of the original plus new extra vectors.
852907
*
908+
* Note, the user can set a stream pool in the input raft::resource with
909+
* at least one stream to enable kernel and copy overlapping.
910+
*
853911
* Implementation note:
854912
* The new data is clustered according to existing kmeans clusters, then the cluster
855913
* centers are adjusted to match the newly labeled data.
@@ -862,6 +920,9 @@ void extend(raft::resources const& handle,
862920
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
863921
* // train the index from a [N, D] dataset
864922
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
923+
* // optional: create a stream pool with at least one stream to enable kernel and copy
924+
* // overlapping
925+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
865926
* // fill the index with the data
866927
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
867928
* auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty);
@@ -885,6 +946,9 @@ auto extend(raft::resources const& handle,
885946
/**
886947
* @brief Extend the index in-place with the new data.
887948
*
949+
* Note, the user can set a stream pool in the input raft::resource with
950+
* at least one stream to enable kernel and copy overlapping.
951+
*
888952
* Usage example:
889953
* @code{.cpp}
890954
* using namespace cuvs::neighbors;
@@ -893,6 +957,9 @@ auto extend(raft::resources const& handle,
893957
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
894958
* // train the index from a [N, D] dataset
895959
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
960+
* // optional: create a stream pool with at least one stream to enable kernel and copy
961+
* // overlapping
962+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
896963
* // fill the index with the data
897964
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
898965
* ivf_flat::extend(handle, dataset, no_op, &index_empty);
@@ -914,6 +981,9 @@ void extend(raft::resources const& handle,
914981
/**
915982
* @brief Build a new index containing the data of the original plus new extra vectors.
916983
*
984+
* Note, the user can set a stream pool in the input raft::resource with
985+
* at least one stream to enable kernel and copy overlapping.
986+
*
917987
* Implementation note:
918988
* The new data is clustered according to existing kmeans clusters, then the cluster
919989
* centers are adjusted to match the newly labeled data.
@@ -926,6 +996,9 @@ void extend(raft::resources const& handle,
926996
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
927997
* // train the index from a [N, D] dataset
928998
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
999+
* // optional: create a stream pool with at least one stream to enable kernel and copy
1000+
* // overlapping
1001+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
9291002
* // fill the index with the data
9301003
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
9311004
* auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty);
@@ -949,6 +1022,9 @@ auto extend(raft::resources const& handle,
9491022
/**
9501023
* @brief Extend the index in-place with the new data.
9511024
*
1025+
* Note, the user can set a stream pool in the input raft::resource with
1026+
* at least one stream to enable kernel and copy overlapping.
1027+
*
9521028
* Usage example:
9531029
* @code{.cpp}
9541030
* using namespace cuvs::neighbors;
@@ -957,6 +1033,9 @@ auto extend(raft::resources const& handle,
9571033
* index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
9581034
* // train the index from a [N, D] dataset
9591035
* auto index_empty = ivf_flat::build(handle, index_params, dataset);
1036+
* // optional: create a stream pool with at least one stream to enable kernel and copy
1037+
* // overlapping
1038+
* raft::resource::set_cuda_stream_pool(handle, std::make_shared<rmm::cuda_stream_pool>(1));
9601039
* // fill the index with the data
9611040
* std::optional<raft::host_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
9621041
* ivf_flat::extend(handle, dataset, no_op, &index_empty);

0 commit comments

Comments
 (0)