Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
90 commits
Select commit Hold shift + click to select a range
cfe4f92
Initial commit for PQ preprocessing API
lowener Aug 22, 2025
096daa5
Support `n_lists` and cleanup code
lowener Aug 25, 2025
244a9cd
Switch to VPQ
lowener Sep 22, 2025
9537eb1
Fix trainpq and train workflow
lowener Sep 23, 2025
2883a25
Remove timer
lowener Sep 23, 2025
78dfd69
Merge branch 'branch-25.10' into 25.10-pq-preprocessing
lowener Sep 23, 2025
9dd0cfe
Cleanup Code
lowener Sep 23, 2025
9c543fb
Add double dtype
lowener Sep 24, 2025
5471d9a
Add C and python API
lowener Sep 26, 2025
716fa58
Merge branch 'branch-25.10' into 25.10-pq-preprocessing
lowener Sep 26, 2025
6d6d4ca
Merge branch 'branch-25.10' into 25.10-pq-preprocessing
KyleFromNVIDIA Sep 26, 2025
746cac4
Make VQ optional
lowener Sep 29, 2025
1950da4
Add option for classical KMeans
lowener Sep 29, 2025
75629e5
Add kmeans option to python
lowener Sep 30, 2025
32c8912
Merge branch 'branch-25.10' into 25.10-pq-preprocessing
lowener Oct 1, 2025
d774999
Add getter for pq codebooks
lowener Sep 30, 2025
a55df82
Fix doc
lowener Oct 1, 2025
5a151f9
Fix reconstruct kernel
lowener Oct 2, 2025
a0c5071
Merge branch 'branch-25.12' into 25.10-pq-preprocessing
lowener Oct 6, 2025
7112efe
Add Vector Quantization
lowener Oct 14, 2025
407b500
Merge branch 'branch-25.12' into 25.10-pq-preprocessing
lowener Oct 14, 2025
51d9c94
Merge C API with latest changes
lowener Oct 14, 2025
37d9f7c
Add VQ to C/Python API
lowener Oct 16, 2025
eea1421
Address reviews on struct/enum declaration
lowener Oct 16, 2025
a73809c
Fix params order
lowener Oct 16, 2025
7bd8f17
Update C/Python doc
lowener Oct 20, 2025
3e8ef62
Merge branch 'branch-25.12' into 25.10-pq-preprocessing
lowener Oct 20, 2025
f5bf4ea
Merge branch 'main' into 25.10-pq-preprocessing
lowener Oct 28, 2025
cebc548
Improve docs
lowener Oct 28, 2025
f6b7829
Update copyright
lowener Oct 28, 2025
f8fd16e
Merge branch 'main' into 25.10-pq-preprocessing
lowener Oct 29, 2025
f04545a
Fix compilation
lowener Nov 3, 2025
487376c
Switch namespace to pq
lowener Nov 6, 2025
03f8761
Add shared mem to compute_code PQ kernel
lowener Nov 6, 2025
9ebca8a
Add use_pq parameter
lowener Nov 10, 2025
a2c833b
Add subspace option for PQ
lowener Nov 18, 2025
bf68702
Merge branch 'main' into 25.10-pq-preprocessing
lowener Nov 20, 2025
294db1c
Remove double+simplify train
lowener Nov 21, 2025
746961d
Optimize train steps, use build cluster for km balanced
lowener Nov 21, 2025
cf6d482
Simplify subspace build loop
lowener Nov 26, 2025
6cdfa9d
Split PQ params
lowener Dec 1, 2025
67a5997
Merge branch 'main' into 25.10-pq-preprocessing
lowener Dec 9, 2025
8d12d2d
Add extreme cpp test cases, Add support for host dataset
lowener Dec 10, 2025
ab0fa28
Merge branch 'main' into 25.10-pq-preprocessing
lowener Dec 12, 2025
d4a46fa
Fix compilation mdspan changes
lowener Dec 16, 2025
377b908
Merge branch 'main' into 25.10-pq-preprocessing
lowener Dec 16, 2025
b13f9f0
Add c header to all
lowener Dec 16, 2025
5f5791a
Add pool allocator in example
lowener Dec 16, 2025
22e83e6
Update python docstring
lowener Dec 16, 2025
fe6753e
Fix test when nrows == n_centers for VPQ
lowener Dec 16, 2025
d05c85c
Fix train conditions
lowener Dec 16, 2025
cc90c75
Revert "Fix test when nrows == n_centers for VPQ"
lowener Dec 16, 2025
d8e6c84
Fix doc
lowener Dec 17, 2025
c850df7
Merge branch 'main' into 25.10-pq-preprocessing
lowener Dec 18, 2025
dcd8380
Merge branch 'main' into 25.10-pq-preprocessing
lowener Dec 23, 2025
86465ff
Cooperative load + prefetch
lowener Dec 24, 2025
9bfe19e
Compute by chunk of 4
lowener Dec 24, 2025
4730df6
Fix math_t data_t
lowener Dec 26, 2025
0f17a0c
Add pq_bits support of [8-16]. Remove it as a template
lowener Jan 5, 2026
794e5b9
Merge branch 'main' into 25.10-pq-preprocessing
lowener Jan 5, 2026
bf238fd
Fix comment
lowener Jan 5, 2026
cb4780d
Fix copyright
lowener Jan 5, 2026
0f05512
Fix copyright 2
lowener Jan 5, 2026
b374931
Fix vamana header
lowener Jan 5, 2026
c122228
Simplify code and add helper function
lowener Jan 8, 2026
06ddb89
Remove copy_vectorized, and direct intrinsics calls. Simplify bitfield
lowener Jan 9, 2026
b9675e8
Add float2 vectorization
lowener Jan 9, 2026
d97f9b7
Merge branch 'main' into 25.10-pq-preprocessing
lowener Jan 9, 2026
6b9126e
Default std optional to nullopt
lowener Jan 12, 2026
faa7659
Merge branch 'main' into 25.10-pq-preprocessing
lowener Jan 12, 2026
05bf256
Fix reconstruct tpb and logic in shared_memory handling of non-subspa…
lowener Jan 12, 2026
0d76786
Fix shared mem for very large pq_len
lowener Jan 12, 2026
e0805da
Merge branch 'main' into 25.10-pq-preprocessing
cjnolet Jan 13, 2026
261a4b4
Fix misaligned address for vectorized load
lowener Jan 14, 2026
f0d8061
Modify params struct + deprecate trainset_fraction
lowener Jan 15, 2026
c56a8c4
Merge branch 'main' into 25.10-pq-preprocessing
lowener Jan 15, 2026
2ddae5c
Simplify use of optional, change kmeans_type to avoid name conflicts
lowener Jan 21, 2026
8052db3
Spectral Embedding with `all_neighbors` (#1693)
aamijar Jan 15, 2026
c927dec
Deduplicate `calc_chunk_indices_kernel` (#1657)
jinsolp Jan 16, 2026
60ebe3f
Prepare release/26.02
AyodeAwe Jan 16, 2026
f198537
wheel builds: react to changes in pip's handling of build constraints…
mmccarty Jan 16, 2026
4bb9435
Use raft::TxN_t
lowener Jan 21, 2026
2e13085
Use separate vector for VQ labels, switch Vamana to public PQ API
lowener Jan 22, 2026
081254c
Add note on doc
lowener Jan 22, 2026
a69e2cf
Add issue #
lowener Jan 22, 2026
6e77ab0
Fix doc
lowener Jan 22, 2026
841a3a9
pre-built libcuvs_c.so now use the new ABI major/minor values (#1708)
robertmaynard Jan 22, 2026
b95eb46
Correct base release for cuvs abi 1 major (#1724)
robertmaynard Jan 23, 2026
809bacb
Add new option to ann-bench
lowener Jan 23, 2026
3116fe3
Merge branch 'release/26.02' into 25.10-pq-preprocessing
lowener Jan 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/vamana_serialize_int8.cu
src/preprocessing/quantize/scalar.cu
src/preprocessing/quantize/binary.cu
src/preprocessing/quantize/product.cu
src/preprocessing/spectral/spectral_embedding.cu
src/selection/select_k_float_int64_t.cu
src/selection/select_k_float_int32_t.cu
Expand Down
164 changes: 164 additions & 0 deletions cpp/include/cuvs/preprocessing/quantize/product.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuvs/neighbors/ivf_pq.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/handle.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>

#include <cuda_fp16.h>

namespace cuvs::preprocessing::quantize::product {

/**
* @defgroup product Product Quantizer utilities
* @{
*/

/**
* @brief Product Quantizer parameters.
*/
struct params {
/*
* The bit length of the vector element after compression by PQ.
*
* Possible values: [4, 5, 6, 7, 8].
*/
int64_t pq_bits = 8;
int64_t pq_dim = 0;
Comment thread
lowener marked this conversation as resolved.
Outdated
cuvs::neighbors::ivf_pq::codebook_gen codebook_kind =
cuvs::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
bool force_random_rotation = false;
bool conservative_memory_allocation = false;
/**
* The max number of data points to use per PQ code during PQ codebook training. Using more data
* points per PQ code may increase the quality of PQ codebook but may also increase the build
* time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and
* PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training
* points to train each codebook.
*/
uint32_t max_train_points_per_pq_code = 256;
};

/**
 * @brief Defines and stores the product-quantizer state produced by training.
 *
 * Wraps the IVF-PQ index that holds the trained PQ codebooks (and, depending on
 * how it was built, cluster centers and rotation matrix) used by `transform()`.
 */
struct quantizer {
  /** Underlying IVF-PQ index holding the trained codebooks. */
  cuvs::neighbors::ivf_pq::index<int64_t> pq_index;
};

/**
 * @brief Trains a product quantizer to be used later for quantizing the dataset.
 *
 * Usage example:
 * @code{.cpp}
 * raft::handle_t handle;
 * cuvs::preprocessing::quantize::product::params params;
 * auto quantizer = cuvs::preprocessing::quantize::product::train(handle, params, dataset);
 * @endcode
 *
 * @param[in] res raft resource
 * @param[in] params configure the product quantizer, e.g. pq_bits and pq_dim
 * @param[in] dataset a row-major matrix view on device
 *
 * @return quantizer
 */
quantizer train(raft::resources const& res,
                const params params,
                raft::device_matrix_view<const float, int64_t> dataset);

/** @copydoc train */
// TODO: enable the half-precision overload once it is supported.
/*quantizer train(raft::resources const& res,
                const params params,
                raft::device_matrix_view<const half, int64_t> dataset);*/

/**
 * @brief Builds a product quantizer from precomputed PQ codebooks and cluster centers,
 * skipping training entirely.
 *
 * @param[in] res raft resource
 * @param[in] params configure the product quantizer, e.g. pq_bits and pq_dim
 * @param[in] dim dimensionality of the input dataset
 * @param[in] pq_centers precomputed PQ codebooks (3D mdspan on device)
 * @param[in] centers cluster centers on device
 * @param[in] centers_rot optional rotated cluster centers on device
 * @param[in] rotation_matrix optional rotation matrix on device
 *
 * @return quantizer
 */
quantizer train(
  raft::resources const& res,
  const params params,
  const uint32_t dim,
  raft::device_mdspan<const float, raft::extent_3d<uint32_t>, raft::row_major> pq_centers,
  raft::device_matrix_view<const float, uint32_t, raft::row_major> centers,
  std::optional<raft::device_matrix_view<const float, uint32_t, raft::row_major>> centers_rot,
  std::optional<raft::device_matrix_view<const float, uint32_t, raft::row_major>> rotation_matrix);

/**
 * @brief Trains a product quantizer on a host-resident dataset.
 *
 * Usage example:
 * @code{.cpp}
 * raft::handle_t handle;
 * cuvs::preprocessing::quantize::product::params params;
 * auto quantizer = cuvs::preprocessing::quantize::product::train(handle, params, dataset);
 * @endcode
 *
 * @param[in] res raft resource
 * @param[in] params configure the product quantizer, e.g. pq_bits and pq_dim
 * @param[in] dataset a row-major matrix view on host
 *
 * @return quantizer
 */
// TODO: enable the host-dataset overload once it is supported.
/*quantizer train(raft::resources const& res,
                const params params,
                raft::host_matrix_view<const float, int64_t> dataset);*/
/**
 * @brief Applies quantization transform to given dataset
 *
 * Usage example:
 * @code{.cpp}
 * raft::handle_t handle;
 * cuvs::preprocessing::quantize::product::params params;
 * auto quantizer = cuvs::preprocessing::quantize::product::train(handle, params, dataset);
 * auto quantized_dataset =
 *   raft::make_device_matrix<uint8_t, int64_t>(handle, samples, pq_dim);
 * cuvs::preprocessing::quantize::product::transform(handle, quantizer, dataset,
 *                                                   quantized_dataset.view());
 * @endcode
 *
 * @param[in] res raft resource
 * @param[in] quantizer a trained product quantizer
 * @param[in] dataset a row-major matrix view on device
 * @param[out] out a row-major matrix view on device receiving the PQ codes
 */
void transform(raft::resources const& res,
               const quantizer& quantizer,
               raft::device_matrix_view<const float, int64_t> dataset,
               raft::device_matrix_view<uint8_t, int64_t> out);

/** @copydoc transform */
// TODO: enable the half-precision overload once it is supported.
/*void transform(raft::resources const& res,
               const quantizer& quantizer,
               raft::device_matrix_view<const half, int64_t> dataset,
               raft::device_matrix_view<uint8_t, int64_t> out);*/

/** @copydoc transform */
// TODO: enable the host-dataset overload once it is supported.
/*void transform(raft::resources const& res,
               const quantizer& quantizer,
               raft::host_matrix_view<const float, int64_t> dataset,
               raft::host_matrix_view<uint8_t, int64_t> out);*/

/** @} */  // end of group product

}  // namespace cuvs::preprocessing::quantize::product
164 changes: 164 additions & 0 deletions cpp/src/preprocessing/quantize/detail/product.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "../../../core/nvtx.hpp"
#include <cuvs/preprocessing/quantize/product.hpp>
#include <raft/core/operators.hpp>
#include <raft/linalg/init.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/sample_rows.cuh>
#include <raft/random/rng.cuh>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/system/omp/execution_policy.h>

namespace cuvs::preprocessing::quantize::product::detail {

/**
 * @brief Trains a product quantizer by building a single-list IVF-PQ index over `dataset`.
 *
 * Only the trained codebooks matter here: `n_lists` is fixed to 1 and no data is
 * added at build time (`add_data_on_build = false`).
 *
 * @tparam T dataset element type
 * @param[in] res raft resource
 * @param[in] params public product-quantizer parameters, forwarded to IVF-PQ
 * @param[in] dataset a row-major matrix view on device
 * @return quantizer wrapping the trained IVF-PQ index
 */
template <typename T>
quantizer train(raft::resources const& res,
                const cuvs::preprocessing::quantize::product::params params,
                raft::device_matrix_view<const T, int64_t> dataset)
{
  auto n_rows = dataset.extent(0);
  auto dim    = dataset.extent(1);
  // Extents are int64_t: pass them as size_t to match the %zu format specifiers.
  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
    "preprocessing::quantize::product::train(%zu, %zu)", size_t(n_rows), size_t(dim));

  auto pq_params                           = cuvs::neighbors::ivf_pq::index_params();
  pq_params.n_lists                        = 1;  // single list: only the codebooks are needed
  pq_params.pq_bits                        = params.pq_bits;
  pq_params.pq_dim                         = params.pq_dim;
  pq_params.codebook_kind                  = params.codebook_kind;
  pq_params.force_random_rotation          = params.force_random_rotation;
  pq_params.conservative_memory_allocation = params.conservative_memory_allocation;
  pq_params.add_data_on_build              = false;
  pq_params.max_train_points_per_pq_code   = params.max_train_points_per_pq_code;

  auto pq_index = cuvs::neighbors::ivf_pq::build(res, pq_params, dataset);
  return cuvs::preprocessing::quantize::product::quantizer{std::move(pq_index)};
}

/**
 * @brief Builds a quantizer from precomputed PQ codebooks and cluster centers (no training).
 *
 * Marked `inline`: this is a non-template definition in a header, so without it every
 * translation unit including this file would emit its own definition (ODR violation).
 *
 * @param[in] res raft resource
 * @param[in] params public product-quantizer parameters, forwarded to IVF-PQ
 * @param[in] dim dimensionality of the input dataset
 * @param[in] pq_centers precomputed PQ codebooks
 * @param[in] centers cluster centers; extent(0) defines the number of IVF lists
 * @param[in] centers_rot optional rotated cluster centers
 * @param[in] rotation_matrix optional rotation matrix
 * @return quantizer wrapping the assembled IVF-PQ index
 */
inline quantizer train(
  raft::resources const& res,
  const cuvs::preprocessing::quantize::product::params params,
  const uint32_t dim,
  raft::device_mdspan<const float, raft::extent_3d<uint32_t>, raft::row_major> pq_centers,
  raft::device_matrix_view<const float, uint32_t, raft::row_major> centers,
  std::optional<raft::device_matrix_view<const float, uint32_t, raft::row_major>> centers_rot,
  std::optional<raft::device_matrix_view<const float, uint32_t, raft::row_major>> rotation_matrix)
{
  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
    "preprocessing::quantize::product::train()");
  auto pq_params                           = cuvs::neighbors::ivf_pq::index_params();
  pq_params.n_lists                        = centers.extent(0);
  pq_params.pq_bits                        = params.pq_bits;
  pq_params.pq_dim                         = params.pq_dim;
  pq_params.codebook_kind                  = params.codebook_kind;
  pq_params.force_random_rotation          = params.force_random_rotation;
  pq_params.conservative_memory_allocation = params.conservative_memory_allocation;
  pq_params.max_train_points_per_pq_code   = params.max_train_points_per_pq_code;

  auto pq_index = cuvs::neighbors::ivf_pq::build(
    res, pq_params, dim, pq_centers, centers, centers_rot, rotation_matrix);
  return cuvs::preprocessing::quantize::product::quantizer{std::move(pq_index)};
}

/**
 * @brief Encodes `dataset` into PQ codes using a trained quantizer.
 *
 * The dataset is added to a copy of the quantizer's IVF-PQ index via `extend`,
 * then the packed PQ codes of each list are unpacked into `out`.
 *
 * @tparam T dataset element type
 * @tparam QuantI output code element type (one byte per PQ code by default)
 * @param[in] res raft resource
 * @param[in] quantizer a trained product quantizer
 * @param[in] dataset a row-major matrix view on device
 * @param[out] out a row-major matrix view on device receiving the PQ codes
 */
template <typename T, typename QuantI = uint8_t>
void transform(raft::resources const& res,
               const quantizer& quantizer,
               raft::device_matrix_view<const T, int64_t> dataset,
               raft::device_matrix_view<QuantI, int64_t> out)
{
  // No explicit indices: extend() assigns indices to the new vectors itself.
  std::optional<raft::device_vector_view<const int64_t, int64_t>> indices_view_opt = std::nullopt;

  // TODO: Call detail::extend to avoid clone()
  auto extended_index =
    cuvs::neighbors::ivf_pq::extend(res, dataset, indices_view_opt, quantizer.pq_index);

  // Unpack every list's packed codes into the row-major output buffer.
  // NOTE(review): each list is unpacked starting at row offset 0 of `out`; with more
  // than one list this looks like it would overwrite earlier rows — confirm intended
  // behavior for multi-list quantizers.
  auto n_lists = extended_index.n_lists();
  for (uint32_t i = 0; i < n_lists; i++) {
    cuvs::neighbors::ivf_pq::helpers::codepacker::unpack_list_data(res, extended_index, out, i, 0);
  }
  // TODO: Resize the extended index lists to 0
}

// (Removed: a commented-out block copied from the scalar quantizer. It referenced
// members that do not exist in this API — `params.quantile`, `quantizer.min_`,
// `quantizer.max_`, `quantize_op` — and would mislead future maintainers.)
}  // namespace cuvs::preprocessing::quantize::product::detail
Loading