diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h index b6319fe3b0..b47af2c773 100644 --- a/cpp/include/cuvs/core/c_api.h +++ b/cpp/include/cuvs/core/c_api.h @@ -75,23 +75,6 @@ cuvsError_t cuvsResourcesCreate(cuvsResources_t* res); */ cuvsError_t cuvsResourcesDestroy(cuvsResources_t res); -/** - * @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg` - * for multi-GPU operations - * - * @param[in] res cuvsResources_t opaque C handle - * @return cuvsError_t - */ -cuvsError_t cuvsMultiGpuResourcesCreate(cuvsResources_t* res); - -/** - * @brief Destroy and de-allocate opaque C handle for C++ type `raft::device_resources_snmg` - * - * @param[in] res cuvsResources_t opaque C handle - * @return cuvsError_t - */ -cuvsError_t cuvsMultiGpuResourcesDestroy(cuvsResources_t res); - /** * @brief Set cudaStream_t on cuvsResources_t to queue CUDA kernels on APIs * that accept a cuvsResources_t handle diff --git a/cpp/scripts/gitutils.py b/cpp/scripts/gitutils.py index a7337ed4df..800d7797e8 100644 --- a/cpp/scripts/gitutils.py +++ b/cpp/scripts/gitutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -155,7 +155,7 @@ def uncommittedFiles(): ret = [] for f in files.splitlines(): f = f.strip(" ") - f = re.sub("\s+", " ", f) # noqa: W605 + f = re.sub(r"\s+", " ", f) # noqa: W605 tmp = f.split(" ", 1) # only consider staged files or uncommitted files # in other words, ignore untracked files diff --git a/cpp/src/neighbors/mg_cagra_c.cpp b/cpp/src/neighbors/mg_cagra_c.cpp index c6d05605d7..e661297b97 100644 --- a/cpp/src/neighbors/mg_cagra_c.cpp +++ b/cpp/src/neighbors/mg_cagra_c.cpp @@ -267,7 +267,12 @@ extern "C" cuvsError_t cuvsMultiGpuCagraBuild(cuvsResources_t res, cuvsMultiGpuCagraIndex_t index) { return cuvs::core::translate_exceptions([=] { - auto dataset = dataset_tensor->dl_tensor; + auto dataset = dataset_tensor->dl_tensor; + + // Multi-GPU CAGRA requires dataset to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset), + "Multi-GPU CAGRA build requires dataset to have host compatible memory"); + index->dtype.code = dataset.dtype.code; index->dtype.bits = dataset.dtype.bits; @@ -295,7 +300,29 @@ extern "C" cuvsError_t cuvsMultiGpuCagraSearch(cuvsResources_t res, DLManagedTensor* distances_tensor) { return cuvs::core::translate_exceptions([=] { - auto queries = queries_tensor->dl_tensor; + auto queries = queries_tensor->dl_tensor; + auto neighbors = neighbors_tensor->dl_tensor; + auto distances = distances_tensor->dl_tensor; + + // Multi-GPU CAGRA requires all tensors to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries), + "Multi-GPU CAGRA search requires queries to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors), + "Multi-GPU CAGRA search requires neighbors to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances), + "Multi-GPU CAGRA search requires distances to have host compatible memory"); + + // Validate data types + RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64, + "neighbors 
should be of type int64_t"); + RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32, + "distances should be of type float32"); + + // Check type compatibility between index and queries + RAFT_EXPECTS(queries.dtype.code == index->dtype.code, + "type mismatch between index and queries"); + RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits, + "type mismatch between index and queries"); if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) { _mg_search(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor); @@ -321,6 +348,25 @@ extern "C" cuvsError_t cuvsMultiGpuCagraExtend(cuvsResources_t res, return cuvs::core::translate_exceptions([=] { auto vectors = new_vectors_tensor->dl_tensor; + // Multi-GPU CAGRA requires vectors to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors), + "Multi-GPU CAGRA extend requires new_vectors to have host compatible memory"); + + // Check type compatibility between index and vectors + RAFT_EXPECTS(vectors.dtype.code == index->dtype.code, + "type mismatch between index and new_vectors"); + RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits, + "type mismatch between index and new_vectors"); + + // If indices are provided, they should also be in host memory + if (new_indices_tensor != nullptr) { + auto indices = new_indices_tensor->dl_tensor; + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices), + "Multi-GPU CAGRA extend requires new_indices to have host compatible memory"); + RAFT_EXPECTS(indices.dtype.code == kDLUInt && indices.dtype.bits == 32, + "new_indices should be of type uint32_t"); + } + if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) { _mg_extend(res, *index, new_vectors_tensor, new_indices_tensor); } else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) { diff --git a/cpp/src/neighbors/mg_ivf_flat_c.cpp b/cpp/src/neighbors/mg_ivf_flat_c.cpp index c012cb4c7c..bec2fe8149 100644 --- 
a/cpp/src/neighbors/mg_ivf_flat_c.cpp +++ b/cpp/src/neighbors/mg_ivf_flat_c.cpp @@ -264,7 +264,12 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatBuild(cuvsResources_t res, cuvsMultiGpuIvfFlatIndex_t index) { return cuvs::core::translate_exceptions([=] { - auto dataset = dataset_tensor->dl_tensor; + auto dataset = dataset_tensor->dl_tensor; + + // Multi-GPU IVF-Flat requires dataset to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset), + "Multi-GPU IVF-Flat build requires dataset to have host compatible memory"); + index->dtype.code = dataset.dtype.code; index->dtype.bits = dataset.dtype.bits; @@ -292,7 +297,29 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatSearch(cuvsResources_t res, DLManagedTensor* distances_tensor) { return cuvs::core::translate_exceptions([=] { - auto queries = queries_tensor->dl_tensor; + auto queries = queries_tensor->dl_tensor; + auto neighbors = neighbors_tensor->dl_tensor; + auto distances = distances_tensor->dl_tensor; + + // Multi-GPU IVF-Flat requires all tensors to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries), + "Multi-GPU IVF-Flat search requires queries to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors), + "Multi-GPU IVF-Flat search requires neighbors to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances), + "Multi-GPU IVF-Flat search requires distances to have host compatible memory"); + + // Validate data types + RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64, + "neighbors should be of type int64_t"); + RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32, + "distances should be of type float32"); + + // Check type compatibility between index and queries + RAFT_EXPECTS(queries.dtype.code == index->dtype.code, + "type mismatch between index and queries"); + RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits, + "type mismatch between 
index and queries"); if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) { _mg_search(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor); @@ -318,6 +345,25 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatExtend(cuvsResources_t res, return cuvs::core::translate_exceptions([=] { auto vectors = new_vectors_tensor->dl_tensor; + // Multi-GPU IVF-Flat requires vectors to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors), + "Multi-GPU IVF-Flat extend requires new_vectors to have host compatible memory"); + + // Check type compatibility between index and vectors + RAFT_EXPECTS(vectors.dtype.code == index->dtype.code, + "type mismatch between index and new_vectors"); + RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits, + "type mismatch between index and new_vectors"); + + // If indices are provided, they should also be in host memory + if (new_indices_tensor != nullptr) { + auto indices = new_indices_tensor->dl_tensor; + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices), + "Multi-GPU IVF-Flat extend requires new_indices to have host compatible memory"); + RAFT_EXPECTS(indices.dtype.code == kDLInt && indices.dtype.bits == 64, + "new_indices should be of type int64_t"); + } + if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) { _mg_extend(res, *index, new_vectors_tensor, new_indices_tensor); } else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) { diff --git a/cpp/src/neighbors/mg_ivf_pq_c.cpp b/cpp/src/neighbors/mg_ivf_pq_c.cpp index 57d11f5264..0307a659d1 100644 --- a/cpp/src/neighbors/mg_ivf_pq_c.cpp +++ b/cpp/src/neighbors/mg_ivf_pq_c.cpp @@ -256,7 +256,12 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqBuild(cuvsResources_t res, cuvsMultiGpuIvfPqIndex_t index) { return cuvs::core::translate_exceptions([=] { - auto dataset = dataset_tensor->dl_tensor; + auto dataset = dataset_tensor->dl_tensor; + + // Multi-GPU IVF-PQ requires dataset to be in host memory + 
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset), + "Multi-GPU IVF-PQ build requires dataset to have host compatible memory"); + index->dtype.code = dataset.dtype.code; index->dtype.bits = dataset.dtype.bits; @@ -284,7 +289,29 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqSearch(cuvsResources_t res, DLManagedTensor* distances_tensor) { return cuvs::core::translate_exceptions([=] { - auto queries = queries_tensor->dl_tensor; + auto queries = queries_tensor->dl_tensor; + auto neighbors = neighbors_tensor->dl_tensor; + auto distances = distances_tensor->dl_tensor; + + // Multi-GPU IVF-PQ requires all tensors to be in host memory + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries), + "Multi-GPU IVF-PQ search requires queries to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors), + "Multi-GPU IVF-PQ search requires neighbors to have host compatible memory"); + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances), + "Multi-GPU IVF-PQ search requires distances to have host compatible memory"); + + // Validate data types + RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64, + "neighbors should be of type int64_t"); + RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32, + "distances should be of type float32"); + + // Check type compatibility between index and queries + RAFT_EXPECTS(queries.dtype.code == index->dtype.code, + "type mismatch between index and queries"); + RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits, + "type mismatch between index and queries"); if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) { _mg_search(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor); @@ -310,6 +337,25 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqExtend(cuvsResources_t res, return cuvs::core::translate_exceptions([=] { auto vectors = new_vectors_tensor->dl_tensor; + // Multi-GPU IVF-PQ requires vectors to be in host memory + 
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors), + "Multi-GPU IVF-PQ extend requires new_vectors to have host compatible memory"); + + // Check type compatibility between index and vectors + RAFT_EXPECTS(vectors.dtype.code == index->dtype.code, + "type mismatch between index and new_vectors"); + RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits, + "type mismatch between index and new_vectors"); + + // If indices are provided, they should also be in host memory + if (new_indices_tensor != nullptr) { + auto indices = new_indices_tensor->dl_tensor; + RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices), + "Multi-GPU IVF-PQ extend requires new_indices to have host compatible memory"); + RAFT_EXPECTS(indices.dtype.code == kDLInt && indices.dtype.bits == 64, + "new_indices should be of type int64_t"); + } + if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) { _mg_extend(res, *index, new_vectors_tensor, new_indices_tensor); } else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) { @@ -381,28 +427,8 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqDistribute(cuvsResources_t res, cuvsMultiGpuIvfPqIndex_t index) { return cuvs::core::translate_exceptions([=] { - std::ifstream is(filename, std::ios::in | std::ios::binary); - if (!is) { RAFT_FAIL("Cannot open file %s", filename); } - char dtype_string[4]; - is.read(dtype_string, 4); - auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); - is.close(); - - index->dtype.bits = dtype.itemsize * 8; - if (dtype.kind == 'f' && dtype.itemsize == 4) { - index->dtype.code = kDLFloat; - index->addr = reinterpret_cast(_mg_distribute(res, filename)); - } else if (dtype.kind == 'f' && dtype.itemsize == 2) { - index->dtype.code = kDLFloat; - index->addr = reinterpret_cast(_mg_distribute(res, filename)); - } else if (dtype.kind == 'i' && dtype.itemsize == 1) { - index->dtype.code = kDLInt; - index->addr = reinterpret_cast(_mg_distribute(res, filename)); - } else if 
(dtype.kind == 'u' && dtype.itemsize == 1) { - index->dtype.code = kDLUInt; - index->addr = reinterpret_cast(_mg_distribute(res, filename)); - } else { - RAFT_FAIL("Unsupported index dtype"); - } + index->dtype.code = kDLFloat; + index->dtype.bits = 32; + index->addr = reinterpret_cast(_mg_distribute(res, filename)); }); } diff --git a/docs/source/python_api/neighbors.rst b/docs/source/python_api/neighbors.rst index 47e4b2044e..909f2013ad 100644 --- a/docs/source/python_api/neighbors.rst +++ b/docs/source/python_api/neighbors.rst @@ -5,9 +5,12 @@ Nearest Neighbors :language: python :class: highlight +Single-GPU Algorithms +##################### + .. toctree:: :maxdepth: 2 - :caption: Contents: + :caption: Single-GPU ANN Algorithms: neighbors_brute_force.rst neighbors_cagra.rst @@ -15,4 +18,13 @@ Nearest Neighbors neighbors_ivf_flat.rst neighbors_ivf_pq.rst neighbors_nn_decent.rst + +Multi-GPU Algorithms +#################### + +.. toctree:: + :maxdepth: 2 + :caption: Multi-GPU Distributed ANN: + + neighbors_multi_gpu.rst neighbors_all_neighbors.rst diff --git a/docs/source/python_api/neighbors_mg_cagra.rst b/docs/source/python_api/neighbors_mg_cagra.rst new file mode 100644 index 0000000000..2f03fa389b --- /dev/null +++ b/docs/source/python_api/neighbors_mg_cagra.rst @@ -0,0 +1,55 @@ +Multi-GPU CAGRA +=============== + +Multi-GPU CAGRA extends the graph-based CAGRA algorithm to work across multiple GPUs, providing improved scalability and performance for large-scale vector search. It supports both replicated and sharded distribution modes. + +.. role:: py(code) + :language: python + :class: highlight + +.. note:: + **IMPORTANT**: Multi-GPU CAGRA requires all data (datasets, queries, output arrays) to be in host memory (CPU). + If using CuPy/device arrays, transfer to host with ``array.get()`` or ``cp.asnumpy(array)`` before use. + +Index build parameters +###################### + +.. 
autoclass:: cuvs.neighbors.mg_cagra.IndexParams + :members: + +Index search parameters +####################### + +.. autoclass:: cuvs.neighbors.mg_cagra.SearchParams + :members: + +Index +##### + +.. autoclass:: cuvs.neighbors.mg_cagra.Index + :members: + +Index build +########### + +.. autofunction:: cuvs.neighbors.mg_cagra.build + +Index search +############ + +.. autofunction:: cuvs.neighbors.mg_cagra.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.mg_cagra.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.mg_cagra.load + +Index distribute +################ + +.. autofunction:: cuvs.neighbors.mg_cagra.distribute diff --git a/docs/source/python_api/neighbors_mg_ivf_flat.rst b/docs/source/python_api/neighbors_mg_ivf_flat.rst new file mode 100644 index 0000000000..37515ce546 --- /dev/null +++ b/docs/source/python_api/neighbors_mg_ivf_flat.rst @@ -0,0 +1,60 @@ +Multi-GPU IVF-Flat +================== + +Multi-GPU IVF-Flat extends the IVF-Flat algorithm to work across multiple GPUs, providing improved scalability and performance for large-scale vector search. It supports both replicated and sharded distribution modes. + +.. role:: py(code) + :language: python + :class: highlight + +.. note:: + **IMPORTANT**: Multi-GPU IVF-Flat requires all data (datasets, queries, output arrays) to be in host memory (CPU). + If using CuPy/device arrays, transfer to host with ``array.get()`` or ``cp.asnumpy(array)`` before use. + +Index build parameters +###################### + +.. autoclass:: cuvs.neighbors.mg_ivf_flat.IndexParams + :members: + +Index search parameters +####################### + +.. autoclass:: cuvs.neighbors.mg_ivf_flat.SearchParams + :members: + +Index +##### + +.. autoclass:: cuvs.neighbors.mg_ivf_flat.Index + :members: + +Index build +########### + +.. autofunction:: cuvs.neighbors.mg_ivf_flat.build + +Index search +############ + +.. autofunction:: cuvs.neighbors.mg_ivf_flat.search + +Index extend +############ + +.. 
autofunction:: cuvs.neighbors.mg_ivf_flat.extend + +Index save +########## + +.. autofunction:: cuvs.neighbors.mg_ivf_flat.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.mg_ivf_flat.load + +Index distribute +################ + +.. autofunction:: cuvs.neighbors.mg_ivf_flat.distribute diff --git a/docs/source/python_api/neighbors_mg_ivf_pq.rst b/docs/source/python_api/neighbors_mg_ivf_pq.rst new file mode 100644 index 0000000000..d7d13b4734 --- /dev/null +++ b/docs/source/python_api/neighbors_mg_ivf_pq.rst @@ -0,0 +1,60 @@ +Multi-GPU IVF-PQ +================ + +Multi-GPU IVF-PQ extends the IVF-PQ (Inverted File with Product Quantization) algorithm to work across multiple GPUs, providing improved scalability and performance for large-scale vector search. It supports both replicated and sharded distribution modes. + +.. role:: py(code) + :language: python + :class: highlight + +.. note:: + **IMPORTANT**: Multi-GPU IVF-PQ requires all data (datasets, queries, output arrays) to be in host memory (CPU). + If using CuPy/device arrays, transfer to host with ``array.get()`` or ``cp.asnumpy(array)`` before use. + +Index build parameters +###################### + +.. autoclass:: cuvs.neighbors.mg_ivf_pq.IndexParams + :members: + +Index search parameters +####################### + +.. autoclass:: cuvs.neighbors.mg_ivf_pq.SearchParams + :members: + +Index +##### + +.. autoclass:: cuvs.neighbors.mg_ivf_pq.Index + :members: + +Index build +########### + +.. autofunction:: cuvs.neighbors.mg_ivf_pq.build + +Index search +############ + +.. autofunction:: cuvs.neighbors.mg_ivf_pq.search + +Index extend +############ + +.. autofunction:: cuvs.neighbors.mg_ivf_pq.extend + +Index save +########## + +.. autofunction:: cuvs.neighbors.mg_ivf_pq.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.mg_ivf_pq.load + +Index distribute +################ + +.. 
autofunction:: cuvs.neighbors.mg_ivf_pq.distribute diff --git a/docs/source/python_api/neighbors_multi_gpu.rst b/docs/source/python_api/neighbors_multi_gpu.rst new file mode 100644 index 0000000000..e8230d7695 --- /dev/null +++ b/docs/source/python_api/neighbors_multi_gpu.rst @@ -0,0 +1,116 @@ +Multi-GPU Nearest Neighbors +=========================== + +Multi-GPU support in cuVS enables scaling ANN (Approximate Nearest Neighbors) algorithms across multiple GPUs on a single node, providing improved performance and the ability to handle larger datasets. + +.. role:: py(code) + :language: python + :class: highlight + +Overview +-------- + +The multi-GPU implementations extend the single-GPU algorithms to work across multiple GPUs using two main distribution strategies: + +- **Replicated Mode**: The entire index is replicated across all GPUs. This mode provides higher query throughput by distributing queries across GPUs while maintaining the full index on each GPU. + +- **Sharded Mode**: The index is partitioned (sharded) across GPUs. This mode allows handling larger datasets that don't fit on a single GPU by distributing the data across multiple GPUs. + +Important Notes +--------------- + +.. warning:: + **Memory Requirements**: Multi-GPU algorithms require all data to be in host memory (CPU). This is different from single-GPU algorithms that typically work with device memory. + +.. note:: + **Supported Algorithms**: Currently, multi-GPU support is available for: + + - CAGRA (Graph-based ANN) + - IVF-Flat (Inverted File with Flat storage) + - IVF-PQ (Inverted File with Product Quantization) + +Configuration Options +--------------------- + +Distribution Modes +^^^^^^^^^^^^^^^^^^ + +- **Replicated Mode** + + In replicated mode, the complete index is stored on each GPU. 
This approach: + + - Maximizes query throughput by processing queries in parallel across all GPUs + - Requires each GPU to have enough memory to store the entire index + - Is ideal for scenarios where query throughput is more important than index size limitations + +- **Sharded Mode** + + In sharded mode, the index is distributed across GPUs. This approach: + + - Enables handling of larger datasets by partitioning across GPUs + - Requires coordination between GPUs during search operations + - Is ideal for scenarios where the dataset is too large for a single GPU + +Search Modes +^^^^^^^^^^^^ + +- **Load Balancer** + + Divides each query across multiple GPUs, distributing workload efficiently to maximize performance and throughput. + +- **Round Robin** + + Distributes queries evenly across GPUs in a rotating sequence, ensuring balanced workload allocation. This mode is best suited for frequent, small-scale search operations. + +Merge Modes +^^^^^^^^^^^ + +- **Merge on Root Rank** + + Results from all GPUs are collected and merged on the root rank (typically GPU 0). + +- **Tree Merge** + + Results are merged in a tree-like fashion across GPUs to reduce communication overhead. + +Usage Examples +-------------- + +Basic Multi-GPU Usage +^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: python + + import numpy as np + from cuvs.neighbors import mg_cagra + + # Create dataset in host memory + n_samples = 100000 + n_features = 128 + dataset = np.random.random_sample((n_samples, n_features)).astype(np.float32) + + # Build multi-GPU index + build_params = mg_cagra.IndexParams( + distribution_mode="sharded", + metric="sqeuclidean" + ) + index = mg_cagra.build(build_params, dataset) + + # Search with multi-GPU + queries = np.random.random_sample((1000, n_features)).astype(np.float32) + search_params = mg_cagra.SearchParams( + search_mode="load_balancer", + merge_mode="merge_on_root_rank" + ) + distances, neighbors = mg_cagra.search(search_params, index, queries, k=10) +Algorithm-Specific Documentation +-------------------------------- + +.. toctree:: + :maxdepth: 2 + :caption: Multi-GPU Algorithms: + + neighbors_mg_cagra.rst + neighbors_mg_ivf_flat.rst + neighbors_mg_ivf_pq.rst diff --git a/python/cuvs/cuvs/neighbors/CMakeLists.txt b/python/cuvs/cuvs/neighbors/CMakeLists.txt index 0c9196dc43..ee48687d69 100644 --- a/python/cuvs/cuvs/neighbors/CMakeLists.txt +++ b/python/cuvs/cuvs/neighbors/CMakeLists.txt @@ -10,8 +10,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. 
-# ============================================================================= - +# add_subdirectory(brute_force) add_subdirectory(cagra) add_subdirectory(hnsw) @@ -20,6 +19,7 @@ add_subdirectory(ivf_pq) add_subdirectory(filters) add_subdirectory(nn_descent) add_subdirectory(tiered_index) +add_subdirectory(mg) add_subdirectory(all_neighbors) # Set the list of Cython files to build diff --git a/python/cuvs/cuvs/neighbors/__init__.py b/python/cuvs/cuvs/neighbors/__init__.py index 8ae8c5678b..b34a3b3598 100644 --- a/python/cuvs/cuvs/neighbors/__init__.py +++ b/python/cuvs/cuvs/neighbors/__init__.py @@ -20,6 +20,7 @@ filters, ivf_flat, ivf_pq, + mg, nn_descent, ) @@ -31,6 +32,7 @@ "filters", "ivf_flat", "ivf_pq", + "mg", "nn_descent", "all_neighbors", "refine", diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd index b498cf1681..b142a4b33b 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd @@ -197,3 +197,13 @@ cdef class Index: cdef cuvsCagraIndex_t index cdef bool trained cdef str active_index_type + + +cdef class IndexParams: + cdef cuvsCagraIndexParams* params + cdef public object compression + cdef public object ivf_pq_build_params + cdef public object ivf_pq_search_params + +cdef class SearchParams: + cdef cuvsCagraSearchParams * params diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index d82ea99ffa..8738e035c3 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -172,13 +172,6 @@ cdef class IndexParams: """ - cdef cuvsCagraIndexParams* params - - # hold on to a reference to the compression, to keep from being GC'ed - cdef public object compression - cdef public object ivf_pq_build_params - cdef public object ivf_pq_search_params - def __cinit__(self): check_cuvs(cuvsCagraIndexParamsCreate(&self.params)) self.compression = None @@ -186,7 +179,8 @@ cdef 
class IndexParams: self.ivf_pq_search_params = None def __dealloc__(self): - check_cuvs(cuvsCagraIndexParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsCagraIndexParamsDestroy(self.params)) def __init__(self, *, metric="sqeuclidean", @@ -475,13 +469,12 @@ cdef class SearchParams: """ - cdef cuvsCagraSearchParams * params - def __cinit__(self): check_cuvs(cuvsCagraSearchParamsCreate(&self.params)) def __dealloc__(self): - check_cuvs(cuvsCagraSearchParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsCagraSearchParamsDestroy(self.params)) def __init__(self, *, max_queries=0, diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py index f49d9eb1f0..4eaaf91d65 100644 --- a/python/cuvs/cuvs/neighbors/common.py +++ b/python/cuvs/cuvs/neighbors/common.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np + def _check_input_array( cai, exp_dt, exp_rows=None, exp_cols=None, exp_row_major=True @@ -36,3 +38,43 @@ def _check_input_array( exp_rows, cai.shape[0] ) ) + + +def _check_memory_location(array_like, expected_host=True, name="array"): + """ + Check if array is in expected memory location for multi-GPU operations. + + Parameters + ---------- + array_like : array-like + Array to check memory location of + expected_host : bool, default=True + If True, expects host memory. If False, expects device memory. 
+ name : str + Name of the array for error messages + + Raises + ------ + ValueError + If array is not in expected memory location + """ + # Check if array has __cuda_array_interface__ (device memory indicator) + has_cuda_interface = hasattr(array_like, "__cuda_array_interface__") + + # Check if array is NumPy array (host memory indicator) + is_numpy = isinstance(array_like, np.ndarray) + + if expected_host: + if has_cuda_interface and not is_numpy: + raise ValueError( + f"Multi-GPU algorithms require {name} to be in host memory " + f"(CPU), but received device memory (GPU). Please use " + f"array.get() or cp.asnumpy(array) to transfer to host memory." + ) + else: + if is_numpy and not has_cuda_interface: + raise ValueError( + f"Expected {name} to be in device memory (GPU), but received " + f"host memory (CPU). Please use cp.asarray(array) to transfer " + f"to device memory." + ) diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pxd b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pxd index 470234d9cf..2078210d30 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pxd +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pxd @@ -95,3 +95,10 @@ cdef extern from "cuvs/neighbors/ivf_flat.h" nogil: DLManagedTensor* new_vectors, DLManagedTensor* new_indices, cuvsIvfFlatIndex_t index) + + +cdef class IndexParams: + cdef cuvsIvfFlatIndexParams* params + +cdef class SearchParams: + cdef cuvsIvfFlatSearchParams* params diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx index 996a39d4fe..c5f5fce5a4 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx @@ -99,13 +99,12 @@ cdef class IndexParams: distribution of the newly added data. 
""" - cdef cuvsIvfFlatIndexParams* params - def __cinit__(self): cuvsIvfFlatIndexParamsCreate(&self.params) def __dealloc__(self): - check_cuvs(cuvsIvfFlatIndexParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsIvfFlatIndexParamsDestroy(self.params)) def __init__(self, *, n_lists=1024, @@ -284,13 +283,12 @@ cdef class SearchParams: The number of clusters to search. """ - cdef cuvsIvfFlatSearchParams* params - def __cinit__(self): cuvsIvfFlatSearchParamsCreate(&self.params) def __dealloc__(self): - check_cuvs(cuvsIvfFlatSearchParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsIvfFlatSearchParamsDestroy(self.params)) def __init__(self, *, n_probes=20): self.params.n_probes = n_probes diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd index 5ca7b97602..928a0cba1b 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd @@ -121,3 +121,11 @@ cdef extern from "cuvs/neighbors/ivf_pq.h" nogil: DLManagedTensor* new_vectors, DLManagedTensor* new_indices, cuvsIvfPqIndex_t index) + + +cdef class IndexParams: + cdef cuvsIvfPqIndexParams* params + cdef object _metric + +cdef class SearchParams: + cdef cuvsIvfPqSearchParams* params diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 5ffa0b0c88..dd3b17f949 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -123,14 +123,12 @@ cdef class IndexParams: train each codebook. """ - cdef cuvsIvfPqIndexParams* params - cdef object _metric - def __cinit__(self): cuvsIvfPqIndexParamsCreate(&self.params) def __dealloc__(self): - check_cuvs(cuvsIvfPqIndexParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsIvfPqIndexParamsDestroy(self.params)) def __init__(self, *, n_lists=1024, @@ -399,13 +397,12 @@ cdef class SearchParams: of larger memory footprint. 
""" - cdef cuvsIvfPqSearchParams* params - def __cinit__(self): cuvsIvfPqSearchParamsCreate(&self.params) def __dealloc__(self): - check_cuvs(cuvsIvfPqSearchParamsDestroy(self.params)) + if self.params != NULL: + check_cuvs(cuvsIvfPqSearchParamsDestroy(self.params)) def __init__(self, *, n_probes=20, lut_dtype=np.float32, internal_distance_dtype=np.float32, diff --git a/python/cuvs/cuvs/neighbors/mg/CMakeLists.txt b/python/cuvs/cuvs/neighbors/mg/CMakeLists.txt new file mode 100644 index 0000000000..24a2ae01a4 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# + +add_subdirectory(cagra) +add_subdirectory(ivf_flat) +add_subdirectory(ivf_pq) diff --git a/python/cuvs/cuvs/neighbors/mg/__init__.py b/python/cuvs/cuvs/neighbors/mg/__init__.py new file mode 100644 index 0000000000..a36b96d653 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from . import cagra, ivf_flat, ivf_pq + +__all__ = [ + "cagra", + "ivf_flat", + "ivf_pq", +] diff --git a/python/cuvs/cuvs/neighbors/mg/cagra/CMakeLists.txt b/python/cuvs/cuvs/neighbors/mg/cagra/CMakeLists.txt new file mode 100644 index 0000000000..4a9ffb13e0 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/cagra/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# + +# Set the list of Cython files to build +set(cython_sources cagra.pyx) +set(linked_libraries cuvs::cuvs cuvs::c_api) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_mg_cagra_ +) diff --git a/python/cuvs/cuvs/neighbors/mg/cagra/__init__.py b/python/cuvs/cuvs/neighbors/mg/cagra/__init__.py new file mode 100644 index 0000000000..597f6317fd --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/cagra/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cagra import ( + Index, + IndexParams, + SearchParams, + build, + distribute, + extend, + load, + save, + search, +) + +__all__ = [ + "Index", + "IndexParams", + "SearchParams", + "build", + "extend", + "search", + "save", + "load", + "distribute", +] diff --git a/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pxd new file mode 100644 index 0000000000..bb42c07d4a --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pxd @@ -0,0 +1,126 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: language_level=3 + +from libc.stdint cimport uint32_t +from libcpp cimport bool + +# Import base single-GPU extension module for subclassing +cimport cuvs.neighbors.cagra.cagra as _cagra +from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t +from cuvs.common.cydlpack cimport DLManagedTensor +from cuvs.neighbors.cagra.cagra cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsCagraIndexParams_t, + cuvsCagraSearchParams_t, +) + + +# Multi-GPU distribution modes +cdef extern from "cuvs/neighbors/mg_common.h" nogil: + ctypedef enum cuvsMultiGpuDistributionMode: + CUVS_NEIGHBORS_MG_REPLICATED + CUVS_NEIGHBORS_MG_SHARDED + + ctypedef enum cuvsMultiGpuReplicatedSearchMode: + CUVS_NEIGHBORS_MG_LOAD_BALANCER + CUVS_NEIGHBORS_MG_ROUND_ROBIN + + ctypedef enum cuvsMultiGpuShardedMergeMode: + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + CUVS_NEIGHBORS_MG_TREE_MERGE + +# Multi-GPU CAGRA structures and functions +cdef extern from "cuvs/neighbors/mg_cagra.h" nogil: + + cdef struct cuvsMultiGpuCagraIndexParams: + cuvsCagraIndexParams_t base_params + cuvsMultiGpuDistributionMode mode + + cdef struct cuvsMultiGpuCagraSearchParams: + cuvsCagraSearchParams_t base_params + cuvsMultiGpuReplicatedSearchMode search_mode + cuvsMultiGpuShardedMergeMode merge_mode + uint32_t n_rows_per_batch + + cdef struct cuvsMultiGpuCagraIndex: + pass + + ctypedef cuvsMultiGpuCagraIndexParams* cuvsMultiGpuCagraIndexParams_t + ctypedef cuvsMultiGpuCagraSearchParams* cuvsMultiGpuCagraSearchParams_t + ctypedef cuvsMultiGpuCagraIndex* cuvsMultiGpuCagraIndex_t + + cuvsError_t cuvsMultiGpuCagraIndexParamsCreate( + cuvsMultiGpuCagraIndexParams_t* index_params) + + cuvsError_t cuvsMultiGpuCagraIndexParamsDestroy( + cuvsMultiGpuCagraIndexParams_t index_params) + + cuvsError_t cuvsMultiGpuCagraSearchParamsCreate( + cuvsMultiGpuCagraSearchParams_t* params) + + cuvsError_t cuvsMultiGpuCagraSearchParamsDestroy( + cuvsMultiGpuCagraSearchParams_t params) + + 
cuvsError_t cuvsMultiGpuCagraIndexCreate(cuvsMultiGpuCagraIndex_t* index) + + cuvsError_t cuvsMultiGpuCagraIndexDestroy(cuvsMultiGpuCagraIndex_t index) + + cuvsError_t cuvsMultiGpuCagraBuild(cuvsResources_t res, + cuvsMultiGpuCagraIndexParams_t params, + DLManagedTensor* dataset_tensor, + cuvsMultiGpuCagraIndex_t index) except + + + cuvsError_t cuvsMultiGpuCagraSearch( + cuvsResources_t res, + cuvsMultiGpuCagraSearchParams_t params, + cuvsMultiGpuCagraIndex_t index, + DLManagedTensor* queries_tensor, + DLManagedTensor* neighbors_tensor, + DLManagedTensor* distances_tensor) except + + + cuvsError_t cuvsMultiGpuCagraSerialize( + cuvsResources_t res, + cuvsMultiGpuCagraIndex_t index, + const char* filename) except + + + cuvsError_t cuvsMultiGpuCagraDeserialize( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuCagraIndex_t index) except + + + cuvsError_t cuvsMultiGpuCagraDistribute( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuCagraIndex_t index) except + + + cuvsError_t cuvsMultiGpuCagraExtend( + cuvsResources_t res, + cuvsMultiGpuCagraIndex_t index, + DLManagedTensor* new_vectors_tensor, + DLManagedTensor* new_indices_tensor) except + + + +cdef class IndexParams(SingleGpuIndexParams): + cdef cuvsMultiGpuCagraIndexParams_t mg_params + +cdef class SearchParams(SingleGpuSearchParams): + cdef cuvsMultiGpuCagraSearchParams_t mg_params + +cdef class Index: + cdef cuvsMultiGpuCagraIndex_t mg_index + cdef bool mg_trained diff --git a/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pyx new file mode 100644 index 0000000000..6efcd0cd24 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/cagra/cagra.pyx @@ -0,0 +1,571 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: language_level=3 + +import numpy as np + +from libc.stdint cimport uint32_t +from libcpp.string cimport string + +from pylibraft.common import auto_convert_output +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible + +from cuvs.common.exceptions import check_cuvs +from cuvs.common.mg_resources import auto_sync_multi_gpu_resources +from cuvs.neighbors.common import _check_input_array, _check_memory_location + +from cuvs.common cimport cydlpack +from cuvs.common.c_api cimport cuvsResources_t +from cuvs.neighbors.cagra.cagra cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsCagraIndexParams_t, + cuvsCagraIndexParamsDestroy, + cuvsCagraSearchParams_t, + cuvsCagraSearchParamsDestroy, +) + +from .cagra cimport ( + cuvsMultiGpuCagraBuild, + cuvsMultiGpuCagraDeserialize, + cuvsMultiGpuCagraDistribute, + cuvsMultiGpuCagraExtend, + cuvsMultiGpuCagraIndex_t, + cuvsMultiGpuCagraIndexCreate, + cuvsMultiGpuCagraIndexDestroy, + cuvsMultiGpuCagraIndexParams_t, + cuvsMultiGpuCagraIndexParamsCreate, + cuvsMultiGpuCagraIndexParamsDestroy, + cuvsMultiGpuCagraSearch, + cuvsMultiGpuCagraSearchParams_t, + cuvsMultiGpuCagraSearchParamsCreate, + cuvsMultiGpuCagraSearchParamsDestroy, + cuvsMultiGpuCagraSerialize, + cuvsMultiGpuDistributionMode, + cuvsMultiGpuReplicatedSearchMode, + cuvsMultiGpuShardedMergeMode, +) + + +cdef class IndexParams(SingleGpuIndexParams): + """ + Parameters to build multi-GPU CAGRA index for efficient search. 
+    Extends single-GPU IndexParams with multi-GPU specific parameters.
+
+    Parameters
+    ----------
+    distribution_mode : str, default = "sharded"
+        Distribution mode for multi-GPU setup.
+        Valid values: ["replicated", "sharded"]
+    **kwargs : Additional parameters passed to single-GPU IndexParams
+
+    Note
+    ----
+    CAGRA currently only supports "sqeuclidean" and "inner_product" metrics.
+    """
+
+    def __cinit__(self):
+        # Base class __cinit__ has already created self.params
+        # We need to destroy it and use our embedded params instead
+        if self.params != NULL:
+            check_cuvs(cuvsCagraIndexParamsDestroy(self.params))
+
+        # Create multi-GPU params which includes embedded base params
+        check_cuvs(cuvsMultiGpuCagraIndexParamsCreate(&self.mg_params))
+        # Replace base pointer with embedded base params
+        self.params = self.mg_params.base_params
+
+    def __dealloc__(self):
+        # Guard against a failed __cinit__; mg_params owns base_params
+        if self.mg_params != NULL:
+            check_cuvs(cuvsMultiGpuCagraIndexParamsDestroy(self.mg_params))
+        self.mg_params = NULL; self.params = NULL  # base __dealloc__ must not double-free
+
+    def __init__(self, *, distribution_mode="sharded", **kwargs):
+        super().__init__(**kwargs)
+        if distribution_mode == "replicated":
+            self.mg_params.mode = CUVS_NEIGHBORS_MG_REPLICATED
+        elif distribution_mode == "sharded":
+            self.mg_params.mode = CUVS_NEIGHBORS_MG_SHARDED
+        else:
+            raise ValueError(
+                "distribution_mode must be 'replicated' or 'sharded'")
+
+    def get_handle(self):
+        return self.mg_params
+
+    @property
+    def distribution_mode(self):
+        return ("replicated" if self.mg_params.mode ==
+                CUVS_NEIGHBORS_MG_REPLICATED else "sharded")
+
+
+cdef class Index:
+    """
+    Multi-GPU CAGRA index object. Stores the trained multi-GPU CAGRA index
+    state which can be used to perform nearest neighbors searches across
+    multiple GPUs.
+ """ + + def __cinit__(self): + # Initialize multi-GPU index + check_cuvs(cuvsMultiGpuCagraIndexCreate(&self.mg_index)) + # Initialize multi-GPU trained state + self.mg_trained = False + + def __dealloc__(self): + check_cuvs(cuvsMultiGpuCagraIndexDestroy(self.mg_index)) + + def __repr__(self): + return "Index(type=MultiGpuCagra)" + + @property + def trained(self): + return self.mg_trained + + +@auto_sync_multi_gpu_resources +def build(IndexParams index_params, dataset, resources=None): + """ + Build the multi-GPU CAGRA index from the dataset for efficient search. + + Parameters + ---------- + index_params : :py:class:`cuvs.neighbors.cagra.IndexParams` + dataset : Array interface compliant matrix shape (n_samples, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU CAGRA, the dataset MUST be in host + memory (CPU). If using CuPy/device arrays, transfer to host with + array.get() or cp.asnumpy(array). + {resources_docstring} + + Returns + ------- + index: py:class:`cuvs.neighbors.cagra.Index` + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU CAGRA, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> build_params = cagra.IndexParams(metric="sqeuclidean") + >>> index = cagra.build(build_params, dataset) + >>> distances, neighbors = cagra.search(cagra.SearchParams(), + ... 
index, dataset, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + dataset_ai = wrap_array(dataset) + _check_input_array(dataset_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU CAGRA requires dataset in host memory + _check_memory_location(dataset, expected_host=True, name="dataset") + + cdef Index idx = Index() + cdef cydlpack.DLManagedTensor* dataset_dlpack = ( + cydlpack.dlpack_c(dataset_ai)) + cdef cuvsMultiGpuCagraIndexParams_t params = index_params.mg_params + + cdef cuvsResources_t res = resources.get_c_obj() + + # Build the multi-GPU index + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuCagraBuild( + res, params, dataset_dlpack, idx.mg_index)) + idx.mg_trained = True + + return idx + + +cdef class SearchParams(SingleGpuSearchParams): + """ + Parameters to search multi-GPU CAGRA index. + """ + + def __cinit__(self): + # Base class __cinit__ has already created self.params + # We need to destroy it and use our embedded params instead + if self.params != NULL: + check_cuvs(cuvsCagraSearchParamsDestroy(self.params)) + + # Create multi-GPU search params which includes embedded base params + check_cuvs(cuvsMultiGpuCagraSearchParamsCreate(&self.mg_params)) + # Replace base pointer with embedded base params + self.params = self.mg_params.base_params + + def __dealloc__(self): + # Only destroy the mg_params, which will handle base_params cleanup + check_cuvs(cuvsMultiGpuCagraSearchParamsDestroy(self.mg_params)) + self.mg_params = NULL + self.params = NULL + + def __init__(self, *, search_mode="load_balancer", + merge_mode="merge_on_root_rank", + n_rows_per_batch=1000, **kwargs): + super().__init__(**kwargs) + # Use the property setters for consistent validation + self.search_mode = search_mode + self.merge_mode = merge_mode + self.n_rows_per_batch = n_rows_per_batch + + def get_handle(self): + return self.mg_params + + @property + def search_mode(self): + """Get the search mode for 
multi-GPU search.""" + return ("load_balancer" if self.mg_params.search_mode == + CUVS_NEIGHBORS_MG_LOAD_BALANCER else "round_robin") + + @search_mode.setter + def search_mode(self, value): + """Set the search mode for multi-GPU search.""" + if value == "load_balancer": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_LOAD_BALANCER + elif value == "round_robin": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_ROUND_ROBIN + else: + raise ValueError( + "search_mode must be 'load_balancer' or 'round_robin'") + + @property + def merge_mode(self): + """Get the merge mode for multi-GPU search.""" + return ("merge_on_root_rank" if self.mg_params.merge_mode == + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK else "tree_merge") + + @merge_mode.setter + def merge_mode(self, value): + """Set the merge mode for multi-GPU search.""" + if value == "merge_on_root_rank": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + elif value == "tree_merge": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_TREE_MERGE + else: + raise ValueError( + "merge_mode must be 'merge_on_root_rank' or 'tree_merge'") + + @property + def n_rows_per_batch(self): + """Get the number of rows per batch for multi-GPU search.""" + return self.mg_params.n_rows_per_batch + + @n_rows_per_batch.setter + def n_rows_per_batch(self, value): + """Set the number of rows per batch for multi-GPU search.""" + if not isinstance(value, int) or value <= 0: + raise ValueError("n_rows_per_batch must be a positive integer") + self.mg_params.n_rows_per_batch = value + + +@auto_sync_multi_gpu_resources +@auto_convert_output +def search(SearchParams search_params, Index index, queries, + k, neighbors=None, distances=None, resources=None): + """ + Search the multi-GPU CAGRA index for the k-nearest neighbors of each query. 
+ + Parameters + ---------- + search_params : :py:class:`cuvs.neighbors.cagra.SearchParams` + index : :py:class:`cuvs.neighbors.cagra.Index` + queries : Array interface compliant matrix shape (n_queries, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU CAGRA, queries MUST be in host memory + (CPU). If using CuPy/device arrays, transfer to host with + array.get() or cp.asnumpy(array). + k : int + The number of neighbors to search for each query. + neighbors : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the indices of + the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU CAGRA. + Expected dtype: int64 + distances : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the distances + to the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU CAGRA. + {resources_docstring} + + Returns + ------- + distances : numpy.ndarray + The distances to the k-nearest neighbors for each query + (in host memory). + neighbors : numpy.ndarray + The indices of the k-nearest neighbors for each query + (in host memory). + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU CAGRA, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> queries = np.random.random_sample((n_queries, n_features)).astype( + ... np.float32) + >>> build_params = cagra.IndexParams(metric="sqeuclidean") + >>> index = cagra.build(build_params, dataset) + >>> distances, neighbors = cagra.search(cagra.SearchParams(), + ... 
index, queries, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + if not index.trained: + raise ValueError("Index needs to be built before searching") + + queries_ai = wrap_array(queries) + _check_input_array(queries_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU CAGRA requires queries in host memory + _check_memory_location(queries, expected_host=True, name="queries") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + # Prepare output arrays + cdef uint32_t n_queries = queries.shape[0] + if neighbors is None: + # For multi-GPU, create host arrays instead of device arrays + neighbors = np.empty((n_queries, k), dtype='int64') + if distances is None: + # For multi-GPU, create host arrays instead of device arrays + distances = np.empty((n_queries, k), dtype='float32') + + neighbors_ai = wrap_array(neighbors) + _check_input_array(neighbors_ai, [np.dtype('int64')], + exp_rows=n_queries, exp_cols=k) + distances_ai = wrap_array(distances) + _check_input_array(distances_ai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + # Multi-GPU CAGRA requires output arrays in host memory + _check_memory_location(neighbors, expected_host=True, + name="neighbors") + _check_memory_location(distances, expected_host=True, + name="distances") + + cdef cydlpack.DLManagedTensor* queries_dlpack = ( + cydlpack.dlpack_c(queries_ai)) + cdef cydlpack.DLManagedTensor* neighbors_dlpack = ( + cydlpack.dlpack_c(neighbors_ai)) + cdef cydlpack.DLManagedTensor* distances_dlpack = ( + cydlpack.dlpack_c(distances_ai)) + + # Perform search + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuCagraSearch( + res, search_params.mg_params, index.mg_index, queries_dlpack, + neighbors_dlpack, distances_dlpack)) + + return (distances, neighbors) + + +@auto_sync_multi_gpu_resources +def extend(Index index, new_vectors, new_indices=None, resources=None): + """ + Extend the multi-GPU CAGRA index with new 
vectors. + + Parameters + ---------- + index : :py:class:`cuvs.neighbors.cagra.Index` + new_vectors : Array interface compliant matrix shape (n_new_vectors, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU CAGRA, new_vectors MUST be in host + memory (CPU). If using CuPy/device arrays, transfer to host with + array.get() or cp.asnumpy(array). + new_indices : Array interface compliant matrix shape (n_new_vectors,), + optional + If provided, these indices will be used for the new vectors. + If not provided, indices will be automatically assigned. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU CAGRA. + Expected dtype: uint32 + {resources_docstring} + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_new_vectors = 1000 + >>> # For multi-GPU CAGRA, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> new_vectors = np.random.random_sample( + ... (n_new_vectors, n_features)).astype(np.float32) + >>> new_indices = np.arange(n_samples, n_samples + n_new_vectors, + ... 
dtype=np.uint32) + >>> build_params = cagra.IndexParams(metric="sqeuclidean") + >>> index = cagra.build(build_params, dataset) + >>> cagra.extend(index, new_vectors, new_indices) # doctest: +SKIP + """ + + if not index.trained: + raise ValueError("Index needs to be built before extending") + + new_vectors_ai = wrap_array(new_vectors) + _check_input_array(new_vectors_ai, + [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU CAGRA requires new_vectors in host memory + _check_memory_location(new_vectors, expected_host=True, name="new_vectors") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + cdef cydlpack.DLManagedTensor* new_vectors_dlpack = \ + cydlpack.dlpack_c(new_vectors_ai) + cdef cydlpack.DLManagedTensor* new_indices_dlpack = NULL + + if new_indices is not None: + new_indices_ai = wrap_array(new_indices) + _check_input_array(new_indices_ai, [np.dtype('uint32')]) + # Multi-GPU CAGRA requires new_indices in host memory + _check_memory_location(new_indices, expected_host=True, + name="new_indices") + new_indices_dlpack = cydlpack.dlpack_c(new_indices_ai) + + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuCagraExtend(res, index.mg_index, + new_vectors_dlpack, + new_indices_dlpack)) + + +@auto_sync_multi_gpu_resources +def save(Index index, filename, resources=None): + """ + Serialize the multi-GPU CAGRA index to a file. + + Parameters + ---------- + index : :py:class:`cuvs.neighbors.cagra.Index` + filename : str + The filename to serialize the index to. + {resources_docstring} + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> # For multi-GPU CAGRA, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... 
np.float32) + >>> build_params = cagra.IndexParams(metric="sqeuclidean") + >>> index = cagra.build(build_params, dataset) + >>> cagra.save(index, "index.bin") + """ + + if not index.trained: + raise ValueError("Index needs to be built before serializing") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuCagraSerialize( + res, index.mg_index, filename_str.c_str())) + + +@auto_sync_multi_gpu_resources +def load(filename, resources=None): + """ + Deserialize the multi-GPU CAGRA index from a file. + + Parameters + ---------- + filename : str + The filename to deserialize the index from. + {resources_docstring} + + Returns + ------- + index : Index + The deserialized index. + + Examples + -------- + + >>> from cuvs.neighbors.mg import cagra + >>> index = cagra.load("index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuCagraDeserialize( + res, filename_str.c_str(), index.mg_index)) + index.mg_trained = True + + return index + + +@auto_sync_multi_gpu_resources +def distribute(filename, resources=None): + """ + Distribute a single-GPU CAGRA index across multiple GPUs from a file. + + Parameters + ---------- + filename : str + The filename to distribute the index from. + {resources_docstring} + + Returns + ------- + index : Index + The distributed index. 
+ + Examples + -------- + + >>> from cuvs.neighbors.mg import cagra + >>> index = cagra.distribute("single_gpu_index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuCagraDistribute( + res, filename_str.c_str(), index.mg_index)) + index.mg_trained = True + + return index diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_flat/CMakeLists.txt b/python/cuvs/cuvs/neighbors/mg/ivf_flat/CMakeLists.txt new file mode 100644 index 0000000000..faacbfe2ea --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_flat/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# + +# Set the list of Cython files to build +set(cython_sources ivf_flat.pyx) +set(linked_libraries cuvs::cuvs cuvs::c_api) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_mg_ivf_flat_ +) diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_flat/__init__.py b/python/cuvs/cuvs/neighbors/mg/ivf_flat/__init__.py new file mode 100644 index 0000000000..e4ea5ce643 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_flat/__init__.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .ivf_flat import ( + Index, + IndexParams, + SearchParams, + build, + distribute, + extend, + load, + save, + search, +) + +__all__ = [ + "Index", + "IndexParams", + "SearchParams", + "build", + "extend", + "search", + "save", + "load", + "distribute", +] diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pxd b/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pxd new file mode 100644 index 0000000000..65dfe0db15 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pxd @@ -0,0 +1,128 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: language_level=3 + +from libc.stdint cimport int64_t, uintptr_t +from libcpp cimport bool + +# Import base single-GPU extension module for subclassing +from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t +from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor +from cuvs.neighbors.ivf_flat.ivf_flat cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsIvfFlatIndexParams_t, + cuvsIvfFlatSearchParams_t, +) + + +# Multi-GPU distribution modes +cdef extern from "cuvs/neighbors/mg_common.h" nogil: + ctypedef enum cuvsMultiGpuDistributionMode: + CUVS_NEIGHBORS_MG_REPLICATED + CUVS_NEIGHBORS_MG_SHARDED + + ctypedef enum cuvsMultiGpuReplicatedSearchMode: + CUVS_NEIGHBORS_MG_LOAD_BALANCER + CUVS_NEIGHBORS_MG_ROUND_ROBIN + + ctypedef enum cuvsMultiGpuShardedMergeMode: + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + CUVS_NEIGHBORS_MG_TREE_MERGE + +# Multi-GPU IVF-Flat structures and functions +cdef extern from "cuvs/neighbors/mg_ivf_flat.h" nogil: + cdef struct cuvsMultiGpuIvfFlatIndexParams: + cuvsIvfFlatIndexParams_t base_params + cuvsMultiGpuDistributionMode mode + + cdef struct cuvsMultiGpuIvfFlatSearchParams: + cuvsIvfFlatSearchParams_t base_params + cuvsMultiGpuReplicatedSearchMode search_mode + cuvsMultiGpuShardedMergeMode merge_mode + int64_t n_rows_per_batch + + cdef struct cuvsMultiGpuIvfFlatIndex: + uintptr_t addr + DLDataType dtype + + ctypedef cuvsMultiGpuIvfFlatIndexParams* cuvsMultiGpuIvfFlatIndexParams_t + ctypedef cuvsMultiGpuIvfFlatSearchParams* cuvsMultiGpuIvfFlatSearchParams_t + ctypedef cuvsMultiGpuIvfFlatIndex* cuvsMultiGpuIvfFlatIndex_t + + cuvsError_t cuvsMultiGpuIvfFlatIndexParamsCreate( + cuvsMultiGpuIvfFlatIndexParams_t* index_params) + + cuvsError_t cuvsMultiGpuIvfFlatIndexParamsDestroy( + cuvsMultiGpuIvfFlatIndexParams_t index_params) + + cuvsError_t cuvsMultiGpuIvfFlatSearchParamsCreate( + cuvsMultiGpuIvfFlatSearchParams_t* params) + + cuvsError_t 
cuvsMultiGpuIvfFlatSearchParamsDestroy( + cuvsMultiGpuIvfFlatSearchParams_t params) + + cuvsError_t cuvsMultiGpuIvfFlatIndexCreate( + cuvsMultiGpuIvfFlatIndex_t* index) + + cuvsError_t cuvsMultiGpuIvfFlatIndexDestroy( + cuvsMultiGpuIvfFlatIndex_t index) + + cuvsError_t cuvsMultiGpuIvfFlatBuild( + cuvsResources_t res, + cuvsMultiGpuIvfFlatIndexParams_t params, + DLManagedTensor* dataset_tensor, + cuvsMultiGpuIvfFlatIndex_t index) except + + + cuvsError_t cuvsMultiGpuIvfFlatSearch( + cuvsResources_t res, + cuvsMultiGpuIvfFlatSearchParams_t params, + cuvsMultiGpuIvfFlatIndex_t index, + DLManagedTensor* queries_tensor, + DLManagedTensor* neighbors_tensor, + DLManagedTensor* distances_tensor) except + + + cuvsError_t cuvsMultiGpuIvfFlatExtend( + cuvsResources_t res, + cuvsMultiGpuIvfFlatIndex_t index, + DLManagedTensor* new_vectors_tensor, + DLManagedTensor* new_indices_tensor) except + + + cuvsError_t cuvsMultiGpuIvfFlatSerialize( + cuvsResources_t res, + cuvsMultiGpuIvfFlatIndex_t index, + const char* filename) except + + + cuvsError_t cuvsMultiGpuIvfFlatDeserialize( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuIvfFlatIndex_t index) except + + + cuvsError_t cuvsMultiGpuIvfFlatDistribute( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuIvfFlatIndex_t index) except + + + +cdef class IndexParams(SingleGpuIndexParams): + cdef cuvsMultiGpuIvfFlatIndexParams_t mg_params + +cdef class SearchParams(SingleGpuSearchParams): + cdef cuvsMultiGpuIvfFlatSearchParams_t mg_params + +cdef class Index: + cdef cuvsMultiGpuIvfFlatIndex_t mg_index + cdef bool mg_trained diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pyx new file mode 100644 index 0000000000..e40b6b82ea --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_flat/ivf_flat.pyx @@ -0,0 +1,575 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: language_level=3 + +import numpy as np + +from libc.stdint cimport uint32_t +from libcpp.string cimport string + +from pylibraft.common import auto_convert_output +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible + +from cuvs.common.exceptions import check_cuvs +from cuvs.common.mg_resources import auto_sync_multi_gpu_resources +from cuvs.neighbors.common import _check_input_array, _check_memory_location + +from cuvs.common cimport cydlpack +from cuvs.common.c_api cimport cuvsResources_t +from cuvs.neighbors.ivf_flat.ivf_flat cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsIvfFlatIndexParams_t, + cuvsIvfFlatIndexParamsDestroy, + cuvsIvfFlatSearchParams_t, + cuvsIvfFlatSearchParamsDestroy, +) + +from .ivf_flat cimport ( + cuvsMultiGpuDistributionMode, + cuvsMultiGpuIvfFlatBuild, + cuvsMultiGpuIvfFlatDeserialize, + cuvsMultiGpuIvfFlatDistribute, + cuvsMultiGpuIvfFlatExtend, + cuvsMultiGpuIvfFlatIndex, + cuvsMultiGpuIvfFlatIndex_t, + cuvsMultiGpuIvfFlatIndexCreate, + cuvsMultiGpuIvfFlatIndexDestroy, + cuvsMultiGpuIvfFlatIndexParams, + cuvsMultiGpuIvfFlatIndexParams_t, + cuvsMultiGpuIvfFlatIndexParamsCreate, + cuvsMultiGpuIvfFlatIndexParamsDestroy, + cuvsMultiGpuIvfFlatSearch, + cuvsMultiGpuIvfFlatSearchParams, + cuvsMultiGpuIvfFlatSearchParams_t, + cuvsMultiGpuIvfFlatSearchParamsCreate, + 
cuvsMultiGpuIvfFlatSearchParamsDestroy, + cuvsMultiGpuIvfFlatSerialize, + cuvsMultiGpuReplicatedSearchMode, + cuvsMultiGpuShardedMergeMode, +) + + +cdef class IndexParams(SingleGpuIndexParams): + """ + Parameters to build multi-GPU IVF-Flat index for efficient search. + Extends single-GPU IndexParams with multi-GPU specific parameters. + + Parameters + ---------- + distribution_mode : str, default = "sharded" + Distribution mode for multi-GPU setup. + Valid values: ["replicated", "sharded"] + **kwargs : Additional parameters passed to single-GPU IndexParams + """ + + def __cinit__(self): + # Base class __cinit__ has already created self.params + # We need to destroy it and use our embedded params instead + if self.params != NULL: + check_cuvs(cuvsIvfFlatIndexParamsDestroy(self.params)) + + # Create multi-GPU params which includes embedded base params + check_cuvs(cuvsMultiGpuIvfFlatIndexParamsCreate(&self.mg_params)) + # Replace base pointer with embedded base params + self.params = self.mg_params.base_params + + def __dealloc__(self): + # Only destroy the mg_params, which will handle base_params cleanup + check_cuvs(cuvsMultiGpuIvfFlatIndexParamsDestroy(self.mg_params)) + self.mg_params = NULL + self.params = NULL + + def __init__(self, *, distribution_mode="sharded", **kwargs): + super().__init__(**kwargs) + if distribution_mode == "replicated": + self.mg_params.mode = CUVS_NEIGHBORS_MG_REPLICATED + elif distribution_mode == "sharded": + self.mg_params.mode = CUVS_NEIGHBORS_MG_SHARDED + else: + raise ValueError( + "distribution_mode must be 'replicated' or 'sharded'") + + def get_handle(self): + return self.mg_params + + @property + def distribution_mode(self): + return ("replicated" if self.mg_params.mode == + CUVS_NEIGHBORS_MG_REPLICATED else "sharded") + + +cdef class Index: + """ + Multi-GPU IVF-Flat index object. Stores the trained multi-GPU IVF-Flat + index state which can be used to perform nearest neighbors searches + across multiple GPUs. 
+ """ + + def __cinit__(self): + # Initialize multi-GPU index + check_cuvs(cuvsMultiGpuIvfFlatIndexCreate(&self.mg_index)) + # Initialize multi-GPU trained state + self.mg_trained = False + + def __dealloc__(self): + check_cuvs(cuvsMultiGpuIvfFlatIndexDestroy(self.mg_index)) + + def __repr__(self): + return "Index(type=MultiGpuIvfFlat)" + + @property + def trained(self): + return self.mg_trained + + +@auto_sync_multi_gpu_resources +def build(IndexParams index_params, dataset, resources=None): + """ + Build the multi-GPU IVF-Flat index from the dataset for efficient search. + + Parameters + ---------- + index_params : :py:class:`cuvs.neighbors.ivf_flat.\ +IndexParams` + dataset : Array interface compliant matrix shape (n_samples, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-Flat, the dataset MUST be in host + memory (CPU). If using CuPy/device arrays, transfer to host with + array.get() or cp.asnumpy(array). + {resources_docstring} + + Returns + ------- + index: py:class:`cuvs.neighbors.ivf_flat.Index` + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_flat + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU IVF-Flat, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> build_params = ivf_flat.IndexParams(metric="sqeuclidean") + >>> index = ivf_flat.build(build_params, dataset) + >>> distances, neighbors = ivf_flat.search( + ... ivf_flat.SearchParams(), + ... 
index, dataset, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + dataset_ai = wrap_array(dataset) + _check_input_array(dataset_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU IVF-Flat requires dataset in host memory + _check_memory_location(dataset, expected_host=True, name="dataset") + + cdef Index idx = Index() + cdef cydlpack.DLManagedTensor* dataset_dlpack = ( + cydlpack.dlpack_c(dataset_ai)) + cdef cuvsMultiGpuIvfFlatIndexParams_t params = index_params.mg_params + + cdef cuvsResources_t res = resources.get_c_obj() + + # Build the multi-GPU index + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuIvfFlatBuild( + res, params, dataset_dlpack, idx.mg_index)) + idx.mg_trained = True + + return idx + + +cdef class SearchParams(SingleGpuSearchParams): + """ + Parameters to search multi-GPU IVF-Flat index. + """ + + def __cinit__(self): + # Base class __cinit__ has already created self.params + # We need to destroy it and use our embedded params instead + if self.params != NULL: + check_cuvs(cuvsIvfFlatSearchParamsDestroy(self.params)) + + # Create multi-GPU search params which includes embedded base params + check_cuvs(cuvsMultiGpuIvfFlatSearchParamsCreate(&self.mg_params)) + # Replace base pointer with embedded base params + self.params = self.mg_params.base_params + + def __dealloc__(self): + # Only destroy the mg_params, which will handle base_params cleanup + check_cuvs(cuvsMultiGpuIvfFlatSearchParamsDestroy(self.mg_params)) + self.mg_params = NULL + self.params = NULL + + def __init__(self, *, n_probes=1, search_mode="load_balancer", + merge_mode="merge_on_root_rank", + n_rows_per_batch=1000, **kwargs): + super().__init__(n_probes=n_probes, **kwargs) + # Use the property setters for consistent validation + self.search_mode = search_mode + self.merge_mode = merge_mode + self.n_rows_per_batch = n_rows_per_batch + + def get_handle(self): + return self.mg_params + + @property + def 
search_mode(self): + """Get the search mode for multi-GPU search.""" + return ("load_balancer" if self.mg_params.search_mode == + CUVS_NEIGHBORS_MG_LOAD_BALANCER else "round_robin") + + @search_mode.setter + def search_mode(self, value): + """Set the search mode for multi-GPU search.""" + if value == "load_balancer": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_LOAD_BALANCER + elif value == "round_robin": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_ROUND_ROBIN + else: + raise ValueError( + "search_mode must be 'load_balancer' or 'round_robin'") + + @property + def merge_mode(self): + """Get the merge mode for multi-GPU search.""" + return ("merge_on_root_rank" if self.mg_params.merge_mode == + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK else "tree_merge") + + @merge_mode.setter + def merge_mode(self, value): + """Set the merge mode for multi-GPU search.""" + if value == "merge_on_root_rank": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + elif value == "tree_merge": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_TREE_MERGE + else: + raise ValueError( + "merge_mode must be 'merge_on_root_rank' or 'tree_merge'") + + @property + def n_rows_per_batch(self): + """Get the number of rows per batch for multi-GPU search.""" + return self.mg_params.n_rows_per_batch + + @n_rows_per_batch.setter + def n_rows_per_batch(self, value): + """Set the number of rows per batch for multi-GPU search.""" + if not isinstance(value, int) or value <= 0: + raise ValueError("n_rows_per_batch must be a positive integer") + self.mg_params.n_rows_per_batch = value + + +@auto_sync_multi_gpu_resources +@auto_convert_output +def search(SearchParams search_params, Index index, queries, + k, neighbors=None, distances=None, resources=None): + """ + Search the multi-GPU IVF-Flat index for the k-nearest neighbors + of each query. 
+ + Parameters + ---------- + search_params : :py:class:`cuvs.neighbors.ivf_flat.SearchParams` + index : :py:class:`cuvs.neighbors.ivf_flat.Index` + queries : Array interface compliant matrix shape (n_queries, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-Flat, queries MUST be + in host memory (CPU). + If using CuPy/device arrays, transfer to host with array.get() + or cp.asnumpy(array). + k : int + The number of neighbors to search for each query. + neighbors : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the indices of + the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-Flat. + distances : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the distances to + the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-Flat. + {resources_docstring} + + Returns + ------- + distances : numpy.ndarray + The distances to the k-nearest neighbors for each query + (in host memory). + neighbors : numpy.ndarray + The indices of the k-nearest neighbors for each query + (in host memory). + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_flat + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU IVF-Flat, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> queries = np.random.random_sample((n_queries, n_features)).astype( + ... np.float32) + >>> build_params = ivf_flat.IndexParams(metric="sqeuclidean") + >>> index = ivf_flat.build(build_params, dataset) + >>> distances, neighbors = ivf_flat.search( + ... ivf_flat.SearchParams(), + ... 
index, queries, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + if not index.trained: + raise ValueError("Index needs to be built before searching") + + queries_ai = wrap_array(queries) + _check_input_array(queries_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU IVF-Flat requires queries in host memory + _check_memory_location(queries, expected_host=True, name="queries") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + # Prepare output arrays + cdef uint32_t n_queries = queries.shape[0] + if neighbors is None: + # For multi-GPU, create host arrays instead of device arrays + neighbors = np.empty((n_queries, k), dtype='int64') + if distances is None: + # For multi-GPU, create host arrays instead of device arrays + distances = np.empty((n_queries, k), dtype='float32') + + neighbors_ai = wrap_array(neighbors) + _check_input_array(neighbors_ai, [np.dtype('int64')], + exp_rows=n_queries, exp_cols=k) + distances_ai = wrap_array(distances) + _check_input_array(distances_ai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + # Multi-GPU IVF-Flat requires output arrays in host memory + _check_memory_location(neighbors, expected_host=True, name="neighbors") + _check_memory_location(distances, expected_host=True, name="distances") + + cdef cydlpack.DLManagedTensor* queries_dlpack = \ + cydlpack.dlpack_c(queries_ai) + cdef cydlpack.DLManagedTensor* neighbors_dlpack = \ + cydlpack.dlpack_c(neighbors_ai) + cdef cydlpack.DLManagedTensor* distances_dlpack = \ + cydlpack.dlpack_c(distances_ai) + + # Perform search + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuIvfFlatSearch(res, search_params.mg_params, + index.mg_index, queries_dlpack, + neighbors_dlpack, + distances_dlpack)) + + return (distances, neighbors) + + +@auto_sync_multi_gpu_resources +def extend(Index index, new_vectors, new_indices=None, + resources=None): + """ + Extend the multi-GPU IVF-Flat index 
with new vectors. + + Parameters + ---------- + index : :py:class:`cuvs.neighbors.ivf_flat.Index` + new_vectors : Array interface compliant matrix shape (n_new_vectors, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-Flat, new_vectors MUST be + in host memory (CPU). If using CuPy/device arrays, transfer + to host with array.get() or cp.asnumpy(array). + new_indices : Array interface compliant matrix shape (n_new_vectors,) + , optional + If provided, these indices will be used for the new vectors. + If not provided, indices will be automatically assigned. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-Flat. + {resources_docstring} + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_flat + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_new_vectors = 1000 + >>> # For multi-GPU IVF-Flat, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> new_vectors = np.random.random_sample( + ... 
(n_new_vectors, n_features)).astype(np.float32) + >>> new_indices = np.arange(n_samples, n_samples + n_new_vectors, dtype=np.int64) + >>> build_params = ivf_flat.IndexParams(metric="sqeuclidean") + >>> index = ivf_flat.build(build_params, dataset) + >>> ivf_flat.extend(index, new_vectors, new_indices) + """ + + if not index.trained: + raise ValueError("Index needs to be built before extending") + + new_vectors_ai = wrap_array(new_vectors) + _check_input_array(new_vectors_ai, + [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU IVF-Flat requires new_vectors in host memory + _check_memory_location(new_vectors, expected_host=True, name="new_vectors") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + cdef cydlpack.DLManagedTensor* new_vectors_dlpack = \ + cydlpack.dlpack_c(new_vectors_ai) + cdef cydlpack.DLManagedTensor* new_indices_dlpack = NULL + + if new_indices is not None: + new_indices_ai = wrap_array(new_indices) + _check_input_array(new_indices_ai, [np.dtype('int64')]) + # Multi-GPU IVF-Flat requires new_indices in host memory + _check_memory_location(new_indices, expected_host=True, + name="new_indices") + new_indices_dlpack = cydlpack.dlpack_c(new_indices_ai) + + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuIvfFlatExtend(res, index.mg_index, + new_vectors_dlpack, + new_indices_dlpack)) + + +@auto_sync_multi_gpu_resources +def save(Index index, filename, resources=None): + """ + Serialize the multi-GPU IVF-Flat index to a file. + + Parameters + ---------- + index : :py:class:`cuvs.neighbors.ivf_flat.Index` + filename : str + The filename to serialize the index to. + {resources_docstring} + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_flat + >>> n_samples = 50000 + >>> n_features = 50 + >>> # For multi-GPU IVF-Flat, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... 
np.float32) + >>> build_params = ivf_flat.IndexParams(metric="sqeuclidean") + >>> index = ivf_flat.build(build_params, dataset) + >>> ivf_flat.save(index, "index.bin") + """ + + if not index.trained: + raise ValueError("Index needs to be built before serializing") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfFlatSerialize(res, + index.mg_index, + filename_str.c_str())) + + +@auto_sync_multi_gpu_resources +def load(filename, resources=None): + """ + Deserialize the multi-GPU IVF-Flat index from a file. + + Parameters + ---------- + filename : str + The filename to deserialize the index from. + {resources_docstring} + + Returns + ------- + index : Index + The deserialized index. + + Examples + -------- + + >>> from cuvs.neighbors.mg import ivf_flat + >>> index = ivf_flat.load("index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfFlatDeserialize(res, + filename_str.c_str(), + index.mg_index)) + index.mg_trained = True + + return index + + +@auto_sync_multi_gpu_resources +def distribute(filename, resources=None): + """ + Distribute a single-GPU IVF-Flat index across multiple GPUs from a file. + + Parameters + ---------- + filename : str + The filename to distribute the index from. + {resources_docstring} + + Returns + ------- + index : Index + The distributed index. 
+ + Examples + -------- + + >>> from cuvs.neighbors.mg import ivf_flat + >>> index = ivf_flat.distribute("single_gpu_index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfFlatDistribute(res, + filename_str.c_str(), + index.mg_index)) + index.mg_trained = True + + return index diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_pq/CMakeLists.txt b/python/cuvs/cuvs/neighbors/mg/ivf_pq/CMakeLists.txt new file mode 100644 index 0000000000..2b5c5a18c8 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_pq/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# + +# Set the list of Cython files to build +set(cython_sources ivf_pq.pyx) +set(linked_libraries cuvs::cuvs cuvs::c_api) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_mg_ivf_pq_ +) diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_pq/__init__.py b/python/cuvs/cuvs/neighbors/mg/ivf_pq/__init__.py new file mode 100644 index 0000000000..c75cedc267 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_pq/__init__.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .ivf_pq import ( + Index, + IndexParams, + SearchParams, + build, + distribute, + extend, + load, + save, + search, +) + +__all__ = [ + "Index", + "IndexParams", + "SearchParams", + "build", + "extend", + "search", + "save", + "load", + "distribute", +] diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pxd new file mode 100644 index 0000000000..b0a635eb52 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pxd @@ -0,0 +1,125 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: language_level=3 + +from libc.stdint cimport int64_t, uintptr_t +from libcpp cimport bool + +from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t +from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor +from cuvs.neighbors.ivf_pq.ivf_pq cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsIvfPqIndexParams_t, + cuvsIvfPqSearchParams_t, +) + +# Import base single-GPU extension module for subclassing + +# Multi-GPU distribution modes +cdef extern from "cuvs/neighbors/mg_common.h" nogil: + ctypedef enum cuvsMultiGpuDistributionMode: + CUVS_NEIGHBORS_MG_REPLICATED + CUVS_NEIGHBORS_MG_SHARDED + + ctypedef enum cuvsMultiGpuReplicatedSearchMode: + CUVS_NEIGHBORS_MG_LOAD_BALANCER + CUVS_NEIGHBORS_MG_ROUND_ROBIN + + ctypedef enum cuvsMultiGpuShardedMergeMode: + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + CUVS_NEIGHBORS_MG_TREE_MERGE + +# Multi-GPU IVF-PQ structures and functions +cdef extern from "cuvs/neighbors/mg_ivf_pq.h" nogil: + cdef struct cuvsMultiGpuIvfPqIndexParams: + cuvsIvfPqIndexParams_t base_params + cuvsMultiGpuDistributionMode mode + + cdef struct cuvsMultiGpuIvfPqSearchParams: + cuvsIvfPqSearchParams_t base_params + cuvsMultiGpuReplicatedSearchMode search_mode + cuvsMultiGpuShardedMergeMode merge_mode + int64_t n_rows_per_batch + + cdef struct cuvsMultiGpuIvfPqIndex: + uintptr_t addr + DLDataType dtype + + ctypedef cuvsMultiGpuIvfPqIndexParams* cuvsMultiGpuIvfPqIndexParams_t + ctypedef cuvsMultiGpuIvfPqSearchParams* cuvsMultiGpuIvfPqSearchParams_t + ctypedef cuvsMultiGpuIvfPqIndex* cuvsMultiGpuIvfPqIndex_t + + cuvsError_t cuvsMultiGpuIvfPqIndexParamsCreate( + cuvsMultiGpuIvfPqIndexParams_t* index_params) + + cuvsError_t cuvsMultiGpuIvfPqIndexParamsDestroy( + cuvsMultiGpuIvfPqIndexParams_t index_params) + + cuvsError_t cuvsMultiGpuIvfPqSearchParamsCreate( + cuvsMultiGpuIvfPqSearchParams_t* params) + + cuvsError_t cuvsMultiGpuIvfPqSearchParamsDestroy( + cuvsMultiGpuIvfPqSearchParams_t params) + + 
cuvsError_t cuvsMultiGpuIvfPqIndexCreate(cuvsMultiGpuIvfPqIndex_t* index) + + cuvsError_t cuvsMultiGpuIvfPqIndexDestroy(cuvsMultiGpuIvfPqIndex_t index) + + cuvsError_t cuvsMultiGpuIvfPqBuild(cuvsResources_t res, + cuvsMultiGpuIvfPqIndexParams_t params, + DLManagedTensor* dataset_tensor, + cuvsMultiGpuIvfPqIndex_t index) except + + + cuvsError_t cuvsMultiGpuIvfPqSearch( + cuvsResources_t res, + cuvsMultiGpuIvfPqSearchParams_t params, + cuvsMultiGpuIvfPqIndex_t index, + DLManagedTensor* queries_tensor, + DLManagedTensor* neighbors_tensor, + DLManagedTensor* distances_tensor) except + + + cuvsError_t cuvsMultiGpuIvfPqExtend( + cuvsResources_t res, + cuvsMultiGpuIvfPqIndex_t index, + DLManagedTensor* new_vectors_tensor, + DLManagedTensor* new_indices_tensor) except + + + cuvsError_t cuvsMultiGpuIvfPqSerialize( + cuvsResources_t res, + cuvsMultiGpuIvfPqIndex_t index, + const char* filename) except + + + cuvsError_t cuvsMultiGpuIvfPqDeserialize( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuIvfPqIndex_t index) except + + + cuvsError_t cuvsMultiGpuIvfPqDistribute( + cuvsResources_t res, + const char* filename, + cuvsMultiGpuIvfPqIndex_t index) except + + + +cdef class IndexParams(SingleGpuIndexParams): + cdef cuvsMultiGpuIvfPqIndexParams_t mg_params + +cdef class SearchParams(SingleGpuSearchParams): + cdef cuvsMultiGpuIvfPqSearchParams_t mg_params + +cdef class Index: + cdef cuvsMultiGpuIvfPqIndex_t mg_index + cdef bool mg_trained diff --git a/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pyx new file mode 100644 index 0000000000..6e137ce492 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/mg/ivf_pq/ivf_pq.pyx @@ -0,0 +1,572 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: language_level=3 + +import numpy as np + +from libc.stdint cimport uint32_t +from libcpp.string cimport string + +from pylibraft.common import auto_convert_output +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible + +from cuvs.common.exceptions import check_cuvs +from cuvs.common.mg_resources import auto_sync_multi_gpu_resources +from cuvs.neighbors.common import _check_input_array, _check_memory_location + +from cuvs.common cimport cydlpack +from cuvs.common.c_api cimport cuvsResources_t +from cuvs.neighbors.ivf_pq.ivf_pq cimport ( + IndexParams as SingleGpuIndexParams, + SearchParams as SingleGpuSearchParams, + cuvsIvfPqIndexParams_t, + cuvsIvfPqIndexParamsDestroy, + cuvsIvfPqSearchParams_t, + cuvsIvfPqSearchParamsDestroy, +) + +from .ivf_pq cimport ( + cuvsMultiGpuDistributionMode, + cuvsMultiGpuIvfPqBuild, + cuvsMultiGpuIvfPqDeserialize, + cuvsMultiGpuIvfPqDistribute, + cuvsMultiGpuIvfPqExtend, + cuvsMultiGpuIvfPqIndex, + cuvsMultiGpuIvfPqIndex_t, + cuvsMultiGpuIvfPqIndexCreate, + cuvsMultiGpuIvfPqIndexDestroy, + cuvsMultiGpuIvfPqIndexParams, + cuvsMultiGpuIvfPqIndexParams_t, + cuvsMultiGpuIvfPqIndexParamsCreate, + cuvsMultiGpuIvfPqIndexParamsDestroy, + cuvsMultiGpuIvfPqSearch, + cuvsMultiGpuIvfPqSearchParams, + cuvsMultiGpuIvfPqSearchParams_t, + cuvsMultiGpuIvfPqSearchParamsCreate, + cuvsMultiGpuIvfPqSearchParamsDestroy, + cuvsMultiGpuIvfPqSerialize, + cuvsMultiGpuReplicatedSearchMode, + cuvsMultiGpuShardedMergeMode, +) + + +cdef class IndexParams(SingleGpuIndexParams): + 
""" + Parameters to build multi-GPU IVF-PQ index for efficient search. + Extends single-GPU IndexParams with multi-GPU specific parameters. + + Parameters + ---------- + distribution_mode : str, default = "sharded" + Distribution mode for multi-GPU setup. + Valid values: ["replicated", "sharded"] + **kwargs : Additional parameters passed to single-GPU IndexParams + """ + + def __cinit__(self): + # Base class __cinit__ has already created self.params + # We need to destroy it and use our embedded params instead + if self.params != NULL: + check_cuvs(cuvsIvfPqIndexParamsDestroy(self.params)) + + # Create multi-GPU params which includes embedded base params + check_cuvs(cuvsMultiGpuIvfPqIndexParamsCreate(&self.mg_params)) + # Replace base pointer with embedded base params + self.params = self.mg_params.base_params + + def __dealloc__(self): + # Only destroy the mg_params, which will handle base_params cleanup + check_cuvs(cuvsMultiGpuIvfPqIndexParamsDestroy(self.mg_params)) + self.mg_params = NULL + self.params = NULL + + def __init__(self, *, distribution_mode="sharded", **kwargs): + super().__init__(**kwargs) + if distribution_mode == "replicated": + self.mg_params.mode = CUVS_NEIGHBORS_MG_REPLICATED + elif distribution_mode == "sharded": + self.mg_params.mode = CUVS_NEIGHBORS_MG_SHARDED + else: + raise ValueError( + "distribution_mode must be 'replicated' or 'sharded'") + + def get_handle(self): + return self.mg_params + + @property + def distribution_mode(self): + return ("replicated" if self.mg_params.mode == + CUVS_NEIGHBORS_MG_REPLICATED else "sharded") + + +cdef class Index: + """ + Multi-GPU IVF-PQ index object. Stores the trained multi-GPU IVF-PQ + index state which can be used to perform nearest neighbors searches + across multiple GPUs. 
+ """ + + def __cinit__(self): + # Initialize multi-GPU index + check_cuvs(cuvsMultiGpuIvfPqIndexCreate(&self.mg_index)) + # Initialize multi-GPU trained state + self.mg_trained = False + + def __dealloc__(self): + check_cuvs(cuvsMultiGpuIvfPqIndexDestroy(self.mg_index)) + + def __repr__(self): + return "Index(type=MultiGpuIvfPq)" + + @property + def trained(self): + return self.mg_trained + + +@auto_sync_multi_gpu_resources +def build(IndexParams index_params, dataset, resources=None): + """ + Build the multi-GPU IVF-PQ index from the dataset for efficient search. + + Parameters + ---------- + index_params : :py:class:`cuvs.neighbors.ivf_pq.IndexParams` + dataset : Array interface compliant matrix shape (n_samples, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-PQ, the dataset MUST be + in host memory (CPU). + If using CuPy/device arrays, transfer to host with array.get() + or cp.asnumpy(array). + {resources_docstring} + + Returns + ------- + index: py:class:`cuvs.neighbors.ivf_pq.Index` + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_pq + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU IVF-PQ, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> build_params = ivf_pq.IndexParams(metric="sqeuclidean") + >>> index = ivf_pq.build(build_params, dataset) + >>> distances, neighbors = ivf_pq.search( + ... ivf_pq.SearchParams(), + ... 
index, dataset, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + dataset_ai = wrap_array(dataset) + _check_input_array(dataset_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU IVF-PQ requires dataset in host memory + _check_memory_location(dataset, expected_host=True, name="dataset") + + cdef Index idx = Index() + cdef cydlpack.DLManagedTensor* dataset_dlpack = \ + cydlpack.dlpack_c(dataset_ai) + cdef cuvsMultiGpuIvfPqIndexParams_t params = index_params.mg_params + + cdef cuvsResources_t res = resources.get_c_obj() + + # Build the multi-GPU index + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuIvfPqBuild(res, params, + dataset_dlpack, + idx.mg_index)) + idx.mg_trained = True + + return idx + + +cdef class SearchParams(SingleGpuSearchParams): + """ + Parameters to search multi-GPU IVF-PQ index. + """ + + def __cinit__(self): + # Base class __cinit__ has already created self.params + # We need to destroy it and use our embedded params instead + if self.params != NULL: + check_cuvs(cuvsIvfPqSearchParamsDestroy(self.params)) + + # Create multi-GPU search params which includes embedded base params + check_cuvs(cuvsMultiGpuIvfPqSearchParamsCreate(&self.mg_params)) + # Replace base pointer with embedded base params + self.params = self.mg_params.base_params + + def __dealloc__(self): + # Only destroy the mg_params, which will handle base_params cleanup + check_cuvs(cuvsMultiGpuIvfPqSearchParamsDestroy(self.mg_params)) + self.mg_params = NULL + self.params = NULL + + def __init__(self, *, n_probes=20, search_mode="load_balancer", + merge_mode="merge_on_root_rank", n_rows_per_batch=1000, + **kwargs): + super().__init__(n_probes=n_probes, **kwargs) + # Use the property setters for consistent validation + self.search_mode = search_mode + self.merge_mode = merge_mode + self.n_rows_per_batch = n_rows_per_batch + + def get_handle(self): + return self.mg_params + + @property + def search_mode(self): 
+ """Get the search mode for multi-GPU search.""" + return ("load_balancer" if self.mg_params.search_mode == + CUVS_NEIGHBORS_MG_LOAD_BALANCER else "round_robin") + + @search_mode.setter + def search_mode(self, value): + """Set the search mode for multi-GPU search.""" + if value == "load_balancer": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_LOAD_BALANCER + elif value == "round_robin": + self.mg_params.search_mode = CUVS_NEIGHBORS_MG_ROUND_ROBIN + else: + raise ValueError( + "search_mode must be 'load_balancer' or 'round_robin'") + + @property + def merge_mode(self): + """Get the merge mode for multi-GPU search.""" + return ("merge_on_root_rank" if self.mg_params.merge_mode == + CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK else "tree_merge") + + @merge_mode.setter + def merge_mode(self, value): + """Set the merge mode for multi-GPU search.""" + if value == "merge_on_root_rank": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_MERGE_ON_ROOT_RANK + elif value == "tree_merge": + self.mg_params.merge_mode = CUVS_NEIGHBORS_MG_TREE_MERGE + else: + raise ValueError( + "merge_mode must be 'merge_on_root_rank' or 'tree_merge'") + + @property + def n_rows_per_batch(self): + """Get the number of rows per batch for multi-GPU search.""" + return self.mg_params.n_rows_per_batch + + @n_rows_per_batch.setter + def n_rows_per_batch(self, value): + """Set the number of rows per batch for multi-GPU search.""" + if not isinstance(value, int) or value <= 0: + raise ValueError("n_rows_per_batch must be a positive integer") + self.mg_params.n_rows_per_batch = value + + +@auto_sync_multi_gpu_resources +@auto_convert_output +def search(SearchParams search_params, Index index, queries, + k, neighbors=None, distances=None, resources=None): + """ + Search the multi-GPU IVF-PQ index for the k-nearest neighbors + of each query. 
+ + Parameters + ---------- + search_params : :py:class:`cuvs.neighbors.ivf_pq.SearchParams` + index : :py:class:`cuvs.neighbors.ivf_pq.Index` + queries : Array interface compliant matrix shape (n_queries, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-PQ, queries MUST be + in host memory (CPU). + If using CuPy/device arrays, transfer to host with array.get() + or cp.asnumpy(array). + k : int + The number of neighbors to search for each query. + neighbors : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the indices of + the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-PQ. + distances : Array interface compliant matrix shape (n_queries, k), optional + If provided, this array will be filled with the distances to + the k-nearest neighbors. + If not provided, a new host array will be allocated. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-PQ. + {resources_docstring} + + Returns + ------- + distances : numpy.ndarray + The distances to the k-nearest neighbors for each query + (in host memory). + neighbors : numpy.ndarray + The indices of the k-nearest neighbors for each query + (in host memory). + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_pq + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + >>> # For multi-GPU IVF-PQ, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> queries = np.random.random_sample((n_queries, n_features)).astype( + ... np.float32) + >>> build_params = ivf_pq.IndexParams(metric="sqeuclidean") + >>> index = ivf_pq.build(build_params, dataset) + >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), + ... 
index, queries, k) + >>> # Results are already in host memory (NumPy arrays) + """ + + if not index.trained: + raise ValueError("Index needs to be built before searching") + + queries_ai = wrap_array(queries) + _check_input_array(queries_ai, [np.dtype('float32'), np.dtype('float16'), + np.dtype('byte'), np.dtype('ubyte')]) + + # Multi-GPU IVF-PQ requires queries in host memory + _check_memory_location(queries, expected_host=True, name="queries") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + # Prepare output arrays + cdef uint32_t n_queries = queries.shape[0] + if neighbors is None: + # For multi-GPU, create host arrays instead of device arrays + neighbors = np.empty((n_queries, k), dtype='int64') + if distances is None: + # For multi-GPU, create host arrays instead of device arrays + distances = np.empty((n_queries, k), dtype='float32') + + neighbors_ai = wrap_array(neighbors) + _check_input_array(neighbors_ai, [np.dtype('int64')], + exp_rows=n_queries, exp_cols=k) + distances_ai = wrap_array(distances) + _check_input_array(distances_ai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + # Multi-GPU IVF-PQ requires output arrays in host memory + _check_memory_location(neighbors, expected_host=True, name="neighbors") + _check_memory_location(distances, expected_host=True, name="distances") + + cdef cydlpack.DLManagedTensor* queries_dlpack = \ + cydlpack.dlpack_c(queries_ai) + cdef cydlpack.DLManagedTensor* neighbors_dlpack = \ + cydlpack.dlpack_c(neighbors_ai) + cdef cydlpack.DLManagedTensor* distances_dlpack = \ + cydlpack.dlpack_c(distances_ai) + + # Perform search + with cuda_interruptible(): + check_cuvs(cuvsMultiGpuIvfPqSearch(res, search_params.mg_params, + index.mg_index, queries_dlpack, + neighbors_dlpack, distances_dlpack)) + + return (distances, neighbors) + + +@auto_sync_multi_gpu_resources +def extend(Index index, new_vectors, new_indices=None, + resources=None): + """ + Extend the multi-GPU IVF-PQ index with new 
vectors. + + Parameters + ---------- + index : :py:class:`cuvs.neighbors.ivf_pq.Index` + new_vectors : Array interface compliant matrix shape (n_new_vectors, dim) + Supported dtype [float32, float16, int8, uint8] + **IMPORTANT**: For multi-GPU IVF-PQ, new_vectors MUST be + in host memory (CPU). + If using CuPy/device arrays, transfer to host with array.get() + or cp.asnumpy(array). + new_indices : Array interface compliant matrix shape (n_new_vectors,) + , optional + If provided, these indices will be used for the new vectors. + If not provided, indices will be automatically assigned. + **IMPORTANT**: Must be in host memory (CPU) for multi-GPU IVF-PQ. + {resources_docstring} + + Examples + -------- + + >>> import numpy as np + >>> from cuvs.neighbors.mg import ivf_pq + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_new_vectors = 1000 + >>> # For multi-GPU IVF-PQ, use host (NumPy) arrays + >>> dataset = np.random.random_sample((n_samples, n_features)).astype( + ... np.float32) + >>> new_vectors = np.random.random_sample( + ... 
 (n_new_vectors, n_features)).astype(np.float32) +    >>> new_indices = np.arange(n_samples, n_samples + n_new_vectors, +    ...                         dtype=np.int64) +    >>> build_params = ivf_pq.IndexParams(metric="sqeuclidean") +    >>> index = ivf_pq.build(build_params, dataset) +    >>> ivf_pq.extend(index, new_vectors, new_indices) +    """ + +    if not index.trained: +        raise ValueError("Index needs to be built before extending") + +    new_vectors_ai = wrap_array(new_vectors) +    _check_input_array(new_vectors_ai, +                       [np.dtype('float32'), np.dtype('float16'), +                        np.dtype('byte'), np.dtype('ubyte')]) + +    # Multi-GPU IVF-PQ requires new_vectors in host memory +    _check_memory_location(new_vectors, expected_host=True, name="new_vectors") + +    # Get resources +    cdef cuvsResources_t res = resources.get_c_obj() + +    cdef cydlpack.DLManagedTensor* new_vectors_dlpack = \ +        cydlpack.dlpack_c(new_vectors_ai) +    cdef cydlpack.DLManagedTensor* new_indices_dlpack = NULL + +    if new_indices is not None: +        new_indices_ai = wrap_array(new_indices) +        _check_input_array(new_indices_ai, [np.dtype('int64')]) +        # Multi-GPU IVF-PQ requires new_indices in host memory +        _check_memory_location(new_indices, expected_host=True, +                               name="new_indices") +        new_indices_dlpack = cydlpack.dlpack_c(new_indices_ai) + +    with cuda_interruptible(): +        check_cuvs(cuvsMultiGpuIvfPqExtend(res, index.mg_index, +                                           new_vectors_dlpack, +                                           new_indices_dlpack)) + + +@auto_sync_multi_gpu_resources +def save(Index index, filename, resources=None): +    """ +    Serialize the multi-GPU IVF-PQ index to a file. + +    Parameters +    ---------- +    index : :py:class:`cuvs.neighbors.ivf_pq.Index` +    filename : str +        The filename to serialize the index to. +    {resources_docstring} + +    Examples +    -------- + +    >>> import numpy as np +    >>> from cuvs.neighbors.mg import ivf_pq +    >>> n_samples = 50000 +    >>> n_features = 50 +    >>> # For multi-GPU IVF-PQ, use host (NumPy) arrays +    >>> dataset = np.random.random_sample((n_samples, n_features)).astype( +    ... 
np.float32) + >>> build_params = ivf_pq.IndexParams(metric="sqeuclidean") + >>> index = ivf_pq.build(build_params, dataset) + >>> ivf_pq.save(index, "index.bin") + """ + + if not index.trained: + raise ValueError("Index needs to be built before serializing") + + # Get resources + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfPqSerialize(res, index.mg_index, + filename_str.c_str())) + + +@auto_sync_multi_gpu_resources +def load(filename, resources=None): + """ + Deserialize the multi-GPU IVF-PQ index from a file. + + Parameters + ---------- + filename : str + The filename to deserialize the index from. + {resources_docstring} + + Returns + ------- + index : Index + The deserialized index. + + Examples + -------- + + >>> from cuvs.neighbors.mg import ivf_pq + >>> index = ivf_pq.load("index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfPqDeserialize(res, filename_str.c_str(), + index.mg_index)) + index.mg_trained = True + + return index + + +@auto_sync_multi_gpu_resources +def distribute(filename, resources=None): + """ + Distribute a single-GPU IVF-PQ index across multiple GPUs from a file. + + Parameters + ---------- + filename : str + The filename to distribute the index from. + {resources_docstring} + + Returns + ------- + index : Index + The distributed index. 
+ + Examples + -------- + + >>> from cuvs.neighbors.mg import ivf_pq + >>> index = ivf_pq.distribute("single_gpu_index.bin") # doctest: +SKIP + """ + + cdef Index index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + + cdef string filename_str = filename.encode('utf-8') + check_cuvs(cuvsMultiGpuIvfPqDistribute(res, filename_str.c_str(), + index.mg_index)) + index.mg_trained = True + + return index diff --git a/python/cuvs/cuvs/tests/test_mg_cagra.py b/python/cuvs/cuvs/tests/test_mg_cagra.py new file mode 100644 index 0000000000..16d40f9c17 --- /dev/null +++ b/python/cuvs/cuvs/tests/test_mg_cagra.py @@ -0,0 +1,608 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import tempfile + +import numpy as np +import pytest +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from cuvs.common import MultiGpuResources +from cuvs.neighbors.mg import cagra as mg_cagra +from cuvs.tests.ann_utils import calc_recall, generate_data + + +# Check if multi-GPU functionality is available +def has_multiple_gpus(): + """Check if system has multiple GPUs available.""" + try: + import cupy as cp + + return cp.cuda.runtime.getDeviceCount() > 1 + except Exception: + return False + + +# Mark tests that require multiple GPUs +requires_multiple_gpus = pytest.mark.skipif( + not has_multiple_gpus(), reason="Multi-GPU tests require multiple GPUs" +) + + +def run_mg_cagra_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="sqeuclidean", # CAGRA only supports sqeuclidean and inner_product + distribution_mode="sharded", + search_mode="load_balancer", + merge_mode="tree_merge", + n_rows_per_batch=1000, + compare=True, + search_params=None, + graph_degree=64, + intermediate_graph_degree=128, +): + """ + Run a multi-GPU CAGRA build and search test. + + Note: Multi-GPU CAGRA requires host memory arrays (NumPy), not device + arrays. 
+ """ + # Generate host memory arrays (NumPy) + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + + queries = generate_data((n_queries, n_cols), dtype) + if metric == "inner_product": + queries = normalize(queries, norm="l2", axis=1) + + # Multi-GPU resources + resources = MultiGpuResources() + + # Build parameters + build_params = mg_cagra.IndexParams( + metric=metric, + distribution_mode=distribution_mode, + graph_degree=graph_degree, + intermediate_graph_degree=intermediate_graph_degree, + ) + + # Build index + index = mg_cagra.build(build_params, dataset, resources=resources) + assert index.trained + + # Search parameters + if search_params is None: + search_params = {} + search_params_obj = mg_cagra.SearchParams( + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=n_rows_per_batch, + **search_params, + ) + + # Perform search + distances, neighbors = mg_cagra.search( + search_params_obj, + index, + queries, + k, + resources=resources, + ) + + # Verify results are in host memory (NumPy arrays) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + + if not compare: + return distances, neighbors + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + }[metric] + + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + recall = calc_recall(neighbors, skl_idx) + # Multi-GPU implementation may have lower recall due to data + # distribution across GPUs + # This is acceptable as long as the functionality works correctly + assert recall > 0.3, f"Recall too low: {recall:.3f}" + + return distances, neighbors + + +@requires_multiple_gpus 
+@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize( + "metric", ["sqeuclidean"] +) # Start with just sqeuclidean +@pytest.mark.parametrize( + "distribution_mode", ["sharded"] +) # Start with just sharded +def test_mg_cagra_basic(dtype, metric, distribution_mode): + """Test basic multi-GPU CAGRA build and search functionality.""" + run_mg_cagra_build_search_test( + n_rows=2000, # Use smaller dataset for more reliable tests + n_cols=8, + n_queries=20, + k=5, + dtype=dtype, + metric=metric, + distribution_mode=distribution_mode, + graph_degree=32, # Smaller graph for faster tests + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize( + "metric", ["inner_product"] +) # Only test supported metrics +@pytest.mark.parametrize("distribution_mode", ["replicated"]) +def test_mg_cagra_additional_metrics(metric, distribution_mode): + """Test additional metrics and distribution modes.""" + run_mg_cagra_build_search_test( + n_rows=2000, + n_cols=8, + n_queries=20, + k=5, + dtype=np.float32, + metric=metric, + distribution_mode=distribution_mode, + graph_degree=32, + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("dtype", [np.float32, np.float16, np.int8, np.uint8]) +def test_mg_cagra_dtypes(dtype): + """Test multi-GPU CAGRA with different data types.""" + run_mg_cagra_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + dtype=dtype, + metric="sqeuclidean", + graph_degree=32, + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("distribution_mode", ["sharded", "replicated"]) +def test_mg_cagra_distribution_modes(distribution_mode): + """Test different distribution modes for multi-GPU CAGRA.""" + run_mg_cagra_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + distribution_mode=distribution_mode, + graph_degree=32, + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus 
+@pytest.mark.parametrize("search_mode", ["load_balancer", "round_robin"]) +@pytest.mark.parametrize("merge_mode", ["merge_on_root_rank", "tree_merge"]) +def test_mg_cagra_search_params(search_mode, merge_mode): + """Test different multi-GPU search parameters.""" + run_mg_cagra_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=500, + graph_degree=32, + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize( + "metric", ["sqeuclidean"] +) # Only test supported metrics +def test_mg_cagra_metrics(metric): + """Test different distance metrics for multi-GPU CAGRA.""" + run_mg_cagra_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + metric=metric, + graph_degree=32, + intermediate_graph_degree=64, + ) + + +@requires_multiple_gpus +def test_mg_cagra_serialize(): + """Test save/load functionality for multi-GPU CAGRA.""" + n_rows, n_cols = 2000, 8 + k = 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build original index + build_params = mg_cagra.IndexParams( + graph_degree=32, intermediate_graph_degree=64 + ) + original_index = mg_cagra.build(build_params, dataset, resources=resources) + + # Search with original index + search_params = mg_cagra.SearchParams(itopk_size=32) + orig_distances, orig_neighbors = mg_cagra.search( + search_params, original_index, queries, k, resources=resources + ) + + # Save index to temporary file + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + mg_cagra.save(original_index, temp_filename, resources=resources) + + # Load index from file + loaded_index = mg_cagra.load(temp_filename, resources=resources) + assert loaded_index.trained + + # Search with loaded index + loaded_distances, loaded_neighbors = mg_cagra.search( + 
search_params, loaded_index, queries, k, resources=resources + ) + + # Results should be identical + np.testing.assert_array_equal(orig_neighbors, loaded_neighbors) + np.testing.assert_allclose(orig_distances, loaded_distances, rtol=1e-6) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) + + +@requires_multiple_gpus +def test_mg_cagra_distribute(): + """Test distribute functionality for multi-GPU CAGRA.""" + # Note: Distribute is for replicating a single-GPU index across + # multiple GPUs. + # This test builds a single-GPU index, serializes it, then distributes it. + + n_rows, n_cols = 2000, 8 + k = 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + # Import single-GPU CAGRA to build and serialize a single-GPU index + from cuvs.common import Resources + from cuvs.neighbors import cagra + + # Build single-GPU index first + single_gpu_resources = Resources() + single_build_params = cagra.IndexParams( + metric="sqeuclidean", graph_degree=32, intermediate_graph_degree=64 + ) + + # Convert to device arrays for single-GPU build + try: + import cupy as cp + + device_dataset = cp.asarray(dataset) + single_index = cagra.build( + single_build_params, device_dataset, resources=single_gpu_resources + ) + except ImportError: + pytest.skip("CuPy not available for single-GPU index building") + + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + # Serialize single-GPU index + cagra.save(temp_filename, single_index, resources=single_gpu_resources) + + # Now distribute the single-GPU index across multiple GPUs + resources = MultiGpuResources() + distributed_index = mg_cagra.distribute( + temp_filename, resources=resources + ) + assert distributed_index.trained + + # Search should work with distributed index (using host memory arrays) + search_params = mg_cagra.SearchParams(itopk_size=32) + distances, neighbors = 
mg_cagra.search( + search_params, distributed_index, queries, k, resources=resources + ) + + assert distances.shape == (20, k) + assert neighbors.shape == (20, k) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) + + +def test_memory_location_validation(): + """Test that multi-GPU CAGRA validates memory locations correctly.""" + try: + import cupy as cp + except ImportError: + pytest.skip("CuPy not available for memory location tests") + + n_rows, n_cols = 1500, 8 + + # Create host and device arrays + host_data = generate_data((n_rows, n_cols), np.float32) + device_data = cp.asarray(host_data) + + resources = MultiGpuResources() + build_params = mg_cagra.IndexParams( + graph_degree=32, intermediate_graph_degree=64 + ) + + # Test that device arrays are rejected for build + with pytest.raises(ValueError, match="host memory"): + mg_cagra.build(build_params, device_data, resources=resources) + + # Test that host arrays work for build + index = mg_cagra.build(build_params, host_data, resources=resources) + + # Test that device arrays are rejected for search + queries = generate_data((20, n_cols), np.float32) + device_queries = cp.asarray(queries) + search_params = mg_cagra.SearchParams(itopk_size=32) + + with pytest.raises(ValueError, match="host memory"): + mg_cagra.search( + search_params, index, device_queries, 5, resources=resources + ) + + # Test that host arrays work for search + distances, neighbors = mg_cagra.search( + search_params, index, queries, 5, resources=resources + ) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + + +def test_parameter_validation(): + """Test parameter validation for multi-GPU CAGRA.""" + # Test invalid distribution mode + with pytest.raises(ValueError, match="distribution_mode must be"): + mg_cagra.IndexParams(distribution_mode="invalid") + + # Test invalid search mode + with pytest.raises(ValueError, match="search_mode must be"): + 
mg_cagra.SearchParams(search_mode="invalid") + + # Test invalid merge mode + with pytest.raises(ValueError, match="merge_mode must be"): + mg_cagra.SearchParams(merge_mode="invalid") + + +def test_parameter_properties(): + """Test that parameters can be accessed via properties.""" + # Test IndexParams properties + params = mg_cagra.IndexParams(distribution_mode="replicated") + assert params.distribution_mode == "replicated" + + params = mg_cagra.IndexParams(distribution_mode="sharded") + assert params.distribution_mode == "sharded" + + # Test SearchParams creation with different parameters + mg_cagra.SearchParams( + search_mode="round_robin", + merge_mode="tree_merge", + n_rows_per_batch=2000, + ) + # These don't have properties exposed, but creation should work + + +def test_untrained_index_error(): + """Test that using an untrained index raises appropriate errors.""" + resources = MultiGpuResources() + + # Create untrained index + index = mg_cagra.Index() + assert not index.trained + + queries = generate_data((100, 10), np.float32) + search_params = mg_cagra.SearchParams() + + # Test that search on untrained index fails + with pytest.raises(ValueError, match="Index needs to be built"): + mg_cagra.search(search_params, index, queries, 10, resources=resources) + + # Test that save on untrained index fails + with pytest.raises(ValueError, match="Index needs to be built"): + mg_cagra.save(index, "temp.bin", resources=resources) + + +@requires_multiple_gpus +def test_mg_cagra_with_prealloc_output(): + """Test multi-GPU CAGRA search with pre-allocated output arrays.""" + n_rows, n_cols = 1500, 8 + n_queries = 20 + k = 5 + + # Generate data in host memory + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((n_queries, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build index + build_params = mg_cagra.IndexParams( + graph_degree=32, intermediate_graph_degree=64 + ) + index = mg_cagra.build(build_params, dataset, 
resources=resources) + + # Pre-allocate output arrays in host memory + neighbors = np.empty((n_queries, k), dtype=np.int64) + distances = np.empty((n_queries, k), dtype=np.float32) + + # Search with pre-allocated arrays + search_params = mg_cagra.SearchParams(itopk_size=32) + ret_distances, ret_neighbors = mg_cagra.search( + search_params, + index, + queries, + k, + neighbors=neighbors, + distances=distances, + resources=resources, + ) + + # Should return the same arrays we passed in + assert ret_distances is distances + assert ret_neighbors is neighbors + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + + +def test_index_repr(): + """Test string representation of Index.""" + index = mg_cagra.Index() + assert repr(index) == "Index(type=MultiGpuCagra)" + + +def test_mg_cagra_simple(): + """Simple test to validate multi-GPU CAGRA works with very favorable + parameters. + """ + if not has_multiple_gpus(): + pytest.skip("Multi-GPU tests require multiple GPUs") + + # Use simple test case that should definitely work + n_rows, n_cols = 1000, 8 + n_queries, k = 20, 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((n_queries, n_cols), np.float32) + + resources = MultiGpuResources() + + # Use small graph for reliable testing + build_params = mg_cagra.IndexParams( + metric="sqeuclidean", + graph_degree=16, + intermediate_graph_degree=32, + ) + + # Build index + index = mg_cagra.build(build_params, dataset, resources=resources) + + # Search with basic parameters + search_params = mg_cagra.SearchParams(itopk_size=16) + distances, neighbors = mg_cagra.search( + search_params, index, queries, k, resources=resources + ) + + # Basic sanity checks + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + + # Check that we get valid neighbors + assert np.all(neighbors >= 0) + 
assert np.all(neighbors < n_rows) + + # Distances should be non-negative and sorted + assert np.all(distances >= 0) + for i in range(n_queries): + assert np.all( + distances[i, :-1] <= distances[i, 1:] + ), f"Distances not sorted for query {i}" + + +# Integration test with multiple operations +@requires_multiple_gpus +def test_mg_cagra_integration(): + """Integration test covering build, search, and serialization.""" + n_rows, n_cols = 2000, 8 + k = 5 + + # Generate initial dataset + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build initial index + build_params = mg_cagra.IndexParams( + distribution_mode="sharded", + metric="sqeuclidean", + graph_degree=32, + intermediate_graph_degree=64, + ) + index = mg_cagra.build(build_params, dataset, resources=resources) + + # Initial search + search_params = mg_cagra.SearchParams( + itopk_size=32, + search_mode="load_balancer", + merge_mode="merge_on_root_rank", + ) + distances1, neighbors1 = mg_cagra.search( + search_params, index, queries, k, resources=resources + ) + + # Save and reload + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + mg_cagra.save(index, temp_filename, resources=resources) + reloaded_index = mg_cagra.load(temp_filename, resources=resources) + + # Search with reloaded index + distances2, neighbors2 = mg_cagra.search( + search_params, reloaded_index, queries, k, resources=resources + ) + + # Results from reloaded index should match + np.testing.assert_array_equal(neighbors1, neighbors2) + np.testing.assert_allclose(distances1, distances2, rtol=1e-6) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) diff --git a/python/cuvs/cuvs/tests/test_mg_ivf_flat.py b/python/cuvs/cuvs/tests/test_mg_ivf_flat.py new file mode 100644 index 0000000000..8bec3663c0 --- /dev/null +++ b/python/cuvs/cuvs/tests/test_mg_ivf_flat.py @@ -0,0 +1,650 
@@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import tempfile + +import numpy as np +import pytest +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from cuvs.common import MultiGpuResources +from cuvs.neighbors.mg import ivf_flat as mg_ivf_flat +from cuvs.tests.ann_utils import calc_recall, generate_data + + +# Check if multi-GPU functionality is available +def has_multiple_gpus(): + """Check if system has multiple GPUs available.""" + try: + import cupy as cp + + return cp.cuda.runtime.getDeviceCount() > 1 + except Exception: + return False + + +# Mark tests that require multiple GPUs +requires_multiple_gpus = pytest.mark.skipif( + not has_multiple_gpus(), reason="Multi-GPU tests require multiple GPUs" +) + + +def run_mg_ivf_flat_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="euclidean", + distribution_mode="sharded", + search_mode="load_balancer", + merge_mode="tree_merge", + n_rows_per_batch=1000, + compare=True, + add_data_on_build=True, + search_params=None, + n_lists=None, +): + """ + Run a multi-GPU IVF-Flat build and search test. + + Note: Multi-GPU IVF-Flat requires host memory arrays (NumPy), not + device arrays. 
+ """ + # Generate host memory arrays (NumPy) + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + + queries = generate_data((n_queries, n_cols), dtype) + if metric == "inner_product": + queries = normalize(queries, norm="l2", axis=1) + + # Multi-GPU resources + resources = MultiGpuResources() + + # Build parameters - use fewer clusters for better recall + # with smaller datasets + if n_lists is None: + # Use fewer clusters for smaller datasets to ensure enough points + # per cluster + n_lists = min(1024, max(64, n_rows // 50)) + + build_params = mg_ivf_flat.IndexParams( + metric=metric, + distribution_mode=distribution_mode, + add_data_on_build=add_data_on_build, + n_lists=n_lists, + ) + + # Build index + index = mg_ivf_flat.build(build_params, dataset, resources=resources) + assert index.trained + + # If not adding data on build, extend the index + if not add_data_on_build: + dataset_1 = dataset[: n_rows // 2, :] + dataset_2 = dataset[n_rows // 2 :, :] + indices_1 = np.arange(n_rows // 2, dtype=np.int64) + indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.int64) + + mg_ivf_flat.extend(index, dataset_1, indices_1, resources=resources) + mg_ivf_flat.extend(index, dataset_2, indices_2, resources=resources) + + # Search parameters + if search_params is None: + search_params = {} + # Use higher n_probes for better recall in multi-GPU setting + if "n_probes" not in search_params: + # Use many clusters for good recall - search majority of clusters + search_params["n_probes"] = min(n_lists, max(20, (n_lists * 3) // 4)) + search_params_obj = mg_ivf_flat.SearchParams( + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=n_rows_per_batch, + **search_params, + ) + + # Perform search + distances, neighbors = mg_ivf_flat.search( + search_params_obj, + index, + queries, + k, + resources=resources, + ) + + # Verify results are in host memory (NumPy arrays) + assert 
isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + + if not compare: + return distances, neighbors + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "cosine": "cosine", + "euclidean": "euclidean", + }[metric] + + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + recall = calc_recall(neighbors, skl_idx) + # Multi-GPU implementation may have lower recall due to data distribution + # across GPUs + # This is acceptable as long as the functionality works correctly + assert recall > 0.3, ( + f"Recall too low: {recall:.3f} (n_lists={n_lists}, " + f"n_probes={search_params.get('n_probes', 'default')})" + ) + + return distances, neighbors + + +@requires_multiple_gpus +@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize( + "metric", ["sqeuclidean"] +) # Start with just sqeuclidean +@pytest.mark.parametrize( + "distribution_mode", ["sharded"] +) # Start with just sharded +def test_mg_ivf_flat_basic(dtype, metric, distribution_mode): + """Test basic multi-GPU IVF-Flat build and search functionality.""" + run_mg_ivf_flat_build_search_test( + n_rows=2000, # Use smaller dataset for more reliable tests + n_cols=8, + n_queries=20, + k=5, + dtype=dtype, + metric=metric, + distribution_mode=distribution_mode, + n_lists=50, # Fixed small number of clusters + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("metric", ["inner_product", "euclidean", "cosine"]) +@pytest.mark.parametrize("distribution_mode", ["replicated"]) +def test_mg_ivf_flat_additional_metrics(metric, distribution_mode): + """Test additional metrics and distribution modes.""" + run_mg_ivf_flat_build_search_test( + n_rows=2000, + n_cols=8, + n_queries=20, + k=5, + dtype=np.float32, + 
metric=metric, + distribution_mode=distribution_mode, + n_lists=50, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("dtype", [np.float32, np.float16, np.int8, np.uint8]) +def test_mg_ivf_flat_dtypes(dtype): + """Test multi-GPU IVF-Flat with different data types.""" + run_mg_ivf_flat_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + dtype=dtype, + metric="sqeuclidean", + n_lists=30, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("distribution_mode", ["sharded", "replicated"]) +def test_mg_ivf_flat_distribution_modes(distribution_mode): + """Test different distribution modes for multi-GPU IVF-Flat.""" + run_mg_ivf_flat_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + distribution_mode=distribution_mode, + n_lists=30, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("search_mode", ["load_balancer", "round_robin"]) +@pytest.mark.parametrize("merge_mode", ["merge_on_root_rank", "tree_merge"]) +def test_mg_ivf_flat_search_params(search_mode, merge_mode): + """Test different multi-GPU search parameters.""" + run_mg_ivf_flat_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=500, + n_lists=30, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("metric", ["euclidean", "sqeuclidean"]) +def test_mg_ivf_flat_metrics(metric): + """Test different distance metrics for multi-GPU IVF-Flat.""" + run_mg_ivf_flat_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + metric=metric, + n_lists=30, + ) + + +@requires_multiple_gpus +def test_mg_ivf_flat_extend(): + """Test extending multi-GPU IVF-Flat index with new vectors.""" + run_mg_ivf_flat_build_search_test( + n_rows=1500, + n_cols=8, + n_queries=15, + k=5, + add_data_on_build=False, + n_lists=30, + ) + + +@requires_multiple_gpus +def test_mg_ivf_flat_serialize(): + """Test save/load functionality for multi-GPU IVF-Flat.""" + n_rows, n_cols = 
2000, 8 + k = 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build original index + build_params = mg_ivf_flat.IndexParams(n_lists=50) + original_index = mg_ivf_flat.build( + build_params, dataset, resources=resources + ) + + # Search with original index + search_params = mg_ivf_flat.SearchParams(n_probes=37) + orig_distances, orig_neighbors = mg_ivf_flat.search( + search_params, original_index, queries, k, resources=resources + ) + + # Save index to temporary file + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + mg_ivf_flat.save(original_index, temp_filename, resources=resources) + + # Load index from file + loaded_index = mg_ivf_flat.load(temp_filename, resources=resources) + assert loaded_index.trained + + # Search with loaded index + loaded_distances, loaded_neighbors = mg_ivf_flat.search( + search_params, loaded_index, queries, k, resources=resources + ) + + # Results should be identical + np.testing.assert_array_equal(orig_neighbors, loaded_neighbors) + np.testing.assert_allclose(orig_distances, loaded_distances, rtol=1e-6) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) + + +@requires_multiple_gpus +def test_mg_ivf_flat_distribute(): + """Test distribute functionality for multi-GPU IVF-Flat.""" + # Note: Distribute is for replicating a single-GPU index + # across multiple GPUs. + # This test builds a single-GPU index, serializes it, then distributes it. 
+ + n_rows, n_cols = 2000, 8 + k = 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + # Import single-GPU IVF-Flat to build and serialize a single-GPU index + from cuvs.common import Resources + from cuvs.neighbors import ivf_flat + + # Build single-GPU index first + single_gpu_resources = Resources() + single_build_params = ivf_flat.IndexParams( + metric="sqeuclidean", n_lists=50 + ) + + # Convert to device arrays for single-GPU build + try: + import cupy as cp + + device_dataset = cp.asarray(dataset) + single_index = ivf_flat.build( + single_build_params, device_dataset, resources=single_gpu_resources + ) + except ImportError: + pytest.skip("CuPy not available for single-GPU index building") + + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + # Serialize single-GPU index + ivf_flat.save( + temp_filename, single_index, resources=single_gpu_resources + ) + + # Now distribute the single-GPU index across multiple GPUs + resources = MultiGpuResources() + distributed_index = mg_ivf_flat.distribute( + temp_filename, resources=resources + ) + assert distributed_index.trained + + # Search should work with distributed index (using host memory arrays) + search_params = mg_ivf_flat.SearchParams(n_probes=37) + distances, neighbors = mg_ivf_flat.search( + search_params, distributed_index, queries, k, resources=resources + ) + + assert distances.shape == (20, k) + assert neighbors.shape == (20, k) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) + + +def test_memory_location_validation(): + """Test that multi-GPU IVF-Flat validates memory locations correctly.""" + try: + import cupy as cp + except ImportError: + pytest.skip("CuPy not available for memory location tests") + + n_rows, n_cols = 1500, 8 + + # Create host and device arrays + host_data = generate_data((n_rows, n_cols), np.float32) + device_data = 
cp.asarray(host_data) + + resources = MultiGpuResources() + build_params = mg_ivf_flat.IndexParams(n_lists=30) + + # Test that device arrays are rejected for build + with pytest.raises(ValueError, match="host memory"): + mg_ivf_flat.build(build_params, device_data, resources=resources) + + # Test that host arrays work for build + index = mg_ivf_flat.build(build_params, host_data, resources=resources) + + # Test that device arrays are rejected for search + queries = generate_data((20, n_cols), np.float32) + device_queries = cp.asarray(queries) + search_params = mg_ivf_flat.SearchParams(n_probes=22) + + with pytest.raises(ValueError, match="host memory"): + mg_ivf_flat.search( + search_params, index, device_queries, 5, resources=resources + ) + + # Test that host arrays work for search + distances, neighbors = mg_ivf_flat.search( + search_params, index, queries, 5, resources=resources + ) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + + +def test_parameter_validation(): + """Test parameter validation for multi-GPU IVF-Flat.""" + # Test invalid distribution mode + with pytest.raises(ValueError, match="distribution_mode must be"): + mg_ivf_flat.IndexParams(distribution_mode="invalid") + + # Test invalid search mode + with pytest.raises(ValueError, match="search_mode must be"): + mg_ivf_flat.SearchParams(search_mode="invalid") + + # Test invalid merge mode + with pytest.raises(ValueError, match="merge_mode must be"): + mg_ivf_flat.SearchParams(merge_mode="invalid") + + +def test_parameter_properties(): + """Test that parameters can be accessed via properties.""" + # Test IndexParams properties + params = mg_ivf_flat.IndexParams(distribution_mode="replicated") + assert params.distribution_mode == "replicated" + + params = mg_ivf_flat.IndexParams(distribution_mode="sharded") + assert params.distribution_mode == "sharded" + + # Test SearchParams creation with different parameters + mg_ivf_flat.SearchParams( + 
search_mode="round_robin", + merge_mode="tree_merge", + n_rows_per_batch=2000, + ) + # These don't have properties exposed, but creation should work + + +def test_untrained_index_error(): + """Test that using an untrained index raises appropriate errors.""" + resources = MultiGpuResources() + + # Create untrained index + index = mg_ivf_flat.Index() + assert not index.trained + + queries = generate_data((100, 10), np.float32) + search_params = mg_ivf_flat.SearchParams(n_probes=20) + + # Test that search on untrained index fails + with pytest.raises(ValueError, match="Index needs to be built"): + mg_ivf_flat.search( + search_params, index, queries, 10, resources=resources + ) + + # Test that extend on untrained index fails + new_vectors = generate_data((50, 10), np.float32) + with pytest.raises(ValueError, match="Index needs to be built"): + mg_ivf_flat.extend(index, new_vectors, resources=resources) + + # Test that save on untrained index fails + with pytest.raises(ValueError, match="Index needs to be built"): + mg_ivf_flat.save(index, "temp.bin", resources=resources) + + +@requires_multiple_gpus +def test_mg_ivf_flat_with_prealloc_output(): + """Test multi-GPU IVF-Flat search with pre-allocated output arrays.""" + n_rows, n_cols = 1500, 8 # Ensure n_rows > n_lists + n_queries = 20 + k = 5 + + # Generate data in host memory + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((n_queries, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build index with fewer clusters to avoid n_rows < n_lists error + build_params = mg_ivf_flat.IndexParams(n_lists=30) + index = mg_ivf_flat.build(build_params, dataset, resources=resources) + + # Pre-allocate output arrays in host memory + neighbors = np.empty((n_queries, k), dtype=np.int64) + distances = np.empty((n_queries, k), dtype=np.float32) + + # Search with pre-allocated arrays + search_params = mg_ivf_flat.SearchParams(n_probes=20) + ret_distances, ret_neighbors = mg_ivf_flat.search( 
+ search_params, + index, + queries, + k, + neighbors=neighbors, + distances=distances, + resources=resources, + ) + + # Should return the same arrays we passed in + assert ret_distances is distances + assert ret_neighbors is neighbors + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + + +def test_index_repr(): + """Test string representation of Index.""" + index = mg_ivf_flat.Index() + assert repr(index) == "Index(type=MultiGpuIvfFlat)" + + +def test_mg_ivf_flat_simple(): + """Simple test to validate multi-GPU IVF-Flat works with very favorable + parameters. + """ + if not has_multiple_gpus(): + pytest.skip("Multi-GPU tests require multiple GPUs") + + # Use simple test case that should definitely work + n_rows, n_cols = 1000, 8 + n_queries, k = 20, 5 + + # Generate data + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((n_queries, n_cols), np.float32) + + resources = MultiGpuResources() + + # Use very few clusters for high recall + build_params = mg_ivf_flat.IndexParams( + metric="sqeuclidean", + n_lists=32, # Very few clusters + ) + + # Build index + index = mg_ivf_flat.build(build_params, dataset, resources=resources) + + # Search with many probes for maximum recall + search_params = mg_ivf_flat.SearchParams( + n_probes=32 + ) # Search all clusters + distances, neighbors = mg_ivf_flat.search( + search_params, index, queries, k, resources=resources + ) + + # Basic sanity checks + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + + # Check that we get valid neighbors + assert np.all(neighbors >= 0) + assert np.all(neighbors < n_rows) + + # Distances should be non-negative and sorted + assert np.all(distances >= 0) + for i in range(n_queries): + assert np.all( + distances[i, :-1] <= distances[i, 1:] + ), f"Distances not sorted for query {i}" + + +# Integration test with 
multiple operations +@requires_multiple_gpus +def test_mg_ivf_flat_integration(): + """Integration test covering build, search, extend, and serialization.""" + n_rows, n_cols = 2000, 8 + k = 5 + + # Generate initial dataset + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((20, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build initial index + build_params = mg_ivf_flat.IndexParams( + distribution_mode="sharded", metric="sqeuclidean", n_lists=50 + ) + index = mg_ivf_flat.build(build_params, dataset, resources=resources) + + # Initial search + search_params = mg_ivf_flat.SearchParams( + n_probes=37, + search_mode="load_balancer", + merge_mode="merge_on_root_rank", + ) + distances1, neighbors1 = mg_ivf_flat.search( + search_params, index, queries, k, resources=resources + ) + + # Extend index with new vectors + new_vectors = generate_data((200, n_cols), np.float32) + # Provide indices for extend operation on non-empty index + new_indices = np.arange(n_rows, n_rows + 200, dtype=np.int64) + mg_ivf_flat.extend(index, new_vectors, new_indices, resources=resources) + + # Search after extend + distances2, neighbors2 = mg_ivf_flat.search( + search_params, index, queries, k, resources=resources + ) + + # Save and reload + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + temp_filename = f.name + + try: + mg_ivf_flat.save(index, temp_filename, resources=resources) + reloaded_index = mg_ivf_flat.load(temp_filename, resources=resources) + + # Search with reloaded index + distances3, neighbors3 = mg_ivf_flat.search( + search_params, reloaded_index, queries, k, resources=resources + ) + + # Results from extended and reloaded index should match + np.testing.assert_array_equal(neighbors2, neighbors3) + np.testing.assert_allclose(distances2, distances3, rtol=1e-6) + + finally: + if os.path.exists(temp_filename): + os.unlink(temp_filename) diff --git a/python/cuvs/cuvs/tests/test_mg_ivf_pq.py 
b/python/cuvs/cuvs/tests/test_mg_ivf_pq.py new file mode 100644 index 0000000000..382fb9eed7 --- /dev/null +++ b/python/cuvs/cuvs/tests/test_mg_ivf_pq.py @@ -0,0 +1,682 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import tempfile + +import numpy as np +import pytest +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from cuvs.common import MultiGpuResources +from cuvs.neighbors.mg import ivf_pq as mg_ivf_pq +from cuvs.tests.ann_utils import calc_recall, generate_data + + +# Check if multi-GPU functionality is available +def has_multiple_gpus(): + """Check if system has multiple GPUs available.""" + try: + import cupy as cp + + return cp.cuda.runtime.getDeviceCount() > 1 + except Exception: + return False + + +# Mark tests that require multiple GPUs +requires_multiple_gpus = pytest.mark.skipif( + not has_multiple_gpus(), reason="Multi-GPU tests require multiple GPUs" +) + + +def run_mg_ivf_pq_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="euclidean", + distribution_mode="sharded", + search_mode="load_balancer", + merge_mode="tree_merge", + n_rows_per_batch=1000, + compare=True, + add_data_on_build=True, + search_params=None, + n_lists=None, + pq_bits=8, + pq_dim=0, + codebook_kind="subspace", +): + """ + Run a multi-GPU IVF-PQ build and search test. 
+ + Note: Multi-GPU IVF-PQ requires host memory arrays (NumPy), not device + arrays. + """ + # Generate host memory arrays (NumPy) + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + + queries = generate_data((n_queries, n_cols), dtype) + if metric == "inner_product": + queries = normalize(queries, norm="l2", axis=1) + + # Multi-GPU resources + resources = MultiGpuResources() + + # Build parameters - use fewer clusters for better recall with smaller + # datasets + if n_lists is None: + # Use fewer clusters for smaller datasets to ensure enough points per + # cluster + n_lists = min(1024, max(64, n_rows // 50)) + + build_params = mg_ivf_pq.IndexParams( + metric=metric, + distribution_mode=distribution_mode, + add_data_on_build=add_data_on_build, + n_lists=n_lists, + pq_bits=pq_bits, + pq_dim=pq_dim, + codebook_kind=codebook_kind, + ) + + # Build index + index = mg_ivf_pq.build(build_params, dataset, resources=resources) + assert index.trained + + # If not adding data on build, extend the index + if not add_data_on_build: + dataset_1 = dataset[: n_rows // 2, :] + dataset_2 = dataset[n_rows // 2 :, :] + indices_1 = np.arange(n_rows // 2, dtype=np.int64) + indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.int64) + + mg_ivf_pq.extend(index, dataset_1, indices_1, resources=resources) + mg_ivf_pq.extend(index, dataset_2, indices_2, resources=resources) + + # Search parameters + if search_params is None: + search_params = {} + # Use higher n_probes for better recall in multi-GPU setting + if "n_probes" not in search_params: + # Use many clusters for good recall - search majority of clusters + search_params["n_probes"] = min(n_lists, max(20, (n_lists * 3) // 4)) + search_params_obj = mg_ivf_pq.SearchParams( + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=n_rows_per_batch, + **search_params, + ) + + # Perform search + distances, neighbors = mg_ivf_pq.search( + 
search_params_obj, + index, + queries, + k, + resources=resources, + ) + + # Verify results are in host memory (NumPy arrays) + assert isinstance(distances, np.ndarray) + assert isinstance(neighbors, np.ndarray) + assert distances.shape == (n_queries, k) + assert neighbors.shape == (n_queries, k) + + if not compare: + return distances, neighbors + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "cosine": "cosine", + "euclidean": "euclidean", + }[metric] + + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + recall = calc_recall(neighbors, skl_idx) + + return distances, neighbors, recall + + +@requires_multiple_gpus +@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize( + "metric", ["sqeuclidean"] +) # Start with just sqeuclidean +@pytest.mark.parametrize( + "distribution_mode", ["sharded"] +) # Start with just sharded +def test_mg_ivf_pq_basic(dtype, metric, distribution_mode): + """Test basic multi-GPU IVF-PQ build and search functionality.""" + run_mg_ivf_pq_build_search_test( + n_rows=2000, # Use smaller dataset for more reliable tests + n_cols=32, + n_queries=20, + k=5, + dtype=dtype, + metric=metric, + distribution_mode=distribution_mode, + n_lists=50, # Fixed small number of clusters + compare=True, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("metric", ["inner_product", "euclidean", "cosine"]) +@pytest.mark.parametrize("distribution_mode", ["replicated"]) +def test_mg_ivf_pq_additional_metrics(metric, distribution_mode): + """Test additional metrics and distribution modes for IVF-PQ.""" + run_mg_ivf_pq_build_search_test( + n_rows=2000, + n_cols=32, + n_queries=20, + k=5, + dtype=np.float32, + metric=metric, + distribution_mode=distribution_mode, + n_lists=50, + compare=False, # PQ may have lower recall, don't enforce strict 
recall + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("dtype", [np.float32, np.float16, np.int8, np.uint8]) +def test_mg_ivf_pq_dtypes(dtype): + """Test multi-GPU IVF-PQ with different data types.""" + run_mg_ivf_pq_build_search_test( + n_rows=1500, + n_cols=32, + n_queries=15, + k=5, + dtype=dtype, + metric="sqeuclidean", + n_lists=30, + compare=False, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("distribution_mode", ["sharded", "replicated"]) +def test_mg_ivf_pq_distribution_modes(distribution_mode): + """Test different distribution modes for multi-GPU IVF-PQ.""" + run_mg_ivf_pq_build_search_test( + n_rows=1500, + n_cols=32, + n_queries=15, + k=5, + distribution_mode=distribution_mode, + n_lists=30, + compare=False, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("search_mode", ["load_balancer", "round_robin"]) +@pytest.mark.parametrize("merge_mode", ["merge_on_root_rank", "tree_merge"]) +def test_mg_ivf_pq_search_params(search_mode, merge_mode): + """Test different multi-GPU search parameters for IVF-PQ.""" + run_mg_ivf_pq_build_search_test( + n_rows=1500, + n_cols=32, + n_queries=15, + k=5, + search_mode=search_mode, + merge_mode=merge_mode, + n_rows_per_batch=500, + n_lists=30, + compare=False, + ) + + +@requires_multiple_gpus +def test_mg_ivf_pq_pq_parameters(): + """Test different PQ-specific parameters.""" + for pq_bits in [4, 8]: + for pq_dim in [0, 8, 16]: # 0 means auto-select + for codebook_kind in ["subspace", "cluster"]: + run_mg_ivf_pq_build_search_test( + n_rows=1000, + n_cols=32, + n_queries=100, + k=10, + pq_bits=pq_bits, + pq_dim=pq_dim, + codebook_kind=codebook_kind, + compare=False, + ) + + +@requires_multiple_gpus +@pytest.mark.parametrize("metric", ["euclidean", "sqeuclidean"]) +def test_mg_ivf_pq_metrics(metric): + """Test different distance metrics for multi-GPU IVF-PQ.""" + run_mg_ivf_pq_build_search_test( + n_rows=1500, + n_cols=32, + n_queries=15, + k=5, + metric=metric, + n_lists=30, + compare=False, 
+ ) + + +@requires_multiple_gpus +def test_mg_ivf_pq_extend(): + """Test extending index with new vectors.""" + run_mg_ivf_pq_build_search_test( + n_rows=1000, + n_cols=32, + n_queries=100, + k=10, + add_data_on_build=False, # This triggers extend functionality + compare=False, + ) + + +@requires_multiple_gpus +def test_mg_ivf_pq_serialize(): + """Test serialization and deserialization.""" + # Generate data + n_rows, n_cols = 1000, 32 + dataset = generate_data((n_rows, n_cols), np.float32) + queries = generate_data((100, n_cols), np.float32) + + resources = MultiGpuResources() + + # Build index + build_params = mg_ivf_pq.IndexParams( + metric="euclidean", + n_lists=100, + pq_bits=8, + pq_dim=16, + ) + index = mg_ivf_pq.build(build_params, dataset, resources=resources) + + # Search before serialization + search_params = mg_ivf_pq.SearchParams(n_probes=50) + distances_1, neighbors_1 = mg_ivf_pq.search( + search_params, index, queries, 10, resources=resources + ) + + # Serialize + with tempfile.NamedTemporaryFile(delete=False) as f: + filename = f.name + + try: + mg_ivf_pq.save(index, filename, resources=resources) + + # Load index + index_loaded = mg_ivf_pq.load(filename, resources=resources) + assert index_loaded.trained + + # Search after loading + distances_2, neighbors_2 = mg_ivf_pq.search( + search_params, index_loaded, queries, 10, resources=resources + ) + + # Results should be the same + assert np.array_equal(distances_1, distances_2) + assert np.array_equal(neighbors_1, neighbors_2) + + finally: + if os.path.exists(filename): + os.unlink(filename) + + +@requires_multiple_gpus +def test_mg_ivf_pq_distribute(): + """Test distribute functionality for multi-GPU IVF-PQ.""" + # Note: Distribute is for replicating a single-GPU index across multiple + # GPUs. + # This test builds a single-GPU index, serializes it, then distributes it. + # Multi-GPU distribute only supports float32 indexes. 
# --- continuation of a test whose `def` line is above this chunk ---
# (presumably a "distribute a serialized single-GPU IVF-PQ index" test;
#  header not visible here — confirm against the full file)

    n_rows, n_cols = 2000, 32
    k = 5

    # Generate data
    dataset = generate_data((n_rows, n_cols), np.float32)
    queries = generate_data((100, n_cols), np.float32)

    # Import single-GPU IVF-PQ to build and serialize a single-GPU index
    from cuvs.common import Resources
    from cuvs.neighbors import ivf_pq

    # Build single-GPU index first
    single_gpu_resources = Resources()
    single_build_params = ivf_pq.IndexParams(
        metric="sqeuclidean", n_lists=50, pq_bits=8, pq_dim=16
    )

    # Convert to device arrays for single-GPU build
    # (single-GPU ivf_pq.build takes device memory, unlike the multi-GPU
    #  API which requires host memory)
    try:
        import cupy as cp

        device_dataset = cp.asarray(dataset, dtype=np.float32)
        single_index = ivf_pq.build(
            single_build_params, device_dataset, resources=single_gpu_resources
        )
    except ImportError:
        pytest.skip("CuPy not available for single-GPU index building")

    # delete=False so the file survives the `with`; removed in `finally`
    with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
        temp_filename = f.name

    try:
        # Serialize single-GPU index
        ivf_pq.save(
            temp_filename, single_index, resources=single_gpu_resources
        )

        # Now distribute the single-GPU index across multiple GPUs
        resources = MultiGpuResources()
        distributed_index = mg_ivf_pq.distribute(
            temp_filename, resources=resources
        )
        assert distributed_index.trained

        # Search using the distributed index
        search_params = mg_ivf_pq.SearchParams(n_probes=25)
        distances, neighbors = mg_ivf_pq.search(
            search_params, distributed_index, queries, k, resources=resources
        )

        # Verify results shape
        assert distances.shape == (100, k)
        assert neighbors.shape == (100, k)

    finally:
        if os.path.exists(temp_filename):
            os.unlink(temp_filename)


def test_memory_location_validation():
    """Test that multi-GPU IVF-PQ validates memory locations correctly.

    Multi-GPU build/search require host-memory arrays; passing device
    (CuPy) arrays must raise ValueError mentioning "host memory".
    """
    try:
        import cupy as cp
    except ImportError:
        pytest.skip("CuPy not available")

    # Generate device arrays (should fail) - use enough data points for n_lists
    dataset_gpu = cp.random.random((1000, 32), dtype=cp.float32)
    queries_gpu = cp.random.random((100, 32), dtype=cp.float32)

    # Create parameters with smaller n_lists for the small dataset
    build_params = mg_ivf_pq.IndexParams(
        n_lists=20
    )  # Smaller n_lists for 1000 points
    search_params = mg_ivf_pq.SearchParams()

    # These should raise ValueError about memory location
    with pytest.raises(ValueError, match="host memory"):
        mg_ivf_pq.build(build_params, dataset_gpu)

    # For search test, we need a valid index first
    dataset_cpu = cp.asnumpy(dataset_gpu)
    # NOTE(review): on a single-GPU machine this silently skips the
    # search-path validation check below — consider an explicit skip
    # message for visibility.
    resources = MultiGpuResources() if has_multiple_gpus() else None
    if resources:
        index = mg_ivf_pq.build(build_params, dataset_cpu, resources=resources)

        with pytest.raises(ValueError, match="host memory"):
            mg_ivf_pq.search(
                search_params, index, queries_gpu, 5, resources=resources
            )


def test_parameter_validation():
    """Test parameter validation for multi-GPU IVF-PQ.

    Invalid enum-like string parameters must raise ValueError with a
    message matching the patterns below.
    """
    # Test invalid distribution mode
    with pytest.raises(ValueError, match="distribution_mode must be"):
        mg_ivf_pq.IndexParams(distribution_mode="invalid")

    # Test invalid search mode
    with pytest.raises(ValueError, match="search_mode must be"):
        mg_ivf_pq.SearchParams(search_mode="invalid")

    # Test invalid merge mode
    with pytest.raises(ValueError, match="merge_mode must be"):
        mg_ivf_pq.SearchParams(merge_mode="invalid")

    # Test invalid codebook kind
    with pytest.raises(ValueError, match="Incorrect codebook kind"):
        mg_ivf_pq.IndexParams(codebook_kind="invalid")


def test_parameter_properties():
    """Test that parameters can be accessed via properties."""
    # Test IndexParams properties
    params = mg_ivf_pq.IndexParams(distribution_mode="replicated")
    assert params.distribution_mode == "replicated"

    params = mg_ivf_pq.IndexParams(distribution_mode="sharded")
    assert params.distribution_mode == "sharded"

    # Test PQ-specific parameters (constructor-only smoke test)
    params = mg_ivf_pq.IndexParams(
        pq_bits=4, pq_dim=16, codebook_kind="cluster"
    )
    # These don't have properties exposed, but creation should work

    # Test SearchParams creation with different parameters
    mg_ivf_pq.SearchParams(
        search_mode="round_robin",
        merge_mode="tree_merge",
        n_rows_per_batch=2000,
    )
    # These don't have properties exposed, but creation should work


def test_untrained_index_error():
    """Test that using an untrained index raises appropriate errors.

    search/extend/save on a freshly constructed (unbuilt) Index must all
    raise ValueError("Index needs to be built ...").
    """
    resources = MultiGpuResources()

    # Create untrained index
    index = mg_ivf_pq.Index()
    assert not index.trained

    queries = generate_data((100, 10), np.float32)
    search_params = mg_ivf_pq.SearchParams(n_probes=20)

    # Test that search on untrained index fails
    with pytest.raises(ValueError, match="Index needs to be built"):
        mg_ivf_pq.search(
            search_params, index, queries, 10, resources=resources
        )

    # Test that extend on untrained index fails
    new_vectors = generate_data((50, 10), np.float32)
    with pytest.raises(ValueError, match="Index needs to be built"):
        mg_ivf_pq.extend(index, new_vectors, resources=resources)

    # Test that save on untrained index fails
    # NOTE(review): presumably save raises before touching disk, so no
    # "temp.bin" cleanup is needed — verify against the implementation.
    with pytest.raises(ValueError, match="Index needs to be built"):
        mg_ivf_pq.save(index, "temp.bin", resources=resources)


@requires_multiple_gpus
def test_mg_ivf_pq_with_prealloc_output():
    """Test multi-GPU IVF-PQ search with pre-allocated output arrays.

    Verifies that search writes into caller-supplied host arrays and
    returns those same array objects (no reallocation).
    """
    n_rows, n_cols = 1500, 32  # Ensure n_rows > n_lists
    n_queries = 20
    k = 5

    # Generate data in host memory
    dataset = generate_data((n_rows, n_cols), np.float32)
    queries = generate_data((n_queries, n_cols), np.float32)

    resources = MultiGpuResources()

    # Build index with fewer clusters to avoid n_rows < n_lists error
    build_params = mg_ivf_pq.IndexParams(n_lists=30, pq_bits=8, pq_dim=16)
    index = mg_ivf_pq.build(build_params, dataset, resources=resources)

    # Pre-allocate output arrays in host memory
    # (int64 neighbors / float32 distances are the expected output dtypes)
    neighbors = np.empty((n_queries, k), dtype=np.int64)
    distances = np.empty((n_queries, k), dtype=np.float32)

    # Search with pre-allocated arrays
    search_params = mg_ivf_pq.SearchParams(n_probes=20)
    ret_distances, ret_neighbors = mg_ivf_pq.search(
        search_params,
        index,
        queries,
        k,
        neighbors=neighbors,
        distances=distances,
        resources=resources,
    )

    # Should return the same arrays we passed in
    assert ret_distances is distances
    assert ret_neighbors is neighbors
    assert distances.shape == (n_queries, k)
    assert neighbors.shape == (n_queries, k)


def test_index_repr():
    """Test string representation of Index."""
    index = mg_ivf_pq.Index()
    assert repr(index) == "Index(type=MultiGpuIvfPq)"


def test_mg_ivf_pq_simple():
    """Simple test to validate multi-GPU IVF-PQ works with very favorable
    parameters.
    """
    if not has_multiple_gpus():
        pytest.skip("Multi-GPU tests require multiple GPUs")

    # Use simple test case that should definitely work
    n_rows, n_cols = 1000, 32
    n_queries, k = 20, 5

    # Generate data
    dataset = generate_data((n_rows, n_cols), np.float32)
    queries = generate_data((n_queries, n_cols), np.float32)

    resources = MultiGpuResources()

    # Use very few clusters for high recall
    build_params = mg_ivf_pq.IndexParams(
        metric="sqeuclidean",
        n_lists=32,  # Very few clusters
        pq_bits=8,
        pq_dim=16,
    )

    # Build index
    index = mg_ivf_pq.build(build_params, dataset, resources=resources)

    # Search with many probes for maximum recall
    search_params = mg_ivf_pq.SearchParams(n_probes=32)  # Search all clusters
    distances, neighbors = mg_ivf_pq.search(
        search_params, index, queries, k, resources=resources
    )

    # Basic sanity checks
    assert distances.shape == (n_queries, k)
    assert neighbors.shape == (n_queries, k)
    assert isinstance(distances, np.ndarray)
    assert isinstance(neighbors, np.ndarray)

    # Check that we get valid neighbors
    assert np.all(neighbors >= 0)
    assert np.all(neighbors < n_rows)

    # Distances should be non-negative and sorted
    assert np.all(distances >= 0)
    for i in range(n_queries):
        assert np.all(
            distances[i, :-1] <= distances[i, 1:]
        ), f"Distances not sorted for query {i}"


# Integration test with multiple operations
@requires_multiple_gpus
def test_mg_ivf_pq_integration():
    """Integration test covering build, search, extend, and serialization.

    Flow: build sharded index -> search -> extend with new vectors ->
    search again -> save/load round-trip -> verify reloaded index returns
    identical results to the extended in-memory index.
    """
    n_rows, n_cols = 2000, 32
    k = 5

    # Generate initial dataset
    dataset = generate_data((n_rows, n_cols), np.float32)
    queries = generate_data((20, n_cols), np.float32)

    resources = MultiGpuResources()

    # Build initial index
    build_params = mg_ivf_pq.IndexParams(
        distribution_mode="sharded",
        metric="sqeuclidean",
        n_lists=50,
        pq_bits=8,
        pq_dim=16,
    )
    index = mg_ivf_pq.build(build_params, dataset, resources=resources)

    # Initial search
    search_params = mg_ivf_pq.SearchParams(
        n_probes=37,
        search_mode="load_balancer",
        merge_mode="merge_on_root_rank",
    )
    distances1, neighbors1 = mg_ivf_pq.search(
        search_params, index, queries, k, resources=resources
    )

    # Extend index with new vectors
    new_vectors = generate_data((200, n_cols), np.float32)
    # Provide indices for extend operation on non-empty index
    new_indices = np.arange(n_rows, n_rows + 200, dtype=np.int64)
    mg_ivf_pq.extend(index, new_vectors, new_indices, resources=resources)

    # Search after extend
    distances2, neighbors2 = mg_ivf_pq.search(
        search_params, index, queries, k, resources=resources
    )

    # Save and reload
    with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
        temp_filename = f.name

    try:
        mg_ivf_pq.save(index, temp_filename, resources=resources)
        reloaded_index = mg_ivf_pq.load(temp_filename, resources=resources)

        # Search with reloaded index
        distances3, neighbors3 = mg_ivf_pq.search(
            search_params, reloaded_index, queries, k, resources=resources
        )

        # Results from extended and reloaded index should match
        np.testing.assert_array_equal(neighbors2, neighbors3)
        np.testing.assert_allclose(distances2, distances3, rtol=1e-6)

    finally:
        if os.path.exists(temp_filename):
            os.unlink(temp_filename)