rapidsai · rapids-bot · Sep 18, 2025 · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025
@@ -734,6 +734,7 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
       src/neighbors/nn_descent_c.cpp
       src/neighbors/refine/refine_c.cpp
       src/neighbors/tiered_index_c.cpp
+      src/neighbors/all_neighbors_c.cpp
       src/preprocessing/quantize/binary_c.cpp
       src/preprocessing/quantize/scalar_c.cpp
       src/distance/pairwise_distance_c.cpp

@@ -127,6 +127,34 @@ cuvsError_t cuvsStreamSync(cuvsResources_t res);
  * @return cuvsError_t
  */
 cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id);
+
+/**
+ * @brief Create an initialized opaque C handle for C++ type `raft::device_resources_snmg`
+ *        for multi-GPU operations
+ *
+ * @param[in] res pointer to a cuvsResources_t opaque C handle
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsMultiGpuResourcesCreate(cuvsResources_t* res);
+
+/**
+ * @brief Create an initialized opaque C handle for C++ type `raft::device_resources_snmg`
+ *        for multi-GPU operations with specific device IDs
+ *
+ * @param[out] res pointer that receives the newly created cuvsResources_t opaque C handle
+ * @param[in] device_ids DLManagedTensor* containing the device IDs to use. It must be non-null,
+ *                       have dtype int32 and reside in host (kDLCPU) memory; the extent of its
+ *                       first dimension is the number of device IDs read.
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsMultiGpuResourcesCreateWithDeviceIds(cuvsResources_t* res,
+                                                     DLManagedTensor* device_ids);
+
+/**
+ * @brief Destroy and de-allocate the opaque C handle for C++ type `raft::device_resources_snmg`
+ *
+ * NOTE(review): presumably `res` must come from one of the cuvsMultiGpuResourcesCreate*
+ * functions -- confirm against the implementation.
+ *
+ * @param[in] res cuvsResources_t opaque C handle (passed by value)
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsMultiGpuResourcesDestroy(cuvsResources_t res);
 /** @} */
 
 /**

@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuvs/core/c_api.h>
+#include <cuvs/distance/distance.h>
+#include <cuvs/neighbors/ivf_pq.h>
+#include <cuvs/neighbors/nn_descent.h>
+#include <dlpack/dlpack.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup all_neighbors_c_build All-neighbors C-API build (SNMG only)
+ * @{
+ *
+ * All-neighbors constructs an approximate k-NN graph for all vectors in a dataset.
+ * This SNMG C API can be used with a multi-GPU resources handle (`cuvsResources_t`)
+ * created from `raft::device_resources_snmg` to distribute work across GPUs.
+ *
+ * Notes:
+ * - Outputs (indices, distances, core_distances) are expected to be on device memory.
+ * - Host variant accepts host-resident dataset; device variant accepts device-resident dataset.
+ * - For batching, `overlap_factor < n_clusters` must hold.
+ * - When `core_distances` is provided, mutual-reachability distances are produced (see alpha).
+ */
+
+/**
+ * @brief Graph build algorithm selection.
+ *
+ * Stored in `cuvsAllNeighborsIndexParams::algo`. The IVF-PQ and NN-Descent choices have a
+ * matching algorithm-specific parameter member in that struct.
+ */
+typedef enum {
+  CUVS_ALL_NEIGHBORS_ALGO_BRUTE_FORCE = 0,  ///< Use brute force for local kNN subgraphs
+  CUVS_ALL_NEIGHBORS_ALGO_IVF_PQ = 1,  ///< Use IVF-PQ for local kNN subgraphs (host dataset only)
+  CUVS_ALL_NEIGHBORS_ALGO_NN_DESCENT = 2  ///< Use NN-Descent for local kNN subgraphs
+} cuvsAllNeighborsAlgo;
+
+/**
+ * @brief Parameters controlling SNMG all-neighbors build.
+ *
+ * Passed to `cuvsAllNeighborsBuild` through the `cuvsAllNeighborsIndexParams_t` pointer type.
+ * The batching fields must satisfy `overlap_factor < n_clusters`.
+ */
+struct cuvsAllNeighborsIndexParams {
+  cuvsAllNeighborsAlgo algo;  ///< Local kNN graph build algorithm
+  size_t overlap_factor;  ///< Number of clusters each point is assigned to (must be < n_clusters)
+  size_t
+    n_clusters;  ///< Number of clusters/batches to partition the dataset into (> overlap_factor)
+  cuvsDistanceType metric;  ///< Distance metric used for graph construction
+
+  // Algorithm-specific parameters: set only the member that matches `algo`
+  cuvsIvfPqIndexParams_t ivf_pq_params;          ///< Parameters for IVF-PQ algorithm (when algo ==
+                                                 ///< CUVS_ALL_NEIGHBORS_ALGO_IVF_PQ)
+  cuvsNNDescentIndexParams_t nn_descent_params;  ///< Parameters for NN-Descent algorithm (when algo
+                                                 ///< == CUVS_ALL_NEIGHBORS_ALGO_NN_DESCENT)
+};
+
+/** @brief Pointer to a `cuvsAllNeighborsIndexParams` struct, as taken by the build function. */
+typedef struct cuvsAllNeighborsIndexParams* cuvsAllNeighborsIndexParams_t;
+
+/**
+ * @brief Build an all-neighbors k-NN graph, automatically detecting host vs device dataset.
+ *
+ * The function automatically detects whether the dataset is host-resident or device-resident
+ * and calls the appropriate implementation. For host datasets, it partitions data into
+ * `n_clusters` clusters and assigns each row to `overlap_factor` nearest clusters. For device
+ * datasets, `n_clusters` must be 1 (no batching); `overlap_factor` is ignored.
+ * Outputs always reside in device memory.
+ *
+ * @param[in] res             Resources handle (`cuvsResources_t`); can be an SNMG multi-GPU
+ *                            resources handle or a single-GPU resources handle
+ * @param[in] params          Build parameters (see cuvsAllNeighborsIndexParams)
+ * @param[in] dataset         2D tensor [num_rows x dim] on host or device (auto-detected)
+ * @param[out] indices        2D tensor [num_rows x k] on device (int64)
+ * @param[out] distances      Optional 2D tensor [num_rows x k] on device (float32); can be NULL
+ * @param[out] core_distances Optional 1D tensor [num_rows] on device (float32); can be NULL
+ * @param[in] alpha           Mutual-reachability scaling; used only when core_distances is provided
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsAllNeighborsBuild(cuvsResources_t res,
+                                  cuvsAllNeighborsIndexParams_t params,
+                                  DLManagedTensor* dataset,
+                                  DLManagedTensor* indices,
+                                  DLManagedTensor* distances,
+                                  DLManagedTensor* core_distances,
+                                  float alpha);
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
@@ -60,6 +60,34 @@ extern "C" cuvsError_t cuvsMultiGpuResourcesCreate(cuvsResources_t* res)
   });
 }
 
+/**
+ * Create a `raft::device_resources_snmg` handle from an explicit list of device IDs.
+ *
+ * `device_ids` must be a non-null, host-resident (kDLCPU), 1D tensor of scalar int32 values.
+ * The tensor's `byte_offset` and (optional) element stride are honoured when reading the IDs.
+ * On success the new handle is stored in `*res`; any validation failure is raised as
+ * `std::invalid_argument` and converted to an error code by `translate_exceptions`.
+ *
+ * @param[out] res        receives the newly allocated resources handle; must not be null
+ * @param[in]  device_ids DLPack tensor holding the device IDs
+ * @return cuvsError_t
+ */
+extern "C" cuvsError_t cuvsMultiGpuResourcesCreateWithDeviceIds(cuvsResources_t* res,
+                                                                DLManagedTensor* device_ids)
+{
+  return cuvs::core::translate_exceptions([=] {
+    // Basic validation
+    if (device_ids == nullptr || device_ids->dl_tensor.data == nullptr) {
+      throw std::invalid_argument("device_ids cannot be null");
+    }
+
+    // The handle is written through `res`; reject a null destination up front instead of
+    // allocating resources and then crashing on the store.
+    if (res == nullptr) { throw std::invalid_argument("res cannot be null"); }
+
+    const DLTensor& tensor = device_ids->dl_tensor;
+
+    // Check data type is scalar int32 (lanes > 1 would mean a vectorized element type)
+    if (tensor.dtype.code != kDLInt || tensor.dtype.bits != 32 || tensor.dtype.lanes != 1) {
+      throw std::invalid_argument("device_ids must be int32");
+    }
+    // The IDs are read through `int*` below, so `int` has to match the 32-bit element width.
+    static_assert(sizeof(int) == 4, "device_ids are read as 32-bit int");
+
+    // Check data is on host
+    if (tensor.device.device_type != kDLCPU) {
+      throw std::invalid_argument("device_ids must be on host memory");
+    }
+
+    // shape[0] is only meaningful for a 1D tensor: a 0-D tensor may carry a null `shape`,
+    // and an N-D tensor would silently drop every dimension after the first.
+    if (tensor.ndim != 1 || tensor.shape == nullptr) {
+      throw std::invalid_argument("device_ids must be a 1D tensor");
+    }
+    const int64_t n_ids = tensor.shape[0];
+    if (n_ids < 0) { throw std::invalid_argument("device_ids must have a non-negative length"); }
+
+    // DLPack: the first element lives at `data + byte_offset`; `strides` is in elements and
+    // may be null, which means compact (unit-stride) layout.
+    const int* first =
+      reinterpret_cast<const int*>(static_cast<const char*>(tensor.data) + tensor.byte_offset);
+    const int64_t stride = (tensor.strides != nullptr) ? tensor.strides[0] : 1;
+
+    std::vector<int> ids;
+    ids.reserve(static_cast<size_t>(n_ids));
+    for (int64_t i = 0; i < n_ids; ++i) {
+      ids.push_back(first[i * stride]);
+    }
+
+    auto res_ptr = new raft::device_resources_snmg{ids};
+    *res         = reinterpret_cast<uintptr_t>(res_ptr);
+  });
+}
+
 extern "C" cuvsError_t cuvsMultiGpuResourcesDestroy(cuvsResources_t res)
 {
   return cuvs::core::translate_exceptions([=] {