diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h index 58025ef359..487ada503d 100644 --- a/c/include/cuvs/neighbors/cagra.h +++ b/c/include/cuvs/neighbors/cagra.h @@ -34,7 +34,14 @@ enum cuvsCagraGraphBuildAlgo { /* Experimental, use NN-Descent to build all-neighbors knn graph */ NN_DESCENT = 2, /* Experimental, use iterative cagra search and optimize to build the knn graph */ - ITERATIVE_CAGRA_SEARCH = 3 + ITERATIVE_CAGRA_SEARCH = 3, + /** + * Experimental, use ACE (Augmented Core Extraction) to build the graph. ACE partitions the + * dataset into core and augmented partitions and builds a sub-index for each partition. This + * enables building indices for datasets too large to fit in GPU or host memory. + * See cuvsAceParams for more details about the ACE algorithm and its parameters. + */ + ACE = 4 }; /** @@ -118,6 +125,52 @@ struct cuvsIvfPqParams { typedef struct cuvsIvfPqParams* cuvsIvfPqParams_t; +/** + * Parameters for ACE (Augmented Core Extraction) graph build. + * ACE enables building indices for datasets too large to fit in GPU memory by: + * 1. Partitioning the dataset in core (closest) and augmented (second-closest) + * partitions using balanced k-means. + * 2. Building sub-indices for each partition independently + * 3. Concatenating sub-graphs into a final unified index + */ +struct cuvsAceParams { + /** + * Number of partitions for ACE (Augmented Core Extraction) partitioned build. + * + * Small values might improve recall but potentially degrade performance and + * increase memory usage. Partitions should not be too small to prevent issues + * in KNN graph construction. 100k - 5M vectors per partition is recommended + * depending on the available host and GPU memory. The partition size is on + * average 2 * (n_rows / npartitions) * dim * sizeof(T). 2 is because of the + * core and augmented vectors. Please account for imbalance in the partition + * sizes (up to 3x in our tests). + */ + size_t npartitions; + /** + * The index quality for the ACE build. + * + * Bigger values increase the index quality. At some point, increasing this will no longer + * improve the quality. + */ + size_t ef_construction; + /** + * Directory to store ACE build artifacts (e.g., KNN graph, optimized graph). + * + * Used when `use_disk` is true or when the graph does not fit in host and GPU + * memory. This should be the fastest disk in the system and hold enough space + * for twice the dataset, final graph, and label mapping. + */ + const char* build_dir; + /** + * Whether to use disk-based storage for ACE build. + * + * When true, enables disk-based operations for memory-efficient graph construction. 
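+ *
+ * Illustrative sizing (hypothetical numbers): with 100 million float32 vectors
+ * of dim 96 split into npartitions = 50, the average partition holds about
+ * 2 * (100,000,000 / 50) * 96 * 4 bytes, i.e. roughly 1.5 GB of core plus
+ * augmented vectors, before accounting for partition imbalance.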
+ */ + bool use_disk; +}; + +typedef struct cuvsAceParams* cuvsAceParams_t; + /** * @brief Supplemental parameters to build CAGRA Index * @@ -140,9 +193,12 @@ struct cuvsCagraIndexParams { */ cuvsCagraCompressionParams_t compression; /** - * Optional: specify ivf pq params when `build_algo = IVF_PQ` + * Optional: specify graph build params based on build_algo + * - IVF_PQ: cuvsIvfPqParams_t + * - ACE: cuvsAceParams_t + * - Others: nullptr */ - cuvsIvfPqParams_t graph_build_params; + void* graph_build_params; }; typedef struct cuvsCagraIndexParams* cuvsCagraIndexParams_t; @@ -179,6 +235,38 @@ cuvsError_t cuvsCagraCompressionParamsCreate(cuvsCagraCompressionParams_t* param */ cuvsError_t cuvsCagraCompressionParamsDestroy(cuvsCagraCompressionParams_t params); +/** + * @brief Allocate ACE params, and populate with default values + * + * @param[in] params cuvsAceParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsAceParamsCreate(cuvsAceParams_t* params); + +/** + * @brief De-allocate ACE params + * + * @param[in] params + * @return cuvsError_t + */ +cuvsError_t cuvsAceParamsDestroy(cuvsAceParams_t params); + +/** + * @brief Allocate ACE params, and populate with default values + * + * @param[in] params cuvsAceParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsAceParamsCreate(cuvsAceParams_t* params); + +/** + * @brief De-allocate ACE params + * + * @param[in] params + * @return cuvsError_t + */ +cuvsError_t cuvsAceParamsDestroy(cuvsAceParams_t params); + /** * @brief Create CAGRA index parameters similar to an HNSW index * diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp index b8242fe07a..611ef3e086 100644 --- a/c/src/neighbors/cagra.cpp +++ b/c/src/neighbors/cagra.cpp @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include "../core/exceptions.hpp" #include "../core/interop.hpp" @@ -30,6 +32,7 @@ static void _set_graph_build_params( std::variant& out_params, cuvsCagraIndexParams& params, cuvsCagraGraphBuildAlgo algo, @@ -81,6 +84,18 @@ static void _set_graph_build_params( out_params = nn_params; break; } + case cuvsCagraGraphBuildAlgo::ACE: { + cuvs::neighbors::cagra::graph_build_params::ace_params ace_p; + if (params.graph_build_params) { + auto ace_params_c = static_cast(params.graph_build_params); + ace_p.npartitions = ace_params_c->npartitions; + ace_p.ef_construction = ace_params_c->ef_construction; + ace_p.build_dir = std::string(ace_params_c->build_dir); + ace_p.use_disk = ace_params_c->use_disk; + } + out_params = ace_p; + break; + } case cuvsCagraGraphBuildAlgo::ITERATIVE_CAGRA_SEARCH: { cuvs::neighbors::cagra::graph_build_params::iterative_search_params p; out_params = p; @@ -388,7 +403,19 @@ static void _populate_cagra_index_params_from_cpp(cuvsCagraIndexParams_t c_param std::get( cpp_params.graph_build_params); - _populate_c_ivf_pq_params(c_params->graph_build_params, ivf_pq_params); + _populate_c_ivf_pq_params(static_cast(c_params->graph_build_params), ivf_pq_params); + } else if (std::holds_alternative( + cpp_params.graph_build_params)) { + c_params->build_algo = ACE; + auto ace_params = + std::get( + cpp_params.graph_build_params); + cuvsAceParams* c_ace_params = new cuvsAceParams; + c_ace_params->npartitions = ace_params.npartitions; + c_ace_params->ef_construction = ace_params.ef_construction; + c_ace_params->build_dir = ace_params.build_dir.empty() ? 
nullptr : strdup(ace_params.build_dir.c_str()); + c_ace_params->use_disk = ace_params.use_disk; + c_params->graph_build_params = c_ace_params; } } @@ -700,7 +727,26 @@ extern "C" cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params extern "C" cuvsError_t cuvsCagraIndexParamsDestroy(cuvsCagraIndexParams_t params) { return cuvs::core::translate_exceptions([=] { - delete params->graph_build_params; + // Delete graph_build_params based on the build algorithm type + if (params->graph_build_params != nullptr) { + switch (params->build_algo) { + case cuvsCagraGraphBuildAlgo::IVF_PQ: + delete static_cast(params->graph_build_params); + break; + case cuvsCagraGraphBuildAlgo::ACE: { + auto ace_params = static_cast(params->graph_build_params); + // Free the allocated build directory string + if (ace_params->build_dir) { free(const_cast(ace_params->build_dir)); } + delete ace_params; + break; + } + case cuvsCagraGraphBuildAlgo::AUTO_SELECT: + case cuvsCagraGraphBuildAlgo::NN_DESCENT: + case cuvsCagraGraphBuildAlgo::ITERATIVE_CAGRA_SEARCH: + // These algorithms don't have separate parameter structs + break; + } + } delete params; }); } @@ -724,6 +770,32 @@ extern "C" cuvsError_t cuvsCagraCompressionParamsDestroy(cuvsCagraCompressionPar return cuvs::core::translate_exceptions([=] { delete params; }); } +extern "C" cuvsError_t cuvsAceParamsCreate(cuvsAceParams_t* params) +{ + return cuvs::core::translate_exceptions([=] { + auto ps = cuvs::neighbors::cagra::graph_build_params::ace_params(); + + // Allocate and copy the build directory string + const char* build_dir = strdup(ps.build_dir.c_str()); + + *params = new cuvsAceParams{.npartitions = ps.npartitions, + .ef_construction = ps.ef_construction, + .build_dir = build_dir, + .use_disk = ps.use_disk}; + }); +} + +extern "C" cuvsError_t cuvsAceParamsDestroy(cuvsAceParams_t params) +{ + return cuvs::core::translate_exceptions([=] { + if (params) { + // Free the allocated build directory string + if (params->build_dir) { free(const_cast(params->build_dir)); } + delete params; + } + }); +} + extern "C" cuvsError_t cuvsCagraIndexParamsFromHnswParams(cuvsCagraIndexParams_t params, int64_t n_rows, int64_t dim, diff --git a/c/tests/neighbors/ann_cagra_c.cu b/c/tests/neighbors/ann_cagra_c.cu index ab46c8b877..31f0e79e80 100644 --- a/c/tests/neighbors/ann_cagra_c.cu +++ b/c/tests/neighbors/ann_cagra_c.cu @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include @@ -44,6 +46,9 @@ float distances_exp[4] = {0.03878258, 0.12472608, 0.04776672, 0.15224178}; uint32_t neighbors_exp_filtered[4] = {3, 0, 3, 0}; float distances_exp_filtered[4] = {0.03878258, 0.12472608, 0.04776672, 0.59063464}; +std::vector neighbors_exp_disk = {3, 0, 3, 1}; +std::vector distances_exp_disk = {0.03878258, 0.12472608, 0.04776672, 0.15224178}; + TEST(CagraC, BuildSearch) { // create cuvsResources_t @@ -565,3 +570,221 @@ TEST(CagraC, BuildMergeSearch) cuvsCagraIndexDestroy(index_main); cuvsResourcesDestroy(res); } + +TEST(CagraC, BuildSearchACEMemory) +{ + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + cudaStream_t stream; + cuvsStreamGet(res, &stream); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = dataset; + dataset_tensor.dl_tensor.device.device_type = kDLCPU; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {4, 2}; 
+ dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = nullptr; + + // create index + cuvsCagraIndex_t index; + cuvsCagraIndexCreate(&index); + + // build index with ACE memory mode + cuvsCagraIndexParams_t build_params; + cuvsCagraIndexParamsCreate(&build_params); + build_params->build_algo = ACE; + + cuvsAceParams_t ace_params; + cuvsAceParamsCreate(&ace_params); + ace_params->npartitions = 2; + ace_params->ef_construction = 120; + ace_params->use_disk = false; + + build_params->graph_build_params = ace_params; + cuvsCagraBuild(res, build_params, &dataset_tensor, index); + + // create queries DLTensor + rmm::device_uvector queries_d(4 * 2, stream); + raft::copy(queries_d.data(), (float*)queries, 4 * 2, stream); + + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = queries_d.data(); + queries_tensor.dl_tensor.device.device_type = kDLCUDA; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {4, 2}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = nullptr; + + // create neighbors DLTensor + rmm::device_uvector neighbors_d(4, stream); + + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors_d.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCUDA; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLUInt; + neighbors_tensor.dl_tensor.dtype.bits = 32; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {4, 1}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = nullptr; + + // create distances DLTensor + rmm::device_uvector distances_d(4, stream); + + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances_d.data(); + distances_tensor.dl_tensor.device.device_type = kDLCUDA; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {4, 1}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = nullptr; + + cuvsFilter filter; + filter.type = NO_FILTER; + filter.addr = (uintptr_t)NULL; + + // search index + cuvsCagraSearchParams_t search_params; + cuvsCagraSearchParamsCreate(&search_params); + cuvsCagraSearch( + res, search_params, index, &queries_tensor, &neighbors_tensor, &distances_tensor, filter); + + // verify output + ASSERT_TRUE( + cuvs::devArrMatchHost(neighbors_exp, neighbors_d.data(), 4, cuvs::Compare())); + ASSERT_TRUE(cuvs::devArrMatchHost( + distances_exp, distances_d.data(), 4, cuvs::CompareApprox(0.001f))); + + // de-allocate index and res + cuvsCagraSearchParamsDestroy(search_params); + cuvsCagraIndexParamsDestroy(build_params); + cuvsCagraIndexDestroy(index); + cuvsResourcesDestroy(res); +} + +TEST(CagraC, BuildSearchACEDisk) +{ + // create cuvsResources_t + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // create dataset DLTensor + DLManagedTensor dataset_tensor; + dataset_tensor.dl_tensor.data = dataset; + dataset_tensor.dl_tensor.device.device_type = kDLCPU; + dataset_tensor.dl_tensor.ndim = 2; + dataset_tensor.dl_tensor.dtype.code = kDLFloat; + dataset_tensor.dl_tensor.dtype.bits = 32; + dataset_tensor.dl_tensor.dtype.lanes = 1; + int64_t dataset_shape[2] = {4, 2}; + 
dataset_tensor.dl_tensor.shape = dataset_shape; + dataset_tensor.dl_tensor.strides = nullptr; + + // create index + cuvsCagraIndex_t index; + cuvsCagraIndexCreate(&index); + + // build index with ACE memory mode + cuvsCagraIndexParams_t build_params; + cuvsCagraIndexParamsCreate(&build_params); + build_params->build_algo = ACE; + + cuvsAceParams_t ace_params; + cuvsAceParamsCreate(&ace_params); + ace_params->npartitions = 2; + ace_params->ef_construction = 120; + ace_params->use_disk = true; + ace_params->build_dir = strdup("/tmp/cagra_ace_test_disk"); + + build_params->graph_build_params = ace_params; + cuvsCagraBuild(res, build_params, &dataset_tensor, index); + + // Convert CAGRA index to HNSW (automatically serializes to disk for ACE) + cuvsHnswIndex_t hnsw_index_ser; + cuvsHnswIndexCreate(&hnsw_index_ser); + cuvsHnswIndexParams_t hnsw_params; + cuvsHnswIndexParamsCreate(&hnsw_params); + + cuvsHnswFromCagra(res, hnsw_params, index, hnsw_index_ser); + ASSERT_NE(hnsw_index_ser->addr, 0); + cuvsHnswIndexDestroy(hnsw_index_ser); + + DLManagedTensor queries_tensor; + queries_tensor.dl_tensor.data = queries; + queries_tensor.dl_tensor.device.device_type = kDLCPU; + queries_tensor.dl_tensor.ndim = 2; + queries_tensor.dl_tensor.dtype.code = kDLFloat; + queries_tensor.dl_tensor.dtype.bits = 32; + queries_tensor.dl_tensor.dtype.lanes = 1; + int64_t queries_shape[2] = {4, 2}; + queries_tensor.dl_tensor.shape = queries_shape; + queries_tensor.dl_tensor.strides = nullptr; + + // create neighbors DLTensor + std::vector neighbors(4); + + DLManagedTensor neighbors_tensor; + neighbors_tensor.dl_tensor.data = neighbors.data(); + neighbors_tensor.dl_tensor.device.device_type = kDLCPU; + neighbors_tensor.dl_tensor.ndim = 2; + neighbors_tensor.dl_tensor.dtype.code = kDLUInt; + neighbors_tensor.dl_tensor.dtype.bits = 64; + neighbors_tensor.dl_tensor.dtype.lanes = 1; + int64_t neighbors_shape[2] = {4, 1}; + neighbors_tensor.dl_tensor.shape = neighbors_shape; + neighbors_tensor.dl_tensor.strides = nullptr; + + // create distances DLTensor + std::vector distances(4); + + DLManagedTensor distances_tensor; + distances_tensor.dl_tensor.data = distances.data(); + distances_tensor.dl_tensor.device.device_type = kDLCPU; + distances_tensor.dl_tensor.ndim = 2; + distances_tensor.dl_tensor.dtype.code = kDLFloat; + distances_tensor.dl_tensor.dtype.bits = 32; + distances_tensor.dl_tensor.dtype.lanes = 1; + int64_t distances_shape[2] = {4, 1}; + distances_tensor.dl_tensor.shape = distances_shape; + distances_tensor.dl_tensor.strides = nullptr; + + // Deserialize the HNSW index from disk for search + cuvsHnswIndex_t hnsw_index; + cuvsHnswIndexCreate(&hnsw_index); + hnsw_index->dtype = index->dtype; + + // Use the actual dimension from the dataset + int dim = dataset_tensor.dl_tensor.shape[1]; + cuvsHnswDeserialize(res, hnsw_params, "/tmp/cagra_ace_test_disk/hnsw_index.bin", dim, L2Expanded, hnsw_index); + ASSERT_NE(hnsw_index->addr, 0); + + // Search the HNSW index + cuvsHnswSearchParams_t search_params; + cuvsHnswSearchParamsCreate(&search_params); + cuvsHnswSearch( + res, search_params, hnsw_index, &queries_tensor, &neighbors_tensor, &distances_tensor); + + // Verify output + ASSERT_TRUE(cuvs::hostVecMatch(neighbors_exp_disk, neighbors, cuvs::Compare())); + ASSERT_TRUE(cuvs::hostVecMatch(distances_exp_disk, distances, cuvs::CompareApprox(0.001f))); + + cuvsCagraIndexParamsDestroy(build_params); + cuvsCagraIndexDestroy(index); + cuvsHnswSearchParamsDestroy(search_params); + cuvsHnswIndexParamsDestroy(hnsw_params); + 
cuvsHnswIndexDestroy(hnsw_index); + cuvsResourcesDestroy(res); +} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ababb22548..b46cf03f3a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -351,6 +351,8 @@ if(NOT BUILD_CPU_ONLY) src/cluster/spectral.cu src/core/bitset.cu src/core/omp_wrapper.cpp + src/util/file_io.cpp + src/util/host_memory.cpp src/distance/detail/kernels/gram_matrix.cu src/distance/detail/kernels/kernel_factory.cu src/distance/detail/kernels/kernel_matrices.cu @@ -442,6 +444,7 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/cagra_index_wrapper.cu src/neighbors/composite/index.cu src/neighbors/composite/merge.cpp + $<$:src/neighbors/cagra.cpp> $<$:src/neighbors/hnsw.cpp> src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 84c6f0628d..d9e7c0f41d 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -260,6 +260,11 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index params.graph_build_params)) { params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; } + } else if (conf.at("graph_build_algo") == "ACE") { + if (!std::holds_alternative( + params.graph_build_params)) { + params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; + } } } @@ -267,12 +272,15 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::index nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); if (std::holds_alternative(params.graph_build_params)) { if (!ivf_pq_build_conf.empty() || !ivf_pq_search_conf.empty()) { params.graph_build_params = cuvs::neighbors::graph_build_params::ivf_pq_params{}; } else if (!nn_descent_conf.empty()) { params.graph_build_params = cuvs::neighbors::graph_build_params::nn_descent_params{}; + } else if (!ace_conf.empty()) { + params.graph_build_params = cuvs::neighbors::graph_build_params::ace_params{}; } else { params.graph_build_params = cuvs::neighbors::graph_build_params::iterative_search_params{}; } @@ -328,6 +336,20 @@ void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::cagra::graph_build_params::nn_descent_params( conf.value("intermediate_graph_degree", cagra_params.intermediate_graph_degree), dist_type); + } else if (conf.value("graph_build_algo", "") == "ACE") { + cagra_params.graph_build_params = cuvs::neighbors::cagra::graph_build_params::ace_params{}; + } + // Parse ACE parameters if provided + nlohmann::json ace_conf = collect_conf_with_prefix(conf, "ace_"); + if (!ace_conf.empty()) { + auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params(); + if (ace_conf.contains("npartitions")) { ace_params.npartitions = ace_conf.at("npartitions"); } + if (ace_conf.contains("build_dir")) { ace_params.build_dir = ace_conf.at("build_dir"); } + if (ace_conf.contains("ef_construction")) { + ace_params.ef_construction = ace_conf.at("ef_construction"); + } + if (ace_conf.contains("use_disk")) { ace_params.use_disk = ace_conf.at("use_disk"); } + cagra_params.graph_build_params = ace_params; } ::parse_build_param(conf, cagra_params); return 
cagra_params; diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index b69b3946b2..113e79fa15 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -48,9 +48,25 @@ auto parse_build_param(const nlohmann::json& conf) -> // to override them. cagra_params.cagra_params = [conf, hnsw_params](raft::matrix_extent extents, cuvs::distance::DistanceType dist_type) { - auto ps = cuvs::neighbors::hnsw::to_cagra_params( - extents, conf.at("M"), hnsw_params.ef_construction, dist_type); + auto ps = cuvs::neighbors::cagra::index_params::from_hnsw_params( + extents, + conf.at("M"), + hnsw_params.ef_construction, + cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT, + dist_type); ps.metric = dist_type; + // Parse ACE parameters if provided + if (conf.contains("npartitions") || conf.contains("build_dir") || + conf.contains("ef_construction") || conf.contains("use_disk")) { + auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params(); + if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); } + if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); } + if (conf.contains("ef_construction")) { + ace_params.ef_construction = conf.at("ef_construction"); + } + if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); } + ps.graph_build_params = ace_params; + } // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual. ::parse_build_param(conf, ps); return ps; diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 1b237f82df..2f0c54e1bd 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -75,6 +75,8 @@ class cuvs_cagra_hnswlib : public algo, public algo_gpu { build_param build_param_; search_param search_param_; std::shared_ptr> hnsw_index_; + + bool cagra_ace_build_ = false; }; template @@ -110,6 +112,11 @@ void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) // convert the index to HNSW format hnsw_index_ = cuvs::neighbors::hnsw::from_cagra( handle_, build_param_.hnsw_index_params, cagra_index, opt_dataset_view); + + // special treatment in save/serialize step + if (cagra_index.dataset_fd().has_value() && cagra_index.graph_fd().has_value()) { + cagra_ace_build_ = true; + } } template @@ -123,7 +130,21 @@ void cuvs_cagra_hnswlib::set_search_param(const search_param_base& para template void cuvs_cagra_hnswlib::save(const std::string& file) const { - cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); + if (cagra_ace_build_) { + std::string index_filename = hnsw_index_->file_path(); + RAFT_EXPECTS(!index_filename.empty(), "HNSW index file path is not available."); + RAFT_EXPECTS(std::filesystem::exists(index_filename), + "Index file '%s' does not exist.", + index_filename.c_str()); + if (std::filesystem::exists(file)) { std::filesystem::remove(file); } + // might fail when using 2 different filesystems + std::error_code ec; + std::filesystem::rename(index_filename, file, ec); + RAFT_EXPECTS( + !ec, "Failed to rename index file '%s' to '%s'.", index_filename.c_str(), file.c_str()); + } else { + cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); + } } template diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 6192b263c3..fb1b1549af 100644 --- 
a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -11,17 +11,24 @@ #include #include #include +#include #include #include #include #include #include #include +#include + +#include +#include +#include #include #include #include #include +#include #include namespace cuvs::neighbors::cagra { @@ -79,9 +86,9 @@ struct index_params : cuvs::neighbors::index_params { /** Parameters for graph building. * - * Set ivf_pq_params, nn_descent_params, or iterative_search_params to select the graph build - * algorithm and control their parameters. The default (std::monostate) is to use a heuristic - * to decide the algorithm and its parameters. + * Set ivf_pq_params, nn_descent_params, ace_params, or iterative_search_params to select the + * graph build algorithm and control their parameters. The default (std::monostate) is to use a + * heuristic to decide the algorithm and its parameters. * * @code{.cpp} * cagra::index_params params; @@ -93,7 +100,10 @@ struct index_params : cuvs::neighbors::index_params { * params.graph_build_params = * cagra::graph_build_params::nn_descent_params(params.intermediate_graph_degree); * - * // 3. Choose iterative graph building using CAGRA's search() and optimize() [Experimental] + * // 3. Choose ACE algorithm for graph construction + * params.graph_build_params = cagra::graph_build_params::ace_params(); + * + * // 4. Choose iterative graph building using CAGRA's search() and optimize() [Experimental] * params.graph_build_params = * cagra::graph_build_params::iterative_search_params(); * @endcode @@ -101,9 +111,9 @@ struct index_params : cuvs::neighbors::index_params { std::variant graph_build_params; - /** * Whether to use MST optimization to guarantee graph connectivity. */ @@ -363,15 +373,19 @@ struct index : cuvs::neighbors::index { [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { auto data_rows = dataset_->n_rows(); + if (dataset_fd_.has_value()) { return n_rows_; } return data_rows > 0 ? data_rows : graph_view_.extent(0); } /** Dimensionality of the data. */ - [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dataset_->dim(); } + [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t + { + return dataset_fd_.has_value() ? dim_ : dataset_->dim(); + } /** Graph degree */ [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t { - return graph_view_.extent(1); + return dataset_fd_.has_value() ? graph_degree_ : graph_view_.extent(1); } [[nodiscard]] inline auto dataset() const noexcept @@ -406,6 +420,27 @@ struct index : cuvs::neighbors::index { : std::nullopt; } + /** Get the dataset file descriptor (for disk-backed index) */ + [[nodiscard]] inline auto dataset_fd() const noexcept + -> const std::optional& + { + return dataset_fd_; + } + + /** Get the graph file descriptor (for disk-backed index) */ + [[nodiscard]] inline auto graph_fd() const noexcept + -> const std::optional& + { + return graph_fd_; + } + + /** Get the mapping file descriptor (for disk-backed index) */ + [[nodiscard]] inline auto mapping_fd() const noexcept + -> const std::optional& + { + return mapping_fd_; + } + /** Dataset norms for cosine distance [size] */ [[nodiscard]] inline auto dataset_norms() const noexcept -> std::optional> @@ -677,6 +712,117 @@ struct index : cuvs::neighbors::index { raft::resource::get_cuda_stream(res)); } + /** + * Update the dataset from a disk file using a file descriptor. + * + * This method configures the index to use a disk-based dataset. 
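+ *
+ * A minimal usage sketch (the file path is illustrative):
+ * @code{.cpp}
+ * cuvs::util::file_descriptor fd("/tmp/ace_build/dataset.npy", O_RDONLY);
+ * index.update_dataset(res, std::move(fd));
+ * @endcode
+ *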
+ * The dataset file should contain a numpy header followed by vectors in row-major format. + * The number of rows and dimensionality are read from the numpy header. + * + * @param[in] res raft resources + * @param[in] fd File descriptor (will be moved into the index for lifetime management) + */ + void update_dataset(raft::resources const& res, cuvs::util::file_descriptor&& fd) + { + RAFT_EXPECTS(fd.is_valid(), "Invalid file descriptor provided for dataset"); + + auto stream = fd.make_istream(); + if (lseek(fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of dataset file"); + } + auto header = raft::detail::numpy_serializer::read_header(stream); + RAFT_EXPECTS(header.shape.size() == 2, + "Dataset file should be 2D, got %zu dimensions", + header.shape.size()); + + n_rows_ = header.shape[0]; + dim_ = header.shape[1]; + + RAFT_LOG_DEBUG("ACE: Dataset has shape [%zu, %zu]", n_rows_, dim_); + + // Re-open the file descriptor in read-only mode for subsequent operations + dataset_fd_.emplace(std::move(fd)); + + dataset_ = std::make_unique>(0); + dataset_norms_.reset(); + } + + /** + * Update the graph from a disk file using a file descriptor. + * + * This method configures the index to use a disk-based graph. + * The graph file should contain a numpy header followed by neighbor indices in row-major format. + * The number of rows and graph degree are read from the numpy header. + * + * @param[in] res raft resources + * @param[in] fd File descriptor (will be moved into the index for lifetime management) + */ + void update_graph(raft::resources const& res, cuvs::util::file_descriptor&& fd) + { + RAFT_EXPECTS(fd.is_valid(), "Invalid file descriptor provided for graph"); + + auto stream = fd.make_istream(); + if (lseek(fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of graph file"); + } + auto header = raft::detail::numpy_serializer::read_header(stream); + RAFT_EXPECTS( + header.shape.size() == 2, "Graph file should be 2D, got %zu dimensions", header.shape.size()); + + if (dataset_fd_.has_value() && n_rows_ != 0) { + RAFT_EXPECTS(n_rows_ == header.shape[0], + "Graph size (%zu) must match dataset size (%zu)", + header.shape[0], + n_rows_); + } + + n_rows_ = header.shape[0]; + graph_degree_ = header.shape[1]; + + RAFT_LOG_DEBUG("ACE: Graph has shape [%zu, %zu]", n_rows_, graph_degree_); + + // Re-open the file descriptor in read-only mode for subsequent operations + graph_fd_.emplace(std::move(fd)); + + graph_ = raft::make_device_matrix(res, 0, 0); + graph_view_ = graph_.view(); + } + + /** + * Update the dataset mapping from a disk file using a file descriptor. + * + * This method configures the index to use a disk-based dataset mapping. + * The mapping file should contain a numpy header followed by index mappings. 
+ * + * @param[in] res raft resources + * @param[in] fd File descriptor (will be moved into the index for lifetime management) + */ + void update_mapping(raft::resources const& res, cuvs::util::file_descriptor&& fd) + { + RAFT_EXPECTS(fd.is_valid(), "Invalid file descriptor provided for mapping"); + + // Read header from file using ifstream + auto stream = fd.make_istream(); + if (lseek(fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of mapping file"); + } + auto header = raft::detail::numpy_serializer::read_header(stream); + RAFT_EXPECTS(header.shape.size() == 1, + "Mapping file should be 1D, got %zu dimensions", + header.shape.size()); + + if (dataset_fd_.has_value() && n_rows_ != 0) { + RAFT_EXPECTS(header.shape[0] == n_rows_, + "Mapping size (%zu) must match dataset size (%zu)", + header.shape[0], + n_rows_); + } + + RAFT_LOG_DEBUG("ACE: Mapping has %zu elements", header.shape[0]); + + mapping_fd_.emplace(std::move(fd)); + } + private: cuvs::distance::DistanceType metric_; raft::device_matrix graph_; @@ -687,7 +833,15 @@ struct index : cuvs::neighbors::index { // only float distances supported at the moment std::optional> dataset_norms_; + // File descriptors for disk-backed index components (ACE disk mode) + std::optional dataset_fd_; + std::optional graph_fd_; + std::optional mapping_fd_; + void compute_dataset_norms_(raft::resources const& res); + size_t n_rows_ = 0; + size_t dim_ = 0; + size_t graph_degree_ = 0; }; /** * @} @@ -2866,6 +3020,166 @@ template auto distribute(const raft::resources& clique, const std::string& filename) -> cuvs::neighbors::mg_index, T, IdxT>; +/** + * @brief Build a kNN graph using IVF-PQ. + * + * The kNN graph is the first building block for CAGRA index. + * + * The output is a dense matrix that stores the neighbor indices for each point in the dataset. + * Each point has the same number of neighbors. + * + * See [cagra::build](#cagra::build) for an alternative method. + * + * The following distance metrics are supported: + * - L2Expanded + * - InnerProduct + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters based on shape of the dataset + * ivf_pq::index_params build_params = ivf_pq::index_params::from_dataset(dataset); + * ivf_pq::search_params search_params; + * auto knn_graph = raft::make_host_matrix(dataset.extent(0), 128); + * // create knn graph + * cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params); + * auto optimized_gaph = raft::make_host_matrix(dataset.extent(0), 64); + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); + * @endcode + * + * @param[in] res raft resources + * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] + * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree] + * @param[in] build_params ivf-pq parameters for graph build + */ +void build_knn_graph(raft::resources const& res, + raft::host_matrix_view dataset, + raft::host_matrix_view knn_graph, + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params build_params); + +/** + * @brief Build a kNN graph using IVF-PQ. + * + * The kNN graph is the first building block for CAGRA index. + * + * The output is a dense matrix that stores the neighbor indices for each point in the dataset. 
+ * Each point has the same number of neighbors. + * + * See [cagra::build](#cagra::build) for an alternative method. + * + * The following distance metrics are supported: + * - L2Expanded + * - InnerProduct + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters based on shape of the dataset + * ivf_pq::index_params build_params = ivf_pq::index_params::from_dataset(dataset); + * ivf_pq::search_params search_params; + * auto knn_graph = raft::make_host_matrix(dataset.extent(0), 128); + * // create knn graph + * cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params); + * auto optimized_gaph = raft::make_host_matrix(dataset.extent(0), 64); + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); + * @endcode + * + * @param[in] res raft resources + * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] + * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree] + * @param[in] build_params ivf-pq parameters for graph build + */ +void build_knn_graph(raft::resources const& res, + raft::host_matrix_view dataset, + raft::host_matrix_view knn_graph, + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params build_params); + +/** + * @brief Build a kNN graph using IVF-PQ. + * + * The kNN graph is the first building block for CAGRA index. + * + * The output is a dense matrix that stores the neighbor indices for each point in the dataset. + * Each point has the same number of neighbors. + * + * See [cagra::build](#cagra::build) for an alternative method. + * + * The following distance metrics are supported: + * - L2Expanded + * - InnerProduct + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters based on shape of the dataset + * ivf_pq::index_params build_params = ivf_pq::index_params::from_dataset(dataset); + * ivf_pq::search_params search_params; + * auto knn_graph = raft::make_host_matrix(dataset.extent(0), 128); + * // create knn graph + * cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params); + * auto optimized_gaph = raft::make_host_matrix(dataset.extent(0), 64); + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); + * @endcode + * + * @param[in] res raft resources + * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] + * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree] + * @param[in] build_params ivf-pq parameters for graph build + */ +void build_knn_graph(raft::resources const& res, + raft::host_matrix_view dataset, + raft::host_matrix_view knn_graph, + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params build_params); + +/** + * @brief Build a kNN graph using IVF-PQ. + * + * The kNN graph is the first building block for CAGRA index. + * + * The output is a dense matrix that stores the neighbor indices for each point in the dataset. + * Each point has the same number of neighbors. + * + * See [cagra::build](#cagra::build) for an alternative method. 
+ * + * The following distance metrics are supported: + * - L2Expanded + * - InnerProduct + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters based on shape of the dataset + * ivf_pq::index_params build_params = ivf_pq::index_params::from_dataset(dataset); + * ivf_pq::search_params search_params; + * auto knn_graph = raft::make_host_matrix(dataset.extent(0), 128); + * // create knn graph + * cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params); + * auto optimized_gaph = raft::make_host_matrix(dataset.extent(0), 64); + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); + * @endcode + * + * @param[in] res raft resources + * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] + * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree] + * @param[in] build_params ivf-pq parameters for graph build + */ +void build_knn_graph(raft::resources const& res, + raft::host_matrix_view dataset, + raft::host_matrix_view knn_graph, + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params build_params); + } // namespace cuvs::neighbors::cagra #include diff --git a/cpp/include/cuvs/neighbors/graph_build_types.hpp b/cpp/include/cuvs/neighbors/graph_build_types.hpp index 2f3f93b3f9..7f501240a2 100644 --- a/cpp/include/cuvs/neighbors/graph_build_types.hpp +++ b/cpp/include/cuvs/neighbors/graph_build_types.hpp @@ -16,7 +16,7 @@ namespace cuvs::neighbors { * @{ */ -enum GRAPH_BUILD_ALGO { BRUTE_FORCE = 0, IVF_PQ = 1, NN_DESCENT = 2 }; +enum GRAPH_BUILD_ALGO { BRUTE_FORCE = 0, IVF_PQ = 1, NN_DESCENT = 2, ACE = 3 }; namespace graph_build_params { @@ -94,6 +94,45 @@ struct brute_force_params { cuvs::neighbors::brute_force::search_params search_params; }; +/** Specialized parameters for ACE (Augmented Core Extraction) graph build */ +struct ace_params { + /** + * Number of partitions for ACE (Augmented Core Extraction) partitioned build. + * + * Small values might improve recall but potentially degrade performance and + * increase memory usage. Partitions should not be too small to prevent issues + * in KNN graph construction. 100k - 5M vectors per partition is recommended + * depending on the available host and GPU memory. The partition size is on + * average 2 * (n_rows / npartitions) * dim * sizeof(T). 2 is because of the + * core and augmented vectors. Please account for imbalance in the partition + * sizes (up to 3x in our tests). + */ + size_t npartitions = 1; + /** + * The index quality for the ACE build. + * + * Bigger values increase the index quality. At some point, increasing this will no longer improve + * the quality. + */ + size_t ef_construction = 120; + /** + * Directory to store ACE build artifacts (e.g., KNN graph, optimized graph). + * + * Used when `use_disk` is true or when the graph does not fit in host and GPU + * memory. This should be the fastest disk in the system and hold enough space + * for twice the dataset, final graph, and label mapping. + */ + std::string build_dir = "/tmp/ace_build"; + /** + * Whether to use disk-based storage for ACE build. + * + * When true, enables disk-based operations for memory-efficient graph construction. 
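+ *
+ * A configuration sketch (values and path are illustrative):
+ * @code{.cpp}
+ * cuvs::neighbors::cagra::index_params params;
+ * auto ace            = cuvs::neighbors::cagra::graph_build_params::ace_params{};
+ * ace.npartitions     = 16;
+ * ace.ef_construction = 120;
+ * ace.build_dir       = "/mnt/fast_ssd/ace_build";
+ * ace.use_disk        = true;
+ * params.graph_build_params = ace;
+ * @endcode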
+ */ + bool use_disk = false; + + ace_params() = default; +}; + // **** Experimental **** using iterative_search_params = cuvs::neighbors::search_params; } // namespace graph_build_params diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index 3ba9df63e2..c2bfb1993d 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -132,6 +132,11 @@ struct index : cuvs::neighbors::index { */ virtual void set_ef(int ef) const; + /** + @brief Get file path for disk-backed index + */ + virtual std::string file_path() const { return ""; } + private: int dim_; cuvs::distance::DistanceType metric_; diff --git a/cpp/include/cuvs/util/file_io.hpp b/cpp/include/cuvs/util/file_io.hpp new file mode 100644 index 0000000000..363b1b1ca0 --- /dev/null +++ b/cpp/include/cuvs/util/file_io.hpp @@ -0,0 +1,243 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuvs::util { +/** + * @brief Streambuf that reads from a POSIX file descriptor + */ +class fd_streambuf : public std::streambuf { + int fd_; + std::unique_ptr buffer_; + size_t buffer_size_; + + protected: + int_type underflow() override + { + if (gptr() < egptr()) return traits_type::to_int_type(*gptr()); + ssize_t n = ::read(fd_, buffer_.get(), buffer_size_); + if (n <= 0) return traits_type::eof(); + setg(buffer_.get(), buffer_.get(), buffer_.get() + n); + return traits_type::to_int_type(*gptr()); + } + + public: + explicit fd_streambuf(int fd, size_t buffer_size = 8192) + : fd_(fd), buffer_(new char[buffer_size]), buffer_size_(buffer_size) + { + setg(buffer_.get(), buffer_.get(), buffer_.get()); + } + + ~fd_streambuf() + { + if (fd_ != -1) ::close(fd_); + } + + fd_streambuf(const fd_streambuf&) = delete; + fd_streambuf& operator=(const fd_streambuf&) = delete; + fd_streambuf(fd_streambuf&&) noexcept = default; + fd_streambuf& operator=(fd_streambuf&&) noexcept = default; +}; + +/** + * @brief Istream that reads from a POSIX file descriptor + */ +class fd_istream : public std::istream { + fd_streambuf buf_; + + public: + explicit fd_istream(int fd) : std::istream(&buf_), buf_(fd) {} + + fd_istream(const fd_istream&) = delete; + fd_istream& operator=(const fd_istream&) = delete; + + fd_istream(fd_istream&& o) noexcept : std::istream(std::move(o)), buf_(std::move(o.buf_)) + { + rdbuf(&buf_); + } + + fd_istream& operator=(fd_istream&& o) noexcept + { + std::istream::operator=(std::move(o)); + buf_ = std::move(o.buf_); + rdbuf(&buf_); + return *this; + } +}; + +/** + * @brief RAII wrapper for POSIX file descriptors + * + * Manages file descriptor lifecycle with automatic cleanup. + * Non-copyable, move-only. 
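+ *
+ * A usage sketch (the file path is illustrative):
+ * @code{.cpp}
+ * cuvs::util::file_descriptor fd("/tmp/ace_build/graph.npy", O_RDONLY);
+ * auto stream = fd.make_istream();  // reads through a dup()'ed descriptor
+ * // ... parse a numpy header or other metadata from `stream` ...
+ * @endcode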
+ */ +class file_descriptor { + public: + explicit file_descriptor(int fd = -1) : fd_(fd) {} + + file_descriptor(const std::string& path, int flags, mode_t mode = 0644) + : fd_(open(path.c_str(), flags, mode)), path_(path) + { + if (fd_ == -1) { + RAFT_FAIL("Failed to open file: %s (errno: %d, %s)", path.c_str(), errno, strerror(errno)); + } + } + + file_descriptor(const file_descriptor&) = delete; + file_descriptor& operator=(const file_descriptor&) = delete; + + file_descriptor(file_descriptor&& other) noexcept + : fd_{std::exchange(other.fd_, -1)}, path_{std::move(other.path_)} + { + } + + file_descriptor& operator=(file_descriptor&& other) noexcept + { + std::swap(this->fd_, other.fd_); + std::swap(this->path_, other.path_); + return *this; + } + + ~file_descriptor() noexcept { close(); } + + [[nodiscard]] int get() const noexcept { return fd_; } + [[nodiscard]] bool is_valid() const noexcept { return fd_ != -1; } + + void close() noexcept + { + if (fd_ != -1) { + ::close(fd_); + fd_ = -1; + } + } + + [[nodiscard]] int release() noexcept + { + const int fd = fd_; + fd_ = -1; + return fd; + } + + [[nodiscard]] std::string get_path() const { return path_; } + + /** + * @brief Create an input stream from this file descriptor + * + * Creates an istream that reads directly from the file descriptor using POSIX read(). + * The original descriptor remains valid and unchanged (we duplicate it internally). + * Returns the stream by value (uses move semantics). + * + * @return fd_istream (movable istream) + */ + [[nodiscard]] fd_istream make_istream() const + { + RAFT_EXPECTS(is_valid(), "Invalid file descriptor"); + + // Duplicate the fd to avoid consuming the original + int dup_fd = dup(fd_); + RAFT_EXPECTS(dup_fd != -1, "Failed to duplicate file descriptor"); + + // Create stream that owns the duplicated fd + // Returned by value, uses move semantics + return fd_istream(dup_fd); + } + + private: + int fd_; + std::string path_; +}; + +/** + * @brief Read large file in chunks using pread + * + * Reads a file in chunks to avoid issues with very large reads. + * Uses pread for thread-safe, offset-based reading. + * + * @param fd File descriptor to read from + * @param dest_ptr Destination buffer + * @param total_bytes Total bytes to read + * @param file_offset Starting offset in file + */ +void read_large_file(const file_descriptor& fd, + void* dest_ptr, + const size_t total_bytes, + const uint64_t file_offset); + +/** + * @brief Write large file in chunks using pwrite + * + * Writes data to a file in chunks to avoid issues with very large writes. + * Uses pwrite for thread-safe, offset-based writing. + * + * @param fd File descriptor to write to + * @param data_ptr Source data buffer + * @param total_bytes Total bytes to write + * @param file_offset Starting offset in file + */ +void write_large_file(const file_descriptor& fd, + const void* data_ptr, + const size_t total_bytes, + const uint64_t file_offset); + +/** + * @brief Buffered output stream wrapper + * + * Wraps an std::ostream with a buffer to improve write performance by + * reducing the number of system calls. Automatically flushes on destruction. + * Non-copyable, non-movable. 
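+ *
+ * A usage sketch (file path and buffer size are illustrative):
+ * @code{.cpp}
+ * std::ofstream os("/tmp/ace_build/out.bin", std::ios::binary);
+ * cuvs::util::buffered_ofstream out(&os, 1 << 20);  // 1 MiB staging buffer
+ * std::vector<char> payload(4096, 0);
+ * out.write(payload.data(), payload.size());
+ * out.flush();  // also happens automatically in the destructor
+ * @endcode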
+ */ +class buffered_ofstream { + public: + buffered_ofstream(std::ostream* os, size_t buffer_size) : os_(os), buffer_(buffer_size), pos_(0) + { + } + + ~buffered_ofstream() noexcept { flush(); } + + buffered_ofstream(const buffered_ofstream& res) = delete; + auto operator=(const buffered_ofstream& other) -> buffered_ofstream& = delete; + buffered_ofstream(buffered_ofstream&& other) = delete; + auto operator=(buffered_ofstream&& other) -> buffered_ofstream& = delete; + + void flush() + { + if (pos_ > 0) { + os_->write(reinterpret_cast(&buffer_.front()), pos_); + if (!os_->good()) { RAFT_FAIL("Error writing HNSW file!"); } + pos_ = 0; + } + } + + void write(const char* input, size_t size) + { + if (pos_ + size > buffer_.size()) { flush(); } + std::copy(input, input + size, &buffer_[pos_]); + pos_ += size; + } + + private: + std::vector buffer_; + std::ostream* os_; + size_t pos_; +}; + +} // namespace cuvs::util diff --git a/cpp/include/cuvs/util/host_memory.hpp b/cpp/include/cuvs/util/host_memory.hpp new file mode 100644 index 0000000000..7ca9da1687 --- /dev/null +++ b/cpp/include/cuvs/util/host_memory.hpp @@ -0,0 +1,25 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include +#include + +namespace cuvs::util { + +/** + * @brief Get available host memory from /proc/meminfo + * + * Queries the system for available memory by reading /proc/meminfo. + * This is useful for determining how much host memory can be used + * for buffering or temporary storage. + * + * @return Available memory in bytes + */ +size_t get_free_host_memory(); + +} // namespace cuvs::util diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index fdc5b2b7e7..cf65ef7c4d 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -118,12 +118,7 @@ void build_knn_graph( raft::mdspan, raft::row_major, accessor>( dataset.data_handle(), dataset.extent(0), dataset.extent(1)); - cagra::detail::build_knn_graph(res, - dataset_internal, - knn_graph_internal, - ivf_pq_params.refinement_rate, - ivf_pq_params.build_params, - ivf_pq_params.search_params); + cagra::detail::build_knn_graph(res, dataset_internal, knn_graph_internal, ivf_pq_params); } /** @@ -278,6 +273,15 @@ index build( const index_params& params, raft::mdspan, raft::row_major, Accessor> dataset) { + // Check if ACE dispatch is requested via graph_build_params + if (std::holds_alternative(params.graph_build_params)) { + // ACE expects the dataset to be on host due to the large dataset size + RAFT_EXPECTS(raft::get_device_for_address(dataset.data_handle()) == -1, + "ACE: Dataset must be on host for ACE build"); + auto dataset_view = raft::make_host_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + return cuvs::neighbors::cagra::detail::build_ace(res, params, dataset_view); + } return cuvs::neighbors::cagra::detail::build(res, params, dataset); } diff --git a/cpp/src/neighbors/cagra_build_float.cu b/cpp/src/neighbors/cagra_build_float.cu index fe4c757b72..b3097f7647 100644 --- a/cpp/src/neighbors/cagra_build_float.cu +++ b/cpp/src/neighbors/cagra_build_float.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -8,21 +8,29 @@ namespace cuvs::neighbors::cagra { -#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::device_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::host_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ +#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ + void build_knn_graph(raft::resources const& handle, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view knn_graph, \ + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params params) \ + { \ + cuvs::neighbors::cagra::build_knn_graph(handle, dataset, knn_graph, params); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ } RAFT_INST_CAGRA_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/cagra_build_half.cu b/cpp/src/neighbors/cagra_build_half.cu index 10a995da45..dd57cb87cc 100644 --- a/cpp/src/neighbors/cagra_build_half.cu +++ b/cpp/src/neighbors/cagra_build_half.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,14 @@ namespace cuvs::neighbors::cagra { +void build_knn_graph(raft::resources const& handle, + raft::host_matrix_view dataset, + raft::host_matrix_view knn_graph, + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params params) +{ + cuvs::neighbors::cagra::build_knn_graph(handle, dataset, knn_graph, params); +} + cuvs::neighbors::cagra::index build( raft::resources const& handle, const cuvs::neighbors::cagra::index_params& params, diff --git a/cpp/src/neighbors/cagra_build_int8.cu b/cpp/src/neighbors/cagra_build_int8.cu index 291c3a4bae..d651790662 100644 --- a/cpp/src/neighbors/cagra_build_int8.cu +++ b/cpp/src/neighbors/cagra_build_int8.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -8,21 +8,29 @@ namespace cuvs::neighbors::cagra { -#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::device_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::host_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ +#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ + void build_knn_graph(raft::resources const& handle, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view knn_graph, \ + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params params) \ + { \ + cuvs::neighbors::cagra::build_knn_graph(handle, dataset, knn_graph, params); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ } RAFT_INST_CAGRA_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/cagra_build_uint8.cu b/cpp/src/neighbors/cagra_build_uint8.cu index 6bba2814bf..a819675d9c 100644 --- a/cpp/src/neighbors/cagra_build_uint8.cu +++ b/cpp/src/neighbors/cagra_build_uint8.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -8,21 +8,29 @@ namespace cuvs::neighbors::cagra { -#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::device_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ - } \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::cagra::index_params& params, \ - raft::host_matrix_view dataset) \ - -> cuvs::neighbors::cagra::index \ - { \ - return cuvs::neighbors::cagra::build(handle, params, dataset); \ +#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ + void build_knn_graph(raft::resources const& handle, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view knn_graph, \ + cuvs::neighbors::cagra::graph_build_params::ivf_pq_params params) \ + { \ + cuvs::neighbors::cagra::build_knn_graph(handle, dataset, knn_graph, params); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset) \ + -> cuvs::neighbors::cagra::index \ + { \ + return cuvs::neighbors::cagra::build(handle, params, dataset); \ } RAFT_INST_CAGRA_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh index 755b37c119..9d70f7848c 100644 --- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh +++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh @@ -357,6 +357,11 @@ void extend_core( std::optional> new_dataset_buffer_view, std::optional> new_graph_buffer_view) { + RAFT_EXPECTS(!index.dataset_fd().has_value(), + "Cannot extend a disk-backed CAGRA index. 
Convert it with " + "cuvs::neighbors::hnsw::from_cagra() and load it into memory via " + "cuvs::neighbors::hnsw::deserialize() before calling extend()."); + if (dynamic_cast*>(&index.data()) != nullptr && !new_dataset_buffer_view.has_value()) { RAFT_LOG_WARN( diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 08ba6bf207..5f7389493a 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -7,7 +7,6 @@ #include "../../../core/nvtx.hpp" #include "../../vpq_dataset.cuh" #include "graph_core.cuh" -#include #include #include @@ -17,12 +16,16 @@ #include #include #include +#include +#include #include +#include #include -#include - #include +#include +#include +#include // TODO: This shouldn't be calling spatial/knn APIs #include "../ann_utils.cuh" @@ -31,13 +34,1362 @@ #include #include +#include +#include #include +#include #include #include +#include namespace cuvs::neighbors::cagra::detail { +template +void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size) +{ + if (intermediate_degree >= static_cast(dataset_size)) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu", + dataset_size); + intermediate_degree = dataset_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } +} + +// ACE: Get partition labels for partitioned approach +// TODO(julianmi): Use all neighbors APIs. +template +void ace_get_partition_labels( + raft::resources const& res, + raft::host_matrix_view dataset, + raft::host_matrix_view partition_labels, + raft::host_matrix_view partition_histogram, + size_t min_partition_size, + double sampling_rate = 0.01) +{ + size_t dataset_size = dataset.extent(0); + size_t dataset_dim = dataset.extent(1); + size_t labels_size = partition_labels.extent(0); + size_t labels_dim = partition_labels.extent(1); + RAFT_EXPECTS(dataset_size == labels_size, "Dataset size must match partition labels extent"); + size_t n_partitions = partition_histogram.extent(0); + RAFT_EXPECTS(labels_dim == 2, "Labels must have 2 columns"); + RAFT_EXPECTS(partition_histogram.extent(1) == 2, "Partition histogram must have 2 columns"); + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Sampling vectors from dataset. Uses float conversion on host instead of + // raft::matrix::sample_rows to minimize GPU memory usage. + // TODO(julianmi): Switch to sample_rows when https://github.com/rapidsai/cuvs/issues/1461 is + // addressed. 
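+ // Sample count: a sampling_rate fraction of the dataset, but at least
+ // 100 samples per partition and never more than the dataset itself.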
+ size_t n_samples = dataset_size * sampling_rate; + const size_t min_samples = 100 * n_partitions; + n_samples = std::max(n_samples, min_samples); + n_samples = std::min(n_samples, dataset_size); + RAFT_LOG_DEBUG("ACE: n_samples: %lu", n_samples); + + auto sample_db = raft::make_host_matrix(n_samples, dataset_dim); +#pragma omp parallel for + for (size_t i = 0; i < n_samples; i++) { + size_t j = i * dataset_size / n_samples; + for (size_t k = 0; k < dataset_dim; k++) { + sample_db(i, k) = static_cast(dataset(j, k)); + } + } + auto sample_db_dev = raft::make_device_matrix(res, n_samples, dataset_dim); + raft::update_device( + sample_db_dev.data_handle(), sample_db.data_handle(), sample_db.size(), stream); + + cuvs::cluster::kmeans::balanced_params kmeans_params; + auto centroids_dev = raft::make_device_matrix(res, n_partitions, dataset_dim); + cuvs::cluster::kmeans::fit(res, kmeans_params, sample_db_dev.view(), centroids_dev.view()); + + // Compute distances between dataset and centroid vectors + // Uses float conversion on host instead of batch_load_iterator to minimize GPU memory usage. + const size_t chunk_size = 32 * 1024; + auto _sub_dataset = raft::make_host_matrix(chunk_size, dataset_dim); + auto _sub_distances = raft::make_host_matrix(chunk_size, n_partitions); + auto _sub_dataset_dev = raft::make_device_matrix(res, chunk_size, dataset_dim); + auto _sub_distances_dev = raft::make_device_matrix(res, chunk_size, n_partitions); + size_t report_interval = dataset_size / 10; + report_interval = (report_interval / chunk_size) * chunk_size; + report_interval = std::max(report_interval, chunk_size); + + for (size_t i_base = 0; i_base < dataset_size; i_base += chunk_size) { + const size_t sub_dataset_size = std::min(chunk_size, dataset_size - i_base); + if (i_base % report_interval == 0) { + RAFT_LOG_INFO("ACE: Processing chunk %lu / %lu (%.1f%%)", + i_base, + dataset_size, + static_cast(100 * i_base) / dataset_size); + } + + auto sub_dataset = raft::make_host_matrix_view( + _sub_dataset.data_handle(), sub_dataset_size, dataset_dim); +#pragma omp parallel for + for (size_t i_sub = 0; i_sub < sub_dataset_size; i_sub++) { + size_t i = i_base + i_sub; + for (size_t k = 0; k < dataset_dim; k++) { + sub_dataset(i_sub, k) = static_cast(dataset(i, k)); + } + } + auto sub_dataset_dev = raft::make_device_matrix_view( + _sub_dataset_dev.data_handle(), sub_dataset_size, dataset_dim); + raft::update_device( + _sub_dataset_dev.data_handle(), sub_dataset.data_handle(), sub_dataset.size(), stream); + + auto sub_distances = raft::make_host_matrix_view( + _sub_distances.data_handle(), sub_dataset_size, n_partitions); + auto sub_distances_dev = raft::make_device_matrix_view( + _sub_distances_dev.data_handle(), sub_dataset_size, n_partitions); + + cuvs::distance::pairwise_distance(res, + sub_dataset_dev, + centroids_dev.view(), + sub_distances_dev, + cuvs::distance::DistanceType::L2Expanded); + + raft::update_host( + sub_distances.data_handle(), sub_distances_dev.data_handle(), sub_distances.size(), stream); + raft::resource::sync_stream(res, stream); + + // Find two closest partitions to each dataset vector +#pragma omp parallel for + for (size_t i_sub = 0; i_sub < sub_dataset_size; i_sub++) { + size_t core_label = 0; + size_t augmented_label = 1; + if (sub_distances(i_sub, 0) > sub_distances(i_sub, 1)) { + core_label = 1; + augmented_label = 0; + } + for (size_t c = 2; c < n_partitions; c++) { + if (sub_distances(i_sub, c) < sub_distances(i_sub, core_label)) { + augmented_label = core_label; + core_label = 
c; + } else if (sub_distances(i_sub, c) < sub_distances(i_sub, augmented_label)) { + augmented_label = c; + } + } + size_t i = i_base + i_sub; + partition_labels(i, 0) = core_label; + partition_labels(i, 1) = augmented_label; + +#pragma omp atomic update + partition_histogram(core_label, 0) += 1; +#pragma omp atomic update + partition_histogram(augmented_label, 1) += 1; + } + } +} + +// ACE: Check partition sizes for stable KNN graph construction +template +void ace_check_partition_sizes( + size_t dataset_size, + size_t n_partitions, + raft::host_matrix_view partition_labels, + raft::host_matrix_view partition_histogram, + size_t min_partition_size) +{ + // Collect partition histogram statistics + size_t total_core_vectors = 0; + size_t total_augmented_vectors = 0; + size_t min_core_vectors = dataset_size; + size_t max_core_vectors = 0; + size_t min_augmented_vectors = dataset_size; + size_t max_augmented_vectors = 0; + size_t min_total_vectors = dataset_size; + size_t max_total_vectors = 0; + + for (size_t c = 0; c < n_partitions; c++) { + size_t core_count = partition_histogram(c, 0); + size_t augmented_count = partition_histogram(c, 1); + size_t total_count = core_count + augmented_count; + + if (total_count > 0) { + total_core_vectors += core_count; + total_augmented_vectors += augmented_count; + + min_core_vectors = std::min(min_core_vectors, core_count); + max_core_vectors = std::max(max_core_vectors, core_count); + min_augmented_vectors = std::min(min_augmented_vectors, augmented_count); + max_augmented_vectors = std::max(max_augmented_vectors, augmented_count); + min_total_vectors = std::min(min_total_vectors, total_count); + max_total_vectors = std::max(max_total_vectors, total_count); + } + } + + double avg_core_vectors = static_cast(total_core_vectors) / n_partitions; + double avg_augmented_vectors = static_cast(total_augmented_vectors) / n_partitions; + double avg_total_vectors = 2.0 * static_cast(dataset_size) / n_partitions; + double expected_avg_vectors = 2.0 * static_cast(dataset_size) / n_partitions; + + RAFT_LOG_INFO("ACE: Core vectors - Total: %lu, Avg: %.1f, Min: %lu, Max: %lu", + total_core_vectors, + avg_core_vectors, + min_core_vectors, + max_core_vectors); + RAFT_LOG_INFO("ACE: Augmented vectors - Total: %lu, Avg: %.1f, Min: %lu, Max: %lu", + total_augmented_vectors, + avg_augmented_vectors, + min_augmented_vectors, + max_augmented_vectors); + RAFT_LOG_INFO("ACE: Total per partition - Total: %lu, Avg: %.1f, Min: %lu, Max: %lu", + total_core_vectors + total_augmented_vectors, + avg_total_vectors, + min_total_vectors, + max_total_vectors); + + // Check for partition imbalance and issue warnings + size_t very_small_threshold = min_partition_size; + size_t very_large_threshold = static_cast(5.0 * expected_avg_vectors); + + for (size_t c = 0; c < n_partitions; c++) { + size_t total_count = partition_histogram(c, 0) + partition_histogram(c, 1); + + if (total_count > 0 && total_count < very_small_threshold) { + RAFT_LOG_WARN( + "ACE: Partition %lu is very small (%lu vectors, expected ~%.1f). This may affect graph " + "quality.", + c, + total_count, + expected_avg_vectors); + } else if (total_count > very_large_threshold) { + RAFT_LOG_WARN( + "ACE: Partition %lu is very large (%lu vectors, expected ~%.1f, threshold: %lu). 
This may " + "indicate imbalance and can lead to memory issues in restricted environments.", + c, + total_count, + expected_avg_vectors, + very_large_threshold); + } + } +} + +// ACE: Create forward/backward mappings between original and reordered vector IDs +// The in-memory path can be parallelized but the disk path requires ordering. +template +void ace_create_forward_and_backward_lists( + size_t dataset_size, + size_t n_partitions, + raft::host_matrix_view partition_labels, + raft::host_matrix_view partition_histogram, + raft::host_vector_view core_forward_mapping, + raft::host_vector_view core_backward_mapping, + raft::host_vector_view augmented_backward_mapping, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets) +{ + core_partition_offsets(0) = 0; + augmented_partition_offsets(0) = 0; + for (size_t c = 1; c < n_partitions; c++) { + core_partition_offsets(c) = core_partition_offsets(c - 1) + partition_histogram(c - 1, 0); + augmented_partition_offsets(c) = + augmented_partition_offsets(c - 1) + partition_histogram(c - 1, 1); + } + + if (static_cast(core_forward_mapping.extent(0)) == 0) { + // Memory path: both backward mappings + RAFT_EXPECTS(static_cast(core_backward_mapping.extent(0)) == dataset_size, + "core_backward_mapping must be of size dataset_size"); + RAFT_EXPECTS(static_cast(augmented_backward_mapping.extent(0)) == dataset_size, + "augmented_backward_mapping must be of size dataset_size"); +#pragma omp parallel for + for (size_t i = 0; i < dataset_size; i++) { + size_t core_partition_id = partition_labels(i, 0); + size_t core_id; +#pragma omp atomic capture + core_id = core_partition_offsets(core_partition_id)++; + RAFT_EXPECTS(core_id < dataset_size, "Vector ID must be smaller than dataset_size"); + core_backward_mapping(core_id) = i; + + size_t augmented_partition_id = partition_labels(i, 1); + size_t augmented_id; +#pragma omp atomic capture + augmented_id = augmented_partition_offsets(augmented_partition_id)++; + RAFT_EXPECTS(augmented_id < dataset_size, "Vector ID must be smaller than dataset_size"); + augmented_backward_mapping(augmented_id) = i; + } + } else { + // Disk path: all three mappings + RAFT_EXPECTS(static_cast(core_forward_mapping.extent(0)) == dataset_size, + "core_forward_mapping must be of size dataset_size"); + RAFT_EXPECTS(static_cast(core_backward_mapping.extent(0)) == dataset_size, + "core_backward_mapping must be of size dataset_size"); + RAFT_EXPECTS(static_cast(augmented_backward_mapping.extent(0)) == dataset_size, + "augmented_backward_mapping must be of size dataset_size"); + for (size_t i = 0; i < dataset_size; i++) { + size_t core_partition_id = partition_labels(i, 0); + size_t core_id; + core_id = core_partition_offsets(core_partition_id)++; + RAFT_EXPECTS(core_id < dataset_size, "Vector ID must be smaller than dataset_size"); + core_backward_mapping(core_id) = i; + core_forward_mapping(i) = core_id; + + size_t augmented_partition_id = partition_labels(i, 1); + size_t augmented_id; + augmented_id = augmented_partition_offsets(augmented_partition_id)++; + RAFT_EXPECTS(augmented_id < dataset_size, "Vector ID must be smaller than dataset_size"); + augmented_backward_mapping(augmented_id) = i; + } + } + + // Restore idxptr arrays + for (size_t c = n_partitions; c > 0; c--) { + core_partition_offsets(c) = core_partition_offsets(c - 1); + augmented_partition_offsets(c) = augmented_partition_offsets(c - 1); + } + core_partition_offsets(0) = 0; + augmented_partition_offsets(0) = 0; +} + +// ACE: 
Gather partition dataset +template +void ace_gather_partition_dataset( + size_t core_sub_dataset_size, + size_t augmented_sub_dataset_size, + size_t dataset_dim, + size_t partition_id, + raft::host_matrix_view dataset, + raft::host_vector_view core_backward_mapping, + raft::host_vector_view augmented_backward_mapping, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets, + raft::host_matrix_view sub_dataset) +{ + const size_t vector_size_bytes = dataset_dim * sizeof(T); + + // Copy core partition vectors +#pragma omp parallel for + for (size_t j = 0; j < core_sub_dataset_size; j++) { + size_t i = core_backward_mapping(j + core_partition_offsets(partition_id)); + memcpy(&sub_dataset(j, 0), &dataset(i, 0), vector_size_bytes); + } + + // Copy augmented partition vectors (2nd closest partition) +#pragma omp parallel for + for (size_t j = 0; j < augmented_sub_dataset_size; j++) { + size_t i = augmented_backward_mapping(j + augmented_partition_offsets(partition_id)); + memcpy(&sub_dataset(j + core_sub_dataset_size, 0), &dataset(i, 0), vector_size_bytes); + } +} + +// ACE: Adjust IDs from core and augmented partitions to global reordered IDs +template +void ace_adjust_sub_graph_ids( + size_t core_sub_dataset_size, + size_t augmented_sub_dataset_size, + size_t graph_degree, + size_t partition_id, + raft::host_matrix_view sub_search_graph, + raft::host_matrix_view search_graph, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets, + raft::host_vector_view core_backward_mapping, + raft::host_vector_view augmented_backward_mapping) +{ +#pragma omp parallel for + for (size_t i = 0; i < core_sub_dataset_size; i++) { + // Map row index from local → reordered → original + size_t i_reordered = i + core_partition_offsets(partition_id); + size_t i_original = core_backward_mapping(i_reordered); + + for (size_t k = 0; k < graph_degree; k++) { + size_t j = sub_search_graph(i, k); + size_t j_original; + + if (j < core_sub_dataset_size) { + // core partition neighbor: local → core reordered → original + size_t j_reordered = j + core_partition_offsets(partition_id); + j_original = core_backward_mapping(j_reordered); + } else { + // Augmented partition neighbor: local → augmented reordered → original + size_t j_augmented = j - core_sub_dataset_size; + j_original = + augmented_backward_mapping(j_augmented + augmented_partition_offsets(partition_id)); + } + search_graph(i_original, k) = j_original; + } + } +} + +// ACE: Adjust ids in sub search graph in place for disk version +template +void ace_adjust_sub_graph_ids_disk( + size_t core_sub_dataset_size, + size_t augmented_sub_dataset_size, + size_t graph_degree, + size_t partition_id, + raft::host_matrix_view sub_search_graph, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets, + raft::host_vector_view augmented_backward_mapping, + raft::host_vector_view core_forward_mapping) +{ +#pragma omp parallel for + for (size_t i = 0; i < core_sub_dataset_size; i++) { + for (size_t k = 0; k < graph_degree; k++) { + size_t j = sub_search_graph(i, k); + if (j < core_sub_dataset_size) { + // core partition neighbor: local → core reordered + sub_search_graph(i, k) = j + core_partition_offsets(partition_id); + } else { + // Augmented partition neighbor: local → augmented reordered→ original → core reordered + size_t j_augmented = j - core_sub_dataset_size; + size_t j_original = + augmented_backward_mapping(j_augmented + 
augmented_partition_offsets(partition_id)); + sub_search_graph(i, k) = core_forward_mapping(j_original); + } + } + } +} + +// ACE: Reorder dataset based on partition assignments and store to disk +// Writes two files: reordered_dataset.npy (core partitions) and augmented_dataset.npy (secondary +// partitions). Uses buffered writes optimized for NVMe storage. +template +void ace_reorder_and_store_dataset( + raft::resources const& res, + const std::string& build_dir, + raft::host_matrix_view dataset, + raft::host_matrix_view partition_labels, + raft::host_matrix_view partition_histogram, + raft::host_vector_view core_backward_mapping, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets, + cuvs::util::file_descriptor& reordered_fd, + cuvs::util::file_descriptor& augmented_fd, + cuvs::util::file_descriptor& mapping_fd, + size_t reordered_header_size, + size_t augmented_header_size, + size_t mapping_header_size) +{ + auto start = std::chrono::high_resolution_clock::now(); + + size_t dataset_size = dataset.extent(0); + size_t dataset_dim = dataset.extent(1); + size_t n_partitions = partition_histogram.extent(0); + + RAFT_LOG_DEBUG( + "ACE: Reordering and storing dataset to disk (%lu vectors, %lu dimensions, %lu partitions)", + dataset_size, + dataset_dim, + n_partitions); + + // Calculate total sizes for pre-allocation + size_t total_core_vectors = 0; + size_t total_augmented_vectors = 0; + size_t max_core_vectors = 0; + size_t max_augmented_vectors = 0; + for (size_t p = 0; p < n_partitions; p++) { + total_core_vectors += partition_histogram(p, 0); + total_augmented_vectors += partition_histogram(p, 1); + max_core_vectors = std::max(max_core_vectors, partition_histogram(p, 0)); + max_augmented_vectors = std::max(max_augmented_vectors, partition_histogram(p, 1)); + } + RAFT_EXPECTS(total_core_vectors == dataset_size, + "Total core vectors must be equal to dataset size"); + RAFT_EXPECTS(total_augmented_vectors == dataset_size, + "Total augmented vectors must be equal to dataset size"); + + // Pre-allocate file space for better performance + const size_t vector_size = dataset_dim * sizeof(T); + size_t reordered_file_size = total_core_vectors * vector_size; + size_t augmented_file_size = total_augmented_vectors * vector_size; + + RAFT_LOG_DEBUG("ACE: Reordered dataset: %lu core vectors (%.2f GiB)", + total_core_vectors, + reordered_file_size / (1024.0 * 1024.0 * 1024.0)); + RAFT_LOG_DEBUG("ACE: Augmented dataset: %lu secondary vectors (%.2f GiB)", + total_augmented_vectors, + augmented_file_size / (1024.0 * 1024.0 * 1024.0)); + + // Calculate partition start offsets for reordered and augmented datasets + auto core_partition_starts = raft::make_host_vector(n_partitions + 1); + memset(core_partition_starts.data_handle(), 0, (n_partitions + 1) * sizeof(size_t)); + auto augmented_partition_starts = raft::make_host_vector(n_partitions + 1); + memset(augmented_partition_starts.data_handle(), 0, (n_partitions + 1) * sizeof(size_t)); + auto core_partition_current = raft::make_host_vector(n_partitions); + memset(core_partition_current.data_handle(), 0, n_partitions * sizeof(size_t)); + auto augmented_partition_current = raft::make_host_vector(n_partitions); + memset(augmented_partition_current.data_handle(), 0, n_partitions * sizeof(size_t)); + + for (size_t p = 0; p < n_partitions; p++) { + core_partition_starts(p + 1) = core_partition_starts(p) + partition_histogram(p, 0); + augmented_partition_starts(p + 1) = augmented_partition_starts(p) + 
partition_histogram(p, 1); + } + + const size_t free_memory = cuvs::util::get_free_host_memory(); + // Conservatively allocate 50% of free memory per partition. Accounts for internal buffers and + // overhead. + // TODO: Adjust overhead if needed. + const size_t memory_per_partition = 0.5 * free_memory / (n_partitions * 2); + size_t disk_write_size = raft::bound_by_power_of_two(memory_per_partition); + // 64MB should be enough to saturate typical NVMe SSDs. + disk_write_size = std::min(disk_write_size, 64 * 1024 * 1024); + size_t vectors_per_buffer = std::max(64, disk_write_size / vector_size); + + RAFT_LOG_DEBUG("ACE: Reorder buffers: %lu vectors per buffer (%.2f MiB)", + vectors_per_buffer, + vectors_per_buffer * vector_size / (1024.0 * 1024.0)); + + std::vector> core_buffers; + std::vector> augmented_buffers; + auto core_buffer_counts = raft::make_host_vector(n_partitions); + auto augmented_buffer_counts = raft::make_host_vector(n_partitions); + + core_buffers.reserve(n_partitions); + augmented_buffers.reserve(n_partitions); + + for (size_t p = 0; p < n_partitions; p++) { + core_buffers.emplace_back(raft::make_host_matrix(vectors_per_buffer, dataset_dim)); + augmented_buffers.emplace_back( + raft::make_host_matrix(vectors_per_buffer, dataset_dim)); + core_buffer_counts(p) = 0; + augmented_buffer_counts(p) = 0; + } + auto flush_core_buffer = [&](size_t partition_id) { + const size_t count = core_buffer_counts(partition_id); + if (count > 0) { + const size_t bytes_to_write = count * vector_size; + const size_t file_offset = + (core_partition_starts(partition_id) + core_partition_current(partition_id)) * vector_size + + reordered_header_size; + + cuvs::util::write_large_file( + reordered_fd, core_buffers[partition_id].data_handle(), bytes_to_write, file_offset); + + core_partition_current(partition_id) += count; + core_buffer_counts(partition_id) = 0; + } + }; + + auto flush_augmented_buffer = [&](size_t partition_id) { + const size_t count = augmented_buffer_counts(partition_id); + if (count > 0) { + const size_t bytes_to_write = count * vector_size; + const size_t file_offset = + (augmented_partition_starts(partition_id) + augmented_partition_current(partition_id)) * + vector_size + + augmented_header_size; + + cuvs::util::write_large_file( + augmented_fd, augmented_buffers[partition_id].data_handle(), bytes_to_write, file_offset); + + augmented_partition_current(partition_id) += count; + augmented_buffer_counts(partition_id) = 0; + } + }; + + size_t vectors_processed = 0; + const size_t log_interval = std::max(dataset_size / 10, size_t(1)); + for (size_t i = 0; i < dataset_size; i++) { + size_t core_partition = partition_labels(i, 0); + size_t secondary_partition = partition_labels(i, 1); + + // Add vector to core partition buffer + size_t core_buffer_row = core_buffer_counts(core_partition); + memcpy( + &core_buffers[core_partition](core_buffer_row, 0), &dataset(i, 0), dataset_dim * sizeof(T)); + core_buffer_counts(core_partition)++; + + // Flush core buffer if full + if (core_buffer_counts(core_partition) >= vectors_per_buffer) { + flush_core_buffer(core_partition); + } + + // Add vector to augmented partition buffer + size_t augmented_buffer_row = augmented_buffer_counts(secondary_partition); + memcpy(&augmented_buffers[secondary_partition](augmented_buffer_row, 0), + &dataset(i, 0), + dataset_dim * sizeof(T)); + augmented_buffer_counts(secondary_partition)++; + + // Flush augmented buffer if full + if (augmented_buffer_counts(secondary_partition) >= vectors_per_buffer) { + 
flush_augmented_buffer(secondary_partition); + } + + vectors_processed++; + if (vectors_processed % log_interval == 0) { + RAFT_LOG_INFO("ACE: Processed %lu/%lu vectors (%.1f%%)", + vectors_processed, + dataset_size, + 100.0 * vectors_processed / dataset_size); + } + } + + // Flush all remaining buffers + RAFT_LOG_DEBUG("ACE: Flushing remaining buffers..."); +#pragma omp parallel sections + { +#pragma omp section + { + for (size_t p = 0; p < n_partitions; p++) { + flush_core_buffer(p); + } + } +#pragma omp section + { + for (size_t p = 0; p < n_partitions; p++) { + flush_augmented_buffer(p); + } + } + } + + const size_t mapping_file_size = dataset_size * sizeof(IdxT); + cuvs::util::write_large_file( + mapping_fd, core_backward_mapping.data_handle(), mapping_file_size, mapping_header_size); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + + // Calculate total bytes written + size_t total_bytes_written = reordered_file_size + augmented_file_size + mapping_file_size; + double throughput_mb_s = + elapsed_ms > 0 ? (total_bytes_written / (1024.0 * 1024.0)) / (elapsed_ms / 1000.0) : 0.0; + + RAFT_LOG_INFO( + "ACE: Dataset (%.2f GiB reordered, %.2f GiB augmented, %.2f GiB mapping) reordering completed " + "in %ld ms (%.1f MiB/s)", + reordered_file_size / (1024.0 * 1024.0 * 1024.0), + augmented_file_size / (1024.0 * 1024.0 * 1024.0), + mapping_file_size / (1024.0 * 1024.0 * 1024.0), + elapsed_ms, + throughput_mb_s); +} + +// ACE: Load partition dataset and augmented dataset from disk +template +void ace_load_partition_dataset_from_disk( + raft::resources const& res, + const std::string& build_dir, + size_t partition_id, + size_t dataset_dim, + raft::host_matrix_view partition_histogram, + raft::host_vector_view core_partition_offsets, + raft::host_vector_view augmented_partition_offsets, + raft::host_matrix_view sub_dataset) +{ + size_t n_partitions = partition_histogram.extent(0); + + RAFT_LOG_DEBUG("ACE: Loading partition %lu dataset from disk", partition_id); + + size_t core_size = partition_histogram(partition_id, 0); + size_t augmented_size = partition_histogram(partition_id, 1); + size_t total_partition_size = core_size + augmented_size; + + RAFT_LOG_DEBUG("ACE: Partition %lu: %lu core + %lu augmented = %lu total vectors", + partition_id, + core_size, + augmented_size, + total_partition_size); + + RAFT_EXPECTS(static_cast(sub_dataset.extent(0)) == total_partition_size, + "sub_dataset rows (%lu) must match total partition size (%lu)", + sub_dataset.extent(0), + total_partition_size); + RAFT_EXPECTS(static_cast(sub_dataset.extent(1)) == dataset_dim, + "sub_dataset columns (%lu) must match dataset dimensions (%lu)", + sub_dataset.extent(1), + dataset_dim); + + const size_t vector_size = dataset_dim * sizeof(T); + + const std::string reordered_dataset_path = build_dir + "/reordered_dataset.npy"; + const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; + + if (!std::filesystem::exists(reordered_dataset_path)) { + RAFT_FAIL("ACE: Required file does not exist: %s", reordered_dataset_path.c_str()); + } + if (!std::filesystem::exists(augmented_dataset_path)) { + RAFT_FAIL("ACE: Required file does not exist: %s", augmented_dataset_path.c_str()); + } + + size_t core_header_size = 0; + size_t augmented_header_size = 0; + size_t core_file_offset = 0; + size_t augmented_file_offset = 0; + { + std::ifstream is(reordered_dataset_path, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open 
file %s", reordered_dataset_path.c_str()); } + auto start_pos = is.tellg(); + raft::detail::numpy_serializer::read_header(is); + core_header_size = static_cast(is.tellg() - start_pos); + } + { + std::ifstream is(augmented_dataset_path, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", augmented_dataset_path.c_str()); } + auto start_pos = is.tellg(); + raft::detail::numpy_serializer::read_header(is); + augmented_header_size = static_cast(is.tellg() - start_pos); + } + + for (size_t p = 0; p < partition_id; p++) { + core_file_offset += partition_histogram(p, 0); + augmented_file_offset += partition_histogram(p, 1); + } + + core_file_offset *= vector_size; + augmented_file_offset *= vector_size; + + core_file_offset += core_header_size; + augmented_file_offset += augmented_header_size; + + RAFT_LOG_DEBUG("ACE: Core file offset: %lu bytes, Augmented file offset: %lu bytes", + core_file_offset, + augmented_file_offset); + + // Read core and augmented data in parallel + std::exception_ptr core_exception = nullptr; + std::exception_ptr augmented_exception = nullptr; + +#pragma omp parallel sections + { +#pragma omp section + { + try { + if (core_size > 0) { + RAFT_LOG_DEBUG( + "ACE: Reading %lu core vectors from offset %lu", core_size, core_file_offset); + cuvs::util::file_descriptor reordered_fd(reordered_dataset_path, O_RDONLY); + const size_t core_bytes = core_size * vector_size; + cuvs::util::read_large_file( + reordered_fd, sub_dataset.data_handle(), core_bytes, core_file_offset); + } + } catch (...) { + core_exception = std::current_exception(); + } + } +#pragma omp section + { + try { + if (augmented_size > 0) { + RAFT_LOG_DEBUG("ACE: Reading %lu augmented vectors from offset %lu", + augmented_size, + augmented_file_offset); + cuvs::util::file_descriptor augmented_fd(augmented_dataset_path, O_RDONLY); + const size_t augmented_bytes = augmented_size * vector_size; + T* augmented_dest = sub_dataset.data_handle() + (core_size * dataset_dim); + cuvs::util::read_large_file( + augmented_fd, augmented_dest, augmented_bytes, augmented_file_offset); + } + } catch (...) { + augmented_exception = std::current_exception(); + } + } + } + + // Check for exceptions from parallel sections + if (core_exception) { std::rethrow_exception(core_exception); } + if (augmented_exception) { std::rethrow_exception(augmented_exception); } +} + +// Build CAGRA index using ACE (Augmented Core Extraction) partitioning +// ACE enables building indices for datasets too large to fit in GPU memory by: +// 1. Partitioning the dataset using balanced k-means in core (non-overlapping) and augmented +// (second-closest) partitions +// 2. Building sub-indices for each partition independently +// 3. Concatenating sub-graphs (of core partitions) into a final unified index +// Supports both in-memory and disk-based modes depending on available host memory. +// In disk mode, the graph is stored in build_dir and dataset is reordered on disk. +// The returned index is not usable for search. Use the created files for search instead. 
+template +index build_ace(raft::resources const& res, + const index_params& params, + raft::host_matrix_view dataset) +{ + // Extract ACE parameters from graph_build_params + RAFT_EXPECTS( + std::holds_alternative(params.graph_build_params), + "ACE build requires graph_build_params to be set to ace_params"); + + auto ace_params = std::get(params.graph_build_params); + size_t npartitions = ace_params.npartitions; + size_t ef_construction = ace_params.ef_construction; + std::string build_dir = ace_params.build_dir; + bool use_disk = ace_params.use_disk; + + common::nvtx::range function_scope( + "cagra::build_ace(%zu, %zu, %zu)", + params.intermediate_graph_degree, + params.graph_degree, + npartitions); + + size_t dataset_size = dataset.extent(0); + size_t dataset_dim = dataset.extent(1); + + RAFT_EXPECTS(dataset_size > 0, "ACE: Dataset must not be empty"); + if (dataset_size < 1000) { + RAFT_LOG_WARN("ACE: Very small dataset size (%zu), consider using regular CAGRA build instead.", + dataset_size); + } + RAFT_EXPECTS(dataset_dim > 0, "ACE: Dataset dimension must be greater than 0"); + RAFT_EXPECTS(params.intermediate_graph_degree > 0, + "ACE: Intermediate graph degree must be greater than 0"); + RAFT_EXPECTS(params.graph_degree > 0, "ACE: Graph degree must be greater than 0"); + + size_t n_partitions = npartitions; + RAFT_EXPECTS(n_partitions > 0, "ACE: npartitions must be greater than 0"); + + size_t min_required_per_partition = 1000; + if (n_partitions > dataset_size / min_required_per_partition) { + n_partitions = dataset_size / min_required_per_partition; + if (n_partitions < 2) { + RAFT_LOG_WARN( + "ACE: Reduced number of partitions to the minimum of 2 to avoid tiny partitions. Consider " + "using regular CAGRA build instead."); + n_partitions = 2; + } else { + RAFT_LOG_WARN("ACE: Reduced number of partitions to %zu to avoid tiny partitions", + n_partitions); + } + } + + auto total_start = std::chrono::high_resolution_clock::now(); + RAFT_LOG_INFO("ACE: Starting partitioned CAGRA build with %zu partitions", n_partitions); + + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + + // Track whether to clean up build directory on failure + bool cleanup_on_failure = false; + + try { + check_graph_degree(intermediate_degree, graph_degree, dataset_size); + + size_t available_memory = cuvs::util::get_free_host_memory(); + + // Optimistic memory model: focus on largest arrays, assumes all partitions are of equal size + // For memory path: + // - Partition labes (core + augmented): 2 * dataset_size * sizeof(IdxT) + // - Backward ID mapping arrays (core + augmented): 2 * dataset_size * sizeof(IdxT) + // - Per-partition dataset (2x for imbalanced partitions): 4 * (dataset_size / n_partitions) * + // dataset_dim * sizeof(T) + // - Per-partition graph during build: (dataset_size / n_partitions) * (intermediate + final) + // * sizeof(IdxT) + // - Final assembled graph: dataset_size * graph_degree * sizeof(IdxT) + size_t ace_partition_labels_size = 2 * dataset_size * sizeof(IdxT); + size_t ace_id_mapping_size = 2 * dataset_size * sizeof(IdxT); + size_t ace_sub_dataset_size = 4 * (dataset_size / n_partitions) * dataset_dim * sizeof(T); + size_t ace_sub_graph_size = + (dataset_size / n_partitions) * (intermediate_degree + graph_degree) * sizeof(IdxT); + size_t cagra_graph_size = dataset_size * graph_degree * sizeof(IdxT); + size_t total_size = ace_partition_labels_size + ace_id_mapping_size + ace_sub_dataset_size + + ace_sub_graph_size + 
cagra_graph_size; + RAFT_LOG_INFO("ACE: Estimated host memory required: %.2f GiB, available: %.2f GiB", + total_size / (1024.0 * 1024.0 * 1024.0), + available_memory / (1024.0 * 1024.0 * 1024.0)); + // TODO: Adjust overhead factor if needed + bool host_memory_limited = static_cast(0.8 * available_memory) < total_size; + + // GPU is mostly limited by the index size (update_graph() in the end of this routine). + // Check if GPU has enough memory for the final graph or use disk mode instead. + // TODO: Extend model or use managed memory if running out of GPU memory. + auto available_gpu_memory = rmm::available_device_memory().second; + bool gpu_memory_limited = static_cast(0.8 * available_gpu_memory) < cagra_graph_size; + RAFT_LOG_INFO("ACE: Estimated GPU memory required: %.2f GiB, available: %.2f GiB", + cagra_graph_size / (1024.0 * 1024.0 * 1024.0), + available_gpu_memory / (1024.0 * 1024.0 * 1024.0)); + + bool use_disk_mode = use_disk || host_memory_limited || gpu_memory_limited; + if (use_disk_mode) { + bool valid_build_dir = !build_dir.empty(); + valid_build_dir &= build_dir.length() <= 255; + valid_build_dir &= build_dir.find('\0') == std::string::npos; + valid_build_dir &= build_dir.find("//") == std::string::npos; + if (!valid_build_dir) { + RAFT_LOG_WARN("ACE: Invalid build_dir path, resetting to default: /tmp/ace_build"); + build_dir = "/tmp/ace_build"; + } + if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) { + RAFT_EXPECTS(false, "Failed to create ACE build directory: %s", build_dir.c_str()); + } + } + + if (host_memory_limited && gpu_memory_limited) { + RAFT_LOG_INFO( + "ACE: Graph does not fit in host and GPU memory. Using disk-mode with temporary storage %s", + build_dir.c_str()); + } else if (host_memory_limited) { + RAFT_LOG_INFO( + "ACE: Graph does not fit in host memory. Using disk-mode with temporary storage %s", + build_dir.c_str()); + } else if (gpu_memory_limited) { + RAFT_LOG_INFO( + "ACE: Graph does not fit in GPU memory. Using disk-mode with temporary storage %s", + build_dir.c_str()); + } else if (use_disk) { + RAFT_LOG_INFO( + "ACE: Graph fits in host and GPU memory but disk mode is forced. Using disk-mode with " + "temporary storage %s", + build_dir.c_str()); + } else { + RAFT_LOG_INFO("ACE: Graph fits in host and GPU memory. Using in-memory mode."); + } + + // Preallocate space for files for better performance and fail early if not enough space. 
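// Rough sizing example for the artifacts created below (illustrative numbers, assuming
// float32 vectors and 32-bit ids): 100M vectors at dim 128 need ~47.7 GiB each for
// reordered_dataset.npy and augmented_dataset.npy, ~0.4 GiB for dataset_mapping.npy,
// and ~23.8 GiB for a degree-64 cagra_graph.npy, i.e. roughly twice the dataset plus
// the final graph in free space on the build_dir device.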
+ cuvs::util::file_descriptor reordered_fd; + cuvs::util::file_descriptor augmented_fd; + cuvs::util::file_descriptor mapping_fd; + cuvs::util::file_descriptor graph_fd; + size_t reordered_header_size = 0; + size_t augmented_header_size = 0; + size_t mapping_header_size = 0; + size_t graph_header_size = 0; + + if (use_disk_mode) { + if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) { + RAFT_EXPECTS(false, "Failed to create ACE build directory: %s", build_dir.c_str()); + } + // Mark for cleanup if we fail after creating the directory + cleanup_on_failure = true; + + // Helper lambda to write numpy header to file descriptor + auto write_numpy_header = [](int fd, + const std::vector& shape, + const raft::detail::numpy_serializer::dtype_t& dtype) { + std::stringstream ss; + + const bool fortran_order = false; + const raft::detail::numpy_serializer::header_t header = {dtype, fortran_order, shape}; + + raft::detail::numpy_serializer::write_header(ss, header); + + std::string header_str = ss.str(); + ssize_t written = write(fd, header_str.data(), header_str.size()); + if (written < 0 || static_cast(written) != header_str.size()) { + RAFT_FAIL("Failed to write numpy header to file descriptor"); + } + return header_str.size(); + }; + + // Create and allocate dataset file + reordered_fd = cuvs::util::file_descriptor( + build_dir + "/reordered_dataset.npy", O_CREAT | O_RDWR | O_TRUNC, 0644); + { + std::stringstream ss; + const auto dtype = raft::detail::numpy_serializer::get_numpy_dtype(); + const bool fortran_order = false; + const raft::detail::numpy_serializer::header_t header = { + dtype, fortran_order, {dataset_size, dataset_dim}}; + raft::detail::numpy_serializer::write_header(ss, header); + reordered_header_size = ss.str().size(); + } + if (posix_fallocate(reordered_fd.get(), + 0, + reordered_header_size + dataset_size * dataset_dim * sizeof(T)) != 0) { + RAFT_FAIL("Failed to pre-allocate space for reordered dataset file"); + } + { + auto dtype_for_dataset = raft::detail::numpy_serializer::get_numpy_dtype(); + RAFT_LOG_DEBUG("Writing reordered_dataset.npy header: shape=[%zu,%zu], dtype=%c", + dataset_size, + dataset_dim, + dtype_for_dataset.kind); + if (lseek(reordered_fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of reordered dataset file"); + } + write_numpy_header(reordered_fd.get(), {dataset_size, dataset_dim}, dtype_for_dataset); + } + + // Create and allocate augmented dataset file + augmented_fd = cuvs::util::file_descriptor( + build_dir + "/augmented_dataset.npy", O_CREAT | O_RDWR | O_TRUNC, 0644); + { + std::stringstream ss; + const auto dtype = raft::detail::numpy_serializer::get_numpy_dtype(); + const bool fortran_order = false; + const raft::detail::numpy_serializer::header_t header = { + dtype, fortran_order, {dataset_size, dataset_dim}}; + raft::detail::numpy_serializer::write_header(ss, header); + augmented_header_size = ss.str().size(); + } + if (posix_fallocate(augmented_fd.get(), + 0, + augmented_header_size + dataset_size * dataset_dim * sizeof(T)) != 0) { + RAFT_FAIL("Failed to pre-allocate space for augmented dataset file"); + } + // Seek to beginning before writing header + if (lseek(augmented_fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of augmented dataset file"); + } + write_numpy_header(augmented_fd.get(), + {dataset_size, dataset_dim}, + raft::detail::numpy_serializer::get_numpy_dtype()); + + // Create and allocate mapping file + mapping_fd = cuvs::util::file_descriptor( + build_dir + 
"/dataset_mapping.npy", O_CREAT | O_RDWR | O_TRUNC, 0644); + { + std::stringstream ss; + const auto dtype = raft::detail::numpy_serializer::get_numpy_dtype(); + const bool fortran_order = false; + const raft::detail::numpy_serializer::header_t header = { + dtype, fortran_order, {dataset_size}}; + raft::detail::numpy_serializer::write_header(ss, header); + mapping_header_size = ss.str().size(); + } + if (posix_fallocate(mapping_fd.get(), 0, mapping_header_size + dataset_size * sizeof(IdxT)) != + 0) { + RAFT_FAIL("Failed to pre-allocate space for dataset mapping file"); + } + { + auto dtype_for_mapping = raft::detail::numpy_serializer::get_numpy_dtype(); + RAFT_LOG_DEBUG("Writing dataset_mapping.npy header: shape=[%zu], dtype=%c", + dataset_size, + dtype_for_mapping.kind); + if (lseek(mapping_fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of mapping file"); + } + write_numpy_header(mapping_fd.get(), {dataset_size}, dtype_for_mapping); + } + + // Create and allocate graph file + graph_fd = cuvs::util::file_descriptor( + build_dir + "/cagra_graph.npy", O_CREAT | O_RDWR | O_TRUNC, 0644); + { + std::stringstream ss; + const auto dtype = raft::detail::numpy_serializer::get_numpy_dtype(); + const bool fortran_order = false; + const raft::detail::numpy_serializer::header_t header = { + dtype, fortran_order, {dataset_size, graph_degree}}; + raft::detail::numpy_serializer::write_header(ss, header); + graph_header_size = ss.str().size(); + } + if (posix_fallocate(graph_fd.get(), 0, graph_header_size + cagra_graph_size) != 0) { + RAFT_FAIL("Failed to pre-allocate space for graph file"); + } + { + auto dtype_for_graph = raft::detail::numpy_serializer::get_numpy_dtype(); + RAFT_LOG_DEBUG("Writing cagra_graph.npy header: shape=[%zu,%zu], dtype=%c", + dataset_size, + graph_degree, + dtype_for_graph.kind); + if (lseek(graph_fd.get(), 0, SEEK_SET) == -1) { + RAFT_FAIL("Failed to seek to beginning of graph file"); + } + write_numpy_header(graph_fd.get(), {dataset_size, graph_degree}, dtype_for_graph); + } + + RAFT_LOG_DEBUG( + "ACE: Wrote numpy headers (reordered: %zu, augmented: %zu, mapping: %zu, graph: %zu bytes)", + reordered_header_size, + augmented_header_size, + mapping_header_size, + graph_header_size); + } + + auto partition_start = std::chrono::high_resolution_clock::now(); + auto partition_labels = raft::make_host_matrix(dataset_size, 2); + auto partition_histogram = raft::make_host_matrix(n_partitions, 2); + for (size_t c = 0; c < n_partitions; c++) { + partition_histogram(c, 0) = 0; + partition_histogram(c, 1) = 0; + } + + // Determine minimum partition size for stable KNN graph construction + size_t min_partition_size = std::max(1000ULL, dataset_size / n_partitions * 0.1); + + ace_get_partition_labels( + res, dataset, partition_labels.view(), partition_histogram.view(), min_partition_size); + + ace_check_partition_sizes(dataset_size, + n_partitions, + partition_labels.view(), + partition_histogram.view(), + min_partition_size); + + auto partition_end = std::chrono::high_resolution_clock::now(); + auto partition_elapsed = + std::chrono::duration_cast(partition_end - partition_start) + .count(); + RAFT_LOG_INFO( + "ACE: Partition labeling completed in %ld ms (min_partition_size: " + "%lu)", + partition_elapsed, + min_partition_size); + + // Create vector lists for each partition + auto vectorlist_start = std::chrono::high_resolution_clock::now(); + auto core_forward_mapping = use_disk_mode ? 
raft::make_host_vector(dataset_size) + : raft::make_host_vector(0); + auto core_backward_mapping = raft::make_host_vector(dataset_size); + auto augmented_backward_mapping = raft::make_host_vector(dataset_size); + auto core_partition_offsets = raft::make_host_vector(n_partitions + 1); + auto augmented_partition_offsets = raft::make_host_vector(n_partitions + 1); + + ace_create_forward_and_backward_lists(dataset_size, + n_partitions, + partition_labels.view(), + partition_histogram.view(), + core_forward_mapping.view(), + core_backward_mapping.view(), + augmented_backward_mapping.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view()); + + auto vectorlist_end = std::chrono::high_resolution_clock::now(); + auto vectorlist_elapsed = + std::chrono::duration_cast(vectorlist_end - vectorlist_start) + .count(); + RAFT_LOG_INFO("ACE: Vector list creation completed in %ld ms", vectorlist_elapsed); + + // Reorder the dataset based on partitions and store to disk. Uses write buffers to improve + // performance. + if (use_disk_mode) { + ace_reorder_and_store_dataset(res, + build_dir, + dataset, + partition_labels.view(), + partition_histogram.view(), + core_backward_mapping.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view(), + reordered_fd, + augmented_fd, + mapping_fd, + reordered_header_size, + augmented_header_size, + mapping_header_size); + // core_backward_mapping is not needed anymore. + core_backward_mapping = raft::make_host_vector(0); + } + + // Placeholder search graph for in-memory version + auto search_graph = use_disk_mode + ? raft::make_host_matrix(0, 0) + : raft::make_host_matrix(dataset_size, graph_degree); + + // Process each partition + auto partition_processing_start = std::chrono::high_resolution_clock::now(); + for (size_t partition_id = 0; partition_id < n_partitions; partition_id++) { + RAFT_LOG_DEBUG("ACE: Processing partition %lu/%lu", partition_id + 1, n_partitions); + auto start = std::chrono::high_resolution_clock::now(); + + // Extract vectors for this partition + size_t core_sub_dataset_size = partition_histogram(partition_id, 0); + size_t augmented_sub_dataset_size = partition_histogram(partition_id, 1); + size_t sub_dataset_size = core_sub_dataset_size + augmented_sub_dataset_size; + + if (sub_dataset_size == 0) { + RAFT_LOG_WARN("ACE: Skipping empty partition %lu", partition_id); + continue; + } + RAFT_LOG_DEBUG("ACE: Sub-dataset size: %lu (%lu + %lu)", + sub_dataset_size, + core_sub_dataset_size, + augmented_sub_dataset_size); + + auto sub_dataset = raft::make_host_matrix(sub_dataset_size, dataset_dim); + + if (use_disk_mode) { + // Load partition dataset from disk files + ace_load_partition_dataset_from_disk(res, + build_dir, + partition_id, + dataset_dim, + partition_histogram.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view(), + sub_dataset.view()); + } else { + // Gather partition dataset from memory + ace_gather_partition_dataset(core_sub_dataset_size, + augmented_sub_dataset_size, + dataset_dim, + partition_id, + dataset, + core_backward_mapping.view(), + augmented_backward_mapping.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view(), + sub_dataset.view()); + } + auto read_end = std::chrono::high_resolution_clock::now(); + auto read_elapsed = + std::chrono::duration_cast(read_end - start).count(); + + // Create index for this partition + cuvs::neighbors::cagra::index_params sub_index_params; + sub_index_params = 
cuvs::neighbors::cagra::index_params::from_hnsw_params( + raft::make_extents(sub_dataset_size, dataset_dim), + graph_degree / 2, + ef_construction, + cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT, + params.metric); + sub_index_params.attach_dataset_on_build = false; + sub_index_params.guarantee_connectivity = params.guarantee_connectivity; + + auto sub_index = cuvs::neighbors::cagra::build( + res, sub_index_params, raft::make_const_mdspan(sub_dataset.view())); + + auto optimize_end = std::chrono::high_resolution_clock::now(); + auto optimize_elapsed = + std::chrono::duration_cast(optimize_end - read_end).count(); + + // Copy graph edges for core members of this partition + auto sub_search_graph = + raft::make_host_matrix(core_sub_dataset_size, graph_degree); + cudaStream_t stream = raft::resource::get_cuda_stream(res); + raft::update_host(sub_search_graph.data_handle(), + sub_index.graph().data_handle(), + sub_search_graph.size(), + stream); + raft::resource::sync_stream(res, stream); + + if (use_disk_mode) { + // Adjust IDs in sub_search_graph in place for disk storage + ace_adjust_sub_graph_ids_disk(core_sub_dataset_size, + augmented_sub_dataset_size, + graph_degree, + partition_id, + sub_search_graph.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view(), + augmented_backward_mapping.view(), + core_forward_mapping.view()); + } else { + // Adjust IDs in sub_search_graph and save to search_graph + ace_adjust_sub_graph_ids(core_sub_dataset_size, + augmented_sub_dataset_size, + graph_degree, + partition_id, + sub_search_graph.view(), + search_graph.view(), + core_partition_offsets.view(), + augmented_partition_offsets.view(), + core_backward_mapping.view(), + augmented_backward_mapping.view()); + } + + auto adjust_end = std::chrono::high_resolution_clock::now(); + auto adjust_elapsed = + std::chrono::duration_cast(adjust_end - optimize_end).count(); + + if (use_disk_mode) { + const size_t graph_offset = + static_cast(core_partition_offsets(partition_id)) * graph_degree * sizeof(IdxT) + + graph_header_size; + const size_t graph_bytes = core_sub_dataset_size * graph_degree * sizeof(IdxT); + cuvs::util::write_large_file( + graph_fd, sub_search_graph.data_handle(), graph_bytes, graph_offset); + } + + auto end = std::chrono::high_resolution_clock::now(); + auto write_elapsed = + std::chrono::duration_cast(end - adjust_end).count(); + auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + double read_throughput = read_elapsed > 0 ? sub_dataset_size * dataset_dim * sizeof(T) / + (1024.0 * 1024.0) / (read_elapsed / 1000.0) + : 0.0; + double write_throughput = write_elapsed > 0 + ? 
core_sub_dataset_size * dataset_dim * sizeof(T) / + (1024.0 * 1024.0) / (write_elapsed / 1000.0) + : 0.0; + RAFT_LOG_INFO( + "ACE: Partition %4lu (%8lu + %8lu) completed in %6ld ms: read %6ld ms (%7.1f MiB/s), " + "optimize %6ld ms, adjust %6ld ms, write %6ld ms (%7.1f MiB/s)", + partition_id, + core_sub_dataset_size, + augmented_sub_dataset_size, + elapsed_ms, + read_elapsed, + read_throughput, + optimize_elapsed, + adjust_elapsed, + write_elapsed, + write_throughput); + } + + auto partition_processing_end = std::chrono::high_resolution_clock::now(); + auto partition_processing_elapsed = std::chrono::duration_cast( + partition_processing_end - partition_processing_start) + .count(); + RAFT_LOG_INFO("ACE: All partition processing completed in %ld ms (%zu partitions)", + partition_processing_elapsed, + n_partitions); + + // Clean up augmented dataset file to save disk space (no longer needed after partitions + // processed) + if (use_disk_mode) { + const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; + if (std::filesystem::exists(augmented_dataset_path)) { + std::filesystem::remove(augmented_dataset_path); + RAFT_LOG_INFO("ACE: Removed augmented dataset file to save disk space"); + } + } + + auto index_creation_start = std::chrono::high_resolution_clock::now(); + index idx(res, params.metric); + // Only add graph and dataset if not using disk storage. The returned index is empty if using + // disk storage. Use the files written to disk for search. + if (!use_disk_mode) { + idx.update_graph(res, raft::make_const_mdspan(search_graph.view())); + + if (params.attach_dataset_on_build) { + try { + idx.update_dataset(res, dataset); + } catch (std::bad_alloc& e) { + RAFT_LOG_WARN( + "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be " + "stored."); + } catch (raft::logic_error& e) { + RAFT_LOG_WARN( + "Insufficient GPU memory to attach dataset to ACE index. 
Only the graph will be " + "stored."); + } + } + } else { + idx.update_dataset(res, std::move(reordered_fd)); + idx.update_graph(res, std::move(graph_fd)); + idx.update_mapping(res, std::move(mapping_fd)); + + RAFT_LOG_INFO( + "ACE: Set disk storage at %s (dataset shape [%zu, %zu], graph shape [%zu, %zu])", + build_dir.c_str(), + idx.size(), + idx.dim(), + idx.size(), + idx.graph_degree()); + } + + auto index_creation_end = std::chrono::high_resolution_clock::now(); + auto index_creation_elapsed = std::chrono::duration_cast( + index_creation_end - index_creation_start) + .count(); + RAFT_LOG_INFO("ACE: Final index creation completed in %ld ms", index_creation_elapsed); + + auto total_end = std::chrono::high_resolution_clock::now(); + auto total_elapsed = + std::chrono::duration_cast(total_end - total_start).count(); + RAFT_LOG_INFO("ACE: Partitioned CAGRA build completed in %ld ms total", total_elapsed); + + return idx; + } catch (const std::exception& e) { + // Clean up build directory on failure if we created it + RAFT_LOG_ERROR("ACE: Build failed with exception: %s", e.what()); + if (cleanup_on_failure && !build_dir.empty()) { + RAFT_LOG_INFO("ACE: Cleaning up build directory: %s", build_dir.c_str()); + try { + std::filesystem::remove_all(build_dir); + RAFT_LOG_INFO("ACE: Successfully removed build directory"); + } catch (const std::exception& cleanup_error) { + RAFT_LOG_WARN("ACE: Failed to clean up build directory: %s", cleanup_error.what()); + } + } + // Re-throw the original exception + throw; + } +} + template void write_to_graph(raft::host_matrix_view knn_graph, raft::host_matrix_view neighbors_host_view, @@ -536,26 +1888,23 @@ auto iterative_build_graph( // Determine graph degree and number of search results while increasing // graph size. - auto small_graph_degree = std::max(graph_degree / 2, std::min(graph_degree, (uint64_t)32)); - auto small_topk = topk * small_graph_degree / graph_degree; - RAFT_LOG_DEBUG("# graph_degree = %lu", (uint64_t)graph_degree); + auto small_graph_degree = std::max(graph_degree / 2, std::min(graph_degree, (uint64_t)24)); RAFT_LOG_DEBUG("# small_graph_degree = %lu", (uint64_t)small_graph_degree); + RAFT_LOG_DEBUG("# graph_degree = %lu", (uint64_t)graph_degree); RAFT_LOG_DEBUG("# topk = %lu", (uint64_t)topk); - RAFT_LOG_DEBUG("# small_topk = %lu", (uint64_t)small_topk); // Create an initial graph. The initial graph created here is not suitable for // searching, but connectivity is guaranteed. 
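// Worked example of the revised offset schedule below (values assumed for illustration:
// initial_graph_size = 1025, small_graph_degree = 8). The new
//   ofst = pow((initial_graph_size - 1) / 2.0, (j + 1) / (double)small_graph_degree)
// spaces the initial neighbors geometrically between ~2 and (initial_graph_size - 1) / 2,
// giving offsets of roughly 2, 4, 10, 22, 49, 107, 234, 512 once the offset(j - 1) + 1
// lower bound is applied, replacing the old sqrt(2)-based progression that was anchored
// at initial_graph_size.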
- auto offset = raft::make_host_vector(small_graph_degree); - const double base = sqrt((double)2.0); + auto offset = raft::make_host_vector(small_graph_degree); for (uint64_t j = 0; j < small_graph_degree; j++) { if (j == 0) { offset(j) = 1; } else { offset(j) = offset(j - 1) + 1; } - IdxT ofst = initial_graph_size * pow(base, (double)j - small_graph_degree - 1); + IdxT ofst = pow((double)(initial_graph_size - 1) / 2, (double)(j + 1) / small_graph_degree); if (offset(j) < ofst) { offset(j) = ofst; } - RAFT_LOG_DEBUG("# offset(%lu) = %lu\n", (uint64_t)j, (uint64_t)offset(j)); + RAFT_LOG_DEBUG("# offset(%lu) = %lu", (uint64_t)j, (uint64_t)offset(j)); } cagra_graph = raft::make_host_matrix(initial_graph_size, small_graph_degree); for (uint64_t i = 0; i < initial_graph_size; i++) { @@ -572,22 +1921,34 @@ auto iterative_build_graph( IdxT* neighbors_ptr = (IdxT*)neighbors_list.data(); memset(neighbors_ptr, 0, byte_size); + bool flag_last = false; auto curr_graph_size = initial_graph_size; while (true) { - RAFT_LOG_DEBUG("# graph_size = %lu (%.3lf)", - (uint64_t)curr_graph_size, - (double)curr_graph_size / final_graph_size); - - auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size); - auto curr_topk = small_topk; - auto curr_itopk_size = small_topk * 3 / 2; - auto curr_graph_degree = small_graph_degree; - if (curr_query_size == final_graph_size) { - curr_topk = topk; - curr_itopk_size = topk * 2; - curr_graph_degree = graph_degree; + auto start = std::chrono::high_resolution_clock::now(); + auto curr_query_size = std::min(2 * curr_graph_size, final_graph_size); + + auto next_graph_degree = small_graph_degree; + if (curr_graph_size == final_graph_size) { next_graph_degree = graph_degree; } + + // The search count (topk) is set to the next graph degree + 1, because + // pruning is not used except in the last iteration. + // (*) The appropriate setting for itopk_size requires careful consideration. + auto curr_topk = next_graph_degree + 1; + auto curr_itopk_size = next_graph_degree + 32; + if (flag_last) { + curr_topk = topk; + curr_itopk_size = curr_topk + 32; } + RAFT_LOG_INFO( + "# graph_size = %lu (%.3lf), graph_degree = %lu, query_size = %lu, itopk = %lu, topk = %lu", + (uint64_t)cagra_graph.extent(0), + (double)cagra_graph.extent(0) / final_graph_size, + (uint64_t)cagra_graph.extent(1), + (uint64_t)curr_query_size, + (uint64_t)curr_itopk_size, + (uint64_t)curr_topk); + cuvs::neighbors::cagra::search_params search_params; search_params.algo = cuvs::neighbors::cagra::search_algo::AUTO; search_params.max_queries = max_chunk_size; @@ -640,13 +2001,19 @@ auto iterative_build_graph( } // Optimize graph - bool flag_last = (curr_graph_size == final_graph_size); - curr_graph_size = curr_query_size; - cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp - cagra_graph = raft::make_host_matrix(curr_graph_size, curr_graph_degree); + auto next_graph_size = curr_query_size; + cagra_graph = raft::make_host_matrix(0, 0); // delete existing grahp + cagra_graph = raft::make_host_matrix(next_graph_size, next_graph_degree); optimize( res, neighbors_view, cagra_graph.view(), flag_last ? 
params.guarantee_connectivity : 0); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + RAFT_LOG_INFO("# elapsed time: %.3lf sec", (double)elapsed_ms / 1000); + if (flag_last) { break; } + flag_last = (curr_graph_size == final_graph_size); + curr_graph_size = next_graph_size; } return cagra_graph; @@ -670,20 +2037,7 @@ index build( : "device", intermediate_degree, graph_degree); - if (intermediate_degree >= static_cast(dataset.extent(0))) { - RAFT_LOG_WARN( - "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu", - dataset.extent(0)); - intermediate_degree = dataset.extent(0) - 1; - } - if (intermediate_degree < graph_degree) { - RAFT_LOG_WARN( - "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " - "graph_degree.", - graph_degree, - intermediate_degree); - graph_degree = intermediate_degree; - } + check_graph_degree(intermediate_degree, graph_degree, dataset.extent(0)); // Set default value in case knn_build_params is not defined. auto knn_build_params = params.graph_build_params; diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 26e0aafd2d..45328377be 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -142,6 +142,11 @@ void search_main(raft::resources const& res, raft::device_matrix_view distances, CagraSampleFilterT sample_filter = CagraSampleFilterT()) { + RAFT_EXPECTS(!index.dataset_fd().has_value(), + "Cannot search a CAGRA index that is stored on disk. " + "Use cuvs::neighbors::hnsw::from_cagra() to convert the index and " + "cuvs::neighbors::hnsw::deserialize() to load it into memory before searching."); + // n_rows has the same type as the dataset index (the array extents type) using ds_idx_type = decltype(index.data().n_rows()); using graph_idx_type = uint32_t; diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index 20984e3e45..866415b1e4 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -44,6 +44,11 @@ void serialize(raft::resources const& res, { raft::common::nvtx::range fun_scope("cagra::serialize"); + RAFT_EXPECTS(!index_.dataset_fd().has_value(), + "Cannot serialize a disk-backed CAGRA index. Convert it with " + "cuvs::neighbors::hnsw::from_cagra() and load it into memory via " + "cuvs::neighbors::hnsw::deserialize() before serialization."); + RAFT_LOG_DEBUG( "Saving CAGRA index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -80,6 +85,10 @@ void serialize(raft::resources const& res, const index& index_, bool include_dataset) { + RAFT_EXPECTS(!index_.dataset_fd().has_value(), + "Cannot serialize a disk-backed CAGRA index. 
Convert it with " + "cuvs::neighbors::hnsw::from_cagra() and load it into memory via " + "cuvs::neighbors::hnsw::deserialize() before serialization."); std::ofstream of(filename, std::ios::out | std::ios::binary); if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index a746cf473a..f8091c9e51 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -169,16 +169,17 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g uint64_t* const num_retain = stats; uint64_t* const num_full = stats + 1; - const uint64_t nid = blockIdx.x + (batch_size * batch_id); - if (nid >= graph_size) { return; } + const uint64_t iA = blockIdx.x + (batch_size * batch_id); + if (iA >= graph_size) { return; } for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) { smem_num_detour[k] = 0; + if (knn_graph[k + ((uint64_t)graph_degree * iA)] == iA) { + // Lower the priority of self-edge + smem_num_detour[k] = graph_degree; + } } __syncthreads(); - const uint64_t iA = nid; - if (iA >= graph_size) { return; } - // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; @@ -1410,7 +1411,7 @@ void optimize( "overflows occur during the norm computation between the dataset vectors."); const double time_prune_end = cur_time(); - RAFT_LOG_DEBUG("# Pruning time: %.1lf sec", time_prune_end - time_prune_start); + RAFT_LOG_DEBUG("# Pruning time: %.1lf ms", (time_prune_end - time_prune_start) * 1000.0); } auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); @@ -1480,7 +1481,8 @@ void optimize( raft::resource::get_cuda_stream(res)); const double time_make_end = cur_time(); - RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf sec", time_make_end - time_make_start); + RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf ms", + (time_make_end - time_make_start) * 1000.0); } { @@ -1567,7 +1569,8 @@ void optimize( "many MST optimization edges."); const double time_replace_end = cur_time(); - RAFT_LOG_DEBUG("# Replacing edges time: %.1lf sec", time_replace_end - time_replace_start); + RAFT_LOG_DEBUG("# Replacing edges time: %.1lf ms", + (time_replace_end - time_replace_start) * 1000.0); /* stats */ uint64_t num_replaced_edges = 0; diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 2ce9e0dda8..186216a4da 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -9,18 +9,25 @@ #include "../../core/omp_wrapper.hpp" #include +#include #include +#include +#include #include #include #include #include +#include #include +#include #include #include +#include #include +#include namespace cuvs::neighbors::hnsw::detail { @@ -107,9 +114,32 @@ struct index_impl : index { return space_.get(); } + /** + @brief Set file descriptor for disk-backed index + */ + void set_file_descriptor(cuvs::util::file_descriptor&& fd) { hnsw_fd_.emplace(std::move(fd)); } + + /** + @brief Get file descriptor + */ + auto file_descriptor() const -> const std::optional& + { + return hnsw_fd_; + } + + /** + @brief Get file path for disk-backed index + */ + std::string file_path() const override + { + if (hnsw_fd_.has_value() && hnsw_fd_->is_valid()) { return hnsw_fd_->get_path(); } + return ""; + } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; + 
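+  // File descriptor of the serialized hnswlib index when this index is disk-backed;
+  // left empty for regular in-memory indices.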
std::optional hnsw_fd_; }; template @@ -179,7 +209,7 @@ std::enable_if_t>> fro auto appr_algo = std::make_unique::type>>( hnsw_index->get_space(), host_dataset_view.extent(0), - cagra_index.graph().extent(1) / 2, + (cagra_index.graph().extent(1) + 1) / 2, params.ef_construction); appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only [[maybe_unused]] auto num_threads = @@ -257,12 +287,499 @@ void all_neighbors_graph(raft::resources const& res, raft::host_matrix_view neighbors, cuvs::distance::DistanceType metric) { - nn_descent::index_params nn_params; - nn_params.graph_degree = neighbors.extent(1); - nn_params.intermediate_graph_degree = neighbors.extent(1) * 2; - nn_params.metric = metric; - nn_params.return_distances = false; - auto nn_index = nn_descent::build(res, nn_params, dataset, neighbors); + // FIXME: choose better heuristic + bool use_nn_decent = neighbors.size() < 1e7; + if (use_nn_decent) { + nn_descent::index_params nn_params; + nn_params.graph_degree = neighbors.extent(1); + nn_params.intermediate_graph_degree = neighbors.extent(1) * 2; + nn_params.metric = metric; + nn_params.return_distances = false; + auto nn_index = nn_descent::build(res, nn_params, dataset, neighbors); + } else { + // TODO: choose parameters to minimize memory consumption + cagra::graph_build_params::ivf_pq_params ivfpq_params(dataset.extents(), metric); + cagra::build_knn_graph(res, dataset, neighbors, ivfpq_params); + } +} + +template +void serialize_to_hnswlib_from_disk(raft::resources const& res, + std::ostream& os_raw, + const cuvs::neighbors::hnsw::index_params& params, + const cuvs::neighbors::cagra::index& index_) +{ + raft::common::nvtx::range fun_scope("cagra::serialize"); + + auto start_time = std::chrono::system_clock::now(); + + cuvs::util::buffered_ofstream os(&os_raw, 1 << 20 /*1MB*/); + + RAFT_EXPECTS(index_.dataset_fd().has_value() && index_.graph_fd().has_value(), + "Function only implements serialization from disk."); + RAFT_EXPECTS(params.hierarchy != HnswHierarchy::CPU, + "Disk2disk serialization not supported for CPU hierarchy."); + + auto n_rows = index_.size(); + auto dim = index_.dim(); + auto graph_degree_int = static_cast(index_.graph_degree()); + RAFT_LOG_INFO("Saving CAGRA index to hnswlib format, size %zu, dim %zu, graph_degree %zu", + static_cast(n_rows), + static_cast(dim), + static_cast(graph_degree_int)); + + // Get file descriptors from index + const auto& graph_fd_opt = index_.graph_fd(); + const auto& dataset_fd_opt = index_.dataset_fd(); + const auto& mapping_fd_opt = index_.mapping_fd(); + + RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(), + "Graph file descriptor is not available"); + RAFT_EXPECTS(dataset_fd_opt.has_value() && dataset_fd_opt->is_valid(), + "Dataset file descriptor is not available"); + RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(), + "Mapping file descriptor is not available"); + + // Get file paths from file descriptors + std::string graph_path = graph_fd_opt->get_path(); + std::string dataset_path = dataset_fd_opt->get_path(); + std::string mapping_path = mapping_fd_opt->get_path(); + + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + RAFT_EXPECTS(!dataset_path.empty(), "Unable to get path from dataset file descriptor"); + RAFT_EXPECTS(!mapping_path.empty(), "Unable to get path from mapping file descriptor"); + + int graph_fd = graph_fd_opt->get(); + int dataset_fd = dataset_fd_opt->get(); + int label_fd = mapping_fd_opt->get(); + + // Read 
headers from files to get dimensions + size_t graph_header_size = 0; + size_t graph_n_rows = 0; + size_t graph_n_cols = 0; + { + std::ifstream graph_stream(graph_path, std::ios::binary); + RAFT_EXPECTS(graph_stream.good(), "Failed to open graph file: %s", graph_path.c_str()); + + auto header = raft::detail::numpy_serializer::read_header(graph_stream); + graph_header_size = static_cast(graph_stream.tellg()); + RAFT_EXPECTS( + header.shape.size() == 2, "Graph file should be 2D, got %zu dimensions", header.shape.size()); + + graph_n_rows = header.shape[0]; + graph_n_cols = header.shape[1]; + RAFT_LOG_DEBUG("Graph file: %zu x %zu, header size: %zu bytes", + graph_n_rows, + graph_n_cols, + graph_header_size); + } + + size_t dataset_header_size = 0; + size_t dataset_n_rows = 0; + size_t dataset_n_cols = 0; + { + std::ifstream dataset_stream(dataset_path, std::ios::binary); + RAFT_EXPECTS(dataset_stream.good(), "Failed to open dataset file: %s", dataset_path.c_str()); + + auto header = raft::detail::numpy_serializer::read_header(dataset_stream); + dataset_header_size = static_cast(dataset_stream.tellg()); + RAFT_EXPECTS(header.shape.size() == 2, + "Dataset file should be 2D, got %zu dimensions", + header.shape.size()); + + dataset_n_rows = header.shape[0]; + dataset_n_cols = header.shape[1]; + RAFT_LOG_DEBUG("Dataset file: %zu x %zu, header size: %zu bytes", + dataset_n_rows, + dataset_n_cols, + dataset_header_size); + } + + size_t label_header_size = 0; + size_t label_n_elements = 0; + { + std::ifstream mapping_stream(mapping_path, std::ios::binary); + RAFT_EXPECTS(mapping_stream.good(), "Failed to open mapping file: %s", mapping_path.c_str()); + + auto header = raft::detail::numpy_serializer::read_header(mapping_stream); + label_header_size = static_cast(mapping_stream.tellg()); + RAFT_EXPECTS(header.shape.size() == 1, + "Mapping file should be 1D, got %zu dimensions", + header.shape.size()); + + label_n_elements = header.shape[0]; + RAFT_LOG_DEBUG( + "Mapping file: %zu elements, header size: %zu bytes", label_n_elements, label_header_size); + } + + // Verify consistency + RAFT_EXPECTS(graph_n_rows == static_cast(n_rows), + "Graph rows (%zu) != index size (%zu)", + graph_n_rows, + static_cast(n_rows)); + RAFT_EXPECTS(dataset_n_rows == static_cast(n_rows), + "Dataset rows (%zu) != index size (%zu)", + dataset_n_rows, + static_cast(n_rows)); + RAFT_EXPECTS(label_n_elements == static_cast(n_rows), + "Label elements (%zu) != index size (%zu)", + label_n_elements, + static_cast(n_rows)); + RAFT_EXPECTS(graph_n_cols == static_cast(graph_degree_int), + "Graph cols (%zu) != graph degree (%d)", + graph_n_cols, + graph_degree_int); + RAFT_EXPECTS(dataset_n_cols == static_cast(dim), + "Dataset cols (%zu) != dimensions (%zu)", + dataset_n_cols, + static_cast(dim)); + + const size_t row_size_bytes = + graph_degree_int * sizeof(IdxT) + dim * sizeof(T) + sizeof(uint32_t); + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t batch_size = std::max(1, target_batch_bytes / row_size_bytes); + + RAFT_LOG_DEBUG("Using batch size %zu rows (~%.2f MiB/batch)", + batch_size, + (batch_size * row_size_bytes) / (1024.0 * 1024.0)); + + // Allocate buffers for batched reading + auto graph_buffer = raft::make_host_matrix(batch_size, graph_degree_int); + auto dataset_buffer = raft::make_host_matrix(batch_size, dim); + auto label_buffer = raft::make_host_vector(batch_size); + + RAFT_LOG_DEBUG("Allocated buffers: graph[%ld,%d], dataset[%ld,%ld], labels[%ld]", + graph_buffer.extent(0), + graph_degree_int, + 
dataset_buffer.extent(0), + dataset_buffer.extent(1), + label_buffer.extent(0)); + + // initialize dummy HNSW index to retrieve constants + auto hnsw_index = std::make_unique>(dim, index_.metric(), params.hierarchy); + + int odd_graph_degree = graph_degree_int % 2; + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); + + bool create_hierarchy = params.hierarchy != HnswHierarchy::NONE; + + // create hierarchy order + // sort the points by levels + // roll dice & build histogram + std::vector hist; + std::vector order(n_rows); + std::vector order_bw(n_rows); + std::vector levels(n_rows); + std::vector offsets; + + if (create_hierarchy) { + RAFT_LOG_INFO("Sort points by levels"); + for (int64_t i = 0; i < n_rows; i++) { + auto pt_level = appr_algo->getRandomLevel(appr_algo->mult_); + while (pt_level >= static_cast(hist.size())) + hist.push_back(0); + hist[pt_level]++; + levels[i] = pt_level; + } + + // accumulate + offsets.resize(hist.size() + 1, 0); + for (size_t i = 0; i < hist.size() - 1; i++) { + offsets[i + 1] = offsets[i] + hist[i]; + RAFT_LOG_INFO("Level %zu : %zu", i + 1, size_t(n_rows) - offsets[i + 1]); + } + + // fw/bw indices + for (int64_t i = 0; i < n_rows; i++) { + auto pt_level = levels[i]; + order_bw[i] = offsets[pt_level]; + order[offsets[pt_level]++] = i; + } + } + + // set last point of the highest level as the entry point + appr_algo->enterpoint_node_ = create_hierarchy ? order.back() : n_rows / 2; + appr_algo->maxlevel_ = create_hierarchy ? hist.size() - 1 : 1; + + // write header information + RAFT_LOG_DEBUG("Writing HNSW header: offsetLevel0=%zu, n_rows=%zu, size_data_per_element=%zu", + appr_algo->offsetLevel0_, + static_cast(n_rows), + appr_algo->size_data_per_element_); + RAFT_LOG_DEBUG(" maxlevel=%d, enterpoint=%d, maxM=%zu, maxM0=%zu, M=%zu", + appr_algo->maxlevel_, + appr_algo->enterpoint_node_, + appr_algo->maxM_, + appr_algo->maxM0_, + appr_algo->M_); + + // offset_level_0 + os.write(reinterpret_cast(&appr_algo->offsetLevel0_), sizeof(std::size_t)); + // 8 max_element - override with n_rows + size_t num_elements = (size_t)n_rows; + os.write(reinterpret_cast(&num_elements), sizeof(std::size_t)); + // 16 curr_element_count - override with n_rows + os.write(reinterpret_cast(&num_elements), sizeof(std::size_t)); + // 24 size_data_per_element + os.write(reinterpret_cast(&appr_algo->size_data_per_element_), sizeof(std::size_t)); + // 32 label_offset + os.write(reinterpret_cast(&appr_algo->label_offset_), sizeof(std::size_t)); + // 40 offset_data + os.write(reinterpret_cast(&appr_algo->offsetData_), sizeof(std::size_t)); + // 48 maxlevel + os.write(reinterpret_cast(&appr_algo->maxlevel_), sizeof(int)); + // 52 enterpoint_node + os.write(reinterpret_cast(&appr_algo->enterpoint_node_), sizeof(int)); + // 56 maxM + os.write(reinterpret_cast(&appr_algo->maxM_), sizeof(std::size_t)); + // 64 maxM0 + os.write(reinterpret_cast(&appr_algo->maxM0_), sizeof(std::size_t)); + // 72 M + os.write(reinterpret_cast(&appr_algo->M_), sizeof(std::size_t)); + // 80 mult + os.write(reinterpret_cast(&appr_algo->mult_), sizeof(double)); + // 88 ef_construction + os.write(reinterpret_cast(&appr_algo->ef_construction_), sizeof(std::size_t)); + + // host queries + auto host_query_set = + raft::make_host_matrix(create_hierarchy ? n_rows - hist[0] : 0, dim); + + int64_t d_report_offset = n_rows / 10; // Report progress in 10% steps. 
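+  // Each level-0 element written by the loop below follows hnswlib's on-disk layout:
+  // an int neighbor count, maxM0_ neighbor ids of type IdxT (zero-padded when the graph
+  // degree is odd), the dim components of type T, and a size_t label. The
+  // size_data_per_element_ checks below assert exactly this layout.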
+ int64_t next_report_offset = d_report_offset; + auto start_clock = std::chrono::system_clock::now(); + + RAFT_EXPECTS(appr_algo->size_data_per_element_ == + dim * sizeof(T) + appr_algo->maxM0_ * sizeof(IdxT) + sizeof(int) + sizeof(size_t), + "Size data per element mismatch"); + + RAFT_LOG_INFO("Writing base level"); + size_t bytes_written = 0; + float GiB = 1 << 30; + IdxT zero = 0; + RAFT_EXPECTS(appr_algo->size_data_per_element_ == + dim * sizeof(T) + appr_algo->maxM0_ * sizeof(IdxT) + sizeof(int) + sizeof(size_t), + "Size data per element mismatch"); + + // Helper lambda for parallel reading of batches + auto read_batch = [&](int64_t start_row, int64_t rows_to_read) { + const size_t graph_bytes = rows_to_read * graph_degree_int * sizeof(IdxT); + const size_t dataset_bytes = rows_to_read * dim * sizeof(T); + const size_t label_bytes = rows_to_read * sizeof(uint32_t); + + const off_t graph_offset = graph_header_size + start_row * graph_degree_int * sizeof(IdxT); + const off_t dataset_offset = dataset_header_size + start_row * dim * sizeof(T); + const off_t label_offset = label_header_size + start_row * sizeof(uint32_t); + + RAFT_LOG_DEBUG("Reading batch: row=%ld, rows=%ld", start_row, rows_to_read); + RAFT_LOG_DEBUG( + " graph: offset=%zu, bytes=%zu", static_cast(graph_offset), graph_bytes); + RAFT_LOG_DEBUG( + " dataset: offset=%zu, bytes=%zu", static_cast(dataset_offset), dataset_bytes); + RAFT_LOG_DEBUG( + " label: offset=%zu, bytes=%zu", static_cast(label_offset), label_bytes); + +#pragma omp parallel sections num_threads(3) + { +#pragma omp section + { + ssize_t bytes_read = pread(graph_fd, graph_buffer.data_handle(), graph_bytes, graph_offset); + RAFT_EXPECTS(bytes_read == static_cast(graph_bytes), + "Failed to read graph data: expected %zu, got %zd", + graph_bytes, + bytes_read); + } +#pragma omp section + { + ssize_t bytes_read = + pread(dataset_fd, dataset_buffer.data_handle(), dataset_bytes, dataset_offset); + RAFT_EXPECTS(bytes_read == static_cast(dataset_bytes), + "Failed to read dataset data: expected %zu, got %zd", + dataset_bytes, + bytes_read); + } +#pragma omp section + { + ssize_t bytes_read = pread(label_fd, label_buffer.data_handle(), label_bytes, label_offset); + RAFT_EXPECTS(bytes_read == static_cast(label_bytes), + "Failed to read label data: expected %zu, got %zd", + label_bytes, + bytes_read); + } + } + + // Log first few values from first batch for debugging + if (start_row == 0 && rows_to_read > 0) { + RAFT_LOG_DEBUG("First graph row: [%u, %u, %u, ...]", + static_cast(graph_buffer(0, 0)), + graph_degree_int > 1 ? static_cast(graph_buffer(0, 1)) : 0, + graph_degree_int > 2 ? static_cast(graph_buffer(0, 2)) : 0); + RAFT_LOG_DEBUG("First dataset row: [%f, %f, %f, ...]", + static_cast(dataset_buffer(0, 0)), + dim > 1 ? static_cast(dataset_buffer(0, 1)) : 0.0f, + dim > 2 ? static_cast(dataset_buffer(0, 2)) : 0.0f); + RAFT_LOG_DEBUG("First labels: [%u, %u, %u, ...]", + static_cast(label_buffer(0)), + rows_to_read > 1 ? static_cast(label_buffer(1)) : 0, + rows_to_read > 2 ? 
static_cast(label_buffer(2)) : 0); + } + }; + + for (int64_t batch_start = 0; batch_start < n_rows; batch_start += batch_size) { + const int64_t current_batch_size = std::min(batch_size, n_rows - batch_start); + + RAFT_LOG_DEBUG("Reading batch: start=%ld, size=%ld (batch_size=%zu)", + batch_start, + current_batch_size, + batch_size); + read_batch(batch_start, current_batch_size); + + for (int64_t batch_idx = 0; batch_idx < current_batch_size; batch_idx++) { + const int64_t i = batch_start + batch_idx; + + os.write(reinterpret_cast(&graph_degree_int), sizeof(int)); + + const IdxT* graph_row = &graph_buffer(batch_idx, 0); + os.write(reinterpret_cast(graph_row), sizeof(IdxT) * graph_degree_int); + + if (odd_graph_degree) { + RAFT_EXPECTS(odd_graph_degree == static_cast(appr_algo->maxM0_) - graph_degree_int, + "Odd graph degree mismatch"); + os.write(reinterpret_cast(&zero), sizeof(IdxT)); + } + + const T* data_row = &dataset_buffer(batch_idx, 0); + os.write(reinterpret_cast(data_row), sizeof(T) * dim); + + if (create_hierarchy && levels[i] > 0) { + // position in query: order_bw[i]-hist[0] + std::copy(data_row, + data_row + dim, + reinterpret_cast(&host_query_set(order_bw[i] - hist[0], 0))); + } + + // assign original label + auto label = static_cast(label_buffer(batch_idx)); + os.write(reinterpret_cast(&label), sizeof(std::size_t)); + + bytes_written += appr_algo->size_data_per_element_; + + const auto end_clock = std::chrono::system_clock::now(); + // if (!os.good()) { RAFT_FAIL("Error writing HNSW file, row %zu", i); } + if (i > next_report_offset) { + next_report_offset += d_report_offset; + const auto time = + std::chrono::duration_cast(end_clock - start_clock).count() * + 1e-6; + float throughput = bytes_written / GiB / time; + float rows_throughput = i / time; + float ETA = (n_rows - i) / rows_throughput; + RAFT_LOG_INFO( + "# Writing rows %12lu / %12lu (%3.2f %%), %3.2f GiB/sec, ETA %d:%3.1f, written %3.2f " + "GiB\r", + i, + n_rows, + i / static_cast(n_rows) * 100, + throughput, + int(ETA / 60), + std::fmod(ETA, 60.0f), + bytes_written / GiB); + } + } + } + + RAFT_LOG_DEBUG("Completed writing %ld base level rows", n_rows); + + // trigger knn builds for all levels + std::vector> host_neighbors; + if (create_hierarchy) { + for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { + auto num_pts = n_rows - offsets[pt_level - 1]; + auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1; + host_neighbors.emplace_back(raft::make_host_matrix(num_pts, neighbor_size)); + } + for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { + RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level); + auto removed_rows = offsets[pt_level - 1] - offsets[0]; + raft::host_matrix_view sub_query_view( + host_query_set.data_handle() + removed_rows * dim, + host_query_set.extent(0) - removed_rows, + dim); + auto neighbor_view = host_neighbors[pt_level - 1].view(); + all_neighbors_graph( + res, raft::make_const_mdspan(sub_query_view), neighbor_view, index_.metric()); + } + } + + if (create_hierarchy) { + RAFT_LOG_INFO("Assemble hierarchy linklists"); + next_report_offset = d_report_offset; + } + bytes_written = 0; + start_clock = std::chrono::system_clock::now(); + + for (int64_t i = 0; i < n_rows; i++) { + size_t cur_level = create_hierarchy ? levels[i] : 0; + unsigned int linkListSize = + create_hierarchy && cur_level > 0 ? 
appr_algo->size_links_per_element_ * cur_level : 0; + os.write(reinterpret_cast(&linkListSize), sizeof(int)); + bytes_written += sizeof(int); + if (linkListSize) { + for (size_t pt_level = 1; pt_level <= cur_level; pt_level++) { + auto neighbor_view = host_neighbors[pt_level - 1].view(); + auto my_row = order_bw[i] - offsets[pt_level - 1]; + + IdxT* neighbors = &neighbor_view(my_row, 0); + unsigned int extent = neighbor_view.extent(1); + os.write(reinterpret_cast(&extent), sizeof(int)); + for (unsigned int j = 0; j < extent; j++) { + const IdxT converted = order[neighbors[j] + offsets[pt_level - 1]]; + os.write(reinterpret_cast(&converted), sizeof(IdxT)); + } + auto remainder = appr_algo->M_ - neighbor_view.extent(1); + for (size_t j = 0; j < remainder; j++) { + os.write(reinterpret_cast(&zero), sizeof(IdxT)); + } + bytes_written += (neighbor_view.extent(1) + remainder) * sizeof(IdxT) + sizeof(int); + RAFT_EXPECTS(appr_algo->size_links_per_element_ == + (neighbor_view.extent(1) + remainder) * sizeof(IdxT) + sizeof(int), + "Size links per element mismatch"); + } + } + + const auto end_clock = std::chrono::system_clock::now(); + if (i > next_report_offset) { + next_report_offset += d_report_offset; + const auto time = + std::chrono::duration_cast(end_clock - start_clock).count() * + 1e-6; + float throughput = bytes_written / GiB / time; + float rows_throughput = i / time; + float ETA = (n_rows - i) / rows_throughput; + RAFT_LOG_INFO( + "# Writing rows %12lu / %12lu (%3.2f %%), %3.2f GiB/sec, ETA %d:%3.1f, written %3.2f GiB\r", + i, + n_rows, + i / static_cast(n_rows) * 100, + throughput, + int(ETA / 60), + std::fmod(ETA, 60.0f), + bytes_written / GiB); + } + } + + // Flush buffered output and check data was written + os.flush(); + os_raw.flush(); + auto final_pos = os_raw.tellp(); + RAFT_LOG_DEBUG("HNSW file size: %ld bytes", static_cast(final_pos)); + if (!os_raw.good()) { RAFT_LOG_WARN("Output stream is not in good state after serialization"); } + + auto end_time = std::chrono::system_clock::now(); + auto elapsed_time = + std::chrono::duration_cast(end_time - start_time).count(); + RAFT_LOG_INFO("HNSW serialization from disk complete in %ld ms", elapsed_time); } template @@ -315,7 +832,10 @@ std::enable_if_t>> fro // initialize HNSW index auto hnsw_index = std::make_unique>(dim, cagra_index.metric(), hierarchy); auto appr_algo = std::make_unique::type>>( - hnsw_index->get_space(), n_rows, cagra_index.graph().extent(1) / 2, params.ef_construction); + hnsw_index->get_space(), + n_rows, + (cagra_index.graph().extent(1) + 1) / 2, + params.ef_construction); appr_algo->cur_element_count = n_rows; // Initialize linked lists @@ -514,6 +1034,45 @@ std::unique_ptr> from_cagra( const cuvs::neighbors::cagra::index& cagra_index, std::optional> dataset) { + // special treatment for index on disk + if (cagra_index.dataset_fd().has_value() && cagra_index.graph_fd().has_value()) { + // Get directory from graph file descriptor + const auto& graph_fd = cagra_index.graph_fd(); + RAFT_EXPECTS(graph_fd.has_value() && graph_fd->is_valid(), + "Graph file descriptor is not available for disk-backed index"); + + std::string graph_path = graph_fd->get_path(); + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + + std::string index_directory = std::filesystem::path(graph_path).parent_path().string(); + RAFT_EXPECTS( + std::filesystem::exists(index_directory) && std::filesystem::is_directory(index_directory), + "Directory '%s' does not exist", + index_directory.c_str()); + 
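+    // Serialize the disk-backed CAGRA index straight into hnswlib format next to the existing
+    // build artifacts; the resulting file is reopened read-only and handed to the HNSW index below.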
std::string index_filename = + (std::filesystem::path(index_directory) / "hnsw_index.bin").string(); + + std::ofstream of(index_filename, std::ios::out | std::ios::binary); + + RAFT_EXPECTS(of, "Cannot open file %s", index_filename.c_str()); + + serialize_to_hnswlib_from_disk(res, of, params, cagra_index); + + of.close(); + RAFT_EXPECTS(of, "Error writing output %s", index_filename.c_str()); + + // Create an empty HNSW index that holds the file descriptor + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), params.hierarchy); + + // Open file descriptor for the HNSW index file and transfer ownership to the index + hnsw_index->set_file_descriptor(cuvs::util::file_descriptor(index_filename, O_RDONLY)); + + RAFT_LOG_INFO("HNSW index written to disk at: %s", index_filename.c_str()); + + return hnsw_index; + } + if (params.hierarchy == HnswHierarchy::NONE) { return from_cagra(res, params, cagra_index, dataset); } else if (params.hierarchy == HnswHierarchy::CPU) { @@ -531,6 +1090,10 @@ void extend(raft::resources const& res, raft::host_matrix_view additional_dataset, index& idx) { + auto* idx_impl = dynamic_cast*>(&idx); + RAFT_EXPECTS(!idx_impl || !idx_impl->file_descriptor().has_value(), + "Cannot extend an HNSW index that is stored on disk. " + "The index must be deserialized into memory first using hnsw::deserialize()."); auto* hnswlib_index = reinterpret_cast::type>*>( const_cast(idx.get_index())); auto current_element_count = hnswlib_index->getCurrentElementCount(); @@ -572,6 +1135,11 @@ void search(raft::resources const& res, raft::host_matrix_view neighbors, raft::host_matrix_view distances) { + auto* idx_impl = dynamic_cast*>(&idx); + RAFT_EXPECTS(!idx_impl || !idx_impl->file_descriptor().has_value(), + "Cannot search an HNSW index that is stored on disk. " + "The index must be deserialized into memory first using hnsw::deserialize()."); + RAFT_EXPECTS(queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), "Number of rows in output neighbors and distances matrices must equal the number of " "queries."); @@ -611,6 +1179,10 @@ void search(raft::resources const& res, template void serialize(raft::resources const& res, const std::string& filename, const index& idx) { + auto* idx_impl = dynamic_cast*>(&idx); + RAFT_EXPECTS(!idx_impl || !idx_impl->file_descriptor().has_value(), + "Cannot serialize an HNSW index that is stored on disk. " + "The index must be deserialized into memory first using hnsw::deserialize()."); auto* hnswlib_index = reinterpret_cast::type>*>( const_cast(idx.get_index())); hnswlib_index->saveIndex(filename); diff --git a/cpp/src/util/file_io.cpp b/cpp/src/util/file_io.cpp new file mode 100644 index 0000000000..d924527e72 --- /dev/null +++ b/cpp/src/util/file_io.cpp @@ -0,0 +1,81 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include +#include + +#include +#include + +namespace cuvs::util { + +void read_large_file(const file_descriptor& fd, + void* dest_ptr, + const size_t total_bytes, + const uint64_t file_offset) +{ + RAFT_EXPECTS(total_bytes > 0, "Total bytes must be greater than 0"); + RAFT_EXPECTS(dest_ptr != nullptr, "Destination pointer must not be nullptr"); + RAFT_EXPECTS(fd.is_valid(), "File descriptor must be valid"); + + const size_t read_chunk_size = std::min(1024 * 1024 * 1024, SSIZE_MAX); + size_t bytes_remaining = total_bytes; + size_t offset = 0; + + while (bytes_remaining > 0) { + const size_t chunk_size = std::min(read_chunk_size, bytes_remaining); + const uint64_t file_pos = file_offset + offset; + const ssize_t bytes_read = + pread(fd.get(), reinterpret_cast(dest_ptr) + offset, chunk_size, file_pos); + + RAFT_EXPECTS( + bytes_read != -1, "Failed to read from file at offset %lu: %s", file_pos, strerror(errno)); + RAFT_EXPECTS(bytes_read == static_cast(chunk_size), + "Incomplete read from file. Expected %zu bytes, got %zd at offset %lu", + chunk_size, + bytes_read, + file_pos); + + bytes_remaining -= chunk_size; + offset += chunk_size; + } +} + +void write_large_file(const file_descriptor& fd, + const void* data_ptr, + const size_t total_bytes, + const uint64_t file_offset) +{ + RAFT_EXPECTS(total_bytes > 0, "Total bytes must be greater than 0"); + RAFT_EXPECTS(data_ptr != nullptr, "Data pointer must not be nullptr"); + RAFT_EXPECTS(fd.is_valid(), "File descriptor must be valid"); + + const size_t write_chunk_size = std::min(1024 * 1024 * 1024, SSIZE_MAX); + size_t bytes_remaining = total_bytes; + size_t offset = 0; + + while (bytes_remaining > 0) { + const size_t chunk_size = std::min(write_chunk_size, bytes_remaining); + const uint64_t file_pos = file_offset + offset; + const ssize_t chunk_written = + pwrite(fd.get(), reinterpret_cast(data_ptr) + offset, chunk_size, file_pos); + + RAFT_EXPECTS( + chunk_written != -1, "Failed to write to file at offset %lu: %s", file_pos, strerror(errno)); + RAFT_EXPECTS(chunk_written == static_cast(chunk_size), + "Incomplete write to file. Expected %zu bytes, wrote %zd at offset %lu", + chunk_size, + chunk_written, + file_pos); + + bytes_remaining -= chunk_size; + offset += chunk_size; + } +} + +} // namespace cuvs::util diff --git a/cpp/src/util/host_memory.cpp b/cpp/src/util/host_memory.cpp new file mode 100644 index 0000000000..23e5ff4258 --- /dev/null +++ b/cpp/src/util/host_memory.cpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include + +namespace cuvs::util { + +size_t get_free_host_memory() +{ + size_t available_memory = 0; + std::ifstream meminfo("/proc/meminfo"); + std::string line; + while (std::getline(meminfo, line)) { + if (line.find("MemAvailable:") != std::string::npos) { + available_memory = std::stoi(line.substr(line.find(":") + 1)); + } + } + available_memory *= 1024; + meminfo.close(); + RAFT_EXPECTS(available_memory > 0, "Failed to get available memory from /proc/meminfo"); + return available_memory; +} + +} // namespace cuvs::util diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 56b53ef697..85a28950ec 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -194,6 +194,34 @@ ConfigureTest( PERCENT 100 ) +ConfigureTest( + NAME NEIGHBORS_ANN_CAGRA_ACE_FLOAT_UINT32_TEST + PATH neighbors/ann_cagra_ace/test_float_uint32_t.cu + GPUS 1 + PERCENT 100 +) + +ConfigureTest( + NAME NEIGHBORS_ANN_CAGRA_ACE_HALF_UINT32_TEST + PATH neighbors/ann_cagra_ace/test_half_uint32_t.cu + GPUS 1 + PERCENT 100 +) + +ConfigureTest( + NAME NEIGHBORS_ANN_CAGRA_ACE_INT8_UINT32_TEST + PATH neighbors/ann_cagra_ace/test_int8_t_uint32_t.cu + GPUS 1 + PERCENT 100 +) + +ConfigureTest( + NAME NEIGHBORS_ANN_CAGRA_ACE_UINT8_UINT32_TEST + PATH neighbors/ann_cagra_ace/test_uint8_t_uint32_t.cu + GPUS 1 + PERCENT 100 +) + ConfigureTest( NAME NEIGHBORS_ANN_NN_DESCENT_TEST PATH neighbors/ann_nn_descent/test_float_uint32_t.cu diff --git a/cpp/tests/neighbors/ann_cagra_ace.cuh b/cpp/tests/neighbors/ann_cagra_ace.cuh new file mode 100644 index 0000000000..4c6d96050b --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra_ace.cuh @@ -0,0 +1,270 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include "ann_cagra.cuh" + +#include + +#include +#include + +namespace cuvs::neighbors::cagra { + +struct AnnCagraAceInputs { + int n_queries; + int n_rows; + int dim; + int k; + int npartitions; + int ef_construction; + bool use_disk; + cuvs::distance::DistanceType metric; + double min_recall; +}; + +inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraAceInputs& p) +{ + os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim + << ", k=" << p.k << ", npartitions=" << p.npartitions + << ", ef_construction=" << p.ef_construction + << ", use_disk=" << (p.use_disk ? 
"true" : "false") << ", metric="; + switch (p.metric) { + case cuvs::distance::DistanceType::L2Expanded: os << "L2"; break; + case cuvs::distance::DistanceType::InnerProduct: os << "InnerProduct"; break; + default: os << "Unknown"; break; + } + os << ", min_recall=" << p.min_recall << "}"; + return os; +} + +template +class AnnCagraAceTest : public ::testing::TestWithParam { + public: + AnnCagraAceTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database_dev(0, stream_), + search_queries(0, stream_) + { + } + + protected: + void testAce() + { + size_t queries_size = ps.n_queries * ps.k; + std::vector indices_ace(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_ace(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + + cuvs::neighbors::naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + search_queries.data(), + database_dev.data(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + // Create temporary directory for ACE build + std::string temp_dir = std::string("/tmp/cuvs_ace_test_") + std::to_string(std::time(nullptr)) + + "_" + std::to_string(reinterpret_cast(this)); + std::filesystem::create_directories(temp_dir); + + { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + cagra::index_params index_params; + index_params.metric = ps.metric; + index_params.intermediate_graph_degree = 128; + index_params.graph_degree = 64; + auto ace_params = graph_build_params::ace_params(); + ace_params.npartitions = ps.npartitions; + ace_params.ef_construction = ps.ef_construction; + ace_params.build_dir = temp_dir; + ace_params.use_disk = ps.use_disk; + index_params.graph_build_params = ace_params; + + auto index = + cagra::build(handle_, index_params, raft::make_const_mdspan(database_host.view())); + + ASSERT_EQ(index.size(), ps.n_rows); + + if (ps.use_disk) { + // Verify disk-based ACE index using HNSW index from disk + EXPECT_TRUE(index.dataset_fd().has_value() && index.graph_fd().has_value()); + + // Verify file directory from graph file descriptor + const auto& graph_fd = index.graph_fd(); + EXPECT_TRUE(graph_fd.has_value() && graph_fd->is_valid()); + std::string graph_path = graph_fd->get_path(); + std::string file_dir = std::filesystem::path(graph_path).parent_path().string(); + EXPECT_EQ(file_dir, temp_dir); + + EXPECT_TRUE(std::filesystem::exists(temp_dir + "/cagra_graph.npy")); + EXPECT_GE(std::filesystem::file_size(temp_dir + "/cagra_graph.npy"), + ps.n_rows * index_params.graph_degree * sizeof(IdxT)); + + EXPECT_TRUE(std::filesystem::exists(temp_dir + "/reordered_dataset.npy")); + EXPECT_GE(std::filesystem::file_size(temp_dir + "/reordered_dataset.npy"), + ps.n_rows * ps.dim * sizeof(DataT)); + + EXPECT_TRUE(std::filesystem::exists(temp_dir + "/dataset_mapping.npy")); + EXPECT_GE(std::filesystem::file_size(temp_dir + "/dataset_mapping.npy"), + ps.n_rows * sizeof(IdxT)); + + hnsw::index_params hnsw_params; + hnsw_params.hierarchy = 
hnsw::HnswHierarchy::GPU; + + auto hnsw_index = hnsw::from_cagra(handle_, hnsw_params, index); + ASSERT_NE(hnsw_index, nullptr); + + std::string hnsw_index_path = temp_dir + "/hnsw_index.bin"; + EXPECT_TRUE(std::filesystem::exists(hnsw_index_path)); + // For GPU hierarchy, HNSW index includes multi-layer structure + // The size should be at least the base layer size + auto hnsw_file_size = std::filesystem::file_size(hnsw_index_path); + EXPECT_GE(hnsw_file_size, ps.n_rows * index_params.graph_degree * sizeof(IdxT)); + + hnsw::index* hnsw_index_raw = nullptr; + hnsw::deserialize( + handle_, hnsw_params, hnsw_index_path, ps.dim, ps.metric, &hnsw_index_raw); + ASSERT_NE(hnsw_index_raw, nullptr); + + std::unique_ptr> hnsw_index_deserialized(hnsw_index_raw); + EXPECT_EQ(hnsw_index_deserialized->dim(), ps.dim); + EXPECT_EQ(hnsw_index_deserialized->metric(), ps.metric); + + auto queries_host = raft::make_host_matrix(ps.n_queries, ps.dim); + raft::copy( + queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + auto indices_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + auto distances_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + + hnsw::search_params search_params; + search_params.ef = std::max(ps.ef_construction, ps.k * 2); + search_params.num_threads = 1; + + hnsw::search(handle_, + search_params, + *hnsw_index_deserialized, + queries_host.view(), + indices_hnsw_host.view(), + distances_hnsw_host.view()); + + for (size_t i = 0; i < queries_size; i++) { + indices_ace[i] = static_cast(indices_hnsw_host.data_handle()[i]); + distances_ace[i] = distances_hnsw_host.data_handle()[i]; + } + + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_ace, + distances_naive, + distances_ace, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Disk-based ACE index loaded via HNSW failed recall check"; + } else { + // For in-memory ACE, we can search directly + EXPECT_FALSE(index.dataset_fd().has_value() || index.graph_fd().has_value()); + ASSERT_GT(index.graph().size(), 0); + EXPECT_EQ(index.graph_degree(), 64); + + rmm::device_uvector distances_dev(queries_size, stream_); + rmm::device_uvector indices_dev(queries_size, stream_); + + auto queries_view = raft::make_device_matrix_view( + search_queries.data(), ps.n_queries, ps.dim); + auto indices_view = + raft::make_device_matrix_view(indices_dev.data(), ps.n_queries, ps.k); + auto distances_view = raft::make_device_matrix_view( + distances_dev.data(), ps.n_queries, ps.k); + + cagra::search_params search_params; + search_params.itopk_size = 64; + + cagra::search(handle_, search_params, index, queries_view, indices_view, distances_view); + + raft::update_host(distances_ace.data(), distances_dev.data(), queries_size, stream_); + raft::update_host(indices_ace.data(), indices_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_ace, + distances_naive, + distances_ace, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "In-memory ACE index failed recall check"; + } + } + + // Clean up temporary directory + std::filesystem::remove_all(temp_dir); + } + + void SetUp() override + { + database_dev.resize(((size_t)ps.n_rows) * ps.dim, stream_); + search_queries.resize(ps.n_queries * ps.dim, stream_); + raft::random::RngState r(1234ULL); + InitDataset(handle_, database_dev.data(), ps.n_rows, ps.dim, ps.metric, r); + InitDataset(handle_, search_queries.data(), ps.n_queries, ps.dim, ps.metric, 
r); + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database_dev.resize(0, stream_); + search_queries.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnCagraAceInputs ps; + rmm::device_uvector database_dev; + rmm::device_uvector search_queries; +}; + +inline std::vector generate_ace_inputs() +{ + return raft::util::itertools::product( + {10}, // n_queries + {5000}, // n_rows + {64, 128}, // dim + {10}, // k + {2, 4}, // npartitions + {100}, // ef_construction + {false, true}, // use_disk (test both modes) + {cuvs::distance::DistanceType::L2Expanded, + cuvs::distance::DistanceType::InnerProduct}, // metric + {0.9} // min_recall + ); +} + +const std::vector ace_inputs = generate_ace_inputs(); + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/tests/neighbors/ann_cagra_ace/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_cagra_ace/test_float_uint32_t.cu new file mode 100644 index 0000000000..de96a40339 --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra_ace/test_float_uint32_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "../ann_cagra_ace.cuh" + +namespace cuvs::neighbors::cagra { + +typedef AnnCagraAceTest AnnCagraAceTestF_U32; +TEST_P(AnnCagraAceTestF_U32, AnnCagraAce) { this->testAce(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraAceTest, AnnCagraAceTestF_U32, ::testing::ValuesIn(ace_inputs)); + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/tests/neighbors/ann_cagra_ace/test_half_uint32_t.cu b/cpp/tests/neighbors/ann_cagra_ace/test_half_uint32_t.cu new file mode 100644 index 0000000000..a1a6ec1397 --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra_ace/test_half_uint32_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "../ann_cagra_ace.cuh" + +namespace cuvs::neighbors::cagra { + +typedef AnnCagraAceTest AnnCagraAceTestF16_U32; +TEST_P(AnnCagraAceTestF16_U32, AnnCagraAce) { this->testAce(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraAceTest, AnnCagraAceTestF16_U32, ::testing::ValuesIn(ace_inputs)); + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/tests/neighbors/ann_cagra_ace/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_cagra_ace/test_int8_t_uint32_t.cu new file mode 100644 index 0000000000..3973b72cd6 --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra_ace/test_int8_t_uint32_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "../ann_cagra_ace.cuh" + +namespace cuvs::neighbors::cagra { + +typedef AnnCagraAceTest AnnCagraAceTestI8_U32; +TEST_P(AnnCagraAceTestI8_U32, AnnCagraAce) { this->testAce(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraAceTest, AnnCagraAceTestI8_U32, ::testing::ValuesIn(ace_inputs)); + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/tests/neighbors/ann_cagra_ace/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_cagra_ace/test_uint8_t_uint32_t.cu new file mode 100644 index 0000000000..5ca6f038df --- /dev/null +++ b/cpp/tests/neighbors/ann_cagra_ace/test_uint8_t_uint32_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "../ann_cagra_ace.cuh" + +namespace cuvs::neighbors::cagra { + +typedef AnnCagraAceTest AnnCagraAceTestU8_U32; +TEST_P(AnnCagraAceTestU8_U32, AnnCagraAce) { this->testAce(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraAceTest, AnnCagraAceTestU8_U32, ::testing::ValuesIn(ace_inputs)); + +} // namespace cuvs::neighbors::cagra diff --git a/docs/source/cuvs_bench/param_tuning.rst b/docs/source/cuvs_bench/param_tuning.rst index a0de6daf09..12e804dd56 100644 --- a/docs/source/cuvs_bench/param_tuning.rst +++ b/docs/source/cuvs_bench/param_tuning.rst @@ -201,7 +201,7 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g * - `graph_build_algo` - `build` - `N` - - [`IVF_PQ`, `NN_DESCENT`] + - [`IVF_PQ`, `NN_DESCENT`, `ACE`] - `IVF_PQ` - Algorithm to use for building the initial kNN graph, from which CAGRA will optimize into the navigable CAGRA graph @@ -212,6 +212,34 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g - `mmap` - Where should the dataset reside? + * - `npartitions` + - `build` + - N + - Positive integer >0 + - 1 + - The number of partitions to use for the ACE build. Small values might improve recall but potentially degrade performance and increase memory usage. Partitions should not be too small to prevent issues in KNN graph construction. 100k - 5M vectors per partition is recommended depending on the available host and GPU memory. The partition size is on average 2 * (n_rows / npartitions) * dim * sizeof(T). 2 is because of the core and augmented vectors. Please account for imbalance in the partition sizes (up to 3x in our tests). + + * - `build_dir` + - `build` + - N + - String + - "/tmp/ace_build" + - The directory to use for the ACE build. Must be specified when using ACE build. This should be the fastest disk in the system and hold enough space for twice the dataset, final graph, and label mapping. + + * - `ef_construction` + - `build` + - Y + - Positive integer >0 + - 120 + - Controls index time and accuracy when using ACE build. Bigger values increase the index quality. At some point, increasing this will no longer improve the quality. + + * - `use_disk` + - `build` + - N + - Boolean + - `false` + - Whether to use disk-based storage for ACE build. When true, forces ACE to use disk-based storage even if the graph fits in host and GPU memory. When false, ACE will use in-memory storage if the graph fits in host and GPU memory and disk-based storage otherwise. 
+ * - `query_memory_type` - `search` - N diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index bd87df5782..619583a83e 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -30,6 +30,7 @@ include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # add_executable(BRUTE_FORCE_EXAMPLE src/brute_force_bitmap.cu) +add_executable(CAGRA_HNSW_ACE_EXAMPLE src/cagra_hnsw_ace_example.cu) add_executable(CAGRA_EXAMPLE src/cagra_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) @@ -41,6 +42,7 @@ add_executable(SCANN_EXAMPLE src/scann_example.cu) # `$` is a generator expression that ensures that targets are # installed in a conda environment, if one exists target_link_libraries(BRUTE_FORCE_EXAMPLE PRIVATE cuvs::cuvs $) +target_link_libraries(CAGRA_HNSW_ACE_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries( CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads diff --git a/examples/cpp/src/cagra_hnsw_ace_example.cu b/examples/cpp/src/cagra_hnsw_ace_example.cu new file mode 100644 index 0000000000..b2474eeab9 --- /dev/null +++ b/examples/cpp/src/cagra_hnsw_ace_example.cu @@ -0,0 +1,182 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "common.cuh" + +void cagra_build_search_ace(raft::device_resources const& dev_resources, + raft::device_matrix_view dataset, + raft::device_matrix_view queries) +{ + using namespace cuvs::neighbors; + + int64_t topk = 12; + int64_t n_queries = queries.extent(0); + + // create output arrays + auto neighbors = raft::make_device_matrix(dev_resources, n_queries, topk); + auto distances = raft::make_device_matrix(dev_resources, n_queries, topk); + + // CAGRA index parameters + cagra::index_params index_params; + index_params.intermediate_graph_degree = 128; + index_params.graph_degree = 64; + + // ACE index parameters + auto ace_params = cagra::graph_build_params::ace_params(); + // Set the number of partitions. Small values might improve recall but potentially degrade + // performance and increase memory usage. Partitions should not be too small to prevent issues in + // KNN graph construction. 100k - 5M vectors per partition is recommended depending on the + // available host and GPU memory. The partition size is on average 2 * (n_rows / npartitions) * + // dim * sizeof(T). 2 is because of the core and augmented vectors. Please account for imbalance + // in the partition sizes (up to 3x in our tests). + ace_params.npartitions = 4; + // Set the index quality for the ACE build. Bigger values increase the index quality. At some + // point, increasing this will no longer improve the quality. + ace_params.ef_construction = 120; + // Set the directory to store the ACE build artifacts. This should be the fastest disk in the + // system and hold enough space for twice the dataset, final graph, and label mapping. + ace_params.build_dir = "/tmp/ace_build"; + // Set whether to use disk-based storage for ACE build. When true, enables disk-based operations + // for memory-efficient graph construction. If not set, the index will be built in memory if the + // graph fits in host and GPU memory, and on disk otherwise. 
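+  // This example forces the disk-based path so the HNSW conversion and deserialization steps
+  // further below are exercised end to end.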
+ ace_params.use_disk = true; + index_params.graph_build_params = ace_params; + + // ACE requires the dataset to be on the host + auto dataset_host = raft::make_host_matrix(dataset.extent(0), dataset.extent(1)); + raft::copy(dataset_host.data_handle(), + dataset.data_handle(), + dataset.extent(0) * dataset.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + auto dataset_host_view = raft::make_host_matrix_view( + dataset_host.data_handle(), dataset_host.extent(0), dataset_host.extent(1)); + + std::cout << "Building CAGRA index (search graph)" << std::endl; + auto index = cagra::build(dev_resources, index_params, dataset_host_view); + // In-memory build of ACE provides the index in memory, so we can search it directly using + // cagra::search + + // On-disk build of ACE stores the reordered dataset, the dataset mapping, and the graph on disk. + // The index is not directly usable for CAGRA search. Convert to HNSW for search operations. + + // Convert CAGRA index to HNSW + // For disk-based indices: serializes CAGRA to HNSW format on disk, returns an index with file + // descriptor For in-memory indices: creates HNSW index in memory + std::cout << "Converting CAGRA index to HNSW" << std::endl; + hnsw::index_params hnsw_params; + auto hnsw_index = hnsw::from_cagra(dev_resources, hnsw_params, index); + + // HNSW search requires host matrices + auto queries_host = raft::make_host_matrix(n_queries, queries.extent(1)); + raft::copy(queries_host.data_handle(), + queries.data_handle(), + n_queries * queries.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + // HNSW search outputs uint64_t indices + auto indices_hnsw_host = raft::make_host_matrix(n_queries, topk); + auto distances_hnsw_host = raft::make_host_matrix(n_queries, topk); + + hnsw::search_params hnsw_search_params; + hnsw_search_params.ef = std::max(200, static_cast(topk) * 2); + hnsw_search_params.num_threads = 1; + + // If the HNSW index is in memory, search directly + // std::cout << "HNSW index in memory. Searching..." << std::endl; + // hnsw::search(dev_resources, + // hnsw_search_params, + // *hnsw_index, + // queries_host.view(), + // indices_hnsw_host.view(), + // distances_hnsw_host.view()); + + // If the HNSW index is stored on disk, deserialize it for searching + std::cout << "HNSW index is stored on disk." << std::endl; + + // For disk-based indices, the HNSW index file path can be obtained via file_path() + std::string hnsw_index_path = hnsw_index->file_path(); + std::cout << "HNSW index file location: " << hnsw_index_path << std::endl; + std::cout << "Deserializing HNSW index from disk for search." << std::endl; + + hnsw::index* hnsw_index_raw = nullptr; + hnsw::deserialize( + dev_resources, hnsw_params, hnsw_index_path, index.dim(), index.metric(), &hnsw_index_raw); + + std::unique_ptr> hnsw_index_deserialized(hnsw_index_raw); + + std::cout << "Searching HNSW index." 
<< std::endl; + hnsw::search(dev_resources, + hnsw_search_params, + *hnsw_index_deserialized, + queries_host.view(), + indices_hnsw_host.view(), + distances_hnsw_host.view()); + + // Convert HNSW uint64_t indices back to uint32_t for printing + auto neighbors_host = raft::make_host_matrix(n_queries, topk); + for (int64_t i = 0; i < n_queries; i++) { + for (int64_t j = 0; j < topk; j++) { + neighbors_host(i, j) = static_cast(indices_hnsw_host(i, j)); + } + } + + // Copy results to device + raft::copy(neighbors.data_handle(), + neighbors_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::copy(distances.data_handle(), + distances_hnsw_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + print_results(dev_resources, neighbors.view(), distances.view()); +} + +int main() +{ + raft::device_resources dev_resources; + + // Set pool memory resource with 1 GiB initial pool size. All allocations use the same pool. + rmm::mr::pool_memory_resource pool_mr( + rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull); + rmm::mr::set_current_device_resource(&pool_mr); + + // Alternatively, one could define a pool allocator for temporary arrays (used within RAFT + // algorithms). In that case only the internal arrays would use the pool, any other allocation + // uses the default RMM memory resource. Here is how to change the workspace memory resource to + // a pool with 2 GiB upper limit. + // raft::resource::set_workspace_to_pool_resource(dev_resources, 2 * 1024 * 1024 * 1024ull); + + // Create input arrays. + int64_t n_samples = 10000; + int64_t n_dim = 90; + int64_t n_queries = 10; + auto dataset = raft::make_device_matrix(dev_resources, n_samples, n_dim); + auto queries = raft::make_device_matrix(dev_resources, n_queries, n_dim); + generate_dataset(dev_resources, dataset.view(), queries.view()); + + // ACE build and search example. + cagra_build_search_ace(dev_resources, + raft::make_const_mdspan(dataset.view()), + raft::make_const_mdspan(queries.view())); +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndexParams.java index 2e47ac27e2..e185ed9f26 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndexParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndexParams.java @@ -24,6 +24,7 @@ public class CagraIndexParams { private final long nnDescentNiter; private final int numWriterThreads; private final CuVSIvfPqParams cuVSIvfPqParams; + private final CuVSAceParams cuVSAceParams; private final CagraCompressionParams cagraCompressionParams; /** @@ -41,7 +42,12 @@ public enum CagraGraphBuildAlgo { /** * Experimental, use NN-Descent to build all-neighbors knn graph */ - NN_DESCENT(2); + NN_DESCENT(2), + /** + * Experimental, use ACE (Augmented Core Extraction) to build graph for large datasets. + * 4 to be consistent with the other interfaces. + */ + ACE(4); /** * The value for the enum choice. 
@@ -329,6 +335,7 @@ private CagraIndexParams( int writerThreads, CuvsDistanceType cuvsDistanceType, CuVSIvfPqParams cuVSIvfPqParams, + CuVSAceParams cuVSAceParams, CagraCompressionParams cagraCompressionParams) { this.intermediateGraphDegree = intermediateGraphDegree; this.graphDegree = graphDegree; @@ -337,6 +344,7 @@ private CagraIndexParams( this.numWriterThreads = writerThreads; this.cuvsDistanceType = cuvsDistanceType; this.cuVSIvfPqParams = cuVSIvfPqParams; + this.cuVSAceParams = cuVSAceParams; this.cagraCompressionParams = cagraCompressionParams; } @@ -405,6 +413,13 @@ public CuVSIvfPqParams getCuVSIvfPqParams() { return cuVSIvfPqParams; } + /** + * Gets the ACE parameters. + */ + public CuVSAceParams getCuVSAceParams() { + return cuVSAceParams; + } + /** * Gets the CAGRA build algorithm. */ @@ -435,6 +450,8 @@ public String toString() { + numWriterThreads + ", cuVSIvfPqParams=" + cuVSIvfPqParams + + ", cuVSAceParams=" + + cuVSAceParams + ", cagraCompressionParams=" + cagraCompressionParams + "]"; @@ -452,6 +469,7 @@ public static class Builder { private long nnDescentNumIterations = 20; private int numWriterThreads = 2; private CuVSIvfPqParams cuVSIvfPqParams = new CuVSIvfPqParams.Builder().build(); + private CuVSAceParams cuVSAceParams = new CuVSAceParams.Builder().build(); private CagraCompressionParams cagraCompressionParams; public Builder() {} @@ -535,6 +553,17 @@ public Builder withCuVSIvfPqParams(CuVSIvfPqParams cuVSIvfPqParams) { return this; } + /** + * Sets the ACE index parameters. + * + * @param cuVSAceParams the ACE index parameters + * @return an instance of Builder + */ + public Builder withCuVSAceParams(CuVSAceParams cuVSAceParams) { + this.cuVSAceParams = cuVSAceParams; + return this; + } + /** * Registers an instance of configured {@link CagraCompressionParams} with this * Builder. @@ -561,6 +590,7 @@ public CagraIndexParams build() { numWriterThreads, cuvsDistanceType, cuVSIvfPqParams, + cuVSAceParams, cagraCompressionParams); } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java new file mode 100644 index 0000000000..54c25814bb --- /dev/null +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java @@ -0,0 +1,184 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +/** + * Parameters for ACE (Augmented Core Extraction) graph build algorithm. + * ACE enables building indices for datasets too large to fit in GPU memory by: + * 1. Partitioning the dataset in core (closest) and augmented (second-closest) + * partitions using balanced k-means. + * 2. Building sub-indices for each partition independently + * 3. Concatenating sub-graphs into a final unified index + * + * @since 25.12 + */ +public class CuVSAceParams { + + /** + * Number of partitions for ACE (Augmented Core Extraction) partitioned build. + * + * Small values might improve recall but potentially degrade performance and increase memory usage. + * Partitions should not be too small to prevent issues in KNN graph construction. 100k - 5M + * vectors per partition is recommended depending on the available host and GPU memory. The + * partition size is on average {@code 2 * (n_rows / npartitions) * dim * sizeof(T)}—the factor 2 + * accounts for core and augmented vectors. Please account for imbalance in the partition sizes + * (up to 3x in our tests). 
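+   * For example (illustrative numbers only): 100 million float32 vectors with dim = 128 and
+   * npartitions = 32 yield an average partition of roughly 2 * (100e6 / 32) * 128 * 4 bytes,
+   * i.e. about 3.2 GB, so with up to 3x imbalance a single partition may need on the order of
+   * 10 GB of working memory.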
+ */ + private final long npartitions; + + /** + * The index quality for the ACE build. + * + * Bigger values increase the index quality. At some point, increasing this will no longer improve + * the quality. + */ + private final long efConstruction; + + /** + * Directory to store ACE build artifacts (e.g., KNN graph, optimized graph). + * + * Used when {@link #isUseDisk()} is true or when the graph does not fit in host and GPU memory. + * This should be the fastest disk in the system and hold enough space for twice the dataset, final + * graph, and label mapping. + */ + private final String buildDir; + + /** + * Whether to use disk-based storage for ACE builds. + * + * When true, enables disk-based operations for memory-efficient graph construction. + */ + private final boolean useDisk; + + private CuVSAceParams( + long npartitions, long efConstruction, String buildDir, boolean useDisk) { + this.npartitions = npartitions; + this.efConstruction = efConstruction; + this.buildDir = buildDir; + this.useDisk = useDisk; + } + + /** + * Gets the number of partitions. + * + * @return the number of partitions + */ + public long getNpartitions() { + return npartitions; + } + + /** + * Gets the {@code ef_construction} parameter. + * + * @return the {@code ef_construction} parameter + */ + public long getEfConstruction() { + return efConstruction; + } + + /** + * Gets the build directory path. + * + * @return the build directory path + */ + public String getBuildDir() { + return buildDir; + } + + /** + * Gets whether disk-based mode is enabled. + * + * @return true if disk-based mode is enabled + */ + public boolean isUseDisk() { + return useDisk; + } + + @Override + public String toString() { + return "CuVSAceParams [npartitions=" + + npartitions + + ", efConstruction=" + + efConstruction + + ", buildDir=" + + buildDir + + ", useDisk=" + + useDisk + + "]"; + } + + /** + * Builder configures and creates an instance of {@link CuVSAceParams}. + */ + public static class Builder { + + /** Number of partitions to split the dataset into */ + private long npartitions = 1; + + /** ef_construction parameter for HNSW used in ACE */ + private long efConstruction = 120; + + /** Directory to store intermediate build files */ + private String buildDir = "/tmp/ace_build"; + + /** Whether to use disk-based mode for very large datasets */ + private boolean useDisk = false; + + public Builder() {} + + /** + * Sets the number of partitions. + * + * @param npartitions the number of partitions + * @return an instance of Builder + */ + public Builder withNpartitions(long npartitions) { + this.npartitions = npartitions; + return this; + } + + /** + * Sets the ef_construction parameter. + * + * @param efConstruction the ef_construction parameter + * @return an instance of Builder + */ + public Builder withEfConstruction(long efConstruction) { + this.efConstruction = efConstruction; + return this; + } + + /** + * Sets the build directory path. + * + * @param buildDir the build directory path + * @return an instance of Builder + */ + public Builder withBuildDir(String buildDir) { + this.buildDir = buildDir; + return this; + } + + /** + * Sets whether to use disk-based mode. + * + * @param useDisk whether to use disk-based mode + * @return an instance of Builder + */ + public Builder withUseDisk(boolean useDisk) { + this.useDisk = useDisk; + return this; + } + + /** + * Builds an instance of {@link CuVSAceParams}. 
+ * + * @return an instance of {@link CuVSAceParams} + */ + public CuVSAceParams build() { + return new CuVSAceParams(npartitions, efConstruction, buildDir, useDisk); + } + } +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java index 6837c50505..c09111fcc3 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java @@ -43,6 +43,21 @@ static HnswIndex.Builder newBuilder(CuVSResources cuvsResources) { return CuVSProvider.provider().newHnswIndexBuilder(cuvsResources); } + /** + * Creates an HNSW index from an existing CAGRA index. + * + * @param hnswParams Parameters for the HNSW index + * @param cagraIndex The CAGRA index to convert from + * @return A new HNSW index + * @throws Throwable if an error occurs during conversion + */ + static HnswIndex fromCagra(HnswIndexParams hnswParams, CagraIndex cagraIndex) + throws Throwable { + Objects.requireNonNull(hnswParams); + Objects.requireNonNull(cagraIndex); + return CuVSProvider.provider().hnswIndexFromCagra(hnswParams, cagraIndex); + } + /** * Builder helps configure and create an instance of {@link HnswIndex}. */ diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index 107da0bd8e..5ff87e5c64 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -113,6 +113,17 @@ CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException; + /** + * Creates an HNSW index from an existing CAGRA index. + * + * @param hnswParams Parameters for the HNSW index + * @param cagraIndex The CAGRA index to convert from + * @return A new HNSW index + * @throws Throwable if an error occurs during conversion + */ + HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagraIndex) + throws Throwable; + /** Creates a new TieredIndex Builder. 
*/ TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java index 0b229009dd..d0f244d9ce 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java @@ -40,6 +40,12 @@ public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) { throw new UnsupportedOperationException(reasons); } + @Override + public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagraIndex) + throws Throwable { + throw new UnsupportedOperationException(reasons); + } + @Override public TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) { throw new UnsupportedOperationException(reasons); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java index efc278c1f5..41e46f78fd 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java @@ -505,6 +505,20 @@ public CuVSResources getCuVSResources() { return resources; } + /** + * Gets the CAGRA index reference (for internal use). + * Package-private to allow access from HnswIndexImpl. + * + * @return the memory segment representing the CAGRA index + */ + MemorySegment getCagraIndexReference() { + return cagraIndexReference.getMemorySegment(); + } + + CuVSMatrix getDatasetForConversion() { + return cagraIndexReference.dataset; + } + /** * Allocates the native CagraIndexParams data structures and fills the configured index parameters in. */ @@ -610,6 +624,25 @@ private static void populateNativeIndexParams( cuvsIvfPqParamsMemorySegment, params.getCuVSIvfPqParams().getRefinementRate()); cuvsCagraIndexParams.graph_build_params(indexPtr, cuvsIvfPqParamsMemorySegment); + } else if (params.getCagraGraphBuildAlgo().equals(CagraGraphBuildAlgo.ACE)) { + var aceParams = createAceParams(); + // Note: Do NOT add aceParams to handles list. + // The cuvsCagraIndexParamsDestroy will handle freeing the ACE params + // when graph_build_algo is ACE, just like it does for IVF-PQ params. 
+ MemorySegment cuvsAceParamsMemorySegment = aceParams.handle(); + CuVSAceParams cuVSAceParams = params.getCuVSAceParams(); + + cuvsAceParams.npartitions(cuvsAceParamsMemorySegment, cuVSAceParams.getNpartitions()); + cuvsAceParams.ef_construction(cuvsAceParamsMemorySegment, cuVSAceParams.getEfConstruction()); + cuvsAceParams.use_disk(cuvsAceParamsMemorySegment, cuVSAceParams.isUseDisk()); + + String buildDir = cuVSAceParams.getBuildDir(); + if (buildDir != null && !buildDir.isEmpty()) { + MemorySegment buildDirSegment = Util.duplicateNativeString(buildDir); + cuvsAceParams.build_dir(cuvsAceParamsMemorySegment, buildDirSegment); + } + + cuvsCagraIndexParams.graph_build_params(indexPtr, cuvsAceParamsMemorySegment); } } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index b45b90c7ec..9cfc7e2f49 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -122,6 +122,25 @@ public void close() { } } + public static CloseableHandle createAceParams() { + try (var localArena = Arena.ofConfined()) { + var paramsPtrPtr = localArena.allocate(cuvsAceParams_t); + checkCuVSError(cuvsAceParamsCreate(paramsPtrPtr), "cuvsAceParamsCreate"); + var paramsPtr = paramsPtrPtr.get(cuvsAceParams_t, 0L); + return new CloseableHandle() { + @Override + public MemorySegment handle() { + return paramsPtr; + } + + @Override + public void close() { + checkCuVSError(cuvsAceParamsDestroy(paramsPtr), "cuvsAceParamsDestroy"); + } + }; + } + } + static CloseableHandle createHnswIndexParams() { try (var localArena = Arena.ofConfined()) { var paramsPtrPtr = localArena.allocate(cuvsHnswIndexParams_t); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java index 876efce7ef..90ac4e7357 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java @@ -12,6 +12,8 @@ import static com.nvidia.cuvs.internal.common.Util.prepareTensor; import static com.nvidia.cuvs.internal.panama.headers_h.*; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CuVSMatrix; import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; @@ -59,6 +61,20 @@ private HnswIndexImpl( this.hnswIndexReference = deserialize(inputStream); } + /** + * Constructor for creating index from an existing IndexReference + * + * @param indexReference the index reference + * @param resources an instance of {@link CuVSResources} + * @param hnswIndexParams an instance of {@link HnswIndexParams} + */ + private HnswIndexImpl( + IndexReference indexReference, CuVSResources resources, HnswIndexParams hnswIndexParams) { + this.hnswIndexParams = hnswIndexParams; + this.resources = resources; + this.hnswIndexReference = indexReference; + } + /** * Invokes the native destroy_hnsw_index to de-allocate the HNSW index */ @@ -222,6 +238,105 @@ public static HnswIndex.Builder newBuilder(CuVSResources cuvsResources) { return new HnswIndexImpl.Builder(Objects.requireNonNull(cuvsResources)); } + /** + * Creates an HNSW index from an existing CAGRA index. 
+ * + * @param hnswParams Parameters for the HNSW index + * @param cagraIndex The CAGRA index to convert from + * @return A new HNSW index for in-memory indices, or null for disk-based indices + * @throws Throwable if an error occurs during conversion + */ + public static HnswIndex fromCagra(HnswIndexParams hnswParams, CagraIndex cagraIndex) + throws Throwable { + Objects.requireNonNull(hnswParams); + Objects.requireNonNull(cagraIndex); + + // Get the CAGRA index implementation to access internals + if (!(cagraIndex instanceof CagraIndexImpl)) { + throw new IllegalArgumentException("Invalid CagraIndex implementation"); + } + CagraIndexImpl cagraImpl = (CagraIndexImpl) cagraIndex; + CuVSResources resources = cagraImpl.getCuVSResources(); + + // Create HNSW index + MemorySegment hnswIndex = createHnswIndexHandle(); + + initializeIndexDType(hnswIndex, cagraImpl.getDatasetForConversion()); + + try (var localArena = Arena.ofConfined(); + var hnswParamsHandle = createHnswIndexParams()) { + MemorySegment hnswParamsMemorySegment = hnswParamsHandle.handle(); + + // Set HNSW params + cuvsHnswIndexParams.hierarchy( + hnswParamsMemorySegment, + hnswParams.getHierarchy().value); + cuvsHnswIndexParams.ef_construction( + hnswParamsMemorySegment, + hnswParams.getEfConstruction()); + cuvsHnswIndexParams.num_threads( + hnswParamsMemorySegment, + hnswParams.getNumThreads()); + + try (var resourcesAccessor = resources.access()) { + var cuvsRes = resourcesAccessor.handle(); + + // Call cuvsHnswFromCagra + int returnValue = + cuvsHnswFromCagra( + cuvsRes, + hnswParamsMemorySegment, + cagraImpl.getCagraIndexReference(), + hnswIndex); + checkCuVSError(returnValue, "cuvsHnswFromCagra"); + + returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); + } + } + return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams); + } + + /** + * Creates a new HNSW index handle. + */ + private static MemorySegment createHnswIndexHandle() { + try (var localArena = Arena.ofConfined()) { + MemorySegment indexPtrPtr = localArena.allocate(cuvsHnswIndex_t); + var returnValue = cuvsHnswIndexCreate(indexPtrPtr); + checkCuVSError(returnValue, "cuvsHnswIndexCreate"); + return indexPtrPtr.get(cuvsHnswIndex_t, 0); + } + } + + private static void initializeIndexDType(MemorySegment hnswIndex, CuVSMatrix dataset) { + int bits = 32; + int code = kDLFloat(); + + if (dataset instanceof CuVSMatrixInternal matrixInternal) { + bits = matrixInternal.bits(); + code = matrixInternal.code(); + } else if (dataset != null) { + bits = bitsFromDataType(dataset.dataType()); + code = CuVSMatrixInternal.code(dataset.dataType()); + } + + try (var localArena = Arena.ofConfined()) { + MemorySegment dtype = DLDataType.allocate(localArena); + DLDataType.bits(dtype, (byte) bits); + DLDataType.code(dtype, (byte) code); + DLDataType.lanes(dtype, (byte) 1); + cuvsHnswIndex.dtype(hnswIndex, dtype); + } + } + + private static int bitsFromDataType(CuVSMatrix.DataType dataType) { + return switch (dataType) { + case BYTE -> 8; + default -> 32; + }; + } + /** * Builder helps configure and create an instance of {@link HnswIndex}. 
*/ diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/Util.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/Util.java index f84eed5dbe..1117c08f23 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/Util.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/Util.java @@ -54,6 +54,11 @@ private Util() {} LINKER.downcallHandle( cudaMemcpyAsync$address(), cudaMemcpyAsync$descriptor(), Linker.Option.critical(true)); + private static final MethodHandle strdup$mh = + LINKER.downcallHandle( + SYMBOL_LOOKUP.find("strdup").orElseThrow(UnsatisfiedLinkError::new), + FunctionDescriptor.of(C_POINTER, C_POINTER)); + private static final MethodHandle cudaGetDeviceProperties$mh = LINKER.downcallHandle( SYMBOL_LOOKUP @@ -215,6 +220,19 @@ public static MemorySegment buildMemorySegment(Arena arena, String str) { return stringMemorySegment; } + /** + * Allocates a native (C-owned) copy of a string using strdup(). The returned memory must be freed + * by the native side (e.g. cuVS APIs) via free(). + */ + public static MemorySegment duplicateNativeString(String str) { + try (var arena = Arena.ofConfined()) { + MemorySegment src = buildMemorySegment(arena, str); + return (MemorySegment) strdup$mh.invokeExact(src); + } catch (Throwable t) { + throw new RuntimeException("Failed to duplicate native string", t); + } + } + /** * A utility method for building a {@link MemorySegment} for a 1D long array. * diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index facbc36a51..c639c48460 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -156,6 +156,12 @@ public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) { return HnswIndexImpl.newBuilder(Objects.requireNonNull(cuVSResources)); } + @Override + public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagraIndex) + throws Throwable { + return HnswIndexImpl.fromCagra(hnswParams, cagraIndex); + } + @Override public TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) { return TieredIndexImpl.newBuilder(Objects.requireNonNull(cuVSResources)); diff --git a/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraAceBuildAndSearchIT.java b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraAceBuildAndSearchIT.java new file mode 100644 index 0000000000..997fa560a7 --- /dev/null +++ b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraAceBuildAndSearchIT.java @@ -0,0 +1,243 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +import static com.carrotsearch.randomizedtesting.RandomizedTest.assumeTrue; +import static org.junit.Assert.*; + +import com.carrotsearch.randomizedtesting.RandomizedRunner; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CagraIndexParams.CuvsDistanceType; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Integration tests for CAGRA index using ACE (Augmented Core Extraction) build algorithm. 
+ * ACE enables building indices for datasets too large to fit in GPU memory by partitioning + * the dataset and building sub-indices. + * + * @since 25.12 + */ +@RunWith(RandomizedRunner.class) +public class CagraAceBuildAndSearchIT extends CuVSTestCase { + + private static final Logger log = LoggerFactory.getLogger(CagraAceBuildAndSearchIT.class); + + @Before + public void setup() { + assumeTrue("not supported on " + System.getProperty("os.name"), isLinuxAmd64()); + initializeRandom(); + log.trace("Random context initialized for test."); + } + + private static List<Map<Integer, Float>> getExpectedResults() { + return Arrays.asList( + Map.of(3, 0.038782578f, 2, 0.3590463f, 0, 0.83774555f), + Map.of(0, 0.12472608f, 2, 0.21700792f, 1, 0.31918612f), + Map.of(3, 0.047766715f, 2, 0.20332818f, 0, 0.48305473f), + Map.of(1, 0.15224178f, 0, 0.59063464f, 3, 0.5986642f)); + } + + private static float[][] createSampleQueries() { + return new float[][] { + {0.48216683f, 0.0428398f}, + {0.5084142f, 0.6545497f}, + {0.51260436f, 0.2643005f}, + {0.05198065f, 0.5789965f} + }; + } + + private static float[][] createSampleData() { + return new float[][] { + {0.74021935f, 0.9209938f}, + {0.03902049f, 0.9689629f}, + {0.92514056f, 0.4463501f}, + {0.6673192f, 0.10993068f} + }; + } + + /** + * Test ACE build with in-memory mode (use_disk=false). + * This tests the basic ACE functionality with small datasets that fit in memory. + */ + @Test + public void testAceInMemoryBuild() throws Throwable { + float[][] dataset = createSampleData(); + float[][] queries = createSampleQueries(); + List<Map<Integer, Float>> expectedResults = getExpectedResults(); + + try (CuVSResources resources = CheckedCuVSResources.create()) { + // Configure ACE parameters for in-memory mode + CuVSAceParams aceParams = + new CuVSAceParams.Builder() + .withNpartitions(2) + .withEfConstruction(120) + .withUseDisk(false) + .build(); + + // Configure index parameters with ACE build algorithm + CagraIndexParams indexParams = + new CagraIndexParams.Builder() + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.ACE) + .withGraphDegree(64) + .withIntermediateGraphDegree(128) + .withNumWriterThreads(2) + .withMetric(CuvsDistanceType.L2Expanded) + .withCuVSAceParams(aceParams) + .build(); + + // Build the index with ACE + try (CagraIndex index = + CagraIndex.newBuilder(resources) + .withDataset(dataset) + .withIndexParams(indexParams) + .build()) { + + // Verify index was built + assertNotNull("Index should not be null", index); + log.debug("ACE index built successfully in memory"); + + // Perform search + CagraSearchParams searchParams = new CagraSearchParams.Builder().build(); + + try (var queryVectors = CuVSMatrix.ofArray(queries)) { + CagraQuery cuvsQuery = + new CagraQuery.Builder(resources) + .withTopK(3) + .withSearchParams(searchParams) + .withQueryVectors(queryVectors) + .build(); + + SearchResults results = index.search(cuvsQuery); + log.debug("Search results: " + results.getResults().toString()); + + // Verify search results + checkResults(expectedResults, results.getResults()); + } + } + } + } + + /** + * Test ACE build with disk-based mode (use_disk=true). + * This tests ACE's ability to handle large datasets that don't fit in GPU memory.
+ */ + @Test + public void testAceDiskBasedBuild() throws Throwable { + float[][] dataset = createSampleData(); + float[][] queries = createSampleQueries(); + List<Map<Integer, Float>> expectedResults = getExpectedResults(); + + try (CuVSResources resources = CheckedCuVSResources.create()) { + // Configure ACE parameters for disk-based mode + Path buildDir = Path.of("/tmp/java_ace_test"); + CuVSAceParams aceParams = + new CuVSAceParams.Builder() + .withNpartitions(2) + .withEfConstruction(120) + .withUseDisk(true) + .withBuildDir(buildDir.toString()) + .build(); + + // Configure index parameters with ACE build algorithm + CagraIndexParams indexParams = + new CagraIndexParams.Builder() + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.ACE) + .withGraphDegree(64) + .withIntermediateGraphDegree(128) + .withNumWriterThreads(32) + .withMetric(CuvsDistanceType.L2Expanded) + .withCuVSAceParams(aceParams) + .build(); + + // Build the index with ACE in disk mode + try (CagraIndex index = + CagraIndex.newBuilder(resources) + .withDataset(dataset) + .withIndexParams(indexParams) + .build()) { + + // Verify index was built + assertNotNull("Index should not be null", index); + log.debug("ACE index built successfully with disk mode"); + + // Verify ACE created the expected output files in the build directory + assertTrue( + "CAGRA graph file should exist", + Files.exists(buildDir.resolve("cagra_graph.npy"))); + assertTrue( + "Reordered dataset file should exist", + Files.exists(buildDir.resolve("reordered_dataset.npy"))); + assertTrue( + "Dataset mapping file should exist", + Files.exists(buildDir.resolve("dataset_mapping.npy"))); + + log.debug("ACE disk output files verified"); + + // Convert CAGRA index to HNSW using fromCagra + // This automatically handles disk-based indices + HnswIndexParams hnswIndexParams = + new HnswIndexParams.Builder().withVectorDimension(2).build(); + + try (var hnswIndexSerialized = HnswIndex.fromCagra(hnswIndexParams, index)) { + var hnswIndexSerializedPath = buildDir.resolve("hnsw_index.bin"); + assertTrue("HNSW index should exist", Files.exists(hnswIndexSerializedPath)); + log.debug("HNSW index created from disk-based ACE CAGRA index"); + + // Load the serialized index from disk + try (var inputStreamHNSW = Files.newInputStream(hnswIndexSerializedPath)) { + var hnswIndex = + HnswIndex.newBuilder(resources) + .from(inputStreamHNSW) + .withIndexParams(hnswIndexParams) + .build(); + + HnswSearchParams hnswSearchParams = new HnswSearchParams.Builder().build(); + HnswQuery hnswQuery = + new HnswQuery.Builder(resources) + .withTopK(3) + .withSearchParams(hnswSearchParams) + .withQueryVectors(queries) + .build(); + + SearchResults results = hnswIndex.search(hnswQuery); + log.debug("HNSW search results: " + results.getResults().toString()); + + checkResults(expectedResults, results.getResults()); + log.debug("HNSW search verification passed"); + + hnswIndex.close(); + } + } + + // Clean up the build directory used by this test + deleteRecursively(buildDir); + } + } + } + + /** + * Helper method to recursively delete a directory and its contents.
+ */ + private void deleteRecursively(Path path) { + try { + if (Files.isDirectory(path)) { + Files.list(path).forEach(this::deleteRecursively); + } + Files.deleteIfExists(path); + } catch (Exception e) { + log.warn("Failed to delete {}: {}", path, e.getMessage()); + } + } +} diff --git a/python/cuvs/cuvs/neighbors/cagra/__init__.py b/python/cuvs/cuvs/neighbors/cagra/__init__.py index 7e59e62ed0..ec70305d72 100644 --- a/python/cuvs/cuvs/neighbors/cagra/__init__.py +++ b/python/cuvs/cuvs/neighbors/cagra/__init__.py @@ -3,6 +3,7 @@ from .cagra import ( + AceParams, CompressionParams, ExtendParams, Index, @@ -17,6 +18,7 @@ ) __all__ = [ + "AceParams", "CompressionParams", "ExtendParams", "Index", diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd index 9a8f167c27..44c1fa7aee 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd @@ -40,6 +40,7 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: IVF_PQ NN_DESCENT ITERATIVE_CAGRA_SEARCH + ACE ctypedef struct cuvsCagraCompressionParams: uint32_t pq_bits @@ -57,6 +58,13 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: float refinement_rate ctypedef cuvsIvfPqParams* cuvsIvfPqParams_t + ctypedef struct cuvsAceParams: + size_t npartitions + size_t ef_construction + const char* build_dir + bool use_disk + ctypedef cuvsAceParams* cuvsAceParams_t + ctypedef struct cuvsCagraIndexParams: cuvsDistanceType metric size_t intermediate_graph_degree @@ -64,7 +72,7 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: cuvsCagraGraphBuildAlgo build_algo size_t nn_descent_niter cuvsCagraCompressionParams_t compression - cuvsIvfPqParams_t graph_build_params + void* graph_build_params ctypedef cuvsCagraIndexParams* cuvsCagraIndexParams_t @@ -111,6 +119,10 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: cuvsError_t cuvsCagraCompressionParamsDestroy( cuvsCagraCompressionParams_t index) + cuvsError_t cuvsAceParamsCreate(cuvsAceParams_t* params) + + cuvsError_t cuvsAceParamsDestroy(cuvsAceParams_t params) + cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params) cuvsError_t cuvsCagraIndexParamsDestroy(cuvsCagraIndexParams_t index) @@ -193,6 +205,7 @@ cdef class IndexParams: cdef public object compression cdef public object ivf_pq_build_params cdef public object ivf_pq_search_params + cdef public object ace_params cdef class SearchParams: cdef cuvsCagraSearchParams * params diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index bb59aa55dc..c7ce834f5a 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -17,7 +17,9 @@ from libcpp cimport bool, cast from libcpp.string cimport string from cuvs.common cimport cydlpack + from cuvs.common.device_tensor_view import DeviceTensorView + from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray @@ -37,6 +39,8 @@ from libc.stdint cimport ( uint64_t, uintptr_t, ) +from libc.stdlib cimport free, malloc +from libc.string cimport strdup from cuvs.common.exceptions import check_cuvs from cuvs.neighbors import ivf_pq @@ -119,6 +123,94 @@ cdef class CompressionParams: def get_handle(self): return self.params + +cdef class AceParams: + """ + Parameters for ACE (Augmented Core Extraction) graph building algorithm. 
+ + ACE enables building indices for datasets too large to fit in GPU memory by + partitioning the dataset using balanced k-means and building sub-indices + for each partition independently. + + Parameters + ---------- + npartitions : int, default = 1 + Number of partitions for ACE partitioned build. Small values might + improve recall but potentially degrade performance and increase memory + usage. Partitions should not be too small to prevent issues in KNN + graph construction. 100k - 5M vectors per partition is recommended + depending on the available host and GPU memory. The partition size is + on average 2 * (n_rows / npartitions) * dim * sizeof(T). 2 is because + of the core and augmented vectors. Please account for imbalance in the + partition sizes (up to 3x in our tests). + ef_construction : int, default = 120 + The index quality for the ACE build. Bigger values increase the index + quality. At some point, increasing this will no longer improve the + quality. + build_dir : str, default = "/tmp/ace_build" + Directory to store ACE build artifacts (e.g., KNN graph, optimized + graph). Used when `use_disk` is true or when the graph does not fit + in host and GPU memory. This should be the fastest disk in the system + and hold enough space for twice the dataset, final graph, and label + mapping. + use_disk : bool, default = False + Whether to use disk-based storage for ACE build. When true, enables + disk-based operations for memory-efficient graph construction. + """ + cdef cuvsAceParams* params + cdef bytes _build_dir_bytes # Keep Python bytes alive for property access + + def __cinit__(self): + check_cuvs(cuvsAceParamsCreate(&self.params)) + self._build_dir_bytes = b"" + + def __dealloc__(self): + if self.params != NULL: + check_cuvs(cuvsAceParamsDestroy(self.params)) + + def __init__(self, *, + npartitions=1, + ef_construction=120, + build_dir="/tmp/ace_build", + use_disk=False): + self.params.npartitions = npartitions + self.params.ef_construction = ef_construction + self.params.use_disk = use_disk + + # Need to replace the default build_dir allocated by + # cuvsAceParamsCreate + # First free the old C string, then allocate new one + if self.params.build_dir != NULL: + free(self.params.build_dir) + + # Store Python bytes for property access + self._build_dir_bytes = build_dir.encode('utf-8') + # Allocate C memory and copy the string (strdup-like behavior) + self.params.build_dir = strdup(self._build_dir_bytes) + + @property + def npartitions(self): + return self.params.npartitions + + @property + def ef_construction(self): + return self.params.ef_construction + + @property + def build_dir(self): + if self._build_dir_bytes: + return self._build_dir_bytes.decode('utf-8') + else: + return "" + + @property + def use_disk(self): + return self.params.use_disk + + def get_handle(self): + return self.params + + cdef class IndexParams: """ Parameters to build index for CAGRA nearest neighbor search @@ -141,7 +233,7 @@ cdef class IndexParams: graph_degree : int, default = 64 build_algo: str, default = "ivf_pq" string denoting the graph building algorithm to use. Valid values for - algo: ["ivf_pq", "nn_descent", "iterative_cagra_search"], where + algo: ["ivf_pq", "nn_descent", "iterative_cagra_search", "ace"], where - ivf_pq will use the IVF-PQ algorithm for building the knn graph - nn_descent (experimental) will use the NN-Descent algorithm for @@ -149,6 +241,8 @@ cdef class IndexParams: faster than ivf_pq. 
- iterative_cagra_search will iteratively build the knn graph using CAGRA's search() and optimize() + - ace will use ACE (Augmented Core Extraction) for building indices + for datasets too large to fit in GPU memory compression: CompressionParams, optional If compression is desired should be a CompressionParams object. If None @@ -159,6 +253,9 @@ cdef class IndexParams: ivf_pq_search_params: cuvs.neighbors.ivf_pq.SearchParams, optional Parameters for IVF-PQ search. If provided, it will be used for searching the graph. + ace_params: AceParams, optional + Parameters for ACE algorithm. If provided, it will be used for + building the graph with ACE partitioning. refinement_rate: float, default = 1.0 """ @@ -168,6 +265,7 @@ cdef class IndexParams: self.compression = None self.ivf_pq_build_params = None self.ivf_pq_search_params = None + self.ace_params = None def __dealloc__(self): if self.params != NULL: @@ -182,7 +280,11 @@ cdef class IndexParams: compression=None, ivf_pq_build_params: ivf_pq.IndexParams = None, ivf_pq_search_params: ivf_pq.SearchParams = None, + ace_params: AceParams = None, refinement_rate: float = 1.0): + # Declare cdef variables at the top of the function + cdef cuvsIvfPqParams_t ivf_pq_params_ptr + cdef cuvsAceParams_t new_ace_params self.params.metric = DISTANCE_TYPES[metric] self.params.intermediate_graph_degree = intermediate_graph_degree @@ -194,6 +296,8 @@ cdef class IndexParams: elif build_algo == "iterative_cagra_search": self.params.build_algo = \ cuvsCagraGraphBuildAlgo.ITERATIVE_CAGRA_SEARCH + elif build_algo == "ace": + self.params.build_algo = cuvsCagraGraphBuildAlgo.ACE else: raise ValueError(f"Unknown build_algo '{build_algo}'") @@ -202,19 +306,56 @@ cdef class IndexParams: self.compression = compression self.params.compression = \ compression.get_handle() - if ivf_pq_build_params is not None: - if ivf_pq_build_params.metric != self.metric: - raise ValueError("Metric mismatch with IVF-PQ build params") - self.ivf_pq_build_params = ivf_pq_build_params - self.params.graph_build_params.ivf_pq_build_params = \ - \ - ivf_pq_build_params.get_handle() - if ivf_pq_search_params is not None: - self.ivf_pq_search_params = ivf_pq_search_params - self.params.graph_build_params.ivf_pq_search_params = \ - \ - ivf_pq_search_params.get_handle() - self.params.graph_build_params.refinement_rate = refinement_rate + + # Handle graph build params based on build algorithm + if build_algo == "ace": + if ace_params is None: + ace_params = AceParams() + self.ace_params = ace_params + + # Create a new C-allocated cuvsAceParams that the C API will own + # We cannot pass the Python object's pointer directly because + # cuvsCagraIndexParamsDestroy will try to delete it + check_cuvs(cuvsAceParamsCreate(&new_ace_params)) + + # Copy values from Python object to new C struct + new_ace_params.npartitions = ace_params.params.npartitions + new_ace_params.ef_construction = ace_params.params.ef_construction + new_ace_params.use_disk = ace_params.params.use_disk + + # Copy the build_dir string + if new_ace_params.build_dir != NULL: + free(new_ace_params.build_dir) + new_ace_params.build_dir = strdup(ace_params.params.build_dir) + + # Pass the new C struct to the index params + self.params.graph_build_params = new_ace_params + else: + # For IVF-PQ algorithm, handle ivf_pq params + # Cast the void* back to cuvsIvfPqParams_t + ivf_pq_params_ptr = ( + <cuvsIvfPqParams_t>self.params.graph_build_params + ) + + if ivf_pq_build_params is not None: + if ivf_pq_build_params.metric != self.metric: + raise ValueError(
"Metric mismatch with IVF-PQ build params" + ) + self.ivf_pq_build_params = ivf_pq_build_params + ivf_pq_params_ptr.ivf_pq_build_params = ( + + ivf_pq_build_params.get_handle() + ) + + if ivf_pq_search_params is not None: + self.ivf_pq_search_params = ivf_pq_search_params + ivf_pq_params_ptr.ivf_pq_search_params = ( + + ivf_pq_search_params.get_handle() + ) + + ivf_pq_params_ptr.refinement_rate = refinement_rate def get_handle(self): return self.params @@ -241,7 +382,15 @@ cdef class IndexParams: @property def refinement_rate(self): - return self.params.graph_build_params.refinement_rate + # refinement_rate only applies to IVF-PQ builds + if self.params.build_algo == cuvsCagraGraphBuildAlgo.IVF_PQ: + return ( + (self.params.graph_build_params) + .refinement_rate + ) + else: + # For ACE and other algorithms, refinement_rate doesn't apply + return 1.0 cdef class Index: @@ -327,6 +476,10 @@ def build(IndexParams index_params, dataset, resources=None): It is required that both the dataset and the optimized graph fit the GPU memory. + Note: When using ACE (Augmented Core Extraction) build algorithm, the + dataset must be in host memory (CPU). The ACE algorithm is designed for + datasets too large to fit in GPU memory. + The following distance metrics are supported: - L2 - InnerProduct @@ -337,6 +490,8 @@ def build(IndexParams index_params, dataset, resources=None): index_params : IndexParams object dataset : CUDA array interface compliant matrix shape (n_samples, dim) Supported dtype [float, half, int8, uint8] + **Note:** For ACE build algorithm, the dataset MUST be in host memory. + Use NumPy arrays or call .get() on CuPy arrays before passing. {resources_docstring} Returns @@ -361,8 +516,28 @@ def build(IndexParams index_params, dataset, resources=None): ... k) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) + + >>> # ACE example with host data + >>> import numpy as np + >>> dataset_host = np.random.random_sample( + ... (n_samples, n_features) + ... ).astype(np.float32) + >>> ace_params = cagra.AceParams( + ... npartitions=4, use_disk=True, build_dir="/tmp/ace" + ... ) + >>> build_params = cagra.IndexParams( + ... metric="sqeuclidean", + ... build_algo="ace", + ... ace_params=ace_params + ... ) + >>> idx = cagra.build(build_params, dataset_host) """ + # Check if ACE build is requested + is_ace_build = ( + index_params.params.build_algo == cuvsCagraGraphBuildAlgo.ACE + ) + # todo(dgd): we can make the check of dtype a parameter of wrap_array # in RAFT to make this a single call dataset_ai = wrap_array(dataset) @@ -371,6 +546,16 @@ def build(IndexParams index_params, dataset, resources=None): np.dtype('byte'), np.dtype('ubyte')]) + # For ACE, verify dataset is on host + if is_ace_build: + # Check if data is on device (has __cuda_array_interface__) + if hasattr(dataset, '__cuda_array_interface__'): + raise ValueError( + "ACE build requires dataset to be in host memory. " + "Please use NumPy arrays or transfer CuPy arrays to host with " + "dataset.get() before calling build()." + ) + cdef Index idx = Index() cdef cydlpack.DLManagedTensor* dataset_dlpack = \ cydlpack.dlpack_c(dataset_ai) diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd index 2db0902e18..399fc06d47 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -1,10 +1,11 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 from libc.stdint cimport int32_t, uintptr_t +from libcpp cimport bool from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor diff --git a/python/cuvs/cuvs/tests/test_cagra_ace.py b/python/cuvs/cuvs/tests/test_cagra_ace.py new file mode 100644 index 0000000000..d2d9ccf6a4 --- /dev/null +++ b/python/cuvs/cuvs/tests/test_cagra_ace.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +import os +import tempfile + +import cupy as cp +import numpy as np +import pytest +from pylibraft.common import device_ndarray +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from cuvs.neighbors import cagra, hnsw +from cuvs.tests.ann_utils import calc_recall, generate_data + + +def run_cagra_ace_build_search_test( + n_rows=5000, + n_cols=64, + n_queries=10, + k=10, + dtype=np.float32, + metric="sqeuclidean", + intermediate_graph_degree=128, + graph_degree=64, + npartitions=2, + ef_construction=100, + use_disk=False, + hierarchy="none", +): + dataset = generate_data((n_rows, n_cols), dtype) + queries = generate_data((n_queries, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + queries = normalize(queries, norm="l2", axis=1) + if dtype in [np.int8, np.uint8]: + # Quantize the normalized data to the int8/uint8 range + dtype_max = np.iinfo(dtype).max + dataset = (dataset * dtype_max).astype(dtype) + queries = (queries * dtype_max).astype(dtype) + + # Create a temporary directory for ACE build + with tempfile.TemporaryDirectory() as temp_dir: + # Set up ACE parameters + ace_params = cagra.AceParams( + npartitions=npartitions, + ef_construction=ef_construction, + build_dir=temp_dir, + use_disk=use_disk, + ) + + # Build parameters + build_params = cagra.IndexParams( + metric=metric, + intermediate_graph_degree=intermediate_graph_degree, + graph_degree=graph_degree, + build_algo="ace", + ace_params=ace_params, + ) + + # Build the index with ACE (uses host memory) + index = cagra.build(build_params, dataset) + + assert index.trained + + # For disk-based mode, we can't search directly + # (would need HNSW conversion which is tested separately) + if not use_disk: + # For in-memory mode, we can search directly + # But queries need to be on device + search_params = cagra.SearchParams(itopk_size=64) + + # Transfer queries to device for search + queries_device = device_ndarray(queries) + + out_dist, out_idx = cagra.search( + search_params, index, queries_device, k + ) + + # Convert results back to host + out_idx_host = out_idx.copy_to_host() + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + recall = calc_recall(out_idx_host, skl_idx) + assert recall > 0.7 + + # test that we can get the cagra graph from the index + graph = index.graph + assert graph.shape == (n_rows, graph_degree) + + # make sure we can convert the graph to cupy, and access it + cp_graph = cp.array(graph) + assert cp_graph.shape == (n_rows, graph_degree) + else: + # For disk-based mode, verify that expected files were created + assert os.path.exists(os.path.join(temp_dir, 
"cagra_graph.npy")) + assert os.path.exists( + os.path.join(temp_dir, "reordered_dataset.npy") + ) + assert os.path.exists( + os.path.join(temp_dir, "dataset_mapping.npy") + ) + + # Test HNSW conversion from disk-based ACE index + hnsw_params = hnsw.IndexParams(hierarchy=hierarchy) + hnsw_index_serialized = hnsw.from_cagra(hnsw_params, index) + assert hnsw_index_serialized is not None + assert os.path.exists(os.path.join(temp_dir, "hnsw_index.bin")) + + # Deserialize the HNSW index from disk for search + hnsw_index = hnsw.load( + hnsw_params, + os.path.join(temp_dir, "hnsw_index.bin"), + n_cols, + dtype, + ) + + search_params = hnsw.SearchParams(ef=200, num_threads=1) + out_dist, out_idx = hnsw.search( + search_params, hnsw_index, queries, k + ) + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_dist, skl_idx = nn_skl.kneighbors( + queries, return_distance=True + ) + + recall = calc_recall(out_idx, skl_idx) + assert recall > 0.7 + + +@pytest.mark.parametrize("dim", [64, 128]) +@pytest.mark.parametrize("dtype", [np.float32, np.float16, np.int8, np.uint8]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) +@pytest.mark.parametrize("npartitions", [2, 4]) +@pytest.mark.parametrize("ef_construction", [100, 200]) +@pytest.mark.parametrize("use_disk", [False, True]) +@pytest.mark.parametrize("hierarchy", ["none", "gpu"]) +def test_cagra_ace_dtypes_and_metrics( + dim, dtype, metric, npartitions, ef_construction, use_disk, hierarchy +): + """Test ACE with different data types and metrics.""" + run_cagra_ace_build_search_test( + n_cols=dim, + dtype=dtype, + metric=metric, + npartitions=npartitions, + ef_construction=ef_construction, + use_disk=use_disk, + hierarchy=hierarchy, + )