From edd7b0308fbec20de7bb1c0dc2244c0b80cd0324 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 10 Dec 2025 00:57:20 +0000 Subject: [PATCH 01/12] fix deterministic mode outliers --- cpp/src/umap/simpl_set_embed/algo.cuh | 63 +---- .../simpl_set_embed/optimize_batch_kernel.cuh | 246 ++++++++++++------ 2 files changed, 179 insertions(+), 130 deletions(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index e3c3dab978..bcb61dfd5b 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -91,48 +91,6 @@ void optimization_iteration_finalization( seed += 1; } -/** - * Update the embeddings and clear the buffers when using deterministic algorithm. - */ -template -void apply_embedding_updates(T* head_embedding, - T* head_buffer, - int head_n, - T* tail_embedding, - T* tail_buffer, - int tail_n, - UMAPParams* params, - bool move_other, - rmm::cuda_stream_view stream) -{ - ASSERT(params->deterministic, "Only used when deterministic is set to true."); - nnz_t n_components = params->n_components; - if (move_other) { - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0u), - thrust::make_counting_iterator(0u) + std::max(head_n, tail_n) * n_components, - [=] __device__(uint32_t i) { - if (i < head_n * n_components) { - head_embedding[i] += head_buffer[i]; - head_buffer[i] = 0.0f; - } - if (i < tail_n * n_components) { - tail_embedding[i] += tail_buffer[i]; - tail_buffer[i] = 0.0f; - } - }); - } else { - // No need to update reference embedding - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0u), - thrust::make_counting_iterator(0u) + head_n * n_components, - [=] __device__(uint32_t i) { - head_embedding[i] += head_buffer[i]; - head_buffer[i] = 0.0f; - }); - } -} - /** * \brief Constructs a rounding factor used to truncate elements in a sum such that the * sum of the truncated elements is the same no matter what the order of the sum is. 
@@ -292,6 +250,15 @@ void optimize_layout(T* head_embedding, } } + // TODO: find heuristics for this + if (params->deterministic) { + if (min_n <= 100000) { + num_chunks = 4; + } else { + num_chunks = 8; + } + } + rmm::device_uvector epoch_of_next_negative_sample(nnz, stream); T nsr_inv = T(1.0) / params->negative_sample_rate; raft::linalg::unaryOp( @@ -335,6 +302,7 @@ void optimize_layout(T* head_embedding, for (int n = 0; n < n_epochs; n++) { call_optimize_batch_kernel(head_embedding, d_head_buffer, + head_n, tail_embedding, d_tail_buffer, tail_n, @@ -354,17 +322,6 @@ void optimize_layout(T* head_embedding, blk, stream, rounding); - if (params->deterministic) { - apply_embedding_updates(head_embedding, - d_head_buffer, - head_n, - tail_embedding, - d_tail_buffer, - tail_n, - params, - move_other, - stream_view); - } RAFT_CUDA_TRY(cudaGetLastError()); optimization_iteration_finalization(params, head_embedding, alpha, n, n_epochs, seed); } diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index 98234388e5..6731f2f845 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -13,6 +13,8 @@ #include #include +#include + #include #include @@ -23,6 +25,48 @@ namespace Algo { using namespace ML; +/** + * Update the embeddings and clear the buffers when using deterministic algorithm. 
+ */ +template +void apply_embedding_updates(T* head_embedding, + T* head_buffer, + int head_n, + T* tail_embedding, + T* tail_buffer, + int tail_n, + UMAPParams const* params, + bool move_other, + rmm::cuda_stream_view stream) +{ + ASSERT(params->deterministic, "Only used when deterministic is set to true."); + nnz_t n_components = params->n_components; + if (move_other) { + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0u), + thrust::make_counting_iterator(0u) + std::max(head_n, tail_n) * n_components, + [=] __device__(uint32_t i) { + if (i < head_n * n_components) { + head_embedding[i] += head_buffer[i]; + head_buffer[i] = 0.0f; + } + if (i < tail_n * n_components) { + tail_embedding[i] += tail_buffer[i]; + tail_buffer[i] = 0.0f; + } + }); + } else { + // No need to update reference embedding + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0u), + thrust::make_counting_iterator(0u) + head_n * n_components, + [=] __device__(uint32_t i) { + head_embedding[i] += head_buffer[i]; + head_buffer[i] = 0.0f; + }); + } +} + /** * Calculate the squared distance between two vectors of size n * @{ @@ -106,17 +150,25 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, bool move_other, UMAPParams params, T nsr_inv, - T rounding) + T rounding, + size_t offset = 0) { - nnz_t row = (blockIdx.x * static_cast(TPB_X)) + threadIdx.x; + nnz_t row = (blockIdx.x * static_cast(TPB_X)) + threadIdx.x + offset; nnz_t skip_size = blockDim.x * gridDim.x; T current_reg[n_components], other_reg[n_components], grads[n_components]; + + // Deterministic mode: process exactly one row per thread (no grid-stride loop) + // Non-deterministic mode: use grid-stride loop to process multiple rows while (row < nnz) { auto _epoch_of_next_sample = epoch_of_next_sample[row]; if (_epoch_of_next_sample > epoch) { - row += skip_size; - continue; + if (params.deterministic) { + return; + } else { + row += skip_size; + continue; + } } 
auto _epochs_per_sample = epochs_per_sample[row]; auto epochs_per_negative_sample = _epochs_per_sample * nsr_inv; @@ -205,7 +257,12 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, } epoch_of_next_negative_sample[row] = _epoch_of_next_negative_sample + n_neg_samples * epochs_per_negative_sample; - row += skip_size; + + if (params.deterministic) { + return; // Only process one row in deterministic mode + } else { + row += skip_size; + } } } @@ -228,17 +285,22 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, bool move_other, UMAPParams params, T nsr_inv, - T rounding) + T rounding, + size_t offset = 0) { extern __shared__ T embedding_shared_mem_updates[]; - nnz_t row = (blockIdx.x * static_cast(TPB_X)) + threadIdx.x; + nnz_t row = (blockIdx.x * static_cast(TPB_X)) + threadIdx.x + offset; nnz_t skip_size = blockDim.x * gridDim.x; while (row < nnz) { auto _epoch_of_next_sample = epoch_of_next_sample[row]; if (_epoch_of_next_sample > epoch) { - row += skip_size; - continue; + if (params.deterministic) { + return; + } else { + row += skip_size; + continue; + } } auto _epochs_per_sample = epochs_per_sample[row]; auto epochs_per_negative_sample = _epochs_per_sample * nsr_inv; @@ -361,7 +423,11 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, } epoch_of_next_negative_sample[row] = _epoch_of_next_negative_sample + n_neg_samples * epochs_per_negative_sample; - row += skip_size; + if (params.deterministic) { + return; // Only process one row in deterministic mode + } else { + row += skip_size; + } } } @@ -378,9 +444,10 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, * deterministic result. 
*/ template -void call_optimize_batch_kernel(T const* head_embedding, +void call_optimize_batch_kernel(T* head_embedding, T* head_buffer, - T const* tail_embedding, + int head_n, + T* tail_embedding, T* tail_buffer, int tail_n, const int* head, @@ -404,72 +471,97 @@ void call_optimize_batch_kernel(T const* head_embedding, requiredSize *= sizeof(T); bool use_shared_mem = requiredSize < static_cast(raft::getSharedMemPerBlock()); T nsr_inv = T(1.0) / params->negative_sample_rate; - if (params->n_components == 2) { - // multicore implementation with registers - optimize_batch_kernel_reg - <<>>(head_embedding, - head_buffer, - tail_embedding, - tail_buffer, - tail_n, - head, - tail, - nnz, - epochs_per_sample, - epoch_of_next_negative_sample, - epoch_of_next_sample, - alpha, - n, - gamma, - seed, - move_other, - *params, - nsr_inv, - rounding); - } else if (use_shared_mem) { - // multicore implementation with shared memory - optimize_batch_kernel - <<>>(head_embedding, - head_buffer, - tail_embedding, - tail_buffer, - tail_n, - head, - tail, - nnz, - epochs_per_sample, - epoch_of_next_negative_sample, - epoch_of_next_sample, - alpha, - n, - gamma, - seed, - move_other, - *params, - nsr_inv, - rounding); + + auto stream_view = rmm::cuda_stream_view(stream); + + auto launch_kernel = [&](size_t offset = 0) { + if (params->n_components == 2) { + // multicore implementation with registers + optimize_batch_kernel_reg + <<>>(head_embedding, + head_buffer, + tail_embedding, + tail_buffer, + tail_n, + head, + tail, + nnz, + epochs_per_sample, + epoch_of_next_negative_sample, + epoch_of_next_sample, + alpha, + n, + gamma, + seed, + move_other, + *params, + nsr_inv, + rounding, + offset); + } else if (use_shared_mem) { + // multicore implementation with shared memory + optimize_batch_kernel + <<>>(head_embedding, + head_buffer, + tail_embedding, + tail_buffer, + tail_n, + head, + tail, + nnz, + epochs_per_sample, + epoch_of_next_negative_sample, + epoch_of_next_sample, + alpha, + 
n, + gamma, + seed, + move_other, + *params, + nsr_inv, + rounding, + offset); + } else { + // multicore implementation without shared memory + optimize_batch_kernel + <<>>(head_embedding, + head_buffer, + tail_embedding, + tail_buffer, + tail_n, + head, + tail, + nnz, + epochs_per_sample, + epoch_of_next_negative_sample, + epoch_of_next_sample, + alpha, + n, + gamma, + seed, + move_other, + *params, + nsr_inv, + rounding, + offset); + } + }; + + if (params->deterministic) { + for (size_t offset = 0; offset < nnz; offset += TPB_X * grid.x) { + launch_kernel(offset); + apply_embedding_updates(head_embedding, + head_buffer, + head_n, + tail_embedding, + tail_buffer, + tail_n, + params, + move_other, + stream_view); + } } else { - // multicore implementation without shared memory - optimize_batch_kernel - <<>>(head_embedding, - head_buffer, - tail_embedding, - tail_buffer, - tail_n, - head, - tail, - nnz, - epochs_per_sample, - epoch_of_next_negative_sample, - epoch_of_next_sample, - alpha, - n, - gamma, - seed, - move_other, - *params, - nsr_inv, - rounding); + launch_kernel(0); } } } // namespace Algo From 36b1355847ca241048431d2a3dc81a8db28e7742 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 11 Dec 2025 01:17:36 +0000 Subject: [PATCH 02/12] shuffle --- cpp/src/umap/simpl_set_embed/algo.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index bcb61dfd5b..480ac3adc5 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -226,13 +226,15 @@ void optimize_layout(T* head_embedding, has_outlier = check_outliers(tail, tail_n, nnz, threshold_for_outlier, stream); } - if (has_outlier) { + if (has_outlier || params->deterministic) { // Shuffling is necessary when outliers may be present (i.e., dense points that undergo many // updates). 
It is critical to avoid having too many threads update the same embedding vector // simultaneously, as this can affect correctness. By shuffling, potential outlier points are // distributed across threads, rather than being processed by consecutive threads that are // scheduled together. This approach relies on the GPU's inability to physically schedule all // nnz edges at once. + // also shuffle when want deterministic behavior to ensure that updates for the same vertex are + // processed in different kernel launches. auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(head), thrust::device_pointer_cast(tail), From 9b210d862f0eb4e67175a97a0417db2519c0c380 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Mon, 15 Dec 2025 21:15:00 +0000 Subject: [PATCH 03/12] bit flags for sparse apply --- cpp/src/umap/simpl_set_embed/algo.cuh | 28 +++- .../simpl_set_embed/optimize_batch_kernel.cuh | 154 +++++++++++++----- 2 files changed, 135 insertions(+), 47 deletions(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index 480ac3adc5..725ef16b5f 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -257,7 +257,7 @@ void optimize_layout(T* head_embedding, if (min_n <= 100000) { num_chunks = 4; } else { - num_chunks = 8; + num_chunks = 210; } } @@ -276,21 +276,39 @@ void optimize_layout(T* head_embedding, // Buffers used to store the gradient updates to avoid conflicts rmm::device_uvector head_buffer(0, stream_view); rmm::device_uvector tail_buffer(0, stream_view); + + // Flags to track which vertices were modified per chunk (for sparse apply) + rmm::device_uvector head_flags(0, stream_view); + rmm::device_uvector tail_flags(0, stream_view); // Write to embedding directly if deterministic is not needed. 
- T* d_head_buffer = head_embedding; - T* d_tail_buffer = tail_embedding; + T* d_head_buffer = head_embedding; + T* d_tail_buffer = tail_embedding; + uint32_t* d_head_flags = nullptr; + uint32_t* d_tail_flags = nullptr; if (params->deterministic) { nnz_t n_components = params->n_components; head_buffer.resize(head_n * n_components, stream_view); RAFT_CUDA_TRY( cudaMemsetAsync(head_buffer.data(), '\0', sizeof(T) * head_buffer.size(), stream)); + + // Bit flag: 1 bit per vertex, so (n_vertices + 31) / 32 uint32_t words + int head_flag_words = (head_n + 31) >> 5; + head_flags.resize(head_flag_words, stream_view); + RAFT_CUDA_TRY( + cudaMemsetAsync(head_flags.data(), '\0', sizeof(uint32_t) * head_flag_words, stream)); + d_head_buffer = head_buffer.data(); + d_head_flags = head_flags.data(); // No need for tail if it's not being written. if (move_other) { tail_buffer.resize(tail_n * n_components, stream_view); RAFT_CUDA_TRY( cudaMemsetAsync(tail_buffer.data(), '\0', sizeof(T) * tail_buffer.size(), stream)); + int tail_flag_words = (tail_n + 31) >> 5; + tail_flags.resize(tail_flag_words, stream_view); + RAFT_CUDA_TRY( + cudaMemsetAsync(tail_flags.data(), '\0', sizeof(uint32_t) * tail_flag_words, stream)); + d_tail_flags = tail_flags.data(); } - d_head_buffer = head_buffer.data(); d_tail_buffer = tail_buffer.data(); } @@ -304,9 +322,11 @@ void optimize_layout(T* head_embedding, for (int n = 0; n < n_epochs; n++) { call_optimize_batch_kernel(head_embedding, d_head_buffer, + d_head_flags, head_n, tail_embedding, d_tail_buffer, + d_tail_flags, tail_n, head, tail, diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index 19007fd586..bb80b78049 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -26,44 +26,86 @@ namespace Algo { using namespace ML; /** - * Update the embeddings and clear the buffers when using deterministic 
algorithm. + * Set a bit in the flags to mark a vertex as modified. + * Uses atomic OR for thread safety. */ -template -void apply_embedding_updates(T* head_embedding, - T* head_buffer, - int head_n, - T* tail_embedding, - T* tail_buffer, - int tail_n, - UMAPParams const* params, - bool move_other, - rmm::cuda_stream_view stream) +__device__ __forceinline__ void set_vertex_modified(uint32_t* flags, int vertex_id) +{ + int word_idx = vertex_id >> 5; // vertex_id / 32 + int bit_idx = vertex_id & 31; // vertex_id % 32 + atomicOr(&flags[word_idx], 1u << bit_idx); +} + +/** + * Sparse apply kernel: only updates vertices that have their bit set in the flags. + */ +template +CUML_KERNEL void sparse_apply_kernel( + T* embedding, T* buffer, uint32_t* flags, int n_vertices, int n_components) +{ + int tid = blockIdx.x * TPB_X + threadIdx.x; + int total_threads = gridDim.x * TPB_X; + + int n_words = (n_vertices + 31) >> 5; + + // each thread strides through the flags in word units + for (int word_idx = tid; word_idx < n_words; word_idx += total_threads) { + uint32_t word = flags[word_idx]; + if (word == 0) continue; // Skip if no bits set + + while (word != 0) { + int bit_pos = __ffs(word) - 1; // Find first set bit (1-indexed, so -1) + int vertex_id = (word_idx << 5) + bit_pos; // word_idx * 32 (word size) + bit_pos + + if (vertex_id < n_vertices) { + // Apply update for this vertex + int base = vertex_id * n_components; + for (int d = 0; d < n_components; d++) { + embedding[base + d] += buffer[base + d]; + buffer[base + d] = T(0.0); + } + } + + // Clear the first set bit just processed + word &= ~(1u << bit_pos); + } + + flags[word_idx] = 0; + } +} + +/** + * Update the embeddings using sparse apply - only touches modified vertices. + * use flags to track which vertices received gradient updates. 
+ */ +template +void sparse_apply_embedding_updates(T* head_embedding, + T* head_buffer, + uint32_t* head_flags, + int head_n, + T* tail_embedding, + T* tail_buffer, + uint32_t* tail_flags, + int tail_n, + UMAPParams const* params, + bool move_other, + cudaStream_t stream) { ASSERT(params->deterministic, "Only used when deterministic is set to true."); - nnz_t n_components = params->n_components; + + // flags: one thread per 32-vertex word + int head_words = raft::ceildiv(head_n, 32); + dim3 grid_head(raft::ceildiv(head_words, TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + + sparse_apply_kernel<<>>( + head_embedding, head_buffer, head_flags, head_n, params->n_components); + if (move_other) { - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0u), - thrust::make_counting_iterator(0u) + std::max(head_n, tail_n) * n_components, - [=] __device__(uint32_t i) { - if (i < head_n * n_components) { - head_embedding[i] += head_buffer[i]; - head_buffer[i] = 0.0f; - } - if (i < tail_n * n_components) { - tail_embedding[i] += tail_buffer[i]; - tail_buffer[i] = 0.0f; - } - }); - } else { - // No need to update reference embedding - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0u), - thrust::make_counting_iterator(0u) + head_n * n_components, - [=] __device__(uint32_t i) { - head_embedding[i] += head_buffer[i]; - head_buffer[i] = 0.0f; - }); + int tail_words = raft::ceildiv(tail_n, 32); + dim3 grid_tail(raft::ceildiv(tail_words, TPB_X), 1, 1); + sparse_apply_kernel<<>>( + tail_embedding, tail_buffer, tail_flags, tail_n, params->n_components); } } @@ -134,8 +176,10 @@ DI T truncate_gradient(T const rounding_factor, T const x) template CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, T* head_buffer, + uint32_t* head_flags, T const* tail_embedding, T* tail_buffer, + uint32_t* tail_flags, MLCommon::FastIntDiv tail_n, const int* head, const int* tail, @@ -211,6 +255,7 @@ CUML_KERNEL void 
optimize_batch_kernel_reg(T const* head_embedding, for (int d = 0; d < n_components; d++) { raft::myAtomicAdd(oth_write + d, truncate_gradient(rounding, -grads[d])); } + if (tail_flags != nullptr) { set_vertex_modified(tail_flags, k); } } epoch_of_next_sample[row] = _epoch_of_next_sample + _epochs_per_sample; // number of negative samples to choose @@ -256,6 +301,8 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, for (int d = 0; d < n_components; d++) { raft::myAtomicAdd(cur_write + d, truncate_gradient(rounding, grads[d])); } + if (head_flags != nullptr) { set_vertex_modified(head_flags, j); } + epoch_of_next_negative_sample[row] = _epoch_of_next_negative_sample + n_neg_samples * epochs_per_negative_sample; @@ -270,8 +317,10 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding, template CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, T* head_buffer, + uint32_t* head_flags, T const* tail_embedding, T* tail_buffer, + uint32_t* tail_flags, MLCommon::FastIntDiv tail_n, const int* head, const int* tail, @@ -358,12 +407,18 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, } } } + + if constexpr (!use_shared_mem) { + if (head_flags != nullptr) { set_vertex_modified(head_flags, j); } + if (move_other && tail_flags != nullptr) { set_vertex_modified(tail_flags, k); } + } // storing gradients for negative samples back to global memory if (use_shared_mem && move_other) { for (int d = 0; d < n_components; d++) { auto grad = grads_buffer[d * TPB_X]; raft::myAtomicAdd((T*)oth_write + d, truncate_gradient(rounding, -grad)); } + if (tail_flags != nullptr) { set_vertex_modified(tail_flags, k); } } epoch_of_next_sample[row] = _epoch_of_next_sample + _epochs_per_sample; // number of negative samples to choose @@ -422,6 +477,7 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, raft::myAtomicAdd((T*)cur_write + d, truncate_gradient(rounding, grads_buffer[d * TPB_X])); } + if (head_flags != nullptr) 
{ set_vertex_modified(head_flags, j); } } epoch_of_next_negative_sample[row] = _epoch_of_next_negative_sample + n_neg_samples * epochs_per_negative_sample; @@ -438,6 +494,8 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, * result is required. They are the same pointer if random seed is not * provided. * @param tail_buffer: Similar to head_buffer, but for tail_embedding. + * @param head_flags: flags tracking which head vertices were modified (for sparse apply). + * @param tail_flags: flags tracking which tail vertices were modified (for sparse apply). * @param head: Row index in COO connectivity graph. * @param tail: Column index in COO connectivity graph. * @param alpha: Learning rate @@ -448,9 +506,11 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding, template void call_optimize_batch_kernel(T* head_embedding, T* head_buffer, + uint32_t* head_flags, int head_n, T* tail_embedding, T* tail_buffer, + uint32_t* tail_flags, int tail_n, const int* head, const int* tail, @@ -482,8 +542,10 @@ void call_optimize_batch_kernel(T* head_embedding, optimize_batch_kernel_reg <<>>(head_embedding, head_buffer, + head_flags, tail_embedding, tail_buffer, + tail_flags, tail_n, head, tail, @@ -505,8 +567,10 @@ void call_optimize_batch_kernel(T* head_embedding, optimize_batch_kernel <<>>(head_embedding, head_buffer, + head_flags, tail_embedding, tail_buffer, + tail_flags, tail_n, head, tail, @@ -528,8 +592,10 @@ void call_optimize_batch_kernel(T* head_embedding, optimize_batch_kernel <<>>(head_embedding, head_buffer, + head_flags, tail_embedding, tail_buffer, + tail_flags, tail_n, head, tail, @@ -552,15 +618,17 @@ void call_optimize_batch_kernel(T* head_embedding, if (params->deterministic) { for (size_t offset = 0; offset < nnz; offset += TPB_X * grid.x) { launch_kernel(offset); - apply_embedding_updates(head_embedding, - head_buffer, - head_n, - tail_embedding, - tail_buffer, - tail_n, - params, - move_other, - stream_view); + 
sparse_apply_embedding_updates(head_embedding, + head_buffer, + head_flags, + head_n, + tail_embedding, + tail_buffer, + tail_flags, + tail_n, + params, + move_other, + stream); } } else { launch_kernel(0); From 4b3f1f9a852a35f8ba3132a21ae2e751d89efb1d Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 17 Dec 2025 19:39:57 +0000 Subject: [PATCH 04/12] add comment and heuristic --- cpp/src/umap/simpl_set_embed/algo.cuh | 9 ++------- cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh | 10 ++++++++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index 725ef16b5f..13d2da5e13 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -252,13 +252,8 @@ void optimize_layout(T* head_embedding, } } - // TODO: find heuristics for this - if (params->deterministic) { - if (min_n <= 100000) { - num_chunks = 4; - } else { - num_chunks = 210; - } + if (has_outlier && params->deterministic) { + if (min_n > 100000) { num_chunks = raft::ceildiv(nnz, static_cast(100000)); } } rmm::device_uvector epoch_of_next_negative_sample(nnz, stream); diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index bb80b78049..2119118a96 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -616,6 +616,12 @@ void call_optimize_batch_kernel(T* head_embedding, }; if (params->deterministic) { + // for deterministic mode, we enforce a sequential behavior using n_chunks. after 1 chunk + // (including TPB_X * grid.x threads each doing an update) is processed, we apply the gradients + // in the buffer to the resulting embedding. The next chunk is processed based on the updated + // embedding from the previous chunk. 
this ensures that the updates are applied in a sequential + // manner (which is crucial for avoiding outliers), and the resulting embedding is deterministic + // because we use buffers. for (size_t offset = 0; offset < nnz; offset += TPB_X * grid.x) { launch_kernel(offset); sparse_apply_embedding_updates(head_embedding, @@ -631,6 +637,10 @@ void call_optimize_batch_kernel(T* head_embedding, stream); } } else { + // for non-deterministic mode, we launch the kernel once with nnz/n_chunks threads. + // each thread strides through n_chunks updates and writes gradients immediately to the + // resulting embedding inside the kernel, increasing the chances of a sequential update which is + // crucial for avoiding outliers. launch_kernel(0); } } From c7ef2bec51c59886a47a2b5ae35776630499a264 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 17 Dec 2025 20:18:10 +0000 Subject: [PATCH 05/12] comment --- cpp/src/umap/simpl_set_embed/algo.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index 13d2da5e13..ea3168c99f 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -253,6 +253,8 @@ void optimize_layout(T* head_embedding, } if (has_outlier && params->deterministic) { + // for processing in deterministic mode on datasets that are likely to have outliers, we use the + // heuristic below to determine the number of chunks. 
if (min_n > 100000) { num_chunks = raft::ceildiv(nnz, static_cast(100000)); } } From d217093dd419a360c10e6e1965c35e5fbba980a9 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 17 Dec 2025 20:29:40 +0000 Subject: [PATCH 06/12] cleanup --- cpp/src/umap/simpl_set_embed/algo.cuh | 5 ++--- .../umap/simpl_set_embed/optimize_batch_kernel.cuh | 13 +++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index ea3168c99f..015528b690 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -288,8 +288,7 @@ void optimize_layout(T* head_embedding, RAFT_CUDA_TRY( cudaMemsetAsync(head_buffer.data(), '\0', sizeof(T) * head_buffer.size(), stream)); - // Bit flag: 1 bit per vertex, so (n_vertices + 31) / 32 uint32_t words - int head_flag_words = (head_n + 31) >> 5; + int head_flag_words = raft::ceildiv(head_n, 32); head_flags.resize(head_flag_words, stream_view); RAFT_CUDA_TRY( cudaMemsetAsync(head_flags.data(), '\0', sizeof(uint32_t) * head_flag_words, stream)); @@ -300,7 +299,7 @@ void optimize_layout(T* head_embedding, tail_buffer.resize(tail_n * n_components, stream_view); RAFT_CUDA_TRY( cudaMemsetAsync(tail_buffer.data(), '\0', sizeof(T) * tail_buffer.size(), stream)); - int tail_flag_words = (tail_n + 31) >> 5; + int tail_flag_words = raft::ceildiv(tail_n, 32); tail_flags.resize(tail_flag_words, stream_view); RAFT_CUDA_TRY( cudaMemsetAsync(tail_flags.data(), '\0', sizeof(uint32_t) * tail_flag_words, stream)); diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index 2119118a96..7233b619de 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -27,7 +27,6 @@ using namespace ML; /** * Set a bit in the flags to mark a vertex as modified. - * Uses atomic OR for thread safety. 
*/ __device__ __forceinline__ void set_vertex_modified(uint32_t* flags, int vertex_id) { @@ -51,10 +50,10 @@ CUML_KERNEL void sparse_apply_kernel( // each thread strides through the flags in word units for (int word_idx = tid; word_idx < n_words; word_idx += total_threads) { uint32_t word = flags[word_idx]; - if (word == 0) continue; // Skip if no bits set + if (word == 0) continue; // skip if no bits set while (word != 0) { - int bit_pos = __ffs(word) - 1; // Find first set bit (1-indexed, so -1) + int bit_pos = __ffs(word) - 1; // find first set bit (1-indexed, so -1) int vertex_id = (word_idx << 5) + bit_pos; // word_idx * 32 (word size) + bit_pos if (vertex_id < n_vertices) { @@ -70,13 +69,13 @@ CUML_KERNEL void sparse_apply_kernel( word &= ~(1u << bit_pos); } - flags[word_idx] = 0; + flags[word_idx] = 0; // clear flag for next chunk } } /** - * Update the embeddings using sparse apply - only touches modified vertices. - * use flags to track which vertices received gradient updates. + * Update the embeddings using sparse apply using bit flags to track which vertices received + * gradient updates. 
*/ template void sparse_apply_embedding_updates(T* head_embedding, @@ -91,8 +90,6 @@ void sparse_apply_embedding_updates(T* head_embedding, bool move_other, cudaStream_t stream) { - ASSERT(params->deterministic, "Only used when deterministic is set to true."); - // flags: one thread per 32-vertex word int head_words = raft::ceildiv(head_n, 32); dim3 grid_head(raft::ceildiv(head_words, TPB_X), 1, 1); From f1329aba8c79963ee1e472df84701ccbe382c62c Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 17 Dec 2025 23:27:24 +0000 Subject: [PATCH 07/12] fix threshold --- cpp/src/umap/simpl_set_embed/algo.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh index 015528b690..6daedd738a 100644 --- a/cpp/src/umap/simpl_set_embed/algo.cuh +++ b/cpp/src/umap/simpl_set_embed/algo.cuh @@ -255,7 +255,11 @@ void optimize_layout(T* head_embedding, if (has_outlier && params->deterministic) { // for processing in deterministic mode on datasets that are likely to have outliers, we use the // heuristic below to determine the number of chunks. 
- if (min_n > 100000) { num_chunks = raft::ceildiv(nnz, static_cast(100000)); } + if (nnz > 100000) { + num_chunks = raft::ceildiv(nnz, static_cast(100000)); + } else if (nnz > 10000) { + num_chunks = raft::ceildiv(nnz, static_cast(10000)); + } } rmm::device_uvector epoch_of_next_negative_sample(nnz, stream); From 779aba40e3dc91fc01efa9d83dbe35da217d10a7 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 18 Dec 2025 00:23:49 +0000 Subject: [PATCH 08/12] Add previously failing test --- python/cuml/tests/test_umap.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/python/cuml/tests/test_umap.py b/python/cuml/tests/test_umap.py index cb4d9ca840..48e202d3a7 100644 --- a/python/cuml/tests/test_umap.py +++ b/python/cuml/tests/test_umap.py @@ -921,8 +921,16 @@ def test_umap_small_fit_large_transform(): @pytest.mark.parametrize("n_neighbors", [5, 15]) @pytest.mark.parametrize("n_components", [2, 5]) -def test_umap_outliers(n_neighbors, n_components): - n_rows = 50_000 +@pytest.mark.parametrize("random_state", [None, 42]) +def test_umap_outliers(n_neighbors, n_components, random_state): + if random_state is None: + n_rows = 50_000 + build_algo = "nn_descent" + init = "spectral" + else: + n_rows = 100_000 + build_algo = "brute_force_knn" + init = "random" # This dataset was specifically chosen because UMAP produces outliers # on this dataset before the outlier fix. @@ -930,10 +938,11 @@ def test_umap_outliers(n_neighbors, n_components): data = data.astype(np.float32) gpu_umap = cuUMAP( - build_algo="nn_descent", - init="spectral", + build_algo=build_algo, + init=init, n_neighbors=n_neighbors, n_components=n_components, + random_state=random_state, ) gpu_umap_embeddings = gpu_umap.fit_transform(data) @@ -941,7 +950,10 @@ def test_umap_outliers(n_neighbors, n_components): # and comparing the max and min values of the resulting embedding. 
However, running CPU UMAP # with this data size using spectral initialization is too slow to run repetitively in CI. # Instead, we hardwire a locally determined threshold for this dataset. - threshold = 50.0 + if random_state is None: + threshold = 50.0 + else: + threshold = 25.0 assert np.all( (gpu_umap_embeddings >= -threshold) From 50b6ddcd7427192855d9283fbb703457809f9e36 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 9 Jan 2026 20:47:05 +0000 Subject: [PATCH 09/12] static cast --- cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh index 7233b619de..34213f2c21 100644 --- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh +++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -619,7 +619,8 @@ void call_optimize_batch_kernel(T* head_embedding, // embedding from the previous chunk. this ensures that the updates are applied in a sequential // manner (which is crucial for avoiding outliers), and the resulting embedding is deterministic // because we use buffers. 
-  for (size_t offset = 0; offset < nnz; offset += TPB_X * grid.x) {
+  size_t chunk_size = static_cast<size_t>(TPB_X) * static_cast<size_t>(grid.x);
+  for (size_t offset = 0; offset < nnz; offset += chunk_size) {
     launch_kernel(offset);
     sparse_apply_embedding_updates(head_embedding,
                                    head_buffer,

From 30c2eed98dc7d2c59dd0fa818e79a4522b3f1365 Mon Sep 17 00:00:00 2001
From: jinsolp
Date: Wed, 14 Jan 2026 18:26:18 +0000
Subject: [PATCH 10/12] style check

---
 cpp/src/umap/simpl_set_embed/algo.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh
index 6daedd738a..5e6b5d3057 100644
--- a/cpp/src/umap/simpl_set_embed/algo.cuh
+++ b/cpp/src/umap/simpl_set_embed/algo.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */

From f54f73d90b7ed1a9a5272e60ace687b666195191 Mon Sep 17 00:00:00 2001
From: jinsolp
Date: Fri, 16 Jan 2026 14:31:13 -0800
Subject: [PATCH 11/12] add detailed comments

---
 cpp/src/umap/simpl_set_embed/algo.cuh                  | 4 ++++
 cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh
index 5e6b5d3057..bd575d29b7 100644
--- a/cpp/src/umap/simpl_set_embed/algo.cuh
+++ b/cpp/src/umap/simpl_set_embed/algo.cuh
@@ -255,9 +255,13 @@ void optimize_layout(T* head_embedding,
   if (has_outlier && params->deterministic) {
     // for processing in deterministic mode on datasets that are likely to have outliers, we use the
     // heuristic below to determine the number of chunks.
+    // Empirically determined: 100000 edges per chunk provides good balance between determinism and
+    // avoiding gradient accumulation issues on large datasets.
See benchmarks in
+    // https://github.com/rapidsai/cuml/pull/7597
     if (nnz > 100000) {
       num_chunks = raft::ceildiv(nnz, static_cast<nnz_t>(100000));
     } else if (nnz > 10000) {
+      // more fine-grained chunks for smaller number of edges
       num_chunks = raft::ceildiv(nnz, static_cast<nnz_t>(10000));
     }
   }

diff --git a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh
index 34213f2c21..9940914d31 100644
--- a/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh
+++ b/cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh
@@ -206,6 +206,8 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding,
     auto _epoch_of_next_sample = epoch_of_next_sample[row];
     if (_epoch_of_next_sample > epoch) {
       if (params.deterministic) {
+        // we return immediately in deterministic mode instead of continuing the grid-stride loop
+        // because we launch a new kernel for the next chunk
         return;
       } else {
         row += skip_size;
@@ -344,6 +346,8 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
     auto _epoch_of_next_sample = epoch_of_next_sample[row];
     if (_epoch_of_next_sample > epoch) {
       if (params.deterministic) {
+        // we return immediately in deterministic mode instead of continuing the grid-stride loop
+        // because we launch a new kernel for the next chunk
         return;
       } else {
         row += skip_size;

From 636ddade538fbaf4ecfc0911c34c98ec795406b4 Mon Sep 17 00:00:00 2001
From: jinsolp
Date: Fri, 16 Jan 2026 15:42:36 -0800
Subject: [PATCH 12/12] tail buffer assignment when move_other=true

---
 cpp/src/umap/simpl_set_embed/algo.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh
index bd575d29b7..8a0e4f540d 100644
--- a/cpp/src/umap/simpl_set_embed/algo.cuh
+++ b/cpp/src/umap/simpl_set_embed/algo.cuh
@@ -311,9 +311,9 @@ void optimize_layout(T* head_embedding,
     tail_flags.resize(tail_flag_words, stream_view);
     RAFT_CUDA_TRY(
cudaMemsetAsync(tail_flags.data(), '\0', sizeof(uint32_t) * tail_flag_words, stream)); - d_tail_flags = tail_flags.data(); + d_tail_flags = tail_flags.data(); + d_tail_buffer = tail_buffer.data(); } - d_tail_buffer = tail_buffer.data(); } // we keep the tpb and change the number of blocks we launch to handle the total number of threads