Skip to content

Commit e736d05

Browse files
authored
Fix UMAP outlier issue by checking for outliers and shuffling (#7131)
Closing #6454 Main difference between our simplicial set embedding and CPU UMAP was in negative sampling. We should use updated values (value after adding gradients) in the negative sampling stage. Dispatched to two kernels (and three usages) based on `n_components`. Fixed like below. - `optimize_batch_kernel_reg` (`n_components=2`): update the `current_reg` register value (used later in the negative sampling stage) along with `grads` - `optimize_batch_kernel` (with shared memory): distinguish `current_buffer` (which used to JUST hold the gradient) from the `grad_buffer`. Now `current_buffer` and `grad_buffer` correspond to the `current_reg` and `grads` registers in the register-approach kernel. - `optimize_batch_kernel` (without shared memory): untouched because the grads are applied directly to global memory. This updated value in global memory is read directly for negative sampling later on. ## Visualizations 2D 50K samples randomly selected for plotting. From the left - CPU KNN + CPU UMAP - GPU KNN + CPU UMAP - GPU KNN + GPU UMAP Before fix - GPU KNN + GPU UMAP After fix in this PR Using dataset 639K x 384 <img width="2400" height="600" alt="unique_embeddings_Beauty_comparison" src="https://github.com/user-attachments/assets/2b687c82-4a2d-4288-bcaa-d95d54a1b8ae" /> Using dataset 1.8M x 384 <img width="2400" height="600" alt="unique_embeddings_Appliances_comparison" src="https://github.com/user-attachments/assets/66e94360-6a55-4d37-8851-69c00e485685" /> ## Visualizations 3D 50K samples randomly selected for plotting. Plotting the same dataset with `n_components=3` (which uses the second kernel). 
From the left - GPU KNN + CPU UMAP - GPU KNN + GPU UMAP Before fix - GPU KNN + GPU UMAP After fix in this PR Using dataset 639K x 384 (was already doing pretty well without outliers, still doing well) <img width="1905" height="666" alt="Screenshot 2025-08-25 at 1 16 37 PM" src="https://github.com/user-attachments/assets/edbfec64-ae9a-45f6-84b4-cc7e3c431884" /> Using dataset 1.8M x 384, which had outliers before the fix. <img width="1768" height="716" alt="Screenshot 2025-08-25 at 1 22 41 PM" src="https://github.com/user-attachments/assets/cfcffc8c-0ee3-4ad8-81f3-692483fec70e" /> Authors: - Jinsol Park (https://github.com/jinsolp) - Dante Gama Dessavre (https://github.com/dantegd) - Simon Adorf (https://github.com/csadorf) Approvers: - Victor Lafargue (https://github.com/viclafargue) - Divye Gala (https://github.com/divyegala) - Simon Adorf (https://github.com/csadorf) URL: #7131
1 parent e5adc43 commit e736d05

3 files changed

Lines changed: 185 additions & 21 deletions

File tree

cpp/src/umap/simpl_set_embed/algo.cuh

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,24 @@
2121
#include <cuml/common/logger.hpp>
2222
#include <cuml/manifold/umapparams.h>
2323

24+
#include <raft/linalg/init.cuh>
2425
#include <raft/linalg/unary_op.cuh>
2526
#include <raft/sparse/coo.hpp>
2627
#include <raft/sparse/op/filter.cuh>
2728
#include <raft/util/cudart_utils.hpp>
2829

30+
#include <rmm/device_scalar.hpp>
2931
#include <rmm/device_uvector.hpp>
3032
#include <rmm/exec_policy.hpp>
3133

34+
#include <thrust/device_ptr.h>
3235
#include <thrust/iterator/constant_iterator.h>
3336
#include <thrust/iterator/discard_iterator.h>
37+
#include <thrust/iterator/zip_iterator.h>
3438
#include <thrust/reduce.h>
39+
#include <thrust/shuffle.h>
3540
#include <thrust/system/cuda/execution_policy.h>
41+
#include <thrust/tuple.h>
3642

3743
#include <curand.h>
3844
#include <math.h>
@@ -185,6 +191,47 @@ T create_gradient_rounding_factor(
185191
return create_rounding_factor(max_abs, n_edges);
186192
}
187193

194+
template <typename nnz_t>
195+
CUML_KERNEL void compute_degrees_kernel(const int* rows, nnz_t nnz, int* degrees)
196+
{
197+
nnz_t i = blockIdx.x * blockDim.x + threadIdx.x;
198+
if (i < nnz) {
199+
int row = rows[i];
200+
atomicAdd(&degrees[row], 1);
201+
}
202+
}
203+
204+
CUML_KERNEL void check_threshold_kernel(const int* degrees,
205+
int n_vertices,
206+
int threshold,
207+
bool* flag)
208+
{
209+
int i = blockIdx.x * blockDim.x + threadIdx.x;
210+
if (i < n_vertices) {
211+
if (degrees[i] > threshold) { *flag = true; }
212+
}
213+
}
214+
215+
template <typename nnz_t, int TPB_X>
216+
bool check_outliers(const int* rows, int m, nnz_t nnz, int threshold, cudaStream_t stream)
217+
{
218+
rmm::device_uvector<int> graph_degree_head(m, stream);
219+
raft::linalg::zero(graph_degree_head.data(), m, stream);
220+
221+
dim3 grid_nnz(raft::ceildiv(nnz, static_cast<nnz_t>(TPB_X)), 1, 1);
222+
dim3 blk(TPB_X, 1, 1);
223+
compute_degrees_kernel<<<grid_nnz, blk, 0, stream>>>(rows, nnz, graph_degree_head.data());
224+
225+
rmm::device_scalar<bool> has_outlier_d(0, stream); // initialize to 0
226+
227+
dim3 grid_head_n(raft::ceildiv(static_cast<nnz_t>(m), static_cast<nnz_t>(TPB_X)), 1, 1);
228+
check_threshold_kernel<<<grid_head_n, blk, 0, stream>>>(
229+
graph_degree_head.data(), m, threshold, has_outlier_d.data());
230+
cudaStreamSynchronize(stream);
231+
bool has_outlier_h = has_outlier_d.value(stream);
232+
return has_outlier_h;
233+
}
234+
188235
/**
189236
* Runs gradient descent using sampling weights defined on
190237
* both the attraction and repulsion vectors.
@@ -199,8 +246,8 @@ void optimize_layout(T* head_embedding,
199246
int head_n,
200247
T* tail_embedding,
201248
int tail_n,
202-
const int* head,
203-
const int* tail,
249+
int* head,
250+
int* tail,
204251
nnz_t nnz,
205252
T* epochs_per_sample,
206253
float gamma,
@@ -213,6 +260,39 @@ void optimize_layout(T* head_embedding,
213260
T alpha = params->initial_alpha;
214261

215262
auto stream_view = rmm::cuda_stream_view(stream);
263+
264+
T rounding = create_gradient_rounding_factor<T, nnz_t>(head, nnz, head_n, alpha, stream_view);
265+
266+
auto min_n = min(head_n, tail_n);
267+
int threshold_for_outlier = 1024; // this is a heuristic value.
268+
// for smaller datasets, could be a dense point even with a smaller graph degree
269+
if (min_n <= 100000) {
270+
threshold_for_outlier = 256;
271+
} else if (min_n <= 1000000) {
272+
threshold_for_outlier = 512;
273+
}
274+
275+
bool has_outlier = check_outliers<nnz_t, TPB_X>(head, head_n, nnz, threshold_for_outlier, stream);
276+
if (move_other && !has_outlier) {
277+
has_outlier = check_outliers<nnz_t, TPB_X>(tail, tail_n, nnz, threshold_for_outlier, stream);
278+
}
279+
280+
if (has_outlier) {
281+
// Shuffling is necessary when outliers may be present (i.e., dense points that undergo many
282+
// updates). It is critical to avoid having too many threads update the same embedding vector
283+
// simultaneously, as this can affect correctness. By shuffling, potential outlier points are
284+
// distributed across threads, rather than being processed by consecutive threads that are
285+
// scheduled together. This approach relies on the GPU's inability to physically schedule all
286+
// nnz edges at once.
287+
auto first =
288+
thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(head),
289+
thrust::device_pointer_cast(tail),
290+
thrust::device_pointer_cast(epochs_per_sample)));
291+
292+
thrust::default_random_engine rng(params->random_state);
293+
thrust::shuffle(first, first + nnz, rng);
294+
}
295+
216296
rmm::device_uvector<T> epoch_of_next_negative_sample(nnz, stream);
217297
T nsr_inv = T(1.0) / params->negative_sample_rate;
218298
raft::linalg::unaryOp<T>(
@@ -250,8 +330,6 @@ void optimize_layout(T* head_embedding,
250330
dim3 blk(TPB_X, 1, 1);
251331
uint64_t seed = params->random_state;
252332

253-
T rounding = create_gradient_rounding_factor<T, nnz_t>(head, nnz, head_n, alpha, stream_view);
254-
255333
for (int n = 0; n < n_epochs; n++) {
256334
call_optimize_batch_kernel<T, nnz_t, TPB_X>(head_embedding,
257335
d_head_buffer,

cpp/src/umap/simpl_set_embed/optimize_batch_kernel.cuh

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding,
156156
for (int d = 0; d < n_components; d++) {
157157
auto diff = current_reg[d] - other_reg[d];
158158
auto grad_d = clip<T>(attractive_grad_coeff * diff, T(-4.0), T(4.0));
159-
grads[d] = grad_d * alpha;
159+
current_reg[d] += grad_d * alpha;
160+
grads[d] = grad_d * alpha;
160161
}
161162
// storing gradients for negative samples back to global memory
162163
if (move_other) {
@@ -200,6 +201,7 @@ CUML_KERNEL void optimize_batch_kernel_reg(T const* head_embedding,
200201
grad_d = clip<T>(repulsive_grad_coeff * diff, T(-4.0), T(4.0));
201202
else
202203
grad_d = T(4.0);
204+
current_reg[d] += grad_d * alpha;
203205
grads[d] += grad_d * alpha;
204206
}
205207
}
@@ -252,8 +254,17 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
252254
T* cur_write = head_buffer + (j * n_components);
253255
T* oth_write = tail_buffer + (k * n_components);
254256

257+
// for reducing access to global memory. load values from global memory, and accumulate grads onto
258+
// this shared memory position instead of reading from global memory every time.
255259
T* current_buffer{nullptr};
256-
if (use_shared_mem) { current_buffer = (T*)embedding_shared_mem_updates + threadIdx.x; }
260+
// for keeping track of grads, final write to global memory
261+
T* grads_buffer{nullptr};
262+
if constexpr (use_shared_mem) {
263+
// n_components for thread0, then the next n_components for thread1 ...
264+
current_buffer = (T*)embedding_shared_mem_updates + threadIdx.x * n_components;
265+
// TPB_X for first component, then another TPB_X for the next component for better coalescing...
266+
grads_buffer = (T*)embedding_shared_mem_updates + TPB_X * n_components + threadIdx.x;
267+
}
257268
auto dist_squared = rdist<T>(current, other, n_components);
258269
// Attractive force between the two vertices, since they
259270
// are connected by an edge in the 1-skeleton.
@@ -267,10 +278,13 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
267278
* performing unsupervised training).
268279
*/
269280
for (int d = 0; d < n_components; d++) {
270-
auto grad_d = clip<T>(attractive_grad_coeff * (current[d] - other[d]), T(-4.0), T(4.0));
281+
T current_val = current[d];
282+
if constexpr (use_shared_mem) { current_buffer[d] = current_val; }
283+
auto grad_d = clip<T>(attractive_grad_coeff * (current_val - other[d]), T(-4.0), T(4.0));
271284
grad_d *= alpha;
272-
if (use_shared_mem) {
273-
current_buffer[d * TPB_X] = grad_d;
285+
if constexpr (use_shared_mem) {
286+
current_buffer[d] += grad_d;
287+
grads_buffer[d * TPB_X] = grad_d;
274288
} else {
275289
raft::myAtomicAdd<T>((T*)cur_write + d, truncate_gradient(rounding, grad_d));
276290
if (move_other) { // happens only during unsupervised training
@@ -282,7 +296,7 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
282296
if (use_shared_mem && move_other) {
283297
__syncthreads();
284298
for (int d = 0; d < n_components; d++) {
285-
auto grad = current_buffer[d * TPB_X];
299+
auto grad = grads_buffer[d * TPB_X];
286300
raft::myAtomicAdd<T>((T*)oth_write + d, truncate_gradient(rounding, -grad));
287301
}
288302
}
@@ -299,7 +313,11 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
299313
gen.next(r);
300314
nnz_t t = r % tail_n;
301315
T const* negative_sample = tail_embedding + (t * n_components);
302-
dist_squared = rdist<T>(current, negative_sample, n_components);
316+
if constexpr (use_shared_mem) {
317+
dist_squared = rdist<T>(current_buffer, negative_sample, n_components);
318+
} else {
319+
dist_squared = rdist<T>(current, negative_sample, n_components);
320+
}
303321
// repulsive force between two vertices
304322
auto repulsive_grad_coeff = T(0.0);
305323
if (dist_squared > T(0.0)) {
@@ -313,25 +331,31 @@ CUML_KERNEL void optimize_batch_kernel(T const* head_embedding,
313331
*/
314332
for (int d = 0; d < n_components; d++) {
315333
auto grad_d = T(0.0);
316-
if (repulsive_grad_coeff > T(0.0))
317-
grad_d = clip<T>(repulsive_grad_coeff * (current[d] - negative_sample[d]), T(-4.0), T(4.0));
318-
else
334+
if (repulsive_grad_coeff > T(0.0)) {
335+
if constexpr (use_shared_mem) {
336+
grad_d = clip<T>(
337+
repulsive_grad_coeff * (current_buffer[d] - negative_sample[d]), T(-4.0), T(4.0));
338+
} else {
339+
grad_d =
340+
clip<T>(repulsive_grad_coeff * (current[d] - negative_sample[d]), T(-4.0), T(4.0));
341+
}
342+
} else
319343
grad_d = T(4.0);
320344
grad_d *= alpha;
321-
if (use_shared_mem) {
322-
current_buffer[d * TPB_X] += grad_d;
345+
if constexpr (use_shared_mem) {
346+
current_buffer[d] += grad_d;
347+
grads_buffer[d * TPB_X] += grad_d;
323348
} else {
324349
raft::myAtomicAdd<T>((T*)cur_write + d, truncate_gradient(rounding, grad_d));
325350
}
326351
}
327352
}
328353

329354
// storing gradients for positive samples back to global memory
330-
if (use_shared_mem) {
355+
if constexpr (use_shared_mem) {
331356
__syncthreads();
332357
for (int d = 0; d < n_components; d++) {
333-
raft::myAtomicAdd<T>((T*)cur_write + d,
334-
truncate_gradient(rounding, current_buffer[d * TPB_X]));
358+
raft::myAtomicAdd<T>((T*)cur_write + d, truncate_gradient(rounding, grads_buffer[d * TPB_X]));
335359
}
336360
}
337361
epoch_of_next_negative_sample[row] =
@@ -373,7 +397,7 @@ void call_optimize_batch_kernel(T const* head_embedding,
373397
cudaStream_t& stream,
374398
T rounding)
375399
{
376-
std::size_t requiredSize = TPB_X * params->n_components;
400+
std::size_t requiredSize = TPB_X * params->n_components * 2;
377401
requiredSize *= sizeof(T);
378402
bool use_shared_mem = requiredSize < static_cast<std::size_t>(raft::getSharedMemPerBlock());
379403
T nsr_inv = T(1.0) / params->negative_sample_rate;

python/cuml/tests/test_umap.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from pylibraft.common import DeviceResourcesSNMG
3030
from sklearn import datasets
3131
from sklearn.cluster import KMeans
32-
from sklearn.datasets import make_blobs
32+
from sklearn.datasets import make_blobs, make_moons
3333
from sklearn.manifold import trustworthiness
3434
from sklearn.metrics import adjusted_rand_score
3535
from sklearn.neighbors import NearestNeighbors
@@ -924,3 +924,65 @@ def test_umap_small_fit_large_transform():
924924

925925
trust = trustworthiness(infer, embeddings, n_neighbors=10)
926926
assert trust >= 0.9
927+
928+
929+
@pytest.mark.parametrize("n_neighbors", [5, 15])
930+
@pytest.mark.parametrize("n_components", [2, 5])
931+
def test_umap_outliers(n_neighbors, n_components):
932+
all_neighbors = pytest.importorskip("cuvs.neighbors.all_neighbors")
933+
nn_descent = pytest.importorskip("cuvs.neighbors.nn_descent")
934+
935+
k = n_neighbors
936+
n_rows = 50_000
937+
938+
# This dataset was specifically chosen because UMAP produces outliers
939+
# on this dataset before the outlier fix.
940+
data, _ = make_moons(n_samples=n_rows, noise=0.0, random_state=42)
941+
data = data.astype(np.float32)
942+
943+
# precompute knn for faster testing with CPU UMAP
944+
nn_descent_params = nn_descent.IndexParams(
945+
metric="euclidean",
946+
graph_degree=k,
947+
intermediate_graph_degree=k * 2,
948+
)
949+
params = all_neighbors.AllNeighborsParams(
950+
algo="nn_descent",
951+
metric="euclidean",
952+
nn_descent_params=nn_descent_params,
953+
)
954+
indices, distances = all_neighbors.build(
955+
data,
956+
k,
957+
params,
958+
distances=cp.empty((n_rows, k), dtype=cp.float32),
959+
)
960+
indices = cp.asnumpy(indices)
961+
distances = cp.asnumpy(distances)
962+
963+
gpu_umap = cuUMAP(
964+
precomputed_knn=(indices, distances),
965+
build_algo="nn_descent",
966+
init="spectral",
967+
n_neighbors=n_neighbors,
968+
n_components=n_components,
969+
)
970+
gpu_umap_embeddings = gpu_umap.fit_transform(data)
971+
972+
cpu_umap = umap.UMAP(
973+
precomputed_knn=(indices, distances),
974+
init="spectral",
975+
n_neighbors=n_neighbors,
976+
n_components=n_components,
977+
)
978+
cpu_umap_embeddings = cpu_umap.fit_transform(data)
979+
980+
# test to see if there are values in the final embedding that are too out of range
981+
# compared to the cpu umap output.
982+
lower_bound = 3 * cpu_umap_embeddings.min()
983+
upper_bound = 3 * cpu_umap_embeddings.max()
984+
985+
assert np.all(
986+
(gpu_umap_embeddings >= lower_bound)
987+
& (gpu_umap_embeddings <= upper_bound)
988+
)

0 commit comments

Comments
 (0)