Skip to content

[BUG] cuVS Nearest Neighbors recall lower than expected for some datasets #1370

@jinsolp

Description

@jinsolp

Describe the bug
cuVS brute-force results do not match the Python-computed brute-force results 100%.
cuVS NN Descent seems to have low recall for some datasets (especially for smaller dimensions)

Steps/Code to reproduce bug

from pynndescent import NNDescent
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
import numpy as np
from cuvs.neighbors import nn_descent, brute_force
import cupy as cp

def knn_recall(pred: np.ndarray, gt: np.ndarray) -> float:
    """Compute k-NN recall of predicted neighbor indices against ground truth.

    Recall is the fraction of predicted neighbors that also appear in the
    ground-truth neighbor list for the same query row, averaged over all
    rows. Neighbor order within a row is ignored (set intersection).

    Parameters
    ----------
    pred : np.ndarray of shape (N, k)
        Predicted neighbor indices, one row per query point.
    gt : np.ndarray of shape (N, k)
        Ground-truth neighbor indices, rows aligned with ``pred``.

    Returns
    -------
    float
        Recall in [0, 1]. Returns 0.0 for empty input to avoid a
        division-by-zero error.
    """
    N, k = pred.shape
    # Guard: an empty prediction matrix would otherwise divide by zero.
    if N == 0 or k == 0:
        return 0.0

    correct = 0
    for i in range(N):
        # Order within a row does not matter for recall, so compare as sets.
        pred_set = set(pred[i])
        gt_set = set(gt[i])
        correct += len(pred_set & gt_set)

    return correct / (N * k)

def run_exp(X, data_tag, k=16):
    """Run all four k-NN searches on X and print pairwise recall figures.

    Compares CPU brute force (sklearn, exact reference) against CPU NN
    Descent (pynndescent), GPU NN Descent (cuVS), and GPU brute force
    (cuVS), printing recall of each against the CPU reference.
    """
    # Exact CPU reference neighbors via sklearn brute-force search.
    cpu_bf = NearestNeighbors(n_neighbors=k, algorithm="brute", metric="euclidean")
    cpu_bf.fit(X)
    cpu_bf_distances, cpu_bf_indices = cpu_bf.kneighbors(X)

    # Approximate CPU neighbors via pynndescent.
    cpu_nnd = NNDescent(X, metric="euclidean")
    cpu_nnd_indices, cpu_nnd_distances = cpu_nnd.query(X, k=k)

    # Exact GPU neighbors via cuVS brute force; copy results back to host.
    X_gpu = cp.asarray(X)
    bf_index = brute_force.build(X_gpu, metric="euclidean")
    gpu_bf_distances, gpu_bf_indices = brute_force.search(bf_index, X_gpu, k)
    gpu_bf_indices = gpu_bf_indices.copy_to_host()
    gpu_bf_distances = gpu_bf_distances.copy_to_host()

    # Approximate GPU neighbors via cuVS NN Descent; the built graph holds
    # each point's k neighbors directly (graph_degree=k).
    build_params = nn_descent.IndexParams(metric="euclidean", return_distances=True, graph_degree=k)
    nnd_index = nn_descent.build(build_params, X)
    gpu_nnd_indices = nnd_index.graph
    gpu_nnd_distances = nnd_index.distances

    print(f"cpu bf vs cpu nnd on {data_tag}: {knn_recall(cpu_bf_indices, cpu_nnd_indices)}")
    print(f"cpu bf vs gpu nnd on {data_tag}: {knn_recall(cpu_bf_indices, gpu_nnd_indices)}")
    print(f"cpu bf vs gpu bf on {data_tag}: {knn_recall(cpu_bf_indices, gpu_bf_indices)}\n")


# Reproduction cases: (feature matrix, human-readable tag). Fixed seeds so
# every run produces identical data.
experiments = [
    (datasets.make_blobs(n_samples=10_000, n_features=16, centers=5, random_state=42)[0],
     "blobs data with dim=16"),
    (datasets.make_blobs(n_samples=10_000, n_features=2, centers=5, random_state=42)[0],
     "blobs data with dim=2"),
    (datasets.make_moons(n_samples=10_000, noise=0.1, random_state=42)[0],
     "noisy moons data with dim=2"),
]

for features, tag in experiments:
    run_exp(features.astype(np.float32), tag)

Running this script results in:

cpu bf vs cpu nnd on blobs data with dim=16: 0.9886125
cpu bf vs gpu nnd on blobs data with dim=16: 0.93061875
cpu bf vs gpu bf on blobs data with dim=16: 0.9999625

cpu bf vs cpu nnd on blobs data with dim=2: 0.99999375
cpu bf vs gpu nnd on blobs data with dim=2: 0.6928
cpu bf vs gpu bf on blobs data with dim=2: 0.99985625

cpu bf vs cpu nnd on noisy moons data with dim=2: 0.999975
cpu bf vs gpu nnd on noisy moons data with dim=2: 0.808175
cpu bf vs gpu bf on noisy moons data with dim=2: 0.99991875

Expected behavior
Higher recall for GPU NN Descent and 1.0 recall for GPU Brute force

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    Projects

    Status

    Done

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions