from pynndescent import NNDescent
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
import numpy as np
from cuvs.neighbors import nn_descent, brute_force
import cupy as cp
def knn_recall(pred: np.ndarray, gt: np.ndarray) -> float:
N, k = pred.shape
correct = 0
for i in range(N):
pred_set = set(pred[i])
gt_set = set(gt[i])
correct += len(pred_set & gt_set)
return correct / (N * k)
def run_exp(X, data_tag, k=16):
# CPU Brute Force
nn = NearestNeighbors(n_neighbors=k, algorithm="brute", metric="euclidean")
nn.fit(X)
cpu_bf_distances, cpu_bf_indices = nn.kneighbors(X)
# CPU NN Descent
index = NNDescent(X, metric="euclidean")
cpu_nnd_indices, cpu_nnd_distances = index.query(X, k=k)
# GPU Brute Force
X_cp = cp.asarray(X)
index = brute_force.build(X_cp, metric="euclidean")
gpu_bf_distances, gpu_bf_indices = brute_force.search(index, X_cp, k)
gpu_bf_indices = gpu_bf_indices.copy_to_host()
gpu_bf_distances = gpu_bf_distances.copy_to_host()
# GPU NN Descent
build_params = nn_descent.IndexParams(metric="euclidean", return_distances=True, graph_degree=k)
index = nn_descent.build(build_params, X)
gpu_nnd_indices = index.graph
gpu_nnd_distances = index.distances
print(f"cpu bf vs cpu nnd on {data_tag}: {knn_recall(cpu_bf_indices, cpu_nnd_indices)}")
print(f"cpu bf vs gpu nnd on {data_tag}: {knn_recall(cpu_bf_indices, gpu_nnd_indices)}")
print(f"cpu bf vs gpu bf on {data_tag}: {knn_recall(cpu_bf_indices, gpu_bf_indices)}\n")
X, y = datasets.make_blobs(n_samples=10_000, n_features=16, centers=5, random_state=42)
run_exp(X.astype(np.float32), "blobs data with dim=16")
X, y = datasets.make_blobs(n_samples=10_000, n_features=2, centers=5, random_state=42)
run_exp(X.astype(np.float32), "blobs data with dim=2")
X, y = datasets.make_moons(n_samples=10_000, noise=0.1, random_state=42)
run_exp(X.astype(np.float32), "noisy moons data with dim=2")
cpu bf vs cpu nnd on blobs data with dim=16: 0.9886125
cpu bf vs gpu nnd on blobs data with dim=16: 0.93061875
cpu bf vs gpu bf on blobs data with dim=16: 0.9999625
cpu bf vs cpu nnd on blobs data with dim=2: 0.99999375
cpu bf vs gpu nnd on blobs data with dim=2: 0.6928
cpu bf vs gpu bf on blobs data with dim=2: 0.99985625
cpu bf vs cpu nnd on noisy moons data with dim=2: 0.999975
cpu bf vs gpu nnd on noisy moons data with dim=2: 0.808175
cpu bf vs gpu bf on noisy moons data with dim=2: 0.99991875
Describe the bug
cuVS brute force does not 100% match python-computed brute force results.
cuVS NN Descent seems to have low recall for some datasets (especially for smaller dimensions)
Steps/Code to reproduce bug
Running this script results in:
Expected behavior
Higher recall for GPU NN Descent and 1.0 recall for GPU Brute force