From 900241c257d4fec03247242a9c36c24103f3eb61 Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Fri, 20 Sep 2024 03:23:19 -0700 Subject: [PATCH] rewrite python kmeans without scipy (#3873) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3873 The previous version required scipy to do the accumulation, which is replaced here with a nifty piece of numpy accumulation. This removes the need for scipy for non-sparse data. Reviewed By: junjieqi Differential Revision: D62884307 --- contrib/clustering.py | 16 ++++++++-------- tests/test_contrib.py | 20 ++++++++++++++++++++ tests/test_contrib_with_scipy.py | 20 -------------------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/contrib/clustering.py b/contrib/clustering.py index e84a7e63f6..79b6b05a5f 100644 --- a/contrib/clustering.py +++ b/contrib/clustering.py @@ -151,14 +151,12 @@ def assign_to(self, centroids, weights=None): I = I.ravel() D = D.ravel() - n = len(self.x) + nc, d = centroids.shape + sum_per_centroid = np.zeros((nc, d), dtype='float32') if weights is None: - weights = np.ones(n, dtype='float32') - nc = len(centroids) - m = scipy.sparse.csc_matrix( - (weights, I, np.arange(n + 1)), - shape=(nc, n)) - sum_per_centroid = m * self.x + np.add.at(sum_per_centroid, I, self.x) + else: + np.add.at(sum_per_centroid, I, weights[:, np.newaxis] * self.x) return I, D, sum_per_centroid @@ -185,7 +183,8 @@ def perform_search(self, centroids): def sparse_assign_to_dense(xq, xb, xq_norms=None, xb_norms=None): """ assignment function for xq is sparse, xb is dense - uses a matrix multiplication. The squared norms can be provided if available. + uses a matrix multiplication. The squared norms can be provided if + available. """ nq = xq.shape[0] nb = xb.shape[0] @@ -272,6 +271,7 @@ def assign_to(self, centroids, weights=None): if weights is None: weights = np.ones(n, dtype='float32') nc = len(centroids) + m = scipy.sparse.csc_matrix( (weights, I, np.arange(n + 1)), shape=(nc, n)) diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 05a2c4ac8b..fa5d85ab51 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -517,6 +517,26 @@ def test_binary(self): class TestClustering(unittest.TestCase): + def test_python_kmeans(self): + """ Test the python implementation of kmeans """ + ds = datasets.SyntheticDataset(32, 10000, 0, 0) + x = ds.get_train() + + # bad distribution to stress-test split code + xt = x[:10000].copy() + xt[:5000] = x[0] + + km_ref = faiss.Kmeans(ds.d, 100, niter=10) + km_ref.train(xt) + err = faiss.knn(xt, km_ref.centroids, 1)[0].sum() + + data = clustering.DatasetAssign(xt) + centroids = clustering.kmeans(100, data, 10) + err2 = faiss.knn(xt, centroids, 1)[0].sum() + + # err=33498.332 err2=33380.477 + self.assertLess(err2, err * 1.1) + def test_2level(self): " verify that 2-level clustering is not too sub-optimal " ds = datasets.SyntheticDataset(32, 10000, 0, 0) diff --git a/tests/test_contrib_with_scipy.py b/tests/test_contrib_with_scipy.py index 4f89e2fc1b..618a550b73 100644 --- a/tests/test_contrib_with_scipy.py +++ b/tests/test_contrib_with_scipy.py @@ -17,26 +17,6 @@ class TestClustering(unittest.TestCase): - def test_python_kmeans(self): - """ Test the python implementation of kmeans """ - ds = datasets.SyntheticDataset(32, 10000, 0, 0) - x = ds.get_train() - - # bad distribution to stress-test split code - xt = x[:10000].copy() - xt[:5000] = x[0] - - km_ref = faiss.Kmeans(ds.d, 100, niter=10) - km_ref.train(xt) - err = faiss.knn(xt, km_ref.centroids, 1)[0].sum() - - data = clustering.DatasetAssign(xt) - centroids = clustering.kmeans(100, data, 10) - err2 = faiss.knn(xt, centroids, 1)[0].sum() - - # 33517.645 and 33031.098 - self.assertLess(err2, err * 1.1) - def test_sparse_routines(self): """ the sparse assignment routine """ ds = datasets.SyntheticDataset(1000, 2000, 0, 200)