rewrite python kmeans without scipy (#3873)

mdouze · facebook-github-bot · commit 900241c257d4 · 2024-09-20T03:23:19.000-07:00
Summary: Pull Request resolved: #3873. The previous version required scipy (a sparse-matrix product) to accumulate the per-centroid sums; this is replaced here with a nifty piece of numpy accumulation (`np.add.at`). This removes the need for scipy for non-sparse data. Reviewed By: junjieqi. Differential Revision: D62884307
diff --git a/contrib/clustering.py b/contrib/clustering.py
@@ -151,14 +151,12 @@ def assign_to(self, centroids, weights=None):
 
         I = I.ravel()
         D = D.ravel()
-        n = len(self.x)
+        nc, d = centroids.shape
+        sum_per_centroid = np.zeros((nc, d), dtype='float32')
         if weights is None:
-            weights = np.ones(n, dtype='float32')
-        nc = len(centroids)
-        m = scipy.sparse.csc_matrix(
-            (weights, I, np.arange(n + 1)),
-            shape=(nc, n))
-        sum_per_centroid = m * self.x
+            np.add.at(sum_per_centroid, I, self.x)
+        else: 
+            np.add.at(sum_per_centroid, I, weights[:, np.newaxis] * self.x)
 
         return I, D, sum_per_centroid
 
@@ -185,7 +183,8 @@ def perform_search(self, centroids):
 
 def sparse_assign_to_dense(xq, xb, xq_norms=None, xb_norms=None):
     """ assignment function for xq is sparse, xb is dense
-    uses a matrix multiplication. The squared norms can be provided if available.
+    uses a matrix multiplication. The squared norms can be provided if 
+    available.
     """
     nq = xq.shape[0]
     nb = xb.shape[0]
@@ -272,6 +271,7 @@ def assign_to(self, centroids, weights=None):
         if weights is None:
             weights = np.ones(n, dtype='float32')
         nc = len(centroids)
+        
         m = scipy.sparse.csc_matrix(
             (weights, I, np.arange(n + 1)),
             shape=(nc, n))
diff --git a/tests/test_contrib.py b/tests/test_contrib.py
@@ -517,6 +517,26 @@ def test_binary(self):
 
 class TestClustering(unittest.TestCase):
 
+    def test_python_kmeans(self):
+        """ Test the python implementation of kmeans """
+        ds = datasets.SyntheticDataset(32, 10000, 0, 0)
+        x = ds.get_train()
+
+        # bad distribution to stress-test split code
+        xt = x[:10000].copy()
+        xt[:5000] = x[0]
+
+        km_ref = faiss.Kmeans(ds.d, 100, niter=10)
+        km_ref.train(xt)
+        err = faiss.knn(xt, km_ref.centroids, 1)[0].sum()
+
+        data = clustering.DatasetAssign(xt)
+        centroids = clustering.kmeans(100, data, 10)
+        err2 = faiss.knn(xt, centroids, 1)[0].sum()
+
+        # err=33498.332 err2=33380.477
+        self.assertLess(err2, err * 1.1)
+
     def test_2level(self):
         " verify that 2-level clustering is not too sub-optimal "
         ds = datasets.SyntheticDataset(32, 10000, 0, 0)
diff --git a/tests/test_contrib_with_scipy.py b/tests/test_contrib_with_scipy.py
@@ -17,26 +17,6 @@
 
 class TestClustering(unittest.TestCase):
 
-    def test_python_kmeans(self):
-        """ Test the python implementation of kmeans """
-        ds = datasets.SyntheticDataset(32, 10000, 0, 0)
-        x = ds.get_train()
-
-        # bad distribution to stress-test split code
-        xt = x[:10000].copy()
-        xt[:5000] = x[0]
-
-        km_ref = faiss.Kmeans(ds.d, 100, niter=10)
-        km_ref.train(xt)
-        err = faiss.knn(xt, km_ref.centroids, 1)[0].sum()
-
-        data = clustering.DatasetAssign(xt)
-        centroids = clustering.kmeans(100, data, 10)
-        err2 = faiss.knn(xt, centroids, 1)[0].sum()
-
-        # 33517.645 and 33031.098
-        self.assertLess(err2, err * 1.1)
-
     def test_sparse_routines(self):
         """ the sparse assignment routine """
         ds = datasets.SyntheticDataset(1000, 2000, 0, 200)