Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions contrib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,12 @@ def assign_to(self, centroids, weights=None):

I = I.ravel()
D = D.ravel()
n = len(self.x)
nc, d = centroids.shape
sum_per_centroid = np.zeros((nc, d), dtype='float32')
if weights is None:
weights = np.ones(n, dtype='float32')
nc = len(centroids)
m = scipy.sparse.csc_matrix(
(weights, I, np.arange(n + 1)),
shape=(nc, n))
sum_per_centroid = m * self.x
np.add.at(sum_per_centroid, I, self.x)
else:
np.add.at(sum_per_centroid, I, weights[:, np.newaxis] * self.x)

return I, D, sum_per_centroid

Expand All @@ -185,7 +183,8 @@ def perform_search(self, centroids):

def sparse_assign_to_dense(xq, xb, xq_norms=None, xb_norms=None):
""" assignment function for xq is sparse, xb is dense
uses a matrix multiplication. The squared norms can be provided if available.
uses a matrix multiplication. The squared norms can be provided if
available.
"""
nq = xq.shape[0]
nb = xb.shape[0]
Expand Down Expand Up @@ -272,6 +271,7 @@ def assign_to(self, centroids, weights=None):
if weights is None:
weights = np.ones(n, dtype='float32')
nc = len(centroids)

m = scipy.sparse.csc_matrix(
(weights, I, np.arange(n + 1)),
shape=(nc, n))
Expand Down
20 changes: 20 additions & 0 deletions tests/test_contrib.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,26 @@ def test_binary(self):

class TestClustering(unittest.TestCase):

def test_python_kmeans(self):
    """ Check the python k-means against faiss.Kmeans on skewed data """
    dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
    train = dataset.get_train()

    # bad distribution to stress-test the split code: the first 5000
    # rows are all overwritten with the same vector
    xt = train[:10000].copy()
    xt[:5000] = train[0]

    # reference run with faiss.Kmeans; the score is the sum of the first
    # array returned by faiss.knn (presumably the distance from each
    # point to its nearest centroid -- confirm against faiss.knn)
    reference = faiss.Kmeans(dataset.d, 100, niter=10)
    reference.train(xt)
    ref_result = faiss.knn(xt, reference.centroids, 1)
    err = ref_result[0].sum()

    # same data through the contrib implementation, scored the same way
    assigner = clustering.DatasetAssign(xt)
    py_centroids = clustering.kmeans(100, assigner, 10)
    py_result = faiss.knn(xt, py_centroids, 1)
    err2 = py_result[0].sum()

    # values recorded in the original comment: err=33498.332 err2=33380.477
    # the python error must stay below 1.1x the reference error
    self.assertLess(err2, err * 1.1)

def test_2level(self):
" verify that 2-level clustering is not too sub-optimal "
ds = datasets.SyntheticDataset(32, 10000, 0, 0)
Expand Down
20 changes: 0 additions & 20 deletions tests/test_contrib_with_scipy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,6 @@

class TestClustering(unittest.TestCase):

def test_python_kmeans(self):
    """ Compare the contrib k-means with faiss.Kmeans on degenerate data """
    synthetic = datasets.SyntheticDataset(32, 10000, 0, 0)
    base = synthetic.get_train()

    # bad distribution to stress-test the split code: rows 0..4999 are
    # all replaced by a copy of row 0
    xt = base[:10000].copy()
    xt[:5000] = base[0]

    # baseline: faiss.Kmeans trained on xt, scored by summing the first
    # array that faiss.knn returns for a 1-neighbor search
    km_ref = faiss.Kmeans(synthetic.d, 100, niter=10)
    km_ref.train(xt)
    baseline_search = faiss.knn(xt, km_ref.centroids, 1)
    err = baseline_search[0].sum()

    # candidate: clustering.kmeans on the same data, scored identically
    wrapped = clustering.DatasetAssign(xt)
    centroids = clustering.kmeans(100, wrapped, 10)
    candidate_search = faiss.knn(xt, centroids, 1)
    err2 = candidate_search[0].sum()

    # values recorded in the original comment: 33517.645 and 33031.098
    # the candidate error must stay below 1.1x the baseline error
    self.assertLess(err2, err * 1.1)

def test_sparse_routines(self):
""" the sparse assignment routine """
ds = datasets.SyntheticDataset(1000, 2000, 0, 200)
Expand Down