From 34d7487750c4de6fcca0fab712dc092433eddd2a Mon Sep 17 00:00:00 2001 From: Michael Norris Date: Wed, 4 Dec 2024 20:48:11 -0800 Subject: [PATCH 1/4] Add more unit testing for IndexHNSW [1/n] (#4054) Summary: Part 1 of more HNSW unit tests Differential Revision: D66690398 --- tests/test_graph_based.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 1f840e6cac..6fb60b3598 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -73,6 +73,18 @@ def test_hnsw_unbounded_queue(self): self.io_and_retest(index, Dhnsw, Ihnsw) + def test_hnsw_no_init_level0(self): + d = self.xq.shape[1] + + index = faiss.IndexHNSWFlat(d, 16) + index.init_level0 = False + index.add(self.xb) + Dhnsw, Ihnsw = index.search(self.xq, 1) + + self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 25) + + self.io_and_retest(index, Dhnsw, Ihnsw) + def io_and_retest(self, index, Dhnsw, Ihnsw): index2 = faiss.deserialize_index(faiss.serialize_index(index)) Dhnsw2, Ihnsw2 = index2.search(self.xq, 1) @@ -175,7 +187,7 @@ def test_abs_inner_product(self): xb = self.xb - self.xb.mean(axis=0) # need to be centered to give interesting directions xq = self.xq - self.xq.mean(axis=0) Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT) - + index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT) index.add(xb) Dnew, Inew = index.search(xq, 10) @@ -183,8 +195,23 @@ def test_abs_inner_product(self): inter = faiss.eval_intersection(Iref, Inew) # 4769 vs. 500*10 self.assertGreater(inter, Iref.size * 0.9) - - + + def test_hnsw_reset(self): + d = self.xb.shape[1] + index_flat = faiss.IndexFlat(d) + index_flat.add(self.xb) + self.assertEqual(index_flat.ntotal, self.xb.shape[0]) + index_hnsw = faiss.IndexHNSW(index_flat) + index_hnsw.add(self.xb) + # * 2 because we add to storage twice. This is just for testing + # that storage gets cleared correctly. + self.assertEqual(index_hnsw.ntotal, self.xb.shape[0] * 2) + + index_hnsw.reset() + + self.assertEqual(index_flat.ntotal, 0) + self.assertEqual(index_hnsw.ntotal, 0) + class Issue3684(unittest.TestCase): def test_issue3684(self): From b22e22b5f1a12a2758a905a5e86a3de23e11d543 Mon Sep 17 00:00:00 2001 From: Michael Norris Date: Wed, 4 Dec 2024 20:48:11 -0800 Subject: [PATCH 2/4] Add more unit testing for IndexHNSW [2/n] Summary: Part 2 of more HNSW unit tests Added comments indicating some currently unused code. Differential Revision: D66782376 --- faiss/IndexHNSW.cpp | 11 +++++++++++ tests/test_graph_based.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 6a40196f00..4cc58d211c 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -351,6 +351,17 @@ void IndexHNSW::reconstruct(idx_t key, float* recons) const { storage->reconstruct(key, recons); } +/************************************************************** + * This section of functions were used during the development of HNSW support. + * They may be useful in the future but are dormant for now, and thus are not + * unit tested at the moment. + * shrink_level_0_neighbors + * search_level_0 + * init_level_0_from_knngraph + * init_level_0_from_entry_points + * reorder_links + * link_singletons + **************************************************************/ void IndexHNSW::shrink_level_0_neighbors(int new_size) { #pragma omp parallel { diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 6fb60b3598..a953617a8c 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -81,6 +81,8 @@ def test_hnsw_no_init_level0(self): index.add(self.xb) Dhnsw, Ihnsw = index.search(self.xq, 1) + # This is expected to be smaller because we are not initializing + # vectors into level 0. self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 25) self.io_and_retest(index, Dhnsw, Ihnsw) @@ -113,6 +115,24 @@ def test_hnsw_2level(self): self.io_and_retest(index, Dhnsw, Ihnsw) + def test_hnsw_2level_mixed_search(self): + d = self.xq.shape[1] + + quant = faiss.IndexFlatL2(d) + + storage = faiss.IndexIVFPQ(quant, d, 32, 8, 8) + storage.make_direct_map() + index = faiss.IndexHNSW2Level(quant, 32, 8, 8) + index.storage = storage + index.train(self.xb) + index.add(self.xb) + Dhnsw, Ihnsw = index.search(self.xq, 1) + + # It is expected that the mixed search will perform worse. + self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 200) + + self.io_and_retest(index, Dhnsw, Ihnsw) + def test_add_0_vecs(self): index = faiss.IndexHNSWFlat(10, 16) zero_vecs = np.zeros((0, 10), dtype='float32') From a8c7a1a67cb0d851fe59dbcf5bb1fd547d9f4958 Mon Sep 17 00:00:00 2001 From: Michael Norris Date: Wed, 4 Dec 2024 20:48:11 -0800 Subject: [PATCH 3/4] Add more unit testing for HNSW [3/n] (#4059) Summary: Moved the unused reference version to test_hnsw to match the existing pattern Differential Revision: D66793367 --- faiss/impl/HNSW.cpp | 122 +++++++++++++++++--------------------------- faiss/impl/HNSW.h | 9 ++++ tests/test_hnsw.cpp | 94 ++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 76 deletions(-) diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index 4f1ac7c3ba..a2267bd622 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -351,6 +351,8 @@ void add_link( } } +} // namespace + /// search neighbors on a single level, starting from an entry point void search_neighbors_to_add( HNSW& hnsw, @@ -360,9 +362,6 @@ void search_neighbors_to_add( float d_entry_point, int level, VisitedTable& vt) { - // selects a version - const bool reference_version = false; - // top is nearest candidate std::priority_queue candidates; @@ -385,93 +384,64 @@ void search_neighbors_to_add( size_t begin, end; hnsw.neighbor_range(currNode, level, &begin, &end); - // select a version, based on a flag - if (reference_version) { - // a reference version - for (size_t i = begin; i < end; i++) { - storage_idx_t nodeId = hnsw.neighbors[i]; - if (nodeId < 0) - break; - if (vt.get(nodeId)) - continue; - vt.set(nodeId); - - float dis = qdis(nodeId); - NodeDistFarther evE1(dis, nodeId); - - if (results.size() < hnsw.efConstruction || - results.top().d > dis) { - results.emplace(dis, nodeId); - candidates.emplace(dis, nodeId); - if (results.size() > hnsw.efConstruction) { - results.pop(); - } + // process 4 neighbors at a time + // Compare this to reference version in test_hnsw.cpp + auto update_with_candidate = [&](const storage_idx_t idx, + const float dis) { + if (results.size() < hnsw.efConstruction || results.top().d > dis) { + results.emplace(dis, idx); + candidates.emplace(dis, idx); + if (results.size() > hnsw.efConstruction) { + results.pop(); } } - } else { - // a faster version - - // the following version processes 4 neighbors at a time - auto update_with_candidate = [&](const storage_idx_t idx, - const float dis) { - if (results.size() < hnsw.efConstruction || - results.top().d > dis) { - results.emplace(dis, idx); - candidates.emplace(dis, idx); - if (results.size() > hnsw.efConstruction) { - results.pop(); - } - } - }; + }; + + int n_buffered = 0; + storage_idx_t buffered_ids[4]; - int n_buffered = 0; - storage_idx_t buffered_ids[4]; + for (size_t j = begin; j < end; j++) { + storage_idx_t nodeId = hnsw.neighbors[j]; + if (nodeId < 0) + break; + if (vt.get(nodeId)) { + continue; + } + vt.set(nodeId); - for (size_t j = begin; j < end; j++) { - storage_idx_t nodeId = hnsw.neighbors[j]; - if (nodeId < 0) - break; - if (vt.get(nodeId)) { - continue; - } - vt.set(nodeId); - - buffered_ids[n_buffered] = nodeId; - n_buffered += 1; - - if (n_buffered == 4) { - float dis[4]; - qdis.distances_batch_4( - buffered_ids[0], - buffered_ids[1], - buffered_ids[2], - buffered_ids[3], - dis[0], - dis[1], - dis[2], - dis[3]); - - for (size_t id4 = 0; id4 < 4; id4++) { - update_with_candidate(buffered_ids[id4], dis[id4]); - } + buffered_ids[n_buffered] = nodeId; + n_buffered += 1; - n_buffered = 0; + if (n_buffered == 4) { + float dis[4]; + qdis.distances_batch_4( + buffered_ids[0], + buffered_ids[1], + buffered_ids[2], + buffered_ids[3], + dis[0], + dis[1], + dis[2], + dis[3]); + + for (size_t id4 = 0; id4 < 4; id4++) { + update_with_candidate(buffered_ids[id4], dis[id4]); } - } - // process leftovers - for (size_t icnt = 0; icnt < n_buffered; icnt++) { - float dis = qdis(buffered_ids[icnt]); - update_with_candidate(buffered_ids[icnt], dis); + n_buffered = 0; } } + + // process leftovers + for (size_t icnt = 0; icnt < n_buffered; icnt++) { + float dis = qdis(buffered_ids[icnt]); + update_with_candidate(buffered_ids[icnt], dis); + } } vt.advance(); } -} // namespace - /// Finds neighbors and builds links with them, starting from an entry /// point. The own neighbor list is assumed to be locked. void HNSW::add_links_starting_from( diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h index 71419edbb5..e7fca79e88 100644 --- a/faiss/impl/HNSW.h +++ b/faiss/impl/HNSW.h @@ -281,4 +281,13 @@ std::priority_queue search_from_candidate_unbounded( VisitedTable* vt, HNSWStats& stats); +void search_neighbors_to_add( + HNSW& hnsw, + DistanceComputer& qdis, + std::priority_queue& results, + int entry_point, + float d_entry_point, + int level, + VisitedTable& vt); + } // namespace faiss diff --git a/tests/test_hnsw.cpp b/tests/test_hnsw.cpp index c546a76778..ccd4f8932f 100644 --- a/tests/test_hnsw.cpp +++ b/tests/test_hnsw.cpp @@ -425,6 +425,62 @@ std::priority_queue reference_search_from_candidate_unbounded return top_candidates; } +/// search neighbors on a single level, starting from an entry point +void reference_search_neighbors_to_add( + faiss::HNSW& hnsw, + faiss::DistanceComputer& qdis, + std::priority_queue& results, + int entry_point, + float d_entry_point, + int level, + faiss::VisitedTable& vt) { + // top is nearest candidate + std::priority_queue candidates; + + faiss::HNSW::NodeDistFarther ev(d_entry_point, entry_point); + candidates.push(ev); + results.emplace(d_entry_point, entry_point); + vt.set(entry_point); + + while (!candidates.empty()) { + // get nearest + const faiss::HNSW::NodeDistFarther& currEv = candidates.top(); + + if (currEv.d > results.top().d) { + break; + } + int currNode = currEv.id; + candidates.pop(); + + // loop over neighbors + size_t begin, end; + hnsw.neighbor_range(currNode, level, &begin, &end); + + // a reference version + for (size_t i = begin; i < end; i++) { + faiss::HNSW::storage_idx_t nodeId = hnsw.neighbors[i]; + if (nodeId < 0) + break; + if (vt.get(nodeId)) + continue; + vt.set(nodeId); + + float dis = qdis(nodeId); + faiss::HNSW::NodeDistFarther evE1(dis, nodeId); + + if (results.size() < hnsw.efConstruction || results.top().d > dis) { + results.emplace(dis, nodeId); + candidates.emplace(dis, nodeId); + if (results.size() > hnsw.efConstruction) { + results.pop(); + } + } + } + } + + vt.advance(); +} + TEST_F(HNSWTest, TEST_search_from_candidate_unbounded) { omp_set_num_threads(1); auto nearest = index->hnsw.entry_point; @@ -541,3 +597,41 @@ TEST_F(HNSWTest, TEST_search_from_candidates) { EXPECT_EQ(reference_stats.n1, stats.n1); EXPECT_EQ(reference_stats.n2, stats.n2); } + +TEST_F(HNSWTest, TEST_search_neighbors_to_add) { + omp_set_num_threads(1); + + faiss::VisitedTable vt(index->ntotal); + faiss::VisitedTable reference_vt(index->ntotal); + + std::priority_queue link_targets; + std::priority_queue reference_link_targets; + + faiss::search_neighbors_to_add( + index->hnsw, + *dis, + link_targets, + index->hnsw.entry_point, + (*dis)(index->hnsw.entry_point), + index->hnsw.max_level, + vt); + + reference_search_neighbors_to_add( + index->hnsw, + *dis, + reference_link_targets, + index->hnsw.entry_point, + (*dis)(index->hnsw.entry_point), + index->hnsw.max_level, + reference_vt); + + EXPECT_EQ(link_targets.size(), reference_link_targets.size()); + while (!link_targets.empty()) { + auto val = link_targets.top(); + auto reference_val = reference_link_targets.top(); + EXPECT_EQ(val.d, reference_val.d); + EXPECT_EQ(val.id, reference_val.id); + link_targets.pop(); + reference_link_targets.pop(); + } +} From 480a1246e258bde19b140633eeadfe14962239fc Mon Sep 17 00:00:00 2001 From: Michael Norris Date: Wed, 4 Dec 2024 20:48:11 -0800 Subject: [PATCH 4/4] Add more unit testing for HNSW [4/n] Summary: search_level_0 is not unit tested yet. It isn't called anywhere so we can't test from the Index level, but it may be in the future, so adding a quick test for it. Differential Revision: D66800813 --- tests/test_hnsw.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_hnsw.cpp b/tests/test_hnsw.cpp index ccd4f8932f..e2e9b4b1b7 100644 --- a/tests/test_hnsw.cpp +++ b/tests/test_hnsw.cpp @@ -635,3 +635,46 @@ TEST_F(HNSWTest, TEST_search_neighbors_to_add) { reference_link_targets.pop(); } } + +TEST_F(HNSWTest, TEST_search_level_0) { + omp_set_num_threads(1); + std::vector I(k * nq); + std::vector D(k * nq); + + using RH = faiss::HeapBlockResultHandler; + RH bres1(nq, D.data(), I.data(), k); + faiss::HeapBlockResultHandler::SingleResultHandler res1( + bres1); + RH bres2(nq, D.data(), I.data(), k); + faiss::HeapBlockResultHandler::SingleResultHandler res2( + bres2); + + faiss::HNSWStats stats1, stats2; + faiss::VisitedTable vt1(index->ntotal); + faiss::VisitedTable vt2(index->ntotal); + auto nprobe = 5; + const faiss::HNSW::storage_idx_t values[] = {1, 2, 3, 4, 5}; + const faiss::HNSW::storage_idx_t* nearest_i = values; + const float distances[] = {0.1, 0.2, 0.3, 0.4, 0.5}; + const float* nearest_d = distances; + + // search_type == 1 + res1.begin(0); + index->hnsw.search_level_0( + *dis, res1, nprobe, nearest_i, nearest_d, 1, stats1, vt1, nullptr); + res1.end(); + + // search_type == 2 + res2.begin(0); + index->hnsw.search_level_0( + *dis, res2, nprobe, nearest_i, nearest_d, 2, stats2, vt2, nullptr); + res2.end(); + + // search_type 1 calls search_from_candidates in a loop nprobe times. + // search_type 2 pushes the candidates and just calls search_from_candidates + // once, so those stats will be much less. + EXPECT_GT(stats1.ndis, stats2.ndis); + EXPECT_GT(stats1.nhops, stats2.nhops); + EXPECT_GT(stats1.n1, stats2.n1); + EXPECT_GT(stats1.n2, stats2.n2); +}