Skip to content

Commit 87a760e

Browse files
gtwang01 authored and facebook-github-bot committed
Add testing for utils/hamming.cpp (facebookresearch#4079)
Summary: Pull Request resolved: facebookresearch#4079 As title Reviewed By: mengdilin Differential Revision: D66976823 fbshipit-source-id: 71f8d261787a75d7ec59ad4bea6e27676e2410b5
1 parent 5f0a99a commit 87a760e

File tree

4 files changed

+299
-4
lines changed

4 files changed

+299
-4
lines changed

faiss/utils/hamming.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
#include <faiss/utils/hamming.h>
2525

2626
#include <algorithm>
27-
#include <cmath>
2827
#include <cstdio>
2928
#include <memory>
3029
#include <vector>

faiss/utils/hamming_distance/neon-inl.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,9 @@ inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
9898

9999
/* Hamming distances for multiple of 64 bits */
100100
inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) {
101-
const size_t nwords256 = nwords / 256;
102-
const size_t nwords128 = (nwords - nwords256 * 256) / 128;
103-
const size_t nwords64 = (nwords - nwords256 * 256 - nwords128 * 128) / 64;
101+
const size_t nwords256 = nwords / 4;
102+
const size_t nwords128 = (nwords % 4) / 2;
103+
const size_t nwords64 = nwords % 2;
104104

105105
hamdis_t h = 0;
106106
if (nwords256 > 0) {

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ set(FAISS_TEST_SRC
3535
test_common_ivf_empty_index.cpp
3636
test_callback.cpp
3737
test_utils.cpp
38+
test_hamming.cpp
3839
)
3940

4041
add_executable(faiss_test ${FAISS_TEST_SRC})

tests/test_hamming.cpp

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* This source code is licensed under the MIT license found in the
5+
* LICENSE file in the root directory of this source tree.
6+
*/
7+
8+
#include <gtest/gtest.h>
9+
10+
#include <faiss/impl/FaissAssert.h>
11+
#include <faiss/utils/hamming.h>
12+
#include <random>
13+
14+
using namespace ::testing;
15+
16+
/**
 * Render a flat buffer as a '|'-delimited string for debug output.
 *
 * Each element is converted with std::to_string. A '|' is written before
 * every element that starts a group of `divider` elements, a single space
 * before every other element, and a closing '|' after the last element.
 * An empty buffer yields "|".
 *
 * @param data     buffer to print; must not be null
 * @param divider  number of elements per group; 0 is treated as "no
 *                 grouping" (one single group) instead of dividing by zero
 * @return the formatted string, e.g. "|1 2|3 4|" for {1, 2, 3, 4} with
 *         divider 2
 */
template <typename T>
std::string print_data(
        std::shared_ptr<std::vector<T>> data,
        const size_t divider) {
    std::string ret;
    // size_t index: avoids the signed/unsigned comparison against size()
    for (size_t i = 0; i < data->size(); ++i) {
        const bool starts_group =
                (divider == 0) ? (i == 0) : (i % divider == 0);
        ret += starts_group ? "|" : " ";
        ret += std::to_string((*data)[i]);
    }
    ret += "|";
    return ret;
}
32+
33+
std::stringstream get_correct_hamming_example(
34+
const size_t na, // number of queries
35+
const size_t nb, // number of candidates
36+
const size_t k,
37+
const size_t code_size,
38+
std::shared_ptr<std::vector<uint8_t>> a,
39+
std::shared_ptr<std::vector<uint8_t>> b,
40+
std::shared_ptr<std::vector<long>> true_ids,
41+
std::shared_ptr<std::vector<int>> true_distances) {
42+
assert(nb > k);
43+
44+
// Initialization
45+
std::default_random_engine rng(123);
46+
std::uniform_int_distribution<int32_t> uniform(0, nb - 1);
47+
48+
const size_t nresults = na * k;
49+
50+
a->clear();
51+
a->resize(na * code_size, 1); // query vectors are all 1
52+
b->clear();
53+
b->resize(nb * code_size, 2); // database vectors are all 2
54+
55+
true_ids->clear();
56+
true_ids->reserve(nresults);
57+
true_distances->clear();
58+
true_distances->reserve(nresults);
59+
60+
// define correct ids (must be unique)
61+
std::set<long> correct_ids;
62+
do {
63+
correct_ids.insert(uniform(rng));
64+
} while (correct_ids.size() < k);
65+
66+
// replace database vector at id with vector more similar to query
67+
// ordered, so earlier ids must be more similar
68+
for (size_t nmatches = k; nmatches > 0; --nmatches) {
69+
// get id and erase it
70+
const size_t id = *correct_ids.begin();
71+
*correct_ids.erase(correct_ids.begin());
72+
73+
// assemble true id and distance at locations
74+
true_ids->push_back(id);
75+
true_distances->push_back(code_size - nmatches); // hamming dist
76+
for (size_t i = 0; i < nmatches; ++i) {
77+
b->begin()[id * code_size + i] = 1;
78+
}
79+
}
80+
81+
// true_ids and true_distances only contain results for the first query
82+
// each query is identical, so copy the first query na-1 times
83+
for (size_t i = 1; i < na; ++i) {
84+
true_ids->insert(
85+
true_ids->end(), true_ids->begin(), true_ids->begin() + k);
86+
true_distances->insert(
87+
true_distances->end(),
88+
true_distances->begin(),
89+
true_distances->begin() + k);
90+
}
91+
92+
// assemble string for debugging
93+
std::stringstream ret;
94+
ret << "na: " << na << std::endl
95+
<< "nb: " << nb << std::endl
96+
<< "k: " << k << std::endl
97+
<< "code_size: " << code_size << std::endl
98+
<< "a: " << print_data(a, code_size) << std::endl
99+
<< "b: " << print_data(b, code_size) << std::endl
100+
<< "true_ids: " << print_data(true_ids, k) << std::endl
101+
<< "true_distances: " << print_data(true_distances, k) << std::endl;
102+
return ret;
103+
}
104+
105+
// Compares faiss::crosshamming_count_thres with a reference count built from
// faiss::hamming on random codes, for code sizes 8, 16, 32 and 64 bytes, and
// expects a faiss::FaissException for a code size of 65 bytes.
TEST(TestHamming, test_crosshamming_count_thres) {
    // Initialize randomizer
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 255);

    // Initialize inputs
    const size_t n = 10; // number of codes
    const hamdis_t hamming_threshold = 20;

    // one for each case - 65 is default
    for (auto ncodes : {8, 16, 32, 64, 65}) {
        // initialize inputs
        const int nbits = ncodes * 8;
        const size_t nwords = nbits / 64;
        // 8 bytes per word for the later conversion to uint64_t, and a
        // factor of 2 as extra buffer
        std::vector<uint8_t> dbs(nwords * n * 8 * 2);
        for (size_t i = 0; i < dbs.size(); ++i) {
            dbs[i] = static_cast<uint8_t>(uniform(rng));
        }

        // get true distance
        size_t true_count = 0;
        const uint64_t* bs1 = reinterpret_cast<const uint64_t*>(dbs.data());
        // NOTE(review): the second operand is read 2 words past code j
        // rather than at code j itself - presumably this mirrors the pointer
        // arithmetic of the function under test; confirm.
        const uint64_t* bs2 = bs1 + 2;
        for (size_t i = 0; i < n; ++i) {
            for (size_t j = i + 1; j < n; ++j) {
                if (faiss::hamming(bs1 + i * nwords, bs2 + j * nwords, nwords) <
                    hamming_threshold) {
                    ++true_count;
                }
            }
        }

        // run test and check correctness
        size_t count = 0;
        if (ncodes == 65) {
            ASSERT_THROW(
                    faiss::crosshamming_count_thres(
                            dbs.data(), n, hamming_threshold, ncodes, &count),
                    faiss::FaissException);
            continue;
        }
        faiss::crosshamming_count_thres(
                dbs.data(), n, hamming_threshold, ncodes, &count);

        ASSERT_EQ(count, true_count) << "ncodes = " << ncodes;
    }
}
153+
// Compares faiss::match_hamming_thres and faiss::hamming_count_thres with a
// reference built from faiss::hamming on random codes, for code sizes 8, 16,
// 32 and 64 bytes, and expects a faiss::FaissException from both functions
// for a code size of 65 bytes.
TEST(TestHamming, test_hamming_thres) {
    // Initialize randomizer
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 255);

    // Initialize inputs
    const size_t n1 = 10;
    const size_t n2 = 15;
    const hamdis_t hamming_threshold = 100;

    // one for each case - 65 is default
    for (auto ncodes : {8, 16, 32, 64, 65}) {
        // initialize inputs
        const int nbits = ncodes * 8;
        const size_t nwords = nbits / 64;
        std::vector<uint8_t> bs1(nwords * n1 * 8);
        std::vector<uint8_t> bs2(nwords * n2 * 8);
        for (uint8_t& byte : bs1) {
            byte = static_cast<uint8_t>(uniform(rng));
        }
        for (uint8_t& byte : bs2) {
            byte = static_cast<uint8_t>(uniform(rng));
        }

        // get true distance
        size_t true_count = 0;
        std::vector<int64_t> true_idx; // flattened (i, j) pairs
        std::vector<hamdis_t> true_dis;

        const uint64_t* bs1_64 = reinterpret_cast<const uint64_t*>(bs1.data());
        const uint64_t* bs2_64 = reinterpret_cast<const uint64_t*>(bs2.data());
        for (size_t i = 0; i < n1; ++i) {
            for (size_t j = 0; j < n2; ++j) {
                const hamdis_t ham_dist = faiss::hamming(
                        bs1_64 + i * nwords, bs2_64 + j * nwords, nwords);
                // NOTE(review): strict comparison; the output buffers below
                // are sized from this count, so confirm that the functions
                // under test do not treat the threshold as inclusive.
                if (ham_dist < hamming_threshold) {
                    ++true_count;
                    true_idx.push_back(static_cast<int64_t>(i));
                    true_idx.push_back(static_cast<int64_t>(j));
                    true_dis.push_back(ham_dist);
                }
            }
        }

        // run test and check correctness for both
        // match_hamming_thres and hamming_count_thres
        std::vector<int64_t> idx(true_idx.size());
        std::vector<hamdis_t> dis(true_dis.size());
        if (ncodes == 65) {
            ASSERT_THROW(
                    faiss::match_hamming_thres(
                            bs1.data(),
                            bs2.data(),
                            n1,
                            n2,
                            hamming_threshold,
                            ncodes,
                            idx.data(),
                            dis.data()),
                    faiss::FaissException);
            ASSERT_THROW(
                    faiss::hamming_count_thres(
                            bs1.data(),
                            bs2.data(),
                            n1,
                            n2,
                            hamming_threshold,
                            ncodes,
                            nullptr),
                    faiss::FaissException);
            continue;
        }
        const size_t match_count = faiss::match_hamming_thres(
                bs1.data(),
                bs2.data(),
                n1,
                n2,
                hamming_threshold,
                ncodes,
                idx.data(),
                dis.data());
        size_t count_count = 0;
        faiss::hamming_count_thres(
                bs1.data(),
                bs2.data(),
                n1,
                n2,
                hamming_threshold,
                ncodes,
                &count_count);

        ASSERT_EQ(match_count, true_count) << "ncodes = " << ncodes;
        ASSERT_EQ(count_count, true_count) << "ncodes = " << ncodes;
        ASSERT_EQ(idx, true_idx) << "ncodes = " << ncodes;
        ASSERT_EQ(dis, true_dis) << "ncodes = " << ncodes;
    }
}
250+
251+
// Runs faiss::generalized_hammings_knn_hc and faiss::hammings_knn on the
// fixture produced by get_correct_hamming_example and checks the returned
// ids and distances against the fixture's expected values.
TEST(TestHamming, test_hamming_knn) {
    // Initialize inputs
    const size_t na = 4;
    const size_t nb = 12; // number of candidates
    const size_t k = 6;

    auto a = std::make_shared<std::vector<uint8_t>>();
    auto b = std::make_shared<std::vector<uint8_t>>();
    auto true_ids = std::make_shared<std::vector<long>>();
    auto true_distances = std::make_shared<std::vector<int>>();

    // 8, 16, 32 are cases - 24 will hit default case
    // all should be multiples of 8
    for (auto code_size : {8, 16, 24, 32}) {
        // get example
        std::stringstream assert_str = get_correct_hamming_example(
                na, nb, k, code_size, a, b, true_ids, true_distances);

        // run test on generalized_hammings_knn_hc
        std::vector<long> ids_gen(na * k);
        std::vector<int> dist_gen(na * k);
        faiss::int_maxheap_array_t res = {
                na, k, ids_gen.data(), dist_gen.data()};
        faiss::generalized_hammings_knn_hc(
                &res, a->data(), b->data(), nb, code_size, true);
        ASSERT_EQ(ids_gen, *true_ids) << assert_str.str();
        ASSERT_EQ(dist_gen, *true_distances) << assert_str.str();

        // run test on hammings_knn
        std::vector<long> ids_ham_knn(na * k, 0);
        std::vector<int> dist_ham_knn(na * k, 0);
        res = {na, k, ids_ham_knn.data(), dist_ham_knn.data()};
        faiss::hammings_knn(&res, a->data(), b->data(), nb, code_size, true);
        ASSERT_EQ(ids_ham_knn, *true_ids) << assert_str.str();
        // hammings_knn reports twice the expected distance. Presumably it
        // counts differing bits while true_distances counts differing
        // bytes, and the byte values 1 and 2 used by the fixture differ in
        // exactly two bits - TODO confirm. Halve before comparing.
        for (int& distance : dist_ham_knn) {
            distance /= 2;
        }
        ASSERT_EQ(dist_ham_knn, *true_distances) << assert_str.str();
    }
}

0 commit comments

Comments
 (0)