Skip to content

Commit b109d08

Browse files
mdouzefacebook-github-bot
authored andcommitted
Search and return codes (facebookresearch#3143)
Summary: This PR adds functionality to search an IVF index and return the codes stored for the search results. It also adds a few functions to compress int arrays into a bit-compact representation. Pull Request resolved: facebookresearch#3143 Test Plan: ``` buck test //faiss/tests/:test_index_composite -- TestSearchAndReconstruct buck test //faiss/tests/:test_standalone_codec -- test_arrays ``` Reviewed By: algoriddle Differential Revision: D51544613 Pulled By: mdouze fbshipit-source-id: 875f72d0f9140096851592422570efa0f65431fc
1 parent 467f70e commit b109d08

15 files changed

+929
-277
lines changed

benchs/bench_all_ivf/bench_all_ivf.py

Lines changed: 341 additions & 241 deletions
Large diffs are not rendered by default.

benchs/bench_hybrid_cpu_gpu.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -530,14 +530,7 @@ def aa(*args, **kwargs):
530530
raise RuntimeError()
531531

532532
totex = op.num_experiments()
533-
rs = np.random.RandomState(123)
534-
if totex < args.n_autotune:
535-
experiments = rs.permutation(totex - 2) + 1
536-
else:
537-
experiments = rs.randint(
538-
totex - 2, size=args.n_autotune - 2, replace=False)
539-
540-
experiments = [0, totex - 1] + list(experiments)
533+
experiments = op.sample_experiments()
541534
print(f"total nb experiments {totex}, running {len(experiments)}")
542535

543536
print("perform search")

contrib/evaluation.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,23 @@ def do_nothing_key(self):
380380
return np.zeros(len(self.ranges), dtype=int)
381381

382382
def num_experiments(self):
383-
return np.prod([len(values) for name, values in self.ranges])
383+
return int(np.prod([len(values) for name, values in self.ranges]))
384+
385+
def sample_experiments(self, n_autotune, rs=np.random):
386+
""" sample a set of experiments of max size n_autotune
387+
(run all experiments in random order if n_autotune is 0)
388+
"""
389+
assert n_autotune == 0 or n_autotune >= 2
390+
totex = self.num_experiments()
391+
rs = np.random.RandomState(123)
392+
if n_autotune == 0 or totex < n_autotune:
393+
experiments = rs.permutation(totex - 2)
394+
else:
395+
experiments = rs.choice(
396+
totex - 2, size=n_autotune - 2, replace=False)
397+
398+
experiments = [0, totex - 1] + [int(cno) + 1 for cno in experiments]
399+
return experiments
384400

385401
def cno_to_key(self, cno):
386402
"""Convert a sequential experiment number to a key"""

faiss/IndexIVF.cpp

Lines changed: 85 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -977,44 +977,107 @@ void IndexIVF::search_and_reconstruct(
977977
std::min(nlist, params ? params->nprobe : this->nprobe);
978978
FAISS_THROW_IF_NOT(nprobe > 0);
979979

980-
idx_t* idx = new idx_t[n * nprobe];
981-
ScopeDeleter<idx_t> del(idx);
982-
float* coarse_dis = new float[n * nprobe];
983-
ScopeDeleter<float> del2(coarse_dis);
980+
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
981+
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
984982

985-
quantizer->search(n, x, nprobe, coarse_dis, idx);
983+
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
986984

987-
invlists->prefetch_lists(idx, n * nprobe);
985+
invlists->prefetch_lists(idx.get(), n * nprobe);
988986

989987
// search_preassigned() with `store_pairs` enabled to obtain the list_no
990988
// and offset into `codes` for reconstruction
991989
search_preassigned(
992990
n,
993991
x,
994992
k,
995-
idx,
996-
coarse_dis,
993+
idx.get(),
994+
coarse_dis.get(),
997995
distances,
998996
labels,
999997
true /* store_pairs */,
1000998
params);
1001-
for (idx_t i = 0; i < n; ++i) {
1002-
for (idx_t j = 0; j < k; ++j) {
1003-
idx_t ij = i * k + j;
1004-
idx_t key = labels[ij];
1005-
float* reconstructed = recons + ij * d;
1006-
if (key < 0) {
1007-
// Fill with NaNs
1008-
memset(reconstructed, -1, sizeof(*reconstructed) * d);
1009-
} else {
1010-
int list_no = lo_listno(key);
1011-
int offset = lo_offset(key);
999+
#pragma omp parallel for if (n * k > 1000)
1000+
for (idx_t ij = 0; ij < n * k; ij++) {
1001+
idx_t key = labels[ij];
1002+
float* reconstructed = recons + ij * d;
1003+
if (key < 0) {
1004+
// Fill with NaNs
1005+
memset(reconstructed, -1, sizeof(*reconstructed) * d);
1006+
} else {
1007+
int list_no = lo_listno(key);
1008+
int offset = lo_offset(key);
1009+
1010+
// Update label to the actual id
1011+
labels[ij] = invlists->get_single_id(list_no, offset);
1012+
1013+
reconstruct_from_offset(list_no, offset, reconstructed);
1014+
}
1015+
}
1016+
}
1017+
1018+
void IndexIVF::search_and_return_codes(
1019+
idx_t n,
1020+
const float* x,
1021+
idx_t k,
1022+
float* distances,
1023+
idx_t* labels,
1024+
uint8_t* codes,
1025+
bool include_listno,
1026+
const SearchParameters* params_in) const {
1027+
const IVFSearchParameters* params = nullptr;
1028+
if (params_in) {
1029+
params = dynamic_cast<const IVFSearchParameters*>(params_in);
1030+
FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type");
1031+
}
1032+
const size_t nprobe =
1033+
std::min(nlist, params ? params->nprobe : this->nprobe);
1034+
FAISS_THROW_IF_NOT(nprobe > 0);
1035+
1036+
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
1037+
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
1038+
1039+
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
1040+
1041+
invlists->prefetch_lists(idx.get(), n * nprobe);
1042+
1043+
// search_preassigned() with `store_pairs` enabled to obtain the list_no
1044+
// and offset of each result, used below to fetch its stored code and id
1045+
search_preassigned(
1046+
n,
1047+
x,
1048+
k,
1049+
idx.get(),
1050+
coarse_dis.get(),
1051+
distances,
1052+
labels,
1053+
true /* store_pairs */,
1054+
params);
1055+
1056+
size_t code_size_1 = code_size;
1057+
if (include_listno) {
1058+
code_size_1 += coarse_code_size();
1059+
}
1060+
1061+
#pragma omp parallel for if (n * k > 1000)
1062+
for (idx_t ij = 0; ij < n * k; ij++) {
1063+
idx_t key = labels[ij];
1064+
uint8_t* code1 = codes + ij * code_size_1;
1065+
1066+
if (key < 0) {
1067+
// Fill with 0xff
1068+
memset(code1, -1, code_size_1);
1069+
} else {
1070+
int list_no = lo_listno(key);
1071+
int offset = lo_offset(key);
1072+
const uint8_t* cc = invlists->get_single_code(list_no, offset);
10121073

1013-
// Update label to the actual id
1014-
labels[ij] = invlists->get_single_id(list_no, offset);
1074+
labels[ij] = invlists->get_single_id(list_no, offset);
10151075

1016-
reconstruct_from_offset(list_no, offset, reconstructed);
1076+
if (include_listno) {
1077+
encode_listno(list_no, code1);
1078+
code1 += code_size_1 - code_size;
10171079
}
1080+
memcpy(code1, cc, code_size);
10181081
}
10191082
}
10201083
}

faiss/IndexIVF.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,24 @@ struct IndexIVF : Index, IndexIVFInterface {
357357
float* recons,
358358
const SearchParameters* params = nullptr) const override;
359359

360+
/** Similar to search, but also returns the codes corresponding to the
361+
* stored vectors for the search results.
362+
*
363+
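* @param codes output codes of the search results, size (n, k, code_size), or (n, k, code_size + coarse_code_size()) if include_listno is set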
364+
* @param include_listno
365+
* include the list ids in the code (in this case add
366+
* coarse_code_size() bytes to the code size)
367+
*/
368+
void search_and_return_codes(
369+
idx_t n,
370+
const float* x,
371+
idx_t k,
372+
float* distances,
373+
idx_t* labels,
374+
uint8_t* recons,
375+
bool include_listno = false,
376+
const SearchParameters* params = nullptr) const;
377+
360378
/** Reconstruct a vector given the location in terms of (inv list index +
361379
* inv list offset) instead of the id.
362380
*

faiss/IndexIVFAdditiveQuantizer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ struct AQInvertedListScanner : InvertedListScanner {
149149
const float* q;
150150
/// following codes come from this inverted list
151151
void set_list(idx_t list_no, float coarse_dis) override {
152+
this->list_no = list_no;
152153
if (ia.metric_type == METRIC_L2 && ia.by_residual) {
153154
ia.quantizer->compute_residual(q0, tmp.data(), list_no);
154155
q = tmp.data();

faiss/impl/AdditiveQuantizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ void AdditiveQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
261261
is_trained, "The additive quantizer is not trained yet.");
262262

263263
// standard additive quantizer decoding
264-
#pragma omp parallel for if (n > 1000)
264+
#pragma omp parallel for if (n > 100)
265265
for (int64_t i = 0; i < n; i++) {
266266
BitstringReader bsr(code + i * code_size, code_size);
267267
float* xi = x + i * d;

faiss/impl/ProductQuantizer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,8 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const {
306306
}
307307

308308
void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
309-
for (size_t i = 0; i < n; i++) {
309+
#pragma omp parallel for if (n > 100)
310+
for (int64_t i = 0; i < n; i++) {
310311
this->decode(code + code_size * i, x + d * i);
311312
}
312313
}

faiss/python/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
from faiss.extra_wrappers import kmin, kmax, pairwise_distances, rand, randint, \
2323
lrand, randn, rand_smooth_vectors, eval_intersection, normalize_L2, \
2424
ResultHeap, knn, Kmeans, checksum, matrix_bucket_sort_inplace, bucket_sort, \
25-
merge_knn_results, MapInt64ToInt64, knn_hamming
25+
merge_knn_results, MapInt64ToInt64, knn_hamming, \
26+
pack_bitstrings, unpack_bitstrings
2627

2728

2829
__version__ = "%d.%d.%d" % (FAISS_VERSION_MAJOR,

faiss/python/class_wrappers.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,74 @@ def replacement_search_and_reconstruct(self, x, k, *, params=None, D=None, I=Non
402402
)
403403
return D, I, R
404404

405+
def replacement_search_and_return_codes(
406+
self, x, k, *,
407+
include_listnos=False, params=None, D=None, I=None, codes=None):
408+
"""Find the k nearest neighbors of the set of vectors x in the index,
409+
and return the codes stored for these vectors
410+
411+
Parameters
412+
----------
413+
x : array_like
414+
Query vectors, shape (n, d) where d is appropriate for the index.
415+
`dtype` must be float32.
416+
k : int
417+
Number of nearest neighbors.
418+
params : SearchParameters
419+
Search parameters of the current search (overrides the class-level params)
420+
include_listnos : bool, optional
421+
whether to include the list ids in the first bytes of each code
422+
D : array_like, optional
423+
Distance array to store the result.
424+
I : array_like, optional
425+
Labels array to store the result.
426+
codes : array_like, optional
427+
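Codes array to store the result, dtype uint8, shape (n, k, code_size), or (n, k, code_size + coarse_code_size()) when include_listnos is set.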
428+
429+
Returns
430+
-------
431+
D : array_like
432+
Distances of the nearest neighbors, shape (n, k). When not enough results are found
433+
the distance is set to +Inf or -Inf.
434+
I : array_like
435+
Labels of the nearest neighbors, shape (n, k). When not enough results are found,
436+
the label is set to -1
437+
codes : array_like
438+
Codes stored for the nearest neighbors, dtype uint8, shape (n, k, code_size), or (n, k, code_size + coarse_code_size()) when include_listnos is set.
439+
"""
440+
n, d = x.shape
441+
assert d == self.d
442+
x = np.ascontiguousarray(x, dtype='float32')
443+
444+
assert k > 0
445+
446+
if D is None:
447+
D = np.empty((n, k), dtype=np.float32)
448+
else:
449+
assert D.shape == (n, k)
450+
451+
if I is None:
452+
I = np.empty((n, k), dtype=np.int64)
453+
else:
454+
assert I.shape == (n, k)
455+
456+
code_size_1 = self.code_size
457+
if include_listnos:
458+
code_size_1 += self.coarse_code_size()
459+
460+
if codes is None:
461+
codes = np.empty((n, k, code_size_1), dtype=np.uint8)
462+
else:
463+
assert codes.shape == (n, k, code_size_1)
464+
465+
self.search_and_return_codes_c(
466+
n, swig_ptr(x),
467+
k, swig_ptr(D),
468+
swig_ptr(I), swig_ptr(codes), include_listnos,
469+
params
470+
)
471+
return D, I, codes
472+
405473
def replacement_remove_ids(self, x):
406474
"""Remove some ids from the index.
407475
This is a O(ntotal) operation by default, so could be expensive.
@@ -734,6 +802,8 @@ def replacement_permute_entries(self, perm):
734802
ignore_missing=True)
735803
replace_method(the_class, 'search_and_reconstruct',
736804
replacement_search_and_reconstruct, ignore_missing=True)
805+
replace_method(the_class, 'search_and_return_codes',
806+
replacement_search_and_return_codes, ignore_missing=True)
737807

738808
# these ones are IVF-specific
739809
replace_method(the_class, 'search_preassigned',

0 commit comments

Comments
 (0)