diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index ce1f5ddf1a..c581de791f 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -805,8 +805,9 @@ struct RangeSearchResults { * The scanning functions call their favorite precompute_* * function to precompute the tables they need. *****************************************************/ -template +template struct IVFPQScannerT : QueryTables { + using PQDecoder = typename PQCodeDistance::PQDecoder; const uint8_t* list_codes; const IDType* list_ids; size_t list_size; @@ -882,7 +883,7 @@ struct IVFPQScannerT : QueryTables { float distance_1 = 0; float distance_2 = 0; float distance_3 = 0; - distance_four_codes( + PQCodeDistance::distance_four_codes( pq.M, pq.nbits, sim_table, @@ -905,7 +906,7 @@ struct IVFPQScannerT : QueryTables { if (counter >= 1) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -914,7 +915,7 @@ struct IVFPQScannerT : QueryTables { } if (counter >= 2) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -923,7 +924,7 @@ struct IVFPQScannerT : QueryTables { } if (counter >= 3) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1089,7 +1090,7 @@ struct IVFPQScannerT : QueryTables { float distance_1 = dis0; float distance_2 = dis0; float distance_3 = dis0; - distance_four_codes( + PQCodeDistance::distance_four_codes( pq.M, pq.nbits, sim_table, @@ -1120,7 +1121,7 @@ struct IVFPQScannerT : QueryTables { n_hamming_pass++; float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1140,7 +1141,7 @@ struct IVFPQScannerT : QueryTables { n_hamming_pass++; float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1185,8 +1186,8 @@ struct IVFPQScannerT : QueryTables { * * use_sel: store or ignore 
the IDSelector */ -template -struct IVFPQScanner : IVFPQScannerT, +template +struct IVFPQScanner : IVFPQScannerT, InvertedListScanner { int precompute_mode; const IDSelector* sel; @@ -1196,7 +1197,7 @@ struct IVFPQScanner : IVFPQScannerT, bool store_pairs, int precompute_mode, const IDSelector* sel) - : IVFPQScannerT(ivfpq, nullptr), + : IVFPQScannerT(ivfpq, nullptr), precompute_mode(precompute_mode), sel(sel) { this->store_pairs = store_pairs; @@ -1215,7 +1216,7 @@ struct IVFPQScanner : IVFPQScannerT, float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); float dis = this->dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( this->pq.M, this->pq.nbits, this->sim_table, code); return dis; } @@ -1279,7 +1280,9 @@ struct IVFPQScanner : IVFPQScannerT, } }; -template +/** follow 3 stages of template dispatching */ + +template InvertedListScanner* get_InvertedListScanner1( const IndexIVFPQ& index, bool store_pairs, @@ -1288,32 +1291,47 @@ InvertedListScanner* get_InvertedListScanner1( return new IVFPQScanner< METRIC_INNER_PRODUCT, CMin, - PQDecoder, + PQCodeDistance, use_sel>(index, store_pairs, 2, sel); } else if (index.metric_type == METRIC_L2) { return new IVFPQScanner< METRIC_L2, CMax, - PQDecoder, + PQCodeDistance, use_sel>(index, store_pairs, 2, sel); } return nullptr; } -template +template InvertedListScanner* get_InvertedListScanner2( const IndexIVFPQ& index, bool store_pairs, const IDSelector* sel) { if (index.pq.nbits == 8) { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); } else if (index.pq.nbits == 16) { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); + } else { + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); + } +} + +template +InvertedListScanner* 
get_InvertedListScanner3( + const IndexIVFPQ& index, + bool store_pairs, + const IDSelector* sel) { + if (sel) { + return get_InvertedListScanner2(index, store_pairs, sel); } else { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner2(index, store_pairs, sel); } } @@ -1323,11 +1341,7 @@ InvertedListScanner* IndexIVFPQ::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { - if (sel) { - return get_InvertedListScanner2(*this, store_pairs, sel); - } else { - return get_InvertedListScanner2(*this, store_pairs, sel); - } + DISPATCH_SIMDLevel(get_InvertedListScanner3, *this, store_pairs, sel); return nullptr; } diff --git a/faiss/IndexPQ.cpp b/faiss/IndexPQ.cpp index 8193e78b17..99347f9c8c 100644 --- a/faiss/IndexPQ.cpp +++ b/faiss/IndexPQ.cpp @@ -73,7 +73,7 @@ void IndexPQ::train(idx_t n, const float* x) { namespace { -template +template struct PQDistanceComputer : FlatCodesDistanceComputer { size_t d; MetricType metric; @@ -86,7 +86,7 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { float distance_to_code(const uint8_t* code) final { ndis++; - float dis = distance_single_code( + float dis = PQCodeDistance::distance_single_code( pq.M, pq.nbits, precomputed_table.data(), code); return dis; } @@ -95,8 +95,10 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { FAISS_THROW_IF_NOT(sdc); const float* sdci = sdc; float accu = 0; - PQDecoder codei(codes + i * code_size, pq.nbits); - PQDecoder codej(codes + j * code_size, pq.nbits); + typename PQCodeDistance::PQDecoder codei( + codes + i * code_size, pq.nbits); + typename PQCodeDistance::PQDecoder codej( + codes + j * code_size, pq.nbits); for (int l = 0; l < pq.M; l++) { accu += sdci[codei.decode() + (codej.decode() << codei.nbits)]; @@ -132,16 +134,24 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { } }; +template +FlatCodesDistanceComputer* get_FlatCodesDistanceComputer1( + const IndexPQ& index) { + int 
nbits = index.pq.nbits; + if (nbits == 8) { + return new PQDistanceComputer>(index); + } else if (nbits == 16) { + return new PQDistanceComputer>(index); + } else { + return new PQDistanceComputer>( + index); + } +} + } // namespace FlatCodesDistanceComputer* IndexPQ::get_FlatCodesDistanceComputer() const { - if (pq.nbits == 8) { - return new PQDistanceComputer(*this); - } else if (pq.nbits == 16) { - return new PQDistanceComputer(*this); - } else { - return new PQDistanceComputer(*this); - } + DISPATCH_SIMDLevel(get_FlatCodesDistanceComputer1, *this); } /***************************************** diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index e05f3a1f25..adfa9ce4d5 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -5,1965 +5,71 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - -#include - -#include -#include - -#include -#include - -#ifdef __SSE__ -#include -#endif - -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { - -/******************************************************************* - * ScalarQuantizer implementation - * - * The main source of complexity is to support combinations of 4 - * variants without incurring runtime tests or virtual function calls: - * - * - 4 / 8 bits per code component - * - uniform / non-uniform - * - IP / L2 distance search - * - scalar / AVX distance computation - * - * The appropriate Quantizer object is returned via select_quantizer - * that hides the template mess. 
- ********************************************************************/ - -#if defined(__AVX512F__) && defined(__F16C__) -#define USE_AVX512_F16C -#elif defined(__AVX2__) -#ifdef __F16C__ -#define USE_F16C -#else -#warning \ - "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" -#endif -#endif - -#if defined(__aarch64__) -#if defined(__GNUC__) && __GNUC__ < 8 -#warning \ - "Cannot enable NEON optimizations in scalar quantizer if the compiler is GCC<8" -#else -#define USE_NEON -#endif -#endif - -namespace { - -typedef ScalarQuantizer::QuantizerType QuantizerType; -typedef ScalarQuantizer::RangeStat RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i] = (int)(255 * x); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (code[i] + 0.5f) / 255.0f; - } - -#if defined(__AVX512F__) - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i)); - const __m512i i32 = _mm512_cvtepu8_epi32(c16); - const __m512 f16 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } -#elif defined(__AVX2__) - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - const uint64_t c8 = *(uint64_t*)(code + i); - - const __m128i i8 = _mm_set1_epi64x(c8); - const __m256i i32 = _mm256_cvtepu8_epi32(i8); - const __m256 f8 = _mm256_cvtepi32_ps(i32); - const __m256 half_one_255 = 
_mm256_set1_ps(0.5f / 255.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -struct Codec4bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } - -#if defined(__AVX512F__) - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - uint64_t c8 = *(uint64_t*)(code + (i >> 1)); - uint64_t mask = 0x0f0f0f0f0f0f0f0f; - uint64_t c8ev = c8 & mask; - uint64_t c8od = (c8 >> 4) & mask; - - __m128i c16 = - _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); - __m256i c8lo = _mm256_cvtepu8_epi32(c16); - __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); - __m512i i16 = _mm512_castsi256_si512(c8lo); - i16 = _mm512_inserti32x8(i16, c8hi, 1); - __m512 f16 = _mm512_cvtepi32_ps(i16); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } -#elif defined(__AVX2__) - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - __m128i c4lo = 
_mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256(c4lo); - i8 = _mm256_insertf128_si256(i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps(i8); - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_255 = _mm256_set1_ps(1.f / 15.f); - return _mm256_mul_ps(f8, one_255); - } -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -struct Codec6bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - uint8_t bits; - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } - -#if defined(__AVX512F__) - - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - // pure AVX512 implementation (not necessarily the fastest). 
- // see: - // https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h - - // clang-format off - - // 16 components, 16x6 bit=12 bytes - const __m128i bit_6v = - _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); - const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); - - // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - // 00 01 02 03 - const __m256i shuffle_mask = _mm256_setr_epi16( - 0xFF00, 0x0100, 0x0201, 0xFF02, - 0xFF03, 0x0403, 0x0504, 0xFF05, - 0xFF06, 0x0706, 0x0807, 0xFF08, - 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); - const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); - - // 0: xxxxxxxx xx543210 - // 1: xxxx5432 10xxxxxx - // 2: xxxxxx54 3210xxxx - // 3: xxxxxxxx 543210xx - const __m256i shift_right_v = _mm256_setr_epi16( - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U); - __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); - - // remove unneeded bits - shuffled_shifted = - _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); - - // scale - const __m512 f8 = - _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - - // clang-format on - } - -#elif defined(__AVX2__) - - /* Load 6 bytes that represent 8 6-bit values, return them as a - * 8*32 bit vector register */ - static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { - const __m128i perm = _mm_set_epi8( - -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0); - const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); - - // load 6 bytes - __m128i c1 = - _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); - - // put in 8 * 32 bits - __m128i c2 = _mm_shuffle_epi8(c1, perm); - __m256i c3 = _mm256_cvtepi16_epi32(c2); - - // 
shift and mask out useless bits - __m256i c4 = _mm256_srlv_epi32(c3, shifts); - __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); - return c5; - } - - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here - // // for the reference, maybe, it becomes used oned day. - // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); - // const uint32_t* data32 = (const uint32_t*)data16; - // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); - // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); - // const __m128i i8 = _mm_set1_epi64x(vext); - // const __m256i i32 = _mm256_cvtepi8_epi32(i8); - // const __m256 f8 = _mm256_cvtepi32_ps(i32); - // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - // return _mm256_fmadd_ps(f8, one_255, half_one_255); - - __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); - __m256 f8 = _mm256_cvtepi32_ps(i8); - // this could also be done with bit manipulations but it is - // not obviously faster - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } - -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -enum class QuantizerTemplateScaling { UNIFORM = 0, NON_UNIFORM = 1 }; - -template -struct 
QuantizerTemplate {}; - -template -struct QuantizerTemplate - : ScalarQuantizer::SQuantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -#if defined(__AVX512F__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)); - } -}; - -#elif defined(__AVX2__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - 
reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - return {vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff))}; - } -}; - -#endif - -template -struct QuantizerTemplate - : ScalarQuantizer::SQuantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff[i] != 0) { - xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin[i] + xi * vdiff[i]; - } -}; - -#if defined(__AVX512F__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, - _mm512_loadu_ps(this->vdiff + i), - _mm512_loadu_ps(this->vmin + i)); - } -}; - -#elif defined(__AVX2__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - 
reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, - _mm256_loadu_ps(this->vdiff + i), - _mm256_loadu_ps(this->vmin + i)); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - - float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); - float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - - return {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}; - } -}; - -#endif - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16 {}; - -template <> -struct QuantizerFP16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_fp16(((uint16_t*)code)[i]); - } -}; - -#if defined(USE_AVX512_F16C) - -template <> -struct QuantizerFP16<16> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i codei = 
_mm256_loadu_si256((const __m256i*)(code + 2 * i)); - return _mm512_cvtph_ps(codei); - } -}; - -#endif - -#if defined(USE_F16C) - -template <> -struct QuantizerFP16<8> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps(codei); - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct QuantizerFP16<8> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; - } -}; -#endif - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16 {}; - -template <> -struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_bf16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_bf16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_bf16(((uint16_t*)code)[i]); - } -}; - -#if defined(__AVX512F__) - -template <> -struct QuantizerBF16<16> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - FAISS_ALWAYS_INLINE __m512 - 
reconstruct_16_components(const uint8_t* code, int i) const { - __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); - code_512i = _mm512_slli_epi32(code_512i, 16); - return _mm512_castsi512_ps(code_512i); - } -}; - -#elif defined(__AVX2__) - -template <> -struct QuantizerBF16<8> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); - code_256i = _mm256_slli_epi32(code_256i, 16); - return _mm256_castsi256_ps(code_256i); - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct QuantizerBF16<8> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), - vreinterpretq_f32_u32( - vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; - } -}; -#endif - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect {}; - -template <> -struct Quantizer8bitDirect<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - Quantizer8bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - FAISS_ALWAYS_INLINE float 
reconstruct_component(const uint8_t* code, int i) - const { - return code[i]; - } -}; - -#if defined(__AVX512F__) - -template <> -struct Quantizer8bitDirect<16> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - return _mm512_cvtepi32_ps(y16); // 16 * float32 - } -}; - -#elif defined(__AVX2__) - -template <> -struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - return _mm256_cvtepi32_ps(y8); // 8 * float32 - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - // convert uint16 -> uint32 -> fp32 - return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))}; - } -}; - -#endif - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned {}; - -template <> -struct Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - 
Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)(x[i] + 128); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i] - 128; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i] - 128; - } -}; - -#if defined(__AVX512F__) - -template <> -struct Quantizer8bitDirectSigned<16> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - __m512i c16 = _mm512_set1_epi32(128); - __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes - return _mm512_cvtepi32_ps(z16); // 16 * float32 - } -}; - -#elif defined(__AVX2__) - -template <> -struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - __m256i c8 = _mm256_set1_epi32(128); - __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes - return _mm256_cvtepi32_ps(z8); // 8 * float32 - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - 
reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - float32x4_t z8_0 = vcvtq_f32_u32( - vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 - float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); - - // subtract 128 to convert into signed numbers - return {vsubq_f32(z8_0, vmovq_n_f32(128.0)), - vsubq_f32(z8_1, vmovq_n_f32(128.0))}; - } -}; - -#endif - -template -ScalarQuantizer::SQuantizer* select_quantizer_1( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate< - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16(d, trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16(d, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect(d, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Quantizer range training - */ - -static float sqr(float x) { - return x * x; -} - -void train_Uniform( - RangeStat 
rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained) { - trained.resize(2); - float& vmin = trained[0]; - float& vmax = trained[1]; - - if (rs == ScalarQuantizer::RS_minmax) { - vmin = HUGE_VAL; - vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - } - float vexp = (vmax - vmin) * rs_arg; - vmin -= vexp; - vmax += vexp; - } else if (rs == ScalarQuantizer::RS_meanstd) { - double sum = 0, sum2 = 0; - for (size_t i = 0; i < n; i++) { - sum += x[i]; - sum2 += x[i] * x[i]; - } - float mean = sum / n; - float var = sum2 / n - mean * mean; - float std = var <= 0 ? 1.0 : sqrt(var); - - vmin = mean - std * rs_arg; - vmax = mean + std * rs_arg; - } else if (rs == ScalarQuantizer::RS_quantiles) { - std::vector x_copy(n); - memcpy(x_copy.data(), x, n * sizeof(*x)); - // TODO just do a quickselect - std::sort(x_copy.begin(), x_copy.end()); - int o = int(rs_arg * n); - if (o < 0) - o = 0; - if (o > n - o) - o = n / 2; - vmin = x_copy[o]; - vmax = x_copy[n - 1 - o]; - - } else if (rs == ScalarQuantizer::RS_optim) { - float a, b; - float sx = 0; - { - vmin = HUGE_VAL, vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - sx += x[i]; - } - b = vmin; - a = (vmax - vmin) / (k - 1); - } - int verbose = false; - int niter = 2000; - float last_err = -1; - int iter_last_err = 0; - for (int it = 0; it < niter; it++) { - float sn = 0, sn2 = 0, sxn = 0, err1 = 0; - - for (idx_t i = 0; i < n; i++) { - float xi = x[i]; - float ni = floor((xi - b) / a + 0.5); - if (ni < 0) - ni = 0; - if (ni >= k) - ni = k - 1; - err1 += sqr(xi - (ni * a + b)); - sn += ni; - sn2 += ni * ni; - sxn += ni * xi; - } - - if (err1 == last_err) { - iter_last_err++; - if (iter_last_err == 16) - break; - } else { - last_err = err1; - iter_last_err = 0; - } - - float det = sqr(sn) - sn2 * n; - - b = (sn * sxn - sn2 * sx) / det; - a = (sn * sx - n * sxn) 
/ det; - if (verbose) { - printf("it %d, err1=%g \r", it, err1); - fflush(stdout); - } - } - if (verbose) - printf("\n"); - - vmin = b; - vmax = b + a * (k - 1); - - } else { - FAISS_THROW_MSG("Invalid qtype"); - } - vmax -= vmin; -} - -void train_NonUniform( - RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained) { - trained.resize(2 * d); - float* vmin = trained.data(); - float* vmax = trained.data() + d; - if (rs == ScalarQuantizer::RS_minmax) { - memcpy(vmin, x, sizeof(*x) * d); - memcpy(vmax, x, sizeof(*x) * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - if (xi[j] < vmin[j]) - vmin[j] = xi[j]; - if (xi[j] > vmax[j]) - vmax[j] = xi[j]; - } - } - float* vdiff = vmax; - for (size_t j = 0; j < d; j++) { - float vexp = (vmax[j] - vmin[j]) * rs_arg; - vmin[j] -= vexp; - vmax[j] += vexp; - vdiff[j] = vmax[j] - vmin[j]; - } - } else { - // transpose - std::vector xt(n * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - xt[j * n + i] = xi[j]; - } - } - std::vector trained_d(2); -#pragma omp parallel for - for (int j = 0; j < d; j++) { - train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); - vmin[j] = trained_d[0]; - vmax[j] = trained_d[1]; - } - } -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2 {}; - -template <> -struct SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -#if defined(__AVX512F__) - -template <> -struct SimilarityL2<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - __m512 tmp = _mm512_sub_ps(yiv, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x, __m512 y_2) { - __m512 tmp = _mm512_sub_ps(y_2, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - // performs better than dividing into _mm256 and adding - return _mm512_reduce_add_ps(accu16); - } -}; - -#elif defined(__AVX2__) - -template <> -struct SimilarityL2<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - __m256 tmp = _mm256_sub_ps(yiv, x); - accu8 
= _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) { - __m256 tmp = _mm256_sub_ps(y_2, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -#endif - -#ifdef USE_NEON -template <> -struct SimilarityL2<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - explicit SimilarityL2(const float* y) : y(y) {} - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x, - float32x4x2_t y) { - float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); - float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); - - float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); - float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); - return vgetq_lane_f32(sum2_0, 0) + 
vgetq_lane_f32(sum2_1, 0); - } -}; -#endif - -template -struct SimilarityIP {}; - -template <> -struct SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - accu += *yi++ * x; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - accu += x1 * x2; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -#if defined(__AVX512F__) - -template <> -struct SimilarityIP<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - accu16 = _mm512_fmadd_ps(yiv, x, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x1, __m512 x2) { - accu16 = _mm512_fmadd_ps(x1, x2, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - // performs better than dividing into _mm256 and adding - return _mm512_reduce_add_ps(accu16); - } -}; - -#elif defined(__AVX2__) - -template <> -struct SimilarityIP<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - accu8 = _mm256_fmadd_ps(yiv, x, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 
x2) { - accu8 = _mm256_fmadd_ps(x1, x2, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; -#endif - -#ifdef USE_NEON - -template <> -struct SimilarityIP<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; +#include - const float *y, *yi; +#include - explicit SimilarityIP(const float* y) : y(y) {} - float32x4x2_t accu8; +#include +#include - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; - yi = y; - } +#include - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; +#include - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - accu8 = {accu8_0, accu8_1}; - } +#include +#include +#include +#include +#include - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x1, - float32x4x2_t x2) { - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - accu8 = {accu8_0, accu8_1}; - } +/******************************************************************* + * ScalarQuantizer implementation + * + * The main source of complexity is to support combinations of 4 + * variants without incurring runtime tests or virtual function calls: + * + * - 4 / 6 / 8 bits per code component + * - uniform / non-uniform + * - IP / L2 distance search + * - scalar / SIMD distance computation + * + * The appropriate Quantizer object is returned via select_quantizer + * that hides the template mess. 
+ ********************************************************************/ - FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum = { - vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1])}; +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). + */ - float32x4x2_t sum2 = { - vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1])}; - return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); - } -}; -#endif +#include /******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons + * Quantizer: normalizes scalar vector components, then passes them + * through a codec *******************************************************************/ -template -struct DCTemplate : SQDistanceComputer {}; - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * 
code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#if defined(USE_AVX512_F16C) - -template -struct DCTemplate - : SQDistanceComputer { // Update to handle 16 lanes - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi = quant.reconstruct_16_components(code, i); - sim.add_16_components(xi); - } - return sim.result_16(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 x1 = quant.reconstruct_16_components(code1, i); - __m512 x2 = quant.reconstruct_16_components(code2, i); - sim.add_16_components_2(x1, x2); - } - return sim.result_16(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#elif defined(USE_F16C) - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - 
__m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t x1 = quant.reconstruct_8_components(code1, i); - float32x4x2_t x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; -#endif +#include /******************************************************************* - * DistanceComputerByte: computes distances in the integer domain + * Similarity: gets vector components and computes a similarity wrt. a + * query vector stored in the object. The data fields just encapsulate + * an accumulator. 
+ * DistanceComputer: combines a similarity and a quantizer to do + * code-to-vector or code-to-code comparisons *******************************************************************/ -template -struct DistanceComputerByte : SQDistanceComputer {}; - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#if defined(__AVX512F__) - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - __m512i accu = _mm512_setzero_si512(); - for (int i = 0; i < d; i += 32) { // Process 32 bytes at a time - __m512i c1 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code1 + i))); - __m512i c2 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code2 + i))); - __m512i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm512_madd_epi16(c1, c2); - } else { - __m512i diff = _mm512_sub_epi16(c1, c2); - prod32 = 
_mm512_madd_epi16(diff, diff); - } - accu = _mm512_add_epi32(accu, prod32); - } - // Horizontally add elements of accu - return _mm512_reduce_add_epi32(accu); - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#elif defined(__AVX2__) - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256(); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32(accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - return _mm_cvtsi128_si32(sum); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int 
compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#endif +#include /******************************************************************* - * select_distance_computer: runtime selection of template - * specialization + * InvertedListScanner: scans series of codes and keeps the best ones *******************************************************************/ -template -SQDistanceComputer* select_distance_computer( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate< - QuantizerTemplate< - 
Codec8bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit_uniform: - return new DCTemplate< - QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_8bit: - return new DCTemplate< - QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_6bit: - return new DCTemplate< - QuantizerTemplate< - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate< - QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_8bit_direct: -#if defined(__AVX512F__) - if (d % 32 == 0) { - return new DistanceComputerByte(d, trained); - } else -#elif defined(__AVX2__) - if (d % 16 == 0) { - return new DistanceComputerByte(d, trained); - } else -#endif - { - return new DCTemplate< - Quantizer8bitDirect, - Sim, - SIMDWIDTH>(d, trained); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate< - Quantizer8bitDirectSigned, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} +#include -} // anonymous namespace +namespace faiss { +using namespace scalar_quantizer; /******************************************************************* * ScalarQuantizer implementation @@ -2046,18 +152,19 @@ void ScalarQuantizer::train(size_t n, const float* x) { } ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - return select_quantizer_1<16>(qtype, d, trained); 
+ // here we can't just dispatch because the SIMD code works only on certain + // vector sizes +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return select_quantizer_1(qtype, d, trained); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - return select_quantizer_1<8>(qtype, d, trained); +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return select_quantizer_1(qtype, d, trained); } else #endif - { - return select_quantizer_1<1>(qtype, d, trained); - } + return select_quantizer_1(qtype, d, trained); } void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) @@ -2080,33 +187,20 @@ void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { SQDistanceComputer* ScalarQuantizer::get_distance_computer( MetricType metric) const { - FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - if (metric == METRIC_L2) { - return select_distance_computer>( - qtype, d, trained); - } else { - return select_distance_computer>( - qtype, d, trained); - } +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return select_distance_computer_1( + metric, qtype, d, trained); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, d, trained); - } else { - return select_distance_computer>(qtype, d, trained); - } +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return select_distance_computer_1( + metric, qtype, d, trained); } else #endif - { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, d, trained); - } else { - return select_distance_computer>(qtype, d, trained); - } - } + return select_distance_computer_1( + metric, qtype, d, trained); } 
/******************************************************************* @@ -2116,366 +210,26 @@ SQDistanceComputer* ScalarQuantizer::get_distance_computer( * IndexScalarQuantizer as well. ********************************************************************/ -namespace { - -template -struct IVFSQScannerIP : InvertedListScanner { - DCClass dc; - bool by_residual; - - float accu0; /// added to all distances - - IVFSQScannerIP( - int d, - const std::vector& trained, - size_t code_size, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), by_residual(by_residual), accu0(0) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - this->keep_max = true; - } - - void set_query(const float* query) override { - dc.set_query(query); - } - - void set_list(idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? coarse_dis : 0; - } - - float distance_to_code(const uint8_t* code) const final { - return accu0 + dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float accu = accu0 + dc.query_to_code(codes); - - if (accu > simi[0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - minheap_replace_top(k, simi, idxi, accu, id); - nup++; - } - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float accu = accu0 + dc.query_to_code(codes); - if (accu > radius) { - int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; - res.add(accu, id); - } - } - } -}; - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ -template -struct IVFSQScannerL2 : InvertedListScanner { - DCClass dc; - - bool by_residual; - const Index* quantizer; - const float* x; /// current query - - std::vector tmp; - - IVFSQScannerL2( - int d, - const std::vector& trained, - size_t code_size, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), - by_residual(by_residual), - quantizer(quantizer), - x(nullptr), - tmp(d) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - } - - void set_query(const float* query) override { - x = query; - if (!quantizer) { - dc.set_query(query); - } - } - - void set_list(idx_t list_no, float /*coarse_dis*/) override { - this->list_no = list_no; - if (by_residual) { - // shift of x_in wrt centroid - quantizer->compute_residual(x, tmp.data(), list_no); - dc.set_query(tmp.data()); - } else { - dc.set_query(x); - } - } - - float distance_to_code(const uint8_t* code) const final { - return dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float dis = dc.query_to_code(codes); - - if (dis < simi[0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? 
ids[j] : j)) { - continue; - } - - float dis = dc.query_to_code(codes); - if (dis < radius) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - res.add(dis, id); - } - } - } -}; - -template -InvertedListScanner* sel3_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2( - sq->d, - sq->trained, - sq->code_size, - quantizer, - store_pairs, - sel, - r); - } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP( - sq->d, sq->trained, sq->code_size, store_pairs, sel, r); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (sel) { - if (store_pairs) { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } -} - -template -InvertedListScanner* sel12_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate; - using DCClass = DCTemplate; - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return sel12_InvertedListScanner< - Similarity, - Codec8bit, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, 
r); - case ScalarQuantizer::QT_4bit_uniform: - return sel12_InvertedListScanner< - Similarity, - Codec4bit, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit: - return sel12_InvertedListScanner< - Similarity, - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit: - return sel12_InvertedListScanner< - Similarity, - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_6bit: - return sel12_InvertedListScanner< - Similarity, - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_fp16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_bf16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit_direct: -#if defined(__AVX512F__) - if (sq->d % 32 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else -#elif defined(__AVX2__) - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else -#endif - { - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == 
METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} // anonymous namespace - InvertedListScanner* ScalarQuantizer::select_InvertedListScanner( MetricType mt, const Index* quantizer, bool store_pairs, const IDSelector* sel, bool by_residual) const { -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - return sel0_InvertedListScanner<16>( +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - return sel0_InvertedListScanner<8>( +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); } else #endif - { - return sel0_InvertedListScanner<1>( + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); - } } } // namespace faiss diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h index c1f4f98f63..279938443a 100644 --- a/faiss/impl/ScalarQuantizer.h +++ b/faiss/impl/ScalarQuantizer.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #pragma once #include diff --git a/faiss/impl/code_distance/code_distance-avx2.cpp b/faiss/impl/code_distance/code_distance-avx2.cpp new file mode 100644 index 0000000000..b140b9ece0 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-avx2.cpp @@ -0,0 +1,486 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 +#if defined(__GNUC__) && __GNUC__ < 9 +#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) +#endif + +namespace { + +inline float horizontal_sum(const __m128 v) { + const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(v, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); +} + +// Computes a horizontal sum over an __m256 register +inline float horizontal_sum(const __m256 v) { + const __m128 v0 = + _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + return horizontal_sum(v0); +} + +// processes a single code for M=4, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m128 partialSum; + + // load 4 uint8 values + const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes a single code for M=8, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const 
float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum; + + // load 8 uint8 values + const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes four codes for M=4, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m128 partialSums[N]; + + // load 4 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0)); + mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1)); + mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2)); + mm1[3] = _mm_cvtsi32_si128(*((const int32_t*)code3)); + + for (intptr_t 
j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 4 values, similar to 4 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +// processes four codes for M=8, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + + // load 8 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si64((const __m128i_u*)code0); + mm1[1] = _mm_loadu_si64((const __m128i_u*)code1); + mm1[2] = _mm_loadu_si64((const __m128i_u*)code2); + mm1[3] = _mm_loadu_si64((const __m128i_u*)code3); + + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 
collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +} // namespace + +namespace faiss { + +template <> +struct PQCodeDistance { + float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + if (M == 4) { + return distance_single_code_avx2_pqdecoder8_m4(sim_table, code); + } + if (M == 8) { + return distance_single_code_avx2_pqdecoder8_m8(sim_table, code); + } + + float result = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum = _mm256_setzero_ps(); + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + const __m128i mm1 = + _mm_loadu_si128((const __m128i_u*)(code + m)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1, 
_mm_setzero_si128()); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + } + + // horizontal sum for partialSum + result += horizontal_sum(partialSum); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder(code + m, nbits); + + for (; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + } + + return result; + } + + // Combines 4 operations of distance_single_code() + void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + if (M == 4) { + distance_four_codes_avx2_pqdecoder8_m4( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + if (M == 8) { + distance_four_codes_avx2_pqdecoder8_m8( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + 
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm256_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + tab += ksub * 8; + + // process next 8 codes + for (intptr_t j = 0; j < N; j++) { + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); + + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + + tab += ksub * 8; + } + + // horizontal sum for partialSum + result0 += horizontal_sum(partialSums[0]); + result1 += horizontal_sum(partialSums[1]); + result2 += horizontal_sum(partialSums[2]); + result3 += horizontal_sum(partialSums[3]); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 
decoder0(code0 + m, nbits); + PQDecoder8 decoder1(code1 + m, nbits); + PQDecoder8 decoder2(code2 + m, nbits); + PQDecoder8 decoder3(code3 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } + } +}; + +// explicit template instantiations +// template struct PQCodeDistance; + +// these two will automatically use the generic implementation +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-avx2.h b/faiss/impl/code_distance/code_distance-avx2.h deleted file mode 100644 index 53380b6e46..0000000000 --- a/faiss/impl/code_distance/code_distance-avx2.h +++ /dev/null @@ -1,534 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree.
- */ - -#pragma once - -#ifdef __AVX2__ - -#include - -#include - -#include -#include - -// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 -#if defined(__GNUC__) && __GNUC__ < 9 -#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) -#endif - -namespace { - -inline float horizontal_sum(const __m128 v) { - const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(v, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); -} - -// Computes a horizontal sum over an __m256 register -inline float horizontal_sum(const __m256 v) { - const __m128 v0 = - _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); - return horizontal_sum(v0); -} - -// processes a single code for M=4, ksub=256, nbits=8 -float inline distance_single_code_avx2_pqdecoder8_m4( - // precomputed distances, layout (4, 256) - const float* sim_table, - const uint8_t* code) { - float result = 0; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - const __m128i vksub = _mm_set1_epi32(ksub); - __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); - offsets_0 = _mm_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m128 partialSum; - - // load 4 uint8 values - const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code)); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m128i idx1 = _mm_cvtepu8_epi32(mm1); - - // add offsets - const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m128 collected = - _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSum = collected; - } - - // horizontal sum for partialSum - result = horizontal_sum(partialSum); - return result; -} - -// processes a single code for M=8, ksub=256, nbits=8 -float inline distance_single_code_avx2_pqdecoder8_m8( - // 
precomputed distances, layout (8, 256) - const float* sim_table, - const uint8_t* code) { - float result = 0; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSum; - - // load 8 uint8 values - const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); - - // add offsets - const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = - _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSum = collected; - } - - // horizontal sum for partialSum - result = horizontal_sum(partialSum); - return result; -} - -// processes four codes for M=4, ksub=256, nbits=8 -inline void distance_four_codes_avx2_pqdecoder8_m4( - // precomputed distances, layout (4, 256) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - constexpr intptr_t N = 4; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - // process 8 values - const __m128i vksub = _mm_set1_epi32(ksub); - __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); - offsets_0 = _mm_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m128 partialSums[N]; - - // load 4 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0)); - mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1)); - mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2)); - mm1[3] = 
_mm_cvtsi32_si128(*((const int32_t*)code3)); - - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); - - // gather 4 values, similar to 4 operations of tab[idx] - __m128 collected = - _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = collected; - } - - // horizontal sum for partialSum - result0 = horizontal_sum(partialSums[0]); - result1 = horizontal_sum(partialSums[1]); - result2 = horizontal_sum(partialSums[2]); - result3 = horizontal_sum(partialSums[3]); -} - -// processes four codes for M=8, ksub=256, nbits=8 -inline void distance_four_codes_avx2_pqdecoder8_m8( - // precomputed distances, layout (8, 256) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - constexpr intptr_t N = 4; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - // process 8 values - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSums[N]; - - // load 8 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si64((const __m128i_u*)code0); - mm1[1] = _mm_loadu_si64((const __m128i_u*)code1); - mm1[2] = _mm_loadu_si64((const __m128i_u*)code2); - mm1[3] = _mm_loadu_si64((const __m128i_u*)code3); - - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); - - 
// gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = - _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = collected; - } - - // horizontal sum for partialSum - result0 = horizontal_sum(partialSums[0]); - result1 = horizontal_sum(partialSums[1]); - result2 = horizontal_sum(partialSums[2]); - result3 = horizontal_sum(partialSums[3]); -} - -} // namespace - -namespace faiss { - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - // default implementation - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - if (M == 4) { - return distance_single_code_avx2_pqdecoder8_m4(sim_table, code); - } - if (M == 8) { - return distance_single_code_avx2_pqdecoder8_m8(sim_table, code); - } - - float result = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSum = _mm256_setzero_ps(); - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - const __m128i mm1 = _mm_loadu_si128((const __m128i_u*)(code + m)); - { - // convert uint8 values (low part of __m128i) to int32 - // values - 
const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - - // move high 8 uint8 to low ones - const __m128i mm2 = _mm_unpackhi_epi64(mm1, _mm_setzero_si128()); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - } - - // horizontal sum for partialSum - result += horizontal_sum(partialSum); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder(code + m, nbits); - - for (; m < M; m++) { - result += tab[decoder.decode()]; - tab += ksub; - } - } - - return result; -} - -template -typename std::enable_if::value, void>:: - type - distance_four_codes_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -// Combines 4 operations of distance_single_code() -template -typename 
std::enable_if::value, void>::type -distance_four_codes_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - if (M == 4) { - distance_four_codes_avx2_pqdecoder8_m4( - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); - return; - } - if (M == 8) { - distance_four_codes_avx2_pqdecoder8_m8( - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); - return; - } - - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 4; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = _mm256_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); - mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); - mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, 
offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm256_add_ps(partialSums[j], collected); - } - tab += ksub * 8; - - // process next 8 codes - for (intptr_t j = 0; j < N; j++) { - // move high 8 uint8 to low ones - const __m128i mm2 = - _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); - - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm256_add_ps(partialSums[j], collected); - } - - tab += ksub * 8; - } - - // horizontal sum for partialSum - result0 += horizontal_sum(partialSums[0]); - result1 += horizontal_sum(partialSums[1]); - result2 += horizontal_sum(partialSums[2]); - result3 += horizontal_sum(partialSums[3]); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - PQDecoder8 decoder1(code1 + m, nbits); - PQDecoder8 decoder2(code2 + m, nbits); - PQDecoder8 decoder3(code3 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } - } -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/code_distance/code_distance-avx512.cpp b/faiss/impl/code_distance/code_distance-avx512.cpp new file mode 100644 index 0000000000..891db07975 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-avx512.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +// According to experiments, the AVX-512 version may be SLOWER than +// the AVX2 version, which is somewhat unexpected. +// This version is not used for now, but it may be used later. +// +// TODO: test for AMD CPUs. + +namespace faiss { + +template <> +struct PQCodeDistance { + float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code0) { + float result0 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 1; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m512i vksub = _mm512_set1_epi32(ksub); + __m512i offsets_0 = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m512 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm512_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m512i indices_to_read_from = + _mm512_add_epi32(idx1, offsets_0); + + // gather 16 values, similar to 16 operations of tab[idx] + __m512 collected = _mm512_i32gather_ps( + indices_to_read_from, tab, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm512_add_ps(partialSums[j], collected); + } + tab += ksub * 16; + } + + // horizontal sum for partialSum + result0 += _mm512_reduce_add_ps(partialSums[0]); + } + + // 
+ if (m < M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + tab += ksub; + } + } + + return result0; + } + + void distance_four_codes_avx512( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m512i vksub = _mm512_set1_epi32(ksub); + __m512i offsets_0 = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m512 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm512_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m512i indices_to_read_from = + _mm512_add_epi32(idx1, offsets_0); + + // gather 16 values, similar to 16 operations of tab[idx] + __m512 collected = _mm512_i32gather_ps( + indices_to_read_from, tab, sizeof(float)); + + // collect partial sums + partialSums[j] = 
_mm512_add_ps(partialSums[j], collected); + } + tab += ksub * 16; + } + + // horizontal sum for partialSum + result0 += _mm512_reduce_add_ps(partialSums[0]); + result1 += _mm512_reduce_add_ps(partialSums[1]); + result2 += _mm512_reduce_add_ps(partialSums[2]); + result3 += _mm512_reduce_add_ps(partialSums[3]); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, nbits); + PQDecoder8 decoder1(code1 + m, nbits); + PQDecoder8 decoder2(code2 + m, nbits); + PQDecoder8 decoder3(code3 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } + } +}; + +// explicit template instanciations +// template struct PQCodeDistance; + +// these two will automatically use the generic implementation +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-avx512.h b/faiss/impl/code_distance/code_distance-avx512.h deleted file mode 100644 index d05c41c19c..0000000000 --- a/faiss/impl/code_distance/code_distance-avx512.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#ifdef __AVX512F__ - -#include - -#include - -#include -#include - -namespace faiss { - -// According to experiments, the AVX-512 version may be SLOWER than -// the AVX2 version, which is somewhat unexpected. -// This version is not used for now, but it may be used later. -// -// TODO: test for AMD CPUs. 
- -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - // default implementation - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code0) { - float result0 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 1; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m512i vksub = _mm512_set1_epi32(ksub); - __m512i offsets_0 = _mm512_setr_epi32( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m512 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = _mm512_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m512i indices_to_read_from = - _mm512_add_epi32(idx1, offsets_0); - - // gather 16 values, similar to 16 operations of tab[idx] - __m512 collected = _mm512_i32gather_ps( - indices_to_read_from, tab, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm512_add_ps(partialSums[j], collected); - } - tab += ksub * 16; - } - - // horizontal sum for partialSum - result0 += _mm512_reduce_add_ps(partialSums[0]); - } - 
- // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - tab += ksub; - } - } - - return result0; -} - -template -typename std::enable_if::value, void>:: - type - distance_four_codes_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -// Combines 4 operations of distance_single_code() -template -typename std::enable_if::value, void>::type -distance_four_codes_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 4; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m512i vksub = _mm512_set1_epi32(ksub); - __m512i offsets_0 = _mm512_setr_epi32( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m512 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = 
_mm512_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); - mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); - mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m512i indices_to_read_from = - _mm512_add_epi32(idx1, offsets_0); - - // gather 16 values, similar to 16 operations of tab[idx] - __m512 collected = _mm512_i32gather_ps( - indices_to_read_from, tab, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm512_add_ps(partialSums[j], collected); - } - tab += ksub * 16; - } - - // horizontal sum for partialSum - result0 += _mm512_reduce_add_ps(partialSums[0]); - result1 += _mm512_reduce_add_ps(partialSums[1]); - result2 += _mm512_reduce_add_ps(partialSums[2]); - result3 += _mm512_reduce_add_ps(partialSums[3]); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - PQDecoder8 decoder1(code1 + m, nbits); - PQDecoder8 decoder2(code2 + m, nbits); - PQDecoder8 decoder3(code3 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } - } -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/code_distance/code_distance-generic.cpp b/faiss/impl/code_distance/code_distance-generic.cpp new file mode 100644 index 0000000000..ac9561ed93 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-generic.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +// explicit template instanciations +template struct PQCodeDistance; +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-generic.h b/faiss/impl/code_distance/code_distance-generic.h deleted file mode 100644 index c02551c415..0000000000 --- a/faiss/impl/code_distance/code_distance-generic.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace faiss { - -/// Returns the distance to a single code. -template -inline float distance_single_code_generic( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - PQDecoderT decoder(code, nbits); - const size_t ksub = 1 << nbits; - - const float* tab = sim_table; - float result = 0; - - for (size_t m = 0; m < M; m++) { - result += tab[decoder.decode()]; - tab += ksub; - } - - return result; -} - -/// Combines 4 operations of distance_single_code() -/// General-purpose version. 
-template -inline void distance_four_codes_generic( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - PQDecoderT decoder0(code0, nbits); - PQDecoderT decoder1(code1, nbits); - PQDecoderT decoder2(code2, nbits); - PQDecoderT decoder3(code3, nbits); - const size_t ksub = 1 << nbits; - - const float* tab = sim_table; - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - - for (size_t m = 0; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } -} - -} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-sve.h b/faiss/impl/code_distance/code_distance-sve.cpp similarity index 99% rename from faiss/impl/code_distance/code_distance-sve.h rename to faiss/impl/code_distance/code_distance-sve.cpp index 82f7746be6..9a941798ff 100644 --- a/faiss/impl/code_distance/code_distance-sve.h +++ b/faiss/impl/code_distance/code_distance-sve.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -#pragma once - #ifdef __ARM_FEATURE_SVE #include @@ -15,7 +13,7 @@ #include #include -#include +#include namespace faiss { diff --git a/faiss/impl/code_distance/code_distance.h b/faiss/impl/code_distance/code_distance.h index 8f29abda97..585890cb40 100644 --- a/faiss/impl/code_distance/code_distance.h +++ b/faiss/impl/code_distance/code_distance.h @@ -9,6 +9,10 @@ #include +#include + +#include + // This directory contains functions to compute a distance // from a given PQ code to a query vector, given that the // distances to a query vector for pq.M codebooks are precomputed. @@ -24,163 +28,76 @@ // why the names of the functions for custom implementations // have this _generic or _avx2 suffix. -#ifdef __AVX2__ - -#include - namespace faiss { -template -inline float distance_single_code( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_avx2(M, nbits, sim_table, code); -} - -template -inline void distance_four_codes( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_avx2( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} +// definiton and default implementation +template +struct PQCodeDistance { + using PQDecoder = PQDecoderT; + + /// Returns the distance to a single code. 
+ static float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + PQDecoderT decoder(code, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + float result = 0; + + for (size_t m = 0; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + + return result; + } + + /// Combines 4 operations of distance_single_code() + /// General-purpose version. + static void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + PQDecoderT decoder0(code0, nbits); + PQDecoderT decoder1(code1, nbits); + PQDecoderT decoder2(code2, nbits); + PQDecoderT decoder3(code3, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + + for (size_t m = 0; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } +}; } // namespace faiss - -#elif defined(__ARM_FEATURE_SVE) - -#include - -namespace faiss { - -template -inline float distance_single_code( - // the product quantizer - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_sve(M, nbits, sim_table, code); -} - -template -inline void 
distance_four_codes( - // the product quantizer - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_sve( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -} // namespace faiss - -#else - -#include - -namespace faiss { - -template -inline float distance_single_code( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -inline void distance_four_codes( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/scalar_quantizer/codecs.h b/faiss/impl/scalar_quantizer/codecs.h new file mode 100644 index 0000000000..b5c20d464b --- /dev/null +++ b/faiss/impl/scalar_quantizer/codecs.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). + */ + +template +struct Codec8bit {}; + +template +struct Codec4bit {}; + +template +struct Codec6bit {}; + +template <> +struct Codec8bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + code[i] = (int)(255 * x); + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + return (code[i] + 0.5f) / 255.0f; + } +}; +template <> +struct Codec4bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; + } +}; + +template <> +struct Codec6bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + int bits = (int)(x * 63.0); + code += (i >> 2) * 3; + switch (i & 3) { + case 0: + code[0] |= bits; + break; + case 1: + code[0] |= bits << 6; + code[1] |= bits >> 2; + break; + case 2: + code[1] |= bits << 4; + code[2] |= bits >> 4; + break; + case 3: + code[2] |= bits << 2; + break; + } + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + uint8_t bits; + code += (i >> 2) * 3; + switch (i & 3) { + case 0: + bits = code[0] & 0x3f; + break; + case 1: + bits = code[0] >> 6; + bits |= (code[1] & 0xf) << 2; + break; + case 2: + bits = code[1] >> 4; + bits |= (code[2] & 3) << 4; + break; + case 3: + bits = code[2] >> 2; + break; + } + 
return (bits + 0.5f) / 63.0f; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/distance_computers.h b/faiss/impl/scalar_quantizer/distance_computers.h new file mode 100644 index 0000000000..c50f443ec4 --- /dev/null +++ b/faiss/impl/scalar_quantizer/distance_computers.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * Similarities: accumulates the element-wise similarities + *******************************************************************/ + +template +struct SimilarityL2 {}; + +template +struct SimilarityIP {}; + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::NONE; + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + + /******* scalar accumulator *******/ + + float accu; + + FAISS_ALWAYS_INLINE void begin() { + accu = 0; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_component(float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; + } + + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; + } + + FAISS_ALWAYS_INLINE float result() { + return accu; + } +}; + +template <> +struct SimilarityIP { + static constexpr int simdwidth = 1; + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::NONE; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + FAISS_ALWAYS_INLINE void begin() { + accu = 0; + yi = y; + } + + FAISS_ALWAYS_INLINE void 
add_component(float x) { + accu += *yi++ * x; + } + + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { + accu += x1 * x2; + } + + FAISS_ALWAYS_INLINE float result() { + return accu; + } +}; + +/******************************************************************* + * Distance computers: compute distances between a query and a code + *******************************************************************/ + +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; + +template +struct DCTemplate : SQDistanceComputer {}; + +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float xi = quant.reconstruct_component(code, i); + sim.add_component(xi); + } + return sim.result(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float x1 = quant.reconstruct_component(code1, i); + float x2 = quant.reconstruct_component(code2, i); + sim.add_component_2(x1, x2); + } + return sim.result(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +/******************************************************************* + * DistanceComputerByte: computes distances in the integer domain + *******************************************************************/ + +template +struct DistanceComputerByte : SQDistanceComputer {}; + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector 
tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +/******************************************************************* + * select_distance_computer: runtime selection of template + * specialization + *******************************************************************/ + +template +SQDistanceComputer* select_distance_computer( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + constexpr SIMDLevel SL = Sim::SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + switch (qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return new DCTemplate, U, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_4bit_uniform: + return new DCTemplate, U, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_8bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_6bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_4bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_fp16: + return new DCTemplate, Sim, SL>(d, trained); + + case ScalarQuantizer::QT_bf16: + return 
new DCTemplate, Sim, SL>(d, trained); + + case ScalarQuantizer::QT_8bit_direct: + return new DCTemplate, Sim, SL>(d, trained); + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate, Sim, SL>( + d, trained); + } + FAISS_THROW_MSG("unknown qtype"); + return nullptr; +} + +template +SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained) { + if (metric_type == METRIC_L2) { + return select_distance_computer>(qtype, d, trained); + } else if (metric_type == METRIC_INNER_PRODUCT) { + return select_distance_computer>(qtype, d, trained); + } else { + FAISS_THROW_MSG("unsuppored metric type"); + } +} + +// prevent implicit instantiation of the template +extern template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +extern template SQDistanceComputer* select_distance_computer_1< + SIMDLevel::AVX512F>( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-avx2.cpp b/faiss/impl/scalar_quantizer/impl-avx2.cpp new file mode 100644 index 0000000000..a044ad9aaf --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-avx2.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef __F16C__ +#define USE_F16C +#else +#warning \ + "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" +#endif + +namespace faiss { + +namespace scalar_quantizer { + +/****************************************** Specialization of codecs */ + +template <> +struct Codec8bit : Codec8bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + const uint64_t c8 = *(uint64_t*)(code + i); + + const __m128i i8 = _mm_set1_epi64x(c8); + const __m256i i32 = _mm256_cvtepu8_epi32(i8); + const __m256 f8 = _mm256_cvtepi32_ps(i32); + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); + return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255)); + } +}; + +template <> +struct Codec4bit : Codec4bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + uint32_t c4 = *(uint32_t*)(code + (i >> 1)); + uint32_t mask = 0x0f0f0f0f; + uint32_t c4ev = c4 & mask; + uint32_t c4od = (c4 >> 4) & mask; + + // the 8 lower bytes of c8 contain the values + __m128i c8 = + _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); + __m128i c4lo = _mm_cvtepu8_epi32(c8); + __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); + __m256i i8 = _mm256_castsi128_si256(c4lo); + i8 = _mm256_insertf128_si256(i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps(i8); + __m256 half = _mm256_set1_ps(0.5f); + f8 = _mm256_add_ps(f8, half); + __m256 one_255 = _mm256_set1_ps(1.f / 15.f); + return simd8float32(_mm256_mul_ps(f8, one_255)); + } +}; + +template <> +struct Codec6bit : Codec6bit { + /* Load 6 bytes that represent 8 6-bit values, return them as a + * 8*32 bit vector register */ + static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { + const __m128i perm = _mm_set_epi8( + -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 
1, 0, -1, 0); + const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); + + // load 6 bytes + __m128i c1 = + _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); + + // put in 8 * 32 bits + __m128i c2 = _mm_shuffle_epi8(c1, perm); + __m256i c3 = _mm256_cvtepi16_epi32(c2); + + // shift and mask out useless bits + __m256i c4 = _mm256_srlv_epi32(c3, shifts); + __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); + return c5; + } + + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here + // // for the reference, maybe, it becomes used oned day. + // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); + // const uint32_t* data32 = (const uint32_t*)data16; + // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); + // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); + // const __m128i i8 = _mm_set1_epi64x(vext); + // const __m256i i32 = _mm256_cvtepi8_epi32(i8); + // const __m256 f8 = _mm256_cvtepi32_ps(i32); + // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + // return _mm256_fmadd_ps(f8, one_255, half_one_255); + + __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); + __m256 f8 = _mm256_cvtepi32_ps(i8); + // this could also be done with bit manipulations but it is + // not obviously faster + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255)); + } +}; + +/****************************************** Specialization of quantizers */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m256 xi = Codec::decode_8_components(code, i).f; 
+ return simd8float32(_mm256_fmadd_ps( + xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin))); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m256 xi = Codec::decode_8_components(code, i).f; + return simd8float32(_mm256_fmadd_ps( + xi, + _mm256_loadu_ps(this->vdiff + i), + _mm256_loadu_ps(this->vmin + i))); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + return simd8float32(_mm256_cvtph_ps(codei)); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); + code_256i = _mm256_slli_epi32(code_256i, 16); + return simd8float32(_mm256_castsi256_ps(code_256i)); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : Quantizer8bitDirect(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + return simd8float32(_mm256_cvtepi32_ps(y8)); // 8 * float32 + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : 
Quantizer8bitDirectSigned(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + __m256i c8 = _mm256_set1_epi32(128); + __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes + return simd8float32(_mm256_cvtepi32_ps(z8)); // 8 * float32 + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX2; + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + __m256 yiv = _mm256_loadu_ps(yi); + yi += 8; + __m256 tmp = _mm256_sub_ps(yiv, x.f); + accu8 = simd8float32(_mm256_fmadd_ps(tmp, tmp, accu8.f)); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x, + simd8float32 y_2) { + __m256 tmp = _mm256_sub_ps(y_2.f, x.f); + accu8 = simd8float32(_mm256_fmadd_ps(tmp, tmp, accu8.f)); + } + + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8.f), + _mm256_extractf128_ps(accu8.f, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); + } +}; + +template <> +struct SimilarityIP { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX2; + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + 
simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + __m256 yiv = _mm256_loadu_ps(yi); + yi += 8; + accu8.f = _mm256_fmadd_ps(yiv, x.f, accu8.f); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x1, + simd8float32 x2) { + accu8.f = _mm256_fmadd_ps(x1.f, x2.f, accu8.f); + } + + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8.f), + _mm256_extractf128_ps(accu8.f, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); + } +}; + +/****************************************** Specialization of distance computers + */ + +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 x1 = quant.reconstruct_8_components(code1, i); + simd8float32 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } 
+}; + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + // __m256i accu = _mm256_setzero_ps (); + __m256i accu = _mm256_setzero_si256(); + for (int i = 0; i < d; i += 16) { + // load 16 bytes, convert to 16 uint16_t + __m256i c1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i*)(code1 + i))); + __m256i c2 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i*)(code2 + i))); + __m256i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm256_madd_epi16(c1, c2); + } else { + __m256i diff = _mm256_sub_epi16(c1, c2); + prod32 = _mm256_madd_epi16(diff, diff); + } + accu = _mm256_add_epi32(accu, prod32); + } + __m128i sum = _mm256_extractf128_si256(accu, 0); + sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, sum); + return _mm_cvtsi128_si32(sum); + } + + void set_query(const float* x) final { + /* + for (int i = 0; i < d; i += 8) { + __m256 xi = _mm256_loadu_ps (x + i); + __m256i ci = _mm256_cvtps_epi32(xi); + */ + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +// explicit instantiation + +template ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + 
+template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-avx512.cpp b/faiss/impl/scalar_quantizer/impl-avx512.cpp new file mode 100644 index 0000000000..ad782c61cb --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-avx512.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#if defined(__AVX512F__) && defined(__F16C__) +#define USE_AVX512_F16C +#else +#warning "Wrong compiler flags for AVX512_F16C" +#endif + +namespace faiss { + +namespace scalar_quantizer { + +/******************************** Codec specializations */ + +template <> +struct Codec8bit : Codec8bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i)); + const __m512i i32 = _mm512_cvtepu8_epi32(c16); + const __m512 f16 = _mm512_cvtepi32_ps(i32); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); + return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255)); + } +}; + +template <> +struct Codec4bit : Codec4bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + uint64_t c8 = *(uint64_t*)(code + (i >> 1)); + uint64_t mask = 0x0f0f0f0f0f0f0f0f; + uint64_t c8ev = c8 & mask; + uint64_t c8od = (c8 >> 4) & mask; + + __m128i c16 = + _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); + __m256i c8lo = _mm256_cvtepu8_epi32(c16); + __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); + __m512i i16 = 
_mm512_castsi256_si512(c8lo); + i16 = _mm512_inserti32x8(i16, c8hi, 1); + __m512 f16 = _mm512_cvtepi32_ps(i16); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); + return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255)); + } +}; + +template <> +struct Codec6bit : Codec6bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + // pure AVX512 implementation (not necessarily the fastest). + // see: + // https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h + + // clang-format off + + // 16 components, 16x6 bit=12 bytes + const __m128i bit_6v = + _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); + const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); + + // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + // 00 01 02 03 + const __m256i shuffle_mask = _mm256_setr_epi16( + 0xFF00, 0x0100, 0x0201, 0xFF02, + 0xFF03, 0x0403, 0x0504, 0xFF05, + 0xFF06, 0x0706, 0x0807, 0xFF08, + 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); + const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); + + // 0: xxxxxxxx xx543210 + // 1: xxxx5432 10xxxxxx + // 2: xxxxxx54 3210xxxx + // 3: xxxxxxxx 543210xx + const __m256i shift_right_v = _mm256_setr_epi16( + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U); + __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); + + // remove unneeded bits + shuffled_shifted = + _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); + + // scale + const __m512 f8 = + _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); + return simd16float32(_mm512_fmadd_ps(f8, one_255, half_one_255)); + + // clang-format on + } +}; + +/******************************** Quantizer 
specializations */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m512 xi = Codec::decode_16_components(code, i).f; + return simd16float32(_mm512_fmadd_ps( + xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin))); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m512 xi = Codec::decode_16_components(code, i).f; + return simd16float32(_mm512_fmadd_ps( + xi, + _mm512_loadu_ps(this->vdiff + i), + _mm512_loadu_ps(this->vmin + i))); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i codei = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); + return simd16float32(_mm512_cvtph_ps(codei)); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16(d, trained) {} + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); + __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); + code_512i = _mm512_slli_epi32(code_512i, 16); + return simd16float32(_mm512_castsi512_ps(code_512i)); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : Quantizer8bitDirect(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m128i x16 = 
_mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 + __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 + return simd16float32(_mm512_cvtepi32_ps(y16)); // 16 * float32 + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 + __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 + __m512i c16 = _mm512_set1_epi32(128); + __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes + return simd16float32(_mm512_cvtepi32_ps(z16)); // 16 * float32 + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX512F; + static constexpr int simdwidth = 16; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + simd16float32 accu16; + + FAISS_ALWAYS_INLINE void begin_16() { + accu16.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_16_components(simd16float32 x) { + __m512 yiv = _mm512_loadu_ps(yi); + yi += 16; + __m512 tmp = _mm512_sub_ps(yiv, x.f); + accu16 = simd16float32(_mm512_fmadd_ps(tmp, tmp, accu16.f)); + } + + FAISS_ALWAYS_INLINE void add_16_components_2( + simd16float32 x, + simd16float32 y_2) { + __m512 tmp = _mm512_sub_ps(y_2.f, x.f); + accu16 = simd16float32(_mm512_fmadd_ps(tmp, tmp, accu16.f)); + } + + FAISS_ALWAYS_INLINE float result_16() { + // performs better than dividing into _mm256 and adding + return _mm512_reduce_add_ps(accu16.f); + } +}; + +template <> +struct SimilarityIP { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX512F; + static constexpr int simdwidth = 16; + static constexpr MetricType metric_type = 
METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + simd16float32 accu16; + + FAISS_ALWAYS_INLINE void begin_16() { + accu16.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_16_components(simd16float32 x) { + __m512 yiv = _mm512_loadu_ps(yi); + yi += 16; + accu16.f = _mm512_fmadd_ps(yiv, x.f, accu16.f); + } + + FAISS_ALWAYS_INLINE void add_16_components_2( + simd16float32 x1, + simd16float32 x2) { + accu16.f = _mm512_fmadd_ps(x1.f, x2.f, accu16.f); + } + + FAISS_ALWAYS_INLINE float result_16() { + // performs better than dividing into _mm256 and adding + return _mm512_reduce_add_ps(accu16.f); + } +}; + +/****************************************** Specialization of distance computers + */ + +template +struct DCTemplate + : SQDistanceComputer { // Update to handle 16 lanes + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_16(); + for (size_t i = 0; i < quant.d; i += 16) { + simd16float32 xi = quant.reconstruct_16_components(code, i); + sim.add_16_components(xi); + } + return sim.result_16(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_16(); + for (size_t i = 0; i < quant.d; i += 16) { + simd16float32 x1 = quant.reconstruct_16_components(code1, i); + simd16float32 x2 = quant.reconstruct_16_components(code2, i); + sim.add_16_components_2(x1, x2); + } + return sim.result_16(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +template +struct DistanceComputerByte + : SQDistanceComputer { 
+ using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + __m512i accu = _mm512_setzero_si512(); + for (int i = 0; i < d; i += 32) { // Process 32 bytes at a time + __m512i c1 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code1 + i))); + __m512i c2 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code2 + i))); + __m512i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm512_madd_epi16(c1, c2); + } else { + __m512i diff = _mm512_sub_epi16(c1, c2); + prod32 = _mm512_madd_epi16(diff, diff); + } + accu = _mm512_add_epi32(accu, prod32); + } + // Horizontally add elements of accu + return _mm512_reduce_add_epi32(accu); + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +// explicit instantiation + +template ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-neon.cpp b/faiss/impl/scalar_quantizer/impl-neon.cpp new file mode 100644 
index 0000000000..7daeeae5b6 --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-neon.cpp @@ -0,0 +1,375 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#if defined(__aarch64__) +#if defined(__GNUC__) && __GNUC__ < 8 +#warning \ + "Cannot enable NEON optimizations in scalar quantizer if the compiler is GCC<8" +#else +#define USE_NEON +#endif +#endif + +namespace faiss { + +namespace scalar_quantizer { +/******************************** Codec specializations */ + +template <> +struct Codec8bit { + static FAISS_ALWAYS_INLINE decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32(float32x4x2_t{res1, res2}); + } +}; + +template <> +struct Codec4bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32({res1, res2}); + } +}; + +template <> +struct Codec6bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32(float32x4x2_t({res1, res2})); + } +}; +/******************************** Quantizatoin specializations */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT(d, trained) {} + + 
FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + float32x4x2_t xi = Codec::decode_8_components(code, i); + return simd8float32(float32x4x2_t( + {vfmaq_f32( + vdupq_n_f32(this->vmin), + xi.val[0], + vdupq_n_f32(this->vdiff)), + vfmaq_f32( + vdupq_n_f32(this->vmin), + xi.val[1], + vdupq_n_f32(this->vdiff))})); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + float32x4x2_t xi = Codec::decode_8_components(code, i).data; + + float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); + float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); + + return simd8float32( + {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), + vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return simd8float32( + {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), + vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return simd8float32( + {vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), + vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : 
Quantizer8bitDirect<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + // convert uint16 -> uint32 -> fp32 + return simd8float32( + {vcvtq_f32_u32(vmovl_u16(y8_0)), + vcvtq_f32_u32(vmovl_u16(y8_1))}); + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + float32x4_t z8_0 = vcvtq_f32_u32( + vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 + float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); + + // subtract 128 to convert into signed numbers + return simd8float32( + {vsubq_f32(z8_0, vmovq_n_f32(128.0)), + vsubq_f32(z8_1, vmovq_n_f32(128.0))}); + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + explicit SimilarityL2(const float* y) : y(y) {} + simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + float32x4x2_t yiv = vld1q_f32_x2(yi); + yi += 8; + + float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); + float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, 
sub1); + + accu8 = simd8float32({accu8_0, accu8_1}); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x, + simd8float32 y) { + float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); + float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); + + accu8 = simd8float32({accu8_0, accu8_1}); + } + + FAISS_ALWAYS_INLINE float result_8() { + float32x4_t sum_0 = vpaddq_f32(accu8.data.val[0], accu8.data.val[0]); + float32x4_t sum_1 = vpaddq_f32(accu8.data.val[1], accu8.data.val[1]); + + float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); + float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); + return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0); + } +}; + +template <> +struct SimilarityIP { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + explicit SimilarityIP(const float* y) : y(y) {} + float32x4x2_t accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { + float32x4x2_t yiv = vld1q_f32_x2(yi); + yi += 8; + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); + accu8 = {accu8_0, accu8_1}; + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + float32x4x2_t x1, + float32x4x2_t x2) { + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); + accu8 = {accu8_0, accu8_1}; + } + + FAISS_ALWAYS_INLINE float result_8() { + float32x4x2_t sum = { + vpaddq_f32(accu8.val[0], accu8.val[0]), + vpaddq_f32(accu8.val[1], accu8.val[1])}; + + float32x4x2_t sum2 = { + vpaddq_f32(sum.val[0], sum.val[0]), + vpaddq_f32(sum.val[1], sum.val[1])}; + return vgetq_lane_f32(sum2.val[0], 0) + 
vgetq_lane_f32(sum2.val[1], 0); + } +}; + +/****************************************** Specialization of distance computers + */ + +// this is the same code as the AVX2 version... Possible to mutualize? +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 x1 = quant.reconstruct_8_components(code1, i); + simd8float32 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +template +struct DistanceComputerByte + : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + 
set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/quantizers.h b/faiss/impl/scalar_quantizer/quantizers.h new file mode 100644 index 0000000000..a9865722d6 --- /dev/null +++ b/faiss/impl/scalar_quantizer/quantizers.h @@ -0,0 +1,293 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +using QuantizerType = ScalarQuantizer::QuantizerType; + +/******************************************************************* + * Quantizer: normalizes scalar vector components, then passes them + * through a codec + *******************************************************************/ + +enum class QScaling { UNIFORM = 0, NON_UNIFORM = 1 }; + +template +struct QuantizerT {}; + +template +struct QuantizerT + : ScalarQuantizer::SQuantizer { + const size_t d; + const float vmin, vdiff; + + QuantizerT(size_t d, const std::vector& trained) + : d(d), vmin(trained[0]), vdiff(trained[1]) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = 0; + if (vdiff != 0) { + xi = (x[i] - vmin) / vdiff; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin + xi * vdiff; + } 
+ } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + float xi = Codec::decode_component(code, i); + return vmin + xi * vdiff; + } +}; + +template +struct QuantizerT + : ScalarQuantizer::SQuantizer { + const size_t d; + const float *vmin, *vdiff; + + QuantizerT(size_t d, const std::vector& trained) + : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = 0; + if (vdiff[i] != 0) { + xi = (x[i] - vmin[i]) / vdiff[i]; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin[i] + xi * vdiff[i]; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + float xi = Codec::decode_component(code, i); + return vmin[i] + xi * vdiff[i]; + } +}; + +/******************************************************************* + * Quantizers that are not based on codecs + *******************************************************************/ + +/******************************************************************* + * FP16 quantizer + *******************************************************************/ + +template +struct QuantizerFP16 {}; + +template <> +struct QuantizerFP16 : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_fp16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_fp16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int 
i) + const { + return decode_fp16(((uint16_t*)code)[i]); + } +}; + +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16 {}; + +template <> +struct QuantizerBF16 : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_bf16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_bf16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return decode_bf16(((uint16_t*)code)[i]); + } +}; + +/******************************************************************* + * 8bit_direct quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirect {}; + +template <> +struct Quantizer8bitDirect : ScalarQuantizer::SQuantizer { + const size_t d; + + Quantizer8bitDirect(size_t d, const std::vector& /* unused */) + : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)x[i]; + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i]; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return code[i]; + } +}; + +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned {}; + +template <> +struct Quantizer8bitDirectSigned + : ScalarQuantizer::SQuantizer { + const size_t d; + + 
Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) + : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)(x[i] + 128); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i] - 128; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return code[i] - 128; + } +}; + +template +ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + // constexpr SIMDLevel SL = INSTANCIATE_SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + switch (qtype) { + case ScalarQuantizer::QT_8bit: + return new QuantizerT, NU, SL>(d, trained); + + case ScalarQuantizer::QT_6bit: + return new QuantizerT, NU, SL>(d, trained); + case ScalarQuantizer::QT_4bit: + return new QuantizerT, NU, SL>(d, trained); + case ScalarQuantizer::QT_8bit_uniform: + return new QuantizerT, U, SL>(d, trained); + case ScalarQuantizer::QT_4bit_uniform: + return new QuantizerT, U, SL>(d, trained); + case ScalarQuantizer::QT_fp16: + return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_bf16: + return new QuantizerBF16(d, trained); + case ScalarQuantizer::QT_8bit_direct: + return new Quantizer8bitDirect(d, trained); + case ScalarQuantizer::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned(d, trained); + default: + FAISS_THROW_MSG("unknown qtype"); + return nullptr; + } +} + +// prevent implicit instanciation +extern template ScalarQuantizer::SQuantizer* select_quantizer_1< + SIMDLevel::AVX2>( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +extern template ScalarQuantizer::SQuantizer* select_quantizer_1< + SIMDLevel::AVX512F>( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +} // namespace scalar_quantizer + +} // namespace faiss 
diff --git a/faiss/impl/scalar_quantizer/scanners.h b/faiss/impl/scalar_quantizer/scanners.h new file mode 100644 index 0000000000..b0e7f04286 --- /dev/null +++ b/faiss/impl/scalar_quantizer/scanners.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * InvertedListScanner: scans series of codes and keeps the best ones + *******************************************************************/ + +template +struct IVFSQScannerIP : InvertedListScanner { + DCClass dc; + bool by_residual; + + float accu0; /// added to all distances + + IVFSQScannerIP( + int d, + const std::vector& trained, + size_t code_size, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), by_residual(by_residual), accu0(0) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + this->keep_max = true; + } + + void set_query(const float* query) override { + dc.set_query(query); + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + accu0 = by_residual ? coarse_dis : 0; + } + + float distance_to_code(const uint8_t* code) const final { + return accu0 + dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float accu = accu0 + dc.query_to_code(codes); + + if (accu > simi[0]) { + int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; + minheap_replace_top(k, simi, idxi, accu, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float accu = accu0 + dc.query_to_code(codes); + if (accu > radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add(accu, id); + } + } + } +}; + +/* use_sel = 0: don't check selector + * = 1: check on ids[j] + * = 2: check in j directly (normally ids is nullptr and store_pairs) + */ +template +struct IVFSQScannerL2 : InvertedListScanner { + DCClass dc; + + bool by_residual; + const Index* quantizer; + const float* x; /// current query + + std::vector tmp; + + IVFSQScannerL2( + int d, + const std::vector& trained, + size_t code_size, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), + by_residual(by_residual), + quantizer(quantizer), + x(nullptr), + tmp(d) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + } + + void set_query(const float* query) override { + x = query; + if (!quantizer) { + dc.set_query(query); + } + } + + void set_list(idx_t list_no, float /*coarse_dis*/) override { + this->list_no = list_no; + if (by_residual) { + // shift of x_in wrt centroid + quantizer->compute_residual(x, tmp.data(), list_no); + dc.set_query(tmp.data()); + } else { + dc.set_query(x); + } + } + + float distance_to_code(const uint8_t* code) const final { + return dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 
? ids[j] : j)) { + continue; + } + + float dis = dc.query_to_code(codes); + + if (dis < simi[0]) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float dis = dc.query_to_code(codes); + if (dis < radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add(dis, id); + } + } + } +}; + +template +InvertedListScanner* sel3_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + if (DCClass::Sim::metric_type == METRIC_L2) { + return new IVFSQScannerL2( + sq->d, + sq->trained, + sq->code_size, + quantizer, + store_pairs, + sel, + r); + } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { + return new IVFSQScannerIP( + sq->d, sq->trained, sq->code_size, store_pairs, sel, r); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +template +InvertedListScanner* sel2_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + if (sel) { + if (store_pairs) { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } +} + +template +InvertedListScanner* sel12_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + constexpr SIMDLevel SL = Similarity::SIMD_LEVEL; + using QuantizerClass = QuantizerT; + using DCClass = DCTemplate; + return 
sel2_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); +} + +template +InvertedListScanner* sel1_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + constexpr SIMDLevel SL = Sim::SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + + switch (sq->qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return sel12_InvertedListScanner, U>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit_uniform: + return sel12_InvertedListScanner, U>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_6bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_fp16: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + + case ScalarQuantizer::QT_bf16: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit_direct: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + default: + FAISS_THROW_MSG("unknown qtype"); + return nullptr; + } +} + +template +InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual) { + if (mt == METRIC_L2) { + return sel1_InvertedListScanner>( + sq, quantizer, store_pairs, sel, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner>( + sq, 
quantizer, store_pairs, sel, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +// prevent implicit instantiation of the template when there are +// SIMD optimized versions... +extern template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +extern template InvertedListScanner* sel0_InvertedListScanner< + SIMDLevel::AVX512F>( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/training.cpp b/faiss/impl/scalar_quantizer/training.cpp new file mode 100644 index 0000000000..23c51384fd --- /dev/null +++ b/faiss/impl/scalar_quantizer/training.cpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { +/******************************************************************* + * Quantizer range training + */ + +static float sqr(float x) { + return x * x; +} + +void train_Uniform( + RangeStat rs, + float rs_arg, + idx_t n, + int k, + const float* x, + std::vector& trained) { + trained.resize(2); + float& vmin = trained[0]; + float& vmax = trained[1]; + + if (rs == ScalarQuantizer::RS_minmax) { + vmin = HUGE_VAL; + vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) + vmin = x[i]; + if (x[i] > vmax) + vmax = x[i]; + } + float vexp = (vmax - vmin) * rs_arg; + vmin -= vexp; + vmax += vexp; + } else if (rs == ScalarQuantizer::RS_meanstd) { + double sum = 0, sum2 = 0; + for (size_t i = 0; i < n; i++) { + sum += x[i]; + sum2 += x[i] * x[i]; + } + float mean = sum / n; + float var = sum2 / n - mean * mean; + float std = var <= 0 ? 1.0 : sqrt(var); + + vmin = mean - std * rs_arg; + vmax = mean + std * rs_arg; + } else if (rs == ScalarQuantizer::RS_quantiles) { + std::vector x_copy(n); + memcpy(x_copy.data(), x, n * sizeof(*x)); + // TODO just do a quickselect + std::sort(x_copy.begin(), x_copy.end()); + int o = int(rs_arg * n); + if (o < 0) + o = 0; + if (o > n - o) + o = n / 2; + vmin = x_copy[o]; + vmax = x_copy[n - 1 - o]; + + } else if (rs == ScalarQuantizer::RS_optim) { + float a, b; + float sx = 0; + { + vmin = HUGE_VAL, vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) + vmin = x[i]; + if (x[i] > vmax) + vmax = x[i]; + sx += x[i]; + } + b = vmin; + a = (vmax - vmin) / (k - 1); + } + int verbose = false; + int niter = 2000; + float last_err = -1; + int iter_last_err = 0; + for (int it = 0; it < niter; it++) { + float sn = 0, sn2 = 0, sxn = 0, err1 = 0; + + for (idx_t i = 0; i < n; i++) { + float xi = x[i]; + float ni = floor((xi - b) / a + 0.5); + if (ni < 0) + ni = 0; + if (ni >= k) + ni = k - 1; + err1 += sqr(xi - (ni * a + b)); + sn += 
ni; + sn2 += ni * ni; + sxn += ni * xi; + } + + if (err1 == last_err) { + iter_last_err++; + if (iter_last_err == 16) + break; + } else { + last_err = err1; + iter_last_err = 0; + } + + float det = sqr(sn) - sn2 * n; + + b = (sn * sxn - sn2 * sx) / det; + a = (sn * sx - n * sxn) / det; + if (verbose) { + printf("it %d, err1=%g \r", it, err1); + fflush(stdout); + } + } + if (verbose) + printf("\n"); + + vmin = b; + vmax = b + a * (k - 1); + + } else { + FAISS_THROW_MSG("Invalid qtype"); + } + vmax -= vmin; +} + +void train_NonUniform( + RangeStat rs, + float rs_arg, + idx_t n, + int d, + int k, + const float* x, + std::vector& trained) { + trained.resize(2 * d); + float* vmin = trained.data(); + float* vmax = trained.data() + d; + if (rs == ScalarQuantizer::RS_minmax) { + memcpy(vmin, x, sizeof(*x) * d); + memcpy(vmax, x, sizeof(*x) * d); + for (size_t i = 1; i < n; i++) { + const float* xi = x + i * d; + for (size_t j = 0; j < d; j++) { + if (xi[j] < vmin[j]) + vmin[j] = xi[j]; + if (xi[j] > vmax[j]) + vmax[j] = xi[j]; + } + } + float* vdiff = vmax; + for (size_t j = 0; j < d; j++) { + float vexp = (vmax[j] - vmin[j]) * rs_arg; + vmin[j] -= vexp; + vmax[j] += vexp; + vdiff[j] = vmax[j] - vmin[j]; + } + } else { + // transpose + std::vector xt(n * d); + for (size_t i = 1; i < n; i++) { + const float* xi = x + i * d; + for (size_t j = 0; j < d; j++) { + xt[j * n + i] = xi[j]; + } + } + std::vector trained_d(2); +#pragma omp parallel for + for (int j = 0; j < d; j++) { + train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); + vmin[j] = trained_d[0]; + vmax[j] = trained_d[1]; + } + } +} + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/training.h b/faiss/impl/scalar_quantizer/training.h new file mode 100644 index 0000000000..6d97507c8f --- /dev/null +++ b/faiss/impl/scalar_quantizer/training.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/******************************************************************* + * Quantizer range training + */ + +#include + +namespace faiss { + +namespace scalar_quantizer { + +using RangeStat = ScalarQuantizer::RangeStat; + +void train_Uniform( + RangeStat rs, + float rs_arg, + idx_t n, + int k, + const float* x, + std::vector& trained); + +void train_NonUniform( + RangeStat rs, + float rs_arg, + idx_t n, + int d, + int k, + const float* x, + std::vector& trained); +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/test_dispatch/a.out b/faiss/test_dispatch/a.out new file mode 100755 index 0000000000..4648aebd57 Binary files /dev/null and b/faiss/test_dispatch/a.out differ diff --git a/faiss/test_dispatch/f.cpp b/faiss/test_dispatch/f.cpp new file mode 100644 index 0000000000..c0dc736229 --- /dev/null +++ b/faiss/test_dispatch/f.cpp @@ -0,0 +1,20 @@ + + +#include "f.h" + +SIMDLevel simd_level = SIMDLevel::NONE; + +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. + float res = 0; + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + return res; +} + +float fvec_norm_L2sqr(const float* x, size_t d) { + DISPATCH_SIMDLevel(fvec_norm_L2sqr, x, d); +} diff --git a/faiss/test_dispatch/f.h b/faiss/test_dispatch/f.h new file mode 100644 index 0000000000..04351f63cc --- /dev/null +++ b/faiss/test_dispatch/f.h @@ -0,0 +1,77 @@ + +#include +#include + +enum SIMDLevel { + NONE, + AVX2, + AVX512F, + ARM_NEON, + ARM_SVE, +}; + +extern SIMDLevel simd_level; + +#ifdef __ARM_FEATURE_SVE +#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) \ + case SIMDLevel::ARM_SVE: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) 
+#endif + +// #ifdef __AVX2__ +#if 1 +#define DISPATCH_SIMDLevel_AVX2(f, ...) \ + case SIMDLevel::AVX2: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_AVX2(f, ...) +#endif + +#ifdef __AVX512F__ +#define DISPATCH_SIMDLevel_AVX512F(f, ...) \ + case SIMDLevel::AVX512F: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_AVX512F(f, ...) +#endif + +#define DISPATCH_SIMDLevel(f, ...) \ + switch (simd_level) { \ + case SIMDLevel::NONE: \ + return f(__VA_ARGS__); \ + DISPATCH_SIMDLevel_ARM_SVE(f, __VA_ARGS__); \ + DISPATCH_SIMDLevel_AVX2(f, __VA_ARGS__); \ + DISPATCH_SIMDLevel_AVX512F(f, __VA_ARGS__); \ + default: \ + assert(!"invlalid SIMD level"); \ + } + +template +float fvec_norm_L2sqr(const float* x, size_t d); + +float fvec_norm_L2sqr(const float* x, size_t d); + +template +struct FF { + static void func(T* x) { + // default implementation + } +}; + +template +struct FF { + static void func(T* x) { + printf("sizeof T = %d\n", sizeof(T)); + // specialized implementation for i = 1 + } +}; + +template +struct FF { + static void func(T* x) { + printf("sizeof T = %d\n", sizeof(T)); + // specialized implementation for i = 1 + } +}; diff --git a/faiss/test_dispatch/f_avx2.cpp b/faiss/test_dispatch/f_avx2.cpp new file mode 100644 index 0000000000..1f94329032 --- /dev/null +++ b/faiss/test_dispatch/f_avx2.cpp @@ -0,0 +1,24 @@ + +#include "f.h" + +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") + +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. 
+ float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/faiss/test_dispatch/f_avx2.o b/faiss/test_dispatch/f_avx2.o new file mode 100644 index 0000000000..a21f7395c5 Binary files /dev/null and b/faiss/test_dispatch/f_avx2.o differ diff --git a/faiss/test_dispatch/f_avx512.cpp b/faiss/test_dispatch/f_avx512.cpp new file mode 100644 index 0000000000..d8a58da3ea --- /dev/null +++ b/faiss/test_dispatch/f_avx512.cpp @@ -0,0 +1,24 @@ + +#include "f.h" + +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") + +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. 
+ float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/faiss/test_dispatch/f_avx512.o b/faiss/test_dispatch/f_avx512.o new file mode 100644 index 0000000000..3f6940ed6b Binary files /dev/null and b/faiss/test_dispatch/f_avx512.o differ diff --git a/faiss/test_dispatch/g.cpp b/faiss/test_dispatch/g.cpp new file mode 100644 index 0000000000..6402c8751d --- /dev/null +++ b/faiss/test_dispatch/g.cpp @@ -0,0 +1,14 @@ +#include + +template +void func(); + +template <> +void func<1>() { + printf("func 1\n"); +} + +template <> +void func<2>() { + printf("func 2\n"); +} diff --git a/faiss/test_dispatch/g.o b/faiss/test_dispatch/g.o new file mode 100644 index 0000000000..93aafd4746 Binary files /dev/null and b/faiss/test_dispatch/g.o differ diff --git a/faiss/test_dispatch/main.cpp b/faiss/test_dispatch/main.cpp new file mode 100644 index 0000000000..dfaf03511f --- /dev/null +++ b/faiss/test_dispatch/main.cpp @@ -0,0 +1,37 @@ + +#include +#include "f.h" + +// fw declaration +template +void func(); + +int main() { + func<1>(); + func<2>(); + + float x[16]; + for (int i = 0; i < 16; i++) { + x[i] = i + 1; + } + float norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); + + simd_level = SIMDLevel::AVX2; + norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); + + /* + simd_level = SIMDLevel::AVX512F; + norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); +*/ + char* env_var = getenv("FAISS_SIMD_LEVEL"); + + printf("env_var=%s\n", env_var); + + int i; + FF::func(&i); + + return 0; +} diff --git a/faiss/utils/distances.h b/faiss/utils/distances.h index 80d2cfc699..238e4bb40e 100644 --- a/faiss/utils/distances.h +++ b/faiss/utils/distances.h @@ -15,6 +15,7 @@ #include #include +#include namespace faiss { @@ -27,9 +28,15 @@ struct IDSelector; /// Squared L2 distance between two vectors float fvec_L2sqr(const float* x, const float* y, size_t 
d); +template +float fvec_L2sqr(const float* x, const float* y, size_t d); + /// inner product float fvec_inner_product(const float* x, const float* y, size_t d); +template +float fvec_inner_product(const float* x, const float* y, size_t d); + /// L1 distance float fvec_L1(const float* x, const float* y, size_t d); @@ -138,6 +145,9 @@ size_t fvec_L2sqr_ny_nearest_y_transposed( /** squared norm of a vector */ float fvec_norm_L2sqr(const float* x, size_t d); +template +float fvec_norm_L2sqr(const float* x, size_t d); + /** compute the L2 norms for a set of vectors * * @param norms output norms, size nx @@ -473,6 +483,10 @@ void compute_PQ_dis_tables_dsub2( */ void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); +/* same statically */ +template +void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); + /** same as fvec_madd, also return index of the min of the result table * @return index of the min of table c */ diff --git a/faiss/utils/distances_simd.cpp b/faiss/utils/distances_simd.cpp index 1990e46aae..e829ae54ec 100644 --- a/faiss/utils/distances_simd.cpp +++ b/faiss/utils/distances_simd.cpp @@ -19,6 +19,9 @@ #include #include +#define AUTOVEC_LEVEL SIMDLevel::NONE +#include + #ifdef __SSE3__ #include #endif @@ -191,43 +194,19 @@ void fvec_inner_products_ny_ref( * Autovectorized implementations */ -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -float fvec_inner_product(const float* x, const float* y, size_t d) { - float res = 0.F; - FAISS_PRAGMA_IMPRECISE_LOOP - for (size_t i = 0; i != d; ++i) { - res += x[i] * y[i]; - } - return res; -} -FAISS_PRAGMA_IMPRECISE_FUNCTION_END +// dispatching functions -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_norm_L2sqr(const float* x, size_t d) { - // the double in the _ref is suspected to be a typo. Some of the manual - // implementations this replaces used float. 
- float res = 0; - FAISS_PRAGMA_IMPRECISE_LOOP - for (size_t i = 0; i != d; ++i) { - res += x[i] * x[i]; - } - - return res; + DISPATCH_SIMDLevel(fvec_norm_L2sqr, x, d); } -FAISS_PRAGMA_IMPRECISE_FUNCTION_END -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_L2sqr(const float* x, const float* y, size_t d) { - size_t i; - float res = 0; - FAISS_PRAGMA_IMPRECISE_LOOP - for (i = 0; i < d; i++) { - const float tmp = x[i] - y[i]; - res += tmp * tmp; - } - return res; + DISPATCH_SIMDLevel(fvec_L2sqr, x, y, d); +} + +float fvec_inner_product(const float* x, const float* y, size_t d) { + DISPATCH_SIMDLevel(fvec_inner_product, x, y, d); } -FAISS_PRAGMA_IMPRECISE_FUNCTION_END /// Special version of inner product that computes 4 distances /// between x and yi @@ -3246,7 +3225,8 @@ void fvec_inner_products_ny( * heavily optimized table computations ***************************************************************************/ -[[maybe_unused]] static inline void fvec_madd_ref( +template <> +void fvec_madd( size_t n, const float* a, float bf, @@ -3256,94 +3236,6 @@ void fvec_inner_products_ny( c[i] = a[i] + bf * b[i]; } -#if defined(__AVX512F__) - -static inline void fvec_madd_avx512( - const size_t n, - const float* __restrict a, - const float bf, - const float* __restrict b, - float* __restrict c) { - const size_t n16 = n / 16; - const size_t n_for_masking = n % 16; - - const __m512 bfmm = _mm512_set1_ps(bf); - - size_t idx = 0; - for (idx = 0; idx < n16 * 16; idx += 16) { - const __m512 ax = _mm512_loadu_ps(a + idx); - const __m512 bx = _mm512_loadu_ps(b + idx); - const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax); - _mm512_storeu_ps(c + idx, abmul); - } - - if (n_for_masking > 0) { - const __mmask16 mask = (1 << n_for_masking) - 1; - - const __m512 ax = _mm512_maskz_loadu_ps(mask, a + idx); - const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx); - const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax); - _mm512_mask_storeu_ps(c + idx, mask, abmul); - } -} - -#elif 
defined(__AVX2__) - -static inline void fvec_madd_avx2( - const size_t n, - const float* __restrict a, - const float bf, - const float* __restrict b, - float* __restrict c) { - // - const size_t n8 = n / 8; - const size_t n_for_masking = n % 8; - - const __m256 bfmm = _mm256_set1_ps(bf); - - size_t idx = 0; - for (idx = 0; idx < n8 * 8; idx += 8) { - const __m256 ax = _mm256_loadu_ps(a + idx); - const __m256 bx = _mm256_loadu_ps(b + idx); - const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); - _mm256_storeu_ps(c + idx, abmul); - } - - if (n_for_masking > 0) { - __m256i mask; - switch (n_for_masking) { - case 1: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - break; - case 2: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - break; - case 3: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1); - break; - case 4: - mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - break; - case 5: - mask = _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1); - break; - case 6: - mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1); - break; - case 7: - mask = _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1); - break; - } - - const __m256 ax = _mm256_maskload_ps(a + idx, mask); - const __m256 bx = _mm256_maskload_ps(b + idx, mask); - const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); - _mm256_maskstore_ps(c + idx, mask, abmul); - } -} - -#endif - #ifdef __SSE3__ [[maybe_unused]] static inline void fvec_madd_sse( @@ -3367,16 +3259,7 @@ static inline void fvec_madd_avx2( } void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) { -#ifdef __AVX512F__ - fvec_madd_avx512(n, a, bf, b, c); -#elif __AVX2__ - fvec_madd_avx2(n, a, bf, b, c); -#else - if ((n & 3) == 0 && ((((long)a) | ((long)b) | ((long)c)) & 15) == 0) - fvec_madd_sse(n, a, bf, b, c); - else - fvec_madd_ref(n, a, bf, b, c); -#endif + DISPATCH_SIMDLevel(fvec_madd, n, a, bf, b, c); } #elif defined(__ARM_FEATURE_SVE) diff --git a/faiss/utils/simd_impl/distances_autovec-inl.h 
b/faiss/utils/simd_impl/distances_autovec-inl.h new file mode 100644 index 0000000000..ace6dccccb --- /dev/null +++ b/faiss/utils/simd_impl/distances_autovec-inl.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace faiss { + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_L2sqr(const float* x, const float* y, size_t d) { + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_inner_product( + const float* x, + const float* y, + size_t d) { + float res = 0.F; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * y[i]; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +} // namespace faiss diff --git a/faiss/utils/simd_impl/distances_avx2.cpp b/faiss/utils/simd_impl/distances_avx2.cpp new file mode 100644 index 0000000000..cd52c470f9 --- /dev/null +++ b/faiss/utils/simd_impl/distances_avx2.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#define AUTOVEC_LEVEL SIMDLevel::AVX2 +#include + +namespace faiss { + +template <> +void fvec_madd( + const size_t n, + const float* __restrict a, + const float bf, + const float* __restrict b, + float* __restrict c) { + // + const size_t n8 = n / 8; + const size_t n_for_masking = n % 8; + + const __m256 bfmm = _mm256_set1_ps(bf); + + size_t idx = 0; + for (idx = 0; idx < n8 * 8; idx += 8) { + const __m256 ax = _mm256_loadu_ps(a + idx); + const __m256 bx = _mm256_loadu_ps(b + idx); + const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); + _mm256_storeu_ps(c + idx, abmul); + } + + if (n_for_masking > 0) { + __m256i mask; + switch (n_for_masking) { + case 1: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + break; + case 2: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + break; + case 3: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1); + break; + case 4: + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + break; + case 5: + mask = _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1); + break; + case 6: + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1); + break; + case 7: + mask = _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1); + break; + } + + const __m256 ax = _mm256_maskload_ps(a + idx, mask); + const __m256 bx = _mm256_maskload_ps(b + idx, mask); + const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); + _mm256_maskstore_ps(c + idx, mask, abmul); + } +} + +} // namespace faiss diff --git a/faiss/utils/simd_impl/distances_avx512.cpp b/faiss/utils/simd_impl/distances_avx512.cpp new file mode 100644 index 0000000000..e87df652fd --- /dev/null +++ b/faiss/utils/simd_impl/distances_avx512.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include <faiss/utils/distances.h> // NOTE(review): <...> include targets were stripped in extraction; reconstructed — verify against upstream
+
+#include <immintrin.h> // NOTE(review): reconstructed — verify
+
+#define AUTOVEC_LEVEL SIMDLevel::AVX512F
+#include <faiss/utils/simd_impl/distances_autovec_impl.h> // NOTE(review): reconstructed — verify
+
+namespace faiss {
+
+// AVX512F specialization of fvec_madd: computes c[i] = a[i] + bf * b[i]
+// for i in [0, n). 16 floats per iteration with FMA; the n % 16 tail is
+// handled with AVX-512 write-masked load/store, so no scalar epilogue.
+template <>
+void fvec_madd<SIMDLevel::AVX512F>(
+        const size_t n,
+        const float* __restrict a,
+        const float bf,
+        const float* __restrict b,
+        float* __restrict c) {
+    const size_t n16 = n / 16;
+    const size_t n_for_masking = n % 16;
+
+    const __m512 bfmm = _mm512_set1_ps(bf);
+
+    size_t idx = 0;
+    for (idx = 0; idx < n16 * 16; idx += 16) {
+        const __m512 ax = _mm512_loadu_ps(a + idx);
+        const __m512 bx = _mm512_loadu_ps(b + idx);
+        const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax);
+        _mm512_storeu_ps(c + idx, abmul);
+    }
+
+    if (n_for_masking > 0) {
+        // Low (n % 16) bits set: one mask bit per lane.
+        const __mmask16 mask = (1 << n_for_masking) - 1;
+
+        const __m512 ax = _mm512_maskz_loadu_ps(mask, a + idx);
+        const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx);
+        const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax);
+        _mm512_mask_storeu_ps(c + idx, mask, abmul);
+    }
+}
+
+} // namespace faiss
diff --git a/faiss/utils/simdlib_avx2.h b/faiss/utils/simd_impl/simdlib_avx2.h
similarity index 100%
rename from faiss/utils/simdlib_avx2.h
rename to faiss/utils/simd_impl/simdlib_avx2.h
diff --git a/faiss/utils/simdlib_avx512.h b/faiss/utils/simd_impl/simdlib_avx512.h
similarity index 86%
rename from faiss/utils/simdlib_avx512.h
rename to faiss/utils/simd_impl/simdlib_avx512.h
index 63b23f9b19..80889dc508 100644
--- a/faiss/utils/simdlib_avx512.h
+++ b/faiss/utils/simd_impl/simdlib_avx512.h
@@ -14,7 +14,7 @@
 #include <immintrin.h>
 
-#include <faiss/utils/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
 
 namespace faiss {
 
@@ -293,4 +293,47 @@ struct simd64uint8 : simd512bit {
     }
 };
 
+// 16 x float32 vector on top of the generic 512-bit register wrapper.
+// Mirrors the simd8float32 API of the AVX2 simdlib.
+struct simd16float32 : simd512bit {
+    simd16float32() {}
+
+    explicit simd16float32(simd512bit x) : simd512bit(x) {}
+
+    explicit simd16float32(__m512 x) : simd512bit(x) {}
+
+    // broadcast a scalar to all 16 lanes
+    explicit simd16float32(float x) : simd512bit(_mm512_set1_ps(x)) {}
+
+    // unaligned load of 16 floats
+    explicit simd16float32(const float* x)
+            : simd16float32(_mm512_loadu_ps(x)) {}
+
+    simd16float32 operator*(simd16float32 other) const {
+        return simd16float32(_mm512_mul_ps(f, other.f));
+    }
+
+    simd16float32 operator+(simd16float32 other) const {
+        return simd16float32(_mm512_add_ps(f, other.f));
+    }
+
+    simd16float32 operator-(simd16float32 other) const {
+        return simd16float32(_mm512_sub_ps(f, other.f));
+    }
+
+    simd16float32& operator+=(const simd16float32& other) {
+        f = _mm512_add_ps(f, other.f);
+        return *this;
+    }
+
+    // comma-separated "%g" rendering of the 16 lanes, for debugging
+    std::string tostring() const {
+        float tab[16];
+        storeu((void*)tab);
+        char res[1000];
+        char* ptr = res;
+        for (int i = 0; i < 16; i++) {
+            ptr += sprintf(ptr, "%g,", tab[i]);
+        }
+        // strip last ,
+        ptr[-1] = 0;
+        return std::string(res);
+    }
+};
+
 } // namespace faiss
diff --git a/faiss/utils/simdlib_emulated.h b/faiss/utils/simd_impl/simdlib_emulated.h
similarity index 100%
rename from faiss/utils/simdlib_emulated.h
rename to faiss/utils/simd_impl/simdlib_emulated.h
diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simd_impl/simdlib_neon.h
similarity index 100%
rename from faiss/utils/simdlib_neon.h
rename to faiss/utils/simd_impl/simdlib_neon.h
diff --git a/faiss/utils/simdlib_ppc64.h b/faiss/utils/simd_impl/simdlib_ppc64.h
similarity index 100%
rename from faiss/utils/simdlib_ppc64.h
rename to faiss/utils/simd_impl/simdlib_ppc64.h
diff --git a/faiss/utils/simd_levels.cpp b/faiss/utils/simd_levels.cpp
new file mode 100644
index 0000000000..894c9aa271
--- /dev/null
+++ b/faiss/utils/simd_levels.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/utils/simd_levels.h> // NOTE(review): <...> include targets were stripped in extraction; reconstructed — verify against upstream
+
+#include <cstdlib> // NOTE(review): reconstructed (getenv)
+#include <cstring> // NOTE(review): reconstructed (strcmp)
+#include <faiss/impl/FaissAssert.h> // NOTE(review): reconstructed (FAISS_THROW_IF_NOT_FMT)
+
+namespace faiss {
+
+SIMDLevel SIMDConfig::level = SIMDLevel::NONE;
+
+// Names must match the SIMDLevel enumerators, in enum order, so that the
+// FAISS_SIMD_LEVEL environment variable can spell any level.
+// BUGFIX: last entry was "PPC", which did not match enumerator PPC_ALTIVEC.
+const char* SIMDConfig::level_names[] =
+        {"NONE", "AVX2", "AVX512F", "ARM_NEON", "ARM_SVE", "PPC_ALTIVEC"};
+
+// Initializes SIMDConfig::level. An explicit FAISS_SIMD_LEVEL environment
+// variable takes precedence (throws if its value is not a known level name);
+// otherwise, on x86-64, the level is auto-detected via CPUID, restricted to
+// the levels compiled in (COMPILE_SIMD_* macros).
+SIMDConfig::SIMDConfig() {
+    const char* env_var = getenv("FAISS_SIMD_LEVEL");
+    if (env_var) {
+        // BUGFIX: iterate over the number of entries, not sizeof() of the
+        // pointer array (byte size, 8x too large) — the original
+        // "i <= sizeof(level_names)" read far past the end of the array.
+        const size_t n_levels = sizeof(level_names) / sizeof(level_names[0]);
+        size_t i;
+        for (i = 0; i < n_levels; i++) {
+            if (strcmp(env_var, level_names[i]) == 0) {
+                level = (SIMDLevel)i;
+                break;
+            }
+        }
+        FAISS_THROW_IF_NOT_FMT(
+                i != n_levels,
+                "FAISS_SIMD_LEVEL %s unknown",
+                env_var);
+        return;
+    }
+
+#ifdef __x86_64__
+    {
+        // BUGFIX: the feature flags live in CPUID leaf 7, sub-leaf 0
+        // (EBX bit 5 = AVX2, EBX bit 16 = AVX512F); the original executed
+        // leaf 0 (vendor string) and tested ECX, so detection was bogus.
+        unsigned int eax, ebx, ecx, edx;
+        asm volatile("cpuid"
+                     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                     : "a"(7), "c"(0));
+
+#ifdef COMPILE_SIMD_AVX512F
+        if (ebx & (1 << 16)) {
+            level = SIMDLevel::AVX512F;
+        } else
+#endif
+
+#ifdef COMPILE_SIMD_AVX2
+        if (ebx & (1 << 5)) {
+            level = SIMDLevel::AVX2;
+        } else
+#endif
+            level = SIMDLevel::NONE;
+    }
+#endif
+}
+
+} // namespace faiss
diff --git a/faiss/utils/simd_levels.h b/faiss/utils/simd_levels.h
new file mode 100644
index 0000000000..c7d6056bc8
--- /dev/null
+++ b/faiss/utils/simd_levels.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cassert> // BUGFIX: assert is used by DISPATCH_SIMDLevel below
+
+namespace faiss {
+
+// levels should be of increasing capacity
+enum SIMDLevel {
+    NONE,
+    // x86
+    AVX2,
+    AVX512F,
+    // arm
+    ARM_NEON,
+    ARM_SVE,
+    // ppc
+    PPC_ALTIVEC,
+};
+
+struct SIMDConfig {
+    static SIMDLevel level;
+    // names of the levels above, in enum order (used to parse
+    // FAISS_SIMD_LEVEL)
+    static const char* level_names[];
+    // initializes the simd_level from the cpuid and the FAISS_SIMD_LEVEL
+    // environment variable
+    SIMDConfig();
+};
+
+/*********************** x86 SIMD */
+
+#ifdef COMPILE_SIMD_AVX2
+// NOTE(review): template argument lists were stripped in extraction;
+// reconstructed as f<SIMDLevel::...> per the dispatch-to-specialization
+// design — verify against upstream.
+#define DISPATCH_SIMDLevel_AVX2(f, ...) \
+    case SIMDLevel::AVX2:               \
+        return f<SIMDLevel::AVX2>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_AVX2(f, ...)
+#endif
+
+#ifdef COMPILE_SIMD_AVX512F
+#define DISPATCH_SIMDLevel_AVX512F(f, ...) \
+    case SIMDLevel::AVX512F:               \
+        return f<SIMDLevel::AVX512F>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_AVX512F(f, ...)
+#endif
+
+/*********************** ARM SIMD */
+
+#ifdef COMPILE_SIMD_NEON
+#define DISPATCH_SIMDLevel_ARM_NEON(f, ...) \
+    case SIMDLevel::ARM_NEON:               \
+        return f<SIMDLevel::ARM_NEON>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_ARM_NEON(f, ...)
+#endif
+
+#ifdef COMPILE_SIMD_SVE
+#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) \
+    case SIMDLevel::ARM_SVE:               \
+        return f<SIMDLevel::ARM_SVE>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_ARM_SVE(f, ...)
+#endif
+
+/* dispatch function f to f<level> for the currently detected SIMD level.
+   NOTE(review): template argument lists were stripped in extraction;
+   reconstructed as f<SIMDLevel::...> — verify against upstream.
+   NOTE(review): assert requires <cassert>; ensure it is included at the
+   top of this header. */
+
+#define DISPATCH_SIMDLevel(f, ...)                       \
+    switch (SIMDConfig::level) {                         \
+        case SIMDLevel::NONE:                            \
+            return f<SIMDLevel::NONE>(__VA_ARGS__);      \
+            DISPATCH_SIMDLevel_AVX2(f, __VA_ARGS__);     \
+            DISPATCH_SIMDLevel_AVX512F(f, __VA_ARGS__);  \
+            DISPATCH_SIMDLevel_ARM_NEON(f, __VA_ARGS__); \
+            DISPATCH_SIMDLevel_ARM_SVE(f, __VA_ARGS__);  \
+        default:                                         \
+            assert(!"invalid SIMD level");               \
+    }
+
+} // namespace faiss
diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h
index eadfb78ae3..2b8bef4716 100644
--- a/faiss/utils/simdlib.h
+++ b/faiss/utils/simdlib.h
@@ -16,25 +16,25 @@
 
 #if defined(__AVX512F__)
 
-#include <faiss/utils/simdlib_avx2.h>
-#include <faiss/utils/simdlib_avx512.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx512.h>
 
 #elif defined(__AVX2__)
 
-#include <faiss/utils/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
 
 #elif defined(__aarch64__)
 
-#include <faiss/utils/simdlib_neon.h>
+#include <faiss/utils/simd_impl/simdlib_neon.h>
 
 #elif defined(__PPC64__)
 
-#include <faiss/utils/simdlib_ppc64.h>
+#include <faiss/utils/simd_impl/simdlib_ppc64.h>
 
 #else
 
 // emulated = all operations are implemented as scalars
-#include <faiss/utils/simdlib_emulated.h>
+#include <faiss/utils/simd_impl/simdlib_emulated.h>
 // FIXME: make a SSE version
 // is this ever going to happen? We will probably rather implement AVX512