diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index ce1f5ddf1a..c581de791f 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -805,8 +805,9 @@ struct RangeSearchResults { * The scanning functions call their favorite precompute_* * function to precompute the tables they need. *****************************************************/ -template +template struct IVFPQScannerT : QueryTables { + using PQDecoder = typename PQCodeDistance::PQDecoder; const uint8_t* list_codes; const IDType* list_ids; size_t list_size; @@ -882,7 +883,7 @@ struct IVFPQScannerT : QueryTables { float distance_1 = 0; float distance_2 = 0; float distance_3 = 0; - distance_four_codes( + PQCodeDistance::distance_four_codes( pq.M, pq.nbits, sim_table, @@ -905,7 +906,7 @@ struct IVFPQScannerT : QueryTables { if (counter >= 1) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -914,7 +915,7 @@ struct IVFPQScannerT : QueryTables { } if (counter >= 2) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -923,7 +924,7 @@ struct IVFPQScannerT : QueryTables { } if (counter >= 3) { float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1089,7 +1090,7 @@ struct IVFPQScannerT : QueryTables { float distance_1 = dis0; float distance_2 = dis0; float distance_3 = dis0; - distance_four_codes( + PQCodeDistance::distance_four_codes( pq.M, pq.nbits, sim_table, @@ -1120,7 +1121,7 @@ struct IVFPQScannerT : QueryTables { n_hamming_pass++; float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1140,7 +1141,7 @@ struct IVFPQScannerT : QueryTables { n_hamming_pass++; float dis = dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( pq.M, pq.nbits, sim_table, @@ -1185,8 +1186,8 @@ struct IVFPQScannerT : QueryTables { * * use_sel: store or ignore 
the IDSelector */ -template -struct IVFPQScanner : IVFPQScannerT, +template +struct IVFPQScanner : IVFPQScannerT, InvertedListScanner { int precompute_mode; const IDSelector* sel; @@ -1196,7 +1197,7 @@ struct IVFPQScanner : IVFPQScannerT, bool store_pairs, int precompute_mode, const IDSelector* sel) - : IVFPQScannerT(ivfpq, nullptr), + : IVFPQScannerT(ivfpq, nullptr), precompute_mode(precompute_mode), sel(sel) { this->store_pairs = store_pairs; @@ -1215,7 +1216,7 @@ struct IVFPQScanner : IVFPQScannerT, float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); float dis = this->dis0 + - distance_single_code( + PQCodeDistance::distance_single_code( this->pq.M, this->pq.nbits, this->sim_table, code); return dis; } @@ -1279,7 +1280,9 @@ struct IVFPQScanner : IVFPQScannerT, } }; -template +/** follow 3 stages of template dispatching */ + +template InvertedListScanner* get_InvertedListScanner1( const IndexIVFPQ& index, bool store_pairs, @@ -1288,32 +1291,47 @@ InvertedListScanner* get_InvertedListScanner1( return new IVFPQScanner< METRIC_INNER_PRODUCT, CMin, - PQDecoder, + PQCodeDistance, use_sel>(index, store_pairs, 2, sel); } else if (index.metric_type == METRIC_L2) { return new IVFPQScanner< METRIC_L2, CMax, - PQDecoder, + PQCodeDistance, use_sel>(index, store_pairs, 2, sel); } return nullptr; } -template +template InvertedListScanner* get_InvertedListScanner2( const IndexIVFPQ& index, bool store_pairs, const IDSelector* sel) { if (index.pq.nbits == 8) { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); } else if (index.pq.nbits == 16) { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); + } else { + return get_InvertedListScanner1< + PQCodeDistance, + use_sel>(index, store_pairs, sel); + } +} + +template +InvertedListScanner* 
get_InvertedListScanner3( + const IndexIVFPQ& index, + bool store_pairs, + const IDSelector* sel) { + if (sel) { + return get_InvertedListScanner2(index, store_pairs, sel); } else { - return get_InvertedListScanner1( - index, store_pairs, sel); + return get_InvertedListScanner2(index, store_pairs, sel); } } @@ -1323,11 +1341,7 @@ InvertedListScanner* IndexIVFPQ::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { - if (sel) { - return get_InvertedListScanner2(*this, store_pairs, sel); - } else { - return get_InvertedListScanner2(*this, store_pairs, sel); - } + DISPATCH_SIMDLevel(get_InvertedListScanner3, *this, store_pairs, sel); return nullptr; } diff --git a/faiss/IndexPQ.cpp b/faiss/IndexPQ.cpp index 8193e78b17..99347f9c8c 100644 --- a/faiss/IndexPQ.cpp +++ b/faiss/IndexPQ.cpp @@ -73,7 +73,7 @@ void IndexPQ::train(idx_t n, const float* x) { namespace { -template +template struct PQDistanceComputer : FlatCodesDistanceComputer { size_t d; MetricType metric; @@ -86,7 +86,7 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { float distance_to_code(const uint8_t* code) final { ndis++; - float dis = distance_single_code( + float dis = PQCodeDistance::distance_single_code( pq.M, pq.nbits, precomputed_table.data(), code); return dis; } @@ -95,8 +95,10 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { FAISS_THROW_IF_NOT(sdc); const float* sdci = sdc; float accu = 0; - PQDecoder codei(codes + i * code_size, pq.nbits); - PQDecoder codej(codes + j * code_size, pq.nbits); + typename PQCodeDistance::PQDecoder codei( + codes + i * code_size, pq.nbits); + typename PQCodeDistance::PQDecoder codej( + codes + j * code_size, pq.nbits); for (int l = 0; l < pq.M; l++) { accu += sdci[codei.decode() + (codej.decode() << codei.nbits)]; @@ -132,16 +134,24 @@ struct PQDistanceComputer : FlatCodesDistanceComputer { } }; +template +FlatCodesDistanceComputer* get_FlatCodesDistanceComputer1( + const IndexPQ& index) { + int 
nbits = index.pq.nbits; + if (nbits == 8) { + return new PQDistanceComputer>(index); + } else if (nbits == 16) { + return new PQDistanceComputer>(index); + } else { + return new PQDistanceComputer>( + index); + } +} + } // namespace FlatCodesDistanceComputer* IndexPQ::get_FlatCodesDistanceComputer() const { - if (pq.nbits == 8) { - return new PQDistanceComputer(*this); - } else if (pq.nbits == 16) { - return new PQDistanceComputer(*this); - } else { - return new PQDistanceComputer(*this); - } + DISPATCH_SIMDLevel(get_FlatCodesDistanceComputer1, *this); } /***************************************** diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index e05f3a1f25..adfa9ce4d5 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -5,1965 +5,71 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - -#include - -#include -#include - -#include -#include - -#ifdef __SSE__ -#include -#endif - -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { - -/******************************************************************* - * ScalarQuantizer implementation - * - * The main source of complexity is to support combinations of 4 - * variants without incurring runtime tests or virtual function calls: - * - * - 4 / 8 bits per code component - * - uniform / non-uniform - * - IP / L2 distance search - * - scalar / AVX distance computation - * - * The appropriate Quantizer object is returned via select_quantizer - * that hides the template mess. 
- ********************************************************************/ - -#if defined(__AVX512F__) && defined(__F16C__) -#define USE_AVX512_F16C -#elif defined(__AVX2__) -#ifdef __F16C__ -#define USE_F16C -#else -#warning \ - "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" -#endif -#endif - -#if defined(__aarch64__) -#if defined(__GNUC__) && __GNUC__ < 8 -#warning \ - "Cannot enable NEON optimizations in scalar quantizer if the compiler is GCC<8" -#else -#define USE_NEON -#endif -#endif - -namespace { - -typedef ScalarQuantizer::QuantizerType QuantizerType; -typedef ScalarQuantizer::RangeStat RangeStat; -using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; - -/******************************************************************* - * Codec: converts between values in [0, 1] and an index in a code - * array. The "i" parameter is the vector component index (not byte - * index). - */ - -struct Codec8bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i] = (int)(255 * x); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (code[i] + 0.5f) / 255.0f; - } - -#if defined(__AVX512F__) - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i)); - const __m512i i32 = _mm512_cvtepu8_epi32(c16); - const __m512 f16 = _mm512_cvtepi32_ps(i32); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } -#elif defined(__AVX2__) - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - const uint64_t c8 = *(uint64_t*)(code + i); - - const __m128i i8 = _mm_set1_epi64x(c8); - const __m256i i32 = _mm256_cvtepu8_epi32(i8); - const __m256 f8 = _mm256_cvtepi32_ps(i32); - const __m256 half_one_255 = 
_mm256_set1_ps(0.5f / 255.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -struct Codec4bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } - -#if defined(__AVX512F__) - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - uint64_t c8 = *(uint64_t*)(code + (i >> 1)); - uint64_t mask = 0x0f0f0f0f0f0f0f0f; - uint64_t c8ev = c8 & mask; - uint64_t c8od = (c8 >> 4) & mask; - - __m128i c16 = - _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); - __m256i c8lo = _mm256_cvtepu8_epi32(c16); - __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); - __m512i i16 = _mm512_castsi256_si512(c8lo); - i16 = _mm512_inserti32x8(i16, c8hi, 1); - __m512 f16 = _mm512_cvtepi32_ps(i16); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); - return _mm512_fmadd_ps(f16, one_255, half_one_255); - } -#elif defined(__AVX2__) - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = - _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); - __m128i c4lo = 
_mm_cvtepu8_epi32(c8); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256(c4lo); - i8 = _mm256_insertf128_si256(i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps(i8); - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_255 = _mm256_set1_ps(1.f / 15.f); - return _mm256_mul_ps(f8, one_255); - } -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -struct Codec6bit { - static FAISS_ALWAYS_INLINE void encode_component( - float x, - uint8_t* code, - int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static FAISS_ALWAYS_INLINE float decode_component( - const uint8_t* code, - int i) { - uint8_t bits; - code += (i >> 2) * 3; - switch (i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } - -#if defined(__AVX512F__) - - static FAISS_ALWAYS_INLINE __m512 - decode_16_components(const uint8_t* code, int i) { - // pure AVX512 implementation (not necessarily the fastest). 
- // see: - // https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h - - // clang-format off - - // 16 components, 16x6 bit=12 bytes - const __m128i bit_6v = - _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); - const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); - - // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - // 00 01 02 03 - const __m256i shuffle_mask = _mm256_setr_epi16( - 0xFF00, 0x0100, 0x0201, 0xFF02, - 0xFF03, 0x0403, 0x0504, 0xFF05, - 0xFF06, 0x0706, 0x0807, 0xFF08, - 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); - const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); - - // 0: xxxxxxxx xx543210 - // 1: xxxx5432 10xxxxxx - // 2: xxxxxx54 3210xxxx - // 3: xxxxxxxx 543210xx - const __m256i shift_right_v = _mm256_setr_epi16( - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U, - 0x0U, 0x6U, 0x4U, 0x2U); - __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); - - // remove unneeded bits - shuffled_shifted = - _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); - - // scale - const __m512 f8 = - _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); - const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); - const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); - return _mm512_fmadd_ps(f8, one_255, half_one_255); - - // clang-format on - } - -#elif defined(__AVX2__) - - /* Load 6 bytes that represent 8 6-bit values, return them as a - * 8*32 bit vector register */ - static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { - const __m128i perm = _mm_set_epi8( - -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0); - const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); - - // load 6 bytes - __m128i c1 = - _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); - - // put in 8 * 32 bits - __m128i c2 = _mm_shuffle_epi8(c1, perm); - __m256i c3 = _mm256_cvtepi16_epi32(c2); - - // 
shift and mask out useless bits - __m256i c4 = _mm256_srlv_epi32(c3, shifts); - __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); - return c5; - } - - static FAISS_ALWAYS_INLINE __m256 - decode_8_components(const uint8_t* code, int i) { - // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here - // // for the reference, maybe, it becomes used oned day. - // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); - // const uint32_t* data32 = (const uint32_t*)data16; - // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); - // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); - // const __m128i i8 = _mm_set1_epi64x(vext); - // const __m256i i32 = _mm256_cvtepi8_epi32(i8); - // const __m256 f8 = _mm256_cvtepi32_ps(i32); - // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - // return _mm256_fmadd_ps(f8, one_255, half_one_255); - - __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); - __m256 f8 = _mm256_cvtepi32_ps(i8); - // this could also be done with bit manipulations but it is - // not obviously faster - const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); - const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); - return _mm256_fmadd_ps(f8, one_255, half_one_255); - } - -#endif - -#ifdef USE_NEON - static FAISS_ALWAYS_INLINE float32x4x2_t - decode_8_components(const uint8_t* code, int i) { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = decode_component(code, i + j); - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; - } -#endif -}; - -/******************************************************************* - * Quantizer: normalizes scalar vector components, then passes them - * through a codec - *******************************************************************/ - -enum class QuantizerTemplateScaling { UNIFORM = 0, NON_UNIFORM = 1 }; - -template -struct 
QuantizerTemplate {}; - -template -struct QuantizerTemplate - : ScalarQuantizer::SQuantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained[0]), vdiff(trained[1]) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff != 0) { - xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin + xi * vdiff; - } -}; - -#if defined(__AVX512F__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)); - } -}; - -#elif defined(__AVX2__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate( - d, - trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - 
reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - return {vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[0], - vdupq_n_f32(this->vdiff)), - vfmaq_f32( - vdupq_n_f32(this->vmin), - xi.val[1], - vdupq_n_f32(this->vdiff))}; - } -}; - -#endif - -template -struct QuantizerTemplate - : ScalarQuantizer::SQuantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate(size_t d, const std::vector& trained) - : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = 0; - if (vdiff[i] != 0) { - xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - float xi = Codec::decode_component(code, i); - return vmin[i] + xi * vdiff[i]; - } -}; - -#if defined(__AVX512F__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m512 xi = Codec::decode_16_components(code, i); - return _mm512_fmadd_ps( - xi, - _mm512_loadu_ps(this->vdiff + i), - _mm512_loadu_ps(this->vmin + i)); - } -}; - -#elif defined(__AVX2__) - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - 
reconstruct_8_components(const uint8_t* code, int i) const { - __m256 xi = Codec::decode_8_components(code, i); - return _mm256_fmadd_ps( - xi, - _mm256_loadu_ps(this->vdiff + i), - _mm256_loadu_ps(this->vmin + i)); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct QuantizerTemplate - : QuantizerTemplate { - QuantizerTemplate(size_t d, const std::vector& trained) - : QuantizerTemplate< - Codec, - QuantizerTemplateScaling::NON_UNIFORM, - 1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - float32x4x2_t xi = Codec::decode_8_components(code, i); - - float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); - float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - - return {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}; - } -}; - -#endif - -/******************************************************************* - * FP16 quantizer - *******************************************************************/ - -template -struct QuantizerFP16 {}; - -template <> -struct QuantizerFP16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_fp16(((uint16_t*)code)[i]); - } -}; - -#if defined(USE_AVX512_F16C) - -template <> -struct QuantizerFP16<16> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m256i codei = 
_mm256_loadu_si256((const __m256i*)(code + 2 * i)); - return _mm512_cvtph_ps(codei); - } -}; - -#endif - -#if defined(USE_F16C) - -template <> -struct QuantizerFP16<8> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps(codei); - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct QuantizerFP16<8> : QuantizerFP16<1> { - QuantizerFP16(size_t d, const std::vector& trained) - : QuantizerFP16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; - } -}; -#endif - -/******************************************************************* - * BF16 quantizer - *******************************************************************/ - -template -struct QuantizerBF16 {}; - -template <> -struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_bf16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_bf16(((uint16_t*)code)[i]); - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return decode_bf16(((uint16_t*)code)[i]); - } -}; - -#if defined(__AVX512F__) - -template <> -struct QuantizerBF16<16> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - FAISS_ALWAYS_INLINE __m512 - 
reconstruct_16_components(const uint8_t* code, int i) const { - __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); - __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); - code_512i = _mm512_slli_epi32(code_512i, 16); - return _mm512_castsi512_ps(code_512i); - } -}; - -#elif defined(__AVX2__) - -template <> -struct QuantizerBF16<8> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); - __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); - code_256i = _mm256_slli_epi32(code_256i, 16); - return _mm256_castsi256_ps(code_256i); - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct QuantizerBF16<8> : QuantizerBF16<1> { - QuantizerBF16(size_t d, const std::vector& trained) - : QuantizerBF16<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); - return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), - vreinterpretq_f32_u32( - vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; - } -}; -#endif - -/******************************************************************* - * 8bit_direct quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirect {}; - -template <> -struct Quantizer8bitDirect<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - Quantizer8bitDirect(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - FAISS_ALWAYS_INLINE float 
reconstruct_component(const uint8_t* code, int i) - const { - return code[i]; - } -}; - -#if defined(__AVX512F__) - -template <> -struct Quantizer8bitDirect<16> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - return _mm512_cvtepi32_ps(y16); // 16 * float32 - } -}; - -#elif defined(__AVX2__) - -template <> -struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - return _mm256_cvtepi32_ps(y8); // 8 * float32 - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { - Quantizer8bitDirect(size_t d, const std::vector& trained) - : Quantizer8bitDirect<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - // convert uint16 -> uint32 -> fp32 - return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))}; - } -}; - -#endif - -/******************************************************************* - * 8bit_direct_signed quantizer - *******************************************************************/ - -template -struct Quantizer8bitDirectSigned {}; - -template <> -struct Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer { - const size_t d; - - 
Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) - : d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)(x[i] + 128); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i] - 128; - } - } - - FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) - const { - return code[i] - 128; - } -}; - -#if defined(__AVX512F__) - -template <> -struct Quantizer8bitDirectSigned<16> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m512 - reconstruct_16_components(const uint8_t* code, int i) const { - __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 - __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 - __m512i c16 = _mm512_set1_epi32(128); - __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes - return _mm512_cvtepi32_ps(z16); // 16 * float32 - } -}; - -#elif defined(__AVX2__) - -template <> -struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE __m256 - reconstruct_8_components(const uint8_t* code, int i) const { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - __m256i c8 = _mm256_set1_epi32(128); - __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes - return _mm256_cvtepi32_ps(z8); // 8 * float32 - } -}; - -#endif - -#ifdef USE_NEON - -template <> -struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> { - Quantizer8bitDirectSigned(size_t d, const std::vector& trained) - : Quantizer8bitDirectSigned<1>(d, trained) {} - - FAISS_ALWAYS_INLINE float32x4x2_t - 
reconstruct_8_components(const uint8_t* code, int i) const { - uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); - uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 - uint16x4_t y8_0 = vget_low_u16(y8); - uint16x4_t y8_1 = vget_high_u16(y8); - - float32x4_t z8_0 = vcvtq_f32_u32( - vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 - float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); - - // subtract 128 to convert into signed numbers - return {vsubq_f32(z8_0, vmovq_n_f32(128.0)), - vsubq_f32(z8_1, vmovq_n_f32(128.0))}; - } -}; - -#endif - -template -ScalarQuantizer::SQuantizer* select_quantizer_1( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - switch (qtype) { - case ScalarQuantizer::QT_8bit: - return new QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_6bit: - return new QuantizerTemplate< - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_4bit: - return new QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_8bit_uniform: - return new QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_4bit_uniform: - return new QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>(d, trained); - case ScalarQuantizer::QT_fp16: - return new QuantizerFP16(d, trained); - case ScalarQuantizer::QT_bf16: - return new QuantizerBF16(d, trained); - case ScalarQuantizer::QT_8bit_direct: - return new Quantizer8bitDirect(d, trained); - case ScalarQuantizer::QT_8bit_direct_signed: - return new Quantizer8bitDirectSigned(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); -} - -/******************************************************************* - * Quantizer range training - */ - -static float sqr(float x) { - return x * x; -} - -void train_Uniform( - RangeStat 
rs, - float rs_arg, - idx_t n, - int k, - const float* x, - std::vector& trained) { - trained.resize(2); - float& vmin = trained[0]; - float& vmax = trained[1]; - - if (rs == ScalarQuantizer::RS_minmax) { - vmin = HUGE_VAL; - vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - } - float vexp = (vmax - vmin) * rs_arg; - vmin -= vexp; - vmax += vexp; - } else if (rs == ScalarQuantizer::RS_meanstd) { - double sum = 0, sum2 = 0; - for (size_t i = 0; i < n; i++) { - sum += x[i]; - sum2 += x[i] * x[i]; - } - float mean = sum / n; - float var = sum2 / n - mean * mean; - float std = var <= 0 ? 1.0 : sqrt(var); - - vmin = mean - std * rs_arg; - vmax = mean + std * rs_arg; - } else if (rs == ScalarQuantizer::RS_quantiles) { - std::vector x_copy(n); - memcpy(x_copy.data(), x, n * sizeof(*x)); - // TODO just do a quickselect - std::sort(x_copy.begin(), x_copy.end()); - int o = int(rs_arg * n); - if (o < 0) - o = 0; - if (o > n - o) - o = n / 2; - vmin = x_copy[o]; - vmax = x_copy[n - 1 - o]; - - } else if (rs == ScalarQuantizer::RS_optim) { - float a, b; - float sx = 0; - { - vmin = HUGE_VAL, vmax = -HUGE_VAL; - for (size_t i = 0; i < n; i++) { - if (x[i] < vmin) - vmin = x[i]; - if (x[i] > vmax) - vmax = x[i]; - sx += x[i]; - } - b = vmin; - a = (vmax - vmin) / (k - 1); - } - int verbose = false; - int niter = 2000; - float last_err = -1; - int iter_last_err = 0; - for (int it = 0; it < niter; it++) { - float sn = 0, sn2 = 0, sxn = 0, err1 = 0; - - for (idx_t i = 0; i < n; i++) { - float xi = x[i]; - float ni = floor((xi - b) / a + 0.5); - if (ni < 0) - ni = 0; - if (ni >= k) - ni = k - 1; - err1 += sqr(xi - (ni * a + b)); - sn += ni; - sn2 += ni * ni; - sxn += ni * xi; - } - - if (err1 == last_err) { - iter_last_err++; - if (iter_last_err == 16) - break; - } else { - last_err = err1; - iter_last_err = 0; - } - - float det = sqr(sn) - sn2 * n; - - b = (sn * sxn - sn2 * sx) / det; - a = (sn * sx - n * sxn) 
/ det; - if (verbose) { - printf("it %d, err1=%g \r", it, err1); - fflush(stdout); - } - } - if (verbose) - printf("\n"); - - vmin = b; - vmax = b + a * (k - 1); - - } else { - FAISS_THROW_MSG("Invalid qtype"); - } - vmax -= vmin; -} - -void train_NonUniform( - RangeStat rs, - float rs_arg, - idx_t n, - int d, - int k, - const float* x, - std::vector& trained) { - trained.resize(2 * d); - float* vmin = trained.data(); - float* vmax = trained.data() + d; - if (rs == ScalarQuantizer::RS_minmax) { - memcpy(vmin, x, sizeof(*x) * d); - memcpy(vmax, x, sizeof(*x) * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - if (xi[j] < vmin[j]) - vmin[j] = xi[j]; - if (xi[j] > vmax[j]) - vmax[j] = xi[j]; - } - } - float* vdiff = vmax; - for (size_t j = 0; j < d; j++) { - float vexp = (vmax[j] - vmin[j]) * rs_arg; - vmin[j] -= vexp; - vmax[j] += vexp; - vdiff[j] = vmax[j] - vmin[j]; - } - } else { - // transpose - std::vector xt(n * d); - for (size_t i = 1; i < n; i++) { - const float* xi = x + i * d; - for (size_t j = 0; j < d; j++) { - xt[j * n + i] = xi[j]; - } - } - std::vector trained_d(2); -#pragma omp parallel for - for (int j = 0; j < d; j++) { - train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); - vmin[j] = trained_d[0]; - vmax[j] = trained_d[1]; - } - } -} - -/******************************************************************* - * Similarity: gets vector components and computes a similarity wrt. a - * query vector stored in the object. The data fields just encapsulate - * an accumulator. 
- */ - -template -struct SimilarityL2 {}; - -template <> -struct SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -#if defined(__AVX512F__) - -template <> -struct SimilarityL2<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - __m512 tmp = _mm512_sub_ps(yiv, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x, __m512 y_2) { - __m512 tmp = _mm512_sub_ps(y_2, x); - accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - // performs better than dividing into _mm256 and adding - return _mm512_reduce_add_ps(accu16); - } -}; - -#elif defined(__AVX2__) - -template <> -struct SimilarityL2<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - - explicit SimilarityL2(const float* y) : y(y) {} - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - __m256 tmp = _mm256_sub_ps(yiv, x); - accu8 
= _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) { - __m256 tmp = _mm256_sub_ps(y_2, x); - accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; - -#endif - -#ifdef USE_NEON -template <> -struct SimilarityL2<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - - const float *y, *yi; - explicit SimilarityL2(const float* y) : y(y) {} - float32x4x2_t accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; - - float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x, - float32x4x2_t y) { - float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); - float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); - - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - - accu8 = {accu8_0, accu8_1}; - } - - FAISS_ALWAYS_INLINE float result_8() { - float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]); - float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]); - - float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); - float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); - return vgetq_lane_f32(sum2_0, 0) + 
vgetq_lane_f32(sum2_1, 0); - } -}; -#endif - -template -struct SimilarityIP {}; - -template <> -struct SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - FAISS_ALWAYS_INLINE void begin() { - accu = 0; - yi = y; - } - - FAISS_ALWAYS_INLINE void add_component(float x) { - accu += *yi++ * x; - } - - FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { - accu += x1 * x2; - } - - FAISS_ALWAYS_INLINE float result() { - return accu; - } -}; - -#if defined(__AVX512F__) - -template <> -struct SimilarityIP<16> { - static constexpr int simdwidth = 16; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - __m512 accu16; - - FAISS_ALWAYS_INLINE void begin_16() { - accu16 = _mm512_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { - __m512 yiv = _mm512_loadu_ps(yi); - yi += 16; - accu16 = _mm512_fmadd_ps(yiv, x, accu16); - } - - FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x1, __m512 x2) { - accu16 = _mm512_fmadd_ps(x1, x2, accu16); - } - - FAISS_ALWAYS_INLINE float result_16() { - // performs better than dividing into _mm256 and adding - return _mm512_reduce_add_ps(accu16); - } -}; - -#elif defined(__AVX2__) - -template <> -struct SimilarityIP<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - - const float *y, *yi; - - float accu; - - explicit SimilarityIP(const float* y) : y(y) {} - - __m256 accu8; - - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { - __m256 yiv = _mm256_loadu_ps(yi); - yi += 8; - accu8 = _mm256_fmadd_ps(yiv, x, accu8); - } - - FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 
x2) { - accu8 = _mm256_fmadd_ps(x1, x2, accu8); - } - - FAISS_ALWAYS_INLINE float result_8() { - const __m128 sum = _mm_add_ps( - _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); - const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(sum, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); - } -}; -#endif - -#ifdef USE_NEON - -template <> -struct SimilarityIP<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; +#include - const float *y, *yi; +#include - explicit SimilarityIP(const float* y) : y(y) {} - float32x4x2_t accu8; +#include +#include - FAISS_ALWAYS_INLINE void begin_8() { - accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; - yi = y; - } +#include - FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { - float32x4x2_t yiv = vld1q_f32_x2(yi); - yi += 8; +#include - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - accu8 = {accu8_0, accu8_1}; - } +#include +#include +#include +#include +#include - FAISS_ALWAYS_INLINE void add_8_components_2( - float32x4x2_t x1, - float32x4x2_t x2) { - float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); - float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - accu8 = {accu8_0, accu8_1}; - } +/******************************************************************* + * ScalarQuantizer implementation + * + * The main source of complexity is to support combinations of 4 + * variants without incurring runtime tests or virtual function calls: + * + * - 4 / 6 / 8 bits per code component + * - uniform / non-uniform + * - IP / L2 distance search + * - scalar / SIMD distance computation + * + * The appropriate Quantizer object is returned via select_quantizer + * that hides the template mess. 
+ ********************************************************************/ - FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum = { - vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1])}; +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). + */ - float32x4x2_t sum2 = { - vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1])}; - return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); - } -}; -#endif +#include /******************************************************************* - * DistanceComputer: combines a similarity and a quantizer to do - * code-to-vector or code-to-code comparisons + * Quantizer: normalizes scalar vector components, then passes them + * through a codec *******************************************************************/ -template -struct DCTemplate : SQDistanceComputer {}; - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * 
code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#if defined(USE_AVX512_F16C) - -template -struct DCTemplate - : SQDistanceComputer { // Update to handle 16 lanes - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 xi = quant.reconstruct_16_components(code, i); - sim.add_16_components(xi); - } - return sim.result_16(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_16(); - for (size_t i = 0; i < quant.d; i += 16) { - __m512 x1 = quant.reconstruct_16_components(code1, i); - __m512 x2 = quant.reconstruct_16_components(code2, i); - sim.add_16_components_2(x1, x2); - } - return sim.result_16(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#elif defined(USE_F16C) - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - 
__m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct DCTemplate : SQDistanceComputer { - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector& trained) - : quant(d, trained) {} - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - float32x4x2_t x1 = quant.reconstruct_8_components(code1, i); - float32x4x2_t x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query(const float* x) final { - q = x; - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_distance(q, code); - } -}; -#endif +#include /******************************************************************* - * DistanceComputerByte: computes distances in the integer domain + * Similarity: gets vector components and computes a similarity wrt. a + * query vector stored in the object. The data fields just encapsulate + * an accumulator. 
+ * DistanceComputer: combines a similarity and a quantizer to do + * code-to-vector or code-to-code comparisons *******************************************************************/ -template -struct DistanceComputerByte : SQDistanceComputer {}; - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#if defined(__AVX512F__) - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - __m512i accu = _mm512_setzero_si512(); - for (int i = 0; i < d; i += 32) { // Process 32 bytes at a time - __m512i c1 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code1 + i))); - __m512i c2 = _mm512_cvtepu8_epi16( - _mm256_loadu_si256((__m256i*)(code2 + i))); - __m512i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm512_madd_epi16(c1, c2); - } else { - __m512i diff = _mm512_sub_epi16(c1, c2); - prod32 = 
_mm512_madd_epi16(diff, diff); - } - accu = _mm512_add_epi32(accu, prod32); - } - // Horizontally add elements of accu - return _mm512_reduce_add_epi32(accu); - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#elif defined(__AVX2__) - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256(); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32(accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - return _mm_cvtsi128_si32(sum); - } - - void set_query(const float* x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int 
compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#endif - -#ifdef USE_NEON - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query(const float* x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - float symmetric_dis(idx_t i, idx_t j) override { - return compute_code_distance( - codes + i * code_size, codes + j * code_size); - } - - float query_to_code(const uint8_t* code) const final { - return compute_code_distance(tmp.data(), code); - } -}; - -#endif +#include /******************************************************************* - * select_distance_computer: runtime selection of template - * specialization + * InvertedListScanner: scans series of codes and keeps the best ones *******************************************************************/ -template -SQDistanceComputer* select_distance_computer( - QuantizerType qtype, - size_t d, - const std::vector& trained) { - constexpr int SIMDWIDTH = Sim::simdwidth; - switch (qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return new DCTemplate< - QuantizerTemplate< - 
Codec8bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit_uniform: - return new DCTemplate< - QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_8bit: - return new DCTemplate< - QuantizerTemplate< - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_6bit: - return new DCTemplate< - QuantizerTemplate< - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_4bit: - return new DCTemplate< - QuantizerTemplate< - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM, - SIMDWIDTH>, - Sim, - SIMDWIDTH>(d, trained); - - case ScalarQuantizer::QT_fp16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_bf16: - return new DCTemplate, Sim, SIMDWIDTH>( - d, trained); - - case ScalarQuantizer::QT_8bit_direct: -#if defined(__AVX512F__) - if (d % 32 == 0) { - return new DistanceComputerByte(d, trained); - } else -#elif defined(__AVX2__) - if (d % 16 == 0) { - return new DistanceComputerByte(d, trained); - } else -#endif - { - return new DCTemplate< - Quantizer8bitDirect, - Sim, - SIMDWIDTH>(d, trained); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return new DCTemplate< - Quantizer8bitDirectSigned, - Sim, - SIMDWIDTH>(d, trained); - } - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} +#include -} // anonymous namespace +namespace faiss { +using namespace scalar_quantizer; /******************************************************************* * ScalarQuantizer implementation @@ -2046,18 +152,19 @@ void ScalarQuantizer::train(size_t n, const float* x) { } ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - return select_quantizer_1<16>(qtype, d, trained); 
+ // here we can't just dispatch because the SIMD code works only on certain + // vector sizes +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return select_quantizer_1(qtype, d, trained); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - return select_quantizer_1<8>(qtype, d, trained); +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return select_quantizer_1(qtype, d, trained); } else #endif - { - return select_quantizer_1<1>(qtype, d, trained); - } + return select_quantizer_1(qtype, d, trained); } void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) @@ -2080,33 +187,20 @@ void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { SQDistanceComputer* ScalarQuantizer::get_distance_computer( MetricType metric) const { - FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - if (metric == METRIC_L2) { - return select_distance_computer>( - qtype, d, trained); - } else { - return select_distance_computer>( - qtype, d, trained); - } +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return select_distance_computer_1( + metric, qtype, d, trained); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, d, trained); - } else { - return select_distance_computer>(qtype, d, trained); - } +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return select_distance_computer_1( + metric, qtype, d, trained); } else #endif - { - if (metric == METRIC_L2) { - return select_distance_computer>(qtype, d, trained); - } else { - return select_distance_computer>(qtype, d, trained); - } - } + return select_distance_computer_1( + metric, qtype, d, trained); } 
/******************************************************************* @@ -2116,366 +210,26 @@ SQDistanceComputer* ScalarQuantizer::get_distance_computer( * IndexScalarQuantizer as well. ********************************************************************/ -namespace { - -template -struct IVFSQScannerIP : InvertedListScanner { - DCClass dc; - bool by_residual; - - float accu0; /// added to all distances - - IVFSQScannerIP( - int d, - const std::vector& trained, - size_t code_size, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), by_residual(by_residual), accu0(0) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - this->keep_max = true; - } - - void set_query(const float* query) override { - dc.set_query(query); - } - - void set_list(idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? coarse_dis : 0; - } - - float distance_to_code(const uint8_t* code) const final { - return accu0 + dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float accu = accu0 + dc.query_to_code(codes); - - if (accu > simi[0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - minheap_replace_top(k, simi, idxi, accu, id); - nup++; - } - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float accu = accu0 + dc.query_to_code(codes); - if (accu > radius) { - int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; - res.add(accu, id); - } - } - } -}; - -/* use_sel = 0: don't check selector - * = 1: check on ids[j] - * = 2: check in j directly (normally ids is nullptr and store_pairs) - */ -template -struct IVFSQScannerL2 : InvertedListScanner { - DCClass dc; - - bool by_residual; - const Index* quantizer; - const float* x; /// current query - - std::vector tmp; - - IVFSQScannerL2( - int d, - const std::vector& trained, - size_t code_size, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) - : dc(d, trained), - by_residual(by_residual), - quantizer(quantizer), - x(nullptr), - tmp(d) { - this->store_pairs = store_pairs; - this->sel = sel; - this->code_size = code_size; - } - - void set_query(const float* query) override { - x = query; - if (!quantizer) { - dc.set_query(query); - } - } - - void set_list(idx_t list_no, float /*coarse_dis*/) override { - this->list_no = list_no; - if (by_residual) { - // shift of x_in wrt centroid - quantizer->compute_residual(x, tmp.data(), list_no); - dc.set_query(tmp.data()); - } else { - dc.set_query(x); - } - } - - float distance_to_code(const uint8_t* code) const final { - return dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { - continue; - } - - float dis = dc.query_to_code(codes); - - if (dis < simi[0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++, codes += code_size) { - if (use_sel && !sel->is_member(use_sel == 1 ? 
ids[j] : j)) { - continue; - } - - float dis = dc.query_to_code(codes); - if (dis < radius) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - res.add(dis, id); - } - } - } -}; - -template -InvertedListScanner* sel3_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2( - sq->d, - sq->trained, - sq->code_size, - quantizer, - store_pairs, - sel, - r); - } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP( - sq->d, sq->trained, sq->code_size, store_pairs, sel, r); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -template -InvertedListScanner* sel2_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - if (sel) { - if (store_pairs) { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } - } else { - return sel3_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); - } -} - -template -InvertedListScanner* sel12_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate; - using DCClass = DCTemplate; - return sel2_InvertedListScanner( - sq, quantizer, store_pairs, sel, r); -} - -template -InvertedListScanner* sel1_InvertedListScanner( - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool r) { - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch (sq->qtype) { - case ScalarQuantizer::QT_8bit_uniform: - return sel12_InvertedListScanner< - Similarity, - Codec8bit, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, 
r); - case ScalarQuantizer::QT_4bit_uniform: - return sel12_InvertedListScanner< - Similarity, - Codec4bit, - QuantizerTemplateScaling::UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit: - return sel12_InvertedListScanner< - Similarity, - Codec8bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_4bit: - return sel12_InvertedListScanner< - Similarity, - Codec4bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_6bit: - return sel12_InvertedListScanner< - Similarity, - Codec6bit, - QuantizerTemplateScaling::NON_UNIFORM>( - sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_fp16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_bf16: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - case ScalarQuantizer::QT_8bit_direct: -#if defined(__AVX512F__) - if (sq->d % 32 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else -#elif defined(__AVX2__) - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner< - DistanceComputerByte>( - sq, quantizer, store_pairs, sel, r); - } else -#endif - { - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - case ScalarQuantizer::QT_8bit_direct_signed: - return sel2_InvertedListScanner, - Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); - } - - FAISS_THROW_MSG("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner( - MetricType mt, - const ScalarQuantizer* sq, - const Index* quantizer, - bool store_pairs, - const IDSelector* sel, - bool by_residual) { - if (mt == METRIC_L2) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else if (mt == 
METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, sel, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -} // anonymous namespace - InvertedListScanner* ScalarQuantizer::select_InvertedListScanner( MetricType mt, const Index* quantizer, bool store_pairs, const IDSelector* sel, bool by_residual) const { -#if defined(USE_AVX512_F16C) - if (d % 16 == 0) { - return sel0_InvertedListScanner<16>( +#ifdef COMPILE_SIMD_AVX512 + if (d % 16 == 0 && SIMDConfig::level >= SIMDLevel::AVX512F) { + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); } else -#elif defined(USE_F16C) || defined(USE_NEON) - if (d % 8 == 0) { - return sel0_InvertedListScanner<8>( +#endif +#ifdef COMPILE_SIMD_AVX2 + if (d % 8 == 0 && SIMDConfig::level >= SIMDLevel::AVX2) { + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); } else #endif - { - return sel0_InvertedListScanner<1>( + return sel0_InvertedListScanner( mt, this, quantizer, store_pairs, sel, by_residual); - } } } // namespace faiss diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h index c1f4f98f63..279938443a 100644 --- a/faiss/impl/ScalarQuantizer.h +++ b/faiss/impl/ScalarQuantizer.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #pragma once #include diff --git a/faiss/impl/code_distance/code_distance-avx2.cpp b/faiss/impl/code_distance/code_distance-avx2.cpp new file mode 100644 index 0000000000..b140b9ece0 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-avx2.cpp @@ -0,0 +1,486 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 +#if defined(__GNUC__) && __GNUC__ < 9 +#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) +#endif + +namespace { + +inline float horizontal_sum(const __m128 v) { + const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(v, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); +} + +// Computes a horizontal sum over an __m256 register +inline float horizontal_sum(const __m256 v) { + const __m128 v0 = + _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + return horizontal_sum(v0); +} + +// processes a single code for M=4, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m128 partialSum; + + // load 4 uint8 values + const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes a single code for M=8, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const 
float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum; + + // load 8 uint8 values + const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes four codes for M=4, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m128 partialSums[N]; + + // load 4 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0)); + mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1)); + mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2)); + mm1[3] = _mm_cvtsi32_si128(*((const int32_t*)code3)); + + for (intptr_t 
j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 4 values, similar to 4 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +// processes four codes for M=8, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + + // load 8 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si64((const __m128i_u*)code0); + mm1[1] = _mm_loadu_si64((const __m128i_u*)code1); + mm1[2] = _mm_loadu_si64((const __m128i_u*)code2); + mm1[3] = _mm_loadu_si64((const __m128i_u*)code3); + + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 
collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +} // namespace + +namespace faiss { + +template <> +struct PQCodeDistance { + float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + if (M == 4) { + return distance_single_code_avx2_pqdecoder8_m4(sim_table, code); + } + if (M == 8) { + return distance_single_code_avx2_pqdecoder8_m8(sim_table, code); + } + + float result = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum = _mm256_setzero_ps(); + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + const __m128i mm1 = + _mm_loadu_si128((const __m128i_u*)(code + m)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1, 
_mm_setzero_si128()); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + } + + // horizontal sum for partialSum + result += horizontal_sum(partialSum); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder(code + m, nbits); + + for (; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + } + + return result; + } + + // Combines 4 operations of distance_single_code() + void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + if (M == 4) { + distance_four_codes_avx2_pqdecoder8_m4( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + if (M == 8) { + distance_four_codes_avx2_pqdecoder8_m8( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + 
offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm256_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + tab += ksub * 8; + + // process next 8 codes + for (intptr_t j = 0; j < N; j++) { + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); + + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + + tab += ksub * 8; + } + + // horizontal sum for partialSum + result0 += horizontal_sum(partialSums[0]); + result1 += horizontal_sum(partialSums[1]); + result2 += horizontal_sum(partialSums[2]); + result3 += horizontal_sum(partialSums[3]); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 
decoder0(code0 + m, nbits); + PQDecoder8 decoder1(code1 + m, nbits); + PQDecoder8 decoder2(code2 + m, nbits); + PQDecoder8 decoder3(code3 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } + } +}; + +// explicit template instantiations +// template struct PQCodeDistance; + +// these two will automatically use the generic implementation +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-avx2.h b/faiss/impl/code_distance/code_distance-avx2.h deleted file mode 100644 index 53380b6e46..0000000000 --- a/faiss/impl/code_distance/code_distance-avx2.h +++ /dev/null @@ -1,534 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree.
- */ - -#pragma once - -#ifdef __AVX2__ - -#include - -#include - -#include -#include - -// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 -#if defined(__GNUC__) && __GNUC__ < 9 -#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) -#endif - -namespace { - -inline float horizontal_sum(const __m128 v) { - const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2)); - const __m128 v1 = _mm_add_ps(v, v0); - __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); - const __m128 v3 = _mm_add_ps(v1, v2); - return _mm_cvtss_f32(v3); -} - -// Computes a horizontal sum over an __m256 register -inline float horizontal_sum(const __m256 v) { - const __m128 v0 = - _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); - return horizontal_sum(v0); -} - -// processes a single code for M=4, ksub=256, nbits=8 -float inline distance_single_code_avx2_pqdecoder8_m4( - // precomputed distances, layout (4, 256) - const float* sim_table, - const uint8_t* code) { - float result = 0; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - const __m128i vksub = _mm_set1_epi32(ksub); - __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); - offsets_0 = _mm_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m128 partialSum; - - // load 4 uint8 values - const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code)); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m128i idx1 = _mm_cvtepu8_epi32(mm1); - - // add offsets - const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m128 collected = - _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSum = collected; - } - - // horizontal sum for partialSum - result = horizontal_sum(partialSum); - return result; -} - -// processes a single code for M=8, ksub=256, nbits=8 -float inline distance_single_code_avx2_pqdecoder8_m8( - // 
precomputed distances, layout (8, 256) - const float* sim_table, - const uint8_t* code) { - float result = 0; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSum; - - // load 8 uint8 values - const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); - - // add offsets - const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = - _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSum = collected; - } - - // horizontal sum for partialSum - result = horizontal_sum(partialSum); - return result; -} - -// processes four codes for M=4, ksub=256, nbits=8 -inline void distance_four_codes_avx2_pqdecoder8_m4( - // precomputed distances, layout (4, 256) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - constexpr intptr_t N = 4; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - // process 8 values - const __m128i vksub = _mm_set1_epi32(ksub); - __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); - offsets_0 = _mm_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m128 partialSums[N]; - - // load 4 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0)); - mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1)); - mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2)); - mm1[3] = 
_mm_cvtsi32_si128(*((const int32_t*)code3)); - - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); - - // gather 4 values, similar to 4 operations of tab[idx] - __m128 collected = - _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = collected; - } - - // horizontal sum for partialSum - result0 = horizontal_sum(partialSums[0]); - result1 = horizontal_sum(partialSums[1]); - result2 = horizontal_sum(partialSums[2]); - result3 = horizontal_sum(partialSums[3]); -} - -// processes four codes for M=8, ksub=256, nbits=8 -inline void distance_four_codes_avx2_pqdecoder8_m8( - // precomputed distances, layout (8, 256) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - constexpr intptr_t N = 4; - - const float* tab = sim_table; - constexpr size_t ksub = 1 << 8; - - // process 8 values - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSums[N]; - - // load 8 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si64((const __m128i_u*)code0); - mm1[1] = _mm_loadu_si64((const __m128i_u*)code1); - mm1[2] = _mm_loadu_si64((const __m128i_u*)code2); - mm1[3] = _mm_loadu_si64((const __m128i_u*)code3); - - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); - - 
// gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = - _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = collected; - } - - // horizontal sum for partialSum - result0 = horizontal_sum(partialSums[0]); - result1 = horizontal_sum(partialSums[1]); - result2 = horizontal_sum(partialSums[2]); - result3 = horizontal_sum(partialSums[3]); -} - -} // namespace - -namespace faiss { - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - // default implementation - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - if (M == 4) { - return distance_single_code_avx2_pqdecoder8_m4(sim_table, code); - } - if (M == 8) { - return distance_single_code_avx2_pqdecoder8_m8(sim_table, code); - } - - float result = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSum = _mm256_setzero_ps(); - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - const __m128i mm1 = _mm_loadu_si128((const __m128i_u*)(code + m)); - { - // convert uint8 values (low part of __m128i) to int32 - // values - 
const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - - // move high 8 uint8 to low ones - const __m128i mm2 = _mm_unpackhi_epi64(mm1, _mm_setzero_si128()); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - } - - // horizontal sum for partialSum - result += horizontal_sum(partialSum); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder(code + m, nbits); - - for (; m < M; m++) { - result += tab[decoder.decode()]; - tab += ksub; - } - } - - return result; -} - -template -typename std::enable_if::value, void>:: - type - distance_four_codes_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -// Combines 4 operations of distance_single_code() -template -typename 
std::enable_if::value, void>::type -distance_four_codes_avx2( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - if (M == 4) { - distance_four_codes_avx2_pqdecoder8_m4( - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); - return; - } - if (M == 8) { - distance_four_codes_avx2_pqdecoder8_m8( - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); - return; - } - - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 4; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m256i vksub = _mm256_set1_epi32(ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m256 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = _mm256_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); - mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); - mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, 
offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm256_add_ps(partialSums[j], collected); - } - tab += ksub * 8; - - // process next 8 codes - for (intptr_t j = 0; j < N; j++) { - // move high 8 uint8 to low ones - const __m128i mm2 = - _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); - - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm256_add_ps(partialSums[j], collected); - } - - tab += ksub * 8; - } - - // horizontal sum for partialSum - result0 += horizontal_sum(partialSums[0]); - result1 += horizontal_sum(partialSums[1]); - result2 += horizontal_sum(partialSums[2]); - result3 += horizontal_sum(partialSums[3]); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - PQDecoder8 decoder1(code1 + m, nbits); - PQDecoder8 decoder2(code2 + m, nbits); - PQDecoder8 decoder3(code3 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } - } -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/code_distance/code_distance-avx512.cpp b/faiss/impl/code_distance/code_distance-avx512.cpp new file mode 100644 index 0000000000..891db07975 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-avx512.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +// According to experiments, the AVX-512 version may be SLOWER than +// the AVX2 version, which is somewhat unexpected. +// This version is not used for now, but it may be used later. +// +// TODO: test for AMD CPUs. + +namespace faiss { + +template <> +struct PQCodeDistance { + float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code0) { + float result0 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 1; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m512i vksub = _mm512_set1_epi32(ksub); + __m512i offsets_0 = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m512 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm512_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m512i indices_to_read_from = + _mm512_add_epi32(idx1, offsets_0); + + // gather 16 values, similar to 16 operations of tab[idx] + __m512 collected = _mm512_i32gather_ps( + indices_to_read_from, tab, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm512_add_ps(partialSums[j], collected); + } + tab += ksub * 16; + } + + // horizontal sum for partialSum + result0 += _mm512_reduce_add_ps(partialSums[0]); + } + + // 
+ if (m < M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + tab += ksub; + } + } + + return result0; + } + + void distance_four_codes_avx512( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m512i vksub = _mm512_set1_epi32(ksub); + __m512i offsets_0 = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m512 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm512_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m512i indices_to_read_from = + _mm512_add_epi32(idx1, offsets_0); + + // gather 16 values, similar to 16 operations of tab[idx] + __m512 collected = _mm512_i32gather_ps( + indices_to_read_from, tab, sizeof(float)); + + // collect partial sums + partialSums[j] = 
_mm512_add_ps(partialSums[j], collected); + } + tab += ksub * 16; + } + + // horizontal sum for partialSum + result0 += _mm512_reduce_add_ps(partialSums[0]); + result1 += _mm512_reduce_add_ps(partialSums[1]); + result2 += _mm512_reduce_add_ps(partialSums[2]); + result3 += _mm512_reduce_add_ps(partialSums[3]); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, nbits); + PQDecoder8 decoder1(code1 + m, nbits); + PQDecoder8 decoder2(code2 + m, nbits); + PQDecoder8 decoder3(code3 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } + } +}; + +// explicit template instanciations +// template struct PQCodeDistance; + +// these two will automatically use the generic implementation +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-avx512.h b/faiss/impl/code_distance/code_distance-avx512.h deleted file mode 100644 index d05c41c19c..0000000000 --- a/faiss/impl/code_distance/code_distance-avx512.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#ifdef __AVX512F__ - -#include - -#include - -#include -#include - -namespace faiss { - -// According to experiments, the AVX-512 version may be SLOWER than -// the AVX2 version, which is somewhat unexpected. -// This version is not used for now, but it may be used later. -// -// TODO: test for AMD CPUs. 
- -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code) { - // default implementation - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -typename std::enable_if::value, float>:: - type inline distance_single_code_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - const uint8_t* code0) { - float result0 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 1; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m512i vksub = _mm512_set1_epi32(ksub); - __m512i offsets_0 = _mm512_setr_epi32( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m512 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = _mm512_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m512i indices_to_read_from = - _mm512_add_epi32(idx1, offsets_0); - - // gather 16 values, similar to 16 operations of tab[idx] - __m512 collected = _mm512_i32gather_ps( - indices_to_read_from, tab, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm512_add_ps(partialSums[j], collected); - } - tab += ksub * 16; - } - - // horizontal sum for partialSum - result0 += _mm512_reduce_add_ps(partialSums[0]); - } - 
- // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - tab += ksub; - } - } - - return result0; -} - -template -typename std::enable_if::value, void>:: - type - distance_four_codes_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -// Combines 4 operations of distance_single_code() -template -typename std::enable_if::value, void>::type -distance_four_codes_avx512( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - constexpr size_t ksub = 1 << 8; - - size_t m = 0; - const size_t pqM16 = M / 16; - - constexpr intptr_t N = 4; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - const __m512i vksub = _mm512_set1_epi32(ksub); - __m512i offsets_0 = _mm512_setr_epi32( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - offsets_0 = _mm512_mullo_epi32(offsets_0, vksub); - - // accumulators of partial sums - __m512 partialSums[N]; - for (intptr_t j = 0; j < N; j++) { - partialSums[j] = 
_mm512_setzero_ps(); - } - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - __m128i mm1[N]; - mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); - mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); - mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); - mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); - - // process first 8 codes - for (intptr_t j = 0; j < N; j++) { - const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); - - // add offsets - const __m512i indices_to_read_from = - _mm512_add_epi32(idx1, offsets_0); - - // gather 16 values, similar to 16 operations of tab[idx] - __m512 collected = _mm512_i32gather_ps( - indices_to_read_from, tab, sizeof(float)); - - // collect partial sums - partialSums[j] = _mm512_add_ps(partialSums[j], collected); - } - tab += ksub * 16; - } - - // horizontal sum for partialSum - result0 += _mm512_reduce_add_ps(partialSums[0]); - result1 += _mm512_reduce_add_ps(partialSums[1]); - result2 += _mm512_reduce_add_ps(partialSums[2]); - result3 += _mm512_reduce_add_ps(partialSums[3]); - } - - // - if (m < M) { - // process leftovers - PQDecoder8 decoder0(code0 + m, nbits); - PQDecoder8 decoder1(code1 + m, nbits); - PQDecoder8 decoder2(code2 + m, nbits); - PQDecoder8 decoder3(code3 + m, nbits); - for (; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } - } -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/code_distance/code_distance-generic.cpp b/faiss/impl/code_distance/code_distance-generic.cpp new file mode 100644 index 0000000000..ac9561ed93 --- /dev/null +++ b/faiss/impl/code_distance/code_distance-generic.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +// explicit template instanciations +template struct PQCodeDistance; +template struct PQCodeDistance; +template struct PQCodeDistance; + +} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-generic.h b/faiss/impl/code_distance/code_distance-generic.h deleted file mode 100644 index c02551c415..0000000000 --- a/faiss/impl/code_distance/code_distance-generic.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace faiss { - -/// Returns the distance to a single code. -template -inline float distance_single_code_generic( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - PQDecoderT decoder(code, nbits); - const size_t ksub = 1 << nbits; - - const float* tab = sim_table; - float result = 0; - - for (size_t m = 0; m < M; m++) { - result += tab[decoder.decode()]; - tab += ksub; - } - - return result; -} - -/// Combines 4 operations of distance_single_code() -/// General-purpose version. 
-template -inline void distance_four_codes_generic( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - PQDecoderT decoder0(code0, nbits); - PQDecoderT decoder1(code1, nbits); - PQDecoderT decoder2(code2, nbits); - PQDecoderT decoder3(code3, nbits); - const size_t ksub = 1 << nbits; - - const float* tab = sim_table; - result0 = 0; - result1 = 0; - result2 = 0; - result3 = 0; - - for (size_t m = 0; m < M; m++) { - result0 += tab[decoder0.decode()]; - result1 += tab[decoder1.decode()]; - result2 += tab[decoder2.decode()]; - result3 += tab[decoder3.decode()]; - tab += ksub; - } -} - -} // namespace faiss diff --git a/faiss/impl/code_distance/code_distance-sve.h b/faiss/impl/code_distance/code_distance-sve.cpp similarity index 99% rename from faiss/impl/code_distance/code_distance-sve.h rename to faiss/impl/code_distance/code_distance-sve.cpp index 82f7746be6..9a941798ff 100644 --- a/faiss/impl/code_distance/code_distance-sve.h +++ b/faiss/impl/code_distance/code_distance-sve.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -#pragma once - #ifdef __ARM_FEATURE_SVE #include @@ -15,7 +13,7 @@ #include #include -#include +#include namespace faiss { diff --git a/faiss/impl/code_distance/code_distance.h b/faiss/impl/code_distance/code_distance.h index 8f29abda97..585890cb40 100644 --- a/faiss/impl/code_distance/code_distance.h +++ b/faiss/impl/code_distance/code_distance.h @@ -9,6 +9,10 @@ #include +#include + +#include + // This directory contains functions to compute a distance // from a given PQ code to a query vector, given that the // distances to a query vector for pq.M codebooks are precomputed. @@ -24,163 +28,76 @@ // why the names of the functions for custom implementations // have this _generic or _avx2 suffix. -#ifdef __AVX2__ - -#include - namespace faiss { -template -inline float distance_single_code( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_avx2(M, nbits, sim_table, code); -} - -template -inline void distance_four_codes( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_avx2( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} +// definiton and default implementation +template +struct PQCodeDistance { + using PQDecoder = PQDecoderT; + + /// Returns the distance to a single code. 
+ static float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + PQDecoderT decoder(code, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + float result = 0; + + for (size_t m = 0; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + + return result; + } + + /// Combines 4 operations of distance_single_code() + /// General-purpose version. + static void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + PQDecoderT decoder0(code0, nbits); + PQDecoderT decoder1(code1, nbits); + PQDecoderT decoder2(code2, nbits); + PQDecoderT decoder3(code3, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + + for (size_t m = 0; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } +}; } // namespace faiss - -#elif defined(__ARM_FEATURE_SVE) - -#include - -namespace faiss { - -template -inline float distance_single_code( - // the product quantizer - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_sve(M, nbits, sim_table, code); -} - -template -inline void 
distance_four_codes( - // the product quantizer - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_sve( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -} // namespace faiss - -#else - -#include - -namespace faiss { - -template -inline float distance_single_code( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // the code - const uint8_t* code) { - return distance_single_code_generic(M, nbits, sim_table, code); -} - -template -inline void distance_four_codes( - // number of subquantizers - const size_t M, - // number of bits per quantization index - const size_t nbits, - // precomputed distances, layout (M, ksub) - const float* sim_table, - // codes - const uint8_t* __restrict code0, - const uint8_t* __restrict code1, - const uint8_t* __restrict code2, - const uint8_t* __restrict code3, - // computed distances - float& result0, - float& result1, - float& result2, - float& result3) { - distance_four_codes_generic( - M, - nbits, - sim_table, - code0, - code1, - code2, - code3, - result0, - result1, - result2, - result3); -} - -} // namespace faiss - -#endif diff --git a/faiss/impl/scalar_quantizer/codecs.h b/faiss/impl/scalar_quantizer/codecs.h new file mode 100644 index 0000000000..b5c20d464b --- /dev/null +++ b/faiss/impl/scalar_quantizer/codecs.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). + */ + +template +struct Codec8bit {}; + +template +struct Codec4bit {}; + +template +struct Codec6bit {}; + +template <> +struct Codec8bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + code[i] = (int)(255 * x); + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + return (code[i] + 0.5f) / 255.0f; + } +}; +template <> +struct Codec4bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; + } +}; + +template <> +struct Codec6bit { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { + int bits = (int)(x * 63.0); + code += (i >> 2) * 3; + switch (i & 3) { + case 0: + code[0] |= bits; + break; + case 1: + code[0] |= bits << 6; + code[1] |= bits >> 2; + break; + case 2: + code[1] |= bits << 4; + code[2] |= bits >> 4; + break; + case 3: + code[2] |= bits << 2; + break; + } + } + + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { + uint8_t bits; + code += (i >> 2) * 3; + switch (i & 3) { + case 0: + bits = code[0] & 0x3f; + break; + case 1: + bits = code[0] >> 6; + bits |= (code[1] & 0xf) << 2; + break; + case 2: + bits = code[1] >> 4; + bits |= (code[2] & 3) << 4; + break; + case 3: + bits = code[2] >> 2; + break; + } + 
return (bits + 0.5f) / 63.0f; + } +}; + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/distance_computers.h b/faiss/impl/scalar_quantizer/distance_computers.h new file mode 100644 index 0000000000..c50f443ec4 --- /dev/null +++ b/faiss/impl/scalar_quantizer/distance_computers.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * Similarities: accumulates the element-wise similarities + *******************************************************************/ + +template +struct SimilarityL2 {}; + +template +struct SimilarityIP {}; + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::NONE; + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + + /******* scalar accumulator *******/ + + float accu; + + FAISS_ALWAYS_INLINE void begin() { + accu = 0; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_component(float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; + } + + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; + } + + FAISS_ALWAYS_INLINE float result() { + return accu; + } +}; + +template <> +struct SimilarityIP { + static constexpr int simdwidth = 1; + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::NONE; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + FAISS_ALWAYS_INLINE void begin() { + accu = 0; + yi = y; + } + + FAISS_ALWAYS_INLINE void 
add_component(float x) { + accu += *yi++ * x; + } + + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { + accu += x1 * x2; + } + + FAISS_ALWAYS_INLINE float result() { + return accu; + } +}; + +/******************************************************************* + * Distance computers: compute distances between a query and a code + *******************************************************************/ + +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; + +template +struct DCTemplate : SQDistanceComputer {}; + +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float xi = quant.reconstruct_component(code, i); + sim.add_component(xi); + } + return sim.result(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float x1 = quant.reconstruct_component(code1, i); + float x2 = quant.reconstruct_component(code2, i); + sim.add_component_2(x1, x2); + } + return sim.result(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +/******************************************************************* + * DistanceComputerByte: computes distances in the integer domain + *******************************************************************/ + +template +struct DistanceComputerByte : SQDistanceComputer {}; + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector 
tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +/******************************************************************* + * select_distance_computer: runtime selection of template + * specialization + *******************************************************************/ + +template +SQDistanceComputer* select_distance_computer( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + constexpr SIMDLevel SL = Sim::SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + switch (qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return new DCTemplate, U, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_4bit_uniform: + return new DCTemplate, U, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_8bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_6bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_4bit: + return new DCTemplate, NU, SL>, Sim, SL>( + d, trained); + + case ScalarQuantizer::QT_fp16: + return new DCTemplate, Sim, SL>(d, trained); + + case ScalarQuantizer::QT_bf16: + return 
new DCTemplate, Sim, SL>(d, trained); + + case ScalarQuantizer::QT_8bit_direct: + return new DCTemplate, Sim, SL>(d, trained); + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate, Sim, SL>( + d, trained); + } + FAISS_THROW_MSG("unknown qtype"); + return nullptr; +} + +template +SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained) { + if (metric_type == METRIC_L2) { + return select_distance_computer>(qtype, d, trained); + } else if (metric_type == METRIC_INNER_PRODUCT) { + return select_distance_computer>(qtype, d, trained); + } else { + FAISS_THROW_MSG("unsuppored metric type"); + } +} + +// prevent implicit instantiation of the template +extern template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +extern template SQDistanceComputer* select_distance_computer_1< + SIMDLevel::AVX512F>( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-avx2.cpp b/faiss/impl/scalar_quantizer/impl-avx2.cpp new file mode 100644 index 0000000000..a044ad9aaf --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-avx2.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef __F16C__ +#define USE_F16C +#else +#warning \ + "Cannot enable AVX optimizations in scalar quantizer if -mf16c is not set as well" +#endif + +namespace faiss { + +namespace scalar_quantizer { + +/****************************************** Specialization of codecs */ + +template <> +struct Codec8bit : Codec8bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + const uint64_t c8 = *(uint64_t*)(code + i); + + const __m128i i8 = _mm_set1_epi64x(c8); + const __m256i i32 = _mm256_cvtepu8_epi32(i8); + const __m256 f8 = _mm256_cvtepi32_ps(i32); + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); + return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255)); + } +}; + +template <> +struct Codec4bit : Codec4bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + uint32_t c4 = *(uint32_t*)(code + (i >> 1)); + uint32_t mask = 0x0f0f0f0f; + uint32_t c4ev = c4 & mask; + uint32_t c4od = (c4 >> 4) & mask; + + // the 8 lower bytes of c8 contain the values + __m128i c8 = + _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od)); + __m128i c4lo = _mm_cvtepu8_epi32(c8); + __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4)); + __m256i i8 = _mm256_castsi128_si256(c4lo); + i8 = _mm256_insertf128_si256(i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps(i8); + __m256 half = _mm256_set1_ps(0.5f); + f8 = _mm256_add_ps(f8, half); + __m256 one_255 = _mm256_set1_ps(1.f / 15.f); + return simd8float32(_mm256_mul_ps(f8, one_255)); + } +}; + +template <> +struct Codec6bit : Codec6bit { + /* Load 6 bytes that represent 8 6-bit values, return them as a + * 8*32 bit vector register */ + static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { + const __m128i perm = _mm_set_epi8( + -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 
1, 0, -1, 0); + const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); + + // load 6 bytes + __m128i c1 = + _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]); + + // put in 8 * 32 bits + __m128i c2 = _mm_shuffle_epi8(c1, perm); + __m256i c3 = _mm256_cvtepi16_epi32(c2); + + // shift and mask out useless bits + __m256i c4 = _mm256_srlv_epi32(c3, shifts); + __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4); + return c5; + } + + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here + // // for the reference, maybe, it becomes used oned day. + // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); + // const uint32_t* data32 = (const uint32_t*)data16; + // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); + // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); + // const __m128i i8 = _mm_set1_epi64x(vext); + // const __m256i i32 = _mm256_cvtepi8_epi32(i8); + // const __m256 f8 = _mm256_cvtepi32_ps(i32); + // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + // return _mm256_fmadd_ps(f8, one_255, half_one_255); + + __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); + __m256 f8 = _mm256_cvtepi32_ps(i8); + // this could also be done with bit manipulations but it is + // not obviously faster + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255)); + } +}; + +/****************************************** Specialization of quantizers */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m256 xi = Codec::decode_8_components(code, i).f; 
+ return simd8float32(_mm256_fmadd_ps( + xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin))); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m256 xi = Codec::decode_8_components(code, i).f; + return simd8float32(_mm256_fmadd_ps( + xi, + _mm256_loadu_ps(this->vdiff + i), + _mm256_loadu_ps(this->vmin + i))); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + return simd8float32(_mm256_cvtph_ps(codei)); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); + code_256i = _mm256_slli_epi32(code_256i, 16); + return simd8float32(_mm256_castsi256_ps(code_256i)); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : Quantizer8bitDirect(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + return simd8float32(_mm256_cvtepi32_ps(y8)); // 8 * float32 + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : 
Quantizer8bitDirectSigned(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + __m256i c8 = _mm256_set1_epi32(128); + __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes + return simd8float32(_mm256_cvtepi32_ps(z8)); // 8 * float32 + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX2; + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + __m256 yiv = _mm256_loadu_ps(yi); + yi += 8; + __m256 tmp = _mm256_sub_ps(yiv, x.f); + accu8 = simd8float32(_mm256_fmadd_ps(tmp, tmp, accu8.f)); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x, + simd8float32 y_2) { + __m256 tmp = _mm256_sub_ps(y_2.f, x.f); + accu8 = simd8float32(_mm256_fmadd_ps(tmp, tmp, accu8.f)); + } + + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8.f), + _mm256_extractf128_ps(accu8.f, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); + } +}; + +template <> +struct SimilarityIP { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX2; + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + 
simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + __m256 yiv = _mm256_loadu_ps(yi); + yi += 8; + accu8.f = _mm256_fmadd_ps(yiv, x.f, accu8.f); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x1, + simd8float32 x2) { + accu8.f = _mm256_fmadd_ps(x1.f, x2.f, accu8.f); + } + + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8.f), + _mm256_extractf128_ps(accu8.f, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); + } +}; + +/****************************************** Specialization of distance computers + */ + +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 x1 = quant.reconstruct_8_components(code1, i); + simd8float32 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } 
+}; + +template +struct DistanceComputerByte : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + // __m256i accu = _mm256_setzero_ps (); + __m256i accu = _mm256_setzero_si256(); + for (int i = 0; i < d; i += 16) { + // load 16 bytes, convert to 16 uint16_t + __m256i c1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i*)(code1 + i))); + __m256i c2 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i*)(code2 + i))); + __m256i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm256_madd_epi16(c1, c2); + } else { + __m256i diff = _mm256_sub_epi16(c1, c2); + prod32 = _mm256_madd_epi16(diff, diff); + } + accu = _mm256_add_epi32(accu, prod32); + } + __m128i sum = _mm256_extractf128_si256(accu, 0); + sum = _mm_add_epi32(sum, _mm256_extractf128_si256(accu, 1)); + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, sum); + return _mm_cvtsi128_si32(sum); + } + + void set_query(const float* x) final { + /* + for (int i = 0; i < d; i += 8) { + __m256 xi = _mm256_loadu_ps (x + i); + __m256i ci = _mm256_cvtps_epi32(xi); + */ + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +// explicit instantiation + +template ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + 
+template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-avx512.cpp b/faiss/impl/scalar_quantizer/impl-avx512.cpp new file mode 100644 index 0000000000..ad782c61cb --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-avx512.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#if defined(__AVX512F__) && defined(__F16C__) +#define USE_AVX512_F16C +#else +#warning "Wrong compiler flags for AVX512_F16C" +#endif + +namespace faiss { + +namespace scalar_quantizer { + +/******************************** Codec specializations */ + +template <> +struct Codec8bit : Codec8bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i)); + const __m512i i32 = _mm512_cvtepu8_epi32(c16); + const __m512 f16 = _mm512_cvtepi32_ps(i32); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); + return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255)); + } +}; + +template <> +struct Codec4bit : Codec4bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + uint64_t c8 = *(uint64_t*)(code + (i >> 1)); + uint64_t mask = 0x0f0f0f0f0f0f0f0f; + uint64_t c8ev = c8 & mask; + uint64_t c8od = (c8 >> 4) & mask; + + __m128i c16 = + _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); + __m256i c8lo = _mm256_cvtepu8_epi32(c16); + __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8)); + __m512i i16 = 
_mm512_castsi256_si512(c8lo); + i16 = _mm512_inserti32x8(i16, c8hi, 1); + __m512 f16 = _mm512_cvtepi32_ps(i16); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); + return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255)); + } +}; + +template <> +struct Codec6bit : Codec6bit { + static FAISS_ALWAYS_INLINE simd16float32 + decode_16_components(const uint8_t* code, int i) { + // pure AVX512 implementation (not necessarily the fastest). + // see: + // https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h + + // clang-format off + + // 16 components, 16x6 bit=12 bytes + const __m128i bit_6v = + _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3); + const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v); + + // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + // 00 01 02 03 + const __m256i shuffle_mask = _mm256_setr_epi16( + 0xFF00, 0x0100, 0x0201, 0xFF02, + 0xFF03, 0x0403, 0x0504, 0xFF05, + 0xFF06, 0x0706, 0x0807, 0xFF08, + 0xFF09, 0x0A09, 0x0B0A, 0xFF0B); + const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask); + + // 0: xxxxxxxx xx543210 + // 1: xxxx5432 10xxxxxx + // 2: xxxxxx54 3210xxxx + // 3: xxxxxxxx 543210xx + const __m256i shift_right_v = _mm256_setr_epi16( + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U, + 0x0U, 0x6U, 0x4U, 0x2U); + __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v); + + // remove unneeded bits + shuffled_shifted = + _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F)); + + // scale + const __m512 f8 = + _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted)); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); + return simd16float32(_mm512_fmadd_ps(f8, one_255, half_one_255)); + + // clang-format on + } +}; + +/******************************** Quantizer 
specializations */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m512 xi = Codec::decode_16_components(code, i).f; + return simd16float32(_mm512_fmadd_ps( + xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin))); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT( + d, + trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m512 xi = Codec::decode_16_components(code, i).f; + return simd16float32(_mm512_fmadd_ps( + xi, + _mm512_loadu_ps(this->vdiff + i), + _mm512_loadu_ps(this->vmin + i))); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i codei = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); + return simd16float32(_mm512_cvtph_ps(codei)); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16(d, trained) {} + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); + __m512i code_512i = _mm512_cvtepu16_epi32(code_256i); + code_512i = _mm512_slli_epi32(code_512i, 16); + return simd16float32(_mm512_castsi512_ps(code_512i)); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : Quantizer8bitDirect(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m128i x16 = 
_mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 + __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 + return simd16float32(_mm512_cvtepi32_ps(y16)); // 16 * float32 + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned(d, trained) {} + + FAISS_ALWAYS_INLINE simd16float32 + reconstruct_16_components(const uint8_t* code, int i) const { + __m128i x16 = _mm_loadu_si128((__m128i*)(code + i)); // 16 * int8 + __m512i y16 = _mm512_cvtepu8_epi32(x16); // 16 * int32 + __m512i c16 = _mm512_set1_epi32(128); + __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes + return simd16float32(_mm512_cvtepi32_ps(z16)); // 16 * float32 + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX512F; + static constexpr int simdwidth = 16; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2(const float* y) : y(y) {} + simd16float32 accu16; + + FAISS_ALWAYS_INLINE void begin_16() { + accu16.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_16_components(simd16float32 x) { + __m512 yiv = _mm512_loadu_ps(yi); + yi += 16; + __m512 tmp = _mm512_sub_ps(yiv, x.f); + accu16 = simd16float32(_mm512_fmadd_ps(tmp, tmp, accu16.f)); + } + + FAISS_ALWAYS_INLINE void add_16_components_2( + simd16float32 x, + simd16float32 y_2) { + __m512 tmp = _mm512_sub_ps(y_2.f, x.f); + accu16 = simd16float32(_mm512_fmadd_ps(tmp, tmp, accu16.f)); + } + + FAISS_ALWAYS_INLINE float result_16() { + // performs better than dividing into _mm256 and adding + return _mm512_reduce_add_ps(accu16.f); + } +}; + +template <> +struct SimilarityIP { + static constexpr SIMDLevel SIMD_LEVEL = SIMDLevel::AVX512F; + static constexpr int simdwidth = 16; + static constexpr MetricType metric_type = 
METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP(const float* y) : y(y) {} + + simd16float32 accu16; + + FAISS_ALWAYS_INLINE void begin_16() { + accu16.clear(); + yi = y; + } + + FAISS_ALWAYS_INLINE void add_16_components(simd16float32 x) { + __m512 yiv = _mm512_loadu_ps(yi); + yi += 16; + accu16.f = _mm512_fmadd_ps(yiv, x.f, accu16.f); + } + + FAISS_ALWAYS_INLINE void add_16_components_2( + simd16float32 x1, + simd16float32 x2) { + accu16.f = _mm512_fmadd_ps(x1.f, x2.f, accu16.f); + } + + FAISS_ALWAYS_INLINE float result_16() { + // performs better than dividing into _mm256 and adding + return _mm512_reduce_add_ps(accu16.f); + } +}; + +/****************************************** Specialization of distance computers + */ + +template +struct DCTemplate + : SQDistanceComputer { // Update to handle 16 lanes + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_16(); + for (size_t i = 0; i < quant.d; i += 16) { + simd16float32 xi = quant.reconstruct_16_components(code, i); + sim.add_16_components(xi); + } + return sim.result_16(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_16(); + for (size_t i = 0; i < quant.d; i += 16) { + simd16float32 x1 = quant.reconstruct_16_components(code1, i); + simd16float32 x2 = quant.reconstruct_16_components(code2, i); + sim.add_16_components_2(x1, x2); + } + return sim.result_16(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +template +struct DistanceComputerByte + : SQDistanceComputer { 
+ using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + __m512i accu = _mm512_setzero_si512(); + for (int i = 0; i < d; i += 32) { // Process 32 bytes at a time + __m512i c1 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code1 + i))); + __m512i c2 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code2 + i))); + __m512i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm512_madd_epi16(c1, c2); + } else { + __m512i diff = _mm512_sub_epi16(c1, c2); + prod32 = _mm512_madd_epi16(diff, diff); + } + accu = _mm512_add_epi32(accu, prod32); + } + // Horizontally add elements of accu + return _mm512_reduce_add_epi32(accu); + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +// explicit instantiation + +template ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template SQDistanceComputer* select_distance_computer_1( + MetricType metric_type, + QuantizerType qtype, + size_t d, + const std::vector& trained); + +template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/impl-neon.cpp b/faiss/impl/scalar_quantizer/impl-neon.cpp new file mode 100644 
index 0000000000..7daeeae5b6 --- /dev/null +++ b/faiss/impl/scalar_quantizer/impl-neon.cpp @@ -0,0 +1,375 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#if defined(__aarch64__) +#if defined(__GNUC__) && __GNUC__ < 8 +#warning \ + "Cannot enable NEON optimizations in scalar quantizer if the compiler is GCC<8" +#else +#define USE_NEON +#endif +#endif + +namespace faiss { + +namespace scalar_quantizer { +/******************************** Codec specializations */ + +template <> +struct Codec8bit { + static FAISS_ALWAYS_INLINE decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32(float32x4x2_t{res1, res2}); + } +}; + +template <> +struct Codec4bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32({res1, res2}); + } +}; + +template <> +struct Codec6bit { + static FAISS_ALWAYS_INLINE simd8float32 + decode_8_components(const uint8_t* code, int i) { + float32_t result[8] = {}; + for (size_t j = 0; j < 8; j++) { + result[j] = decode_component(code, i + j); + } + float32x4_t res1 = vld1q_f32(result); + float32x4_t res2 = vld1q_f32(result + 4); + return simd8float32(float32x4x2_t({res1, res2})); + } +}; +/******************************** Quantizatoin specializations */ + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT(d, trained) {} + + 
FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + float32x4x2_t xi = Codec::decode_8_components(code, i); + return simd8float32(float32x4x2_t( + {vfmaq_f32( + vdupq_n_f32(this->vmin), + xi.val[0], + vdupq_n_f32(this->vdiff)), + vfmaq_f32( + vdupq_n_f32(this->vmin), + xi.val[1], + vdupq_n_f32(this->vdiff))})); + } +}; + +template +struct QuantizerT + : QuantizerT { + QuantizerT(size_t d, const std::vector& trained) + : QuantizerT(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + float32x4x2_t xi = Codec::decode_8_components(code, i).data; + + float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); + float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); + + return simd8float32( + {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), + vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}); + } +}; + +template <> +struct QuantizerFP16 : QuantizerFP16 { + QuantizerFP16(size_t d, const std::vector& trained) + : QuantizerFP16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return simd8float32( + {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), + vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}); + } +}; + +template <> +struct QuantizerBF16 : QuantizerBF16 { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return simd8float32( + {vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), + vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}); + } +}; + +template <> +struct Quantizer8bitDirect + : Quantizer8bitDirect { + Quantizer8bitDirect(size_t d, const std::vector& trained) + : 
Quantizer8bitDirect<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + // convert uint16 -> uint32 -> fp32 + return simd8float32( + {vcvtq_f32_u32(vmovl_u16(y8_0)), + vcvtq_f32_u32(vmovl_u16(y8_1))}); + } +}; + +template <> +struct Quantizer8bitDirectSigned + : Quantizer8bitDirectSigned { + Quantizer8bitDirectSigned(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned<1>(d, trained) {} + + FAISS_ALWAYS_INLINE simd8float32 + reconstruct_8_components(const uint8_t* code, int i) const { + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + float32x4_t z8_0 = vcvtq_f32_u32( + vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 + float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); + + // subtract 128 to convert into signed numbers + return simd8float32( + {vsubq_f32(z8_0, vmovq_n_f32(128.0)), + vsubq_f32(z8_1, vmovq_n_f32(128.0))}); + } +}; + +/****************************************** Specialization of similarities */ + +template <> +struct SimilarityL2 { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + explicit SimilarityL2(const float* y) : y(y) {} + simd8float32 accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(simd8float32 x) { + float32x4x2_t yiv = vld1q_f32_x2(yi); + yi += 8; + + float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]); + float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]); + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, 
sub1); + + accu8 = simd8float32({accu8_0, accu8_1}); + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + simd8float32 x, + simd8float32 y) { + float32x4_t sub0 = vsubq_f32(y.val[0], x.val[0]); + float32x4_t sub1 = vsubq_f32(y.val[1], x.val[1]); + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); + + accu8 = simd8float32({accu8_0, accu8_1}); + } + + FAISS_ALWAYS_INLINE float result_8() { + float32x4_t sum_0 = vpaddq_f32(accu8.data.val[0], accu8.data.val[0]); + float32x4_t sum_1 = vpaddq_f32(accu8.data.val[1], accu8.data.val[1]); + + float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0); + float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1); + return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0); + } +}; + +template <> +struct SimilarityIP { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + explicit SimilarityIP(const float* y) : y(y) {} + float32x4x2_t accu8; + + FAISS_ALWAYS_INLINE void begin_8() { + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; + yi = y; + } + + FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) { + float32x4x2_t yiv = vld1q_f32_x2(yi); + yi += 8; + + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); + accu8 = {accu8_0, accu8_1}; + } + + FAISS_ALWAYS_INLINE void add_8_components_2( + float32x4x2_t x1, + float32x4x2_t x2) { + float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); + float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); + accu8 = {accu8_0, accu8_1}; + } + + FAISS_ALWAYS_INLINE float result_8() { + float32x4x2_t sum = { + vpaddq_f32(accu8.val[0], accu8.val[0]), + vpaddq_f32(accu8.val[1], accu8.val[1])}; + + float32x4x2_t sum2 = { + vpaddq_f32(sum.val[0], sum.val[0]), + vpaddq_f32(sum.val[1], sum.val[1])}; + return vgetq_lane_f32(sum2.val[0], 0) + 
vgetq_lane_f32(sum2.val[1], 0); + } +}; + +/****************************************** Specialization of distance computers + */ + +// this is the same code as the AVX2 version... Possible to mutualize? +template +struct DCTemplate : SQDistanceComputer { + using Sim = Similarity; + + Quantizer quant; + + DCTemplate(size_t d, const std::vector& trained) + : quant(d, trained) {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + simd8float32 x1 = quant.reconstruct_8_components(code1, i); + simd8float32 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_distance(q, code); + } +}; + +template +struct DistanceComputerByte + : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte(int d, const std::vector&) : d(d), tmp(d) {} + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query(const float* x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + 
set_query(x); + return compute_code_distance(tmp.data(), code); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return compute_code_distance( + codes + i * code_size, codes + j * code_size); + } + + float query_to_code(const uint8_t* code) const final { + return compute_code_distance(tmp.data(), code); + } +}; + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/quantizers.h b/faiss/impl/scalar_quantizer/quantizers.h new file mode 100644 index 0000000000..a9865722d6 --- /dev/null +++ b/faiss/impl/scalar_quantizer/quantizers.h @@ -0,0 +1,293 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +using QuantizerType = ScalarQuantizer::QuantizerType; + +/******************************************************************* + * Quantizer: normalizes scalar vector components, then passes them + * through a codec + *******************************************************************/ + +enum class QScaling { UNIFORM = 0, NON_UNIFORM = 1 }; + +template +struct QuantizerT {}; + +template +struct QuantizerT + : ScalarQuantizer::SQuantizer { + const size_t d; + const float vmin, vdiff; + + QuantizerT(size_t d, const std::vector& trained) + : d(d), vmin(trained[0]), vdiff(trained[1]) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = 0; + if (vdiff != 0) { + xi = (x[i] - vmin) / vdiff; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin + xi * vdiff; + } 
+ } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + float xi = Codec::decode_component(code, i); + return vmin + xi * vdiff; + } +}; + +template +struct QuantizerT + : ScalarQuantizer::SQuantizer { + const size_t d; + const float *vmin, *vdiff; + + QuantizerT(size_t d, const std::vector& trained) + : d(d), vmin(trained.data()), vdiff(trained.data() + d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = 0; + if (vdiff[i] != 0) { + xi = (x[i] - vmin[i]) / vdiff[i]; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin[i] + xi * vdiff[i]; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + float xi = Codec::decode_component(code, i); + return vmin[i] + xi * vdiff[i]; + } +}; + +/******************************************************************* + * Quantizers that are not based on codecs + *******************************************************************/ + +/******************************************************************* + * FP16 quantizer + *******************************************************************/ + +template +struct QuantizerFP16 {}; + +template <> +struct QuantizerFP16 : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_fp16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_fp16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int 
i) + const { + return decode_fp16(((uint16_t*)code)[i]); + } +}; + +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16 {}; + +template <> +struct QuantizerBF16 : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_bf16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_bf16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return decode_bf16(((uint16_t*)code)[i]); + } +}; + +/******************************************************************* + * 8bit_direct quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirect {}; + +template <> +struct Quantizer8bitDirect : ScalarQuantizer::SQuantizer { + const size_t d; + + Quantizer8bitDirect(size_t d, const std::vector& /* unused */) + : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)x[i]; + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i]; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return code[i]; + } +}; + +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned {}; + +template <> +struct Quantizer8bitDirectSigned + : ScalarQuantizer::SQuantizer { + const size_t d; + + 
Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) + : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)(x[i] + 128); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i] - 128; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return code[i] - 128; + } +}; + +template +ScalarQuantizer::SQuantizer* select_quantizer_1( + QuantizerType qtype, + size_t d, + const std::vector& trained) { + // constexpr SIMDLevel SL = INSTANCIATE_SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + switch (qtype) { + case ScalarQuantizer::QT_8bit: + return new QuantizerT, NU, SL>(d, trained); + + case ScalarQuantizer::QT_6bit: + return new QuantizerT, NU, SL>(d, trained); + case ScalarQuantizer::QT_4bit: + return new QuantizerT, NU, SL>(d, trained); + case ScalarQuantizer::QT_8bit_uniform: + return new QuantizerT, U, SL>(d, trained); + case ScalarQuantizer::QT_4bit_uniform: + return new QuantizerT, U, SL>(d, trained); + case ScalarQuantizer::QT_fp16: + return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_bf16: + return new QuantizerBF16(d, trained); + case ScalarQuantizer::QT_8bit_direct: + return new Quantizer8bitDirect(d, trained); + case ScalarQuantizer::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned(d, trained); + default: + FAISS_THROW_MSG("unknown qtype"); + return nullptr; + } +} + +// prevent implicit instanciation +extern template ScalarQuantizer::SQuantizer* select_quantizer_1< + SIMDLevel::AVX2>( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +extern template ScalarQuantizer::SQuantizer* select_quantizer_1< + SIMDLevel::AVX512F>( + QuantizerType qtype, + size_t d, + const std::vector& trained); + +} // namespace scalar_quantizer + +} // namespace faiss 
diff --git a/faiss/impl/scalar_quantizer/scanners.h b/faiss/impl/scalar_quantizer/scanners.h new file mode 100644 index 0000000000..b0e7f04286 --- /dev/null +++ b/faiss/impl/scalar_quantizer/scanners.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace scalar_quantizer { + +/******************************************************************* + * InvertedListScanner: scans series of codes and keeps the best ones + *******************************************************************/ + +template +struct IVFSQScannerIP : InvertedListScanner { + DCClass dc; + bool by_residual; + + float accu0; /// added to all distances + + IVFSQScannerIP( + int d, + const std::vector& trained, + size_t code_size, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), by_residual(by_residual), accu0(0) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + this->keep_max = true; + } + + void set_query(const float* query) override { + dc.set_query(query); + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + accu0 = by_residual ? coarse_dis : 0; + } + + float distance_to_code(const uint8_t* code) const final { + return accu0 + dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float accu = accu0 + dc.query_to_code(codes); + + if (accu > simi[0]) { + int64_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; + minheap_replace_top(k, simi, idxi, accu, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float accu = accu0 + dc.query_to_code(codes); + if (accu > radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add(accu, id); + } + } + } +}; + +/* use_sel = 0: don't check selector + * = 1: check on ids[j] + * = 2: check in j directly (normally ids is nullptr and store_pairs) + */ +template +struct IVFSQScannerL2 : InvertedListScanner { + DCClass dc; + + bool by_residual; + const Index* quantizer; + const float* x; /// current query + + std::vector tmp; + + IVFSQScannerL2( + int d, + const std::vector& trained, + size_t code_size, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), + by_residual(by_residual), + quantizer(quantizer), + x(nullptr), + tmp(d) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + } + + void set_query(const float* query) override { + x = query; + if (!quantizer) { + dc.set_query(query); + } + } + + void set_list(idx_t list_no, float /*coarse_dis*/) override { + this->list_no = list_no; + if (by_residual) { + // shift of x_in wrt centroid + quantizer->compute_residual(x, tmp.data(), list_no); + dc.set_query(tmp.data()); + } else { + dc.set_query(x); + } + } + + float distance_to_code(const uint8_t* code) const final { + return dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 
? ids[j] : j)) { + continue; + } + + float dis = dc.query_to_code(codes); + + if (dis < simi[0]) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + float dis = dc.query_to_code(codes); + if (dis < radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add(dis, id); + } + } + } +}; + +template +InvertedListScanner* sel3_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + if (DCClass::Sim::metric_type == METRIC_L2) { + return new IVFSQScannerL2( + sq->d, + sq->trained, + sq->code_size, + quantizer, + store_pairs, + sel, + r); + } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { + return new IVFSQScannerIP( + sq->d, sq->trained, sq->code_size, store_pairs, sel, r); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +template +InvertedListScanner* sel2_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + if (sel) { + if (store_pairs) { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } +} + +template +InvertedListScanner* sel12_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + constexpr SIMDLevel SL = Similarity::SIMD_LEVEL; + using QuantizerClass = QuantizerT; + using DCClass = DCTemplate; + return 
sel2_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); +} + +template +InvertedListScanner* sel1_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + constexpr SIMDLevel SL = Sim::SIMD_LEVEL; + constexpr QScaling NU = QScaling::NON_UNIFORM; + constexpr QScaling U = QScaling::UNIFORM; + + switch (sq->qtype) { + case ScalarQuantizer::QT_8bit_uniform: + return sel12_InvertedListScanner, U>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit_uniform: + return sel12_InvertedListScanner, U>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_6bit: + return sel12_InvertedListScanner, NU>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_fp16: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + + case ScalarQuantizer::QT_bf16: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit_direct: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner< + DCTemplate, Sim, SL>>( + sq, quantizer, store_pairs, sel, r); + default: + FAISS_THROW_MSG("unknown qtype"); + return nullptr; + } +} + +template +InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual) { + if (mt == METRIC_L2) { + return sel1_InvertedListScanner>( + sq, quantizer, store_pairs, sel, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner>( + sq, 
quantizer, store_pairs, sel, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +// prevent implicit instantiation of the template when there are +// SIMD optimized versions... +extern template InvertedListScanner* sel0_InvertedListScanner( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +extern template InvertedListScanner* sel0_InvertedListScanner< + SIMDLevel::AVX512F>( + MetricType mt, + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual); + +} // namespace scalar_quantizer +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/training.cpp b/faiss/impl/scalar_quantizer/training.cpp new file mode 100644 index 0000000000..23c51384fd --- /dev/null +++ b/faiss/impl/scalar_quantizer/training.cpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +namespace scalar_quantizer { +/******************************************************************* + * Quantizer range training + */ + +static float sqr(float x) { + return x * x; +} + +void train_Uniform( + RangeStat rs, + float rs_arg, + idx_t n, + int k, + const float* x, + std::vector& trained) { + trained.resize(2); + float& vmin = trained[0]; + float& vmax = trained[1]; + + if (rs == ScalarQuantizer::RS_minmax) { + vmin = HUGE_VAL; + vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) + vmin = x[i]; + if (x[i] > vmax) + vmax = x[i]; + } + float vexp = (vmax - vmin) * rs_arg; + vmin -= vexp; + vmax += vexp; + } else if (rs == ScalarQuantizer::RS_meanstd) { + double sum = 0, sum2 = 0; + for (size_t i = 0; i < n; i++) { + sum += x[i]; + sum2 += x[i] * x[i]; + } + float mean = sum / n; + float var = sum2 / n - mean * mean; + float std = var <= 0 ? 1.0 : sqrt(var); + + vmin = mean - std * rs_arg; + vmax = mean + std * rs_arg; + } else if (rs == ScalarQuantizer::RS_quantiles) { + std::vector x_copy(n); + memcpy(x_copy.data(), x, n * sizeof(*x)); + // TODO just do a quickselect + std::sort(x_copy.begin(), x_copy.end()); + int o = int(rs_arg * n); + if (o < 0) + o = 0; + if (o > n - o) + o = n / 2; + vmin = x_copy[o]; + vmax = x_copy[n - 1 - o]; + + } else if (rs == ScalarQuantizer::RS_optim) { + float a, b; + float sx = 0; + { + vmin = HUGE_VAL, vmax = -HUGE_VAL; + for (size_t i = 0; i < n; i++) { + if (x[i] < vmin) + vmin = x[i]; + if (x[i] > vmax) + vmax = x[i]; + sx += x[i]; + } + b = vmin; + a = (vmax - vmin) / (k - 1); + } + int verbose = false; + int niter = 2000; + float last_err = -1; + int iter_last_err = 0; + for (int it = 0; it < niter; it++) { + float sn = 0, sn2 = 0, sxn = 0, err1 = 0; + + for (idx_t i = 0; i < n; i++) { + float xi = x[i]; + float ni = floor((xi - b) / a + 0.5); + if (ni < 0) + ni = 0; + if (ni >= k) + ni = k - 1; + err1 += sqr(xi - (ni * a + b)); + sn += 
ni; + sn2 += ni * ni; + sxn += ni * xi; + } + + if (err1 == last_err) { + iter_last_err++; + if (iter_last_err == 16) + break; + } else { + last_err = err1; + iter_last_err = 0; + } + + float det = sqr(sn) - sn2 * n; + + b = (sn * sxn - sn2 * sx) / det; + a = (sn * sx - n * sxn) / det; + if (verbose) { + printf("it %d, err1=%g \r", it, err1); + fflush(stdout); + } + } + if (verbose) + printf("\n"); + + vmin = b; + vmax = b + a * (k - 1); + + } else { + FAISS_THROW_MSG("Invalid qtype"); + } + vmax -= vmin; +} + +void train_NonUniform( + RangeStat rs, + float rs_arg, + idx_t n, + int d, + int k, + const float* x, + std::vector& trained) { + trained.resize(2 * d); + float* vmin = trained.data(); + float* vmax = trained.data() + d; + if (rs == ScalarQuantizer::RS_minmax) { + memcpy(vmin, x, sizeof(*x) * d); + memcpy(vmax, x, sizeof(*x) * d); + for (size_t i = 1; i < n; i++) { + const float* xi = x + i * d; + for (size_t j = 0; j < d; j++) { + if (xi[j] < vmin[j]) + vmin[j] = xi[j]; + if (xi[j] > vmax[j]) + vmax[j] = xi[j]; + } + } + float* vdiff = vmax; + for (size_t j = 0; j < d; j++) { + float vexp = (vmax[j] - vmin[j]) * rs_arg; + vmin[j] -= vexp; + vmax[j] += vexp; + vdiff[j] = vmax[j] - vmin[j]; + } + } else { + // transpose + std::vector xt(n * d); + for (size_t i = 1; i < n; i++) { + const float* xi = x + i * d; + for (size_t j = 0; j < d; j++) { + xt[j * n + i] = xi[j]; + } + } + std::vector trained_d(2); +#pragma omp parallel for + for (int j = 0; j < d; j++) { + train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d); + vmin[j] = trained_d[0]; + vmax[j] = trained_d[1]; + } + } +} + +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/impl/scalar_quantizer/training.h b/faiss/impl/scalar_quantizer/training.h new file mode 100644 index 0000000000..6d97507c8f --- /dev/null +++ b/faiss/impl/scalar_quantizer/training.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/******************************************************************* + * Quantizer range training + */ + +#include + +namespace faiss { + +namespace scalar_quantizer { + +using RangeStat = ScalarQuantizer::RangeStat; + +void train_Uniform( + RangeStat rs, + float rs_arg, + idx_t n, + int k, + const float* x, + std::vector& trained); + +void train_NonUniform( + RangeStat rs, + float rs_arg, + idx_t n, + int d, + int k, + const float* x, + std::vector& trained); +} // namespace scalar_quantizer + +} // namespace faiss diff --git a/faiss/test_dispatch/a.out b/faiss/test_dispatch/a.out new file mode 100755 index 0000000000..4648aebd57 Binary files /dev/null and b/faiss/test_dispatch/a.out differ diff --git a/faiss/test_dispatch/f.cpp b/faiss/test_dispatch/f.cpp new file mode 100644 index 0000000000..c0dc736229 --- /dev/null +++ b/faiss/test_dispatch/f.cpp @@ -0,0 +1,20 @@ + + +#include "f.h" + +SIMDLevel simd_level = SIMDLevel::NONE; + +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. + float res = 0; + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + return res; +} + +float fvec_norm_L2sqr(const float* x, size_t d) { + DISPATCH_SIMDLevel(fvec_norm_L2sqr, x, d); +} diff --git a/faiss/test_dispatch/f.h b/faiss/test_dispatch/f.h new file mode 100644 index 0000000000..04351f63cc --- /dev/null +++ b/faiss/test_dispatch/f.h @@ -0,0 +1,77 @@ + +#include +#include + +enum SIMDLevel { + NONE, + AVX2, + AVX512F, + ARM_NEON, + ARM_SVE, +}; + +extern SIMDLevel simd_level; + +#ifdef __ARM_FEATURE_SVE +#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) \ + case SIMDLevel::ARM_SVE: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) 
+#endif + +// #ifdef __AVX2__ +#if 1 +#define DISPATCH_SIMDLevel_AVX2(f, ...) \ + case SIMDLevel::AVX2: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_AVX2(f, ...) +#endif + +#ifdef __AVX512F__ +#define DISPATCH_SIMDLevel_AVX512F(f, ...) \ + case SIMDLevel::AVX512F: \ + return f(__VA_ARGS__) +#else +#define DISPATCH_SIMDLevel_AVX512F(f, ...) +#endif + +#define DISPATCH_SIMDLevel(f, ...) \ + switch (simd_level) { \ + case SIMDLevel::NONE: \ + return f(__VA_ARGS__); \ + DISPATCH_SIMDLevel_ARM_SVE(f, __VA_ARGS__); \ + DISPATCH_SIMDLevel_AVX2(f, __VA_ARGS__); \ + DISPATCH_SIMDLevel_AVX512F(f, __VA_ARGS__); \ + default: \ + assert(!"invlalid SIMD level"); \ + } + +template +float fvec_norm_L2sqr(const float* x, size_t d); + +float fvec_norm_L2sqr(const float* x, size_t d); + +template +struct FF { + static void func(T* x) { + // default implementation + } +}; + +template +struct FF { + static void func(T* x) { + printf("sizeof T = %d\n", sizeof(T)); + // specialized implementation for i = 1 + } +}; + +template +struct FF { + static void func(T* x) { + printf("sizeof T = %d\n", sizeof(T)); + // specialized implementation for i = 1 + } +}; diff --git a/faiss/test_dispatch/f_avx2.cpp b/faiss/test_dispatch/f_avx2.cpp new file mode 100644 index 0000000000..1f94329032 --- /dev/null +++ b/faiss/test_dispatch/f_avx2.cpp @@ -0,0 +1,24 @@ + +#include "f.h" + +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") + +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. 
+ float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/faiss/test_dispatch/f_avx2.o b/faiss/test_dispatch/f_avx2.o new file mode 100644 index 0000000000..a21f7395c5 Binary files /dev/null and b/faiss/test_dispatch/f_avx2.o differ diff --git a/faiss/test_dispatch/f_avx512.cpp b/faiss/test_dispatch/f_avx512.cpp new file mode 100644 index 0000000000..d8a58da3ea --- /dev/null +++ b/faiss/test_dispatch/f_avx512.cpp @@ -0,0 +1,24 @@ + +#include "f.h" + +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") + +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. 
+ float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END diff --git a/faiss/test_dispatch/f_avx512.o b/faiss/test_dispatch/f_avx512.o new file mode 100644 index 0000000000..3f6940ed6b Binary files /dev/null and b/faiss/test_dispatch/f_avx512.o differ diff --git a/faiss/test_dispatch/g.cpp b/faiss/test_dispatch/g.cpp new file mode 100644 index 0000000000..6402c8751d --- /dev/null +++ b/faiss/test_dispatch/g.cpp @@ -0,0 +1,14 @@ +#include + +template +void func(); + +template <> +void func<1>() { + printf("func 1\n"); +} + +template <> +void func<2>() { + printf("func 2\n"); +} diff --git a/faiss/test_dispatch/g.o b/faiss/test_dispatch/g.o new file mode 100644 index 0000000000..93aafd4746 Binary files /dev/null and b/faiss/test_dispatch/g.o differ diff --git a/faiss/test_dispatch/main.cpp b/faiss/test_dispatch/main.cpp new file mode 100644 index 0000000000..dfaf03511f --- /dev/null +++ b/faiss/test_dispatch/main.cpp @@ -0,0 +1,37 @@ + +#include +#include "f.h" + +// fw declaration +template +void func(); + +int main() { + func<1>(); + func<2>(); + + float x[16]; + for (int i = 0; i < 16; i++) { + x[i] = i + 1; + } + float norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); + + simd_level = SIMDLevel::AVX2; + norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); + + /* + simd_level = SIMDLevel::AVX512F; + norm = fvec_norm_L2sqr(x, 16); + printf("norm=%g\n", norm); +*/ + char* env_var = getenv("FAISS_SIMD_LEVEL"); + + printf("env_var=%s\n", env_var); + + int i; + FF::func(&i); + + return 0; +} diff --git a/faiss/utils/distances.h b/faiss/utils/distances.h index 80d2cfc699..238e4bb40e 100644 --- a/faiss/utils/distances.h +++ b/faiss/utils/distances.h @@ -15,6 +15,7 @@ #include #include +#include namespace faiss { @@ -27,9 +28,15 @@ struct IDSelector; /// Squared L2 distance between two vectors float fvec_L2sqr(const float* x, const float* y, size_t 
d); +template +float fvec_L2sqr(const float* x, const float* y, size_t d); + /// inner product float fvec_inner_product(const float* x, const float* y, size_t d); +template +float fvec_inner_product(const float* x, const float* y, size_t d); + /// L1 distance float fvec_L1(const float* x, const float* y, size_t d); @@ -138,6 +145,9 @@ size_t fvec_L2sqr_ny_nearest_y_transposed( /** squared norm of a vector */ float fvec_norm_L2sqr(const float* x, size_t d); +template +float fvec_norm_L2sqr(const float* x, size_t d); + /** compute the L2 norms for a set of vectors * * @param norms output norms, size nx @@ -473,6 +483,10 @@ void compute_PQ_dis_tables_dsub2( */ void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); +/* same statically */ +template +void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); + /** same as fvec_madd, also return index of the min of the result table * @return index of the min of table c */ diff --git a/faiss/utils/distances_simd.cpp b/faiss/utils/distances_simd.cpp index 1990e46aae..e829ae54ec 100644 --- a/faiss/utils/distances_simd.cpp +++ b/faiss/utils/distances_simd.cpp @@ -19,6 +19,9 @@ #include #include +#define AUTOVEC_LEVEL SIMDLevel::NONE +#include + #ifdef __SSE3__ #include #endif @@ -191,43 +194,19 @@ void fvec_inner_products_ny_ref( * Autovectorized implementations */ -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN -float fvec_inner_product(const float* x, const float* y, size_t d) { - float res = 0.F; - FAISS_PRAGMA_IMPRECISE_LOOP - for (size_t i = 0; i != d; ++i) { - res += x[i] * y[i]; - } - return res; -} -FAISS_PRAGMA_IMPRECISE_FUNCTION_END +// dispatching functions -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_norm_L2sqr(const float* x, size_t d) { - // the double in the _ref is suspected to be a typo. Some of the manual - // implementations this replaces used float. 
- float res = 0; - FAISS_PRAGMA_IMPRECISE_LOOP - for (size_t i = 0; i != d; ++i) { - res += x[i] * x[i]; - } - - return res; + DISPATCH_SIMDLevel(fvec_norm_L2sqr, x, d); } -FAISS_PRAGMA_IMPRECISE_FUNCTION_END -FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_L2sqr(const float* x, const float* y, size_t d) { - size_t i; - float res = 0; - FAISS_PRAGMA_IMPRECISE_LOOP - for (i = 0; i < d; i++) { - const float tmp = x[i] - y[i]; - res += tmp * tmp; - } - return res; + DISPATCH_SIMDLevel(fvec_L2sqr, x, y, d); +} + +float fvec_inner_product(const float* x, const float* y, size_t d) { + DISPATCH_SIMDLevel(fvec_inner_product, x, y, d); } -FAISS_PRAGMA_IMPRECISE_FUNCTION_END /// Special version of inner product that computes 4 distances /// between x and yi @@ -3246,7 +3225,8 @@ void fvec_inner_products_ny( * heavily optimized table computations ***************************************************************************/ -[[maybe_unused]] static inline void fvec_madd_ref( +template <> +void fvec_madd( size_t n, const float* a, float bf, @@ -3256,94 +3236,6 @@ void fvec_inner_products_ny( c[i] = a[i] + bf * b[i]; } -#if defined(__AVX512F__) - -static inline void fvec_madd_avx512( - const size_t n, - const float* __restrict a, - const float bf, - const float* __restrict b, - float* __restrict c) { - const size_t n16 = n / 16; - const size_t n_for_masking = n % 16; - - const __m512 bfmm = _mm512_set1_ps(bf); - - size_t idx = 0; - for (idx = 0; idx < n16 * 16; idx += 16) { - const __m512 ax = _mm512_loadu_ps(a + idx); - const __m512 bx = _mm512_loadu_ps(b + idx); - const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax); - _mm512_storeu_ps(c + idx, abmul); - } - - if (n_for_masking > 0) { - const __mmask16 mask = (1 << n_for_masking) - 1; - - const __m512 ax = _mm512_maskz_loadu_ps(mask, a + idx); - const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx); - const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax); - _mm512_mask_storeu_ps(c + idx, mask, abmul); - } -} - -#elif 
defined(__AVX2__) - -static inline void fvec_madd_avx2( - const size_t n, - const float* __restrict a, - const float bf, - const float* __restrict b, - float* __restrict c) { - // - const size_t n8 = n / 8; - const size_t n_for_masking = n % 8; - - const __m256 bfmm = _mm256_set1_ps(bf); - - size_t idx = 0; - for (idx = 0; idx < n8 * 8; idx += 8) { - const __m256 ax = _mm256_loadu_ps(a + idx); - const __m256 bx = _mm256_loadu_ps(b + idx); - const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); - _mm256_storeu_ps(c + idx, abmul); - } - - if (n_for_masking > 0) { - __m256i mask; - switch (n_for_masking) { - case 1: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - break; - case 2: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - break; - case 3: - mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1); - break; - case 4: - mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - break; - case 5: - mask = _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1); - break; - case 6: - mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1); - break; - case 7: - mask = _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1); - break; - } - - const __m256 ax = _mm256_maskload_ps(a + idx, mask); - const __m256 bx = _mm256_maskload_ps(b + idx, mask); - const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); - _mm256_maskstore_ps(c + idx, mask, abmul); - } -} - -#endif - #ifdef __SSE3__ [[maybe_unused]] static inline void fvec_madd_sse( @@ -3367,16 +3259,7 @@ static inline void fvec_madd_avx2( } void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) { -#ifdef __AVX512F__ - fvec_madd_avx512(n, a, bf, b, c); -#elif __AVX2__ - fvec_madd_avx2(n, a, bf, b, c); -#else - if ((n & 3) == 0 && ((((long)a) | ((long)b) | ((long)c)) & 15) == 0) - fvec_madd_sse(n, a, bf, b, c); - else - fvec_madd_ref(n, a, bf, b, c); -#endif + DISPATCH_SIMDLevel(fvec_madd, n, a, bf, b, c); } #elif defined(__ARM_FEATURE_SVE) diff --git a/faiss/utils/simd_impl/distances_autovec-inl.h 
b/faiss/utils/simd_impl/distances_autovec-inl.h new file mode 100644 index 0000000000..ace6dccccb --- /dev/null +++ b/faiss/utils/simd_impl/distances_autovec-inl.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace faiss { + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_L2sqr(const float* x, const float* y, size_t d) { + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template <> +float fvec_inner_product( + const float* x, + const float* y, + size_t d) { + float res = 0.F; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * y[i]; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +} // namespace faiss diff --git a/faiss/utils/simd_impl/distances_avx2.cpp b/faiss/utils/simd_impl/distances_avx2.cpp new file mode 100644 index 0000000000..cd52c470f9 --- /dev/null +++ b/faiss/utils/simd_impl/distances_avx2.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#define AUTOVEC_LEVEL SIMDLevel::AVX2 +#include + +namespace faiss { + +template <> +void fvec_madd( + const size_t n, + const float* __restrict a, + const float bf, + const float* __restrict b, + float* __restrict c) { + // + const size_t n8 = n / 8; + const size_t n_for_masking = n % 8; + + const __m256 bfmm = _mm256_set1_ps(bf); + + size_t idx = 0; + for (idx = 0; idx < n8 * 8; idx += 8) { + const __m256 ax = _mm256_loadu_ps(a + idx); + const __m256 bx = _mm256_loadu_ps(b + idx); + const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); + _mm256_storeu_ps(c + idx, abmul); + } + + if (n_for_masking > 0) { + __m256i mask; + switch (n_for_masking) { + case 1: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + break; + case 2: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + break; + case 3: + mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1); + break; + case 4: + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + break; + case 5: + mask = _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1); + break; + case 6: + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1); + break; + case 7: + mask = _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1); + break; + } + + const __m256 ax = _mm256_maskload_ps(a + idx, mask); + const __m256 bx = _mm256_maskload_ps(b + idx, mask); + const __m256 abmul = _mm256_fmadd_ps(bfmm, bx, ax); + _mm256_maskstore_ps(c + idx, mask, abmul); + } +} + +} // namespace faiss diff --git a/faiss/utils/simd_impl/distances_avx512.cpp b/faiss/utils/simd_impl/distances_avx512.cpp new file mode 100644 index 0000000000..e87df652fd --- /dev/null +++ b/faiss/utils/simd_impl/distances_avx512.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include <faiss/utils/distances.h> // NOTE(review): <...> include targets were stripped in extraction; reconstructed — verify against upstream
+
+#include <immintrin.h> // NOTE(review): reconstructed — verify
+
+#define AUTOVEC_LEVEL SIMDLevel::AVX512F
+#include <faiss/utils/simd_impl/distances_autovec_impl.h> // NOTE(review): reconstructed — verify
+
+namespace faiss {
+
+// AVX512F specialization of fvec_madd: computes c[i] = a[i] + bf * b[i]
+// for i in [0, n). 16 floats per iteration with FMA; the n % 16 tail is
+// handled with AVX-512 write-masked load/store, so no scalar epilogue.
+template <>
+void fvec_madd<SIMDLevel::AVX512F>(
+        const size_t n,
+        const float* __restrict a,
+        const float bf,
+        const float* __restrict b,
+        float* __restrict c) {
+    const size_t n16 = n / 16;
+    const size_t n_for_masking = n % 16;
+
+    const __m512 bfmm = _mm512_set1_ps(bf);
+
+    size_t idx = 0;
+    for (idx = 0; idx < n16 * 16; idx += 16) {
+        const __m512 ax = _mm512_loadu_ps(a + idx);
+        const __m512 bx = _mm512_loadu_ps(b + idx);
+        const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax);
+        _mm512_storeu_ps(c + idx, abmul);
+    }
+
+    if (n_for_masking > 0) {
+        // Low (n % 16) bits set: one mask bit per lane.
+        const __mmask16 mask = (1 << n_for_masking) - 1;
+
+        const __m512 ax = _mm512_maskz_loadu_ps(mask, a + idx);
+        const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx);
+        const __m512 abmul = _mm512_fmadd_ps(bfmm, bx, ax);
+        _mm512_mask_storeu_ps(c + idx, mask, abmul);
+    }
+}
+
+} // namespace faiss
diff --git a/faiss/utils/simdlib_avx2.h b/faiss/utils/simd_impl/simdlib_avx2.h
similarity index 100%
rename from faiss/utils/simdlib_avx2.h
rename to faiss/utils/simd_impl/simdlib_avx2.h
diff --git a/faiss/utils/simdlib_avx512.h b/faiss/utils/simd_impl/simdlib_avx512.h
similarity index 86%
rename from faiss/utils/simdlib_avx512.h
rename to faiss/utils/simd_impl/simdlib_avx512.h
index 63b23f9b19..80889dc508 100644
--- a/faiss/utils/simdlib_avx512.h
+++ b/faiss/utils/simd_impl/simdlib_avx512.h
@@ -14,7 +14,7 @@
 #include <immintrin.h>
 
-#include <faiss/utils/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
 
 namespace faiss {
 
@@ -293,4 +293,47 @@ struct simd64uint8 : simd512bit {
     }
 };
 
+// 16 x float32 vector on top of the generic 512-bit register wrapper.
+// Mirrors the simd8float32 API of the AVX2 simdlib.
+struct simd16float32 : simd512bit {
+    simd16float32() {}
+
+    explicit simd16float32(simd512bit x) : simd512bit(x) {}
+
+    explicit simd16float32(__m512 x) : simd512bit(x) {}
+
+    // broadcast a scalar to all 16 lanes
+    explicit simd16float32(float x) : simd512bit(_mm512_set1_ps(x)) {}
+
+    // unaligned load of 16 floats
+    explicit simd16float32(const float* x)
+            : simd16float32(_mm512_loadu_ps(x)) {}
+
+    simd16float32 operator*(simd16float32 other) const {
+        return simd16float32(_mm512_mul_ps(f, other.f));
+    }
+
+    simd16float32 operator+(simd16float32 other) const {
+        return simd16float32(_mm512_add_ps(f, other.f));
+    }
+
+    simd16float32 operator-(simd16float32 other) const {
+        return simd16float32(_mm512_sub_ps(f, other.f));
+    }
+
+    simd16float32& operator+=(const simd16float32& other) {
+        f = _mm512_add_ps(f, other.f);
+        return *this;
+    }
+
+    // comma-separated "%g" rendering of the 16 lanes, for debugging
+    std::string tostring() const {
+        float tab[16];
+        storeu((void*)tab);
+        char res[1000];
+        char* ptr = res;
+        for (int i = 0; i < 16; i++) {
+            ptr += sprintf(ptr, "%g,", tab[i]);
+        }
+        // strip last ,
+        ptr[-1] = 0;
+        return std::string(res);
+    }
+};
+
 } // namespace faiss
diff --git a/faiss/utils/simdlib_emulated.h b/faiss/utils/simd_impl/simdlib_emulated.h
similarity index 100%
rename from faiss/utils/simdlib_emulated.h
rename to faiss/utils/simd_impl/simdlib_emulated.h
diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simd_impl/simdlib_neon.h
similarity index 100%
rename from faiss/utils/simdlib_neon.h
rename to faiss/utils/simd_impl/simdlib_neon.h
diff --git a/faiss/utils/simdlib_ppc64.h b/faiss/utils/simd_impl/simdlib_ppc64.h
similarity index 100%
rename from faiss/utils/simdlib_ppc64.h
rename to faiss/utils/simd_impl/simdlib_ppc64.h
diff --git a/faiss/utils/simd_levels.cpp b/faiss/utils/simd_levels.cpp
new file mode 100644
index 0000000000..894c9aa271
--- /dev/null
+++ b/faiss/utils/simd_levels.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/utils/simd_levels.h> // NOTE(review): <...> include targets were stripped in extraction; reconstructed — verify against upstream
+
+#include <cstdlib> // NOTE(review): reconstructed (getenv)
+#include <cstring> // NOTE(review): reconstructed (strcmp)
+#include <faiss/impl/FaissAssert.h> // NOTE(review): reconstructed (FAISS_THROW_IF_NOT_FMT)
+
+namespace faiss {
+
+SIMDLevel SIMDConfig::level = SIMDLevel::NONE;
+
+// Names must match the SIMDLevel enumerators, in enum order, so that the
+// FAISS_SIMD_LEVEL environment variable can spell any level.
+// BUGFIX: last entry was "PPC", which did not match enumerator PPC_ALTIVEC.
+const char* SIMDConfig::level_names[] =
+        {"NONE", "AVX2", "AVX512F", "ARM_NEON", "ARM_SVE", "PPC_ALTIVEC"};
+
+// Initializes SIMDConfig::level. An explicit FAISS_SIMD_LEVEL environment
+// variable takes precedence (throws if its value is not a known level name);
+// otherwise, on x86-64, the level is auto-detected via CPUID, restricted to
+// the levels compiled in (COMPILE_SIMD_* macros).
+SIMDConfig::SIMDConfig() {
+    const char* env_var = getenv("FAISS_SIMD_LEVEL");
+    if (env_var) {
+        // BUGFIX: iterate over the number of entries, not sizeof() of the
+        // pointer array (byte size, 8x too large) — the original
+        // "i <= sizeof(level_names)" read far past the end of the array.
+        const size_t n_levels = sizeof(level_names) / sizeof(level_names[0]);
+        size_t i;
+        for (i = 0; i < n_levels; i++) {
+            if (strcmp(env_var, level_names[i]) == 0) {
+                level = (SIMDLevel)i;
+                break;
+            }
+        }
+        FAISS_THROW_IF_NOT_FMT(
+                i != n_levels,
+                "FAISS_SIMD_LEVEL %s unknown",
+                env_var);
+        return;
+    }
+
+#ifdef __x86_64__
+    {
+        // BUGFIX: the feature flags live in CPUID leaf 7, sub-leaf 0
+        // (EBX bit 5 = AVX2, EBX bit 16 = AVX512F); the original executed
+        // leaf 0 (vendor string) and tested ECX, so detection was bogus.
+        unsigned int eax, ebx, ecx, edx;
+        asm volatile("cpuid"
+                     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+                     : "a"(7), "c"(0));
+
+#ifdef COMPILE_SIMD_AVX512F
+        if (ebx & (1 << 16)) {
+            level = SIMDLevel::AVX512F;
+        } else
+#endif
+
+#ifdef COMPILE_SIMD_AVX2
+        if (ebx & (1 << 5)) {
+            level = SIMDLevel::AVX2;
+        } else
+#endif
+            level = SIMDLevel::NONE;
+    }
+#endif
+}
+
+} // namespace faiss
diff --git a/faiss/utils/simd_levels.h b/faiss/utils/simd_levels.h
new file mode 100644
index 0000000000..c7d6056bc8
--- /dev/null
+++ b/faiss/utils/simd_levels.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cassert> // BUGFIX: assert is used by DISPATCH_SIMDLevel below
+
+namespace faiss {
+
+// levels should be of increasing capacity
+enum SIMDLevel {
+    NONE,
+    // x86
+    AVX2,
+    AVX512F,
+    // arm
+    ARM_NEON,
+    ARM_SVE,
+    // ppc
+    PPC_ALTIVEC,
+};
+
+struct SIMDConfig {
+    static SIMDLevel level;
+    // names of the levels above, in enum order (used to parse
+    // FAISS_SIMD_LEVEL)
+    static const char* level_names[];
+    // initializes the simd_level from the cpuid and the FAISS_SIMD_LEVEL
+    // environment variable
+    SIMDConfig();
+};
+
+/*********************** x86 SIMD */
+
+#ifdef COMPILE_SIMD_AVX2
+// NOTE(review): template argument lists were stripped in extraction;
+// reconstructed as f<SIMDLevel::...> per the dispatch-to-specialization
+// design — verify against upstream.
+#define DISPATCH_SIMDLevel_AVX2(f, ...) \
+    case SIMDLevel::AVX2:               \
+        return f<SIMDLevel::AVX2>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_AVX2(f, ...)
+#endif
+
+#ifdef COMPILE_SIMD_AVX512F
+#define DISPATCH_SIMDLevel_AVX512F(f, ...) \
+    case SIMDLevel::AVX512F:               \
+        return f<SIMDLevel::AVX512F>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_AVX512F(f, ...)
+#endif
+
+/*********************** ARM SIMD */
+
+#ifdef COMPILE_SIMD_NEON
+#define DISPATCH_SIMDLevel_ARM_NEON(f, ...) \
+    case SIMDLevel::ARM_NEON:               \
+        return f<SIMDLevel::ARM_NEON>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_ARM_NEON(f, ...)
+#endif
+
+#ifdef COMPILE_SIMD_SVE
+#define DISPATCH_SIMDLevel_ARM_SVE(f, ...) \
+    case SIMDLevel::ARM_SVE:               \
+        return f<SIMDLevel::ARM_SVE>(__VA_ARGS__)
+#else
+#define DISPATCH_SIMDLevel_ARM_SVE(f, ...)
+#endif
+
+/* dispatch function f to f<level> for the currently detected SIMD level.
+   NOTE(review): template argument lists were stripped in extraction;
+   reconstructed as f<SIMDLevel::...> — verify against upstream.
+   NOTE(review): assert requires <cassert>; ensure it is included at the
+   top of this header. */
+
+#define DISPATCH_SIMDLevel(f, ...)                       \
+    switch (SIMDConfig::level) {                         \
+        case SIMDLevel::NONE:                            \
+            return f<SIMDLevel::NONE>(__VA_ARGS__);      \
+            DISPATCH_SIMDLevel_AVX2(f, __VA_ARGS__);     \
+            DISPATCH_SIMDLevel_AVX512F(f, __VA_ARGS__);  \
+            DISPATCH_SIMDLevel_ARM_NEON(f, __VA_ARGS__); \
+            DISPATCH_SIMDLevel_ARM_SVE(f, __VA_ARGS__);  \
+        default:                                         \
+            assert(!"invalid SIMD level");               \
+    }
+
+} // namespace faiss
diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h
index eadfb78ae3..2b8bef4716 100644
--- a/faiss/utils/simdlib.h
+++ b/faiss/utils/simdlib.h
@@ -16,25 +16,25 @@
 
 #if defined(__AVX512F__)
 
-#include <faiss/utils/simdlib_avx2.h>
-#include <faiss/utils/simdlib_avx512.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx512.h>
 
 #elif defined(__AVX2__)
 
-#include <faiss/utils/simdlib_avx2.h>
+#include <faiss/utils/simd_impl/simdlib_avx2.h>
 
 #elif defined(__aarch64__)
 
-#include <faiss/utils/simdlib_neon.h>
+#include <faiss/utils/simd_impl/simdlib_neon.h>
 
 #elif defined(__PPC64__)
 
-#include <faiss/utils/simdlib_ppc64.h>
+#include <faiss/utils/simd_impl/simdlib_ppc64.h>
 
 #else
 
 // emulated = all operations are implemented as scalars
-#include <faiss/utils/simdlib_emulated.h>
+#include <faiss/utils/simd_impl/simdlib_emulated.h>
 // FIXME: make a SSE version
 // is this ever going to happen? We will probably rather implement AVX512