facebookresearch
diff --git a/‎faiss/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎faiss/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎faiss/utils/distances.cpp‎
Lines changed: 58 additions & 4 deletions b/‎faiss/utils/distances.cpp‎
Lines changed: 58 additions & 4 deletions
@@ -86,6 +86,9 @@ set(FAISS_SRC
   utils/quantize_lut.cpp
   utils/random.cpp
   utils/utils.cpp
+  utils/distances_fused/avx512.cpp
+  utils/distances_fused/distances_fused.cpp
+  utils/distances_fused/simdlib_based.cpp
 )
 
 set(FAISS_HEADERS
@@ -187,6 +190,9 @@ set(FAISS_HEADERS
   utils/simdlib_emulated.h
   utils/simdlib_neon.h
   utils/utils.h
+  utils/distances_fused/avx512.h
+  utils/distances_fused/distances_fused.h
+  utils/distances_fused/simdlib_based.h
 )
 
 if(NOT WIN32)
 
@@ -26,6 +26,8 @@
 #include <faiss/impl/IDSelector.h>
 #include <faiss/impl/ResultHandler.h>
 
+#include <faiss/utils/distances_fused/distances_fused.h>
+
 #ifndef FINTEGER
 #define FINTEGER long
 #endif
@@ -229,7 +231,7 @@ void exhaustive_inner_product_blas(
 // distance correction is an operator that can be applied to transform
 // the distances
 template <class ResultHandler>
-void exhaustive_L2sqr_blas(
+void exhaustive_L2sqr_blas_default_impl(
         const float* x,
         const float* y,
         size_t d,
@@ -311,10 +313,20 @@ void exhaustive_L2sqr_blas(
     }
 }
 
+template <class ResultHandler>
+void exhaustive_L2sqr_blas(
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t nx,
+        size_t ny,
+        ResultHandler& res,
+        const float* y_norms = nullptr) {
+    exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res);
+}
+
 #ifdef __AVX2__
-// an override for AVX2 if only a single closest point is needed.
-template <>
-void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
+void exhaustive_L2sqr_blas_cmax_avx2(
         const float* x,
         const float* y,
         size_t d,
@@ -513,11 +525,53 @@ void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
                 res.add_result(i, current_min_distance, current_min_index);
             }
         }
+        // Does nothing for SingleBestResultHandler, but
+        // keeping the call for the consistency.
+        res.end_multiple();
         InterruptCallback::check();
     }
 }
 #endif
 
+// an override if only a single closest point is needed
+template <>
+void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t nx,
+        size_t ny,
+        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        const float* y_norms) {
+#if defined(__AVX2__)
+    // use a faster fused kernel if available
+    if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
+        // the kernel is available and it is complete, we're done.
+        return;
+    }
+
+    // run the specialized AVX2 implementation
+    exhaustive_L2sqr_blas_cmax_avx2(x, y, d, nx, ny, res, y_norms);
+
+#elif defined(__aarch64__)
+    // use a faster fused kernel if available
+    if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) {
+        // the kernel is available and it is complete, we're done.
+        return;
+    }
+
+    // run the default implementation
+    exhaustive_L2sqr_blas_default_impl<
+            SingleBestResultHandler<CMax<float, int64_t>>>(
+            x, y, d, nx, ny, res, y_norms);
+#else
+    // run the default implementation
+    exhaustive_L2sqr_blas_default_impl<
+            SingleBestResultHandler<CMax<float, int64_t>>>(
+            x, y, d, nx, ny, res, y_norms);
+#endif
+}
+
 template <class ResultHandler>
 void knn_L2sqr_select(
         const float* x,