From 5dab1ba796c2939a5235be90f11339511c7e28b7 Mon Sep 17 00:00:00 2001 From: Mengdi Lin Date: Tue, 18 Mar 2025 12:32:19 -0700 Subject: [PATCH] fix integer overflow issue when calculating imbalance_factor (#4245) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4245 When the number of clustering embeddings > int32 max, calculating the imbalance factor from the Python side causes a function overload not found error. ``` [0]:[rank0]: return faiss.imbalance_factor(len(assign), k, faiss.swig_ptr(assign)) [0]:[rank0]: NotImplementedError: Wrong number or type of arguments for overloaded function 'imbalance_factor'. [0]:[rank0]: Possible C/C++ prototypes are: [0]:[rank0]: faiss::imbalance_factor(int,int,int64_t const *) [0]:[rank0]: faiss::imbalance_factor(int,int const *) ``` Fixing it by changing the function signature in C++ land to support int64_t. Reviewed By: bshethmeta Differential Revision: D71130612 --- faiss/Clustering.cpp | 16 ---------------- faiss/invlists/InvertedLists.cpp | 2 +- faiss/utils/utils.cpp | 8 ++++---- faiss/utils/utils.h | 4 ++-- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp index e557deaa51..33c939f088 100644 --- a/faiss/Clustering.cpp +++ b/faiss/Clustering.cpp @@ -33,22 +33,6 @@ Clustering::Clustering(int d, int k) : d(d), k(k) {} Clustering::Clustering(int d, int k, const ClusteringParameters& cp) : ClusteringParameters(cp), d(d), k(k) {} -static double imbalance_factor(int n, int k, int64_t* assign) { - std::vector<int> hist(k, 0); - for (int i = 0; i < n; i++) - hist[assign[i]]++; - - double tot = 0, uf = 0; - - for (int i = 0; i < k; i++) { - tot += hist[i]; - uf += hist[i] * (double)hist[i]; - } - uf = uf * k / (tot * tot); - - return uf; -} - void Clustering::post_process_centroids() { if (spherical) { fvec_renorm_L2(d, k, centroids.data()); diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index f02b2d250a..20542fcf9a 100644 --- 
a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -181,7 +181,7 @@ size_t InvertedLists::copy_subset_to( } double InvertedLists::imbalance_factor() const { - std::vector<int> hist(nlist); + std::vector<int64_t> hist(nlist); for (size_t i = 0; i < nlist; i++) { hist[i] = list_size(i); diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp index 85bf1348f2..0811cb9030 100644 --- a/faiss/utils/utils.cpp +++ b/faiss/utils/utils.cpp @@ -387,7 +387,7 @@ size_t ranklist_intersection_size( return count; } -double imbalance_factor(int k, const int* hist) { +double imbalance_factor(int k, const int64_t* hist) { double tot = 0, uf = 0; for (int i = 0; i < k; i++) { @@ -399,9 +399,9 @@ double imbalance_factor(int k, const int* hist) { return uf; } -double imbalance_factor(int n, int k, const int64_t* assign) { - std::vector<int> hist(k, 0); - for (int i = 0; i < n; i++) { +double imbalance_factor(int64_t n, int k, const int64_t* assign) { + std::vector<int64_t> hist(k, 0); + for (int64_t i = 0; i < n; i++) { hist[assign[i]]++; } diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h index 901459d1c7..7d75b3200d 100644 --- a/faiss/utils/utils.h +++ b/faiss/utils/utils.h @@ -92,10 +92,10 @@ size_t merge_result_table_with( /// a balanced assignment has a IF of 1, a completely unbalanced assignment has /// an IF = k. -double imbalance_factor(int n, int k, const int64_t* assign); +double imbalance_factor(int64_t n, int k, const int64_t* assign); /// same, takes a histogram as input -double imbalance_factor(int k, const int* hist); +double imbalance_factor(int k, const int64_t* hist); /// compute histogram on v int ivec_hist(size_t n, const int* v, int vmax, int* hist);