Fix CUDA kernel index data type in faiss/gpu/impl/DistanceUtils.cuh +10 (#4246)

r-barnes · facebook-github-bot · commit 0f2035cc8369 · 2025-03-19T13:19:34.000-07:00
Summary: Pull Request resolved: #4246 CUDA kernel variables matching the pattern `(thread|block|grid).(Idx|Dim).(x|y|z)` [have the data type `uint`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#built-in-variables). Many programmers mistakenly use implicit casts to turn these data types into `int`. In fact, the [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) itself is inconsistent and incorrect in its use of data types in programming examples. The result of these implicit casts is that our kernels may give unexpected results when exposed to large datasets, i.e., those exceeding ~2B items. While we now have linters in place to prevent simple mistakes (D71236150), our codebase has many problematic instances. This diff fixes some of them. Reviewed By: dtolnay Differential Revision: D71355340 fbshipit-source-id: 77dac270e1d3415bfe7d5cc214006d5176508474
diff --git a/faiss/gpu/impl/DistanceUtils.cuh b/faiss/gpu/impl/DistanceUtils.cuh
@@ -303,7 +303,7 @@ __global__ void incrementIndex(
         int k,
         idx_t increment) {
     for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) {
-        for (int j = threadIdx.x; j < k; j += blockDim.x) {
+        for (auto j = threadIdx.x; j < k; j += blockDim.x) {
             indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment;
         }
     }
diff --git a/faiss/gpu/impl/GpuScalarQuantizer.cuh b/faiss/gpu/impl/GpuScalarQuantizer.cuh
@@ -377,7 +377,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_8bit, DimMultiple> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
@@ -587,7 +587,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_6bit, 1> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
@@ -753,7 +753,7 @@ struct Codec<ScalarQuantizer::QuantizerType::QT_4bit, 1> {
         smemVmin = smem;
         smemVdiff = smem + dim;
 
-        for (int i = threadIdx.x; i < dim; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < dim; i += blockDim.x) {
             // We are performing vmin + vdiff * (v + 0.5) / (2^bits - 1)
             // This can be simplified to vmin' + vdiff' * v where:
             // vdiff' = vdiff / (2^bits - 1)
diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu
@@ -368,9 +368,9 @@ __global__ void ivfInterleavedAppend(
         // The set of addresses for each of the lists
         void** listData) {
     // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
-    int laneId = threadIdx.x % kWarpSize;
-    int warpId = threadIdx.x / kWarpSize;
-    int warpsPerBlock = blockDim.x / kWarpSize;
+    auto laneId = threadIdx.x % kWarpSize;
+    auto warpId = threadIdx.x / kWarpSize;
+    auto warpsPerBlock = blockDim.x / kWarpSize;
 
     // Each block is dedicated to a separate list
     idx_t listId = uniqueLists[blockIdx.x];
diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu
@@ -65,9 +65,9 @@ struct IVFFlatScan {
         int limit = utils::divDown(dim, Codec::kDimPerIter);
 
         // Each warp handles a separate chunk of vectors
-        int warpId = threadIdx.x / kWarpSize;
+        auto warpId = threadIdx.x / kWarpSize;
         // FIXME: why does getLaneId() not work when we write out below!?!?!
-        int laneId = threadIdx.x % kWarpSize; // getLaneId();
+        auto laneId = threadIdx.x % kWarpSize; // getLaneId();
 
         // Divide the set of vectors among the warps
         idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps);
diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu
@@ -27,7 +27,7 @@ __global__ void ivfInterleavedScan2(
         Tensor<float, 2, true> distanceOut,
         Tensor<idx_t, 2, true> indicesOut) {
     if constexpr ((NumWarpQ == 1 && NumThreadQ == 1) || NumWarpQ >= kWarpSize) {
-        int queryId = blockIdx.x;
+        auto queryId = blockIdx.x;
 
         constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
 
@@ -99,7 +99,7 @@ __global__ void ivfInterleavedScan2(
         // Merge all final results
         heap.reduce();
 
-        for (int i = threadIdx.x; i < k; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < k; i += blockDim.x) {
             // Re-adjust the value we are selecting based on the sorting order
             distanceOut[queryId][i] = smemK[i] * adj;
             auto packedIndex = smemV[i];
diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh
@@ -56,7 +56,7 @@ __global__ void ivfInterleavedScan(
 
         for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0);
              queryId += gridDim.y) {
-            int probeId = blockIdx.x;
+            auto probeId = blockIdx.x;
             idx_t listId = listIds[queryId][probeId];
 
             // Safety guard in case NaNs in input cause no list ID to be
@@ -69,8 +69,8 @@ __global__ void ivfInterleavedScan(
             int dim = queries.getSize(1);
 
             // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs?
-            int laneId = threadIdx.x % kWarpSize;
-            int warpId = threadIdx.x / kWarpSize;
+            auto laneId = threadIdx.x % kWarpSize;
+            auto warpId = threadIdx.x / kWarpSize;
 
             using EncodeT = typename Codec::EncodeT;
 
@@ -215,7 +215,7 @@ __global__ void ivfInterleavedScan(
             auto distanceOutBase = distanceOut[queryId][probeId].data();
             auto indicesOutBase = indicesOut[queryId][probeId].data();
 
-            for (int i = threadIdx.x; i < k; i += blockDim.x) {
+            for (auto i = threadIdx.x; i < k; i += blockDim.x) {
                 distanceOutBase[i] = smemK[i];
                 indicesOutBase[i] = smemV[i];
             }
diff --git a/faiss/gpu/impl/IVFUtilsSelect1.cu b/faiss/gpu/impl/IVFUtilsSelect1.cu
@@ -90,7 +90,7 @@ __global__ void pass1SelectLists(
 
             // Write out the final k-selected values; they should be all
             // together
-            for (int i = threadIdx.x; i < k; i += blockDim.x) {
+            for (auto i = threadIdx.x; i < k; i += blockDim.x) {
                 heapDistances[queryId][sliceId][i] = smemK[i];
                 heapIndices[queryId][sliceId][i] = idx_t(smemV[i]);
             }
diff --git a/faiss/gpu/impl/IVFUtilsSelect2.cu b/faiss/gpu/impl/IVFUtilsSelect2.cu
@@ -100,7 +100,7 @@ __global__ void pass2SelectLists(
         // Merge all final results
         heap.reduce();
 
-        for (int i = threadIdx.x; i < k; i += blockDim.x) {
+        for (auto i = threadIdx.x; i < k; i += blockDim.x) {
             outDistances[queryId][i] = smemK[i];
 
             // `v` is the index in `heapIndices`
diff --git a/faiss/gpu/impl/IcmEncoder.cu b/faiss/gpu/impl/IcmEncoder.cu
@@ -46,8 +46,8 @@ __global__ void runIcmEncodeStep(
         int m) {
     using KVPair = Pair<float, int>;
 
-    int id = blockIdx.x;    // each block takes care of one vector
-    int code = threadIdx.x; // each thread takes care of one possible code
+    auto id = blockIdx.x;    // each block takes care of one vector
+    auto code = threadIdx.x; // each thread takes care of one possible code
 
     // compute the objective value by look-up tables
     KVPair obj(0.0f, code);
@@ -94,8 +94,8 @@ __global__ void runEvaluation(
         int M,
         int K,
         int dims) {
-    int id = blockIdx.x; // each block takes care of one vector
-    int d = threadIdx.x; // each thread takes care of one dimension
+    auto id = blockIdx.x; // each block takes care of one vector
+    auto d = threadIdx.x; // each thread takes care of one dimension
     float acc = 0.0f;
 
 #pragma unroll
@@ -136,7 +136,7 @@ __global__ void runCodesPerturbation(
         int K,
         int nperts) {
     // each thread takes care of one vector
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    auto id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (id >= n) {
         return;
@@ -173,7 +173,7 @@ __global__ void runCodesSelection(
         int n,
         int M) {
     // each thread takes care of one vector
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    auto id = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (id >= n || objs[id] >= bestObjs[id]) {
         return;
@@ -195,8 +195,8 @@ __global__ void runCodesSelection(
  * @param K     number of codewords in a codebook
  */
 __global__ void runNormAddition(float* uterm, const float* norm, int K) {
-    int id = blockIdx.x;
-    int code = threadIdx.x;
+    auto id = blockIdx.x;
+    auto code = threadIdx.x;
 
     uterm[id * K + code] += norm[code];
 }
diff --git a/faiss/gpu/impl/L2Norm.cu b/faiss/gpu/impl/L2Norm.cu
@@ -40,7 +40,7 @@ __global__ void l2NormRowMajor(
     // these are fine to be int (just based on block dimensions)
     int numWarps = utils::divUp(blockDim.x, kWarpSize);
     int laneId = getLaneId();
-    int warpId = threadIdx.x / kWarpSize;
+    auto warpId = threadIdx.x / kWarpSize;
 
     bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
     idx_t rowStart = idx_t(blockIdx.x) * RowTileSize;

Original file line number	Diff line number	Diff line change
`@@ -303,7 +303,7 @@ __global__ void incrementIndex(`
`303`	`303`	`int k,`
`304`	`304`	`idx_t increment) {`
`305`	`305`	`for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) {`
`306`		`- for (int j = threadIdx.x; j < k; j += blockDim.x) {`
	`306`	`+ for (auto j = threadIdx.x; j < k; j += blockDim.x) {`
`307`	`307`	`indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment;`
`308`	`308`	`}`
`309`	`309`	`}`
Original file line number	Diff line number	Diff line change
`@@ -90,7 +90,7 @@ __global__ void pass1SelectLists(`
`90`	`90`
`91`	`91`	`// Write out the final k-selected values; they should be all`
`92`	`92`	`// together`
`93`		`- for (int i = threadIdx.x; i < k; i += blockDim.x) {`
	`93`	`+ for (auto i = threadIdx.x; i < k; i += blockDim.x) {`
`94`	`94`	`heapDistances[queryId][sliceId][i] = smemK[i];`
`95`	`95`	`heapIndices[queryId][sliceId][i] = idx_t(smemV[i]);`
`96`	`96`	`}`