Fix NVRTC compilation and code style

pengbowang-nv · pengbowang-nv · commit 5d8eaed1d4fe · 2025-12-24T07:27:10.000Z
Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
diff --git a/cpp/kernels/xqa/defines.h b/cpp/kernels/xqa/defines.h
@@ -137,8 +137,8 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena
 #define SKIP_SOFTMAX_ATTN_BLOCK_STATS 0
 #endif
 
-#ifndef SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GRETAER_THAN_ONE
-#define SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GRETAER_THAN_ONE 1
+#ifndef SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE
+#define SKIP_SOFTMAX_ATTN_FIX_THRESHOLD_GREATER_THAN_ONE 1
 #endif
 
 // 0 - no PDL
diff --git a/cpp/kernels/xqa/mha.h b/cpp/kernels/xqa/mha.h
@@ -134,7 +134,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
 #if SKIP_SOFTMAX_ATTN
     float const skipSoftmaxThresholdScaleFactor,
 #if SKIP_SOFTMAX_ATTN_BLOCK_STATS
-    uint32_t* __restrict__ skipped_block_count, uint32_t* __restrict__ total_block_count,
+    uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
 #endif
 #endif
     uint32_t* semaphores, void* scratch, cudaStream_t stream);
@@ -183,7 +183,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
 #if SKIP_SOFTMAX_ATTN
     float const skipSoftmaxThresholdScaleFactor,
 #if SKIP_SOFTMAX_ATTN_BLOCK_STATS
-    uint32_t* __restrict__ skipped_block_count, uint32_t* __restrict__ total_block_count,
+    uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
 #endif
 #endif
     uint32_t* semaphores, void* scratch, cudaStream_t stream);
diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu
@@ -705,7 +705,7 @@ CUBIN_EXPORT __global__
 #if SKIP_SOFTMAX_ATTN
             float const skipSoftmaxThresholdScaleFactor,
 #if SKIP_SOFTMAX_ATTN_BLOCK_STATS
-            uint32_t* __restrict__ skipped_block_count, uint32_t* __restrict__ total_block_count,
+            uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
 #endif
 #endif
             uint32_t* __restrict__ const semaphores
@@ -1083,8 +1083,8 @@ CUBIN_EXPORT __global__
 #if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
-        if (threadIdx.x == 0 && skipped_block_count != nullptr && total_block_count != nullptr)
+        if (threadIdx.x == 0 && skippedBlockCount != nullptr && totalBlockCount != nullptr)
         {
-            atomicAdd(skipped_block_count, local_skipped_block_count);
-            atomicAdd(total_block_count, nbIters);
+            atomicAdd(skippedBlockCount, local_skipped_block_count);
+            atomicAdd(totalBlockCount, nbIters);
         }
 #endif
         unused(smem.qBar.consumed.arrive());
@@ -2395,7 +2395,7 @@ __device__ inline void storeGemm0AccToShm(
     uint32_t const idxOctInsideHalf = idxInHalf / 8;
     uint32_t const idxRowInsideOct = lane % 8;
     uint32_t const warpBaseC = 16 * warpRank;
-    auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> std::pair<uint32_t, uint32_t>
+    auto const toAccCoords = [](uint32_t const idxAccCoreMat) -> mha::pair<uint32_t, uint32_t>
     {
         uint32_t const accR = idxAccCoreMat / Gemm0Acc::cols;
         uint32_t const accC = idxAccCoreMat % Gemm0Acc::cols;
diff --git a/cpp/kernels/xqa/mha_stdheaders.cuh b/cpp/kernels/xqa/mha_stdheaders.cuh
@@ -1272,6 +1272,19 @@ using is_void = is_same<remove_cv_t<T>, void>;
 template <typename T>
 inline constexpr bool is_void_v = is_void<T>::value;
 #endif
+
+#ifndef GENERATE_CUBIN
+template <typename T1, typename T2>
+using pair = std::pair<T1, T2>;
+#else
+template <typename T1, typename T2>
+struct pair
+{
+    T1 first;
+    T2 second;
+};
+#endif
+
 } // namespace mha
 
 #if GENERATE_CUBIN
diff --git a/cpp/kernels/xqa/test/refAttention.cpp b/cpp/kernels/xqa/test/refAttention.cpp
@@ -51,9 +51,8 @@ using Vector = Matrix<Type, Size, 1>;
 template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
 Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
     CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
-    float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks,
-    float skip_softmax_threshold_scale_factor, uint32_t* skipped_block_count, uint32_t* total_block_count,
-    uint32_t multi_block_num)
+    float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
+    uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)
 {
     uint32_t const nbTiles = divUp(seqLen, tileSize);
     auto gemm1Acc = Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor>::Zero().eval();
@@ -65,14 +64,14 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
     uint32_t const seqBeg = (seqLen < slidingWinSize ? 0 : seqLen - slidingWinSize);
     uint32_t const idxTileBeg = seqBeg / tileSize;
 
-    uint32_t const nbSubSeq = (multi_block_num > 0 && nbTiles >= 2) ? mha::min(nbTiles, multi_block_num) : 1;
+    uint32_t const nbSubSeq = (multiBlockNum > 0 && nbTiles >= 2) ? mha::min(nbTiles, multiBlockNum) : 1;
     // uint32_t const nbSubSeq = 1;
     std::vector<Eigen::Vector<float, headGrpSize>> skipRowMaxs(nbSubSeq);
     for (uint32_t i = 0; i < nbSubSeq; i++)
     {
         skipRowMaxs[i].fill(-INFINITY);
     }
-    float skip_softmax_threshold = skip_softmax_threshold_scale_factor / seqLen;
+    float skipSoftmaxThreshold = skipSoftmaxThresholdScaleFactor / seqLen;
 
     for (uint32_t idxTile = idxTileBeg; idxTile < nbTiles; idxTile++)
     {
@@ -106,17 +105,14 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
         auto const prevSkipRowMax = skipRowMaxs[idxTile % nbSubSeq];
         skipRowMaxs[idxTile % nbSubSeq] = localRowMax.cwiseMax(skipRowMaxs[idxTile % nbSubSeq]).eval();
 
-        // printf("\n===================\n");
-
-        // add skip softmax threshold here
-        if (skip_softmax_threshold > 0)
+        if (skipSoftmaxThreshold > 0)
         {
-            *total_block_count += 1;
-            auto const skip_softmax_mask = ((localRowMax - prevSkipRowMax).array() < std::log(skip_softmax_threshold));
-            bool const skip_block = skip_softmax_mask.all() && ((idxTile - idxTileBeg) >= nbSubSeq);
-            if (skip_block)
+            *totalBlockCount += 1;
+            auto const skipSoftmaxMask = ((localRowMax - prevSkipRowMax).array() < std::log(skipSoftmaxThreshold));
+            bool const skipBlock = skipSoftmaxMask.all() && ((idxTile - idxTileBeg) >= nbSubSeq);
+            if (skipBlock)
             {
-                *skipped_block_count += 1;
+                *skippedBlockCount += 1;
                 continue;
             }
         }
@@ -170,8 +166,7 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
     refFlashAttention<prec, tileSize, isPaged, useBeamSearch>(IOHead const* q,                                         \
         CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen,         \
         float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks,                     \
-        float skip_softmax_threshold, uint32_t* skipped_block_count, uint32_t* total_block_count,                      \
-        uint32_t multi_block_num)
+        float skipSoftmaxThreshold, uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum)
 
 INSTANTIATE_refFlashAttention(CacheElem, 64, false, false);
 INSTANTIATE_refFlashAttention(CacheElem, 64, false, true);
diff --git a/cpp/kernels/xqa/test/refAttention.h b/cpp/kernels/xqa/test/refAttention.h
@@ -88,8 +88,8 @@ struct CacheSeq<true, true>
 template <typename MathElem, uint32_t tileSize, bool isPaged, bool useBeamSearch>
 Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAttention(IOHead const* q,
     CacheSeq<isPaged, useBeamSearch> const& k, CacheSeq<isPaged, useBeamSearch> const& v, uint32_t seqLen, float qScale,
-    float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skip_softmax_threshold,
-    uint32_t* skipped_block_count, uint32_t* total_block_count, uint32_t multi_block_num);
+    float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks, float skipSoftmaxThresholdScaleFactor,
+    uint32_t* skippedBlockCount, uint32_t* totalBlockCount, uint32_t multiBlockNum);
 
 template <typename MathElem, bool isPaged, bool useBeamSearch>
 #if SPEC_DEC
diff --git a/cpp/kernels/xqa/test/test.cpp b/cpp/kernels/xqa/test/test.cpp
@@ -225,10 +225,9 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
         seqLen = (16U << 20) / gmemCacheHeadBytes; // 32MB per K+V head.
     }
     ctxLen = std::min(ctxLen, seqLen);
-    float skip_softmax_threshold_scale_factor = skipSoftmaxThresholdScaleFactor;
-    uint32_t skipped_block_count = 0;
-    uint32_t total_block_count = 0;
-    if (skip_softmax_threshold_scale_factor > 0)
+    uint32_t skippedBlockCount = 0;
+    uint32_t totalBlockCount = 0;
+    if (skipSoftmaxThresholdScaleFactor > 0)
     {
         assert(useQGMMA);
     }
@@ -339,10 +338,10 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
     auto const ctxLenList = ManagedMemBuf<uint32_t[beamWidth]>(batchSize);
 #if SKIP_SOFTMAX_ATTN
-#ifdef SKIP_SOFTMAX_ATTN_BLOCK_STATS
-    auto const kernel_skipped_block_count = ManagedMemBuf<uint32_t>(1);
-    auto const kernel_total_block_count = ManagedMemBuf<uint32_t>(1);
-    kernel_skipped_block_count[0] = 0;
-    kernel_total_block_count[0] = 0;
+#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
+    auto const kernelSkippedBlockCount = ManagedMemBuf<uint32_t>(1);
+    auto const kernelTotalBlockCount = ManagedMemBuf<uint32_t>(1);
+    kernelSkippedBlockCount[0] = 0;
+    kernelTotalBlockCount[0] = 0;
 #endif
 #else
     EXPECT_EQ(skipSoftmaxThresholdScaleFactor, 0.0f)
@@ -804,7 +803,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
 #if SKIP_SOFTMAX_ATTN
             skipSoftmaxThresholdScaleFactor,
 #if SKIP_SOFTMAX_ATTN_BLOCK_STATS
-            kernel_skipped_block_count.get(), kernel_total_block_count.get(),
+            kernelSkippedBlockCount.get(), kernelTotalBlockCount.get(),
 #endif
 #endif
             semaphores.get(), scratch, stream);
@@ -844,8 +843,8 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
     prefetchToDevice(cudaCpuDeviceId);
     checkCuda(cudaStreamSynchronize(stream));
 #if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
-    kernel_skipped_block_count[0] /= nbIters;
-    kernel_total_block_count[0] /= nbIters;
+    kernelSkippedBlockCount[0] /= nbIters;
+    kernelTotalBlockCount[0] /= nbIters;
 #endif
     if (testPerf)
     {
@@ -885,7 +884,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
         float const dramSolRatio = dramSolTime / ms;
 #if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
         size_t const totalNbCacheLoadWithSkip = gmemCacheHeadBytes
-            * (nbKHeads + nbVHeads * (1 - 1.0f * kernel_skipped_block_count[0] / kernel_total_block_count[0]))
+            * (nbKHeads + nbVHeads * (1 - 1.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]))
             * nbLoadedCacheTokens;
         float const totalTrafficWithSkip
             = totalNbCacheLoadWithSkip + inputBytes + outputBytes; // we ignore page indices and beam search indices.
@@ -907,13 +906,9 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
         float const tops = headGrpSize * qSeqLen * float(seqLen) * (validElemsPerKHead + validElemsPerVHead) * 2
             * nbKHeads * batchSize / (ms * 1E-3F) * 1E-12F;
 #if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
-        float const topsWithSkip = headGrpSize * qSeqLen * float(seqLen) * (validElemsPerKHead + validElemsPerVHead) * 2
-            * nbKHeads * batchSize / (ms * 1E-3F) * 1E-12F;
-        printf("kernel skipped_block_count: %d/%d (%.2f%%)\n", kernel_skipped_block_count[0],
-            kernel_total_block_count[0],
-            kernel_total_block_count[0] == 0 ? 0.0f
-                                             : 100.0f * kernel_skipped_block_count[0] / kernel_total_block_count[0]);
-        printf("dramSolRatioWithSkip: %f%% (%f ms, TOPS = %f)\n", dramSolRatioWithSkip * 100, ms, topsWithSkip);
+        printf("kernel skippedBlockCount: %u/%u (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
+            kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
+        printf("dramSolRatioWithSkip: %f%% (%f ms, TOPS = %f)\n", dramSolRatioWithSkip * 100, ms, tops);
 #else
         printf("dramSolRatio: %f%% (%f ms, TOPS = %f)\n", dramSolRatio * 100, ms, tops);
 #endif
@@ -1138,8 +1133,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
                     {
                         refOutput = refFlashAttention<CacheElem, 64>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
                             vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks,
-                            skip_softmax_threshold_scale_factor, &skipped_block_count, &total_block_count,
-                            multiBlockNum);
+                            skipSoftmaxThresholdScaleFactor, &skippedBlockCount, &totalBlockCount, multiBlockNum);
                         // refOutput = refAttention<CacheElem>(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq,
                         // vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize);
                     }
@@ -1187,13 +1181,11 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck,
             }
         }
 #if SKIP_SOFTMAX_ATTN
-        printf("host skipped_block_count: %d/%d (%.2f%%)\n", skipped_block_count, total_block_count,
-            total_block_count == 0 ? 0.0f : 100.0f * skipped_block_count / total_block_count);
+        printf("host skippedBlockCount: %u/%u (%.2f%%)\n", skippedBlockCount, totalBlockCount,
+            totalBlockCount == 0 ? 0.0f : 100.0f * skippedBlockCount / totalBlockCount);
 #if SKIP_SOFTMAX_ATTN_BLOCK_STATS
-        printf("kernel skipped_block_count: %d/%d (%.2f%%)\n", kernel_skipped_block_count[0],
-            kernel_total_block_count[0],
-            kernel_total_block_count[0] == 0 ? 0.0f
-                                             : 100.0f * kernel_skipped_block_count[0] / kernel_total_block_count[0]);
+        printf("kernel skippedBlockCount: %u/%u (%.2f%%)\n", kernelSkippedBlockCount[0], kernelTotalBlockCount[0],
+            kernelTotalBlockCount[0] == 0 ? 0.0f : 100.0f * kernelSkippedBlockCount[0] / kernelTotalBlockCount[0]);
 #endif
 #endif
         if (saveData)