Skip to content

Commit 2202e65

Browse files
committed
fix spelling, kernel param number error and cleanup
Signed-off-by: Pengbo Wang <[email protected]>
1 parent dbbd4c3 commit 2202e65

4 files changed

Lines changed: 11 additions & 15 deletions

File tree

cpp/kernels/xqa/mha.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2794,8 +2794,8 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
27942794
#if SKIP_SOFTMAX_ATTN
27952795
float const skipSoftmaxThresholdScaleFactor, // for compatibility with mha_sm90.cu only
27962796
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
2797-
uint32_t* __restrict__ skipped_block_count, // for compatibility with mha_sm90.cu only
2798-
uint32_t* __restrict__ total_block_count, // for compatibility with mha_sm90.cu only
2797+
uint32_t* __restrict__ skippedBlockCount, // for compatibility with mha_sm90.cu only
2798+
uint32_t* __restrict__ totalBlockCount, // for compatibility with mha_sm90.cu only
27992799
#endif
28002800
#endif
28012801
uint32_t* semaphores, void* scratch, cudaStream_t stream)

cpp/kernels/xqa/mha_sm90.cu

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -868,7 +868,7 @@ CUBIN_EXPORT __global__
868868
#endif
869869

870870
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
871-
uint32_t local_skipped_block_count = 0;
871+
uint32_t localSkippedBlockCount = 0;
872872
#endif
873873

874874
// QK gemm
@@ -1014,7 +1014,7 @@ CUBIN_EXPORT __global__
10141014
{
10151015
smem.skipSoftmaxVotesGemm0ToGemm1[idxXBuf] = 1U;
10161016
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
1017-
local_skipped_block_count++;
1017+
localSkippedBlockCount++;
10181018
#endif
10191019
}
10201020
asm volatile("fence.proxy.async.shared::cta;\n"); // maybe not used
@@ -1081,9 +1081,9 @@ CUBIN_EXPORT __global__
10811081
unused(xBar.produced.arrive());
10821082
}
10831083
#if SKIP_SOFTMAX_ATTN && SKIP_SOFTMAX_ATTN_BLOCK_STATS
1084-
if (threadIdx.x == 0 && skipped_block_count != nullptr && total_block_count != nullptr)
1084+
if (threadIdx.x == 0 && skippedBlockCount != nullptr && totalBlockCount != nullptr)
10851085
{
1086-
atomicAdd(skippedBlockCount, local_skipped_block_count);
1086+
atomicAdd(skippedBlockCount, localSkippedBlockCount);
10871087
atomicAdd(totalBlockCount, nbIters);
10881088
}
10891089
#endif
@@ -1670,7 +1670,6 @@ CUBIN_EXPORT __global__
16701670
{
16711671
return;
16721672
}
1673-
// todo: skip_softmax_attn: fix multiblockmode
16741673
bool& smemIsLastCta = smem.isLastCta;
16751674
if (threadIdx.x == gemm1NbThrds - 1U && threadIdx.z == 0)
16761675
{
@@ -3486,7 +3485,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
34863485
#if SKIP_SOFTMAX_ATTN
34873486
float const skipSoftmaxThresholdScaleFactor,
34883487
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
3489-
uint32_t* __restrict__ skipped_block_count, uint32_t* __restrict__ total_block_count,
3488+
uint32_t* __restrict__ skippedBlockCount, uint32_t* __restrict__ totalBlockCount,
34903489
#endif
34913490
#endif
34923491
uint32_t* semaphores, void* scratch, cudaStream_t stream)
@@ -3515,8 +3514,6 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
35153514
// gridDim.z == nbKHeads * batchSize && gridDim.y == nbSubSeqPerSeq && gridDim.x == nbInputSeqSplit
35163515
dim3 const dimGrid{divUp(qSeqLen, inputTokensPerCta), nbSubSeqPerSeq, nbKHeads * batchSize};
35173516
dim3 const dimCta{warp_size * gmmaWarpsPerGrp, 1, 3};
3518-
// printf("dimGrid: %d, %d, %d\n", dimGrid.x, dimGrid.y, dimGrid.z);
3519-
// printf("dimCta: %d, %d, %d\n", dimCta.x, dimCta.y, dimCta.z);
35203517
auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
35213518
#if USE_PAGED_KV_CACHE
35223519
uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
@@ -3582,7 +3579,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
35823579
#if SKIP_SOFTMAX_ATTN
35833580
skipSoftmaxThresholdScaleFactor,
35843581
#if SKIP_SOFTMAX_ATTN_BLOCK_STATS
3585-
skipped_block_count, total_block_count,
3582+
skippedBlockCount, totalBlockCount,
35863583
#endif
35873584
#endif
35883585
semaphores, scratch);

cpp/kernels/xqa/test/refAttention.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
#include "refAttention.h"
1919
#include <cstdint>
20-
#include <cstdio>
2120

2221
template <typename T>
2322
Vec<float, validElemsPerHead> toF32Head(Vec<T, validElemsPerHead> const& src)
@@ -65,7 +64,6 @@ Eigen::Matrix<float, headGrpSize, validElemsPerHead, Eigen::RowMajor> refFlashAt
6564
uint32_t const idxTileBeg = seqBeg / tileSize;
6665

6766
uint32_t const nbSubSeq = (multiBlockNum > 0 && nbTiles >= 2) ? mha::min(nbTiles, multiBlockNum) : 1;
68-
// uint32_t const nbSubSeq = 1;
6967
std::vector<Eigen::Vector<float, headGrpSize>> skipRowMaxs(nbSubSeq);
7068
for (uint32_t i = 0; i < nbSubSeq; i++)
7169
{

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
379379
.mask = reinterpret_cast<SpecDecParams::MaskType const*>(xqaParams.spec_decoding_packed_mask)};
380380
};
381381

382-
constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 16;
382+
constexpr uint32_t kMAX_NB_KERNEL_PARAMS = 19;
383383
uint32_t idxNextParam = 0;
384384
void* kernelParams[kMAX_NB_KERNEL_PARAMS];
385385
auto appendParam = [&](auto* p) mutable
@@ -517,7 +517,8 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
517517
}
518518
if (isSkipSoftmax)
519519
{
520-
TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel in JIT path for now.");
520+
TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel for now.");
521+
TLLM_CHECK_WITH_INFO(!isSpecDec, "skip softmax is not supported with spec dec for now.");
521522
appendParam(&xqaParams.skip_softmax_threshold_scale_factor);
522523
#ifdef SKIP_SOFTMAX_STAT
523524
appendParam(&xqaParams.skip_softmax_total_blocks);

0 commit comments

Comments (0)