add support for hopper xqa skip softmax kernel

pengbowang-nv · pengbowang-nv · commit dbbd4c3b5e77 · 2025-12-24T07:27:11.000Z
Signed-off-by: Pengbo Wang <221450789+pengbowang-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp
@@ -298,6 +298,11 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     xqaParams.use_sparse_attention = useTllmGenSparseAttention();
     // Skip softmax threshold.
     xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode;
+#ifdef SKIP_SOFTMAX_STAT
+    // Statistics of skip-softmax, pointers of device memory for output
+    xqaParams.skip_softmax_total_blocks = mSkipSoftmaxTotalBlocks;
+    xqaParams.skip_softmax_skipped_blocks = mSkipSoftmaxSkippedBlocks;
+#endif
     // Cross attention parameters.
     xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp
@@ -105,7 +105,8 @@ CubinObj CompileEngine::compile() const
         // scratch in this case.
         /*use_input_kv=*/applyRoPEInXqaKernel,
         /*rope_style=*/ropeStyle,
-        /*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree};
+        /*is_spec_dec_tree=*/mXqaParams.is_spec_dec_tree,
+        /*use_skip_softmax_attn=*/mXqaParams.skip_softmax_threshold_scale_factor != 0};
     if (context.kernel_type == TLLM_XQA_JIT_MLA)
     {
         auto const& c = context;
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
@@ -232,6 +232,7 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
     jit::CubinObj const* const cubinObj = mResource->getCubinObjRegistry()->getCubin(key);
     TLLM_CHECK(cubinObj != nullptr && cubinObj->isInitialized());
     bool const isSpecDec = xqaParams.multi_query_tokens;
+    bool const isSkipSoftmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
     bool const isHMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kAMPERE_WARP_SPECIALIZED);
     bool const isGMMAKernel = (cubinObj->getKernelType() == XQAKernelType::kHOPPER_WARP_SPECIALIZED);
     bool const isMLAKernel = (cubinObj->getKernelType() == XQAKernelType::kSM120_MLA);
@@ -514,6 +515,15 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const&
             appendParam(&specDecParams);
             specDecBlocks = divUp(specDecParams.qSeqLen, 64 / num_q_heads_over_kv);
         }
+        if (isSkipSoftmax)
+        {
+            TLLM_CHECK_WITH_INFO(isGMMAKernel, "skip softmax is only supported for GMMA kernel in JIT path for now.");
+            appendParam(&xqaParams.skip_softmax_threshold_scale_factor);
+#ifdef SKIP_SOFTMAX_STAT
+            appendParam(&xqaParams.skip_softmax_total_blocks);
+            appendParam(&xqaParams.skip_softmax_skipped_blocks);
+#endif
+        }
         appendParam(&launchParams.semaphores);
         appendParam(&launchParams.scratch);
         kernelParams[idxNextParam] = nullptr; // one extra nullptr at end as guard.
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp
@@ -96,10 +96,16 @@ bool supportConfigQGMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlu
     {
         return false;
     }
-    if (xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
+    if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, xqaParams.kv_cache_data_type))
     {
         return false;
     }
+    bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
+    if (!is_skip_softmax && xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)
+    {
+        // Only use hopper kernel with fp16/bf16 kv cache data type when skip softmax is enabled
+        return false;
+    }
     if (xqaParams.beam_width != 1)
     {
         return false;
@@ -168,6 +174,11 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug
     {
         return false;
     }
+    bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
+    if (is_skip_softmax)
+    {
+        return false;
+    }
     return true;
 }
 
@@ -201,6 +212,11 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi
     {
         return false;
     }
+    bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;
+    if (is_skip_softmax)
+    {
+        return false;
+    }
     return true;
 }
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/include/nvrtcWrapper.h
@@ -66,6 +66,7 @@ extern "C"
 
         bool is_spec_dec_tree
             = true; // useful only when multi_query_tokens, should be true unless using linear tree in spec-dec.
+        bool use_skip_softmax_attn;
     } tllmXqaJitContext;
 
     // tllmXqaJitProgram is an opaque handle for a program.
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/src/nvrtcWrapper.cpp
@@ -215,6 +215,10 @@ tllmXqaJitStatus getMacroFlags(tllmXqaJitContext const* context, std::vector<std
     macros["USE_INPUT_KV"] = context->use_input_kv ? "1" : "0";
     macros["ROPE_STYLE"] = std::to_string(int(context->rope_style));
     macros["IS_SPEC_DEC_TREE"] = context->is_spec_dec_tree ? "1" : "0";
+    macros["SKIP_SOFTMAX_ATTN"] = context->use_skip_softmax_attn ? "1" : "0";
+#ifdef SKIP_SOFTMAX_STAT
+    macros["SKIP_SOFTMAX_ATTN_BLOCK_STATS"] = context->use_skip_softmax_attn ? "1" : "0";
+#endif
 
     // Without these macros, NVRTC uses precompiled headers for cuda_fp16.h etc.
     // Linking might fail due to ABI incompatibility.
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
@@ -493,6 +493,10 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo
     {
         SUPPORT_RETURN_FALSE("streaming-llm");
     }
+    if (xqaParams.skip_softmax_threshold_scale_factor != 0)
+    {
+        SUPPORT_RETURN_FALSE("skip_softmax_threshold_scale_factor");
+    }
 
     // OPTIMIZE: For the standard generation-phase MHA, there are still extra limitations.
     // NOTE: Medusa mode = Multi_query_tokens > 1.
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp
@@ -64,6 +64,21 @@ CUtensorMapSwizzle getSwizzleMode(uint32_t partBytes)
     }
 };
 
+CUtensorMapDataType_enum getDataTypeFromXqaParams(XQAParams const& xqaParams) // Picks the CUtensorMap element type from xqaParams.kv_cache_data_type.
+{
+    if (xqaParams.kv_cache_data_type == DATA_TYPE_BF16) // BF16 and FP16 map to the same-named tensor-map types.
+    {
+        return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
+    }
+    else if (xqaParams.kv_cache_data_type == DATA_TYPE_FP16)
+    {
+        return CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
+    }
+    TLLM_CHECK(xqaParams.kv_cache_data_type == DATA_TYPE_E4M3 || xqaParams.kv_cache_data_type == DATA_TYPE_E5M2
+        || xqaParams.kv_cache_data_type == DATA_TYPE_INT8); // Any type other than E4M3, E5M2 or INT8 fails this check.
+    return CU_TENSOR_MAP_DATA_TYPE_UINT8; // E4M3, E5M2 and INT8 are all described to the tensor map as UINT8.
+}
+
 CUtensorMap makeTensorMapForQ(std::shared_ptr<CUDADriverWrapper> const& driver, void const* addr,
     CUtensorMapDataType_enum dataType, uint32_t headElems, uint32_t totalNbHeads, uint32_t partElems, uint32_t boxHeads)
 {
@@ -131,24 +146,26 @@ CUtensorMap makeTensorMapForHopperXqaKVCache(
     if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
     {
         uint32_t const headElems = xqaParams.head_size;
-        uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
+        CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
+        uint32_t const elemBytes = getElemBytes(dataType);
         TLLM_CHECK(headElems <= 256);
         uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
         uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
-        return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
-            xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
+        return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
+            xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
     }
     else
     {
         static_assert(std::is_same_v<KVCacheBuffer, KVLinearBuffer>);
         uint32_t const headElems = xqaParams.head_size;
-        uint32_t const elemBytes = getElemBytes(CU_TENSOR_MAP_DATA_TYPE_UINT8);
+        CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
+        uint32_t const elemBytes = getElemBytes(dataType);
         TLLM_CHECK(headElems <= 256);
         uint32_t const paddedHeadElems = headElems <= 64 ? 64 : (headElems <= 128 ? 128 : 256);
         uint32_t const partElems = std::min(elemBytes * paddedHeadElems, 128U) / elemBytes;
-        return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, CU_TENSOR_MAP_DATA_TYPE_UINT8,
-            xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width,
-            xqaParams.batch_size, partElems);
+        return makeTensorMapForContiguousKVCache(driver, kv_cache_buffer.data, dataType, xqaParams.head_size,
+            xqaParams.num_kv_heads, xqaParams.max_attention_window_size, xqaParams.beam_width, xqaParams.batch_size,
+            partElems);
     }
 }
 
@@ -161,11 +178,12 @@ template <typename KVCacheBuffer>
 CUtensorMap makeTensorMapForXqaMlaKVCache(std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver,
     XQAParams const& xqaParams, KVCacheBuffer const& kv_cache_buffer, bool forK)
 {
+    CUtensorMapDataType_enum const dataType = getDataTypeFromXqaParams(xqaParams);
     uint32_t const partElems = (forK ? 64 : 128);
     if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
     {
-        return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, CU_TENSOR_MAP_DATA_TYPE_UINT8,
-            xqaParams.head_size, xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
+        return makeTensorMapForPagedKVCache(driver, kv_cache_buffer.mPrimaryPoolPtr, dataType, xqaParams.head_size,
+            xqaParams.num_kv_heads, xqaParams.tokens_per_block, partElems);
     }
     else
     {
@@ -183,7 +201,7 @@ CUtensorMap makeTensorMapForXqaMlaQ(
     std::shared_ptr<tensorrt_llm::common::CUDADriverWrapper> const& driver, XQAParams const& xqaParams, void const* q)
 {
     uint32_t const partElems = 64;
-    return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size,
+    return makeTensorMapForQ(driver, q, getDataTypeFromXqaParams(xqaParams), xqaParams.head_size,
         xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads);
 }
 } // namespace kernels
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h
@@ -119,7 +119,12 @@ struct XQAParams
     bool use_sparse_attention = false;
 
     // Skip softmax threshold.
-    float skip_softmax_threshold_scale_factor = 0.0f;
+    float skip_softmax_threshold_scale_factor = 0;
+
+#ifdef SKIP_SOFTMAX_STAT
+    uint32_t* skip_softmax_total_blocks = nullptr;
+    uint32_t* skip_softmax_skipped_blocks = nullptr;
+#endif
 
     cudaStream_t stream = 0;
     // layer index
@@ -199,6 +204,10 @@ struct XQAParams
            << "sparse_params: " << sparse_params.toString() << std::endl
            << "use_sparse_attention :" << (use_sparse_attention ? "true" : "false") << std ::endl
            << "skip_softmax_threshold_scale_factor :" << skip_softmax_threshold_scale_factor << std ::endl
+#ifdef SKIP_SOFTMAX_STAT
+           << "skip_softmax_total_blocks :" << skip_softmax_total_blocks << std ::endl
+           << "skip_softmax_skipped_blocks :" << skip_softmax_skipped_blocks << std ::endl
+#endif
            << "stream :" << stream;
 
         return ss.str();

Original file line number	Diff line number	Diff line change
`@@ -96,10 +96,16 @@ bool supportConfigQGMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlu`
`96`	`96`	`{`
`97`	`97`	`return false;`
`98`	`98`	`}`
`99`		`- if (xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)`
	`99`	`+ if (!contains({DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_E4M3}, xqaParams.kv_cache_data_type))`
`100`	`100`	`{`
`101`	`101`	`return false;`
`102`	`102`	`}`
	`103`	`+ bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;`
	`104`	`+ if (!is_skip_softmax && xqaParams.kv_cache_data_type != DATA_TYPE_E4M3)`
	`105`	`+ {`
	`106`	`+ // Only use hopper kernel with fp16/bf16 kv cache data type when skip softmax is enabled`
	`107`	`+ return false;`
	`108`	`+ }`
`103`	`109`	`if (xqaParams.beam_width != 1)`
`104`	`110`	`{`
`105`	`111`	`return false;`
`@@ -168,6 +174,11 @@ bool supportConfigHMMA(XQAParams const& xqaParams, int SM, bool forConfigurePlug`
`168`	`174`	`{`
`169`	`175`	`return false;`
`170`	`176`	`}`
	`177`	`+ bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;`
	`178`	`+ if (is_skip_softmax)`
	`179`	`+ {`
	`180`	`+ return false;`
	`181`	`+ }`
`171`	`182`	`return true;`
`172`	`183`	`}`
`173`	`184`
`@@ -201,6 +212,11 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi`
`201`	`212`	`{`
`202`	`213`	`return false;`
`203`	`214`	`}`
	`215`	`+ bool const is_skip_softmax = xqaParams.skip_softmax_threshold_scale_factor != 0;`
	`216`	`+ if (is_skip_softmax)`
	`217`	`+ {`
	`218`	`+ return false;`
	`219`	`+ }`
`204`	`220`	`return true;`
`205`	`221`	`}`
`206`	`222`
Original file line number	Diff line number	Diff line change
`@@ -493,6 +493,10 @@ bool DecoderXQAImplPrecompiled::shouldUse(XQAParams const& xqaParams, bool forCo`
`493`	`493`	`{`
`494`	`494`	`SUPPORT_RETURN_FALSE("streaming-llm");`
`495`	`495`	`}`
	`496`	`+ if (xqaParams.skip_softmax_threshold_scale_factor != 0)`
	`497`	`+ {`
	`498`	`+ SUPPORT_RETURN_FALSE("skip_softmax_threshold_scale_factor");`
	`499`	`+ }`
`496`	`500`
`497`	`501`	`// OPTIMIZE: For the standard generation-phase MHA, there are still extra limitations.`
`498`	`502`	`// NOTE: Medusa mode = Multi_query_tokens > 1.`