update trtllm-gen to dd8b

IwakuraRein · IwakuraRein · commit a5f958561353 · 2025-10-30T11:28:02.000-07:00
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
diff --git a/flashinfer/jit/fused_moe.py b/flashinfer/jit/fused_moe.py
@@ -233,11 +233,12 @@ def gen_trtllm_gen_fused_moe_sm100_module() -> JitSpec:
         ],
         extra_cuda_cflags=[
             "-DTLLM_GEN_EXPORT_INTERFACE",
+            "-DTLLM_GEN_EXPORT_FLASHINFER",
             "-DTLLM_ENABLE_CUDA",
             "-DENABLE_BF16",
             "-DENABLE_FP8",
             "-DENABLE_FP4",
-            f'-DTLLM_GEN_BMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_BMM}\\"',
+            f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_BMM}\\"',
         ]
         + nvcc_flags,
         extra_include_paths=[
diff --git a/flashinfer/jit/gemm/core.py b/flashinfer/jit/gemm/core.py
@@ -381,6 +381,7 @@ def gen_trtllm_gen_gemm_module() -> JitSpec:
         ],
         extra_cuda_cflags=[
             "-DTLLM_GEN_EXPORT_INTERFACE",
+            "-DTLLM_GEN_EXPORT_FLASHINFER",
             "-DTLLM_ENABLE_CUDA",
             f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"',
         ]
@@ -531,6 +532,7 @@ def gen_trtllm_low_latency_gemm_module() -> JitSpec:
         ],
         extra_cuda_cflags=[
             "-DTLLM_GEN_EXPORT_INTERFACE",
+            "-DTLLM_GEN_EXPORT_FLASHINFER",
             "-DTLLM_ENABLE_CUDA",
             f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"',
         ]
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmEnums.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmEnums.h
@@ -31,7 +31,9 @@ enum class RouteImpl {
   // Use LDGSTS to do the routing
   Ldgsts = 1,
   // Use UTMALDG.GATHER4 to do the routing
-  Tma = 2
+  Tma = 2,
+  // Use LDG+STS to do the routing
+  LdgPlusSts = 3
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -48,6 +50,10 @@ inline bool doesRouteImplUseTma(RouteImpl mode) { return (mode == RouteImpl::Tma
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+inline bool doesRouteImplUseLdgPlusSts(RouteImpl mode) { return (mode == RouteImpl::LdgPlusSts); }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace batchedGemm
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h
@@ -24,29 +24,22 @@
 #include "trtllm/gen/CudaKernelLauncher.h"
 
 #ifdef TLLM_GEN_EXPORT_INTERFACE
+#ifdef TLLM_GEN_EXPORT_FLASHINFER
 #include "flashinferMetaInfo.h"
-#endif  // TLLM_GEN_EXPORT_INTERFACE
-
-#include "flashinfer/trtllm/common.h"
-#ifdef TLLM_GEN_BMM_CUBIN_PATH
-static const std::string tllm_gen_bmm_cubin_path = std::string(TLLM_GEN_BMM_CUBIN_PATH);
 #else
-static_assert(false, "TLLM_GEN_BMM_CUBIN_PATH macro is not defined when compiling");
-#endif
-
-namespace flashinfer::trtllm_cubin_loader {
-std::string getCubin(const std::string& kernelName, const std::string& sha256);
-}
+#include "KernelMetaInfo.h"
+#endif  // TLLM_GEN_EXPORT_FLASHINFER
+#endif  // TLLM_GEN_EXPORT_INTERFACE
 
 namespace batchedGemm {
 
 namespace batchedGemm {
 
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // BatchedGemmData
 //
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
 struct BatchedGemmData {
   struct ProblemDimensions {
@@ -448,11 +441,11 @@ struct BatchedGemmData {
   OutputBuffers mOutputBuffers;
 };
 
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // BatchedGemmInterface
 //
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
 class BatchedGemmInterface {
  public:
@@ -530,18 +523,12 @@ class BatchedGemmInterface {
     if (config.mData == nullptr) {
       batchedGemmConfig = generateAndCompileKernel(batchedGemmConfig);
     }
+    TLLM_CHECK_ERROR(batchedGemmConfig.mCudaRunner != nullptr, "CudaRunner is not set");
+    batchedGemmConfig.mCudaRunner->run((void*)&kernelParams, (void*)cudaStream, grid,
+                                       /* cluster */ {},
+                                       /* instanceId */ batchedGemmConfig.mInstanceIdx);
+    return 0;
 #endif
-    auto fiModuleLoadData = [&](CUmodule* module) {
-      const std::string sha256 = config.mHash ? config.mHash : "";
-      std::string fname_cubin = config.mFunctionName;
-      if (!fname_cubin.empty()) {
-        fname_cubin[0] =
-            static_cast<char>(std::toupper(static_cast<unsigned char>(fname_cubin[0])));
-      }
-      fname_cubin = tllm_gen_bmm_cubin_path + "/" + fname_cubin + ".cubin";
-      std::string cubin = flashinfer::trtllm_cubin_loader::getCubin(fname_cubin, sha256);
-      cuErrCheck(cuModuleLoadData(module, cubin.c_str()));
-    };
 
     CUmodule cuModule;
     CUfunction cuFunction;
@@ -567,12 +554,12 @@ class BatchedGemmInterface {
       if (module != moduleCacheRef.end()) {
         cuFunction = std::get<1>(module->second);
       } else {
-        fiModuleLoadData(&cuModule);
+        gemm::loadCubinData(&cuModule, batchedGemmConfig);
         cuModuleGetFunction(&cuFunction, cuModule, batchedGemmConfig.mFunctionName);
         moduleCacheRef.insert(std::make_pair(moduleKey, std::make_tuple(cuModule, cuFunction)));
       }
     } else {
-      fiModuleLoadData(&cuModule);
+      gemm::loadCubinData(&cuModule, batchedGemmConfig);
       cuModuleGetFunction(&cuFunction, cuModule, batchedGemmConfig.mFunctionName);
     }
 
@@ -808,10 +795,10 @@ class BatchedGemmInterface {
   int32_t mNumRotations;
 };
 
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
 }  // namespace batchedGemm
 
-//////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
 }  // namespace batchedGemm
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h
@@ -260,8 +260,10 @@ inline bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool i
 
   if (options.mRouteSfsImpl.has_value() && options.mRouteSfsImpl.value() != options.mRouteImpl) {
     TLLM_CHECK_ERROR(
-        options.mRouteSfsImpl.value() == RouteImpl::Ldgsts && options.mRouteImpl == RouteImpl::Tma,
-        "RouteSfsImpl must be equal to RouteImpl, or Ldgsts, when RouteImpl is Tma");
+        (options.mRouteSfsImpl.value() == RouteImpl::Ldgsts ||
+         options.mRouteSfsImpl.value() == RouteImpl::LdgPlusSts) &&
+            options.mRouteImpl == RouteImpl::Tma,
+        "RouteSfsImpl must be equal to RouteImpl, or Ldgsts/LdgPlusSts, when RouteImpl is Tma");
   } else if (!options.mRouteSfsImpl.has_value()) {
     if (updateOptions) {
       options.mRouteSfsImpl = options.mRouteImpl;
@@ -271,6 +273,15 @@ inline bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool i
     }
   }
 
+  TLLM_CHECK_ERROR(options.mRouteImpl != RouteImpl::LdgPlusSts,
+                   "LdgPlusSts does not support routing the tokens");
+
+  if (options.mRouteSfsImpl.has_value() && options.mRouteSfsImpl.value() == RouteImpl::LdgPlusSts) {
+    TLLM_CHECK_ERROR(!batchM, "LdgPlusSts only supports batch N");
+    TLLM_CHECK_ERROR(options.mTileK <= 512 && options.mTileK >= 128,
+                     "LdgPlusSts only supports 128 <= tileK <= 512");
+  }
+
   if (batchM) {
     if (options.mDtypeA == tg::Dtype::MxE2m1 && options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) {
       TLLM_CHECK_ERROR(doesRouteImplUseNoRoute(options.mRouteImpl),
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmOptions.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmOptions.h
@@ -30,7 +30,14 @@
 #include "trtllm/gen/CudaRunner.h"
 #include "trtllm/gen/GenCtx.h"
 #else
+#ifdef TLLM_GEN_EXPORT_FLASHINFER
+#include <string>
+namespace flashinfer::trtllm_cubin_loader {
+std::string getCubin(const std::string& kernelName, const std::string& sha256);
+}
+#endif  // TLLM_GEN_EXPORT_FLASHINFER
 #include <iostream>
+namespace batchedGemm {
 
 template <typename T>
 void printArgs(T arg) {
@@ -72,8 +79,6 @@ void printArgs(T first, Args... args) {
 
 #endif  // TLLM_GEN_EXPORT_INTERFACE
 
-namespace batchedGemm {
-
 namespace trtllm {
 namespace gen {
 class CudaRunner;
@@ -1471,6 +1476,31 @@ inline bool getKernelDoesScaleC(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dt
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <typename Config>
+inline CUresult loadCubinData(CUmodule* module, Config const& config) {
+  // Trtllm links the cubin into the executable while Flashinfer loads the cubin from storage.
+#ifdef TLLM_GEN_EXPORT_FLASHINFER
+#ifdef TLLM_GEN_GEMM_CUBIN_PATH
+  static const std::string tllm_gen_gemm_cubin_path = std::string(TLLM_GEN_GEMM_CUBIN_PATH);
+  const std::string sha256 = config.mHash ? config.mHash : "";  // null mHash -> empty string
+  std::string fileName = config.mFunctionName;
+  if (!fileName.empty()) {
+    fileName[0] = static_cast<char>(std::toupper(static_cast<unsigned char>(fileName[0])));
+  }
+  const std::string& data = flashinfer::trtllm_cubin_loader::getCubin(
+      tllm_gen_gemm_cubin_path + "/" + fileName + ".cubin", sha256);
+  CUresult result = cuModuleLoadData(module, data.c_str());  // status is returned, not checked
+#else  // !TLLM_GEN_GEMM_CUBIN_PATH
+  static_assert(false, "TLLM_GEN_GEMM_CUBIN_PATH macro is not defined when compiling");
+#endif  // TLLM_GEN_GEMM_CUBIN_PATH
+#else  // !TLLM_GEN_EXPORT_FLASHINFER
+  CUresult result = cuModuleLoadData(module, config.mData);
+#endif  // TLLM_GEN_EXPORT_FLASHINFER
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace gemm
 
 #ifdef TLLM_GEN_EXPORT_INTERFACE