@@ -24,6 +24,7 @@
 #include "flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/Enums.h"
 #include "flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h"
 #include "flashinfer/trtllm/common.h"
+#include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"

 namespace tensorrt_llm {
@@ -306,6 +307,8 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(
   auto const bmm = BatchedGemmInterface();
   auto const configs = bmm.getBatchedGemmConfigs();

+  int32_t multiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
+
   BatchedGemmData gemmData;
   // Dims
   gemmData.mProblemDimensions.mNumBatches = numBatches;
@@ -322,67 +325,57 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(
   gemmData.mProblemDimensions.mWorldSize = 1;
   gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim;

-  // Tier 0: K < tileK, prefer higher efficiency.
-  auto cmpTier0 = [&configs, &gemmData](int64_t idx0, int64_t idx1) {
+  auto cmpFunc = [&configs, &gemmData, &bmm, &multiProcessorCount](int64_t idx0, int64_t idx1) {
     auto const& optionsA = configs[idx0].mOptions;
     auto const& optionsB = configs[idx1].mOptions;
     int32_t sizeK = gemmData.mProblemDimensions.mK;
-    // Both waste computation, prefer higher efficiency.
-    if (sizeK <= optionsA.mTileK && sizeK <= optionsB.mTileK) {
-      double eff_a = (double)sizeK / optionsA.mTileK;
-      double eff_b = (double)sizeK / optionsB.mTileK;
-      return eff_a > eff_b;
-    }
-    // If either can be utilized, sort by tileK.
-    else {
-      return optionsA.mTileK > optionsB.mTileK;
+
+    // Tier 0: K < tileK, prefer higher efficiency.
+    if (optionsA.mTileK != optionsB.mTileK) {
+      // Both waste computation, prefer higher efficiency.
+      if (sizeK <= optionsA.mTileK && sizeK <= optionsB.mTileK) {
+        double eff_a = (double)sizeK / optionsA.mTileK;
+        double eff_b = (double)sizeK / optionsB.mTileK;
+        return eff_a > eff_b;
+      }
+      // If either can be utilized, sort by tileK.
+      else {
+        return optionsA.mTileK > optionsB.mTileK;
+      }
     }
-  };
-  // Tier 1: When tileK is the same, prefer unroll loop 2x for mma.
-  auto cmpTier1 = [&configs](int64_t idx0, int64_t idx1) {
-    auto const& optionsA = configs[idx0].mOptions;
-    auto const& optionsB = configs[idx1].mOptions;
-    if (optionsA.mTileK == optionsB.mTileK) {
+
+    // Tier 1: When tileK is the same, prefer unroll loop 2x for mma.
+    if (optionsA.mUseUnrollLoop2xForMma != optionsB.mUseUnrollLoop2xForMma) {
       return optionsA.mUseUnrollLoop2xForMma;
     }
-    return false;
-  };
-  // Tier 2+: When previous comparators are the same, prefer higher tileM.
-  auto cmpTier2 = [&configs](int64_t idx0, int64_t idx1) {
-    auto const& optionsA = configs[idx0].mOptions;
-    auto const& optionsB = configs[idx1].mOptions;
-    if (optionsA.mTileK == optionsB.mTileK &&
-        optionsA.mUseUnrollLoop2xForMma == optionsB.mUseUnrollLoop2xForMma) {
+
+    // Tier 2+: When previous comparators are the same, prefer higher tileM.
+    if (optionsA.mTileM != optionsB.mTileM) {
       return optionsA.mTileM > optionsB.mTileM;
     }
-    return false;
-  };
-  // Tier 2+: When previous comparators are the same, and when number of estimated CTAs is on the
-  // larger side, prefer persistent tile scheduler. The threshold is hardcoded as >148 CTAs at the
-  // moment.
-  auto cmpTier3 = [&configs, &gemmData](int64_t idx0, int64_t idx1) {
-    int32_t sizeM = gemmData.mProblemDimensions.mM;
-    int32_t sizeN = gemmData.mProblemDimensions.mN;
-    auto const& optionsA = configs[idx0].mOptions;
-    auto const& optionsB = configs[idx1].mOptions;
-    if (optionsA.mTileK == optionsB.mTileK &&
-        optionsA.mUseUnrollLoop2xForMma == optionsB.mUseUnrollLoop2xForMma &&
-        optionsA.mTileM == optionsB.mTileM) {
-      int64_t numTilesM = batchedGemm::gemm::divUp(sizeM, optionsA.mTileM);
-      int64_t numTilesN = batchedGemm::gemm::divUp(sizeN, optionsA.mTileN);
-      if (numTilesM * numTilesN > 148) {
+
+    // Tier 2+: When previous comparators are the same, prefer higher tileN.
+    if (optionsA.mTileN != optionsB.mTileN) {
+      return optionsA.mTileN > optionsB.mTileN;
+    }
+
+    // Tier 2+: When previous comparators are the same, and when the number of estimated CTAs is on
+    // the larger side, prefer persistent tile scheduler.
+    if (optionsA.mTileScheduler != optionsB.mTileScheduler) {
+      auto options = bmm.getOptionsFromConfigAndData(configs[idx0], gemmData);
+      auto numCtas = bmm.getNumCtas(options, gemmData.mProblemDimensions.mMaxNumCtasInTokenDim);
+      if (numCtas > multiProcessorCount) {
         return optionsA.mTileScheduler == batchedGemm::gemm::TileScheduler::Persistent;
+      } else {
+        return optionsB.mTileScheduler == batchedGemm::gemm::TileScheduler::Persistent;
       }
     }
+
     return false;
   };
-
   // Sort configs by options.
   std::vector<int64_t> sortedIndices = mPassingConfigIndices;
-  std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier0);
-  std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier1);
-  std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier2);
-  std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier3);
+  std::sort(sortedIndices.begin(), sortedIndices.end(), cmpFunc);

   // Special rules for corner cases, if applicable.
   std::vector<int64_t> prioritizedIndices =
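
The core of this change is collapsing the four chained `std::sort` passes (`cmpTier0` through `cmpTier3`) into one tiered comparator. `std::sort` makes no stability guarantee, so in the old code any elements that compared equivalent under a later pass (for example, two configs with *different* tileK, for which `cmpTier1` returned `false` in both directions) could be reordered arbitrarily, undoing the ordering established by an earlier pass. A single comparator that decides at the first differing tier and falls through on ties is a valid strict weak ordering and needs only one O(n log n) sort. A minimal sketch of the pattern, with a hypothetical `Config` struct standing in for the real kernel options:

```cpp
#include <algorithm>
#include <vector>

// Hypothetical stand-in for the real kernel config options.
struct Config {
  int tileK;
  bool unrollLoop2x;
  int tileM;
};

int main() {
  std::vector<Config> configs = {{128, false, 64}, {256, true, 128}, {256, false, 128}};

  // One strict weak ordering: each tier decides only when it differs;
  // otherwise control falls through to the next tie-breaker.
  std::sort(configs.begin(), configs.end(), [](Config const& a, Config const& b) {
    if (a.tileK != b.tileK) return a.tileK > b.tileK;             // Tier 0
    if (a.unrollLoop2x != b.unrollLoop2x) return a.unrollLoop2x;  // Tier 1
    if (a.tileM != b.tileM) return a.tileM > b.tileM;             // Tier 2
    return false;  // Equivalent keys must compare false both ways.
  });
  return 0;
}
```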
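Two behavioral changes ride along with the restructuring. First, a tileN tie-breaker tier is added. Second, the persistent-scheduler tier drops the hardcoded `numTilesM * numTilesN > 148` heuristic (148 matches the SM count of a B200, so the old constant baked in one specific GPU) in favor of comparing the launch's estimated CTA count from `bmm.getNumCtas` against the device's actual SM count from `tensorrt_llm::common::getMultiProcessorCount()`, declared in the newly included `cudaUtils.h`. The new tier also prefers the non-persistent scheduler when the grid does not fill the device, where the old `cmpTier3` expressed no preference. A helper of that kind typically wraps a CUDA runtime attribute query; a minimal sketch, not the actual `cudaUtils.h` implementation (which may cache the result and check errors):

```cpp
#include <cuda_runtime.h>

// Minimal sketch: query the number of SMs on the current device.
// Error handling omitted for brevity.
int multiProcessorCountSketch() {
  int device = 0;
  cudaGetDevice(&device);
  int smCount = 0;
  cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);
  return smCount;
}
```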