Commit 9fb400f

fix wrong cache name and remove some commented code
Signed-off-by: jiahanc <[email protected]>
1 parent 8e72f08 commit 9fb400f

File tree

6 files changed: +55 additions, −190 deletions

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 5 additions & 5 deletions
@@ -735,8 +735,8 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
     TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
     TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
         << "num_experts must be divisible by n_group";
-    // TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
-    //     << "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
+    TVM_FFI_ICHECK(top_k <= 10 && top_k > 0)
+        << "Current routing kernel (with groups) only supports top_k<=10 && top_k>0.";
     TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
         << "Current routing kernel only (with groups) supports topk_group<=4 && topk_group > 0.";
     TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
@@ -749,9 +749,9 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
              static_cast<RoutingMethodType>(routing_method_type) ==
                  RoutingMethodType::RenormalizeNaive ||
              static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::TopK) {
-    // TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
-    //     << "Current routing kernel (no groups, renormalize/topk) only supports top_k<=8 && "
-    //        "top_k>0.";
+    TVM_FFI_ICHECK(top_k <= 10 && top_k > 0)
+        << "Current routing kernel (no groups, renormalize/topk) only supports top_k<=10 && "
+           "top_k>0.";
   } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
     TVM_FFI_ICHECK_EQ(top_k, 1)
         << "Current routing kernel (no groups, Llama4) only supports top_k=1.";

csrc/trtllm_fused_moe_routing_renormalize.cu

Lines changed: 0 additions & 14 deletions
@@ -463,20 +463,6 @@ void run(Data const& data, void* stream) {
   }
 }
 
-// void run(Data const& data, void* stream) {
-//   TVM_FFI_ICHECK(data.mPtrExpertIdx != nullptr || data.mPtrScores != nullptr)
-//       << "Routing kernel requires at least one input parameter";
-//   TVM_FFI_ICHECK(data.mPtrPermutedIdxSize != nullptr && data.mPtrCtaIdxXyToBatchIdx != nullptr &&
-//                  data.mPtrCtaIdxXyToMnLimit != nullptr && data.mPtrNumNonExitingCtas != nullptr)
-//       << "Llama4 routing kernel expects permuted idx and grouped Gemm launch config buffers";
-//   TVM_FFI_ICHECK_LE(data.mTopK, MaxNumTopExperts)
-//       << "Routing kernel expects topK experts <= " << MaxNumTopExperts << ", got " << data.mTopK;
-//   TVM_FFI_ICHECK_LT(data.mPaddingLog2, 8)
-//       << "Routing kernel expects padding log2 < 8, got " << data.mPaddingLog2;
-
-//   runImpl(data, stream);
-// }
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 }  // namespace routingRenormalize

flashinfer/fused_moe/core.py

Lines changed: 10 additions & 6 deletions
@@ -171,7 +171,9 @@ def _maybe_get_cached_w3_w1_permute_indices(
     epilogue_tile_m: int,
     num_elts_per_sf: Union[None, int] = None,
 ) -> torch.Tensor:
-    if dst_w3_w1_weight.shape not in _cache_permute_indices:
+    # Create a unique cache key that includes all parameters affecting the permutation
+    cache_key = ("w3_w1", dst_w3_w1_weight.shape)
+    if cache_key not in _cache_permute_indices:
         # Get permute indices and chain them together
         permute0 = get_reorder_rows_for_gated_act_gemm_row_indices(dst_w3_w1_weight)
         if num_elts_per_sf is None:
@@ -185,10 +187,10 @@ def _maybe_get_cached_w3_w1_permute_indices(
             num_elts_per_sf=num_elts_per_sf,
         )
         # Memoize permute indices as recompute is **very** costly
-        _cache_permute_indices[dst_w3_w1_weight.shape] = permute0[permute1].to(
+        _cache_permute_indices[cache_key] = permute0[permute1].to(
             dst_w3_w1_weight.device
         )
-    permute_indices = _cache_permute_indices[dst_w3_w1_weight.shape]
+    permute_indices = _cache_permute_indices[cache_key]
     return permute_indices
 
 
@@ -198,7 +200,9 @@ def get_w2_permute_indices_with_cache(
     epilogue_tile_m: int,
     num_elts_per_sf: Union[None, int] = None,
 ) -> torch.Tensor:
-    if dst_w2_weight.shape not in _cache_permute_indices:
+    # Create a unique cache key that includes all parameters affecting the permutation
+    cache_key = ("w2", dst_w2_weight.shape)
+    if cache_key not in _cache_permute_indices:
         if num_elts_per_sf is None:
             permute_indices = get_shuffle_matrix_a_row_indices(
                 dst_w2_weight, epilogue_tile_m
@@ -210,8 +214,8 @@ def get_w2_permute_indices_with_cache(
             num_elts_per_sf=num_elts_per_sf,
         ).to(dst_w2_weight.device)
         # Memoize permute indices as recompute is **very** costly
-        _cache_permute_indices[dst_w2_weight.shape] = permute_indices
-        permute_indices = _cache_permute_indices[dst_w2_weight.shape]
+        _cache_permute_indices[cache_key] = permute_indices
+        permute_indices = _cache_permute_indices[cache_key]
     return permute_indices
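The point of the new key is that permute indices for the fused w3/w1 weight and for the w2 weight must not share a cache slot just because the two tensors happen to have the same shape. Below is a minimal standalone sketch of that idea; the builder functions and names are hypothetical stand-ins, not the real flashinfer helpers.

```python
from typing import Dict, Tuple

import torch


# Hypothetical stand-ins for the permute-index builders in
# flashinfer/fused_moe/core.py; here they just return distinct permutations.
def build_w3_w1_permutation(weight: torch.Tensor) -> torch.Tensor:
    return torch.arange(weight.shape[0] - 1, -1, -1)


def build_w2_permutation(weight: torch.Tensor) -> torch.Tensor:
    return torch.arange(weight.shape[0])


_cache: Dict[Tuple[str, torch.Size], torch.Tensor] = {}


def get_permutation(kind: str, weight: torch.Tensor) -> torch.Tensor:
    # Keying on (kind, shape) avoids returning w3_w1 indices for a w2 weight
    # of the same shape, which a shape-only key would do.
    key = (kind, weight.shape)
    if key not in _cache:
        builder = build_w3_w1_permutation if kind == "w3_w1" else build_w2_permutation
        _cache[key] = builder(weight)
    return _cache[key]


w = torch.empty(4, 8)
assert not torch.equal(get_permutation("w3_w1", w), get_permutation("w2", w))
```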

include/flashinfer/trtllm/fused_moe/RoutingKernelTopK.cuh

Lines changed: 0 additions & 154 deletions
@@ -138,160 +138,6 @@ struct Sort<4, RedType> {
   }
 };
 
-// For N > 4, use a generic bubble sort approach for simplicity
-// This is not the most efficient but adequate for small N
-template <typename RedType>
-struct Sort<5, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-#pragma unroll
-      for (int j = 0; j < 4 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<6, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 5; ++i) {
-#pragma unroll
-      for (int j = 0; j < 5 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<7, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 6; ++i) {
-#pragma unroll
-      for (int j = 0; j < 6 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<8, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 7; ++i) {
-#pragma unroll
-      for (int j = 0; j < 7 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<9, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 8; ++i) {
-#pragma unroll
-      for (int j = 0; j < 8 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<10, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 9; ++i) {
-#pragma unroll
-      for (int j = 0; j < 9 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<11, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 10; ++i) {
-#pragma unroll
-      for (int j = 0; j < 10 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
-template <typename RedType>
-struct Sort<12, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 11; ++i) {
-#pragma unroll
-      for (int j = 0; j < 11 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-template <typename RedType>
-struct Sort<13, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 12; ++i) {
-#pragma unroll
-      for (int j = 0; j < 12 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-template <typename RedType>
-struct Sort<14, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 13; ++i) {
-#pragma unroll
-      for (int j = 0; j < 13 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-template <typename RedType>
-struct Sort<15, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 14; ++i) {
-#pragma unroll
-      for (int j = 0; j < 14 - i; ++j) {
-        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-template <typename RedType>
-struct Sort<16, RedType> {
-  static __device__ void run(RedType* topK) {
-#pragma unroll
-    for (int i = 0; i < 15; ++i) {
-#pragma unroll
-      for (int j = 0; j < 15 - i; ++j) {
        TOPK_SWAP(j, j + 1);
-      }
-    }
-  }
-};
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <int K, typename Type>
tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def is_cuda_oom_error_str(e: str) -> bool:
 
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
-    # Wrap the test call so we don't invoke item.runtest() ourselves; yield lets pytest run it.
+    # skip OOM error and missing JIT cache errors
     try:
         item.runtest()
     except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
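For context, this hook wraps item.runtest() and converts CUDA out-of-memory failures into skips. The following is a hedged, self-contained sketch of such a conftest hook, assuming a simplified is_cuda_oom_error_str helper in place of the one defined earlier in tests/conftest.py; it is not the repository's actual implementation.

```python
import pytest
import torch


def is_cuda_oom_error_str(e: str) -> bool:
    # Simplified stand-in for the helper defined in tests/conftest.py.
    return "out of memory" in e.lower()


@pytest.hookimpl(tryfirst=True)
def pytest_runtest_call(item):
    # Skip tests that fail only because the GPU ran out of memory.
    try:
        item.runtest()
    except torch.cuda.OutOfMemoryError:
        pytest.skip("CUDA out of memory")
    except RuntimeError as e:
        if is_cuda_oom_error_str(str(e)):
            pytest.skip("CUDA out of memory (RuntimeError)")
        raise
```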

tests/moe/test_trtllm_gen_fused_moe.py

Lines changed: 39 additions & 10 deletions
@@ -105,7 +105,7 @@ def capture(self, hidden_states_sample, **runtime_args):
         self.input_tensor = hidden_states_sample.clone()
 
         # Warmup
-        with torch.cuda.stream(torch_stream), autotune(False):
+        with torch.cuda.stream(torch_stream), autotune(True):
             for _ in range(1):
                 self._run_moe_computation(runtime_args)
 
@@ -1832,13 +1832,14 @@ def _compute_moe_actual_unified(moe_impl, args_dequant, args, **kwargs):
 
 @pytest.fixture(scope="module")
 def cache_permute_indices():
-    _cache_permute_indices: Dict[torch.Size, torch.Tensor] = {}
+    # The cache key is now a tuple of (weight_type, shape)
+    _cache_permute_indices: Dict[tuple, torch.Tensor] = {}
     return _cache_permute_indices
 
 
 @pytest.mark.parametrize("num_tokens", [1, 8, 1024])
 @pytest.mark.parametrize("hidden_size", [1024, 8192])
-@pytest.mark.parametrize("intermediate_size", [1024, 768, 384, 512])
+@pytest.mark.parametrize("intermediate_size", [2048, 1024, 768, 512, 384])
 @pytest.mark.parametrize(
     "moe_impl",
     [
@@ -1905,8 +1906,8 @@ def cache_permute_indices():
         ),
         pytest.param(
             {
-                "num_experts": 512,
-                "top_k": 10,
+                "num_experts": 256,
+                "top_k": 8,
                 "padding": 8,
                 "n_groups": None,
                 "top_k_groups": None,
@@ -1916,9 +1917,9 @@ def cache_permute_indices():
                 "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe],
             },
             id="Renorm",
-            # marks=pytest.mark.skip(
-            #     reason="Disabled for testing speed - similar to RenormalizeNaive"
-            # ),
+            marks=pytest.mark.skip(
+                reason="Disabled for testing speed - similar to RenormalizeNaive"
+            ),
         ),
         pytest.param(
             {
@@ -1929,6 +1930,20 @@ def cache_permute_indices():
                 "top_k_groups": None,
                 "routed_scaling": None,
                 "has_routing_bias": False,
+                "routing_method_type": RoutingMethodType.Renormalize,
+                "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe],
+            },
+            id="Qwen3_next",
+        ),
+        pytest.param(
+            {
+                "num_experts": 256,
+                "top_k": 8,
+                "padding": 8,
+                "n_groups": None,
+                "top_k_groups": None,
+                "routed_scaling": None,
+                "has_routing_bias": False,
                 "routing_method_type": RoutingMethodType.RenormalizeNaive,
                 "compatible_moe_impls": [FP4Moe, FP8BlockScaleMoe],
             },
@@ -2041,6 +2056,20 @@ def test_moe_quantization_classes(
             f"Skip for testing speed: {gated_act_type} + {hidden_size} + {intermediate_size}"
         )
 
+    # Skip large intermediate sizes for configurations with many experts
+    if routing_config["num_experts"] >= 512 and intermediate_size > 512:
+        pytest.skip(
+            f"Skipping for testing speed: intermediate_size={intermediate_size} with {routing_config['num_experts']} experts"
+        )
+
+    # Skip large intermediate size and hidden size for configurations with small epxerts
+    if routing_config["num_experts"] < 512 and (
+        intermediate_size > 512 or hidden_size > 1024
+    ):
+        pytest.skip(
+            f"Skipping for testing speed: intermediate_size={intermediate_size} with {routing_config['num_experts']} experts"
+        )
+
     if type(moe_impl) not in routing_config["compatible_moe_impls"]:
         pytest.skip(
             f"Incompatible: {moe_impl.name} + {routing_config['routing_method_type'].name}"
@@ -2085,10 +2114,10 @@ def test_moe_quantization_classes(
         )
         else 64,
     )
-    padding = tile_tokens_dim
+
     # Validation checks
     assert top_k <= num_experts
-    # assert top_k <= 8
+    assert top_k <= 10
     if (top_k_groups is not None) and (n_groups is not None) and (n_groups > 0):
         assert top_k_groups <= 4
         assert num_experts > n_groups
