Skip to content

Commit 1a94ecb

Browse files
committed
Update the routing for TRTLLMGEN to support Kimi K2 and Qwen
Remove the cudaDeviceSync; add support for num_experts=16; clean up unit tests. Signed-off-by: Christina Zhang <[email protected]> Signed-off-by: jiahanc <[email protected]>
1 parent d728bcd commit 1a94ecb

12 files changed

+1028
-535
lines changed

csrc/trtllm_batched_gemm_runner.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ void TrtllmGenBatchedGemmRunner::run(
169169
auto const configs = bmm.getBatchedGemmConfigs();
170170

171171
auto const& config = configs[configIndex];
172-
172+
std::cout << "config.mFunctionName: " << config.mFunctionName << std::endl;
173173
FLASHINFER_CHECK(numBatches > 0, "Batched GEMM requires numBatches > 0");
174174
if (!mOptions.staticBatch) {
175175
FLASHINFER_CHECK(totalNumPaddedTokens,

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 76 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
4242
TensorView gemm1_weights, TensorView output1_scales_scalar,
4343
TensorView output1_scales_gate_scalar, TensorView gemm2_weights,
4444
TensorView output2_scales_scalar, TensorView output, int64_t const num_experts,
45-
int64_t const top_k, int64_t const n_group, int64_t const topk_group,
45+
int64_t const top_k, Optional<int64_t> const n_group, Optional<int64_t> const topk_group,
4646
int64_t const intermediate_size, int64_t const local_expert_offset,
47-
int64_t const local_num_experts, double const routed_scaling_factor,
47+
int64_t const local_num_experts, Optional<double> const routed_scaling_factor,
4848
bool const use_routing_scales_on_input, int64_t const tile_tokens_dim,
4949
int64_t const routing_method_type, bool enable_pdl) {
5050
static const std::tuple<int, int> device_props = [hidden_states] {
@@ -62,8 +62,11 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
6262

6363
if (use_routing_scales_on_input) {
6464
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
65-
} else {
65+
} else if (static_cast<RoutingMethodType>(routing_method_type) ==
66+
RoutingMethodType::DeepSeekV3) {
6667
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
68+
} else {
69+
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
6770
}
6871
TVM_FFI_ICHECK_EQ(routing_logits->ndim, 2) << "routing_logits must be 2D.";
6972
TVM_FFI_ICHECK_EQ(routing_logits->shape[1], num_experts) << "routing_logits has incorrect shape.";
@@ -74,17 +77,32 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
7477
<< "routing_bias has incorrect shape.";
7578
}
7679

77-
if (n_group <= 0 || topk_group <= 0) {
78-
TVM_FFI_ICHECK_EQ(top_k, 1) << "Current routing kernel (no groups) only supports top_k=1.";
79-
} else {
80-
TVM_FFI_ICHECK_LE(top_k, 8) << "Current routing kernel (with groups) only supports top_k<=8.";
81-
TVM_FFI_ICHECK_LE(topk_group, 4)
82-
<< "Current routing kernel (with groups) only supports topk_group<=4.";
83-
TVM_FFI_ICHECK_LE(topk_group, n_group) << "n_group must not be smaller than topk_group.";
84-
TVM_FFI_ICHECK_EQ(num_experts % n_group, 0) << "num_experts must be divisible by n_group";
80+
if (n_group.has_value() && n_group.value() != 0) {
81+
TVM_FFI_ICHECK(static_cast<RoutingMethodType>(routing_method_type) ==
82+
RoutingMethodType::DeepSeekV3)
83+
<< "Routing kernel with groups implies DeepSeekV3 routing method.";
84+
TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
85+
TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
86+
<< "num_experts must be divisible by n_group";
87+
TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
88+
<< "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
89+
TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
90+
<< "Current routing kernel only (with groups) supports topk_group<=4 && topk_group > 0.";
91+
TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
92+
<< "n_group must not be smaller than topk_group.";
8593
// This check ensures we have enough experts in the selected groups to handle the top_k routing
86-
TVM_FFI_ICHECK_LT(top_k, (topk_group * num_experts / n_group))
94+
TVM_FFI_ICHECK_LT(top_k, (topk_group.value() * num_experts / n_group.value()))
8795
<< "top_k must be less than total number of experts in selected groups";
96+
} else if (static_cast<RoutingMethodType>(routing_method_type) ==
97+
RoutingMethodType::Renormalize ||
98+
static_cast<RoutingMethodType>(routing_method_type) ==
99+
RoutingMethodType::RenormalizeNaive) {
100+
TVM_FFI_LOG_AND_THROW(NotImplementedError)
101+
<< "Don't support routing method type Renormalize(Naive).";
102+
} else if (static_cast<RoutingMethodType>(routing_method_type) ==
103+
RoutingMethodType::Llama4) {
104+
TVM_FFI_ICHECK_EQ(top_k, 1)
105+
<< "Current routing kernel (no groups, Llama4) only supports top_k=1.";
88106
}
89107
TVM_FFI_ICHECK_EQ(num_experts % 4, 0)
90108
<< "Routing kernel expects that num_experts must be divisible by 4";
@@ -122,11 +140,11 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
122140
args.hidden_size = hidden_states->shape[1];
123141
args.hidden_size_output = args.hidden_size;
124142
args.top_k = top_k;
125-
args.n_group = n_group;
126-
args.topk_group = topk_group;
143+
args.n_group = n_group.has_value() ? n_group.value() : 0;
144+
args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
127145
args.local_expert_offset = local_expert_offset;
128146
args.local_num_experts = local_num_experts;
129-
args.routed_scaling_factor = routed_scaling_factor;
147+
args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
130148
args.intermediate_size = intermediate_size;
131149
args.mUseRoutingScalesOnInput = use_routing_scales_on_input;
132150

@@ -282,8 +300,8 @@ void trtllm_fp8_per_tensor_scale_moe(
282300
TensorView gemm1_weights, TensorView output1_scales_scalar,
283301
TensorView output1_scales_gate_scalar, TensorView gemm2_weights,
284302
TensorView output2_scales_scalar, TensorView output, int64_t num_experts, int64_t top_k,
285-
int64_t n_group, int64_t topk_group, int64_t intermediate_size, int64_t local_expert_offset,
286-
int64_t local_num_experts, double routed_scaling_factor, bool use_routing_scales_on_input,
303+
Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size, int64_t local_expert_offset,
304+
int64_t local_num_experts, Optional<double> routed_scaling_factor, bool use_routing_scales_on_input,
287305
int64_t tile_tokens_dim, int64_t routing_method_type, bool enable_pdl) {
288306
auto dtype = hidden_states->dtype;
289307
if (dtype == dl_float16 || dtype == dl_bfloat16 || dtype == dl_float8_e4m3fn) {
@@ -302,9 +320,9 @@ void trtllm_fp8_block_scale_moe_launcher(
302320
TensorView routing_logits, Optional<TensorView> routing_bias, TensorView hidden_states,
303321
TensorView hidden_states_scale, TensorView gemm1_weights, TensorView gemm1_weights_scale,
304322
TensorView gemm2_weights, TensorView gemm2_weights_scale, TensorView output,
305-
int64_t const num_experts, int64_t const top_k, int64_t const n_group, int64_t const topk_group,
323+
int64_t const num_experts, int64_t const top_k, Optional<int64_t> const n_group, Optional<int64_t> const topk_group,
306324
int64_t const intermediate_size, int64_t const local_expert_offset,
307-
int64_t const local_num_experts, double const routed_scaling_factor,
325+
int64_t const local_num_experts, Optional<double> const routed_scaling_factor,
308326
int64_t const tile_tokens_dim, int64_t const routing_method_type,
309327
tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner& moe_runner, int64_t moeConfigIndex,
310328
bool enable_pdl) {
@@ -321,7 +339,11 @@ void trtllm_fp8_block_scale_moe_launcher(
321339
<< "This kernel requires 10.x architecture. Current device has SM "
322340
<< std::get<0>(device_props) << std::get<1>(device_props);
323341

324-
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
342+
if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::DeepSeekV3) {
343+
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
344+
} else {
345+
TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
346+
}
325347
TVM_FFI_ICHECK_EQ(routing_logits->ndim, 2) << "routing_logits must be 2D.";
326348
TVM_FFI_ICHECK_EQ(routing_logits->shape[0], hidden_states->shape[0])
327349
<< "routing_logits and hidden_states must have the same number of tokens.";
@@ -336,17 +358,32 @@ void trtllm_fp8_block_scale_moe_launcher(
336358
<< "routing_bias has incorrect shape.";
337359
}
338360

339-
if (n_group <= 0 || topk_group <= 0) {
340-
TVM_FFI_ICHECK_EQ(top_k, 1) << "Current routing kernel (no groups) only supports top_k=1.";
341-
} else {
342-
TVM_FFI_ICHECK_LE(top_k, 8) << "Current routing kernel (with groups) only supports top_k<=8.";
343-
TVM_FFI_ICHECK_LE(topk_group, 4)
344-
<< "Current routing kernel (with groups) only supports topk_group<=4.";
345-
TVM_FFI_ICHECK_LE(topk_group, n_group) << "n_group must not be smaller than topk_group.";
346-
TVM_FFI_ICHECK_EQ(num_experts % n_group, 0) << "num_experts must be divisible by n_group";
361+
if (n_group.has_value() && n_group.value() != 0) {
362+
TVM_FFI_ICHECK(static_cast<RoutingMethodType>(routing_method_type) ==
363+
RoutingMethodType::DeepSeekV3)
364+
<< "Routing kernel with groups implies DeepSeekV3 routing method.";
365+
TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
366+
TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
367+
<< "num_experts must be divisible by n_group";
368+
TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
369+
<< "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
370+
TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
371+
<< "Current routing kernel only (with groups) supports topk_group<=4 && topk_group > 0.";
372+
TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
373+
<< "n_group must not be smaller than topk_group.";
347374
// This check ensures we have enough experts in the selected groups to handle the top_k routing
348-
TVM_FFI_ICHECK_LT(top_k, (topk_group * num_experts / n_group))
375+
TVM_FFI_ICHECK_LT(top_k, (topk_group.value() * num_experts / n_group.value()))
349376
<< "top_k must be less than total number of experts in selected groups";
377+
} else if (static_cast<RoutingMethodType>(routing_method_type) ==
378+
RoutingMethodType::Renormalize ||
379+
static_cast<RoutingMethodType>(routing_method_type) ==
380+
RoutingMethodType::RenormalizeNaive) {
381+
TVM_FFI_ICHECK(top_k <= 10 && top_k > 0)
382+
<< "Current routing kernel (no groups, renormalize) only supports top_k<=10 && top_k>0.";
383+
} else if (static_cast<RoutingMethodType>(routing_method_type) ==
384+
RoutingMethodType::Llama4) {
385+
TVM_FFI_ICHECK_EQ(top_k, 1)
386+
<< "Current routing kernel (no groups, Llama4) only supports top_k=1.";
350387
}
351388
TVM_FFI_ICHECK_EQ(num_experts % 4, 0)
352389
<< "Routing kernel expects that num_experts must be divisible by 4";
@@ -383,11 +420,11 @@ void trtllm_fp8_block_scale_moe_launcher(
383420
args.hidden_size = hidden_states->shape[1];
384421
args.hidden_size_output = args.hidden_size;
385422
args.top_k = top_k;
386-
args.n_group = n_group;
387-
args.topk_group = topk_group;
423+
args.n_group = n_group.has_value() ? n_group.value() : 0;
424+
args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
388425
args.local_expert_offset = local_expert_offset;
389426
args.local_num_experts = local_num_experts;
390-
args.routed_scaling_factor = routed_scaling_factor;
427+
args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
391428
args.intermediate_size = intermediate_size;
392429
args.mUseDeepSeekFp8 = true;
393430

@@ -573,9 +610,9 @@ void trtllm_fp8_block_scale_moe(TensorView routing_logits, Optional<TensorView>
573610
TensorView gemm1_weights, TensorView gemm1_weights_scale,
574611
TensorView gemm2_weights, TensorView gemm2_weights_scale,
575612
TensorView output, int64_t num_experts, int64_t top_k,
576-
int64_t n_group, int64_t topk_group, int64_t intermediate_size,
613+
Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size,
577614
int64_t local_expert_offset, int64_t local_num_experts,
578-
double routed_scaling_factor, int64_t tile_tokens_dim,
615+
Optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
579616
int64_t routing_method_type, bool use_shuffled_weight,
580617
int64_t weight_layout, bool enable_pdl) {
581618
auto dtype = hidden_states->dtype;
@@ -696,8 +733,8 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
696733
TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
697734
TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
698735
<< "num_experts must be divisible by n_group";
699-
TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
700-
<< "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
736+
// TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
737+
// << "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
701738
TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
702739
<< "Current routing kernel only (with groups) supports topk_group<=4 && topk_group > 0.";
703740
TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
@@ -710,9 +747,9 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
710747
static_cast<RoutingMethodType>(routing_method_type) ==
711748
RoutingMethodType::RenormalizeNaive ||
712749
static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::TopK) {
713-
TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
714-
<< "Current routing kernel (no groups, renormalize/topk) only supports top_k<=8 && "
715-
"top_k>0.";
750+
// TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
751+
// << "Current routing kernel (no groups, renormalize/topk) only supports top_k<=8 && "
752+
// "top_k>0.";
716753
} else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
717754
TVM_FFI_ICHECK_EQ(top_k, 1)
718755
<< "Current routing kernel (no groups, Llama4) only supports top_k=1.";

0 commit comments

Comments (0)