-
Notifications
You must be signed in to change notification settings - Fork 760
feat: add trtllm_fp8_block_scale_routed_moe API #2382
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
416fcbf
b2fcd9b
5712ad0
5b1bb34
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,177 @@ | ||
| """ | ||
| Copyright (c) 2025 by FlashInfer team. | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| """ | ||
|
|
||
| import pytest | ||
| import torch | ||
|
|
||
| from flashinfer import ( | ||
| RoutingMethodType, | ||
| ) | ||
| from flashinfer.fused_moe import ( | ||
| trtllm_fp8_block_scale_moe, | ||
| trtllm_fp8_block_scale_routed_moe, | ||
| ) | ||
| from flashinfer.utils import device_support_pdl | ||
|
|
||
| from .test_trtllm_gen_fused_moe import ( | ||
| routing_reference_renormalize, | ||
| routing_reference_renormalize_naive, | ||
| routing_reference_topk, | ||
| ) | ||
|
|
||
| from flashinfer.utils import get_compute_capability | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("num_tokens", [1, 8, 1024]) | ||
| @pytest.mark.parametrize("hidden_size", [1024, 2048, 3072, 4096]) | ||
| @pytest.mark.parametrize("intermediate_size", [1024, 2048, 3072, 4096]) | ||
| @pytest.mark.parametrize("num_experts", [128, 256]) | ||
| @pytest.mark.parametrize("top_k", [4, 8]) | ||
| @pytest.mark.parametrize( | ||
| "routing_method_type", | ||
| [ | ||
| RoutingMethodType.Renormalize, | ||
| RoutingMethodType.RenormalizeNaive, | ||
| RoutingMethodType.TopK, | ||
| ], | ||
| ) | ||
| def test_trtllm_fp8_routed_fused_moe( | ||
| num_tokens: int, | ||
| hidden_size: int, | ||
| intermediate_size: int, | ||
| top_k: int, | ||
| num_experts: int, | ||
| routing_method_type: RoutingMethodType, | ||
| ): | ||
| compute_capability = get_compute_capability(torch.device(device="cuda")) | ||
| if compute_capability[0] not in [10]: | ||
| pytest.skip("These tests are only guaranteed to work on SM100 and SM103 GPUs.") | ||
| torch.manual_seed(42) | ||
| device = torch.device("cuda:0") | ||
| enable_pdl = device_support_pdl(device) | ||
| routing_logits = torch.rand(num_tokens, num_experts, device=device).to( | ||
| torch.bfloat16 | ||
| ) | ||
|
|
||
| # Create FP8 hidden states and scales | ||
| hidden_states = torch.randn(num_tokens, hidden_size, device=device).to( | ||
| torch.float8_e4m3fn | ||
| ) | ||
| # Block scale: [hidden_size//128, num_tokens] | ||
| hidden_states_scale = torch.rand( | ||
| hidden_size // 128, num_tokens, device=device, dtype=torch.float32 | ||
| ) | ||
|
|
||
| # Create FP8 weights and scales | ||
| gemm1_weights = torch.randn( | ||
| num_experts, intermediate_size * 2, hidden_size, device=device | ||
| ).to(torch.float8_e4m3fn) | ||
| gemm1_weights_scale = torch.rand( | ||
| num_experts, | ||
| intermediate_size * 2 // 128, | ||
| hidden_size // 128, | ||
| device=device, | ||
| dtype=torch.float32, | ||
| ) | ||
|
|
||
| gemm2_weights = torch.randn( | ||
| num_experts, hidden_size, intermediate_size, device=device | ||
| ).to(torch.float8_e4m3fn) | ||
| gemm2_weights_scale = torch.rand( | ||
| num_experts, | ||
| hidden_size // 128, | ||
| intermediate_size // 128, | ||
| device=device, | ||
| dtype=torch.float32, | ||
| ) | ||
|
|
||
| # Run the non-routed version as reference | ||
| reference_output = trtllm_fp8_block_scale_moe( | ||
| routing_logits, | ||
| None, # routing_bias | ||
| hidden_states, | ||
| hidden_states_scale, | ||
| gemm1_weights, | ||
| gemm1_weights_scale, | ||
| gemm2_weights, | ||
| gemm2_weights_scale, | ||
| num_experts, | ||
| top_k, | ||
| None, # n_group | ||
| None, # topk_group | ||
| intermediate_size, | ||
| 0, # local_expert_offset | ||
| num_experts, | ||
| None, # routed_scaling_factor | ||
| routing_method_type.value, | ||
| False, # use_shuffled_weight | ||
| 0, # weight_layout | ||
| enable_pdl, | ||
| ).to(torch.float) | ||
|
|
||
| # Compute routing for routed version | ||
| if routing_method_type == RoutingMethodType.Renormalize: | ||
| permute_info, expert_weights = routing_reference_renormalize( | ||
| routing_logits, top_k, num_experts, 8 | ||
| ) | ||
| elif routing_method_type == RoutingMethodType.RenormalizeNaive: | ||
| permute_info, expert_weights = routing_reference_renormalize_naive( | ||
| routing_logits, top_k, num_experts, 8 | ||
| ) | ||
| elif routing_method_type == RoutingMethodType.TopK: | ||
| permute_info, expert_weights = routing_reference_topk( | ||
| routing_logits, top_k, num_experts, 8 | ||
| ) | ||
| topk_ids = permute_info["topKIndices"].to(torch.int32) | ||
| expert_weights = expert_weights.view(num_tokens, num_experts)[ | ||
| torch.arange(num_tokens).unsqueeze(1), topk_ids | ||
| ].to(torch.bfloat16) | ||
|
|
||
| # Pack topk_ids and expert_weights into a single tensor | ||
| packed_tensor = (topk_ids.to(torch.int32) << 16) | expert_weights.to( | ||
| torch.bfloat16 | ||
| ).view(torch.int16) | ||
|
|
||
|
Comment on lines
+143
to
+147
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # First, find the flashinfer/fused_moe/core.py file and search for routed API docstring
fd "core.py" --path "*fused_moe*" | head -20Repository: flashinfer-ai/flashinfer Length of output: 283 🏁 Script executed: # Also locate the test file to understand the context better
fd "test_trtllm_fp8_routed_fused_moe.py" | head -10Repository: flashinfer-ai/flashinfer Length of output: 113 🏁 Script executed: # Find core.py files
fd "core\.py" | grep -i moeRepository: flashinfer-ai/flashinfer Length of output: 96 🏁 Script executed: # View the test file around lines 143-147 to get context
head -160 tests/moe/test_trtllm_fp8_routed_fused_moe.py | tail -30Repository: flashinfer-ai/flashinfer Length of output: 1119 🏁 Script executed: # Check the flashinfer/fused_moe/core.py file for routed API docstring
cat flashinfer/fused_moe/core.py | head -150Repository: flashinfer-ai/flashinfer Length of output: 4391 🏁 Script executed: # Search for routed_moe function and its documentation
rg "routed_moe|trtllm_fp8_block_scale_routed_moe" flashinfer/fused_moe/core.py -A 20 | head -80Repository: flashinfer-ai/flashinfer Length of output: 1557 🏁 Script executed: # Search for the docstring of trtllm_fp8_block_scale_routed_moe function
rg -A 50 "def trtllm_fp8_block_scale_routed_moe" flashinfer/fused_moe/core.py | head -100Repository: flashinfer-ai/flashinfer Length of output: 2741 🏁 Script executed: # Also search for any documentation about bit layout or packing format
rg "upper.*bits|lower.*bits|score.*index|packed.*tensor" flashinfer/fused_moe/core.py -B 2 -A 2Repository: flashinfer-ai/flashinfer Length of output: 50 🏁 Script executed: # Search for kernel implementation or C++ code related to topk_ids packing
fd -e h -e cu | xargs rg "topk_ids|packed.*index|score.*bits" -l | head -20Repository: flashinfer-ai/flashinfer Length of output: 148 🏁 Script executed: # Search for any other test files that use trtllm_fp8_block_scale_routed_moe to see how they pack
rg "trtllm_fp8_block_scale_routed_moe" tests/ -lRepository: flashinfer-ai/flashinfer Length of output: 113 🏁 Script executed: # Check the kernel launcher to see what it expects
cat csrc/trtllm_fused_moe_kernel_launcher.cu | head -200Repository: flashinfer-ai/flashinfer Length of output: 8174 🏁 Script executed: # Search for bit shift or packing operations in the kernel
rg ">> 16|& 0xFFFF|topk" csrc/trtllm_fused_moe_kernel_launcher.cu -B 2 -A 2Repository: flashinfer-ai/flashinfer Length of output: 6658 🏁 Script executed: # Search for the actual unpacking or usage of topk_ids in the kernel headers
fd -e h | xargs rg "topk_ids|extract.*index|extract.*score" -B 3 -A 3 | head -100Repository: flashinfer-ai/flashinfer Length of output: 50 🏁 Script executed: # Look at the RoutingKernel.h to see how topk is structured
cat flashinfer/include/flashinfer/trtllm/fused_moe/RoutingKernel.h 2>/dev/null | head -100Repository: flashinfer-ai/flashinfer Length of output: 50 🏁 Script executed: # Find runner.h and look for topk_ids or packing documentation
find . -name "runner.h" | xargs cat 2>/dev/null | head -200Repository: flashinfer-ai/flashinfer Length of output: 8394 🏁 Script executed: # Search more broadly for bit manipulation patterns in fused_moe headers
find . -path "*/trtllm/fused_moe/*.h" -type f | head -10Repository: flashinfer-ai/flashinfer Length of output: 324 🏁 Script executed: # Check DevKernel.h for topk_ids packing/unpacking
cat ./include/flashinfer/trtllm/fused_moe/DevKernel.h | head -300Repository: flashinfer-ai/flashinfer Length of output: 20530 🏁 Script executed: # Search for explicit bit operations on expert indices or weights
rg ">> 16|<< 16|0xFFFF" ./include/flashinfer/trtllm/fused_moe/ -B 3 -A 3Repository: flashinfer-ai/flashinfer Length of output: 2309 🏁 Script executed: # Get more context around the packing/unpacking in the kernel
cat ./include/flashinfer/trtllm/fused_moe/RoutingKernelTopK.cuh | grep -A 15 "unpack"Repository: flashinfer-ai/flashinfer Length of output: 2310 🏁 Script executed: # Also check if there are any other references to the packing format
rg "moveBits|compactTmp" ./include/flashinfer/trtllm/fused_moe/RoutingKernelTopK.cuh -B 5 -A 5 | head -80Repository: flashinfer-ai/flashinfer Length of output: 1537 Bit layout is reversed: test packs indices (upper) and weights (lower), but kernel expects weights (upper) and indices (lower). The kernel implementation in However, the test at lines 143-147 reverses this: 🤖 Prompt for AI Agents |
||
| # Run the routed version | ||
| output = trtllm_fp8_block_scale_routed_moe( | ||
| packed_tensor, | ||
| None, # routing_bias | ||
| hidden_states, | ||
| hidden_states_scale, | ||
| gemm1_weights, | ||
| gemm1_weights_scale, | ||
| gemm2_weights, | ||
| gemm2_weights_scale, | ||
| num_experts, | ||
| top_k, | ||
| None, # n_group | ||
| None, # topk_group | ||
| intermediate_size, | ||
| 0, # local_expert_offset | ||
| num_experts, | ||
| None, # routed_scaling_factor | ||
| routing_method_type.value, | ||
| False, # use_shuffled_weight | ||
| 0, # weight_layout | ||
| enable_pdl, | ||
| ).to(torch.float) | ||
|
|
||
| # Compare outputs | ||
| mask = torch.isclose(output, reference_output, rtol=1e-3, atol=1e-3) | ||
|
|
||
| # mismatch percentage | ||
| mismatch_pct = (~mask).float().mean().item() * 100 | ||
| assert mismatch_pct < 6, f"Mismatch percentage is {mismatch_pct:.2f}" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Parameter grid is likely to OOM on common SM100 cards.
The combination `num_experts=256, hidden_size=4096, intermediate_size=4096` allocates ~12+ GB of FP8 weights alone (lines 79–99), which will exceed memory on many GPUs. Please reduce the grid or add a pre-allocation skip based on estimated bytes. 🧮 Example guard to avoid OOM
@@ def test_trtllm_fp8_routed_fused_moe(...): device = torch.device("cuda:0") enable_pdl = device_support_pdl(device) + bytes_per_elem = torch.tensor([], dtype=torch.float8_e4m3fn, device=device).element_size() + est_bytes = num_experts * ( + 2 * intermediate_size * hidden_size + hidden_size * intermediate_size + ) * bytes_per_elem + if est_bytes > 0.5 * torch.cuda.get_device_properties(device).total_memory: + pytest.skip("Skipping large configuration to avoid OOM.")🤖 Prompt for AI Agents