2 changes: 1 addition & 1 deletion python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -319,7 +319,7 @@ def forward_cpu(
layer.w2_weight,
topk_weights.to(
torch.float
-                ), # TODO: the topk_weights of llama4 is computed via Llama4MoE:custom_routing_function and is bfloat16 while the kernel requires it to be float32
+                ), # TODO: [Note] topk_weights should be float32 for now. The topk_weights of llama4 are computed via Llama4MoE:custom_routing_function and are bfloat16, while the kernel requires float32.
topk_ids,
False, # inplace # See [Note] inplace should be False in fused_experts.
False, # use_int8_w8a8
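For context, here is a minimal, self-contained sketch of the dtype mismatch this hunk works around. `custom_routing_function` below is a hypothetical stand-in for Llama4MoE's router, not the actual implementation: the point is only that the routing weights come back in the activation dtype (bfloat16 for llama4), while the fused Triton kernel requires float32.

```python
import torch

# Hypothetical stand-in for Llama4MoE:custom_routing_function. The routing
# weights inherit the activation dtype (bfloat16 for llama4).
def custom_routing_function(hidden_states, num_experts, top_k):
    router_logits = torch.randn(
        hidden_states.shape[0], num_experts, dtype=hidden_states.dtype
    )
    topk_weights, topk_ids = torch.topk(torch.sigmoid(router_logits), top_k, dim=-1)
    return topk_weights, topk_ids

x = torch.randn(4, 16, dtype=torch.bfloat16)
topk_weights, topk_ids = custom_routing_function(x, num_experts=8, top_k=2)
assert topk_weights.dtype == torch.bfloat16  # the dtype the kernel rejects

# The change in this PR: cast at the call site before invoking fused_experts.
topk_weights = topk_weights.to(torch.float)
assert topk_weights.dtype == torch.float32
```

Casting at the call site leaves the router's output dtype untouched for any other consumers.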
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/quantization/fp8.py
@@ -1002,7 +1002,9 @@ def apply(
x,
layer.w13_weight,
layer.w2_weight,
-            topk_weights,
+            topk_weights.to(
+                torch.float
+            ), # See [Note] topk_weights should be float32 for now.
topk_ids,
False, # inplace See [Note] inplace should be False in fused_experts.
False, # use_int8_w8a8
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -290,7 +290,9 @@ def apply(
x,
layer.w13_weight,
layer.w2_weight,
-            topk_weights,
+            topk_weights.to(
+                torch.float
+            ), # See [Note] topk_weights should be float32 for now.
topk_ids,
False, # inplace See [Note] inplace should be False in fused_experts.
True, # use_int8_w8a8
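The same cast now appears at three call sites (the Triton MoE layer, fp8, and w8a8_int8). A possible follow-up, sketched here with a hypothetical helper name (`ensure_float32_topk` does not exist in sglang), would be to normalize the dtype in one shared place:

```python
import torch

def ensure_float32_topk(topk_weights: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: the fused_experts kernels require float32 routing
    # weights, so cast only when the router produced another dtype
    # (e.g. bfloat16 from Llama4MoE:custom_routing_function).
    if topk_weights.dtype != torch.float32:
        return topk_weights.to(torch.float32)
    return topk_weights
```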