Merged
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/moe/topk.py
@@ -524,7 +524,7 @@ def biased_grouped_topk_gpu(
topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
aiter_biased_grouped_topk(
-        gating_output,
+        gating_output.to(dtype=torch.float32),
Contributor

Severity: medium

This explicit type cast is a good fix for the dtype mismatch error in the CI.

As you mentioned in the PR description, this is a temporary workaround. To improve long-term maintainability and ensure it is addressed later, I suggest adding a TODO comment here. That will make the reason for the cast clear to anyone reading the code in the future.

Suggested change
gating_output.to(dtype=torch.float32),
# TODO: Cast to float32 to match correction_bias.dtype. This is a temporary
# workaround for a limitation in the aiter kernel. Remove this cast once
# bf16/bf16 mixed-precision GEMM is supported.
# Ref: https://github.com/sgl-project/sglang/pull/7825
gating_output.to(dtype=torch.float32),

correction_bias,
topk_weights,
topk_ids,
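
To illustrate the kind of dtype mismatch this cast avoids, here is a minimal, self-contained sketch. The function below is a hypothetical simplified stand-in for `aiter_biased_grouped_topk` (it is not the real kernel): it adds a float32 `correction_bias` to the gating logits and takes a top-k, which fails in fused kernels when the two operands have different dtypes, so the bf16 gating output is cast up to float32 first, mirroring the fix in this diff.

```python
import torch


def biased_grouped_topk_sketch(
    gating_output: torch.Tensor,   # e.g. bf16 logits of shape (tokens, experts)
    correction_bias: torch.Tensor, # float32 bias of shape (experts,)
    topk: int = 2,
):
    # Cast the logits to float32 to match correction_bias.dtype.
    # This mirrors the temporary workaround in the diff above: the fused
    # kernel assumes both inputs share a dtype, so bf16 logits must be
    # promoted before the biased selection.
    scores = gating_output.to(dtype=torch.float32) + correction_bias

    # Simplified stand-in for the fused biased grouped top-k selection.
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)

    # Match the output buffer dtypes used in the caller (float32 / int32).
    return topk_weights, topk_ids.to(torch.int32)


if __name__ == "__main__":
    gating = torch.randn(4, 8, dtype=torch.bfloat16)
    bias = torch.zeros(8, dtype=torch.float32)
    weights, ids = biased_grouped_topk_sketch(gating, bias)
    print(weights.shape, weights.dtype, ids.dtype)
```

Without the `.to(dtype=torch.float32)` cast, the bf16 + float32 addition still works in eager PyTorch (type promotion handles it), but a fused kernel with strict dtype checks would reject the mixed inputs, which is exactly the CI failure this change works around.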