@@ -2568,22 +2568,22 @@ def trtllm_mxint4_block_scale_moe(
     Args:
         routing_logits (torch.Tensor): shape [seq_len, num_experts]
             Input tensor of routing logits. Supports float32, bfloat16.
-        hidden_states (torch.Tensor): shape [seq_len, hidden_size // 2 if nvfp4 else hidden_size]
-            Tensor of input hidden states. Supports bfloat16, mxfp8, and nvfp4 (packed into uint8)
+        hidden_states (torch.Tensor): shape [seq_len, hidden_size]
+            Tensor of input hidden states. Supports bfloat16.
         gemm1_weights (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 2]
-            Tensor of FC1 weights. Dtype must be uint8 (packed fp4)
-        gemm1_weights_scale (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // (32 if mxfp4 else 16)]
-            Scale tensor of FC1 weights. Dtype must be float8.
+            Tensor of FC1 weights. Dtype must be uint8 (packed mxint4)
+        gemm1_weights_scale (torch.Tensor): shape [num_experts, 2 * intermediate_size, hidden_size // 32]
+            Scale tensor of FC1 weights. Dtype must be bfloat16.
         gemm1_alpha (Optional[torch.Tensor]): shape [num_experts]
             Tensor of swiglu alpha. Dtype is float32.
         gemm1_beta (Optional[torch.Tensor]): shape [num_experts]
             Tensor of swiglu beta. Dtype is float32.
         gemm1_clamp_limit (Optional[torch.Tensor]): shape [num_experts]
             Tensor of swiglu clamp limit. Dtype is float32.
         gemm2_weights (torch.Tensor): shape [num_experts, hidden_size, intermediate_size]
-            Tensor of FC2 weights. Dtype must be uint8 (packed fp4)
-        gemm2_weights_scale (torch.Tensor): shape [num_experts, hidden_size, intermediate_size // (32 if mxfp4 else 16)]
-            Scale tensor of FC2 weights. Dtype must be float8.
+            Tensor of FC2 weights. Dtype must be uint8 (packed mxint4)
+        gemm2_weights_scale (torch.Tensor): shape [num_experts, hidden_size, intermediate_size // 32]
+            Scale tensor of FC2 weights. Dtype must be bfloat16.
         num_experts (int): Total number of experts
         top_k (int): Number of experts to route to per token
         n_group (Optional[int]): Number of expert groups (can be None for some routing methods)
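
For orientation, a minimal sketch of building inputs that match the shapes and dtypes this docstring now documents. All sizes are hypothetical, a CUDA device is assumed, and the call to `trtllm_mxint4_block_scale_moe` itself is elided because the rest of the signature falls outside this hunk:

```python
import torch

# Hypothetical sizes, chosen only for illustration.
seq_len, hidden_size, intermediate_size = 4, 1024, 2048
num_experts = 8

routing_logits = torch.randn(seq_len, num_experts, dtype=torch.bfloat16, device="cuda")
# After this change, hidden states are plain bfloat16 (no fp4/mxfp8 packing).
hidden_states = torch.randn(seq_len, hidden_size, dtype=torch.bfloat16, device="cuda")

# mxint4 packs two 4-bit values per uint8 byte, hence the trailing hidden_size // 2.
gemm1_weights = torch.randint(
    0, 256, (num_experts, 2 * intermediate_size, hidden_size // 2),
    dtype=torch.uint8, device="cuda",
)
# One bfloat16 scale per 32-element block.
gemm1_weights_scale = torch.ones(
    num_experts, 2 * intermediate_size, hidden_size // 32,
    dtype=torch.bfloat16, device="cuda",
)
gemm2_weights = torch.randint(
    0, 256, (num_experts, hidden_size, intermediate_size),  # shape as documented above
    dtype=torch.uint8, device="cuda",
)
gemm2_weights_scale = torch.ones(
    num_experts, hidden_size, intermediate_size // 32,
    dtype=torch.bfloat16, device="cuda",
)

# Optional swiglu parameters: one float32 scalar per expert (placeholder values).
gemm1_alpha = torch.ones(num_experts, dtype=torch.float32, device="cuda")
gemm1_beta = torch.zeros(num_experts, dtype=torch.float32, device="cuda")
gemm1_clamp_limit = torch.full((num_experts,), 7.0, dtype=torch.float32, device="cuda")
```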