
Commit 8e72f08

apply fix to fp8 pt

Signed-off-by: jiahanc <[email protected]>

Parent: ed83138

File tree

1 file changed: 2 additions, 1 deletion

csrc/trtllm_fused_moe_kernel_launcher.cu (2 additions, 1 deletion)

```diff
@@ -161,8 +161,9 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
       alloc_tensor({args.num_tokens, args.top_k}, routing_bias_dtype, routing_logits->device);
   Tensor expert_indexes =
       alloc_tensor({args.num_tokens, args.top_k}, dl_int32, routing_logits->device);
+  int64_t const size_of_expert_count_histogram = std::max(num_experts * 2, int64_t(256 * 2));
   Tensor expert_count_histogram = alloc_tensor(
-      {2 * 256},
+      {size_of_expert_count_histogram},
       dl_int32,  // 256 is the max number of threads per block and max number of experts
       routing_logits->device);
```