@@ -1897,20 +1897,9 @@ class TritonBF16MoEMethod(QuantMethodBase):
         This matches UnquantizedFusedMoEMethod.create_weights layout on CUDA.
         """

-    # Class-level flag: print the "Triton BF16 MoE activated" message only once.
-    _logged = False
-
     def __init__(self, quant_config=None):
         self.quant_config = quant_config
         self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
-        if not TritonBF16MoEMethod._logged:
-            import logging
-
-            logging.getLogger(__name__).warning(
-                "[TritonBF16MoEMethod] Triton BF16 MoE backend is ACTIVE "
-                "(FD_MOE_BACKEND=triton). Using fused_moe_kernel_paddle for BF16."
-            )
-            TritonBF16MoEMethod._logged = True

     def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None:
         pass
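
The lines removed above implemented a log-once idiom: a class-level flag guards a one-time startup warning so that constructing many MoE layers emits the message only once per process. A minimal standalone sketch of that pattern (BackendMethod and the message text are illustrative stand-ins, not the repo's names):

import logging

logger = logging.getLogger(__name__)


class BackendMethod:
    # Class-level flag: the activation warning fires at most once per
    # process, regardless of how many instances are created.
    _logged = False

    def __init__(self):
        if not BackendMethod._logged:
            logger.warning("Backend is ACTIVE.")
            BackendMethod._logged = True
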
@@ -2104,8 +2093,6 @@ def apply(
             BLOCK_SIZE_K=cfg2["BLOCK_SIZE_K"],
             GROUP_SIZE_M=cfg2["GROUP_SIZE_M"],
             MUL_ROUTED_WEIGHT=True,  # fuse router weight * output
-            # top_k=1: down_proj_input rows are indexed directly by sorted_token_ids,
-            # so a_ptrs = base + offs_token * stride_am (no // top_k needed).
             top_k=1,
             compute_type=tl.bfloat16,
         )
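
The top_k=1 argument relies on the indexing property the removed comment described: in the fused MoE kernel, each routing slot maps back to its source activation row via offs_token // top_k, so with top_k == 1 the sorted token ids index the down-projection input rows directly and no division is needed in the address math. A small NumPy sketch of that mapping (token counts are illustrative):

import numpy as np

num_tokens, top_k = 4, 2
# Each token occupies top_k routing slots; the kernel walks slots in
# expert-sorted order and folds each slot back to its source row.
sorted_token_ids = np.random.permutation(num_tokens * top_k)
a_rows = sorted_token_ids // top_k  # general case: fold slots to rows

# With top_k == 1 the fold is the identity, so rows are indexed
# directly by sorted_token_ids (no // top_k needed).
assert np.array_equal(sorted_token_ids // 1, sorted_token_ids)
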
@@ -2122,4 +2109,4 @@ def apply_ep_decode(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts
         raise NotImplementedError("TritonBF16MoEMethod does not support EP decode yet.")

     def apply_tp(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
-        return self.apply(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)
+        return self.apply(layer, x, gate, topk_ids_hookfunc, shared_experts)
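
The one-line change in apply_tp matters because the old forwarding call dropped shared_experts, so apply always received its default of None even when a caller supplied shared experts. A toy reproduction of that bug class (the functions below are stand-ins, not the repo's real signatures):

def apply_(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Stand-in for the real apply: just report what it received.
    return shared_experts


def apply_tp_old(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Old behavior: shared_experts silently dropped.
    return apply_(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)


def apply_tp_new(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Fixed behavior: all arguments forwarded.
    return apply_(layer, x, gate, topk_ids_hookfunc, shared_experts)


assert apply_tp_old(None, None, None, shared_experts="se") is None
assert apply_tp_new(None, None, None, shared_experts="se") == "se"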