@@ -1897,20 +1897,9 @@ class TritonBF16MoEMethod(QuantMethodBase):
         This matches UnquantizedFusedMoEMethod.create_weights layout on CUDA.
         """

-    # Class-level flag: print the "Triton BF16 MoE activated" message only once.
-    _logged = False
-
     def __init__(self, quant_config=None):
         self.quant_config = quant_config
         self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
-        if not TritonBF16MoEMethod._logged:
-            import logging
-
-            logging.getLogger(__name__).warning(
-                "[TritonBF16MoEMethod] Triton BF16 MoE backend is ACTIVE "
-                "(FD_MOE_BACKEND=triton). Using fused_moe_kernel_paddle for BF16."
-            )
-            TritonBF16MoEMethod._logged = True

     def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None:
         pass
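
The lines removed above implemented a log-once idiom: a class-level flag guards a one-time startup warning so that constructing many MoE layers emits the message only once per process. A minimal standalone sketch of that pattern (BackendMethod and the message text are illustrative stand-ins, not the repo's names):

import logging

logger = logging.getLogger(__name__)


class BackendMethod:
    # Class-level flag: the activation warning fires at most once per
    # process, regardless of how many instances are created.
    _logged = False

    def __init__(self):
        if not BackendMethod._logged:
            logger.warning("Backend is ACTIVE.")
            BackendMethod._logged = True
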
@@ -2104,8 +2093,6 @@ def apply(
             BLOCK_SIZE_K=cfg2["BLOCK_SIZE_K"],
             GROUP_SIZE_M=cfg2["GROUP_SIZE_M"],
             MUL_ROUTED_WEIGHT=True,  # fuse router weight * output
-            # top_k=1: down_proj_input rows are indexed directly by sorted_token_ids,
-            # so a_ptrs = base + offs_token * stride_am (no // top_k needed).
             top_k=1,
             compute_type=tl.bfloat16,
         )
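
The top_k=1 argument relies on the indexing property the removed comment described: in the fused MoE kernel, each routing slot maps back to its source activation row via offs_token // top_k, so with top_k == 1 the sorted token ids index the down-projection input rows directly and no division is needed in the address math. A small NumPy sketch of that mapping (token counts are illustrative):

import numpy as np

num_tokens, top_k = 4, 2
# Each token occupies top_k routing slots; the kernel walks slots in
# expert-sorted order and folds each slot back to its source row.
sorted_token_ids = np.random.permutation(num_tokens * top_k)
a_rows = sorted_token_ids // top_k  # general case: fold slots to rows

# With top_k == 1 the fold is the identity, so rows are indexed
# directly by sorted_token_ids (no // top_k needed).
assert np.array_equal(sorted_token_ids // 1, sorted_token_ids)
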
@@ -2122,4 +2109,4 @@ def apply_ep_decode(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts
         raise NotImplementedError("TritonBF16MoEMethod does not support EP decode yet.")

     def apply_tp(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
-        return self.apply(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)
+        return self.apply(layer, x, gate, topk_ids_hookfunc, shared_experts)
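
The one-line change in apply_tp matters because the old forwarding call dropped shared_experts, so apply always received its default of None even when a caller supplied shared experts. A toy reproduction of that bug class (the functions below are stand-ins, not the repo's real signatures):

def apply_(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Stand-in for the real apply: just report what it received.
    return shared_experts


def apply_tp_old(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Old behavior: shared_experts silently dropped.
    return apply_(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)


def apply_tp_new(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Fixed behavior: all arguments forwarded.
    return apply_(layer, x, gate, topk_ids_hookfunc, shared_experts)


assert apply_tp_old(None, None, None, shared_experts="se") is None
assert apply_tp_new(None, None, None, shared_experts="se") == "se"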