Commit 679bc3e

[Feature] TritonBF16MoEMethod bug fix & test
1 parent 428291e commit 679bc3e

3 files changed · 486 additions & 17 deletions

fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py

Lines changed: 1 addition & 14 deletions
@@ -1897,20 +1897,9 @@ class TritonBF16MoEMethod(QuantMethodBase):
     This matches UnquantizedFusedMoEMethod.create_weights layout on CUDA.
     """
 
-    # Class-level flag: print the "Triton BF16 MoE activated" message only once.
-    _logged = False
-
     def __init__(self, quant_config=None):
         self.quant_config = quant_config
         self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
-        if not TritonBF16MoEMethod._logged:
-            import logging
-
-            logging.getLogger(__name__).warning(
-                "[TritonBF16MoEMethod] Triton BF16 MoE backend is ACTIVE "
-                "(FD_MOE_BACKEND=triton). Using fused_moe_kernel_paddle for BF16."
-            )
-            TritonBF16MoEMethod._logged = True
 
     def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False) -> None:
         pass
@@ -2104,8 +2093,6 @@ def apply(
             BLOCK_SIZE_K=cfg2["BLOCK_SIZE_K"],
             GROUP_SIZE_M=cfg2["GROUP_SIZE_M"],
             MUL_ROUTED_WEIGHT=True,  # fuse router weight * output
-            # top_k=1: down_proj_input rows are indexed directly by sorted_token_ids,
-            # so a_ptrs = base + offs_token * stride_am (no // top_k needed).
             top_k=1,
             compute_type=tl.bfloat16,
         )
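
The two deleted comment lines above documented why top_k=1 is the right setting for the down-projection matmul: down_proj_input holds one row per routed (token, expert) slot, so sorted_token_ids index its rows directly and the kernel's usual offs_token // top_k remapping becomes a no-op. A minimal Python sketch of that indexing (illustrative values only, not the Triton kernel):

# Illustration of the removed comment's point, not kernel code.
# First (up_gate) matmul: A has one row per token, so a routed slot maps
# back to its token row via offs_token // top_k.
# Second (down_proj) matmul: A already has one row per routed slot, so with
# top_k=1 the division is a no-op and a_ptrs = base + offs_token * stride_am.
top_k = 2
sorted_token_ids = [0, 1, 4, 5, 2, 3]  # hypothetical slots grouped by expert

rows_up_gate = [t // top_k for t in sorted_token_ids]  # shared token rows
rows_down = [t // 1 for t in sorted_token_ids]         # top_k=1: direct rows
assert rows_down == sorted_token_ids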
@@ -2122,4 +2109,4 @@ def apply_ep_decode(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts
         raise NotImplementedError("TritonBF16MoEMethod does not support EP decode yet.")
 
     def apply_tp(self, layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
-        return self.apply(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)
+        return self.apply(layer, x, gate, topk_ids_hookfunc, shared_experts)
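
The apply_tp hunk is the actual bug fix: the old keyword-only forwarding never passed shared_experts along, so apply always received its default of None. A self-contained sketch with hypothetical stand-ins for the two methods:

# Hypothetical stand-ins mirroring the signatures in the hunk above.
def apply(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    return shared_experts  # expose what apply actually received

def apply_tp_old(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Pre-fix: shared_experts is silently dropped.
    return apply(layer, x, gate, topk_ids_hookfunc=topk_ids_hookfunc)

def apply_tp_new(layer, x, gate, topk_ids_hookfunc=None, shared_experts=None):
    # Post-fix: both optional arguments are forwarded positionally.
    return apply(layer, x, gate, topk_ids_hookfunc, shared_experts)

assert apply_tp_old(None, None, None, shared_experts="se") is None  # lost
assert apply_tp_new(None, None, None, shared_experts="se") == "se"  # kept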

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 0 additions & 3 deletions
@@ -56,11 +56,8 @@ def get_moe_method(layer=None):
     if current_platform.is_cuda():
         moe_backend = envs.FD_MOE_BACKEND.lower()
         if moe_backend == "triton":
-            from paddleformers.utils.log import logger
-
             from .fused_moe_triton_backend import TritonBF16MoEMethod
 
-            logger.info("[get_moe_method] FD_MOE_BACKEND=triton -> TritonBF16MoEMethod")
             return TritonBF16MoEMethod(None)
         from .fused_moe_cutlass_backend import CutlassMoEMethod
 
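
With the logging removed, get_moe_method selects the backend purely from FD_MOE_BACKEND. A sketch of that dispatch, assuming (as the hunk suggests) that envs.FD_MOE_BACKEND mirrors the environment variable; the helper name is illustrative:

import os

def pick_moe_backend(is_cuda: bool) -> str:
    # Mirrors the hunk above: only CUDA + FD_MOE_BACKEND=triton takes the
    # Triton BF16 path; everything else falls through to Cutlass.
    backend = os.environ.get("FD_MOE_BACKEND", "").lower()
    if is_cuda and backend == "triton":
        return "TritonBF16MoEMethod"
    return "CutlassMoEMethod"

os.environ["FD_MOE_BACKEND"] = "triton"
assert pick_moe_backend(is_cuda=True) == "TritonBF16MoEMethod"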
