2 changes: 1 addition & 1 deletion python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -319,7 +319,7 @@ def forward_cpu(
layer.w2_weight,
topk_weights.to(
torch.float
-                ), # TODO: the topk_weights of llama4 is computed via Llama4MoE:custom_routing_function and is bfloat16 while the kernel requires it to be float32
+                ), # TODO: [Note] topk_weights should be float32 for now. The topk_weights of llama4 are computed via Llama4MoE:custom_routing_function and are bfloat16, while the kernel requires float32.
topk_ids,
False, # inplace # See [Note] inplace should be False in fused_experts.
False, # use_int8_w8a8
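For context, here is a minimal, self-contained sketch of the dtype mismatch this hunk works around. `custom_routing_function` below is a hypothetical stand-in for Llama4MoE's router, not the actual implementation: the point is only that the routing weights come back in the activation dtype (bfloat16 for llama4), while the fused Triton kernel requires float32.

```python
import torch

# Hypothetical stand-in for Llama4MoE:custom_routing_function. The routing
# weights inherit the activation dtype (bfloat16 for llama4).
def custom_routing_function(hidden_states, num_experts, top_k):
    router_logits = torch.randn(
        hidden_states.shape[0], num_experts, dtype=hidden_states.dtype
    )
    topk_weights, topk_ids = torch.topk(torch.sigmoid(router_logits), top_k, dim=-1)
    return topk_weights, topk_ids

x = torch.randn(4, 16, dtype=torch.bfloat16)
topk_weights, topk_ids = custom_routing_function(x, num_experts=8, top_k=2)
assert topk_weights.dtype == torch.bfloat16  # the dtype the kernel rejects

# The change in this PR: cast at the call site before invoking fused_experts.
topk_weights = topk_weights.to(torch.float)
assert topk_weights.dtype == torch.float32
```

Casting at the call site leaves the router's output dtype untouched for any other consumers.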
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/quantization/fp8.py
@@ -1002,7 +1002,9 @@ def apply(
x,
layer.w13_weight,
layer.w2_weight,
-            topk_weights,
+            topk_weights.to(
+                torch.float
+            ), # See [Note] topk_weights should be float32 for now.
topk_ids,
False, # inplace See [Note] inplace should be False in fused_experts.
False, # use_int8_w8a8
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -290,7 +290,9 @@ def apply(
x,
layer.w13_weight,
layer.w2_weight,
-            topk_weights,
+            topk_weights.to(
+                torch.float
+            ), # See [Note] topk_weights should be float32 for now.
topk_ids,
False, # inplace See [Note] inplace should be False in fused_experts.
True, # use_int8_w8a8
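The same cast now appears at three call sites (the Triton MoE layer, fp8, and w8a8_int8). A possible follow-up, sketched here with a hypothetical helper name (`ensure_float32_topk` does not exist in sglang), would be to normalize the dtype in one shared place:

```python
import torch

def ensure_float32_topk(topk_weights: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: the fused_experts kernels require float32 routing
    # weights, so cast only when the router produced another dtype
    # (e.g. bfloat16 from Llama4MoE:custom_routing_function).
    if topk_weights.dtype != torch.float32:
        return topk_weights.to(torch.float32)
    return topk_weights
```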