Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions python/sglang/srt/layers/moe/fused_moe_triton/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce,
)
from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
from sglang.srt.layers.moe.topk import TopKOutput
from sglang.srt.layers.quantization.base_config import (
QuantizationConfig,
Expand Down Expand Up @@ -83,6 +84,7 @@ def __init__(
if params_dtype is None:
params_dtype = torch.get_default_dtype()

self.layer_id = layer_id
self.top_k = top_k
self.hidden_size = hidden_size
self.tp_size = (
Expand Down Expand Up @@ -371,6 +373,29 @@ def weight_loader(
weight_name: str,
shard_id: str,
expert_id: int,
) -> None:
        # TODO: temporary duplication with EPMoE.weight_loader; to be removed in an upcoming refactor
physical_expert_ids = (
get_global_expert_location_metadata().logical_to_all_physical(
self.layer_id, expert_id
)
)
for physical_expert_id in physical_expert_ids:
self._weight_loader_physical(
param=param,
loaded_weight=loaded_weight,
weight_name=weight_name,
shard_id=shard_id,
expert_id=physical_expert_id,
)

def _weight_loader_physical(
self,
param: torch.nn.Parameter,
loaded_weight: torch.Tensor,
weight_name: str,
shard_id: str,
expert_id: int,
) -> None:
expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
if expert_id == -1:
Comment on lines 400 to 401
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

There are a couple of issues here:

  1. Potential Bug with Weight Tensor Sizing: The weight tensors (self.w13_weight, self.w2_weight) are created with a size based on the global number of experts. However, when EPLB is active, _map_global_expert_id_to_local_expert_id returns a local expert index. This local index is then used to access the globally-sized weight tensors. This will result in only the first num_experts_per_partition slots of the weight tensors being populated, leaving the rest uninitialized. To fix this, FusedMoE's weight tensors should likely be sized using self.num_experts_per_partition when expert parallelism is active, similar to how EPMoE is implemented. This would require changes in UnquantizedFusedMoEMethod.create_weights and FusedMoE.__init__.

  2. Variable Shadowing: The parameter expert_id is reassigned on line 400. It initially represents a physical expert ID and is then updated to be a local expert ID. This makes the code harder to follow. It would be clearer to use a new variable name, e.g., local_expert_id.

Suggested change
expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
if expert_id == -1:
local_expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
if local_expert_id == -1:

Expand Down
Loading