
Commit 597672d

Load w13/w2_input_scale for all experts
Signed-off-by: Shu Wang <[email protected]>
1 parent 2935092 commit 597672d

File tree: 2 files changed (+23, −10 lines)


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 11 additions & 5 deletions
@@ -1226,6 +1226,7 @@ def __init__(
             "intermediate_size_per_partition": self.intermediate_size_per_partition,
             "params_dtype": params_dtype,
             "weight_loader": self.weight_loader,
+            "global_num_experts": self.global_num_experts,
         }
         # need full intermediate size pre-sharding for WNA16 act order
         if self.quant_method.__class__.__name__ in (
@@ -1546,13 +1547,16 @@ def weight_loader(
             param.data[:, :dim1, :dim2].copy_(loaded_weight)
             return True if return_success else None

-        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
-        if expert_id == -1:
+        quant_method_name = self.quant_method.__class__.__name__
+        global_expert_id = expert_id
+        expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id)
+        is_modeloptnvfp4 = quant_method_name == "ModelOptNvFp4FusedMoE"
+        is_input_scale = "input_scale" in weight_name
+        if expert_id == -1 and not (is_modeloptnvfp4 and is_input_scale):
             # Failed to load this param since it's not local to this rank
             return False if return_success else None
         # Hereafter, `expert_id` is local physical id

-        quant_method_name = self.quant_method.__class__.__name__
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
         # against known CompressionFormat enum values that have this quality
@@ -1621,7 +1625,7 @@ def weight_loader(
         expert_data = param.data if full_load else param.data[expert_id]

         # Case input scale: input_scale loading is only supported for fp8
-        if "input_scale" in weight_name:
+        if is_input_scale:
             # this is needed for compressed-tensors only
             loaded_weight = loaded_weight.to(param.data.device)

@@ -1637,7 +1641,9 @@ def weight_loader(
                 )

             self._load_single_value(
-                param=param, loaded_weight=loaded_weight, expert_id=expert_id
+                param=param,
+                loaded_weight=loaded_weight,
+                expert_id=global_expert_id if is_modeloptnvfp4 else expert_id,
             )
             return True if return_success else None
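For context on the weight_loader change above: under expert parallelism each rank owns only a slice of the global experts, so a global expert id that is not local maps to -1 and the parameter is normally skipped. The new condition keeps ModelOpt NVFP4 input scales from every global expert so that a single, rank-consistent maximum can be taken later. Below is a minimal sketch of that logic, assuming a contiguous expert partition; the helper names and layout are illustrative, not vLLM's exact implementation.

def map_global_to_local(global_expert_id: int, ep_rank: int, num_local: int) -> int:
    # Assume rank r owns the contiguous range [r * num_local, (r + 1) * num_local).
    start = ep_rank * num_local
    if start <= global_expert_id < start + num_local:
        return global_expert_id - start
    return -1  # expert is not local to this rank


def skip_loading(weight_name: str, quant_method_name: str, local_expert_id: int) -> bool:
    # Mirrors the new condition: skip non-local params, except ModelOpt NVFP4
    # input scales, which are loaded for every global expert.
    is_modeloptnvfp4 = quant_method_name == "ModelOptNvFp4FusedMoE"
    is_input_scale = "input_scale" in weight_name
    return local_expert_id == -1 and not (is_modeloptnvfp4 and is_input_scale)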

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 12 additions & 5 deletions
@@ -1224,6 +1224,7 @@ def create_weights(
         weight_dtype = torch.uint8
         weight_scale_dtype = torch.float8_e4m3fn
         weight_loader = extra_weight_attrs.get("weight_loader")
+        global_num_experts = extra_weight_attrs.get("global_num_experts")
         # GEMM 1
         w13_weight = ModelWeightParameter(
             data=torch.empty(
@@ -1303,15 +1304,16 @@ def create_weights(
         )

         w13_input_scale = PerTensorScaleParameter(
-            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            data=torch.empty(global_num_experts, 2, dtype=torch.float32),
             weight_loader=weight_loader,
         )
         layer.register_parameter("w13_input_scale", w13_input_scale)

         w2_input_scale = PerTensorScaleParameter(
-            data=torch.empty(num_experts, dtype=torch.float32),
+            data=torch.empty(global_num_experts, dtype=torch.float32),
             weight_loader=weight_loader,
         )
+
         layer.register_parameter("w2_input_scale", w2_input_scale)

     def prepare_static_weights_for_trtllm_fp4_moe(
@@ -1464,7 +1466,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)

         # Common processing for input scales and alphas
-        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        w13_input_scale = (
+            layer.w13_input_scale.max().to(torch.float32).expand(layer.num_experts)
+        )
         layer.g1_alphas = Parameter(
             (w13_input_scale * w13_weight_scale_2).to(torch.float32),
             requires_grad=False,
@@ -1476,14 +1480,17 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )

         # GEMM 2 processing
+        w2_input_scale = (
+            layer.w2_input_scale.max().to(torch.float32).expand(layer.num_experts)
+        )
         layer.g2_alphas = Parameter(
-            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
             requires_grad=False,
         )

         # This is for quantization, so we need to invert it.
         layer.w2_input_scale_quant = Parameter(
-            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+            (1 / w2_input_scale).to(torch.float32), requires_grad=False
         )

         # TensorRT-LLM specific processing
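The create_weights and process_weights_after_loading changes fit together: the input-scale buffers are now sized by global_num_experts so every expert's scale is present on every rank, and the reduction takes one max over all of them instead of a per-local-expert max, then replicates it across the local experts. A small illustration of the reduction follows; the sizes are made up for the example and are not taken from the patch.

import torch

global_num_experts, local_num_experts = 8, 2
# One scale per (expert, w1/w3 half) for GEMM 1, one per expert for GEMM 2.
w13_input_scale = torch.rand(global_num_experts, 2, dtype=torch.float32)
w2_input_scale = torch.rand(global_num_experts, dtype=torch.float32)

# Single max over all global experts, replicated for each local expert, so every
# expert-parallel rank ends up with the same quantization scale.
w13 = w13_input_scale.max().to(torch.float32).expand(local_num_experts)
w2 = w2_input_scale.max().to(torch.float32).expand(local_num_experts)

assert w13.shape == (local_num_experts,)
assert torch.equal(w13, w13_input_scale.max().repeat(local_num_experts))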
