Commit 1a425d7

PerryZhang01 authored and zgplvyou committed
[EPLB][ROCm]: support EPLB for ROCm backend (vllm-project#27731)

Signed-off-by: Perry Zhang <[email protected]>
Co-authored-by: Perry Zhang <[email protected]>
1 parent 07d7828 commit 1a425d7

File tree

4 files changed: +23 −6 lines

vllm/config/parallel.py

Lines changed: 2 additions & 2 deletions

@@ -278,10 +278,10 @@ def _validate_parallel_config(self) -> Self:
             )

         if self.enable_eplb:
-            if not current_platform.is_cuda():
+            if not current_platform.is_cuda_alike():
                 raise ValueError(
                     "Expert parallelism load balancing is only supported on "
-                    "CUDA devices now."
+                    "CUDA devices or ROCm devices now."
                 )
             if not self.enable_expert_parallel:
                 raise ValueError("enable_expert_parallel must be True to use EPLB.")

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 5 additions & 1 deletion

@@ -1218,7 +1218,11 @@ def load_weights(

     def get_expert_weights(self) -> Iterable[torch.Tensor]:
         weights = list(self.named_parameters())
-        assert all(weight.is_contiguous() for _, weight in weights)
+        assert all(
+            weight.is_contiguous()
+            for name, weight in weights
+            if not name.startswith("_shared_experts.")
+        )

         # Filter out the non-expert weights.
         # `e_score_correction_bias` is a bias for each logical expert,
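
Note: the loosened assertion skips any parameter whose name starts with `_shared_experts.`, since fused shared-expert weights are not rearranged by EPLB and so need not be contiguous. A self-contained sketch of the filter, where the parameter names are illustrative rather than taken from a real model:

import torch

# Illustrative parameter dict; the names mimic the convention of prefixing
# fused shared-expert weights with "_shared_experts.".
params = {
    "w13_weight": torch.randn(8, 16, 32),
    "w2_weight": torch.randn(8, 32, 16),
    # A non-contiguous view, as a fused shared-expert weight might be:
    "_shared_experts.gate_proj.weight": torch.randn(32, 16).t(),
}

# Only expert weights must be contiguous for EPLB's weight rearrangement;
# shared-expert tensors are excluded from the check, so this passes.
assert all(
    weight.is_contiguous()
    for name, weight in params.items()
    if not name.startswith("_shared_experts.")
)
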
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 13 additions & 3 deletions

@@ -1019,9 +1019,10 @@ def apply(
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `CompressedTensorsW8A8Fp8MoEMethod` yet."
-            )
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+            assert isinstance(layer, FusedMoE)

         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
@@ -1037,6 +1038,11 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
             indices_type=self.topk_indices_dtype,
             num_fused_shared_experts=layer.num_fused_shared_experts,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
         )

         per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
@@ -1145,6 +1151,10 @@ def apply(
             quant_config=self.moe_quant_config,
         )

+    @property
+    def supports_eplb(self) -> bool:
+        return True
+

 class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     def __init__(
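
Note: replacing the `NotImplementedError` with argument asserts, together with the new `supports_eplb` property, lets callers detect EPLB support up front instead of failing inside `apply`. A hedged sketch of how such a capability flag could be consumed, with `MoEMethodBase`, `W8A8Fp8Method`, and `check_eplb` as hypothetical names for illustration:

class MoEMethodBase:
    """Base class defaulting to no EPLB support."""

    @property
    def supports_eplb(self) -> bool:
        return False


class W8A8Fp8Method(MoEMethodBase):
    """Stand-in for CompressedTensorsW8A8Fp8MoEMethod after this commit."""

    @property
    def supports_eplb(self) -> bool:
        return True


def check_eplb(method: MoEMethodBase, enable_eplb: bool) -> None:
    # Fail fast at configuration time rather than mid-forward-pass.
    if enable_eplb and not method.supports_eplb:
        raise NotImplementedError(
            f"EPLB not supported for {type(method).__name__} yet."
        )


check_eplb(W8A8Fp8Method(), enable_eplb=True)  # passes silently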
