Commit 452eeb9

Elastic EP phase: stateless group + CUDA graph support
support request serving during scaling up/down

Squashed fixup commits: misc fixes; minor fix (x2); scaling test: 2->4->2; tiny fix; rebase fix (x5); small fix (x2)

Signed-off-by: Yongji Wu <[email protected]>
1 parent c17610e commit 452eeb9

34 files changed: +2785 -682 lines changed

.buildkite/test-pipeline.yaml

Lines changed: 13 additions & 0 deletions
@@ -1147,6 +1147,19 @@ steps:
   - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
   - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

+- label: Elastic EP Scaling Test
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/compilation/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_elastic_ep.py
+
 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
tests/distributed/test_elastic_ep.py

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import time
+
+import requests
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ..utils import RemoteOpenAIServer, _test_completion, multi_gpu_test
+
+MODEL_NAME = "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
+
+
+def _send_scale_command(server: RemoteOpenAIServer, new_dp_size: int) -> bool:
+    url = server.url_for("scale_elastic_ep")
+    payload = {"new_data_parallel_size": new_dp_size}
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=300)
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling():
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--disable-log-requests",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.9",
+        "--max-model-len",
+        "16384",
+        "--no-enable-prefix-caching",
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "pplx",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "128",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--data-parallel-size-local",
+        "2",
+        "--data-parallel-start-rank",
+        "0",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    tokenizer = get_tokenizer(MODEL_NAME, trust_remote_code=True)
+    prompt = "Hello, my name is"
+    token_ids = tokenizer(prompt).input_ids
+
+    # timeout is 20 minutes
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        client = server.get_client()
+        _test_completion(client, MODEL_NAME, prompt, token_ids)
+
+        # Scale up from 2->4
+        assert _send_scale_command(server, 4)
+        time.sleep(10)
+        _test_completion(client, MODEL_NAME, prompt, token_ids)
+
+        # Scale down from 4->2
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        _test_completion(client, MODEL_NAME, prompt, token_ids)
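For manual experiments outside pytest, the same endpoint the test exercises can be hit directly. A minimal sketch, assuming an already running vLLM OpenAI-compatible server started with --enable-elastic-ep; the host and port below are placeholders:

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder; point at your running server


def scale_elastic_ep(new_dp_size: int, timeout: float = 300.0) -> None:
    # Endpoint name and payload field mirror _send_scale_command() above.
    resp = requests.post(
        f"{BASE_URL}/scale_elastic_ep",
        json={"new_data_parallel_size": new_dp_size},
        timeout=timeout,
    )
    resp.raise_for_status()


if __name__ == "__main__":
    scale_elastic_ep(4)  # scale up to 4 data-parallel ranks
    scale_elastic_ep(2)  # scale back down to 2
```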

tools/ep_kernels/elastic_ep/install_eep_libraries.sh

Lines changed: 8 additions & 1 deletion
@@ -52,6 +52,12 @@ if [ -z "$CUDA_HOME" ]; then
     exit 1
 fi

+# assume TORCH_CUDA_ARCH_LIST is set correctly
+if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
+    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+    exit 1
+fi
+
 # disable all features except IBGDA
 export NVSHMEM_IBGDA_SUPPORT=1

@@ -82,5 +88,6 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
+git checkout 12cecfd
+PIP_NO_BUILD_ISOLATION=0 pip install . --no-deps -v

vllm/compilation/wrapper.py

Lines changed: 31 additions & 0 deletions
@@ -258,3 +258,34 @@ def _dispatch_to_compiled_code(self):
             yield
         finally:
             self.__class__.forward.__code__ = original
+
+
+def reset_compile_wrapper(model: torch.nn.Module) -> None:
+    """
+    Clean up compiled model and captured CUDA graphs for elastic EP.
+    """
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper) and hasattr(
+        model, "model"
+    ):
+        model = model.model
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper):
+        return
+    # model.do_not_compile is set by the @support_torch_compile decorator
+    if hasattr(model, "do_not_compile") and model.do_not_compile:
+        return
+    from vllm.compilation.counter import compilation_counter
+
+    # reset the compilation counter
+    compilation_counter.num_models_seen = 0
+    compilation_counter.num_graphs_seen = 0
+    compilation_counter.num_piecewise_graphs_seen = 0
+    compilation_counter.num_piecewise_capturable_graphs_seen = 0
+    compilation_counter.num_backend_compilations = 0
+    compilation_counter.num_gpu_runner_capture_triggers = 0
+    compilation_counter.num_cudagraph_captured = 0
+    compilation_counter.num_inductor_compiles = 0
+    compilation_counter.num_eager_compiles = 0
+    compilation_counter.num_cache_entries_updated = 0
+    compilation_counter.num_compiled_artifacts_saved = 0
+    compilation_counter.stock_torch_compile_count = 0
+    TorchCompileWithNoGuardsWrapper.__init__(model)
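A rough sketch of the intended call pattern, not the worker code from this commit: after the EP groups have been rebuilt for the new world size, the compiled wrapper is reset so the next warm-up pass recompiles and recaptures CUDA graphs. The recapture_fn callback below is hypothetical.

```python
import torch

from vllm.compilation.wrapper import reset_compile_wrapper


def rebuild_compiled_state(model: torch.nn.Module, recapture_fn) -> None:
    # Drop compilation counters and re-init the TorchCompileWithNoGuardsWrapper
    # so compiled artifacts tied to the old parallel layout are not reused.
    reset_compile_wrapper(model)
    # Hypothetical hook: the worker's normal warm-up / CUDA graph capture path.
    recapture_fn(model)
```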

vllm/config/parallel.py

Lines changed: 78 additions & 2 deletions
@@ -166,6 +166,9 @@ class ParallelConfig:
     disable_custom_all_reduce: bool = False
     """Disable the custom all-reduce kernel and fall back to NCCL."""

+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""

@@ -239,6 +242,29 @@ class is dynamically inherited by the worker class. This is used to inject
     Set to be private as it's not intended to be configured by users.
     """

+    _stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    It is a list of list[int], with each inner list containing a set of 3 ports
+    to be used for setting up the stateless CPU/device/TCPStore groups
+    in StatelessGroupCoordinator. The number of inner lists is equal to
+    the number of DP groups,
+    i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
+    and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
+    """
+
+    _stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size.
+    """
+
+    _stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for the stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_world_group_port_list) == 1.
+    """
+
     decode_context_parallel_size: int = 1
     """Number of decode context parallel groups, because the world size does
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size

@@ -358,7 +384,16 @@ def get_next_dp_init_port(self) -> int:

         return answer

-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop()
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop()
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop()
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
         # NOTE: In high-concurrency scenarios multiple processes
         # can pick the same (currently free) port through a race
         # condition when calling `get_open_port()`. When the first

@@ -382,7 +417,8 @@ def stateless_init_dp_group(self) -> ProcessGroup:
                 self.get_next_dp_init_port(),
                 self.data_parallel_rank,
                 self.data_parallel_size,
-                backend=current_platform.dist_backend,
+                backend="gloo",
+                return_store=return_store,
             )
         except DistNetworkError as e:
             # We only want to retry when the root cause is EADDRINUSE.

@@ -561,6 +597,46 @@ def __post_init__(self) -> None:
             logger.info("Using external launcher for distributed inference.")
             self.world_size *= self.data_parallel_size

+        # Initialize stateless group ports for elastic EP
+        if self.enable_elastic_ep:
+            if not self.enable_eplb:
+                raise ValueError("Elastic EP is only supported with enable_eplb=True.")
+            num_world_groups = 1
+            dp_size = self.data_parallel_size
+            ep_size = self.data_parallel_size * self.world_size_across_dp
+            num_dp_groups = max(1, self.world_size_across_dp // dp_size)
+            num_ep_groups = max(1, self.world_size_across_dp // ep_size)
+
+            # NOTE(yongji):
+            # we need 3 ports for each comm group in `StatelessGroupCoordinator`:
+            # one for the stateless CPU group, one for the stateless device group,
+            # one for the stateless TCPStore group.
+            total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3
+            if not self._stateless_world_group_port_list:
+                all_ports = get_open_ports_list(total_ports_needed + 5)
+                # NOTE(yongji): allocate 5 ports for _data_parallel_master_port_list,
+                # as in the case when elastic EP is not enabled
+                # (the regular DP code path below this if: `get_open_ports_list(5)`).
+                # We must set _data_parallel_master_port_list here instead of
+                # letting the regular DP code path set it, since
+                # we should call get_open_ports_list() only once
+                # to ensure the allocated ports are distinct.
+                self._data_parallel_master_port_list = all_ports[-5:]
+                all_ports = all_ports[:-5]
+                self._stateless_world_group_port_list = [
+                    all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+                ]
+                start_idx = num_world_groups * 3
+                self._stateless_dp_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+                ]
+                start_idx += num_dp_groups * 3
+                self._stateless_ep_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+                ]
+
         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
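To make the port bookkeeping above concrete, here is a small standalone sketch of the slicing. It is illustrative only: the group counts and port numbers are made up, and get_open_ports_list is replaced by a stub.

```python
# Illustration of the port slicing above, not vLLM code.
# Assumed counts for the example: 1 world group, 2 DP groups, 1 EP group.

def fake_get_open_ports_list(n: int) -> list[int]:
    # Stand-in for the real helper, which asks the OS for free ports.
    return list(range(20000, 20000 + n))


num_world_groups, num_dp_groups, num_ep_groups = 1, 2, 1
total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3

all_ports = fake_get_open_ports_list(total_ports_needed + 5)
data_parallel_master_ports = all_ports[-5:]  # the extra 5 ports
all_ports = all_ports[:-5]

world_ports = [all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)]
start = num_world_groups * 3
dp_ports = [all_ports[i : i + 3] for i in range(start, start + num_dp_groups * 3, 3)]
start += num_dp_groups * 3
ep_ports = [all_ports[i : i + 3] for i in range(start, start + num_ep_groups * 3, 3)]

print(world_ports)  # [[20000, 20001, 20002]]
print(dp_ports)     # [[20003, 20004, 20005], [20006, 20007, 20008]]
print(ep_ports)     # [[20009, 20010, 20011]]
```

Each inner list of three ports then backs one coordinator's stateless CPU group, device group, and TCPStore, as described in the field docstrings above.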

vllm/distributed/device_communicators/all2all.py

Lines changed: 26 additions & 22 deletions
@@ -32,8 +32,8 @@ class NaiveAll2AllManager(All2AllManagerBase):
     debugging.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def naive_multicast(
         self,

@@ -105,8 +105,8 @@ class AgRsAll2AllManager(All2AllManagerBase):
     all-gather (dispatch) and reduce-scatter (combine).
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def dispatch(
         self,

@@ -155,13 +155,16 @@ class PPLXAll2AllManager(All2AllManagerBase):
     All2All communication based on PPLX kernels.
     """

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_pplx(), (
             "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install pplx_kernels."
         )
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
+        self.nvshmem_initialized = False
+        self.handle_cache = Cache()

+    def get_handle(self, kwargs):
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly

@@ -181,17 +184,18 @@ def __init__(self, cpu_group):
                 if self.rank == 0
                 else nvshmem_alloc_empty_unique_id()
             )
-            dist.broadcast(
-                uid,
-                src=dist.get_process_group_ranks(self.cpu_group)[0],
-                group=self.cpu_group,
-            )
+            if self.tcp_store_group is not None:
+                uid = self.tcp_store_group.broadcast_obj(uid, src=0)
+            else:
+                dist.broadcast(
+                    uid,
+                    src=dist.get_process_group_ranks(self.cpu_group)[0],
+                    group=self.cpu_group,
+                )
             logger.debug("PPLX NVSHMEM UID = %s", uid)
             nvshmem_init(uid, self.rank, self.world_size)
+            self.nvshmem_initialized = True

-        self.handle_cache = Cache()
-
-    def get_handle(self, kwargs):
         import pplx_kernels as pplx  # type: ignore[import-not-found]

         return self.handle_cache.get_or_create(

@@ -231,12 +235,12 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_deep_ep(), (
             "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install DeepEP kernels."
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         self.handle_cache = Cache()

         # This is the DeepEP default. Stick to it till we can establish

@@ -268,8 +272,8 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.

@@ -325,8 +329,8 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP Low-Latency kernels.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def _make_all2all_kwargs(
         self,

@@ -396,11 +400,11 @@ class FlashInferAllToAllManager(All2AllManagerBase):
     rank: int
     world_size: int

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_flashinfer_all2all(), (
             "flashinfer all2all module not found. Please install/check flashinfer"
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         logger.debug(
             "Initialize for flashinfer All2All rank=%d, world size=%d",
             self.rank,
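The tcp_store_group.broadcast_obj(uid, src=0) call above lets the NVSHMEM unique ID be shared without going through a torch.distributed process group. As intuition for that pattern, here is a rough standalone sketch of broadcasting a picklable object over a bare torch.distributed.TCPStore; it illustrates the idea only and is not the StatelessGroupCoordinator implementation from this commit:

```python
import pickle

from torch.distributed import TCPStore


def broadcast_obj_via_store(store: TCPStore, obj, rank: int, key: str = "bcast_obj"):
    # Rank 0 publishes the pickled object under a well-known key; other ranks
    # block in store.get() until the key appears, then unpickle it.
    if rank == 0:
        store.set(key, pickle.dumps(obj).hex())
        return obj
    return pickle.loads(bytes.fromhex(store.get(key).decode()))


# Usage sketch (host/port/world_size are placeholders): every rank connects to
# the same store, rank 0 acting as the server, and ends up with the same object.
# store = TCPStore("10.0.0.1", 29999, world_size=4, is_master=(rank == 0))
# uid = broadcast_obj_via_store(store, uid, rank)
```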
