Commit 936aba0

EEP phase: stateless group + CUDA graph support

Support request serving during scaling up/down.

Signed-off-by: Yongji Wu <[email protected]>

1 parent 577d498

26 files changed: +2451 -568 lines

tools/ep_kernels/elastic_ep/install_eep_libraries.sh
Lines changed: 8 additions & 1 deletion

@@ -52,6 +52,12 @@ if [ -z "$CUDA_HOME" ]; then
     exit 1
 fi
 
+# assume TORCH_CUDA_ARCH_LIST is set correctly
+if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
+    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+    exit 1
+fi
+
 # disable all features except IBGDA
 export NVSHMEM_IBGDA_SUPPORT=1
@@ -82,5 +88,6 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
+git checkout 12cecfd
+PIP_NO_BUILD_ISOLATION=0 pip install . --no-deps -v
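With the hardcoded architecture removed, the script now aborts unless TORCH_CUDA_ARCH_LIST is exported by the caller. One possible invocation, reusing the Hopper value the deleted line used to hardcode:

    TORCH_CUDA_ARCH_LIST=9.0a+PTX bash tools/ep_kernels/elastic_ep/install_eep_libraries.sh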

vllm/config/parallel.py
Lines changed: 59 additions & 1 deletion

@@ -157,6 +157,9 @@ class ParallelConfig:
     disable_custom_all_reduce: bool = False
     """Disable the custom all-reduce kernel and fall back to NCCL."""
 
+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""
@@ -218,6 +221,21 @@ class is dynamically inherited by the worker class. This is used to inject
     Set to be private as it's not intended to be configured by users.
     """
 
+    _stateless_world_group_port_list: list[int] = Field(default_factory=list)
+    """List of open ports for stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
+    _stateless_dp_group_port_list: list[int] = Field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
+    _stateless_ep_group_port_list: list[int] = Field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
     decode_context_parallel_size: int = 1
     """Number of decode context parallel groups, because the world size does
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -310,7 +328,16 @@ def get_next_dp_init_port(self) -> int:
 
         return answer
 
-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop(0)
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop(0)
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop(0)
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
         # NOTE: In high-concurrency scenarios multiple processes
         # can pick the same (currently free) port through a race
         # condition when calling `get_open_port()`. When the first
@@ -335,6 +362,7 @@ def stateless_init_dp_group(self) -> ProcessGroup:
                 self.data_parallel_rank,
                 self.data_parallel_size,
                 backend="gloo",
+                return_store=return_store,
             )
         except DistNetworkError as e:
             # We only want to retry when the root cause is EADDRINUSE.
@@ -470,6 +498,36 @@ def __post_init__(self) -> None:
             logger.info("Using external launcher for distributed inference.")
             self.world_size *= self.data_parallel_size
 
+        # Initialize stateless group ports for elastic EP
+        if self.enable_elastic_ep:
+            num_world_groups = 1
+            num_dp_groups = max(1, self.world_size_across_dp // self.data_parallel_size)
+            num_ep_groups = max(
+                1,
+                self.world_size_across_dp
+                // (self.data_parallel_size * self.tensor_parallel_size),
+            )
+
+            total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3
+
+            if not self._stateless_world_group_port_list:
+                all_ports = get_open_ports_list(total_ports_needed + 5)
+                self._data_parallel_master_port_list = all_ports[-5:]
+                all_ports = all_ports[:-5]
+                self._stateless_world_group_port_list = [
+                    all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+                ]
+                start_idx = num_world_groups * 3
+                self._stateless_dp_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+                ]
+                start_idx += num_dp_groups * 3
+                self._stateless_ep_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+                ]
+
         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
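The port bookkeeping in the __post_init__ hunk above is easier to follow in isolation. Below is a minimal standalone sketch (not vLLM code; the function name and port values are made up) of how the flat list returned by get_open_ports_list is split into one three-port group per stateless world/DP/EP group, with the last five ports set aside for _data_parallel_master_port_list:

```python
def partition_ports(all_ports: list[int], num_world: int, num_dp: int, num_ep: int):
    """Split a flat port list into per-group triplets, mirroring __post_init__."""
    assert len(all_ports) == (num_world + num_dp + num_ep) * 3 + 5
    dp_master_ports = all_ports[-5:]  # kept for _data_parallel_master_port_list
    ports = all_ports[:-5]
    world = [ports[i : i + 3] for i in range(0, num_world * 3, 3)]
    start = num_world * 3
    dp = [ports[i : i + 3] for i in range(start, start + num_dp * 3, 3)]
    start += num_dp * 3
    ep = [ports[i : i + 3] for i in range(start, start + num_ep * 3, 3)]
    return world, dp, ep, dp_master_ports


# Example: world_size_across_dp=8, data_parallel_size=2, tensor_parallel_size=2
# -> 1 world group, 4 DP groups, 2 EP groups -> 7 triplets + 5 spares = 26 ports.
fake_ports = list(range(20000, 20026))
world, dp, ep, spares = partition_ports(fake_ports, num_world=1, num_dp=4, num_ep=2)
print(world[0])  # [20000, 20001, 20002]
```

Each get_next_stateless_*_group_port() call then pops the next triplet from its list, so every stateless group created later receives its own pre-reserved, non-overlapping set of ports.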

vllm/distributed/device_communicators/all2all.py
Lines changed: 26 additions & 22 deletions

@@ -32,8 +32,8 @@ class NaiveAll2AllManager(All2AllManagerBase):
     debugging.
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def naive_multicast(
         self,
@@ -105,8 +105,8 @@ class AgRsAll2AllManager(All2AllManagerBase):
     all-gather (dispatch) and reduce-scatter (combine).
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def dispatch(
         self,
@@ -155,13 +155,16 @@ class PPLXAll2AllManager(All2AllManagerBase):
     All2All communication based on PPLX kernels.
     """
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_pplx(), (
             "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
            " to install pplx_kernels."
         )
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
+        self.nvshmem_initialized = False
+        self.handle_cache = Cache()
 
+    def get_handle(self, kwargs):
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
@@ -181,17 +184,18 @@ def __init__(self, cpu_group):
                 if self.rank == 0
                 else nvshmem_alloc_empty_unique_id()
             )
-            dist.broadcast(
-                uid,
-                src=dist.get_process_group_ranks(self.cpu_group)[0],
-                group=self.cpu_group,
-            )
+            if self.tcp_store_group is not None:
+                uid = self.tcp_store_group.broadcast_obj(uid, src=0)
+            else:
+                dist.broadcast(
+                    uid,
+                    src=dist.get_process_group_ranks(self.cpu_group)[0],
+                    group=self.cpu_group,
+                )
             logger.debug("PPLX NVSHMEM UID = %s", uid)
             nvshmem_init(uid, self.rank, self.world_size)
+            self.nvshmem_initialized = True
 
-        self.handle_cache = Cache()
-
-    def get_handle(self, kwargs):
         import pplx_kernels as pplx  # type: ignore[import-not-found]
 
         return self.handle_cache.get_or_create(
@@ -231,12 +235,12 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_deep_ep(), (
             "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install DeepEP kernels."
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         self.handle_cache = Cache()
 
         # This is the DeepEP default. Stick to it till we can establish
@@ -268,8 +272,8 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.
@@ -325,8 +329,8 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP Low-Latency kernels.
    """
 
-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)
 
     def _make_all2all_kwargs(
         self,
@@ -394,11 +398,11 @@ class FlashInferAllToAllManager(All2AllManagerBase):
     rank: int
     world_size: int
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_flashinfer_all2all(), (
             "flashinfer all2all module not found. Please install/check flashinfer"
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         logger.debug(
             "Initialize for flashinfer All2All rank=%d, world size=%d",
             self.rank,
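The key behavioral change in PPLXAll2AllManager is how the NVSHMEM unique id reaches the non-zero ranks. A simplified sketch of that fallback (illustrative only, not the vLLM classes; it assumes tcp_store_group exposes the broadcast_obj method used in the hunk above):

```python
import torch.distributed as dist


def broadcast_bootstrap_uid(uid, cpu_group, tcp_store_group=None):
    """Ship the NVSHMEM unique id from rank 0 to all ranks."""
    if tcp_store_group is not None:
        # Stateless path: no torch.distributed group exists for this set of
        # ranks, so the id travels as a Python object through the TCP store.
        return tcp_store_group.broadcast_obj(uid, src=0)
    # Stateful path: regular in-place tensor broadcast over the gloo cpu_group.
    dist.broadcast(
        uid, src=dist.get_process_group_ranks(cpu_group)[0], group=cpu_group
    )
    return uid
```

The diff also moves this bootstrap from __init__ into get_handle and records completion in nvshmem_initialized, so NVSHMEM is only set up once a handle is actually requested.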

vllm/distributed/device_communicators/base_device_communicator.py
Lines changed: 45 additions & 10 deletions

@@ -29,8 +29,9 @@ class All2AllManagerBase:
     rank: int
     world_size: int
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         self.cpu_group = cpu_group
+        self.tcp_store_group = tcp_store_group
 
         # compute some common properties
         from vllm.distributed.parallel_state import (
@@ -47,12 +48,17 @@ def __init__(self, cpu_group):
         # when we create this object
         self.dp_rank = self.dp_group.rank_in_group
         self.dp_world_size = self.dp_group.world_size
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
+        self.rank = cpu_group.rank()
+        self.world_size = cpu_group.size()
 
         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        if tcp_store_group is None:
+            self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        else:
+            self.internode = not all(
+                in_the_same_node_as(tcp_store_group, source_rank=0)
+            )
 
     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
@@ -98,17 +104,36 @@ def __init__(
         device: torch.device | None = None,
         device_group: ProcessGroup | None = None,
        unique_name: str = "",
+        global_ranks: list[int] | None = None,
+        global_world_size: int | None = None,
     ):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
         self.device_group = device_group
         self.unique_name = unique_name
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
-        self.ranks = dist.get_process_group_ranks(cpu_group)
-        self.global_rank = dist.get_rank()
-        self.global_world_size = dist.get_world_size()
-        self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
+
+        # Check if this is a stateless process group
+        from torch.distributed.distributed_c10d import _world
+
+        is_stateless = _world.pg_map.get(cpu_group, None) is None
+
+        if is_stateless:
+            # For stateless groups, we can't use torch.distributed methods
+            self.rank = cpu_group.rank()
+            self.world_size = cpu_group.size()
+            assert global_ranks is not None
+            assert global_world_size is not None
+            self.ranks = global_ranks
+            self.global_rank = self.ranks[self.rank]
+            self.global_world_size = global_world_size
+            self.rank_in_group = self.rank
+        else:
+            self.rank = dist.get_rank(cpu_group)
+            self.world_size = dist.get_world_size(cpu_group)
+            self.ranks = dist.get_process_group_ranks(cpu_group)
+            self.global_rank = dist.get_rank()
+            self.global_world_size = dist.get_world_size()
+            self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
 
         use_ep = False
         all2all_backend = None
@@ -252,6 +277,13 @@ def recv(
             torch.distributed.recv(tensor, self.ranks[src], self.device_group)
         return tensor
 
+    def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
+        """Broadcast a tensor from source rank to all ranks."""
+        if self.world_size == 1:
+            return tensor
+        torch.distributed.broadcast(tensor, self.ranks[src], self.device_group)
+        return tensor
+
     def destroy(self):
         pass
 
@@ -295,3 +327,6 @@ def combine(
         This is a no-op in the base class.
         """
         return hidden_states
+
+    def batch_isend_irecv(self, p2p_ops: list):
+        raise NotImplementedError
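The stateless-group detection above relies on the fact that a group created outside torch.distributed's global state never appears in _world.pg_map, so dist.get_rank and dist.get_world_size cannot be used on it. A minimal sketch of the same check (illustrative only, not the vLLM class):

```python
import torch.distributed as dist
from torch.distributed.distributed_c10d import _world


def group_rank_and_size(cpu_group) -> tuple[int, int]:
    """Return (rank, size) for either a registered or a stateless group."""
    if _world.pg_map.get(cpu_group, None) is None:
        # Stateless group: not tracked by torch.distributed, so ask the
        # group object directly.
        return cpu_group.rank(), cpu_group.size()
    # Registered ProcessGroup: the global accessors work as usual.
    return dist.get_rank(cpu_group), dist.get_world_size(cpu_group)
```

For stateless groups the communicator also cannot derive global rank information from the default group, which is why the constructor now takes explicit global_ranks and global_world_size arguments and asserts they are provided.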
