Skip to content

Commit 5017132

Browse files
leo-ponymercykid
authored and committed
[bugfix] Fix Ray start failure caused by the "local_world_size must be less than or equal to the number of visible devices" assertion (vllm-project#4457)
### What this PR does / why we need it? Fixes the Ray startup failure triggered by the assertion that `local_world_size` must be less than or equal to the visible device count; see issue vllm-project#4456 for details. The fix is ported from the corresponding vLLM change, PR: [#28873](vllm-project/vllm#28873) - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 --------- Signed-off-by: leo-pony <[email protected]> Signed-off-by: Che Ruan <[email protected]>
1 parent 52f97b0 commit 5017132

File tree

3 files changed

+16
-6
lines changed

3 files changed

+16
-6
lines changed

tests/ut/torchair/test_torchair_worker.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def test_init_device(self, mock_platform, mock_init_dist_env):
5959
worker.vllm_config = MagicMock()
6060
worker.parallel_config = MagicMock()
6161
worker.parallel_config.local_world_size = 0
62+
worker.parallel_config.data_parallel_size = 1
6263

6364
result = worker._init_device()
6465

@@ -93,6 +94,7 @@ def test_init_device_torchair_worker(self, mock_platform,
9394
worker.vllm_config = MagicMock()
9495
worker.parallel_config = MagicMock()
9596
worker.parallel_config.local_world_size = 0
97+
worker.parallel_config.data_parallel_size = 1
9698

9799
result = worker._init_device()
98100

tests/ut/worker/test_worker_v1.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ def test_init_device(self, mock_platform, mock_init_dist_env):
329329
worker.model_config = MagicMock()
330330
worker.parallel_config = MagicMock()
331331
worker.parallel_config.local_world_size = 0
332+
worker.parallel_config.data_parallel_size = 1
333+
332334
worker.model_config.seed = 42
333335

334336
# Test _init_device

vllm_ascend/worker/worker_v1.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,12 +208,18 @@ def _init_device(self):
208208
NPUPlatform.set_device(device)
209209
NPUPlatform.empty_cache()
210210

211-
visible_device_count = (torch.npu.device_count()
212-
if torch.npu.is_available() else 0)
213-
assert self.parallel_config.local_world_size <= visible_device_count, (
214-
f"local_world_size ({self.parallel_config.local_world_size}) must be "
215-
f"less than or equal to the number of visible devices "
216-
f"({visible_device_count}).")
211+
if (self.parallel_config.data_parallel_size > 1
212+
and self.parallel_config.data_parallel_size_local > 0
213+
and self.parallel_config.distributed_executor_backend
214+
not in ["ray", "external_launcher"] and
215+
self.vllm_config.parallel_config.data_parallel_backend != "ray"
216+
and self.vllm_config.parallel_config.nnodes_within_dp == 1):
217+
visible_device_count = (torch.npu.device_count()
218+
if torch.npu.is_available() else 0)
219+
assert self.parallel_config.local_world_size <= visible_device_count, (
220+
f"local_world_size ({self.parallel_config.local_world_size}) must "
221+
f"be less than or equal to the number of visible devices "
222+
f"({visible_device_count}).")
217223

218224
self.init_npu_memory = NPUPlatform.mem_get_info()[0]
219225
# Initialize the distributed environment.

0 commit comments

Comments
 (0)