|
2 | 2 | from functools import partial |
3 | 3 | from typing import Any, Awaitable, List, Optional, Set, Tuple, Union |
4 | 4 |
|
5 | | -import vllm.envs as envs |
6 | | -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, |
7 | | - SchedulerConfig) |
8 | 5 | from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase |
9 | 6 | from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, |
10 | 7 | ResultHandler, WorkerMonitor) |
|
13 | 10 | from vllm.model_executor.layers.sampler import SamplerOutput |
14 | 11 | from vllm.prompt_adapter.request import PromptAdapterRequest |
15 | 12 | from vllm.sequence import ExecuteModelRequest |
16 | | -from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port, |
| 13 | +from vllm.utils import (get_distributed_init_method, get_open_port, |
17 | 14 | get_vllm_instance_id, make_async) |
18 | 15 | from vllm.worker.worker_base import WorkerWrapperBase |
19 | 16 |
|
@@ -57,13 +54,6 @@ def _init_executor(self) -> None: |
57 | 54 | os.environ["LOCAL_WORLD_SIZE"] = str( |
58 | 55 | self.parallel_config.tensor_parallel_size) |
59 | 56 |
|
60 | | - self.model_config = _verify_and_get_model_config(self.model_config) |
61 | | - self.cache_config = _verify_and_get_cache_config(self.cache_config) |
62 | | - self.scheduler_config = _verify_and_get_scheduler_config( |
63 | | - self.scheduler_config) |
64 | | - self.parallel_config = _verify_and_get_parallel_config( |
65 | | - self.parallel_config) |
66 | | - |
67 | 57 | # Multiprocessing-based executor does not support multi-node settings. |
68 | 58 | # Since it only works on a single node, we can use the loopback address |
69 | 59 | # 127.0.0.1 for communication. |
@@ -313,62 +303,6 @@ async def check_health_async(self) -> None: |
313 | 303 | self.check_health() |
314 | 304 |
|
315 | 305 |
|
316 | | -def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: |
317 | | - # Reminder: Please update docs/source/serving/compatibility_matrix.rst |
318 | | - # If the feature combo become valid |
319 | | - if not config.enforce_eager: |
320 | | - logger.warning( |
321 | | - "CUDA graph is not supported on CPU, fallback to the eager " |
322 | | - "mode.") |
323 | | - config.enforce_eager = True |
324 | | - return config |
325 | | - |
326 | | - |
327 | | -def _verify_and_get_scheduler_config( |
328 | | - config: SchedulerConfig) -> SchedulerConfig: |
329 | | - # Reminder: Please update docs/source/serving/compatibility_matrix.rst |
330 | | - # If the feature combo become valid |
331 | | - if config.chunked_prefill_enabled: |
332 | | - logger.warning("Chunked prefill is not supported on CPU, disable it.") |
333 | | - config.chunked_prefill_enabled = False |
334 | | - |
335 | | - return config |
336 | | - |
337 | | - |
338 | | -def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: |
339 | | - # Reminder: Please update docs/source/serving/compatibility_matrix.rst |
340 | | - # If the feature combo become valid |
341 | | - if config.enable_prefix_caching: |
342 | | - logger.warning("Prefix caching is not supported on CPU, disable it.") |
343 | | - config.enable_prefix_caching = False |
344 | | - |
345 | | - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE |
346 | | - |
347 | | - if kv_cache_space >= 0: |
348 | | - if kv_cache_space == 0: |
349 | | - config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore |
350 | | - logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) " |
351 | | - "for CPU backend is not set, using 4 by default.") |
352 | | - else: |
353 | | - config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore |
354 | | - else: |
355 | | - raise RuntimeError( |
356 | | - "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" |
357 | | - f" {kv_cache_space}, expect a positive integer value.") |
358 | | - |
359 | | - return config |
360 | | - |
361 | | - |
362 | | -def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig: |
363 | | - if (config.distributed_executor_backend is not None |
364 | | - and config.distributed_executor_backend != "mp"): |
365 | | - logger.warning( |
366 | | - "%s is not supported on CPU, fallback to mp distributed executor " |
367 | | - "backend.", config.distributed_executor_backend) |
368 | | - config.distributed_executor_backend = "mp" |
369 | | - return config |
370 | | - |
371 | | - |
372 | 306 | def _driver_method_invoker(driver, method: str, *args, **kwargs): |
373 | 307 | return getattr(driver, method)(*args, **kwargs) |
374 | 308 |
|
|
0 commit comments