@@ -35,7 +35,6 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
3535
3636 # Instantiate the worker and load the model to CPU.
3737 self ._init_worker ()
38- self ._init_cache ()
3938
4039 def _init_worker (self ):
4140 from vllm .worker .cpu_worker import CPUWorker
@@ -46,10 +45,11 @@ def _init_worker(self):
4645 distributed_init_method = get_distributed_init_method (
4746 get_ip (), get_open_port ())
4847 self .driver_worker = CPUWorker (
49- self .model_config ,
50- self .parallel_config ,
51- self .scheduler_config ,
52- self .device_config ,
48+ model_config = self .model_config ,
49+ parallel_config = self .parallel_config ,
50+ scheduler_config = self .scheduler_config ,
51+ device_config = self .device_config ,
52+ cache_config = self .cache_config ,
5353 local_rank = 0 ,
5454 rank = 0 ,
5555 distributed_init_method = distributed_init_method ,
@@ -60,35 +60,21 @@ def _init_worker(self):
6060 self .driver_worker .init_device ()
6161 self .driver_worker .load_model ()
6262
63- def _init_cache (self ) -> None :
64- num_cpu_blocks = self .driver_worker .get_cpu_cache_block_num (
65- block_size = self .cache_config .block_size ,
66- cache_space = self .cache_config .cpu_kvcache_space_bytes ,
67- cache_dtype = self .cache_config .cache_dtype ,
68- )
69-
def determine_num_available_blocks(self) -> tuple[int, int]:
    """Ask the driver worker how many KV-cache blocks are available.

    Returns:
        The tuple produced by the driver worker's
        ``determine_num_available_blocks``, passed through unchanged.
    """
    worker = self.driver_worker
    return worker.determine_num_available_blocks()
68+
def initialize_cache(self, num_gpu_blocks: int,
                     num_cpu_blocks: int) -> None:
    """Initialize the KV cache by invoking the underlying worker.

    Args:
        num_gpu_blocks: Block count forwarded to the driver worker as the
            first argument of its ``initialize_cache``. This is also the
            value reported in the log line.
        num_cpu_blocks: Block count forwarded to the driver worker as the
            second argument.
    """
    # NOTE: We log here to avoid multiple logs when number of workers is
    # greater than one. We could log in the engine, but not all executors
    # have GPUs.
    # NOTE(review): the CPU backend reuses the GPU block-management path,
    # so the CPU-resident cache size is expected to arrive in
    # `num_gpu_blocks` while `num_cpu_blocks` is 0. Logging
    # `num_cpu_blocks` would therefore always print 0 -- confirm against
    # the CPU worker's `determine_num_available_blocks`.
    logger.info("# CPU blocks: %s", num_gpu_blocks)
    self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
9278
9379 def execute_model (self ,
9480 seq_group_metadata_list : List [SequenceGroupMetadata ],
@@ -104,13 +90,13 @@ def execute_model(self,
10490 return output
10591
def add_lora(self, lora_request: LoRARequest) -> bool:
    """Delegate ``add_lora`` to the driver worker.

    Returns:
        Whatever the driver worker's ``add_lora`` returns for
        ``lora_request``.
    """
    worker = self.driver_worker
    return worker.add_lora(lora_request)
10894
def remove_lora(self, lora_id: int) -> bool:
    """Delegate ``remove_lora`` to the driver worker.

    Returns:
        Whatever the driver worker's ``remove_lora`` returns for
        ``lora_id``.
    """
    worker = self.driver_worker
    return worker.remove_lora(lora_id)
11197
def list_loras(self) -> List[int]:
    """Delegate ``list_loras`` to the driver worker.

    Returns:
        The driver worker's ``list_loras`` result, passed through
        unchanged.
    """
    worker = self.driver_worker
    return worker.list_loras()
114100
115101 def check_health (self ) -> None :
116102 # CPUExecutor will always be healthy as long as
0 commit comments