From 252a0c7c2c491ce5c4e6ff70463cfacb7c2ed18e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 14:17:28 -0700 Subject: [PATCH 001/165] wip --- vllm/executor/executor_base.py | 39 +++++++++++++++++++ vllm/worker/worker_base.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 vllm/worker/worker_base.py diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 55180d6110b6..7e21ded9b134 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -28,6 +28,45 @@ def __init__( ) -> None: raise NotImplementedError + + #@abstractmethod + #def init_workers(self) -> None: + # """Initialize workers, such as loading the model or preparing on-device + # tensors. + # """ + # raise NotImplementedError + + + #@abstractmethod + #def profile_num_available_blocks(self, block_size: int, + # gpu_memory_utilization: float, + # cpu_swap_space: float, + # cache_dtype: str) -> tuple[int, int]: + # """Profile the model on-device to determine the maximum number of KV + # blocks that can be allocated. + + # Returns a tuple[num_device_blocks, num_cpu_blocks], where + # num_device_blocks refers to the number of blocks in the "active" KV + # cache (e.g. where blocks are appended to), and num_cpu_blocks refers + # to the number of blocks in the "passive" KV cache (e.g. where blocks + # are swapped to). + + # Examples: + # - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + # - A future CPUExecutor can return [num_cpu_blocks, 0] or + # [num_cpu_blocks, num_swap_cpu_blocks]. + # """ + # raise NotImplementedError + + + #@abstractmethod + #def init_cache(self, cache_config: CacheConfig) -> None: + # """Given a fully-specified cache config, initialize the KV cache. This + # is separate from init_workers as profiling may be required to determine + # the maxmimum allowed KV cache size. + # """ + # raise NotImplementedError + @abstractmethod def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py new file mode 100644 index 000000000000..c12b876451ca --- /dev/null +++ b/vllm/worker/worker_base.py @@ -0,0 +1,70 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class WorkerBase(ABC): + @abstractmethod + def init_device(self) -> None: + """Initialize device state, such as loading the model or other on-device + memory allocations. + """ + raise NotImplementedError + + @abstractmethod + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: float, + cache_dtype: str) -> tuple[int, int]: + """Profile the model on-device to determine the maximum number of KV + blocks that can be allocated. + + Returns a tuple[num_device_blocks, num_cpu_blocks], where + num_device_blocks refers to the number of blocks in the "active" KV + cache (e.g. where blocks are appended to), and num_cpu_blocks refers + to the number of blocks in the "passive" KV cache (e.g. where blocks + are swapped to). + + Examples: + - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + - A future CPUExecutor can return [num_cpu_blocks, 0] or + [num_cpu_blocks, num_swap_cpu_blocks]. + """ + raise NotImplementedError + + @abstractmethod + def init_cache(self, cache_config: CacheConfig) -> None: + """Given a fully-specified cache config, initialize the KV cache. 
This + is separate from init_workers as profiling may be required to determine + the maxmimum allowed KV cache size. + """ + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError From a34800fbf0270814f370b5b06b535a8f70c16e16 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 16:11:21 -0700 Subject: [PATCH 002/165] wip --- vllm/engine/llm_engine.py | 8 ++++ vllm/entrypoints/llm.py | 1 + vllm/executor/executor_base.py | 6 +++ vllm/executor/gpu_executor.py | 83 +++++++++++++++++++++++++--------- 4 files changed, 76 insertions(+), 22 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5c343921f07f..831627ac72e9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -127,6 +127,13 @@ def __init__( speculative_config=speculative_config, ) + # TODO cleanup location + profile_result = self.model_executor.profile_num_available_blocks() + self.model_executor.allocate_kv_cache( + num_active_kv_blocks=profile_result.num_active_kv_blocks, + num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, + ) + # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): from vllm.model_executor.model_loader import ( @@ -212,6 +219,7 @@ def from_engine_args( log_stats=not engine_args.disable_log_stats, usage_context=usage_context, ) + return engine def __reduce__(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c..b079d7c117d8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -109,6 +109,7 @@ def __init__( disable_custom_all_reduce=disable_custom_all_reduce, **kwargs, ) + self.llm_engine = LLMEngine.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 1a069f2a971d..94b2fe420838 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional +from dataclasses import dataclass from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -115,3 +116,8 @@ async def check_health_async(self) -> None: """Checks if the executor is healthy. 
If not, it should raise an exception.""" raise NotImplementedError + +@dataclass(frozen=True) +class KvCacheProfileResult: + num_active_kv_blocks: int + num_swapped_kv_blocks: int diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 7b683107d30e..95cd0fa8940f 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -3,7 +3,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase, KvCacheProfileResult from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -42,7 +42,7 @@ def __init__( self._init_worker() # Profile the memory usage and initialize the cache. - self._init_cache() + #self._init_cache() def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers @@ -70,17 +70,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. + def profile_num_available_blocks(self) -> KvCacheProfileResult: + # TODO clean up datastructure num_gpu_blocks, num_cpu_blocks = ( self.driver_worker.profile_num_available_blocks( block_size=self.cache_config.block_size, @@ -90,27 +81,75 @@ def _init_cache(self) -> None: cache_dtype=self.cache_config.cache_dtype, )) + return KvCacheProfileResult( + num_active_kv_blocks=num_gpu_blocks, + num_swapped_kv_blocks=num_cpu_blocks, + ) + + def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks + forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_active_kv_blocks=} with " + f"{forced_num_active_kv_blocks=}") + num_active_kv_blocks = forced_num_active_kv_blocks - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") + logger.info(f"# GPU blocks: {num_active_kv_blocks}, " + f"# CPU blocks: {num_swapped_kv_blocks}") - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + check_block_size_valid(num_active_kv_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + self.cache_config.num_gpu_blocks = num_active_kv_blocks + self.cache_config.num_cpu_blocks = num_swapped_kv_blocks # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. self.driver_worker.warm_up_model() + #def _init_cache(self) -> None: + # """Profiles the memory usage and initializes the KV cache. 
+ + # The engine first profiles the existing memory usage. + # Then, it allocates the remaining memory for KV blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ + # # Get the maximum number of blocks that can be allocated on GPU and CPU. + # num_gpu_blocks, num_cpu_blocks = ( + # self.driver_worker.profile_num_available_blocks( + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config. + # gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + # )) + + # if self.cache_config.forced_num_gpu_blocks is not None: + # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + # logger.info(f"Replacing profiled {num_gpu_blocks=} with " + # f"{forced_num_gpu_blocks=}") + # num_gpu_blocks = forced_num_gpu_blocks + + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + + # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + # self.cache_config.num_gpu_blocks = num_gpu_blocks + # self.cache_config.num_cpu_blocks = num_cpu_blocks + + # # Initialize the cache. + # self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # # Warm up the model. This includes capturing the model into CUDA graph + # # if enforce_eager is False. + # self.driver_worker.warm_up_model() + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], From 09f30bde56f9f7709fcd14e5edcf4e98d345cf73 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 3 Apr 2024 16:45:05 -0700 Subject: [PATCH 003/165] wip --- vllm/executor/gpu_executor.py | 41 +---------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 95cd0fa8940f..df1f30b0e402 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -86,6 +86,7 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: num_swapped_kv_blocks=num_cpu_blocks, ) + def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks @@ -109,46 +110,6 @@ def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> # if enforce_eager is False. self.driver_worker.warm_up_model() - #def _init_cache(self) -> None: - # """Profiles the memory usage and initializes the KV cache. - - # The engine first profiles the existing memory usage. - # Then, it allocates the remaining memory for KV blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ - # # Get the maximum number of blocks that can be allocated on GPU and CPU. - # num_gpu_blocks, num_cpu_blocks = ( - # self.driver_worker.profile_num_available_blocks( - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config. 
- # gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - # )) - - # if self.cache_config.forced_num_gpu_blocks is not None: - # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - # logger.info(f"Replacing profiled {num_gpu_blocks=} with " - # f"{forced_num_gpu_blocks=}") - # num_gpu_blocks = forced_num_gpu_blocks - - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - - # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) - - # self.cache_config.num_gpu_blocks = num_gpu_blocks - # self.cache_config.num_cpu_blocks = num_cpu_blocks - - # # Initialize the cache. - # self.driver_worker.init_cache_engine(cache_config=self.cache_config) - # # Warm up the model. This includes capturing the model into CUDA graph - # # if enforce_eager is False. - # self.driver_worker.warm_up_model() def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], From 8b5bb8b98320d235fcba5c960c6fb1778bd314c6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:33:02 -0700 Subject: [PATCH 004/165] clean --- vllm/engine/llm_engine.py | 2 +- vllm/executor/gpu_executor.py | 26 +++++++++++++++----------- vllm/worker/worker.py | 22 +++++++++++++++------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 831627ac72e9..c311c96b76e8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,7 +129,7 @@ def __init__( # TODO cleanup location profile_result = self.model_executor.profile_num_available_blocks() - self.model_executor.allocate_kv_cache( + self.model_executor.initialize_cache( num_active_kv_blocks=profile_result.num_active_kv_blocks, num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, ) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index df1f30b0e402..8095659c092e 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -55,16 +55,17 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - kv_cache_dtype=self.cache_config.cache_dtype, + #kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) self.driver_worker.init_device() @@ -74,12 +75,15 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: # TODO clean up datastructure num_gpu_blocks, num_cpu_blocks = ( self.driver_worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, + #self.cache_config, )) + #self.driver_worker.profile_num_available_blocks( + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config. 
+ # gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + #)) return KvCacheProfileResult( num_active_kv_blocks=num_gpu_blocks, @@ -87,7 +91,7 @@ def profile_num_available_blocks(self) -> KvCacheProfileResult: ) - def allocate_kv_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: + def initialize_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks logger.info(f"Replacing profiled {num_active_kv_blocks=} with " diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 48facb57de19..58a8752d9dcc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -35,18 +35,20 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, local_rank: int, rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - kv_cache_dtype: Optional[str] = "auto", + #kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -66,12 +68,12 @@ def __init__( scheduler_config, device_config, lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, vision_language_config=vision_language_config) # Uninitialized cache engine. Will be initialized by # self.init_cache_engine(). - self.cache_config = None + #self.cache_config = None self.cache_engine = None self.gpu_cache = None @@ -109,10 +111,10 @@ def load_model(self): @torch.inference_mode() def profile_num_available_blocks( self, - block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, + #block_size: int, + #gpu_memory_utilization: float, + #cpu_swap_space: int, + #cache_dtype: str, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. @@ -122,6 +124,12 @@ def profile_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ + + block_size = self.cache_config.block_size + gpu_memory_utilization = self.cache_config.gpu_memory_utilization + cpu_swap_space = self.cache_config.swap_space_bytes + cache_dtype = self.cache_config.cache_dtype + # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() From 6fd424f4391a5a4f8138e696c68ace58906e913c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:40:24 -0700 Subject: [PATCH 005/165] wip --- vllm/engine/llm_engine.py | 5 ++-- vllm/executor/executor_base.py | 5 ---- vllm/executor/gpu_executor.py | 45 ++++++++++++---------------------- vllm/worker/worker.py | 6 ++--- 4 files changed, 19 insertions(+), 42 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c311c96b76e8..8758edf0ef9b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -128,10 +128,9 @@ def __init__( ) # TODO cleanup location - profile_result = self.model_executor.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() self.model_executor.initialize_cache( - num_active_kv_blocks=profile_result.num_active_kv_blocks, - num_swapped_kv_blocks=profile_result.num_swapped_kv_blocks, + num_gpu_blocks, num_cpu_blocks, ) # If usage stat is enabled, collect relevant info. diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 94b2fe420838..b531d2080be5 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -116,8 +116,3 @@ async def check_health_async(self) -> None: """Checks if the executor is healthy. If not, it should raise an exception.""" raise NotImplementedError - -@dataclass(frozen=True) -class KvCacheProfileResult: - num_active_kv_blocks: int - num_swapped_kv_blocks: int diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 8095659c092e..7fa2c4eb6024 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,9 +1,9 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase, KvCacheProfileResult +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -71,41 +71,26 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def profile_num_available_blocks(self) -> KvCacheProfileResult: - # TODO clean up datastructure - num_gpu_blocks, num_cpu_blocks = ( - self.driver_worker.profile_num_available_blocks( - #self.cache_config, - )) - #self.driver_worker.profile_num_available_blocks( - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config. 
- # gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - #)) - - return KvCacheProfileResult( - num_active_kv_blocks=num_gpu_blocks, - num_swapped_kv_blocks=num_cpu_blocks, - ) + + def profile_num_available_blocks(self) -> Tuple[int, int]: + return self.driver_worker.profile_num_available_blocks() - def initialize_cache(self, num_active_kv_blocks: int, num_swapped_kv_blocks) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_active_kv_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_active_kv_blocks=} with " - f"{forced_num_active_kv_blocks=}") - num_active_kv_blocks = forced_num_active_kv_blocks + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks - logger.info(f"# GPU blocks: {num_active_kv_blocks}, " - f"# CPU blocks: {num_swapped_kv_blocks}") + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") - check_block_size_valid(num_active_kv_blocks, self.cache_config.block_size, + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_active_kv_blocks - self.cache_config.num_cpu_blocks = num_swapped_kv_blocks + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 58a8752d9dcc..8176529ba7be 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -111,15 +111,12 @@ def load_model(self): @torch.inference_mode() def profile_num_available_blocks( self, - #block_size: int, - #gpu_memory_utilization: float, - #cpu_swap_space: int, - #cache_dtype: str, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. Args: + # TODO block_size: The size of the cache block. gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. 
@@ -161,6 +158,7 @@ def profile_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() + return num_gpu_blocks, num_cpu_blocks def init_cache_engine(self, cache_config: CacheConfig) -> None: From 2a347bb39a4284fcf7710541838218ac1666b6ef Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:50:15 -0700 Subject: [PATCH 006/165] wip --- vllm/engine/llm_engine.py | 32 +++++++++++++++++++++++++++++--- vllm/executor/gpu_executor.py | 15 --------------- vllm/executor/utils.py | 1 + 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8758edf0ef9b..e0e732681f7d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,9 +129,20 @@ def __init__( # TODO cleanup location num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() - self.model_executor.initialize_cache( - num_gpu_blocks, num_cpu_blocks, - ) + + if self.cache_config.forced_num_gpu_blocks is not None: + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks + + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + + logger.info( + f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): @@ -841,3 +852,18 @@ def list_loras(self) -> List[int]: def check_health(self) -> None: self.model_executor.check_health() + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 7fa2c4eb6024..bc88571542f6 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -41,9 +41,6 @@ def __init__( # Instantiate the worker and load the model to GPU. self._init_worker() - # Profile the memory usage and initialize the cache. 
- #self._init_cache() - def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker @@ -77,18 +74,6 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py index 44976696a77c..666ab4b2927d 100644 --- a/vllm/executor/utils.py +++ b/vllm/executor/utils.py @@ -1,3 +1,4 @@ +# TODO def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " From 658ff9be6ac2a7bd3dadf3ff9542763c3f2518d0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:55:26 -0700 Subject: [PATCH 007/165] wip --- vllm/config.py | 15 +++++++++++++++ vllm/engine/llm_engine.py | 4 +++- vllm/executor/gpu_executor.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e27c8eb4fd25..7a137006d0b5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,6 +367,21 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} + def shallow_copy(self): + cache_config = CacheConfig( + block_size=self.block_size, + gpu_memory_utilization=self.gpu_memory_utilization, + swap_space=self.swap_space_bytes // _GB, + cache_dtype=self.cache_dtype, + forced_num_gpu_blocks=self.forced_num_gpu_blocks, + sliding_window=self.sliding_window, + enable_prefix_caching=self.enable_prefix_caching + ) + + cache_config.num_gpu_blocks = self.num_gpu_blocks + cache_config.num_cpu_blocks = self.num_cpu_blocks + return cache_config + def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e0e732681f7d..11dad5e5e86c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -118,7 +118,7 @@ def __init__( self.model_executor = executor_class( model_config=model_config, - cache_config=cache_config, + cache_config=cache_config.shallow_copy(), parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, @@ -137,6 +137,8 @@ def __init__( num_gpu_blocks = forced_num_gpu_blocks raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks logger.info( f"# GPU blocks: {num_gpu_blocks}, " diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index bc88571542f6..0156b735c7b2 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -56,7 +56,7 @@ def _init_worker(self): parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, - cache_config=self.cache_config, + 
cache_config=self.cache_config.shallow_copy(), local_rank=0, rank=0, distributed_init_method=distributed_init_method, From acee7bec37362863b7bd57eafcbc693bff76a64e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 00:59:41 -0700 Subject: [PATCH 008/165] wip --- vllm/engine/llm_engine.py | 1 + vllm/executor/executor_base.py | 51 ++++++++++++++++------------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 11dad5e5e86c..f030e7ebf679 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -137,6 +137,7 @@ def __init__( num_gpu_blocks = forced_num_gpu_blocks raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b531d2080be5..9dd372156b9f 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -40,35 +40,32 @@ def __init__( # raise NotImplementedError - #@abstractmethod - #def profile_num_available_blocks(self, block_size: int, - # gpu_memory_utilization: float, - # cpu_swap_space: float, - # cache_dtype: str) -> tuple[int, int]: - # """Profile the model on-device to determine the maximum number of KV - # blocks that can be allocated. - - # Returns a tuple[num_device_blocks, num_cpu_blocks], where - # num_device_blocks refers to the number of blocks in the "active" KV - # cache (e.g. where blocks are appended to), and num_cpu_blocks refers - # to the number of blocks in the "passive" KV cache (e.g. where blocks - # are swapped to). - - # Examples: - # - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - # - A future CPUExecutor can return [num_cpu_blocks, 0] or - # [num_cpu_blocks, num_swap_cpu_blocks]. - # """ - # raise NotImplementedError + @abstractmethod + def profile_num_available_blocks(self) -> tuple[int, int]: + """Profile the model on-device to determine the maximum number of KV + blocks that can be allocated. + + Returns a tuple[num_device_blocks, num_cpu_blocks], where + num_device_blocks refers to the number of blocks in the "active" KV + cache (e.g. where blocks are appended to), and num_cpu_blocks refers + to the number of blocks in the "passive" KV cache (e.g. where blocks + are swapped to). + + Examples: + - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. + - A future CPUExecutor can return [num_cpu_blocks, 0] or + [num_cpu_blocks, num_swap_cpu_blocks]. + """ + raise NotImplementedError - #@abstractmethod - #def init_cache(self, cache_config: CacheConfig) -> None: - # """Given a fully-specified cache config, initialize the KV cache. This - # is separate from init_workers as profiling may be required to determine - # the maxmimum allowed KV cache size. - # """ - # raise NotImplementedError + @abstractmethod + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Given a fully-specified cache config, initialize the KV cache. This + is separate from init_workers as profiling may be required to determine + the maxmimum allowed KV cache size. 
+ """ + raise NotImplementedError @abstractmethod def execute_model(self, From 85760d63461700ff0a25f8bf53d2825d8d976d41 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:04:09 -0700 Subject: [PATCH 009/165] wip --- vllm/config.py | 2 +- vllm/engine/llm_engine.py | 23 +---------------------- vllm/executor/cpu_executor.py | 25 ++++++++++++++++++------- vllm/executor/gpu_executor.py | 11 +++++++++-- vllm/executor/neuron_executor.py | 22 ++++++++++++++++++++-- vllm/executor/utils.py | 14 ++++++++++++++ 6 files changed, 63 insertions(+), 34 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7a137006d0b5..735462d1eba6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,7 +367,7 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} - def shallow_copy(self): + def shallow_copy2(self): cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f030e7ebf679..f8cb7b0f38a1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -118,7 +118,7 @@ def __init__( self.model_executor = executor_class( model_config=model_config, - cache_config=cache_config.shallow_copy(), + cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, @@ -136,15 +136,9 @@ def __init__( f"{forced_num_gpu_blocks=}") num_gpu_blocks = forced_num_gpu_blocks - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - logger.info( - f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) # If usage stat is enabled, collect relevant info. @@ -855,18 +849,3 @@ def list_loras(self) -> List[int]: def check_health(self) -> None: self.model_executor.check_health() - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 7b3cc784c98e..38d5fa0032c5 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -35,7 +35,7 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, # Instantiate the worker and load the model to CPU. 
self._init_worker() - self._init_cache() + #self._init_cache() def _init_worker(self): from vllm.worker.cpu_worker import CPUWorker @@ -60,13 +60,29 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def _init_cache(self) -> None: + def profile_num_available_blocks(self) -> tuple[int, int]: num_cpu_blocks = self.driver_worker.get_cpu_cache_block_num( block_size=self.cache_config.block_size, cache_space=self.cache_config.cpu_kvcache_space_bytes, cache_dtype=self.cache_config.cache_dtype, ) + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_gpu_blocks = num_cpu_blocks + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + assert num_cpu_blocks == 0 + num_cpu_blocks = num_gpu_blocks + num_gpu_blocks = 0 + self.cache_config.num_gpu_blocks = num_cpu_blocks + self.cache_config.num_cpu_blocks = 0 + logger.info(f"# CPU blocks: {num_cpu_blocks}") if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " @@ -82,11 +98,6 @@ def _init_cache(self) -> None: "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " "initializing the engine.") - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - self.cache_config.num_gpu_blocks = num_cpu_blocks # type: ignore - self.cache_config.num_cpu_blocks = 0 # type: ignore - # Initialize the cache. self.driver_worker.init_cache_engine(cache_config=self.cache_config) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 0156b735c7b2..afec559a59fb 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -4,7 +4,7 @@ ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -56,7 +56,7 @@ def _init_worker(self): parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, - cache_config=self.cache_config.shallow_copy(), + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -74,6 +74,13 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + logger.info( + f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}" + ) + + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c0af058cb90b..53441d8ecca0 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -36,8 +36,8 @@ def __init__( # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. 
- self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs - self.cache_config.num_cpu_blocks = 0 + #self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs + #self.cache_config.num_cpu_blocks = 0 # Instantiate the worker and load the model to the device. self._init_worker() @@ -54,6 +54,24 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() + # TODO change name + def profile_num_available_blocks(self) -> tuple[int, int]: + # Set the number of GPU blocks to be the same as the maximum number of + # sequences that can be processed in a single batch. This is equivalent + # to schedule without PagedAttention. + num_gpu_blocks = self.scheduler_config.max_num_seqs + + # Swap not yet supported with Neuron backend. + num_cpu_blocks = 0 + + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + assert num_cpu_blocks == 0 + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py index 666ab4b2927d..89fe04434062 100644 --- a/vllm/executor/utils.py +++ b/vllm/executor/utils.py @@ -12,3 +12,17 @@ def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: f"stored in KV cache ({max_seq_len}). Try increasing " "`gpu_memory_utilization` or decreasing `max_model_len` when " "initializing the engine.") + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From 408b29d318da99c8bb277df38341dc41ebf98655 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:07:02 -0700 Subject: [PATCH 010/165] wip --- vllm/executor/ray_gpu_executor.py | 36 +++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 24b3a8c18d92..80ec36e9a5d9 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -10,7 +10,7 @@ VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -66,7 +66,7 @@ def __init__( self._init_workers_ray(placement_group) # Profile the memory usage and initialize the cache. - self._init_cache() + #self._init_cache() self.forward_dag = None if USE_RAY_COMPILED_DAG: @@ -256,6 +256,38 @@ def _init_cache(self) -> None: # if enforce_eager is False. 
self._run_workers("warm_up_model") + def profile_num_available_blocks(self) -> tuple[int, int]: + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], From 3149a03d2780f241c80b365a10b1d39e2af90abf Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:08:06 -0700 Subject: [PATCH 011/165] wip --- vllm/config.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 735462d1eba6..e27c8eb4fd25 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -367,21 +367,6 @@ def metrics_info(self): # metrics info return {key: str(value) for key, value in self.__dict__.items()} - def shallow_copy2(self): - cache_config = CacheConfig( - block_size=self.block_size, - gpu_memory_utilization=self.gpu_memory_utilization, - swap_space=self.swap_space_bytes // _GB, - cache_dtype=self.cache_dtype, - forced_num_gpu_blocks=self.forced_num_gpu_blocks, - sliding_window=self.sliding_window, - enable_prefix_caching=self.enable_prefix_caching - ) - - cache_config.num_gpu_blocks = self.num_gpu_blocks - cache_config.num_cpu_blocks = self.num_cpu_blocks - return cache_config - def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( From 0c32e0a793489d8f3b557f747e22bdb27bedb85f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:18:44 -0700 Subject: [PATCH 012/165] wip --- vllm/executor/gpu_executor.py | 2 +- vllm/worker/worker.py | 11 ++++++++--- vllm/worker/worker_base.py | 8 +------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index afec559a59fb..6619cdc15a17 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -85,7 +85,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8176529ba7be..b6955fa678bf 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,9 +19,10 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner +from vllm.worker.worker_base import WorkerBase -class Worker: +class Worker(WorkerBase): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single GPU. The worker is responsible for @@ -161,13 +162,17 @@ def profile_num_available_blocks( return num_gpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + #self.cache_config = cache_config self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.gpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) + def warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c12b876451ca..7db8cc0fe591 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -35,7 +35,7 @@ def profile_num_available_blocks(self, block_size: int, raise NotImplementedError @abstractmethod - def init_cache(self, cache_config: CacheConfig) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine the maxmimum allowed KV cache size. @@ -62,9 +62,3 @@ def remove_lora(self, lora_id: int) -> bool: @abstractmethod def list_loras(self) -> List[int]: raise NotImplementedError - - @abstractmethod - def check_health(self) -> None: - """Checks if the executor is healthy. If not, it should raise an - exception.""" - raise NotImplementedError From f64d5b14196dd3f21bfad900ba0033fd6441343b Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:24:04 -0700 Subject: [PATCH 013/165] wip --- vllm/executor/neuron_executor.py | 13 ++++++++----- vllm/worker/neuron_worker.py | 25 +++++++++++++++++++++++-- vllm/worker/worker_base.py | 5 +---- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 53441d8ecca0..c9ecca885ee2 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -25,7 +25,7 @@ def __init__( speculative_config: Optional[SpeculativeConfig], ) -> None: self.model_config = model_config - self.cache_config = cache_config + #self.cache_config = cache_config assert lora_config is None, "LoRA is not supported for Neuron backend." self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -56,6 +56,8 @@ def _init_worker(self): # TODO change name def profile_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.profile_num_available_blocks() + # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. 
@@ -67,10 +69,11 @@ def profile_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #assert num_cpu_blocks == 0 + #assert num_gpu_blocks == self.scheduler_config.max_num_seqs + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 0ae067aafb29..3f39808bf4ac 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -5,13 +5,14 @@ import torch.distributed from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig) + SchedulerConfig, CacheConfig) from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner +from vllm.worker.worker_base import WorkerBase -class NeuronWorker: +class NeuronWorker(WorkerBase): """A worker class that executes the model on a group of neuron cores. """ @@ -21,11 +22,13 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.model_runner = NeuronModelRunner(model_config, parallel_config, scheduler_config, device_config) @@ -37,6 +40,24 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() + # TODO change name + def profile_num_available_blocks(self) -> tuple[int, int]: + # Set the number of GPU blocks to be the same as the maximum number of + # sequences that can be processed in a single batch. This is equivalent + # to schedule without PagedAttention. + num_gpu_blocks = self.scheduler_config.max_num_seqs + + # Swap not yet supported with Neuron backend. + num_cpu_blocks = 0 + + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + assert num_cpu_blocks == 0 + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7db8cc0fe591..9c37459ed344 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,10 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def profile_num_available_blocks(self, block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: float, - cache_dtype: str) -> tuple[int, int]: + def profile_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
From 7207f0c368b43f1e8edbcde55e729f781dafb549 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 14:28:18 -0700 Subject: [PATCH 014/165] wip --- vllm/worker/cpu_worker.py | 43 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 262ed9abd36b..e9dff4a6bf5d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -17,6 +17,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.model_runner import ModelRunner +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -112,7 +113,7 @@ def get_cache_block_size( return dtype_size * total -class CPUWorker: +class CPUWorker(WorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. The worker is @@ -167,6 +168,46 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() + def profile_num_available_blocks(self) -> tuple[int, int]: + num_cpu_blocks = self.get_cpu_cache_block_num( + block_size=self.cache_config.block_size, + cache_space=self.cache_config.cpu_kvcache_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_gpu_blocks = num_cpu_blocks + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + assert num_cpu_blocks == 0 + num_cpu_blocks = num_gpu_blocks + num_gpu_blocks = 0 + self.cache_config.num_gpu_blocks = num_cpu_blocks + self.cache_config.num_cpu_blocks = 0 + + logger.info(f"# CPU blocks: {num_cpu_blocks}") + if num_cpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " + "initializing the engine.") + + max_seq_len = self.cache_config.block_size * num_cpu_blocks + if self.model_config.max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({self.model_config.max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " + "initializing the engine.") + + # Initialize the cache. + self.init_cache_engine(cache_config=self.cache_config) + def get_cpu_cache_block_num( self, block_size: int, From 0c4df0b14612af9b8cd63097bfe6e543bf365e97 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:05:43 -0700 Subject: [PATCH 015/165] wip --- vllm/worker/cpu_worker.py | 2 +- vllm/worker/worker_base.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index e9dff4a6bf5d..13f8f050c6ea 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -113,7 +113,7 @@ def get_cache_block_size( return dtype_size * total -class CPUWorker(WorkerBase): +class CPUWorker(LoraNotSupportedWorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. 
The worker is diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 9c37459ed344..4ba985c0a39b 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -59,3 +59,13 @@ def remove_lora(self, lora_id: int) -> bool: @abstractmethod def list_loras(self) -> List[int]: raise NotImplementedError + +class LoraNotSupportedWorkerBase(WorkerBase); + def add_lora(self, lora_request: LoRARequest) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def remove_lora(self, lora_id: int) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def list_loras(self) -> List[int]: + raise ValueError(f"{type(self)} does not support LoRA") From 2e355e7e02c095280878e93a8865443834adf0f5 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:07:13 -0700 Subject: [PATCH 016/165] wip --- vllm/executor/cpu_executor.py | 6 +++--- vllm/worker/worker_base.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 38d5fa0032c5..44fdddc3f942 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -115,13 +115,13 @@ def execute_model(self, return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.remove_lora(lora_id) def list_loras(self) -> List[int]: - raise NotImplementedError("LoRA is not implemented for cpu backend.") + return self.driver_worker.list_loras() def check_health(self) -> None: # CPUExecutor will always be healthy as long as diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 4ba985c0a39b..5fd381c5b953 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -60,7 +60,7 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError -class LoraNotSupportedWorkerBase(WorkerBase); +class LoraNotSupportedWorkerBase(WorkerBase): def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From edb7f6281f065a0d6f89852876152199b75519aa Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:07:59 -0700 Subject: [PATCH 017/165] wip --- vllm/worker/worker_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 5fd381c5b953..cf611eb2b88a 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -60,6 +60,7 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError + class LoraNotSupportedWorkerBase(WorkerBase): def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From 48bb3e9b0340160b394c5a754e0ef39a08ffdff6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 15:11:48 -0700 Subject: [PATCH 018/165] wip --- vllm/executor/neuron_executor.py | 9 +++------ vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 4 ++-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index c9ecca885ee2..2626a5d81672 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -89,16 +89,13 @@ def execute_model(self, 
return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.remove_lora(lora_request) def list_loras(self) -> List[int]: - raise NotImplementedError( - "LoRA is not implemented for neuron backend.") + return self.driver_worker.list_loras(lora_request) def check_health(self) -> None: # NeuronExecutor will always be healthy as long as diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 13f8f050c6ea..848f36d15abd 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -17,7 +17,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase logger = init_logger(__name__) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3f39808bf4ac..16e9a128d024 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -9,10 +9,10 @@ from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class NeuronWorker(WorkerBase): +class NeuronWorker(LoraNotSupportedWorkerBase): """A worker class that executes the model on a group of neuron cores. """ From 7b390444dfb97d6639b035601e941edc7952b2d1 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 16:57:36 -0700 Subject: [PATCH 019/165] fix test --- tests/spec_decode/test_spec_decode_worker.py | 22 ++++++---------- tests/spec_decode/utils.py | 3 ++- vllm/spec_decode/spec_decode_worker.py | 27 ++++++++------------ vllm/worker/cache_engine.py | 10 +++----- vllm/worker/worker.py | 10 +++----- 5 files changed, 27 insertions(+), 45 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 87d3716ca98d..d4c15d9aea50 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -512,7 +512,7 @@ def test_init_device(): @torch.inference_mode() -def test_init_cache_engine(): +def test_initialize_cache(): """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer workers. """ @@ -526,11 +526,12 @@ def test_init_cache_engine(): metrics_collector) cache_config = MagicMock() + + kwargs = {"num_gpu_blocks":1024, "num_cpu_blocks": 1023} + worker.initialize_cache(**kwargs) - worker.init_cache_engine(cache_config) - - draft_worker.init_cache_engine.assert_called_once_with(cache_config) - target_worker.init_cache_engine.assert_called_once_with(cache_config) + draft_worker.initialize_cache.assert_called_once_with(**kwargs) + target_worker.initialize_cache.assert_called_once_with(**kwargs) @pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) @@ -561,17 +562,10 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - # These values do not directly impact the adjusted block size calculation, - # so they can be fixed. 
- gpu_memory_utilization = 0.9 - cpu_swap_space = 100 - block_size = 16 - num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( - block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks() - target_worker.profile_num_available_blocks.assert_called_once_with( - block_size, gpu_memory_utilization, cpu_swap_space, "auto") + target_worker.profile_num_available_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 5ef1cc28253e..5c78b3b780d8 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -117,6 +117,7 @@ def create_worker(cls: type, parallel_config=engine_config.parallel_config, scheduler_config=engine_config.scheduler_config, device_config=engine_config.device_config, + cache_config=engine_config.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -128,7 +129,7 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 - worker.init_cache_engine(engine_config.cache_config) + worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) worker.warm_up_model() return worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 59f9d5b5107f..659acc6620bc 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,7 +3,6 @@ import torch -from vllm.config import CacheConfig from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceGroupOutput, SequenceOutput) @@ -15,9 +14,10 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker +from vllm.worker.worker_base import LoraNotSupportedWorkerBase -class SpecDecodeWorker: +class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. Speculative decoding reduces decoding per-token latency by using a proposal @@ -94,10 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def profile_num_available_blocks(self, block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str) -> Tuple[int, int]: + def profile_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -105,28 +102,24 @@ def profile_num_available_blocks(self, block_size: int, scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. 
""" - num_gpu_blocks, num_cpu_blocks = ( - self.scorer_worker.profile_num_available_blocks( - block_size, gpu_memory_utilization, cpu_swap_space, - cache_dtype)) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.profile_num_available_blocks()) scorer_cache_block_size_bytes = ( - self.scorer_worker.get_cache_block_size_bytes( - block_size, cache_dtype)) + self.scorer_worker.get_cache_block_size_bytes()) proposer_cache_block_size_bytes = ( - self.proposer_worker.get_cache_block_size_bytes( - block_size, cache_dtype)) + self.proposer_worker.get_cache_block_size_bytes()) new_num_gpu_blocks = split_num_cache_blocks_evenly( scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, num_gpu_blocks) return new_num_gpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig): + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. + TODO """ - self.scorer_worker.init_cache_engine(cache_config) - self.proposer_worker.init_cache_engine(cache_config) + self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) @torch.inference_mode() def execute_model( diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 27d1727cd16a..011fc69c4b1c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -80,10 +80,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - @staticmethod def get_cache_block_size( - block_size: int, - cache_dtype: str, + self, model_config: ModelConfig, parallel_config: ParallelConfig, ) -> int: @@ -91,13 +89,13 @@ def get_cache_block_size( num_heads = model_config.get_num_kv_heads(parallel_config) num_layers = model_config.get_num_layers(parallel_config) - key_cache_block = block_size * num_heads * head_size + key_cache_block = self.block_size * num_heads * head_size value_cache_block = key_cache_block total = num_layers * (key_cache_block + value_cache_block) - if cache_dtype == "auto": + if self.cache_dtype == "auto": dtype = model_config.dtype else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] + dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_dtype] dtype_size = _get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b6955fa678bf..5914079f713a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -147,8 +147,7 @@ def profile_num_available_blocks( "Error in memory profiling. This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") - cache_block_size = self.get_cache_block_size_bytes( - block_size, cache_dtype) + cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -250,13 +249,10 @@ def max_model_len(self) -> int: def vocab_size(self) -> int: return self.model_runner.vocab_size - def get_cache_block_size_bytes(self, block_size: int, - cache_dtype: str) -> int: + def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. 
""" - return CacheEngine.get_cache_block_size(block_size, cache_dtype, - self.model_config, - self.parallel_config) + return self.cache_config.get_cache_block_size(self.model_config, self.parallel_config) def init_distributed_environment( From 9e5f2fbbeef1eb6aaf1a066546186767ad13928d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 17:03:04 -0700 Subject: [PATCH 020/165] fix test --- tests/worker/test_swap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 5d6ba51ea0f0..bf89eec62b4d 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -22,6 +22,7 @@ def test_swap() -> None: parallel_config=engine_config.parallel_config, scheduler_config=engine_config.scheduler_config, device_config=engine_config.device_config, + cache_config=engine_config.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, From 1a3e26ed81acce3c0cb5982f53525b6f7e8334c2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 17:05:07 -0700 Subject: [PATCH 021/165] fix test --- tests/lora/test_worker.py | 1 + tests/worker/test_swap.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 60aa90fe4ee8..11370b3ea1c6 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -27,6 +27,7 @@ def test_worker_apply_lora(sql_lora_files): parallel_config=ParallelConfig(1, 1, False), scheduler_config=SchedulerConfig(32, 32, 32), device_config=DeviceConfig("cuda"), + cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), local_rank=0, rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index bf89eec62b4d..7b58416257b8 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -32,7 +32,7 @@ def test_swap() -> None: # Initialize the worker. worker.init_device() worker.load_model() - worker.init_cache_engine(engine_config.cache_config) + worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) worker.warm_up_model() # Randomly initialize the cache. 
From cd2015c9a548f56d91b94951d13e30c378d55cd7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:02:03 -0700 Subject: [PATCH 022/165] fix test --- vllm/worker/cache_engine.py | 9 +++++---- vllm/worker/worker.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 011fc69c4b1c..c34ee0648626 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -80,8 +80,9 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) + @staticmethod def get_cache_block_size( - self, + cache_config: CacheConfig, model_config: ModelConfig, parallel_config: ParallelConfig, ) -> int: @@ -89,13 +90,13 @@ def get_cache_block_size( num_heads = model_config.get_num_kv_heads(parallel_config) num_layers = model_config.get_num_layers(parallel_config) - key_cache_block = self.block_size * num_heads * head_size + key_cache_block = cache_config.block_size * num_heads * head_size value_cache_block = key_cache_block total = num_layers * (key_cache_block + value_cache_block) - if self.cache_dtype == "auto": + if cache_config.cache_dtype == "auto": dtype = model_config.dtype else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[self.cache_dtype] + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] dtype_size = _get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5914079f713a..1e051697fa7e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -252,7 +252,8 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return self.cache_config.get_cache_block_size(self.model_config, self.parallel_config) + return CacheEngine.get_cache_block_size( + self.cache_config, self.model_config, self.parallel_config) def init_distributed_environment( From d92603494025e923ea37629486d2255bdda53222 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:07:07 -0700 Subject: [PATCH 023/165] fix --- vllm/executor/ray_gpu_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 80ec36e9a5d9..0855aaec47e7 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -172,7 +172,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, + #kv_cache_dtype=kv_cache_dtype, )) # Initialize the driver worker with the Worker class. 
@@ -188,7 +188,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - kv_cache_dtype=kv_cache_dtype, + #kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) From 607f7e22c5b5999809655dadc8f25b5651a0c107 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:10:57 -0700 Subject: [PATCH 024/165] fix --- vllm/executor/ray_gpu_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 0855aaec47e7..8f7c4d341562 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -272,6 +272,8 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) + return num_gpu_blocks, num_cpu_blocks + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, From e127bb7094a8ad04173167906c9874441578bbc9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:11:41 -0700 Subject: [PATCH 025/165] fix --- vllm/executor/ray_gpu_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 8f7c4d341562..bb93e438c043 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -283,7 +283,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) # Warm up the model. This includes capturing the model into CUDA graph # if enforce_eager is False. From deaa8b059e48b9e229237bf2c6b55cc1368b1fdf Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:13:28 -0700 Subject: [PATCH 026/165] fix --- vllm/executor/cpu_executor.py | 39 ++--------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 44fdddc3f942..40f366f987f0 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -61,45 +61,10 @@ def _init_worker(self): self.driver_worker.load_model() def profile_num_available_blocks(self) -> tuple[int, int]: - num_cpu_blocks = self.driver_worker.get_cpu_cache_block_num( - block_size=self.cache_config.block_size, - cache_space=self.cache_config.cpu_kvcache_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_gpu_blocks = num_cpu_blocks - num_cpu_blocks = 0 - return num_gpu_blocks, num_cpu_blocks - + return self.driver_worker.profile_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - assert num_cpu_blocks == 0 - num_cpu_blocks = num_gpu_blocks - num_gpu_blocks = 0 - self.cache_config.num_gpu_blocks = num_cpu_blocks - self.cache_config.num_cpu_blocks = 0 - - logger.info(f"# CPU blocks: {num_cpu_blocks}") - if num_cpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. 
" - "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_cpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " - "initializing the engine.") - - # Initialize the cache. - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], From 7817d61d25379b5b1d787926c1cb9858042b7159 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:15:07 -0700 Subject: [PATCH 027/165] clean --- vllm/executor/cpu_executor.py | 1 - vllm/executor/ray_gpu_executor.py | 115 +++++++++++++++--------------- vllm/worker/cpu_worker.py | 6 +- 3 files changed, 59 insertions(+), 63 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 40f366f987f0..3c1b2b5e21e8 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -35,7 +35,6 @@ def __init__(self, model_config: ModelConfig, cache_config: CacheConfig, # Instantiate the worker and load the model to CPU. self._init_worker() - #self._init_cache() def _init_worker(self): from vllm.worker.cpu_worker import CPUWorker diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index bb93e438c043..8ffbc64a2cfc 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -65,9 +65,6 @@ def __init__( # Create the parallel GPU workers. self._init_workers_ray(placement_group) - # Profile the memory usage and initialize the cache. - #self._init_cache() - self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() @@ -199,62 +196,62 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. 
- num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") + #def _init_cache(self) -> None: + # """Profiles the memory usage and initializes the KV cache. + + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. + + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. + + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ + # # Get the maximum number of blocks that can be allocated on GPU and CPU. + # num_blocks = self._run_workers( + # "profile_num_available_blocks", + # block_size=self.cache_config.block_size, + # gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + # cpu_swap_space=self.cache_config.swap_space_bytes, + # cache_dtype=self.cache_config.cache_dtype, + # ) + + # # Since we use a shared centralized controller, we take the minimum + # # number of blocks across all workers to make sure all the memory + # # operators can be applied to all workers. + # num_gpu_blocks = min(b[0] for b in num_blocks) + # num_cpu_blocks = min(b[1] for b in num_blocks) + + # if self.cache_config.forced_num_gpu_blocks is not None: + # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + # logger.info(f"Replacing profiled {num_gpu_blocks=} with " + # f"{forced_num_gpu_blocks=}") + # num_gpu_blocks = forced_num_gpu_blocks + + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + + # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + # self.cache_config.num_gpu_blocks = num_gpu_blocks + # self.cache_config.num_cpu_blocks = num_cpu_blocks + + # # Initialize the cache. + # self._run_workers("init_cache_engine", cache_config=self.cache_config) + # # Warm up the model. This includes capturing the model into CUDA graph + # # if enforce_eager is False. + # self._run_workers("warm_up_model") def profile_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 848f36d15abd..1d14bb0bd6d2 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -155,7 +155,7 @@ def __init__( kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). + # initialize_cache. self.cache_config = None self.cache_engine = None self.cpu_cache = None @@ -206,7 +206,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: "initializing the engine.") # Initialize the cache. - self.init_cache_engine(cache_config=self.cache_config) + self._init_cache_engine(cache_config=self.cache_config) def get_cpu_cache_block_num( self, @@ -228,7 +228,7 @@ def get_cpu_cache_block_num( return num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: + def _init_cache_engine(self, cache_config: CacheConfig) -> None: self.cache_config = cache_config self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, From 99823a34607920537920b71c6221d7e3b285cca0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:28:19 -0700 Subject: [PATCH 028/165] clean --- tests/spec_decode/test_spec_decode_worker.py | 2 +- vllm/executor/gpu_executor.py | 20 +++++++++++------- vllm/executor/neuron_executor.py | 14 ------------- vllm/executor/ray_gpu_executor.py | 22 ++++++++++++-------- vllm/worker/worker.py | 9 +++++++- 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d4c15d9aea50..ff7beff40ded 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -513,7 +513,7 @@ def test_init_device(): @torch.inference_mode() def test_initialize_cache(): - """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. """ draft_worker = mock_worker(cls=MultiStepWorker) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 6619cdc15a17..9fb7b0df00aa 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -79,17 +79,21 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: f"# CPU blocks: {num_cpu_blocks}" ) - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #return - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - # Initialize the cache. - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks + + ## Initialize the cache. + #self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self.driver_worker.warm_up_model() + ## Warm up the model. This includes capturing the model into CUDA graph + ## if enforce_eager is False. 
+ #self.driver_worker.warm_up_model() def execute_model(self, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 2626a5d81672..d8cda2ee461c 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -58,22 +58,8 @@ def _init_worker(self): def profile_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.profile_num_available_blocks() - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs - - # Swap not yet supported with Neuron backend. - num_cpu_blocks = 0 - - return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - #assert num_cpu_blocks == 0 - #assert num_gpu_blocks == self.scheduler_config.max_num_seqs - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 8ffbc64a2cfc..a2b571242c6e 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -273,18 +273,22 @@ def profile_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks + #return - # Initialize the cache. - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + # self.model_config.max_model_len) + + #self.cache_config.num_gpu_blocks = num_gpu_blocks + #self.cache_config.num_cpu_blocks = num_cpu_blocks + + ## Initialize the cache. + #self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") + ## Warm up the model. This includes capturing the model into CUDA graph + ## if enforce_eager is False. 
+ #self._run_workers("warm_up_model") def execute_model(self, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 1e051697fa7e..d84d11021a85 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -163,9 +163,16 @@ def profile_num_available_blocks( def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - #self.cache_config = cache_config + + self._init_cache_engine() + self.warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.gpu_cache = self.cache_engine.gpu_cache From 849bfe911f99a761f9163e5c2496d10fb33e416a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:34:52 -0700 Subject: [PATCH 029/165] fix --- vllm/executor/cpu_executor.py | 9 +++++---- vllm/executor/gpu_executor.py | 17 +++-------------- vllm/executor/ray_gpu_executor.py | 18 +++--------------- vllm/worker/cpu_worker.py | 3 ++- vllm/worker/worker.py | 4 +++- 5 files changed, 16 insertions(+), 35 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 3c1b2b5e21e8..e17bdf34a98d 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -45,10 +45,11 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = CPUWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 9fb7b0df00aa..889f1079efc9 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -74,26 +74,15 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. logger.info( f"# GPU blocks: {num_gpu_blocks}, " f"# CPU blocks: {num_cpu_blocks}" ) self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - #return - - - #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) - - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks - - ## Initialize the cache. - #self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - ## Warm up the model. This includes capturing the model into CUDA graph - ## if enforce_eager is False. 
- #self.driver_worker.warm_up_model() def execute_model(self, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index a2b571242c6e..b39d552d62dd 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -273,22 +273,10 @@ def profile_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - - #return - - #raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks - #self.cache_config.num_gpu_blocks = num_gpu_blocks - #self.cache_config.num_cpu_blocks = num_cpu_blocks - - ## Initialize the cache. - #self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - - ## Warm up the model. This includes capturing the model into CUDA graph - ## if enforce_eager is False. - #self._run_workers("warm_up_model") + self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) def execute_model(self, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1d14bb0bd6d2..9decc83af6a9 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -128,6 +128,7 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, local_rank: int, rank: int, distributed_init_method: str, @@ -139,6 +140,7 @@ def __init__( self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -156,7 +158,6 @@ def __init__( is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
- self.cache_config = None self.cache_engine = None self.cpu_cache = None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d84d11021a85..d7fdaf6d1f88 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,9 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import WorkerBase, raise_if_cache_size_invalid + +# TODO move raise_if_cache_size_invalid class Worker(WorkerBase): From 951ba8597dc08994b1484f9f49b226acd8bc373e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:42:46 -0700 Subject: [PATCH 030/165] fix --- tests/spec_decode/utils.py | 1 - tests/worker/test_swap.py | 1 - vllm/executor/ray_gpu_executor.py | 39 +++---------------------------- vllm/worker/worker.py | 7 +++--- 4 files changed, 7 insertions(+), 41 deletions(-) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 5c78b3b780d8..0916d3d49421 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -130,7 +130,6 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - worker.warm_up_model() return worker diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 7b58416257b8..b35bf583ecb4 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -33,7 +33,6 @@ def test_swap() -> None: worker.init_device() worker.load_model() worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - worker.warm_up_model() # Randomly initialize the cache. gpu_cache = worker.cache_engine.gpu_cache diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index b39d552d62dd..e7a52b5830f6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -196,7 +196,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - #def _init_cache(self) -> None: # """Profiles the memory usage and initializes the KV cache. # The engine will first conduct a profiling of the existing memory usage. @@ -217,41 +216,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # You may limit the usage of GPU memory # by adjusting the `gpu_memory_utilization` parameter. # """ - # # Get the maximum number of blocks that can be allocated on GPU and CPU. - # num_blocks = self._run_workers( - # "profile_num_available_blocks", - # block_size=self.cache_config.block_size, - # gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - # cpu_swap_space=self.cache_config.swap_space_bytes, - # cache_dtype=self.cache_config.cache_dtype, - # ) - - # # Since we use a shared centralized controller, we take the minimum - # # number of blocks across all workers to make sure all the memory - # # operators can be applied to all workers. 
- # num_gpu_blocks = min(b[0] for b in num_blocks) - # num_cpu_blocks = min(b[1] for b in num_blocks) - - # if self.cache_config.forced_num_gpu_blocks is not None: - # forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - # logger.info(f"Replacing profiled {num_gpu_blocks=} with " - # f"{forced_num_gpu_blocks=}") - # num_gpu_blocks = forced_num_gpu_blocks - - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - - # check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - # self.model_config.max_model_len) - - # self.cache_config.num_gpu_blocks = num_gpu_blocks - # self.cache_config.num_cpu_blocks = num_cpu_blocks - - # # Initialize the cache. - # self._run_workers("init_cache_engine", cache_config=self.cache_config) - # # Warm up the model. This includes capturing the model into CUDA graph - # # if enforce_eager is False. - # self._run_workers("warm_up_model") def profile_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. @@ -269,6 +233,9 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) + # logger.info(f"# GPU blocks: {num_gpu_blocks}, " + # f"# CPU blocks: {num_cpu_blocks}") + return num_gpu_blocks, num_cpu_blocks diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d7fdaf6d1f88..3dd233159d9b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker_base import WorkerBase, raise_if_cache_size_invalid +from vllm.worker.worker_base import WorkerBase +from vllm.executor.utils import raise_if_cache_size_invalid # TODO move raise_if_cache_size_invalid @@ -171,7 +172,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks self._init_cache_engine() - self.warm_up_model() + self._warm_up_model() def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None @@ -181,7 +182,7 @@ def _init_cache_engine(self): self.model_runner.set_block_size(self.cache_engine.block_size) - def warm_up_model(self) -> None: + def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) # Reset the seed to ensure that the random state is not affected by From 38948df55a2f20c18a57647115709cf3ece6d0ec Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:49:22 -0700 Subject: [PATCH 031/165] speed up cpu test --- tests/conftest.py | 6 +++++- tests/spec_decode/test_batch_expansion.py | 3 +++ tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5c409c8cd5ee..e00f3eb871e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,11 +56,15 @@ def cleanup(): @pytest.fixture() -def should_do_global_cleanup_after_test() -> bool: +def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. This can provide a ~10x speedup for non-GPU unit tests since they don't need to initialize torch. 
""" + + if request.node.get_closest_marker("skip_global_cleanup"): + return False + return True diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 80a960acf0be..43cfd78ddb0c 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize('num_target_seq_ids', [100]) +@pytest.mark.skip_global_cleanup def test_create_target_seq_id_iterator(num_target_seq_ids: int): """Verify all new sequence ids are greater than all input seq ids. @@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): """Verify correct tokens are selected for scoring. """ @@ -53,6 +55,7 @@ def test_get_token_ids_to_score(k: int): @pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.skip_global_cleanup def test_create_single_target_seq_group_metadata(k: int): """Verify correct creation of a batch-expanded seq group metadata. """ diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ff7beff40ded..038de6a48d7a 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,7 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. @@ -511,7 +511,7 @@ def test_init_device(): rejection_sampler.init_gpu_tensors.assert_called_once() -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. 
@@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('available_cpu_blocks', [500]) @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_profile_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, @@ -578,7 +578,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096, 2 * 2 * 8192]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@torch.inference_mode() +@pytest.mark.skip_global_cleanup def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): From 397ec77d77db76d757841ba18da48128d9f918eb Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:54:35 -0700 Subject: [PATCH 032/165] wip --- vllm/worker/cpu_worker.py | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 9decc83af6a9..65f90fcbd86b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -170,11 +170,12 @@ def load_model(self): self.model_runner.load_model() def profile_num_available_blocks(self) -> tuple[int, int]: - num_cpu_blocks = self.get_cpu_cache_block_num( - block_size=self.cache_config.block_size, - cache_space=self.cache_config.cpu_kvcache_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) + # For CPU device, the block number will be calculated based on the + # cpu_kvcache_space. + cache_block_size = CPUCacheEngine.get_cache_block_size( + self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) + num_cpu_blocks = max(num_cpu_blocks, 0) # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. @@ -209,26 +210,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: # Initialize the cache. self._init_cache_engine(cache_config=self.cache_config) - def get_cpu_cache_block_num( - self, - block_size: int, - cache_space: int, - cache_dtype: str, - ) -> int: - """ - Args: - block_size: The size of the cache block. - cache_space: The size of the CPU KV cache space in bytes. - """ - # For CPU device, the block number will be calculated based on the - # cpu_kvcache_space. 
- cache_block_size = CPUCacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) - num_cpu_blocks = int(cache_space // cache_block_size) - num_cpu_blocks = max(num_cpu_blocks, 0) - - return num_cpu_blocks - def _init_cache_engine(self, cache_config: CacheConfig) -> None: self.cache_config = cache_config self.cache_engine = CPUCacheEngine(self.cache_config, From 23382b955b1a84c99a3ec169f14f05d0d2d3c4fe Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 18:59:27 -0700 Subject: [PATCH 033/165] wip --- vllm/executor/cpu_executor.py | 1 + vllm/worker/cpu_worker.py | 6 ++---- vllm/worker/worker.py | 10 ++-------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index e17bdf34a98d..c307d08ae0d7 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -64,6 +64,7 @@ def profile_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.profile_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 65f90fcbd86b..781501dc610d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -192,7 +192,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 - logger.info(f"# CPU blocks: {num_cpu_blocks}") if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " @@ -208,10 +207,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: "initializing the engine.") # Initialize the cache. - self._init_cache_engine(cache_config=self.cache_config) + self._init_cache_engine() - def _init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + def _init_cache_engine(self) -> None: self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, self.parallel_config, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3dd233159d9b..2fc89635112a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -125,12 +125,6 @@ def profile_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ - - block_size = self.cache_config.block_size - gpu_memory_utilization = self.cache_config.gpu_memory_utilization - cpu_swap_space = self.cache_config.swap_space_bytes - cache_dtype = self.cache_config.cache_dtype - # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() @@ -152,9 +146,9 @@ def profile_num_available_blocks( cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( - (total_gpu_memory * gpu_memory_utilization - peak_memory) // + (total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: From 7a0294cd0e47b618ac170275dc69fde532a0992d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:03:02 -0700 Subject: [PATCH 034/165] clean --- vllm/engine/llm_engine.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 23a952b4101a..155b65e74434 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -127,19 +127,7 @@ def __init__( speculative_config=speculative_config, ) - # TODO cleanup location - num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() - - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) + self._initialize_kv_caches() # If usage stat is enabled, collect relevant info. if is_usage_stats_enabled(): @@ -192,6 +180,20 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + def _initialize_kv_caches(self) -> None: + num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() + + if self.cache_config.forced_num_gpu_blocks is not None: + forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks + logger.info(f"Replacing profiled {num_gpu_blocks=} with " + f"{forced_num_gpu_blocks=}") + num_gpu_blocks = forced_num_gpu_blocks + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) + @classmethod def from_engine_args( cls, From dcdca688de21f994faa24dafd9ac6cb9455f2461 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:08:38 -0700 Subject: [PATCH 035/165] wip --- vllm/executor/ray_gpu_executor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e7a52b5830f6..637581b53f1f 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -233,13 +233,17 @@ def profile_num_available_blocks(self) -> tuple[int, int]: num_gpu_blocks = min(b[0] for b in num_blocks) num_cpu_blocks = min(b[1] for b in num_blocks) - # logger.info(f"# GPU blocks: {num_gpu_blocks}, " - # f"# CPU blocks: {num_cpu_blocks}") - return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. 
+ logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks From ed58af224b35e516d7e8ff3316a744d2d4c9f4c3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:15:43 -0700 Subject: [PATCH 036/165] remove --- tests/spec_decode/test_spec_decode_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 038de6a48d7a..c4dfbb5dc00c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,7 +487,6 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) -@pytest.mark.skip_global_cleanup def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. From df8688e0cad205a4690a4d0f680ccf994959b350 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:12 -0700 Subject: [PATCH 037/165] Revert "more test speedup" This reverts commit 4c486f9bb4fc3b90efc1765ba46f4a666d1c9339. --- tests/conftest.py | 6 +----- tests/spec_decode/test_batch_expansion.py | 3 --- tests/spec_decode/test_spec_decode_worker.py | 5 +++-- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e00f3eb871e3..5c409c8cd5ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,15 +56,11 @@ def cleanup(): @pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: +def should_do_global_cleanup_after_test() -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. This can provide a ~10x speedup for non-GPU unit tests since they don't need to initialize torch. """ - - if request.node.get_closest_marker("skip_global_cleanup"): - return False - return True diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0c..80a960acf0be 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize('num_target_seq_ids', [100]) -@pytest.mark.skip_global_cleanup def test_create_target_seq_id_iterator(num_target_seq_ids: int): """Verify all new sequence ids are greater than all input seq ids. @@ -28,7 +27,6 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): @pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): """Verify correct tokens are selected for scoring. """ @@ -55,7 +53,6 @@ def test_get_token_ids_to_score(k: int): @pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup def test_create_single_target_seq_group_metadata(k: int): """Verify correct creation of a batch-expanded seq group metadata. """ diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index c4dfbb5dc00c..c7b11f7bbf68 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -487,6 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int): **execute_model_data.to_dict()) +@torch.inference_mode() def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. 
@@ -537,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('available_cpu_blocks', [500]) @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup +@torch.inference_mode() def test_profile_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, @@ -577,7 +578,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096, 2 * 2 * 8192]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup +@torch.inference_mode() def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): From 55a5203484b1861ca91ecf661decb771d6c5603d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:38 -0700 Subject: [PATCH 038/165] wip --- tests/spec_decode/test_spec_decode_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index c7b11f7bbf68..8d33fa2f1e38 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -511,7 +511,6 @@ def test_init_device(): rejection_sampler.init_gpu_tensors.assert_called_once() -@pytest.mark.skip_global_cleanup def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. From 55d083bf1a761f20e4cf089283a5657282e118e7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:19:56 -0700 Subject: [PATCH 039/165] wip --- tests/spec_decode/test_spec_decode_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 8d33fa2f1e38..218704b4224a 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -510,7 +510,7 @@ def test_init_device(): metrics_collector.init_gpu_tensors.assert_called_once() rejection_sampler.init_gpu_tensors.assert_called_once() - +@torch.inference_mode() def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer workers. From 0814d245e896309ce6ca85214e391c9e99225dc3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:28:55 -0700 Subject: [PATCH 040/165] wip --- vllm/entrypoints/llm.py | 1 - vllm/executor/executor_base.py | 8 -------- vllm/executor/gpu_executor.py | 1 - vllm/executor/neuron_executor.py | 7 ------- 4 files changed, 17 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b079d7c117d8..5777e8179a1c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -109,7 +109,6 @@ def __init__( disable_custom_all_reduce=disable_custom_all_reduce, **kwargs, ) - self.llm_engine = LLMEngine.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 9dd372156b9f..b575d238696f 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -32,14 +32,6 @@ def __init__( raise NotImplementedError - #@abstractmethod - #def init_workers(self) -> None: - # """Initialize workers, such as loading the model or preparing on-device - # tensors. 
- # """ - # raise NotImplementedError - - @abstractmethod def profile_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 889f1079efc9..f138258ec83a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -62,7 +62,6 @@ def _init_worker(self): distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - #kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) self.driver_worker.init_device() diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index d8cda2ee461c..5290bbd8a82c 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -25,7 +25,6 @@ def __init__( speculative_config: Optional[SpeculativeConfig], ) -> None: self.model_config = model_config - #self.cache_config = cache_config assert lora_config is None, "LoRA is not supported for Neuron backend." self.parallel_config = parallel_config self.scheduler_config = scheduler_config @@ -33,12 +32,6 @@ def __init__( assert (not speculative_config ), "Speculative decoding not yet supported for Neuron backend." - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - #self.cache_config.num_gpu_blocks = self.scheduler_config.max_num_seqs - #self.cache_config.num_cpu_blocks = 0 - # Instantiate the worker and load the model to the device. self._init_worker() From b18d00c6c0d7dbbda13768e12122f3e958b61667 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:35:18 -0700 Subject: [PATCH 041/165] rename profile_num_available_blocks to get_max_allowed_kv_blocks --- tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- vllm/engine/llm_engine.py | 2 +- vllm/executor/cpu_executor.py | 4 ++-- vllm/executor/executor_base.py | 2 +- vllm/executor/gpu_executor.py | 4 ++-- vllm/executor/neuron_executor.py | 4 ++-- vllm/executor/ray_gpu_executor.py | 6 +++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 2 +- vllm/worker/worker.py | 2 +- vllm/worker/worker_base.py | 2 +- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 218704b4224a..e1dc33e8babc 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) @torch.inference_mode() -def test_profile_num_available_blocks(available_gpu_blocks: int, +def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): @@ -552,7 +552,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.profile_num_available_blocks.return_value = ( + target_worker.get_max_allowed_kv_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) target_worker.get_cache_block_size_bytes.return_value = ( 
target_cache_block_size_bytes) @@ -562,9 +562,9 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = worker.get_max_allowed_kv_blocks() - target_worker.profile_num_available_blocks.assert_called_once() + target_worker.get_max_allowed_kv_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 155b65e74434..4974cca23c48 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,7 +181,7 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.profile_num_available_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.get_max_allowed_kv_blocks() if self.cache_config.forced_num_gpu_blocks is not None: forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index c307d08ae0d7..42f773e1defa 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -60,8 +60,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def profile_num_available_blocks(self) -> tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: {num_cpu_blocks}") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index b575d238696f..5953aa3f4bde 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -33,7 +33,7 @@ def __init__( @abstractmethod - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
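For orientation while reading these renames, here is a minimal sketch of the
two-phase flow the executor interface converges on (method and attribute names
follow the later patches in this series; this helper is illustrative and not
part of any diff above):

    def initialize_kv_caches(executor, cache_config):
        # Phase 1: profile the device to learn how many KV blocks fit.
        num_gpu_blocks, num_cpu_blocks = (
            executor.determine_num_available_blocks())
        # An explicit override, if configured, wins over the profiled value.
        if cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks = cache_config.num_gpu_blocks_override
        cache_config.num_gpu_blocks = num_gpu_blocks
        cache_config.num_cpu_blocks = num_cpu_blocks
        # Phase 2: allocate the KV caches with the agreed block counts.
        executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)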
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index f138258ec83a..f30ec45d3e4e 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -68,8 +68,8 @@ def _init_worker(self): self.driver_worker.load_model() - def profile_num_available_blocks(self) -> Tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 5290bbd8a82c..82487a065d69 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,8 +48,8 @@ def _init_worker(self): self.driver_worker.load_model() # TODO change name - def profile_num_available_blocks(self) -> tuple[int, int]: - return self.driver_worker.profile_num_available_blocks() + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + return self.driver_worker.get_max_allowed_kv_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 637581b53f1f..ca84485af0ca 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -202,7 +202,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Then, it calculate the maximum possible number of GPU and CPU blocks # that can be allocated with the remaining free memory. # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + # :meth:`~vllm.worker.worker.Worker.get_max_allowed_kv_blocks` method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, @@ -217,10 +217,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # by adjusting the `gpu_memory_utilization` parameter. # """ - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( - "profile_num_available_blocks", + "get_max_allowed_kv_blocks", block_size=self.cache_config.block_size, gpu_memory_utilization=self.cache_config.gpu_memory_utilization, cpu_swap_space=self.cache_config.swap_space_bytes, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 659acc6620bc..863eccb47216 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -94,7 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def profile_num_available_blocks(self) -> Tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. This is done by profiling the scorer model (which is typically the @@ -102,7 +102,7 @@ def profile_num_available_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. 
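        For intuition, one illustrative way to realize that even split (the
        actual division is done by the split_num_cache_blocks_evenly helper;
        the expression and variable names below are a sketch, not a quote of
        it) is to scale the profiled count by the scorer's share of the
        combined per-block size:

            new_num_gpu_blocks = int(
                num_gpu_blocks * scorer_cache_block_size_bytes /
                (scorer_cache_block_size_bytes +
                 proposer_cache_block_size_bytes))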
""" - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.profile_num_available_blocks()) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.get_max_allowed_kv_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 781501dc610d..db238e81a5f6 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -169,7 +169,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = CPUCacheEngine.get_cache_block_size( diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 16e9a128d024..7ba8c2c754e3 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -41,7 +41,7 @@ def load_model(self): self.model_runner.load_model() # TODO change name - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2fc89635112a..a4aea636a4d9 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -113,7 +113,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def profile_num_available_blocks( + def get_max_allowed_kv_blocks( self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index cf611eb2b88a..1708795b0176 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,7 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def profile_num_available_blocks(self) -> tuple[int, int]: + def get_max_allowed_kv_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. 
From 8fb7b9a45812e16939539f0c155503fde0b0ad1c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:37:58 -0700 Subject: [PATCH 042/165] rename again --- tests/spec_decode/test_spec_decode_worker.py | 8 ++++---- vllm/engine/llm_engine.py | 2 +- vllm/executor/cpu_executor.py | 4 ++-- vllm/executor/executor_base.py | 2 +- vllm/executor/gpu_executor.py | 4 ++-- vllm/executor/neuron_executor.py | 5 ++--- vllm/executor/ray_gpu_executor.py | 6 +++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/worker/cpu_worker.py | 2 +- vllm/worker/neuron_worker.py | 3 +-- vllm/worker/worker.py | 2 +- vllm/worker/worker_base.py | 2 +- 12 files changed, 21 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e1dc33e8babc..511d600199a0 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -538,7 +538,7 @@ def test_initialize_cache(): @pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) @torch.inference_mode() -def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, +def test_determine_num_available_blocks(available_gpu_blocks: int, available_cpu_blocks: int, target_cache_block_size_bytes: int, draft_kv_size_bytes: int): @@ -552,7 +552,7 @@ def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.get_max_allowed_kv_blocks.return_value = ( + target_worker.determine_num_available_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) target_worker.get_cache_block_size_bytes.return_value = ( target_cache_block_size_bytes) @@ -562,9 +562,9 @@ def test_get_max_allowed_kv_blocks(available_gpu_blocks: int, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.get_max_allowed_kv_blocks() + num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() - target_worker.get_max_allowed_kv_blocks.assert_called_once() + target_worker.determine_num_available_blocks.assert_called_once() assert num_cpu_blocks == available_cpu_blocks assert num_gpu_blocks == split_num_cache_blocks_evenly( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4974cca23c48..ad037cf2e79b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,7 +181,7 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.get_max_allowed_kv_blocks() + num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() if self.cache_config.forced_num_gpu_blocks is not None: forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 42f773e1defa..b78f6d993453 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -60,8 +60,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: 
{num_cpu_blocks}") diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 5953aa3f4bde..757549bdedbe 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -33,7 +33,7 @@ def __init__( @abstractmethod - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index f30ec45d3e4e..e586cf810d78 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -68,8 +68,8 @@ def _init_worker(self): self.driver_worker.load_model() - def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> Tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 82487a065d69..b907fd472704 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -47,9 +47,8 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - # TODO change name - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: - return self.driver_worker.get_max_allowed_kv_blocks() + def determine_num_available_blocks(self) -> tuple[int, int]: + return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index ca84485af0ca..ca851dfc462b 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -202,7 +202,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Then, it calculate the maximum possible number of GPU and CPU blocks # that can be allocated with the remaining free memory. # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.get_max_allowed_kv_blocks` method + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, @@ -217,10 +217,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # by adjusting the `gpu_memory_utilization` parameter. # """ - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( - "get_max_allowed_kv_blocks", + "determine_num_available_blocks", block_size=self.cache_config.block_size, gpu_memory_utilization=self.cache_config.gpu_memory_utilization, cpu_swap_space=self.cache_config.swap_space_bytes, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 863eccb47216..5f03b1edc07a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -94,7 +94,7 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) - def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. 
This is done by profiling the scorer model (which is typically the @@ -102,7 +102,7 @@ def get_max_allowed_kv_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. """ - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.get_max_allowed_kv_blocks()) + num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.determine_num_available_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index db238e81a5f6..5250c15330e4 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -169,7 +169,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = CPUCacheEngine.get_cache_block_size( diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 7ba8c2c754e3..dab70d884db4 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -40,8 +40,7 @@ def init_device(self) -> None: def load_model(self): self.model_runner.load_model() - # TODO change name - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a4aea636a4d9..10396262101b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -113,7 +113,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def get_max_allowed_kv_blocks( + def determine_num_available_blocks( self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1708795b0176..6bb605d954e1 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -14,7 +14,7 @@ def init_device(self) -> None: raise NotImplementedError @abstractmethod - def get_max_allowed_kv_blocks(self) -> tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV blocks that can be allocated. From 3bb9e6f187d4745168005f9e995b7d45375a5429 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:39:57 -0700 Subject: [PATCH 043/165] rename --- tests/core/block/e2e/test_correctness.py | 6 +++--- vllm/config.py | 6 +++--- vllm/engine/arg_utils.py | 6 +++--- vllm/engine/llm_engine.py | 10 +++++----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 5a7f828456e2..94b65401e1dd 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -16,7 +16,7 @@ # Allow only 5 sequences of ~1024 tokens in worst case. 
"block_size": 16, - "forced_num_gpu_blocks": 5 * (64 + 1), + "num_gpu_blocks_override": 5 * (64 + 1), }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ @@ -162,14 +162,14 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, # Allow only 2 sequences of ~128 tokens in worst case. # Note 8 = 128/block_size - "forced_num_gpu_blocks": 2 * (8 + 1), + "num_gpu_blocks_override": 2 * (8 + 1), }, { "block_size": 8, # Allow only 2 sequences of ~128 tokens in worst case. # Note 16 = 128/block_size - "forced_num_gpu_blocks": 2 * (16 + 1), + "num_gpu_blocks_override": 2 * (16 + 1), } ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ diff --git a/vllm/config.py b/vllm/config.py index e27c8eb4fd25..5730997f639d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -334,7 +334,7 @@ class CacheConfig: vLLM execution. swap_space: Size of the CPU swap space per GPU (in GiB). cache_dtype: Data type for kv cache storage. - forced_num_gpu_blocks: Number of GPU blocks to use. This overrides the + num_gpu_blocks_override: Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified. Does nothing if None. """ @@ -344,14 +344,14 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, - forced_num_gpu_blocks: Optional[int] = None, + num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB - self.forced_num_gpu_blocks = forced_num_gpu_blocks + self.num_gpu_blocks_override = num_gpu_blocks_override self.cache_dtype = cache_dtype self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a6197942645e..d4b573992c06 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -55,7 +55,7 @@ class EngineArgs: max_cpu_loras: Optional[int] = None device: str = 'auto' ray_workers_use_nsight: bool = False - forced_num_gpu_blocks: Optional[int] = None + num_gpu_blocks_override: Optional[int] = None num_lookahead_slots: int = 0 # Related to Vision-language models such as llava @@ -246,7 +246,7 @@ def add_cli_args( 'the model executor, which can range from 0 to 1.' 
'If unspecified, will use the default value of 0.9.') parser.add_argument( - '--forced-num-gpu-blocks', + '--num-gpu-blocks-override', type=int, default=None, help='If specified, ignore GPU profiling result and use this number' @@ -426,7 +426,7 @@ def create_engine_config(self, ) -> EngineConfig: cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - self.forced_num_gpu_blocks, + self.num_gpu_blocks_override, model_config.get_sliding_window(), self.enable_prefix_caching) parallel_config = ParallelConfig( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ad037cf2e79b..2e50dff02a01 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -183,11 +183,11 @@ def __init__( def _initialize_kv_caches(self) -> None: num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() - if self.cache_config.forced_num_gpu_blocks is not None: - forced_num_gpu_blocks = self.cache_config.forced_num_gpu_blocks - logger.info(f"Replacing profiled {num_gpu_blocks=} with " - f"{forced_num_gpu_blocks=}") - num_gpu_blocks = forced_num_gpu_blocks + if self.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override + logger.info(f"Overriding {num_gpu_blocks=} with " + f"{num_gpu_blocks_override=}") + num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks From edad09c2627c558a1b0567ff832fb4b7dd753499 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:44:20 -0700 Subject: [PATCH 044/165] wip --- vllm/engine/llm_engine.py | 1 - vllm/executor/gpu_executor.py | 1 - vllm/executor/ray_gpu_executor.py | 28 ---------------------- vllm/executor/utils.py | 28 ---------------------- vllm/worker/worker.py | 39 ++++++++++++++++++++++++++++--- 5 files changed, 36 insertions(+), 61 deletions(-) delete mode 100644 vllm/executor/utils.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2e50dff02a01..d2f3f3aae42c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -228,7 +228,6 @@ def from_engine_args( log_stats=not engine_args.disable_log_stats, usage_context=usage_context, ) - return engine def __reduce__(self): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index e586cf810d78..4c936fb81f2a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -4,7 +4,6 @@ ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index ca851dfc462b..3647a46ef527 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -10,7 +10,6 @@ VisionLanguageConfig) from vllm.engine.ray_utils import RayWorkerVllm, ray from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid, raise_if_cache_size_invalid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -169,7 +168,6 @@ def _init_workers_ray(self, 
placement_group: "PlacementGroup", rank, distributed_init_method, lora_config=lora_config, - #kv_cache_dtype=kv_cache_dtype, )) # Initialize the driver worker with the Worker class. @@ -185,7 +183,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, - #kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) @@ -196,35 +193,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers, ) - # """Profiles the memory usage and initializes the KV cache. - - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method - # from class :class:`~vllm.worker.Worker`. - - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. - - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ - def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers( "determine_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, ) # Since we use a shared centralized controller, we take the minimum diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py deleted file mode 100644 index 89fe04434062..000000000000 --- a/vllm/executor/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# TODO -def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 10396262101b..7c0af623be98 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -20,9 +20,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.worker.worker_base import WorkerBase -from vllm.executor.utils import raise_if_cache_size_invalid - -# TODO move raise_if_cache_size_invalid class Worker(WorkerBase): @@ -125,6 +122,27 @@ def determine_num_available_blocks( gpu_memory_utilization: The fraction of the total GPU memory to use. cpu_swap_space: The size of the CPU swap space in bytes. """ + + # """Profiles the memory usage and initializes the KV cache. + + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. + + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. + + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. + + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() @@ -327,3 +345,18 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): f"{compute_capability[0]}.{compute_capability[1]}. " "You can use float16 instead by explicitly setting the" "`dtype` flag in CLI, for example: --dtype=half.") + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). 
Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From f93c845872d250f6137f92ee2660baac43972433 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:45:24 -0700 Subject: [PATCH 045/165] wip --- vllm/executor/gpu_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 4c936fb81f2a..066502f9dc54 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -67,7 +67,7 @@ def _init_worker(self): self.driver_worker.load_model() - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() From d2d22186f05b061e131c3737174fcf49e06d7976 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:51:24 -0700 Subject: [PATCH 046/165] wip --- vllm/worker/cpu_worker.py | 8 +++++-- vllm/worker/neuron_worker.py | 3 +++ vllm/worker/worker.py | 43 ++++++++++++++---------------------- vllm/worker/worker_base.py | 4 ++++ 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 5250c15330e4..4e51a8f10f4f 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -172,8 +172,7 @@ def load_model(self): def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. - cache_block_size = CPUCacheEngine.get_cache_block_size( - self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes() num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) num_cpu_blocks = max(num_cpu_blocks, 0) @@ -299,3 +298,8 @@ def init_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + + def get_cache_block_size_bytes(self) -> int: + return CPUCacheEngine.get_cache_block_size( + self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) + diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index dab70d884db4..28bd10db72e5 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -70,3 +70,6 @@ def execute_model( output = self.model_runner.execute_model(seq_group_metadata_list) return output + + def get_cache_block_size_bytes(self) -> int: + raise NotImplementedError diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7c0af623be98..77cf5c180a27 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -42,7 +42,6 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, - #kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, ) -> None: self.model_config = model_config @@ -73,8 +72,7 @@ def __init__( is_driver_worker=is_driver_worker, vision_language_config=vision_language_config) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). 
- #self.cache_config = None + # initialize_cache. self.cache_engine = None self.gpu_cache = None @@ -115,34 +113,28 @@ def determine_num_available_blocks( ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. - - Args: - # TODO - block_size: The size of the cache block. - gpu_memory_utilization: The fraction of the total GPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. """ - # """Profiles the memory usage and initializes the KV cache. + # """Profiles the memory usage and initializes the KV cache. - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method - # from class :class:`~vllm.worker.Worker`. + # The engine will first conduct a profiling of the existing memory usage. + # Then, it calculate the maximum possible number of GPU and CPU blocks + # that can be allocated with the remaining free memory. + # More details can be found in the + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # from class :class:`~vllm.worker.Worker`. - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. + # Afterwards, as there may be multiple workers, + # we take the minimum number of blocks across all workers + # to ensure this can be applied to all of them. - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. + # Finally, the engine will initialize the KV cache + # with the calculated number of blocks. - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ + # .. tip:: + # You may limit the usage of GPU memory + # by adjusting the `gpu_memory_utilization` parameter. + # """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
torch.cuda.empty_cache() @@ -173,7 +165,6 @@ def determine_num_available_blocks( self.model_runner.remove_all_loras() gc.collect() torch.cuda.empty_cache() - return num_gpu_blocks, num_cpu_blocks diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6bb605d954e1..42de84ab68f2 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -48,6 +48,10 @@ def execute_model(self, """Executes one model step on the given sequences.""" raise NotImplementedError + @abstractmethod + def get_cache_block_size_bytes() -> int: + raise NotImplementedError + @abstractmethod def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError From 2f960e7d7d0a9c7c6af8ee931a61c8368608e94d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 19:55:04 -0700 Subject: [PATCH 047/165] lint --- tests/lora/test_worker.py | 7 +++-- tests/spec_decode/test_spec_decode_worker.py | 12 ++++----- tests/spec_decode/utils.py | 4 ++- tests/worker/test_swap.py | 4 ++- vllm/engine/llm_engine.py | 7 ++--- vllm/executor/cpu_executor.py | 3 ++- vllm/executor/executor_base.py | 8 +++--- vllm/executor/gpu_executor.py | 9 ++----- vllm/executor/neuron_executor.py | 7 ++--- vllm/executor/ray_gpu_executor.py | 14 +++++----- vllm/spec_decode/spec_decode_worker.py | 12 ++++++--- vllm/worker/cpu_worker.py | 10 ++++--- vllm/worker/neuron_worker.py | 3 ++- vllm/worker/worker.py | 28 +++++++++++--------- vllm/worker/worker_base.py | 9 ++++--- 15 files changed, 74 insertions(+), 63 deletions(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 11370b3ea1c6..3fd7d000d31b 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -4,7 +4,7 @@ from unittest.mock import patch from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig) + SchedulerConfig, CacheConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -27,7 +27,10 @@ def test_worker_apply_lora(sql_lora_files): parallel_config=ParallelConfig(1, 1, False), scheduler_config=SchedulerConfig(32, 32, 32), device_config=DeviceConfig("cuda"), - cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), + cache_config=CacheConfig(block_size=16, + gpu_memory_utilization=1., + swap_space=0, + cache_dtype="auto"), local_rank=0, rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 511d600199a0..3c513e5d881f 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -510,6 +510,7 @@ def test_init_device(): metrics_collector.init_gpu_tensors.assert_called_once() rejection_sampler.init_gpu_tensors.assert_called_once() + @torch.inference_mode() def test_initialize_cache(): """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer @@ -524,9 +525,7 @@ def test_initialize_cache(): worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - cache_config = MagicMock() - - kwargs = {"num_gpu_blocks":1024, "num_cpu_blocks": 1023} + kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023} worker.initialize_cache(**kwargs) draft_worker.initialize_cache.assert_called_once_with(**kwargs) @@ -539,9 +538,9 @@ def test_initialize_cache(): @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) @torch.inference_mode() 
def test_determine_num_available_blocks(available_gpu_blocks: int, - available_cpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int): + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): """Verify SpecDecodeWorker correctly profiles num available GPU blocks. Specifically, it should run profiling in the scorer worker, and then evenly split the blocks between proposer and scorer worker. @@ -561,7 +560,6 @@ def test_determine_num_available_blocks(available_gpu_blocks: int, worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, metrics_collector) - num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() target_worker.determine_num_available_blocks.assert_called_once() diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 0916d3d49421..4637826f254d 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -129,7 +129,9 @@ def create_worker(cls: type, engine_config.cache_config.num_gpu_blocks = num_gpu_blocks engine_config.cache_config.num_cpu_blocks = 0 - worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) + worker.initialize_cache( + num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, + num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) return worker diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index b35bf583ecb4..893637d92f85 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -32,7 +32,9 @@ def test_swap() -> None: # Initialize the worker. worker.init_device() worker.load_model() - worker.initialize_cache(num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) + worker.initialize_cache( + num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, + num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) # Randomly initialize the cache. 
gpu_cache = worker.cache_engine.gpu_cache diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d2f3f3aae42c..57be4835e5be 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,12 +181,13 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = self.model_executor.determine_num_available_blocks() - + num_gpu_blocks, num_cpu_blocks = (self.model_executor.determine_num_available_blocks( + )) + if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override logger.info(f"Overriding {num_gpu_blocks=} with " - f"{num_gpu_blocks_override=}") + f"{num_gpu_blocks_override=}") num_gpu_blocks = num_gpu_blocks_override self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index b78f6d993453..f44667f5112c 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -63,7 +63,8 @@ def _init_worker(self): def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 757549bdedbe..63c3766b6221 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional -from dataclasses import dataclass from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, @@ -31,7 +30,6 @@ def __init__( ) -> None: raise NotImplementedError - @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: """Profile the model on-device to determine the maximum number of KV @@ -50,12 +48,12 @@ def determine_num_available_blocks(self) -> tuple[int, int]: """ raise NotImplementedError - @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine - the maxmimum allowed KV cache size. + the maximum allowed KV cache size. """ raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 066502f9dc54..caedea97dc6d 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -66,23 +66,18 @@ def _init_worker(self): self.driver_worker.init_device() self.driver_worker.load_model() - def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. 
- logger.info( - f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}" - ) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index b907fd472704..d9f52adc49f6 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -50,7 +50,8 @@ def _init_worker(self): def determine_num_available_blocks(self) -> tuple[int, int]: return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, @@ -70,10 +71,10 @@ def add_lora(self, lora_request: LoRARequest) -> bool: return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_request) + return self.driver_worker.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self.driver_worker.list_loras(lora_request) + return self.driver_worker.list_loras() def check_health(self) -> None: # NeuronExecutor will always be healthy as long as diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 3647a46ef527..e71f0a4b7b82 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -150,7 +150,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype # Initialize the actual workers with the Worker class. for rank, (worker, (node_id, _)) in enumerate( @@ -195,9 +194,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", def determine_num_available_blocks(self) -> tuple[int, int]: # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "determine_num_available_blocks", - ) + num_blocks = self._run_workers("determine_num_available_blocks", ) # Since we use a shared centralized controller, we take the minimum # number of blocks across all workers to make sure all the memory @@ -207,8 +204,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. 
We could log in the engine, but not all executors @@ -219,8 +216,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._run_workers("initialize_cache", num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 5f03b1edc07a..a13748fd9405 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -102,7 +102,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: scorer cache is divided evenly between the proposer and scorer model KV, such that the number of blocks is equal in both KV caches. """ - num_gpu_blocks, num_cpu_blocks = (self.scorer_worker.determine_num_available_blocks()) + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.determine_num_available_blocks()) scorer_cache_block_size_bytes = ( self.scorer_worker.get_cache_block_size_bytes()) @@ -114,12 +115,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_gpu_blocks) return new_num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. TODO """ - self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) - self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) @torch.inference_mode() def execute_model( diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 4e51a8f10f4f..bb611b4b173f 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -173,7 +173,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = self.get_cache_block_size_bytes() - num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // cache_block_size) + num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // + cache_block_size) num_cpu_blocks = max(num_cpu_blocks, 0) # Note: To reuse the cache management procedure, @@ -182,7 +183,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. 
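
A compact sketch of the accounting described in the note above: the CPU worker's block budget is computed from the configured CPU KV cache space and then reported in the "gpu" slot so the existing scheduler can be reused unchanged. The helper name and byte values below are illustrative only.

def cpu_worker_block_counts(cpu_kvcache_space_bytes: int,
                            cache_block_size_bytes: int) -> tuple[int, int]:
    num_cpu_blocks = max(cpu_kvcache_space_bytes // cache_block_size_bytes, 0)

    # Report the CPU budget as "gpu" blocks and disable swap space, mirroring
    # the variable swap in CPUWorker above.
    num_gpu_blocks, num_cpu_blocks = num_cpu_blocks, 0
    return num_gpu_blocks, num_cpu_blocks


if __name__ == "__main__":
    # e.g. 4 GiB of CPU KV cache space and 2 MiB blocks -> (2048, 0)
    print(cpu_worker_block_counts(4 * 1024**3, 2 * 1024**2))
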
assert num_cpu_blocks == 0 @@ -301,5 +303,5 @@ def init_distributed_environment(self) -> None: def get_cache_block_size_bytes(self) -> int: return CPUCacheEngine.get_cache_block_size( - self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) - + self.cache_config.block_size, self.cache_config.cache_dtype, + self.model_config, self.parallel_config) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 28bd10db72e5..d862600c5c93 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -51,7 +51,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: assert num_cpu_blocks == 0 assert num_gpu_blocks == self.scheduler_config.max_num_seqs self.cache_config.num_gpu_blocks = num_gpu_blocks diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 77cf5c180a27..4a273347927a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -108,9 +108,7 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks( - self, - ) -> Tuple[int, int]: + def determine_num_available_blocks(self, ) -> Tuple[int, int]: """Profiles the peak memory usage of the model and returns the maximum number of GPU and CPU cache blocks that can be allocated. """ @@ -156,9 +154,10 @@ def determine_num_available_blocks( cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -167,9 +166,11 @@ def determine_num_available_blocks( torch.cuda.empty_cache() return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -184,7 +185,6 @@ def _init_cache_engine(self): self.gpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) - def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) @@ -265,8 +265,9 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. 
""" - return CacheEngine.get_cache_block_size( - self.cache_config, self.model_config, self.parallel_config) + return CacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) def init_distributed_environment( @@ -338,7 +339,8 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): "`dtype` flag in CLI, for example: --dtype=half.") -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: if num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 42de84ab68f2..4675dbd4b314 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,11 +1,12 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Dict, List from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata class WorkerBase(ABC): + @abstractmethod def init_device(self) -> None: """Initialize device state, such as loading the model or other on-device @@ -32,10 +33,11 @@ def determine_num_available_blocks(self) -> tuple[int, int]: raise NotImplementedError @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: """Given a fully-specified cache config, initialize the KV cache. This is separate from init_workers as profiling may be required to determine - the maxmimum allowed KV cache size. + the maximum allowed KV cache size. """ raise NotImplementedError @@ -66,6 +68,7 @@ def list_loras(self) -> List[int]: class LoraNotSupportedWorkerBase(WorkerBase): + def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From 68552e105c997892ff2ea65128025bd1c90f5fb0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:06:31 -0700 Subject: [PATCH 048/165] wip --- vllm/engine/llm_engine.py | 4 ++-- vllm/worker/worker.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 57be4835e5be..fc2d476cf534 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,8 +181,8 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: - num_gpu_blocks, num_cpu_blocks = (self.model_executor.determine_num_available_blocks( - )) + num_gpu_blocks, num_cpu_blocks = ( + self.model_executor.determine_num_available_blocks()) if self.cache_config.num_gpu_blocks_override is not None: num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 4a273347927a..24c5ab6ff6c1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -115,11 +115,14 @@ def determine_num_available_blocks(self, ) -> Tuple[int, int]: # """Profiles the memory usage and initializes the KV cache. - # The engine will first conduct a profiling of the existing memory usage. - # Then, it calculate the maximum possible number of GPU and CPU blocks + # The engine will first conduct a profiling of the existing memory + # usage. + # Then, it calculate the maximum possible number of GPU and CPU + # blocks # that can be allocated with the remaining free memory. 
# More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` method + # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` + # method # from class :class:`~vllm.worker.Worker`. # Afterwards, as there may be multiple workers, From 42983ba1617aab7aa9b6ab5fb0a90e71d0b7c7c9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:09:35 -0700 Subject: [PATCH 049/165] import order --- tests/lora/test_worker.py | 4 ++-- vllm/worker/neuron_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 3fd7d000d31b..54594690f792 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,8 +3,8 @@ import tempfile from unittest.mock import patch -from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, CacheConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d862600c5c93..d37cd048031d 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -4,8 +4,8 @@ import torch import torch.distributed -from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig, CacheConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.neuron_model_runner import NeuronModelRunner From 2d5dbb89378d94c025cf34ac6e7b9ba4126aa738 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 20:37:11 -0700 Subject: [PATCH 050/165] fix --- tests/worker/test_swap.py | 4 ++-- vllm/executor/ray_gpu_executor.py | 3 +++ vllm/spec_decode/spec_decode_worker.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 893637d92f85..8edb1cf05c08 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -11,8 +11,8 @@ def test_swap() -> None: dtype="half", load_format="dummy") engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 100 - engine_config.cache_config.num_cpu_blocks = 100 + engine_config.cache_config.num_gpu_blocks = 1000 + engine_config.cache_config.num_cpu_blocks = 1000 # Create the worker. distributed_init_method = get_distributed_init_method( diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e71f0a4b7b82..1175a400fdc6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -150,6 +150,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) lora_config = copy.deepcopy(self.lora_config) + cache_config = copy.deepcopy(self.cache_config) # Initialize the actual workers with the Worker class. 
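
The multi-worker case mentioned above (take the minimum block count across workers so the chosen cache size fits everywhere) reduces to a small fold over per-worker results. A sketch with faked worker outputs:

from typing import List, Tuple


def reconcile_block_counts(per_worker: List[Tuple[int, int]]) -> Tuple[int, int]:
    # Each entry is (num_gpu_blocks, num_cpu_blocks) reported by one worker.
    num_gpu_blocks = min(blocks[0] for blocks in per_worker)
    num_cpu_blocks = min(blocks[1] for blocks in per_worker)
    return num_gpu_blocks, num_cpu_blocks


if __name__ == "__main__":
    # Workers on heterogeneous GPUs may report different counts.
    print(reconcile_block_counts([(900, 256), (1024, 256), (950, 128)]))
    # -> (900, 128)
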
for rank, (worker, (node_id, _)) in enumerate( @@ -163,6 +164,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config, scheduler_config, device_config, + cache_config, local_rank, rank, distributed_init_method, @@ -177,6 +179,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self.parallel_config, self.scheduler_config, self.device_config, + self.cache_config, driver_local_rank, driver_rank, distributed_init_method, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a13748fd9405..180dea26c0d6 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -348,6 +348,9 @@ def rank(self): def device(self): return self.scorer_worker.device + def get_cache_block_size_bytes(self): + raise NotImplementedError + def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, proposer_cache_block_size_bytes: int, From ae2f7e6b6b97cf0847712e99da7c0ce3e8a92447 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 4 Apr 2024 22:46:43 -0700 Subject: [PATCH 051/165] docstrings --- vllm/engine/llm_engine.py | 5 +++ vllm/executor/cpu_executor.py | 8 +++++ vllm/executor/executor_base.py | 28 +++++++---------- vllm/executor/gpu_executor.py | 5 +++ vllm/executor/neuron_executor.py | 5 +++ vllm/executor/ray_gpu_executor.py | 43 ++++++++++++++++---------- vllm/spec_decode/spec_decode_worker.py | 8 ++++- vllm/worker/cpu_worker.py | 25 +++++++++++++-- vllm/worker/neuron_worker.py | 15 +++++++++ vllm/worker/worker.py | 42 +++++++++---------------- vllm/worker/worker_base.py | 36 +++++++++++---------- 11 files changed, 142 insertions(+), 78 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fc2d476cf534..1db6c740733a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -181,6 +181,11 @@ def __init__( self.stat_logger.info("cache_config", self.cache_config) def _initialize_kv_caches(self) -> None: + """Initialize the KV cache in the worker(s). + + The workers will determine the number of blocks in both the GPU cache + and the swap CPU cache. + """ num_gpu_blocks, num_cpu_blocks = ( self.model_executor.determine_num_available_blocks()) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index f44667f5112c..2bf97338da0e 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -61,10 +61,18 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. logger.info(f"# CPU blocks: {num_cpu_blocks}") self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 63c3766b6221..c18edd75d7a4 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -32,28 +32,24 @@ def __init__( @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: - """Profile the model on-device to determine the maximum number of KV - blocks that can be allocated. 
- - Returns a tuple[num_device_blocks, num_cpu_blocks], where - num_device_blocks refers to the number of blocks in the "active" KV - cache (e.g. where blocks are appended to), and num_cpu_blocks refers - to the number of blocks in the "passive" KV cache (e.g. where blocks - are swapped to). - - Examples: - - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - - A future CPUExecutor can return [num_cpu_blocks, 0] or - [num_cpu_blocks, num_swap_cpu_blocks]. + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. + + Normally, this should simply delegate to the underlying Worker. Some + ExecutorBase may require modification of the result, e.g. to ensure the + selected cache sizes are compatible with all workers. + + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. """ raise NotImplementedError @abstractmethod def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - """Given a fully-specified cache config, initialize the KV cache. This - is separate from init_workers as profiling may be required to determine - the maximum allowed KV cache size. + """Initialize the KV cache with the given size in blocks. """ raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index caedea97dc6d..80ca5cb7367c 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -67,9 +67,14 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ # NOTE: This is logged in the executor because there can be >1 worker # with other executors. We could log in the engine level, but work # remains to abstract away the device for non-GPU configurations. diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index d9f52adc49f6..57436a85cfa2 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,10 +48,15 @@ def _init_worker(self): self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ return self.driver_worker.determine_num_available_blocks() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. 
+ """ self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) def execute_model(self, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 1175a400fdc6..a508d1e8fe60 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -160,14 +160,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank = node_workers[node_id].index(rank) worker.init_worker.remote( lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - local_rank, - rank, - distributed_init_method, + model_config=model_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + cache_config=cache_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, lora_config=lora_config, )) @@ -175,14 +175,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - self.cache_config, - driver_local_rank, - driver_rank, - distributed_init_method, + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=driver_local_rank, + rank=driver_rank, + distributed_init_method=distributed_init_method, lora_config=self.lora_config, vision_language_config=self.vision_language_config, is_driver_worker=True, @@ -196,6 +196,15 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", ) def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks. + + This invokes `determine_num_available_blocks` on each worker and takes + the min of the results, guaranteeing that the selected cache sizes are + compatible with all workers. + + Returns: + - tuple[num_gpu_blocks, num_cpu_blocks] + """ # Get the maximum number of blocks that can be allocated on GPU and CPU. num_blocks = self._run_workers("determine_num_available_blocks", ) @@ -209,6 +218,8 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache in all workers. + """ # NOTE: We log here to avoid multiple logs when number of workers is # greater than one. We could log in the engine, but not all executors diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 180dea26c0d6..885bf537568e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -118,7 +118,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initialize the cache engine of the scorer and proposer workers. - TODO """ self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) @@ -349,6 +348,13 @@ def device(self): return self.scorer_worker.device def get_cache_block_size_bytes(self): + """Return the size of a cache block in bytes. + + This function is only used to compose workers within a SpecDecodeWorker. + We leave composing a SpecDecodeWorker within a SpecDecodeWorker + undefined for now, although it could be implemented in the future. 
+ See https://arxiv.org/abs/2308.04623. + """ raise NotImplementedError diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bb611b4b173f..bd67f9f8850a 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -170,6 +170,16 @@ def load_model(self): self.model_runner.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of blocks available for the KV cache. + + This determines how many KV blocks can fit into the configured CPU + KV cache space. + + Note that since vLLM assumes a block resides on GPU if it can be + modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. + This allows us to reuse the scheduler of vLLM without generalizing it + to different devices. + """ # For CPU device, the block number will be calculated based on the # cpu_kvcache_space. cache_block_size = self.get_cache_block_size_bytes() @@ -185,11 +195,20 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache. Currently, swappable CPU memory is not + supported. + + Since this worker does not support GPUs, we use the num_gpu_blocks to + determine how many non-swappable CPU blocks to allocate. + """ + assert (num_cpu_blocks == 0 + ), f"{type(self)} does not support swappable cache" + # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. - assert num_cpu_blocks == 0 num_cpu_blocks = num_gpu_blocks - num_gpu_blocks = 0 + del num_gpu_blocks + self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 @@ -302,6 +321,8 @@ def init_distributed_environment(self) -> None: parallel_config.pipeline_parallel_size) def get_cache_block_size_bytes(self) -> int: + """Return the size in bytes of a single KV cache block. + """ return CPUCacheEngine.get_cache_block_size( self.cache_config.block_size, self.cache_config.cache_dtype, self.model_config, self.parallel_config) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d37cd048031d..6136d50d0c06 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -41,6 +41,12 @@ def load_model(self): self.model_runner.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: + """Determine the number of available KV blocks. + + Swapping is not yet supported, so always return num_cpu_blocks=0. + + We configure num_gpu_blocks to be equal to max_num_seqs. + """ # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. @@ -53,8 +59,13 @@ def determine_num_available_blocks(self) -> tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initialize the KV cache. + """ + + # Different values are not tested. assert num_cpu_blocks == 0 assert num_gpu_blocks == self.scheduler_config.max_num_seqs + self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks @@ -73,4 +84,8 @@ def execute_model( return output def get_cache_block_size_bytes(self) -> int: + """Determine the size in bytes of a cache block. + + This is required for speculative decoding; it is not yet implemented. 
+ """ raise NotImplementedError diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 24c5ab6ff6c1..b46229c5b694 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -108,34 +108,18 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks(self, ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of GPU and CPU cache blocks that can be allocated. - """ + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. - # """Profiles the memory usage and initializes the KV cache. - - # The engine will first conduct a profiling of the existing memory - # usage. - # Then, it calculate the maximum possible number of GPU and CPU - # blocks - # that can be allocated with the remaining free memory. - # More details can be found in the - # :meth:`~vllm.worker.worker.Worker.determine_num_available_blocks` - # method - # from class :class:`~vllm.worker.Worker`. - - # Afterwards, as there may be multiple workers, - # we take the minimum number of blocks across all workers - # to ensure this can be applied to all of them. - - # Finally, the engine will initialize the KV cache - # with the calculated number of blocks. - - # .. tip:: - # You may limit the usage of GPU memory - # by adjusting the `gpu_memory_utilization` parameter. - # """ + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() @@ -171,6 +155,10 @@ def determine_num_available_blocks(self, ) -> Tuple[int, int]: def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. + """ raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, self.model_config.max_model_len) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 4675dbd4b314..e3027c406ffe 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,6 +6,9 @@ class WorkerBase(ABC): + """Worker interface that allows vLLM to cleanly separate implementations for + different hardware. + """ @abstractmethod def init_device(self) -> None: @@ -16,28 +19,23 @@ def init_device(self) -> None: @abstractmethod def determine_num_available_blocks(self) -> tuple[int, int]: - """Profile the model on-device to determine the maximum number of KV - blocks that can be allocated. - - Returns a tuple[num_device_blocks, num_cpu_blocks], where - num_device_blocks refers to the number of blocks in the "active" KV - cache (e.g. where blocks are appended to), and num_cpu_blocks refers - to the number of blocks in the "passive" KV cache (e.g. where blocks - are swapped to). - - Examples: - - The GPUExecutor will return [num_gpu_blocks, num_cpu_blocks]. - - A future CPUExecutor can return [num_cpu_blocks, 0] or - [num_cpu_blocks, num_swap_cpu_blocks]. + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. 
+ + The implementation may run profiling or other heuristics to determine + the size of caches. + + Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. """ raise NotImplementedError @abstractmethod def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: - """Given a fully-specified cache config, initialize the KV cache. This - is separate from init_workers as profiling may be required to determine - the maximum allowed KV cache size. + """Initialize the KV cache with the given size in blocks. """ raise NotImplementedError @@ -52,6 +50,9 @@ def execute_model(self, @abstractmethod def get_cache_block_size_bytes() -> int: + """Return the size of a single cache block, in bytes. Used in + speculative decoding. + """ raise NotImplementedError @abstractmethod @@ -68,6 +69,9 @@ def list_loras(self) -> List[int]: class LoraNotSupportedWorkerBase(WorkerBase): + """Partial implementation of WorkerBase that raises exceptions when LoRA + methods are invoked. + """ def add_lora(self, lora_request: LoRARequest) -> bool: raise ValueError(f"{type(self)} does not support LoRA") From fa8705de390cc727acc5a094abbba2f070de27dd Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sat, 6 Apr 2024 22:29:36 -0700 Subject: [PATCH 052/165] wip --- vllm/executor/gpu_executor.py | 71 +++++++++++++++++++++++++- vllm/spec_decode/spec_decode_worker.py | 4 ++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 80ca5cb7367c..ac7e4c5dda74 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -33,14 +33,81 @@ def __init__( self.scheduler_config = scheduler_config self.device_config = device_config self.vision_language_config = vision_language_config + self.speculative_config = speculative_config - assert (not speculative_config - ), "Speculative decoding not yet supported for GPU backend" + #assert (not speculative_config + # ), "Speculative decoding not yet supported for GPU backend" # Instantiate the worker and load the model to GPU. 
self._init_worker() def _init_worker(self): + if self.speculative_config is None: + self._init_non_spec_worker() + else: + self._init_spec_worker() + + def _init_spec_worker(self): + from vllm.worker.worker import Worker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.spec_decode.multi_step_worker import MultiStepWorker + + #from vllm.worker.multi_step_worker import MultiStepWorker # pylint: disable=import-outside-toplevel + #from vllm.worker.single_tp_worker import SingleTpWorker # pylint: disable=import-outside-toplevel + #from vllm.worker.draft_target_worker import DraftTargetWorker # pylint: disable=import-outside-toplevel + + #scheduler_config: "SchedulerConfig" = worker_kwargs.pop( + # "scheduler_config") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + + target_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.multi_step_worker import MultiStepWorker + draft_worker = MultiStepWorker( + model_config=self.speculative_config.draft_model_config, + parallel_config=self.speculative_config.draft_parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.model_executor.layers.rejection_sampler import RejectionSampler + spec_decode_worker = SpecDecodeWorker( + proposer_worker=draft_worker, + scorer_worker=target_worker, + rejection_sampler=RejectionSampler(), + ) + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + self.driver_worker = spec_decode_worker + + self.driver_worker.init_device() + #self.driver_worker.load_model() + + def _init_non_spec_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker from vllm.worker.worker import Worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 885bf537568e..d555f27650e1 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,10 @@ def init_device(self) -> None: self.scorer_worker.init_device() self.proposer_worker.init_device() + # TODO separate from init_device? 
+ self.scorer_worker.load_model() + self.proposer_worker.load_model() + self._metrics.init_gpu_tensors(self.rank) self.rejection_sampler.init_gpu_tensors(self.rank) self.scorer = BatchExpansionTop1Scorer( From 84953210e527c011704974435ae1b61ed7296a26 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sat, 6 Apr 2024 22:36:27 -0700 Subject: [PATCH 053/165] wip --- tests/spec_decode/e2e/test_correctness.py | 3 +++ vllm/engine/llm_engine.py | 10 ++++++---- vllm/executor/gpu_executor.py | 5 ++++- vllm/spec_decode/spec_decode_worker.py | 9 +++++---- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index b5a6fcb7900a..c427fbc7a05b 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -11,6 +11,9 @@ "speculative_model": "facebook/opt-125m", "num_speculative_tokens": 5, + # Skip cuda graph recording for fast test. + "enforce_eager": True, + # Required for spec decode. "use_v2_block_manager": True }]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1c639af69654..9ca809f51d0f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -709,12 +709,14 @@ def step(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): output = self.model_executor.execute_model( - seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, - scheduler_outputs.blocks_to_swap_out, - scheduler_outputs.blocks_to_copy) + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, + blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, + blocks_to_copy=scheduler_outputs.blocks_to_copy, + num_lookahead_slots=scheduler_outputs.num_lookahead_slots) else: output = [] - + return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ac7e4c5dda74..80ec79ba3c3c 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -154,12 +154,15 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int, + ) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, + num_lookahead_slots=num_lookahead_slots, ) return output diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index d555f27650e1..a2c9a9944af5 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -135,7 +135,7 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]], blocks_to_swap_out: Optional[Dict[int, int]], blocks_to_copy: Optional[Dict[int, List[int]]], - num_spec_tokens: int, + num_lookahead_slots: int, ) -> List[SamplerOutput]: """Perform speculative decoding on the input batch. """ @@ -146,7 +146,7 @@ def execute_model( # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. 
- if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0: return self._run_no_spec( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -159,7 +159,7 @@ def execute_model( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - k=num_spec_tokens, + k=num_lookahead_slots, ) @nvtx_range("spec_decode_worker._run_no_spec") @@ -180,7 +180,8 @@ def _run_no_spec( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - return_python_output=False) + #return_python_output=False + ) sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, From b63975bd45ea1a1770a8c742dc732b91e6f3cbf9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:06:56 -0700 Subject: [PATCH 054/165] wip --- tests/spec_decode/e2e/test_correctness.py | 18 ++-- vllm/core/scheduler.py | 14 +-- vllm/engine/llm_engine.py | 121 +++++++++++++++++++++- vllm/model_executor/layers/sampler.py | 8 +- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 4 +- 7 files changed, 145 insertions(+), 25 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c427fbc7a05b..782bd9d0cecb 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -21,14 +21,14 @@ @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_config(test_llm_generator): - output_len = 1024 + output_len = 128 temperature = 0.0 prompts = [ "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", + #"The president of the United States is", + #"The capital of France is", + #"The future of AI is", ] sampling_params = SamplingParams( @@ -37,11 +37,11 @@ def test_spec_decode_config(test_llm_generator): temperature=temperature, ) - with pytest.raises( - AssertionError, - match="Speculative decoding not yet supported for GPU backend"): - get_token_ids_from_llm_generator(test_llm_generator, prompts, - sampling_params) + #with pytest.raises( + # AssertionError, + # match="Speculative decoding not yet supported for GPU backend"): + get_token_ids_from_llm_generator(test_llm_generator, prompts, + sampling_params) def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 0ae53f937496..e176848c0490 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -753,9 +753,10 @@ def _schedule_default(self) -> SchedulerOutputs: blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy, swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, - num_lookahead_slots=(prefills.num_lookahead_slots + - running_scheduled.num_lookahead_slots + - swapped_in.num_lookahead_slots), + num_lookahead_slots=running_scheduled.num_lookahead_slots, + #num_lookahead_slots=(prefills.num_lookahead_slots + + # running_scheduled.num_lookahead_slots + + # swapped_in.num_lookahead_slots), ) def _schedule_chunked_prefill(self): @@ -842,9 +843,10 @@ def _schedule_chunked_prefill(self): blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy, swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, - 
num_lookahead_slots=(prefills.num_lookahead_slots + - running_scheduled.num_lookahead_slots + - swapped_in.num_lookahead_slots), + num_lookahead_slots=running_scheduled.num_lookahead_slots, + #num_lookahead_slots=(prefills.num_lookahead_slots + + # running_scheduled.num_lookahead_slots + + # swapped_in.num_lookahead_slots), ) def _schedule(self) -> SchedulerOutputs: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9ca809f51d0f..1bd4129090c2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -626,14 +626,38 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + + + if not isinstance(output, list): + all_output = [output] + else: + all_output = output + + scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + # Organize list of sampler output by sequence group. + output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ + [] for _ in scheduled_seq_groups + ] + for step in output: + for i, sequence_group_output in enumerate(step): + output_by_sequence_group[i].append(sequence_group_output) + now = time.time() + # Update the scheduled sequence groups with the model outputs. - scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output): + for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): + seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - self._process_sequence_group_outputs(seq_group, outputs) + + assert len(outputs) > 0 + # TODO can spec decode go through second path? + if len(outputs) > 1: + self._process_sequence_group_outputs_multi_step(seq_group, outputs) + else: + self._process_sequence_group_outputs(seq_group, outputs[0]) # Free the finished sequence groups. self.scheduler.free_finished_seq_groups() @@ -654,6 +678,91 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs + def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + assert seqs + #if not seqs: + # return [] + + assert len(seqs) == 1, ("Beam search not supported in speculative " + "decoding.") + seq = seqs[0] + + # Since there's only one sequence per sequence group, we can take the + # first sample. + samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # -1 means the output token is not valid (eg. due to spec decode + # rejecting tokens). + valid_samples = [ + sample for sample in samples if sample.output_token != -1 + ] + + # Draft target worker pads all outputs with -1 to have same length. + output_token_ids = [sample.output_token for sample in valid_samples] + #successes = [sample.success for sample in samples] + + ## Truncate to max_tokens if necessary. + #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # seq.get_output_len() + len(output_token_ids)) + #if remaining_tokens < 0: + # valid_samples = valid_samples[:remaining_tokens] + # output_token_ids = output_token_ids[:remaining_tokens] + + ## Truncate any tokens after EOS. This is required as spec decode + ## generates tokens in fixed blocks, which may go beyond the EOS token. 
+ #if not seq_group.sampling_params.ignore_eos: + # eos_token_id = self.tokenizer.get_lora_tokenizer( + # seq.lora_request).eos_token_id + # # Avoiding .index calls as exception throwing in the happy path + # # is expensive. + # for i in range(len(output_token_ids)): + # if output_token_ids[i] == eos_token_id: + # output_token_ids = output_token_ids[:i + 1] + # valid_samples = valid_samples[:i + 1] + # break + + #output_logprobs = [sample.logprobs for sample in valid_samples] + + ## Use the last sample for the sequence as it will have + ## the speculation and num_unprocessed_tokens for all the + ## previous samples (they are cumulative when it comes + ## to those two attributes). + #speculation = valid_samples[-1].speculation + #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + for output_token_id in output_token_ids: + from vllm.sequence import Logprob + seq.append_token_id( + token_id=output_token_id, + logprobs={output_token_id: Logprob(0.0)}, + ) + print(f'Appended token id {output_token_id=}') + + #seq.append_token_ids(output_token_ids, + # output_logprobs, + # ) + # #num_unprocessed_tokens=num_unprocessed_tokens) + ##seq.set_last_speculation(speculation) + + #if not all(successes): + # seq.set_status_to_failed() + + #if decode: + # self._decode_sequence(seq, + # seq_group.sampling_params, + # token_ids=seq.get_token_ids(), + # unseen_token_ids=output_token_ids, + # prefix_offset=seq.prefix_offset, + # read_offset=seq.read_offset) + #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # output_token_ids) + # TODO pass output token ids + self._check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): + self.scheduler.free_seq(seq) + def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -804,9 +913,11 @@ def _check_stop(self, seq: Sequence, if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - + + breakpoint() # Check if the sequence has reached max_tokens. 
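
A minimal sketch of the multi-step output handling above: each step may return several sampled tokens per sequence, with -1 marking positions the rejection sampler did not accept, and the total may still need to be capped at max_tokens in the spirit of the commented-out truncation logic. Plain ints and lists stand in for vLLM's sequence output objects.

from typing import List, Optional


def collect_accepted_tokens(step_tokens: List[int],
                            max_new_tokens: Optional[int] = None,
                            already_generated: int = 0) -> List[int]:
    # Drop positions padded out with -1 (tokens rejected during verification).
    accepted = [tok for tok in step_tokens if tok != -1]

    # Optionally truncate so the sequence never exceeds max_new_tokens.
    if max_new_tokens is not None:
        remaining = max_new_tokens - already_generated
        accepted = accepted[:max(remaining, 0)]
    return accepted


if __name__ == "__main__":
    print(collect_accepted_tokens([11, 42, -1, -1, -1]))  # [11, 42]
    print(collect_accepted_tokens([11, 42, 7],
                                  max_new_tokens=5,
                                  already_generated=4))   # [11]
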
- if seq.get_output_len() == sampling_params.max_tokens: + if seq.get_output_len() >= sampling_params.max_tokens: + # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cb1480de03e3..4f0cc4405e81 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -684,4 +684,10 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput(outputs=sampler_output) + + return SamplerOutput( + outputs=sampler_output, + # TODO + sampled_token_probs=torch.empty((len(sampler_output), 50_272), device='cuda', dtype=torch.float32), + sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), + ) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e0b75837e8a3..89be25252c2c 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -83,7 +83,8 @@ def score_proposals( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - return_python_output=False) + #return_python_output=False + ) all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 73b6e201c67a..c817f54d7fe3 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -340,7 +340,7 @@ def _merge_outputs( return proposal_tokens, proposal_probs, proposal_lens sampler_output = maybe_sampler_output - + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a2c9a9944af5..85667a6c3dd4 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -5,7 +5,7 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, - SequenceGroupOutput, SequenceOutput) + SequenceGroupOutput, SequenceOutput, Logprob) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -316,7 +316,7 @@ def _create_output_sampler_list( parent_seq_id=seq_id, output_token=token_id, # TODO Add verifier logprobs. - logprobs={token_id: 0.0}, + logprobs={token_id: Logprob(0.0)}, ) ], prompt_logprobs=None, From cb23e8ca4e6ff3c667b44e9ce4f179f629740008 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:07:10 -0700 Subject: [PATCH 055/165] wip --- vllm/engine/llm_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1bd4129090c2..15ef7df26b0b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -914,7 +914,6 @@ def _check_stop(self, seq: Sequence, seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - breakpoint() # Check if the sequence has reached max_tokens. 
if seq.get_output_len() >= sampling_params.max_tokens: # TODO should cap block From 143ca28e5de41f1d32e730bc3e9da2a954a2024e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 00:14:02 -0700 Subject: [PATCH 056/165] wip --- vllm/executor/cpu_executor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 2bf97338da0e..835ba18ab756 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -80,7 +80,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, From d8d4725d3365e25c67cbb115e5a437fd7e574fd0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 13:41:20 -0700 Subject: [PATCH 057/165] fix --- tests/spec_decode/e2e/test_correctness.py | 7 +++++-- vllm/model_executor/layers/sampler.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 782bd9d0cecb..fc5640d23ab5 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -7,10 +7,13 @@ "common_llm_kwargs", [{ # Use a small model for a fast test. - "model": "facebook/opt-125m", - "speculative_model": "facebook/opt-125m", + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + # Skip real loading for fast test. + "load_format": "dummy", + # Skip cuda graph recording for fast test. "enforce_eager": True, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 4f0cc4405e81..9540a3d89bd8 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -688,6 +688,6 @@ def _build_sampler_output( return SamplerOutput( outputs=sampler_output, # TODO - sampled_token_probs=torch.empty((len(sampler_output), 50_272), device='cuda', dtype=torch.float32), + sampled_token_probs=torch.empty((len(sampler_output), 32_000), device='cuda', dtype=torch.float32), sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), ) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 85667a6c3dd4..f665c3b72219 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -15,7 +15,9 @@ split_batch_by_proposal_len) from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.logger import init_logger +logger = init_logger(__name__) class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. @@ -144,6 +146,8 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") + logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0: @@ -174,6 +178,7 @@ def _run_no_spec( proposer and scorer model so that the KV cache is consistent between the two. 
""" + logger.info("run proposer worker no spec") self.proposer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, @@ -183,6 +188,7 @@ def _run_no_spec( #return_python_output=False ) + logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -214,11 +220,14 @@ def _run_speculative_decoding_step( sequence. """ + logger.info("get spec proposals") # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) + #logger.info(f"score proposals {proposals=}") + logger.info(f"score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, @@ -228,9 +237,11 @@ def _run_speculative_decoding_step( proposals, ) + logger.info("verify proposals") accepted_token_ids = self._verify_tokens(seq_group_metadata_list, proposal_scores, proposals, k) + logger.info("create output list") return self._create_output_sampler_list(seq_group_metadata_list, accepted_token_ids, k) From b2728e03de0703d9e479bd9e0e4aa3f158f426f6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:03:53 -0700 Subject: [PATCH 058/165] wip --- tests/spec_decode/e2e/test_correctness.py | 54 +++++++++++++++++++++- vllm/spec_decode/spec_decode_worker.py | 55 ++++++++++++++++++++++- vllm/worker/worker.py | 3 ++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fc5640d23ab5..28a88a750edb 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -20,10 +20,14 @@ # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "tensor_parallel_size": 1, + }, +]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_config(test_llm_generator): +def test_spec_decode(test_llm_generator): output_len = 128 temperature = 0.0 @@ -46,6 +50,51 @@ def test_spec_decode_config(test_llm_generator): get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + # Expect failure as spec decode not supported by + # Ray backend. 
+ "tensor_parallel_size": 2, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail(test_llm_generator): + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises( + AssertionError, + match="Speculative decoding not yet supported for "): + get_token_ids_from_llm_generator(test_llm_generator, prompts, + sampling_params) def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: @@ -54,3 +103,4 @@ def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): del llm return token_ids + diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index f665c3b72219..3802ed42f786 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -19,6 +19,60 @@ logger = init_logger(__name__) +def create_spec_decode_worker(): + + from vllm.worker.worker import Worker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.spec_decode.multi_step_worker import MultiStepWorker + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + + target_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.multi_step_worker import MultiStepWorker + draft_worker = MultiStepWorker( + model_config=self.speculative_config.draft_model_config, + parallel_config=self.speculative_config.draft_parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.model_executor.layers.rejection_sampler import RejectionSampler + spec_decode_worker = SpecDecodeWorker( + proposer_worker=draft_worker, + scorer_worker=target_worker, + rejection_sampler=RejectionSampler(), + ) + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + self.driver_worker = spec_decode_worker + + self.driver_worker.init_device() + #self.driver_worker.load_model() + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
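The second hunk below trims logging in _run_speculative_decoding_step, the
propose, score, verify loop. As background, the per-token acceptance rule the
rejection sampler applies at the verify stage can be sketched as follows. This
is a simplified single-sequence illustration of standard speculative rejection
sampling, not vLLM's RejectionSampler itself (which operates on whole batches);
the names accept_or_resample, p_target and p_draft are made up for the example:

    import torch

    def accept_or_resample(p_target: torch.Tensor, p_draft: torch.Tensor,
                           draft_token: int) -> int:
        # Accept the draft token with probability min(1, p_target / p_draft).
        accept_prob = torch.clamp(
            p_target[draft_token] / p_draft[draft_token], max=1.0)
        if torch.rand(()) < accept_prob:
            return draft_token
        # On rejection, resample from the residual distribution
        # max(0, p_target - p_draft), renormalized.
        residual = torch.clamp(p_target - p_draft, min=0.0)
        residual = residual / residual.sum()
        return int(torch.multinomial(residual, num_samples=1).item())

    vocab_size = 8
    p_draft = torch.softmax(torch.randn(vocab_size), dim=-1)
    p_target = torch.softmax(torch.randn(vocab_size), dim=-1)
    draft_token = int(torch.multinomial(p_draft, num_samples=1).item())
    print(accept_or_resample(p_target, p_draft, draft_token))
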
@@ -226,7 +280,6 @@ def _run_speculative_decoding_step( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - #logger.info(f"score proposals {proposals=}") logger.info(f"score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b46229c5b694..5d9a9acd763e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -205,7 +205,10 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]] = None, blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, + num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: + assert (num_lookahead_slots == 0), "worker does not support lookahead slots" + if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) From 6250f6cf32842de588edfe58f93e942a64cfd5b6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:12:50 -0700 Subject: [PATCH 059/165] assertion --- tests/spec_decode/e2e/test_correctness.py | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 28a88a750edb..92076d88ea83 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,4 +1,5 @@ import pytest +from itertools import cycle from vllm import SamplingParams @@ -26,30 +27,39 @@ }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode(test_llm_generator): +def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): + """Run generation with speculative decoding on a batch. Verify the number + of output tokens is equal to the expected number. + """ output_len = 128 temperature = 0.0 prompts = [ "Hello, my name is", - #"The president of the United States is", - #"The capital of France is", - #"The future of AI is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", ] + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + sampling_params = SamplingParams( max_tokens=output_len, ignore_eos=True, temperature=temperature, ) - #with pytest.raises( - # AssertionError, - # match="Speculative decoding not yet supported for GPU backend"): - get_token_ids_from_llm_generator(test_llm_generator, prompts, + batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) + # Expect a generation for each prompt in the batch. + assert len(batch_token_ids) == len(prompts) + + # TODO(cadedaniel) check for equality once block truncation is implemented. + assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + @pytest.mark.parametrize( "common_llm_kwargs", [{ From a930755de760545726cfcc9de5fc8d51a4b6fb71 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:18:19 -0700 Subject: [PATCH 060/165] fix --- vllm/model_executor/layers/sampler.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9540a3d89bd8..7c7148b12229 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -78,8 +78,15 @@ def forward( # Get the logprobs query results. 
prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) + + breakpoint() + + return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs) + prompt_logprobs, sample_logprobs, + sampled_token_probs=probs, + sampled_token_ids=torch.empty((len(sampling_metadata.seq_groups), 1), device=probs.device, dtype=torch.long), + ) def _get_bin_counts_and_mask( @@ -668,6 +675,8 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], + sampled_token_ids: Optional[torch.Tensor] = None, + sampled_token_probs: Optional[torch.Tensor] = None, ) -> SamplerOutput: sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, @@ -687,7 +696,6 @@ def _build_sampler_output( return SamplerOutput( outputs=sampler_output, - # TODO - sampled_token_probs=torch.empty((len(sampler_output), 32_000), device='cuda', dtype=torch.float32), - sampled_token_ids=torch.empty((len(sampler_output), 1), device='cuda', dtype=torch.long), + sampled_token_probs=sampled_token_probs, + sampled_token_ids=sampled_token_ids, ) From 5b896a3fe4e9614ee2557a9361cb381f88eeb15d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:18:43 -0700 Subject: [PATCH 061/165] fix --- vllm/model_executor/layers/sampler.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 7c7148b12229..71807b25834a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,9 +79,7 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - breakpoint() - - + # TODO gate by config return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs, sampled_token_probs=probs, From bb43b530ce2eeecaa29a8108dc17e0f24b80b099 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:19:23 -0700 Subject: [PATCH 062/165] lint --- tests/spec_decode/e2e/test_correctness.py | 29 +++++++++++++---------- vllm/engine/llm_engine.py | 11 +++++---- vllm/executor/gpu_executor.py | 19 ++++++++------- vllm/model_executor/layers/sampler.py | 14 +++++++---- vllm/spec_decode/batch_expansion.py | 2 +- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 10 ++++---- vllm/worker/worker.py | 3 ++- 8 files changed, 52 insertions(+), 38 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 92076d88ea83..36a66ea2ec38 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -51,8 +51,9 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, - sampling_params) + batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, + sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) @@ -60,6 +61,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # TODO(cadedaniel) check for equality once block truncation is implemented. 
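    # (With speculative decoding, tokens are appended in blocks of up to
    #  k + 1 per engine step, so a sequence can overshoot max_tokens until
    #  per-block truncation is implemented; hence ">=" rather than "==".)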
assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -77,13 +79,15 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - # Expect failure as spec decode not supported by - # Ray backend. - "tensor_parallel_size": 2, - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + # Expect failure as spec decode not supported by + # Ray backend. + "tensor_parallel_size": 2, + }, + ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_xfail(test_llm_generator): @@ -100,12 +104,12 @@ def test_spec_decode_xfail(test_llm_generator): temperature=temperature, ) - with pytest.raises( - AssertionError, - match="Speculative decoding not yet supported for "): + with pytest.raises(AssertionError, + match="Speculative decoding not yet supported for "): get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) + def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) @@ -113,4 +117,3 @@ def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): del llm return token_ids - diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 15ef7df26b0b..1ca447890d4c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -627,7 +627,6 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: - if not isinstance(output, list): all_output = [output] else: @@ -646,7 +645,8 @@ def _process_model_outputs( now = time.time() # Update the scheduled sequence groups with the model outputs. - for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): + for scheduled_seq_group, outputs in zip(scheduled_seq_groups, + output_by_sequence_group): seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( @@ -655,7 +655,8 @@ def _process_model_outputs( assert len(outputs) > 0 # TODO can spec decode go through second path? if len(outputs) > 1: - self._process_sequence_group_outputs_multi_step(seq_group, outputs) + self._process_sequence_group_outputs_multi_step( + seq_group, outputs) else: self._process_sequence_group_outputs(seq_group, outputs[0]) @@ -825,7 +826,7 @@ def step(self) -> List[RequestOutput]: num_lookahead_slots=scheduler_outputs.num_lookahead_slots) else: output = [] - + return self._process_model_outputs(output, scheduler_outputs) def do_log_stats(self) -> None: @@ -913,7 +914,7 @@ def _check_stop(self, seq: Sequence, if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return - + # Check if the sequence has reached max_tokens. 
if seq.get_output_len() >= sampling_params.max_tokens: # TODO should cap block diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 80ec79ba3c3c..60c9a9ca3c78 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -75,7 +75,7 @@ def _init_spec_worker(self): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, @@ -90,7 +90,7 @@ def _init_spec_worker(self): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler spec_decode_worker = SpecDecodeWorker( @@ -150,13 +150,14 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int, - ) -> SamplerOutput: + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int, + ) -> SamplerOutput: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 71807b25834a..5c1017207878 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -80,11 +80,17 @@ def forward( logprobs, sampling_metadata, sample_results) # TODO gate by config - return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs, + return _build_sampler_output( + sample_results, + sampling_metadata, + prompt_logprobs, + sample_logprobs, sampled_token_probs=probs, - sampled_token_ids=torch.empty((len(sampling_metadata.seq_groups), 1), device=probs.device, dtype=torch.long), - ) + sampled_token_ids=torch.empty( + (len(sampling_metadata.seq_groups), 1), + device=probs.device, + dtype=torch.long), + ) def _get_bin_counts_and_mask( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 89be25252c2c..6be8c843cf7a 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -84,7 +84,7 @@ def score_proposals( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, #return_python_output=False - ) + ) all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c817f54d7fe3..73b6e201c67a 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -340,7 +340,7 @@ def _merge_outputs( return proposal_tokens, proposal_probs, proposal_lens sampler_output = maybe_sampler_output - + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3802ed42f786..12a70d402e98 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ 
b/vllm/spec_decode/spec_decode_worker.py @@ -19,8 +19,9 @@ logger = init_logger(__name__) + def create_spec_decode_worker(): - + from vllm.worker.worker import Worker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker @@ -41,7 +42,7 @@ def create_spec_decode_worker(): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, @@ -56,7 +57,7 @@ def create_spec_decode_worker(): vision_language_config=self.vision_language_config, is_driver_worker=True, ) - + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler spec_decode_worker = SpecDecodeWorker( @@ -73,6 +74,7 @@ def create_spec_decode_worker(): self.driver_worker.init_device() #self.driver_worker.load_model() + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. @@ -240,7 +242,7 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, #return_python_output=False - ) + ) logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5d9a9acd763e..941c06208129 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -207,7 +207,8 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: - assert (num_lookahead_slots == 0), "worker does not support lookahead slots" + assert (num_lookahead_slots == 0 + ), "worker does not support lookahead slots" if self.is_driver_worker: assert seq_group_metadata_list is not None From cde3160fdd542b80abba0d9855c98d8a12d959ac Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 14:57:45 -0700 Subject: [PATCH 063/165] fix --- vllm/executor/gpu_executor.py | 2 +- vllm/model_executor/layers/sampler.py | 11 ++++++----- vllm/sequence.py | 10 ++++++++++ vllm/spec_decode/batch_expansion.py | 10 +++++++++- vllm/spec_decode/multi_step_worker.py | 10 +++++++++- vllm/spec_decode/util.py | 7 +++++++ 6 files changed, 42 insertions(+), 8 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 60c9a9ca3c78..ac445cd51a7e 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -96,7 +96,7 @@ def _init_spec_worker(self): spec_decode_worker = SpecDecodeWorker( proposer_worker=draft_worker, scorer_worker=target_worker, - rejection_sampler=RejectionSampler(), + rejection_sampler=RejectionSampler(strict_mode=True), ) assert self.parallel_config.world_size == 1, ( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c1017207878..135bc13e8d7c 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -85,11 +85,12 @@ def forward( sampling_metadata, prompt_logprobs, sample_logprobs, - sampled_token_probs=probs, - sampled_token_ids=torch.empty( - (len(sampling_metadata.seq_groups), 1), - device=probs.device, - dtype=torch.long), + #sampled_token_probs=probs, + ## TODO + #sampled_token_ids=torch.empty( + # (len(sampling_metadata.seq_groups), 1), + # device=probs.device, + # dtype=torch.long), ) diff --git a/vllm/sequence.py b/vllm/sequence.py index 576bbe8c4f6c..223a7cf80232 100644 --- 
a/vllm/sequence.py +++ b/vllm/sequence.py @@ -686,3 +686,13 @@ def __len__(self): def __eq__(self, other: object): return isinstance(other, self.__class__) and self.outputs == other.outputs + + def __repr__(self) -> str: + """Show the shape of a tensor instead of its values to reduce noise. + """ + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) + return (f"SamplerOutput(outputs={self.outputs}, " + f"sampled_token_probs={sampled_token_probs_repr}, " + f"sampled_token_ids={sampled_token_ids_repr}, " + f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6be8c843cf7a..701324c16dfe 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -8,7 +8,7 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, - split_batch_by_proposal_len) + split_batch_by_proposal_len, mock_device_tensors) from vllm.worker.worker import Worker SeqId = int @@ -143,6 +143,14 @@ def _contract_batch(self, original_bs: int, This maps the scores of speculative tokens back to their original sequences. """ + + mock_device_tensors( + sampler_output=target_sampler_output, + batch_size=len(non_spec_indices) + num_scoring_tokens, + vocab_size=self._vocab_size, + device=self._device, + ) + (target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs) = self._split_scoring_output( target_sampler_output, num_scoring_tokens) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 73b6e201c67a..262bab162649 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import sampler_output_to_torch +from vllm.spec_decode.util import (sampler_output_to_torch, mock_device_tensors) from vllm.worker.worker import Worker @@ -341,6 +341,14 @@ def _merge_outputs( sampler_output = maybe_sampler_output + for step_output in sampler_output: + mock_device_tensors( + sampler_output=step_output, + batch_size=len(proposal_lens), + vocab_size=self._vocab_size, + device=self._device, + ) + proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 406568a4bc08..234ed9e44f4e 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,6 +82,13 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs +def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + + sampler_output.sampled_token_probs = torch.nn.functional.softmax(torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size,), dtype=torch.long, device=device) + @contextmanager def nvtx_range(msg, *args, **kwargs): """ From dd8aeff307f7c035b7db4a5184d00172cad6c3e9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:00:34 -0700 Subject: 
[PATCH 064/165] fix --- vllm/engine/llm_engine.py | 1 - vllm/sequence.py | 9 +++-- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 5 ++- vllm/spec_decode/spec_decode_worker.py | 55 -------------------------- vllm/spec_decode/util.py | 14 +++++-- 6 files changed, 22 insertions(+), 65 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1ca447890d4c..9d65ec1a2faa 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -739,7 +739,6 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): token_id=output_token_id, logprobs={output_token_id: Logprob(0.0)}, ) - print(f'Appended token id {output_token_id=}') #seq.append_token_ids(output_token_ids, # output_logprobs, diff --git a/vllm/sequence.py b/vllm/sequence.py index 223a7cf80232..fa51483301a3 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -690,9 +690,12 @@ def __eq__(self, other: object): def __repr__(self) -> str: """Show the shape of a tensor instead of its values to reduce noise. """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) - return (f"SamplerOutput(outputs={self.outputs}, " + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None + else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else + self.sampled_token_ids.shape) + return ( + f"SamplerOutput(outputs={self.outputs}, " f"sampled_token_probs={sampled_token_probs_repr}, " f"sampled_token_ids={sampled_token_ids_repr}, " f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 701324c16dfe..bba3c4733e4f 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -8,7 +8,8 @@ SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, - split_batch_by_proposal_len, mock_device_tensors) + split_batch_by_proposal_len, + mock_device_tensors) from vllm.worker.worker import Worker SeqId = int diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 262bab162649..0ac189a7bacc 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,7 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (sampler_output_to_torch, mock_device_tensors) +from vllm.spec_decode.util import (sampler_output_to_torch, + mock_device_tensors) from vllm.worker.worker import Worker @@ -343,7 +344,7 @@ def _merge_outputs( for step_output in sampler_output: mock_device_tensors( - sampler_output=step_output, + sampler_output=step_output, batch_size=len(proposal_lens), vocab_size=self._vocab_size, device=self._device, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 12a70d402e98..3e33371edadf 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -20,61 +20,6 @@ logger = init_logger(__name__) -def create_spec_decode_worker(): - - from vllm.worker.worker import Worker - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from 
vllm.spec_decode.multi_step_worker import MultiStepWorker - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - - target_worker = Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - - from vllm.spec_decode.multi_step_worker import MultiStepWorker - draft_worker = MultiStepWorker( - model_config=self.speculative_config.draft_model_config, - parallel_config=self.speculative_config.draft_parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from vllm.model_executor.layers.rejection_sampler import RejectionSampler - spec_decode_worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - rejection_sampler=RejectionSampler(), - ) - - assert self.parallel_config.world_size == 1, ( - "GPUExecutor only supports single GPU.") - - self.driver_worker = spec_decode_worker - - self.driver_worker.init_device() - #self.driver_worker.load_model() - - class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 234ed9e44f4e..7129f47d65f6 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,12 +82,20 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs -def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: +def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, + vocab_size: int, device: str) -> None: assert sampler_output.sampled_token_probs is None assert sampler_output.sampled_token_ids is None - sampler_output.sampled_token_probs = torch.nn.functional.softmax(torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - sampler_output.sampled_token_ids = torch.randint(low=0, high=vocab_size, size=(batch_size,), dtype=torch.long, device=device) + sampler_output.sampled_token_probs = torch.nn.functional.softmax( + torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), + dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) + @contextmanager def nvtx_range(msg, *args, **kwargs): From 46e48474ab355254f4d831b86f2b3303abde0d22 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:10:22 -0700 Subject: [PATCH 065/165] test --- tests/spec_decode/e2e/test_correctness.py | 8 +++++--- vllm/engine/llm_engine.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 36a66ea2ec38..a1df4dccbe3b 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -9,8 +9,6 @@ [{ # Use a small model for a fast test. 
"model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, # Skip real loading for fast test. "load_format": "dummy", @@ -23,7 +21,11 @@ }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [ { - "tensor_parallel_size": 1, + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + # No spec decode. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9d65ec1a2faa..a08a883539a9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -627,7 +627,7 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: - if not isinstance(output, list): + if self.speculative_config is None: all_output = [output] else: all_output = output @@ -638,7 +638,7 @@ def _process_model_outputs( output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ [] for _ in scheduled_seq_groups ] - for step in output: + for step in all_output: for i, sequence_group_output in enumerate(step): output_by_sequence_group[i].append(sequence_group_output) From 8454edc8bf13cb04936b7f552f7e6ec368a6693f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:41:02 -0700 Subject: [PATCH 066/165] test fixes --- tests/spec_decode/test_spec_decode_worker.py | 14 +++++++------- vllm/engine/llm_engine.py | 2 +- vllm/executor/ray_gpu_executor.py | 3 ++- vllm/worker/worker.py | 2 -- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 47aff8f57541..bd06d5b17d07 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -37,7 +37,7 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): execute_model_data, _, _ = create_batch(batch_size, k) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) call_args_list = draft_worker.get_spec_proposals.call_args_list assert len(call_args_list) == 1 @@ -102,7 +102,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): target_worker.execute_model.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) seen_contexts = [] @@ -195,7 +195,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): rejection_sampler.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) assert len(rejection_sampler.call_args_list) == 1 args, _ = rejection_sampler.call_args_list[0] @@ -283,7 +283,7 @@ def test_correctly_formats_output(k: int, batch_size: int): rejection_sampler.return_value = rejection_sampler_output output = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) expected_output = create_sampler_output_list( rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) @@ -400,7 +400,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): 
mock_rejsample_metrics) output = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics call_args_list = ( @@ -435,7 +435,7 @@ def test_k_equals_zero(k: int, batch_size: int): batch_size, k, prev_output_token_len=0) out = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert len(out) == 1, f"expected only one token output when {k=}" assert out[0].probs is None, "expect gpu tensor references to be None" @@ -474,7 +474,7 @@ def test_empty_input_batch(k: int, batch_size: int): batch_size, k, prev_output_token_len=0) out = worker.execute_model(**execute_model_data.to_dict(), - num_spec_tokens=k) + num_lookahead_slots=k) assert len(out) == 1, f"expected only one token output when {k=}" assert out[0].probs is None, "expect gpu tensor references to be None" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a08a883539a9..e47af8dfcf9e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -915,7 +915,7 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= sampling_params.max_tokens: + if seq.get_output_len() >= int(sampling_params.max_tokens): # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index a508d1e8fe60..226183855708 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -238,7 +238,8 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int = 0) -> SamplerOutput: all_outputs = self._run_workers( "execute_model", driver_kwargs={ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 941c06208129..cb30f658482b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -207,8 +207,6 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> Optional[SamplerOutput]: - assert (num_lookahead_slots == 0 - ), "worker does not support lookahead slots" if self.is_driver_worker: assert seq_group_metadata_list is not None From 819e65695455e9d63e4ed306f313b1d96f6b2c9a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 15:41:22 -0700 Subject: [PATCH 067/165] lint --- tests/spec_decode/e2e/test_correctness.py | 20 +++++++++++--------- tests/spec_decode/test_spec_decode_worker.py | 9 ++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index a1df4dccbe3b..d8b09ce5b77a 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -19,15 +19,17 @@ # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - # No spec decode. - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + # No spec decode. 
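            # This baseline case runs the same test body without a draft
            # model, exercising the non-speculative path on identical
            # prompts and batch sizes.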
+ }, + ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index bd06d5b17d07..3725924ea89c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -37,7 +37,8 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): execute_model_data, _, _ = create_batch(batch_size, k) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) call_args_list = draft_worker.get_spec_proposals.call_args_list assert len(call_args_list) == 1 @@ -102,7 +103,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int): target_worker.execute_model.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) seen_contexts = [] @@ -195,7 +197,8 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): rejection_sampler.side_effect = ValueError(exception_secret) with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(**execute_model_data.to_dict(), num_lookahead_slots=k) + worker.execute_model(**execute_model_data.to_dict(), + num_lookahead_slots=k) assert len(rejection_sampler.call_args_list) == 1 args, _ = rejection_sampler.call_args_list[0] From d0fbe47bdb778b9ba32bda2b0d9a621d9ecd1134 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:01:35 -0700 Subject: [PATCH 068/165] clean --- vllm/model_executor/layers/sampler.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 135bc13e8d7c..bed915faf3fb 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,19 +79,7 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - # TODO gate by config - return _build_sampler_output( - sample_results, - sampling_metadata, - prompt_logprobs, - sample_logprobs, - #sampled_token_probs=probs, - ## TODO - #sampled_token_ids=torch.empty( - # (len(sampling_metadata.seq_groups), 1), - # device=probs.device, - # dtype=torch.long), - ) + return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) def _get_bin_counts_and_mask( @@ -699,8 +687,4 @@ def _build_sampler_output( sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput( - outputs=sampler_output, - sampled_token_probs=sampled_token_probs, - sampled_token_ids=sampled_token_ids, - ) + return SamplerOutput(outputs=sampler_output) From 5445af6ddf43cf9b1b82dc53260627e455d0ae81 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:45:19 -0700 Subject: [PATCH 069/165] refactor out beam search model processor --- vllm/engine/llm_engine.py | 537 ++++++++++--------- vllm/engine/output_processor/__init__.py | 0 vllm/engine/output_processor/beam_search.py | 321 +++++++++++ vllm/engine/output_processor/block_decode.py | 186 +++++++ vllm/engine/output_processor/interfaces.py | 36 ++ 5 files changed, 817 insertions(+), 263 
deletions(-) create mode 100644 vllm/engine/output_processor/__init__.py create mode 100644 vllm/engine/output_processor/beam_search.py create mode 100644 vllm/engine/output_processor/block_decode.py create mode 100644 vllm/engine/output_processor/interfaces.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e47af8dfcf9e..1ac73bc874de 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -25,6 +25,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -180,6 +181,14 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + self.output_processor = SequenceGroupOutputProcessor.create_output_processor( + self.scheduler_config, + self.detokenizer, + self.scheduler, + self.seq_counter, + self.get_tokenizer_for_seq, + ) + def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). @@ -449,179 +458,179 @@ def _check_beam_search_early_stopping( eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput) -> None: - - # Process prompt logprobs - prompt_logprobs = outputs.prompt_logprobs - if prompt_logprobs is not None and seq_group.sampling_params.detokenize: - self.detokenizer.decode_prompt_logprobs_inplace( - seq_group, prompt_logprobs) - seq_group.prompt_logprobs = prompt_logprobs - - # Process samples - samples = outputs.samples - parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - existing_finished_seqs = seq_group.get_finished_seqs() - parent_child_dict = { - parent_seq.seq_id: [] - for parent_seq in parent_seqs - } - for sample in samples: - parent_child_dict[sample.parent_seq_id].append(sample) - # List of (child, parent) - child_seqs: List[Tuple[Sequence, Sequence]] = [] - - # Process the child samples for each parent sequence - for parent in parent_seqs: - child_samples: List[SequenceOutput] = parent_child_dict[ - parent.seq_id] - if len(child_samples) == 0: - # This parent sequence has no children samples. Remove - # the parent sequence from the sequence group since it will - # not be used in the future iterations. - parent.status = SequenceStatus.FINISHED_ABORTED - seq_group.remove(parent.seq_id) - self.scheduler.free_seq(parent) - continue - # Fork the parent sequence if there are multiple child samples. - for child_sample in child_samples[:-1]: - new_child_seq_id = next(self.seq_counter) - child = parent.fork(new_child_seq_id) - child.append_token_id(child_sample.output_token, - child_sample.logprobs) - child_seqs.append((child, parent)) - # Continue the parent sequence for the last child sample. - # We reuse the parent sequence here to reduce redundant memory - # copies, especially when using non-beam search sampling methods. 
- last_child_sample = child_samples[-1] - parent.append_token_id(last_child_sample.output_token, - last_child_sample.logprobs) - child_seqs.append((parent, parent)) - - for seq, _ in child_seqs: - if seq_group.sampling_params.detokenize: - self.detokenizer.decode_sequence_inplace( - seq, seq_group.sampling_params) - self._check_stop(seq, seq_group.sampling_params) - - # Non-beam search case - if not seq_group.sampling_params.use_beam_search: - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - # NOTE: we need to fork the new sequences before freeing the - # old sequences. - for seq, parent in child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - return - - # Beam search case - # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] - beam_width = seq_group.sampling_params.best_of - length_penalty = seq_group.sampling_params.length_penalty - - # Select the newly finished sequences with the highest scores - # to replace existing finished sequences. - # Tuple of (seq, parent, is_new) - existing_finished_seqs = [(seq, None, False) - for seq in existing_finished_seqs] - new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - if seq.is_finished()] - all_finished_seqs = existing_finished_seqs + new_finished_seqs - # Sort the finished sequences by their scores. - all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - for seq, parent, is_new in all_finished_seqs[:beam_width]: - if is_new: - # A newly generated child sequence finishes and has a high - # score, so we will add it into the sequence group. - selected_child_seqs.append((seq, parent)) - for seq, parent, is_new in all_finished_seqs[beam_width:]: - if is_new: - # A newly generated child sequence finishes but has a low - # score, so we will not add it into the sequence group. - # Additionally, if this sequence is a continuation of a - # parent sequence, we will need remove the parent sequence - # from the sequence group. - unselected_child_seqs.append((seq, parent)) - else: - # An existing finished sequence has a low score, so we will - # remove it from the sequence group. - seq_group.remove(seq.seq_id) - - # select the top beam_width sequences from the running - # sequences for the next iteration to continue the beam - # search. - running_child_seqs = [(seq, parent) for seq, parent in child_seqs - if not seq.is_finished()] - # Sort the running sequences by their scores. - running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - reverse=True) - - # Check if we can stop the beam search. - if len(running_child_seqs) == 0: - # No running sequences, stop the beam search. - stop_beam_search = True - elif len(all_finished_seqs) < beam_width: - # Not enough finished sequences, continue the beam search. 
- stop_beam_search = False - else: - # Check the early stopping criteria - best_running_seq = running_child_seqs[0][0] - current_worst_seq = all_finished_seqs[beam_width - 1][0] - stop_beam_search = self._check_beam_search_early_stopping( - seq_group.sampling_params.early_stopping, - seq_group.sampling_params, best_running_seq, current_worst_seq) - - if stop_beam_search: - # Stop the beam search and remove all the running sequences from - # the sequence group. - unselected_child_seqs.extend(running_child_seqs) - else: - # Continue the beam search and select the top beam_width sequences - # to continue the beam search. - selected_child_seqs.extend(running_child_seqs[:beam_width]) - # The remaining running sequences will not be used in the next - # iteration. Again, if these sequences are continuations of - # parent sequences, we will need to remove the parent sequences - # from the sequence group. - unselected_child_seqs.extend(running_child_seqs[beam_width:]) - - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in selected_child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - for seq, parent in selected_child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - - # Remove the unselected parent sequences from the sequence group and - # free their memory in block manager. - for seq, parent in unselected_child_seqs: - if seq is parent: - # Remove the parent sequence if it is not selected for next - # iteration - seq_group.remove(seq.seq_id) - self.scheduler.free_seq(seq) + #def _process_sequence_group_outputs(self, seq_group: SequenceGroup, + # outputs: SequenceGroupOutput) -> None: + + # # Process prompt logprobs + # prompt_logprobs = outputs.prompt_logprobs + # if prompt_logprobs is not None and seq_group.sampling_params.detokenize: + # self.detokenizer.decode_prompt_logprobs_inplace( + # seq_group, prompt_logprobs) + # seq_group.prompt_logprobs = prompt_logprobs + + # # Process samples + # samples = outputs.samples + # parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + # existing_finished_seqs = seq_group.get_finished_seqs() + # parent_child_dict = { + # parent_seq.seq_id: [] + # for parent_seq in parent_seqs + # } + # for sample in samples: + # parent_child_dict[sample.parent_seq_id].append(sample) + # # List of (child, parent) + # child_seqs: List[Tuple[Sequence, Sequence]] = [] + + # # Process the child samples for each parent sequence + # for parent in parent_seqs: + # child_samples: List[SequenceOutput] = parent_child_dict[ + # parent.seq_id] + # if len(child_samples) == 0: + # # This parent sequence has no children samples. Remove + # # the parent sequence from the sequence group since it will + # # not be used in the future iterations. + # parent.status = SequenceStatus.FINISHED_ABORTED + # seq_group.remove(parent.seq_id) + # self.scheduler.free_seq(parent) + # continue + # # Fork the parent sequence if there are multiple child samples. 
+ # for child_sample in child_samples[:-1]: + # new_child_seq_id = next(self.seq_counter) + # child = parent.fork(new_child_seq_id) + # child.append_token_id(child_sample.output_token, + # child_sample.logprobs) + # child_seqs.append((child, parent)) + # # Continue the parent sequence for the last child sample. + # # We reuse the parent sequence here to reduce redundant memory + # # copies, especially when using non-beam search sampling methods. + # last_child_sample = child_samples[-1] + # parent.append_token_id(last_child_sample.output_token, + # last_child_sample.logprobs) + # child_seqs.append((parent, parent)) + + # for seq, _ in child_seqs: + # if seq_group.sampling_params.detokenize: + # self.detokenizer.decode_sequence_inplace( + # seq, seq_group.sampling_params) + # self._check_stop(seq, seq_group.sampling_params) + + # # Non-beam search case + # if not seq_group.sampling_params.use_beam_search: + # # For newly created child sequences, add them to the sequence group + # # and fork them in block manager if they are not finished. + # for seq, parent in child_seqs: + # if seq is not parent: + # seq_group.add(seq) + # if not seq.is_finished(): + # self.scheduler.fork_seq(parent, seq) + + # # Free the finished and selected parent sequences' memory in block + # # manager. Keep them in the sequence group as candidate output. + # # NOTE: we need to fork the new sequences before freeing the + # # old sequences. + # for seq, parent in child_seqs: + # if seq is parent and seq.is_finished(): + # self.scheduler.free_seq(seq) + # return + + # # Beam search case + # # Select the child sequences to keep in the sequence group. + # selected_child_seqs = [] + # unselected_child_seqs = [] + # beam_width = seq_group.sampling_params.best_of + # length_penalty = seq_group.sampling_params.length_penalty + + # # Select the newly finished sequences with the highest scores + # # to replace existing finished sequences. + # # Tuple of (seq, parent, is_new) + # existing_finished_seqs = [(seq, None, False) + # for seq in existing_finished_seqs] + # new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs + # if seq.is_finished()] + # all_finished_seqs = existing_finished_seqs + new_finished_seqs + # # Sort the finished sequences by their scores. + # all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( + # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + # reverse=True) + # for seq, parent, is_new in all_finished_seqs[:beam_width]: + # if is_new: + # # A newly generated child sequence finishes and has a high + # # score, so we will add it into the sequence group. + # selected_child_seqs.append((seq, parent)) + # for seq, parent, is_new in all_finished_seqs[beam_width:]: + # if is_new: + # # A newly generated child sequence finishes but has a low + # # score, so we will not add it into the sequence group. + # # Additionally, if this sequence is a continuation of a + # # parent sequence, we will need remove the parent sequence + # # from the sequence group. + # unselected_child_seqs.append((seq, parent)) + # else: + # # An existing finished sequence has a low score, so we will + # # remove it from the sequence group. + # seq_group.remove(seq.seq_id) + + # # select the top beam_width sequences from the running + # # sequences for the next iteration to continue the beam + # # search. + # running_child_seqs = [(seq, parent) for seq, parent in child_seqs + # if not seq.is_finished()] + # # Sort the running sequences by their scores. 
+ # running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( + # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + # reverse=True) + + # # Check if we can stop the beam search. + # if len(running_child_seqs) == 0: + # # No running sequences, stop the beam search. + # stop_beam_search = True + # elif len(all_finished_seqs) < beam_width: + # # Not enough finished sequences, continue the beam search. + # stop_beam_search = False + # else: + # # Check the early stopping criteria + # best_running_seq = running_child_seqs[0][0] + # current_worst_seq = all_finished_seqs[beam_width - 1][0] + # stop_beam_search = self._check_beam_search_early_stopping( + # seq_group.sampling_params.early_stopping, + # seq_group.sampling_params, best_running_seq, current_worst_seq) + + # if stop_beam_search: + # # Stop the beam search and remove all the running sequences from + # # the sequence group. + # unselected_child_seqs.extend(running_child_seqs) + # else: + # # Continue the beam search and select the top beam_width sequences + # # to continue the beam search. + # selected_child_seqs.extend(running_child_seqs[:beam_width]) + # # The remaining running sequences will not be used in the next + # # iteration. Again, if these sequences are continuations of + # # parent sequences, we will need to remove the parent sequences + # # from the sequence group. + # unselected_child_seqs.extend(running_child_seqs[beam_width:]) + + # # For newly created child sequences, add them to the sequence group + # # and fork them in block manager if they are not finished. + # for seq, parent in selected_child_seqs: + # if seq is not parent: + # seq_group.add(seq) + # if not seq.is_finished(): + # self.scheduler.fork_seq(parent, seq) + + # # Free the finished and selected parent sequences' memory in block + # # manager. Keep them in the sequence group as candidate output. + # for seq, parent in selected_child_seqs: + # if seq is parent and seq.is_finished(): + # self.scheduler.free_seq(seq) + + # # Remove the unselected parent sequences from the sequence group and + # # free their memory in block manager. + # for seq, parent in unselected_child_seqs: + # if seq is parent: + # # Remove the parent sequence if it is not selected for next + # # iteration + # seq_group.remove(seq.seq_id) + # self.scheduler.free_seq(seq) def _process_model_outputs( self, output: SamplerOutput, @@ -651,14 +660,16 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) + + self.output_processor.process_outputs(seq_group, outputs) - assert len(outputs) > 0 - # TODO can spec decode go through second path? - if len(outputs) > 1: - self._process_sequence_group_outputs_multi_step( - seq_group, outputs) - else: - self._process_sequence_group_outputs(seq_group, outputs[0]) + #assert len(outputs) > 0 + ## TODO can spec decode go through second path? + #if len(outputs) > 1: + # self._process_sequence_group_outputs_multi_step( + # seq_group, outputs) + #else: + # self._process_sequence_group_outputs(seq_group, outputs[0]) # Free the finished sequence groups. 
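        # Per-group token appending and related bookkeeping now go through
        # self.output_processor (constructed above with the detokenizer,
        # scheduler and sequence counter); the engine still frees finished
        # sequence groups here.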
self.scheduler.free_finished_seq_groups() @@ -679,89 +690,89 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs - def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - assert seqs - #if not seqs: - # return [] - - assert len(seqs) == 1, ("Beam search not supported in speculative " - "decoding.") - seq = seqs[0] - - # Since there's only one sequence per sequence group, we can take the - # first sample. - samples = [outputs[step].samples[0] for step in range(len(outputs))] - - # -1 means the output token is not valid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples if sample.output_token != -1 - ] - - # Draft target worker pads all outputs with -1 to have same length. - output_token_ids = [sample.output_token for sample in valid_samples] - #successes = [sample.success for sample in samples] - - ## Truncate to max_tokens if necessary. - #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # seq.get_output_len() + len(output_token_ids)) - #if remaining_tokens < 0: - # valid_samples = valid_samples[:remaining_tokens] - # output_token_ids = output_token_ids[:remaining_tokens] - - ## Truncate any tokens after EOS. This is required as spec decode - ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not seq_group.sampling_params.ignore_eos: - # eos_token_id = self.tokenizer.get_lora_tokenizer( - # seq.lora_request).eos_token_id - # # Avoiding .index calls as exception throwing in the happy path - # # is expensive. - # for i in range(len(output_token_ids)): - # if output_token_ids[i] == eos_token_id: - # output_token_ids = output_token_ids[:i + 1] - # valid_samples = valid_samples[:i + 1] - # break - - #output_logprobs = [sample.logprobs for sample in valid_samples] - - ## Use the last sample for the sequence as it will have - ## the speculation and num_unprocessed_tokens for all the - ## previous samples (they are cumulative when it comes - ## to those two attributes). - #speculation = valid_samples[-1].speculation - #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - - for output_token_id in output_token_ids: - from vllm.sequence import Logprob - seq.append_token_id( - token_id=output_token_id, - logprobs={output_token_id: Logprob(0.0)}, - ) - - #seq.append_token_ids(output_token_ids, - # output_logprobs, - # ) - # #num_unprocessed_tokens=num_unprocessed_tokens) - ##seq.set_last_speculation(speculation) - - #if not all(successes): - # seq.set_status_to_failed() - - #if decode: - # self._decode_sequence(seq, - # seq_group.sampling_params, - # token_ids=seq.get_token_ids(), - # unseen_token_ids=output_token_ids, - # prefix_offset=seq.prefix_offset, - # read_offset=seq.read_offset) - #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # output_token_ids) - # TODO pass output token ids - self._check_stop(seq, seq_group.sampling_params) - if seq.is_finished(): - self.scheduler.free_seq(seq) + #def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + # seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + # assert seqs + # #if not seqs: + # # return [] + + # assert len(seqs) == 1, ("Beam search not supported in speculative " + # "decoding.") + # seq = seqs[0] + + # # Since there's only one sequence per sequence group, we can take the + # # first sample. 
+ # samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # # -1 means the output token is not valid (eg. due to spec decode + # # rejecting tokens). + # valid_samples = [ + # sample for sample in samples if sample.output_token != -1 + # ] + + # # Draft target worker pads all outputs with -1 to have same length. + # output_token_ids = [sample.output_token for sample in valid_samples] + # #successes = [sample.success for sample in samples] + + # ## Truncate to max_tokens if necessary. + # #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # # seq.get_output_len() + len(output_token_ids)) + # #if remaining_tokens < 0: + # # valid_samples = valid_samples[:remaining_tokens] + # # output_token_ids = output_token_ids[:remaining_tokens] + + # ## Truncate any tokens after EOS. This is required as spec decode + # ## generates tokens in fixed blocks, which may go beyond the EOS token. + # #if not seq_group.sampling_params.ignore_eos: + # # eos_token_id = self.tokenizer.get_lora_tokenizer( + # # seq.lora_request).eos_token_id + # # # Avoiding .index calls as exception throwing in the happy path + # # # is expensive. + # # for i in range(len(output_token_ids)): + # # if output_token_ids[i] == eos_token_id: + # # output_token_ids = output_token_ids[:i + 1] + # # valid_samples = valid_samples[:i + 1] + # # break + + # #output_logprobs = [sample.logprobs for sample in valid_samples] + + # ## Use the last sample for the sequence as it will have + # ## the speculation and num_unprocessed_tokens for all the + # ## previous samples (they are cumulative when it comes + # ## to those two attributes). + # #speculation = valid_samples[-1].speculation + # #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + # for output_token_id in output_token_ids: + # from vllm.sequence import Logprob + # seq.append_token_id( + # token_id=output_token_id, + # logprobs={output_token_id: Logprob(0.0)}, + # ) + + # #seq.append_token_ids(output_token_ids, + # # output_logprobs, + # # ) + # # #num_unprocessed_tokens=num_unprocessed_tokens) + # ##seq.set_last_speculation(speculation) + + # #if not all(successes): + # # seq.set_status_to_failed() + + # #if decode: + # # self._decode_sequence(seq, + # # seq_group.sampling_params, + # # token_ids=seq.get_token_ids(), + # # unseen_token_ids=output_token_ids, + # # prefix_offset=seq.prefix_offset, + # # read_offset=seq.read_offset) + # #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # # output_token_ids) + # # TODO pass output token ids + # self._check_stop(seq, seq_group.sampling_params) + # if seq.is_finished(): + # self.scheduler.free_seq(seq) def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. 
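The hunk above replaces the engine's inline handling of sampler output with a single call into a pluggable output processor, and the new files added below supply the two implementations (single-step beam search and multi-step block decode) plus the factory that selects between them. As a rough orientation for the refactor, here is a minimal, self-contained Python sketch of that delegation pattern; every name in it (SeqGroup, SingleStepProcessor, MultiStepProcessor, create_output_processor) is a simplified, hypothetical stand-in for illustration only, not the vLLM API.

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List


@dataclass
class SeqGroup:
    """Toy stand-in for a sequence group: an id plus its generated tokens."""
    seq_id: int
    tokens: List[int] = field(default_factory=list)


class OutputProcessor(ABC):
    """Strategy interface: apply one scheduling step's sampler output to a group."""

    @abstractmethod
    def process_outputs(self, seq_group: SeqGroup,
                        outputs: List[List[int]]) -> None:
        raise NotImplementedError


class SingleStepProcessor(OutputProcessor):
    """One token per sequence per step (the path that can support beam search)."""

    def process_outputs(self, seq_group: SeqGroup,
                        outputs: List[List[int]]) -> None:
        assert len(outputs) == 1, "single-step path expects exactly one step"
        seq_group.tokens.extend(outputs[0])


class MultiStepProcessor(OutputProcessor):
    """Several tokens per step, e.g. speculative decoding; -1 marks invalid tokens."""

    def process_outputs(self, seq_group: SeqGroup,
                        outputs: List[List[int]]) -> None:
        for step in outputs:
            # Skip -1 padding (tokens rejected during speculative decoding).
            seq_group.tokens.extend(t for t in step if t != -1)


def create_output_processor(num_lookahead_slots: int) -> OutputProcessor:
    """Factory keyed on lookahead slots, mirroring the switch in interfaces.py."""
    if num_lookahead_slots == 0:
        return SingleStepProcessor()
    return MultiStepProcessor()


if __name__ == "__main__":
    group = SeqGroup(seq_id=0)
    processor = create_output_processor(num_lookahead_slots=4)
    # Two speculative steps; -1 entries are draft tokens the target model rejected.
    processor.process_outputs(group, [[11, 12, -1], [13, -1, -1]])
    print(group.tokens)  # -> [11, 12, 13]

The real processors additionally detokenize, run the shared stop checks, and fork or free sequences through the scheduler; the num_lookahead_slots == 0 test above is the same selection criterion used by SequenceGroupOutputProcessor.create_output_processor in the interfaces.py file introduced below.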
diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py new file mode 100644 index 000000000000..5f823b5c5c72 --- /dev/null +++ b/vllm/engine/output_processor/beam_search.py @@ -0,0 +1,321 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) + + +class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): + + def __init__( + self, + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + self.scheduler_config = scheduler_config + self.detokenizer = detokenizer + self.scheduler = scheduler + self.seq_counter = seq_counter + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" + return self._process_sequence_group_outputs(sequence_group, outputs[0]) + + def _check_beam_search_early_stopping( + self, + early_stopping: Union[bool, str], + sampling_params: SamplingParams, + best_running_seq: Sequence, + current_worst_seq: Sequence, + ) -> bool: + assert sampling_params.use_beam_search + length_penalty = sampling_params.length_penalty + if early_stopping is True: + return True + + current_worst_score = current_worst_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=current_worst_seq.eos_token_id) + if early_stopping is False: + highest_attainable_score = best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id) + else: + assert early_stopping == "never" + if length_penalty > 0.0: + # If length_penalty > 0.0, beam search will prefer longer + # sequences. The highest attainable score calculation is + # based on the longest possible sequence length in this case. 
+ max_possible_length = max( + best_running_seq.get_prompt_len() + + sampling_params.max_tokens, + self.scheduler_config.max_model_len) + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id, + seq_len=max_possible_length)) + else: + # Otherwise, beam search will prefer shorter sequences. The + # highest attainable score calculation is based on the current + # sequence length. + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id)) + return current_worst_score >= highest_attainable_score + + def _process_sequence_group_outputs(self, seq_group: SequenceGroup, + outputs: SequenceGroupOutput) -> None: + + # Process prompt logprobs + prompt_logprobs = outputs.prompt_logprobs + if prompt_logprobs is not None and seq_group.sampling_params.detokenize: + self.detokenizer.decode_prompt_logprobs_inplace( + seq_group, prompt_logprobs) + seq_group.prompt_logprobs = prompt_logprobs + + # Process samples + samples = outputs.samples + parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + existing_finished_seqs = seq_group.get_finished_seqs() + parent_child_dict = { + parent_seq.seq_id: [] + for parent_seq in parent_seqs + } + for sample in samples: + parent_child_dict[sample.parent_seq_id].append(sample) + # List of (child, parent) + child_seqs: List[Tuple[Sequence, Sequence]] = [] + + # Process the child samples for each parent sequence + for parent in parent_seqs: + child_samples: List[SequenceOutput] = parent_child_dict[ + parent.seq_id] + if len(child_samples) == 0: + # This parent sequence has no children samples. Remove + # the parent sequence from the sequence group since it will + # not be used in the future iterations. + parent.status = SequenceStatus.FINISHED_ABORTED + seq_group.remove(parent.seq_id) + self.scheduler.free_seq(parent) + continue + # Fork the parent sequence if there are multiple child samples. + for child_sample in child_samples[:-1]: + new_child_seq_id = next(self.seq_counter) + child = parent.fork(new_child_seq_id) + child.append_token_id(child_sample.output_token, + child_sample.logprobs) + child_seqs.append((child, parent)) + # Continue the parent sequence for the last child sample. + # We reuse the parent sequence here to reduce redundant memory + # copies, especially when using non-beam search sampling methods. + last_child_sample = child_samples[-1] + parent.append_token_id(last_child_sample.output_token, + last_child_sample.logprobs) + child_seqs.append((parent, parent)) + + for seq, _ in child_seqs: + if seq_group.sampling_params.detokenize: + self.detokenizer.decode_sequence_inplace( + seq, seq_group.sampling_params) + self._check_stop(seq, seq_group.sampling_params) + + # Non-beam search case + if not seq_group.sampling_params.use_beam_search: + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. + for seq, parent in child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. + # NOTE: we need to fork the new sequences before freeing the + # old sequences. 
+ for seq, parent in child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + return + + # Beam search case + # Select the child sequences to keep in the sequence group. + selected_child_seqs = [] + unselected_child_seqs = [] + beam_width = seq_group.sampling_params.best_of + length_penalty = seq_group.sampling_params.length_penalty + + # Select the newly finished sequences with the highest scores + # to replace existing finished sequences. + # Tuple of (seq, parent, is_new) + existing_finished_seqs = [(seq, None, False) + for seq in existing_finished_seqs] + new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs + if seq.is_finished()] + all_finished_seqs = existing_finished_seqs + new_finished_seqs + # Sort the finished sequences by their scores. + all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + reverse=True) + for seq, parent, is_new in all_finished_seqs[:beam_width]: + if is_new: + # A newly generated child sequence finishes and has a high + # score, so we will add it into the sequence group. + selected_child_seqs.append((seq, parent)) + for seq, parent, is_new in all_finished_seqs[beam_width:]: + if is_new: + # A newly generated child sequence finishes but has a low + # score, so we will not add it into the sequence group. + # Additionally, if this sequence is a continuation of a + # parent sequence, we will need remove the parent sequence + # from the sequence group. + unselected_child_seqs.append((seq, parent)) + else: + # An existing finished sequence has a low score, so we will + # remove it from the sequence group. + seq_group.remove(seq.seq_id) + + # select the top beam_width sequences from the running + # sequences for the next iteration to continue the beam + # search. + running_child_seqs = [(seq, parent) for seq, parent in child_seqs + if not seq.is_finished()] + # Sort the running sequences by their scores. + running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), + reverse=True) + + # Check if we can stop the beam search. + if len(running_child_seqs) == 0: + # No running sequences, stop the beam search. + stop_beam_search = True + elif len(all_finished_seqs) < beam_width: + # Not enough finished sequences, continue the beam search. + stop_beam_search = False + else: + # Check the early stopping criteria + best_running_seq = running_child_seqs[0][0] + current_worst_seq = all_finished_seqs[beam_width - 1][0] + stop_beam_search = self._check_beam_search_early_stopping( + seq_group.sampling_params.early_stopping, + seq_group.sampling_params, best_running_seq, current_worst_seq) + + if stop_beam_search: + # Stop the beam search and remove all the running sequences from + # the sequence group. + unselected_child_seqs.extend(running_child_seqs) + else: + # Continue the beam search and select the top beam_width sequences + # to continue the beam search. + selected_child_seqs.extend(running_child_seqs[:beam_width]) + # The remaining running sequences will not be used in the next + # iteration. Again, if these sequences are continuations of + # parent sequences, we will need to remove the parent sequences + # from the sequence group. + unselected_child_seqs.extend(running_child_seqs[beam_width:]) + + # For newly created child sequences, add them to the sequence group + # and fork them in block manager if they are not finished. 
+ for seq, parent in selected_child_seqs: + if seq is not parent: + seq_group.add(seq) + if not seq.is_finished(): + self.scheduler.fork_seq(parent, seq) + + # Free the finished and selected parent sequences' memory in block + # manager. Keep them in the sequence group as candidate output. + for seq, parent in selected_child_seqs: + if seq is parent and seq.is_finished(): + self.scheduler.free_seq(seq) + + # Remove the unselected parent sequences from the sequence group and + # free their memory in block manager. + for seq, parent in unselected_child_seqs: + if seq is parent: + # Remove the parent sequence if it is not selected for next + # iteration + seq_group.remove(seq.seq_id) + self.scheduler.free_seq(seq) + + def _check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. + if seq.get_output_len() >= int(sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. 
+ seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py new file mode 100644 index 000000000000..f11520d3a7e9 --- /dev/null +++ b/vllm/engine/output_processor/block_decode.py @@ -0,0 +1,186 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) + + +class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): + + def __init__( + self, + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + self.scheduler_config = scheduler_config + self.detokenizer = detokenizer + self.scheduler = scheduler + self.seq_counter = seq_counter + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) + + def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): + seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) + + assert seqs + #if not seqs: + # return [] + + assert len(seqs) == 1, ("Beam search not supported in speculative " + "decoding.") + seq = seqs[0] + + # Since there's only one sequence per sequence group, we can take the + # first sample. + samples = [outputs[step].samples[0] for step in range(len(outputs))] + + # -1 means the output token is not valid (eg. due to spec decode + # rejecting tokens). + valid_samples = [ + sample for sample in samples if sample.output_token != -1 + ] + + # Draft target worker pads all outputs with -1 to have same length. + output_token_ids = [sample.output_token for sample in valid_samples] + #successes = [sample.success for sample in samples] + + ## Truncate to max_tokens if necessary. + #remaining_tokens = seq_group.sampling_params.max_tokens - ( + # seq.get_output_len() + len(output_token_ids)) + #if remaining_tokens < 0: + # valid_samples = valid_samples[:remaining_tokens] + # output_token_ids = output_token_ids[:remaining_tokens] + + ## Truncate any tokens after EOS. This is required as spec decode + ## generates tokens in fixed blocks, which may go beyond the EOS token. 
+ #if not seq_group.sampling_params.ignore_eos: + # eos_token_id = self.tokenizer.get_lora_tokenizer( + # seq.lora_request).eos_token_id + # # Avoiding .index calls as exception throwing in the happy path + # # is expensive. + # for i in range(len(output_token_ids)): + # if output_token_ids[i] == eos_token_id: + # output_token_ids = output_token_ids[:i + 1] + # valid_samples = valid_samples[:i + 1] + # break + + #output_logprobs = [sample.logprobs for sample in valid_samples] + + ## Use the last sample for the sequence as it will have + ## the speculation and num_unprocessed_tokens for all the + ## previous samples (they are cumulative when it comes + ## to those two attributes). + #speculation = valid_samples[-1].speculation + #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens + + for output_token_id in output_token_ids: + from vllm.sequence import Logprob + seq.append_token_id( + token_id=output_token_id, + logprobs={output_token_id: Logprob(0.0)}, + ) + + #seq.append_token_ids(output_token_ids, + # output_logprobs, + # ) + # #num_unprocessed_tokens=num_unprocessed_tokens) + ##seq.set_last_speculation(speculation) + + #if not all(successes): + # seq.set_status_to_failed() + + #if decode: + # self._decode_sequence(seq, + # seq_group.sampling_params, + # token_ids=seq.get_token_ids(), + # unseen_token_ids=output_token_ids, + # prefix_offset=seq.prefix_offset, + # read_offset=seq.read_offset) + #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, + # output_token_ids) + # TODO pass output token ids + self._check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): + self.scheduler.free_seq(seq) + + def _check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. + if seq.get_output_len() >= int(sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. 
+ seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py new file mode 100644 index 000000000000..4d1da960dc41 --- /dev/null +++ b/vllm/engine/output_processor/interfaces.py @@ -0,0 +1,36 @@ +from abc import ABC, abstractmethod +from vllm.config import SchedulerConfig +from vllm.sequence import SequenceGroup, SequenceGroupOutput + +class SequenceGroupOutputProcessor(ABC): + + @staticmethod + def create_output_processor( + scheduler_config: SchedulerConfig, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ): + if scheduler_config.num_lookahead_slots == 0: + from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor + return BeamSearchOutputProcessor( + scheduler_config, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ) + else: + from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor + return BlockDecodeOutputProcessor( + scheduler_config, + detokenizer, + scheduler, + seq_counter, + get_tokenizer_for_seq, + ) + + @abstractmethod + def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + pass From 632b439541021309fbc0f83b78210532e1a94606 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:46:14 -0700 Subject: [PATCH 070/165] fix --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1ac73bc874de..60b0f46b2318 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -926,7 +926,7 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): + if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): # TODO should cap block seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return From 26e7368e95f824fdce6cac30f476529d270ac6ed Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:53:01 -0700 Subject: [PATCH 071/165] dedup stop check --- vllm/engine/llm_engine.py | 104 +++++++------- vllm/engine/output_processor/beam_search.py | 140 ++++++------------- vllm/engine/output_processor/block_decode.py | 57 +------- vllm/engine/output_processor/interfaces.py | 3 + vllm/engine/output_processor/stop_checker.py | 89 ++++++++++++ 5 files changed, 194 insertions(+), 199 deletions(-) create mode 100644 vllm/engine/output_processor/stop_checker.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 60b0f46b2318..570b5eff581d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -26,6 +26,7 @@ usage_message) from vllm.utils import Counter from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -187,6 +188,7 @@ def __init__( self.scheduler, self.seq_counter, self.get_tokenizer_for_seq, + stop_checker=StopChecker(scheduler, self.get_tokenizer_for_seq), ) def _initialize_kv_caches(self) -> None: @@ -917,57 +919,57 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. 
- if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return - - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] + #def _check_stop(self, seq: Sequence, + # sampling_params: SamplingParams) -> None: + # """Stop the finished sequences.""" + # # Check if the sequence has reached max_model_len. + # if seq.get_len() > self.scheduler_config.max_model_len: + # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + # return + + # # Check if the sequence has reached max_tokens. + # if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): + # # TODO should cap block + # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + # return + + # # Check if the minimum number of tokens has been generated yet; + # # skip the stop string/token checks if not + # if seq.get_output_len() < sampling_params.min_tokens: + # return + + # if sampling_params.detokenize: + # for stop_str in sampling_params.stop: + # if seq.output_text.endswith(stop_str): + # self._finalize_sequence(seq, sampling_params, stop_str) + # seq.status = SequenceStatus.FINISHED_STOPPED + # seq.stop_reason = stop_str + # return + # last_token_id = seq.get_last_token_id() + # if last_token_id in sampling_params.stop_token_ids: + # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + # last_token_id) + # self._finalize_sequence(seq, sampling_params, stop_str) + # seq.status = SequenceStatus.FINISHED_STOPPED + # seq.stop_reason = last_token_id + # return + + # # Check if the sequence has generated the EOS token. 
+ # if ((not sampling_params.ignore_eos) + # and seq.get_last_token_id() == seq.eos_token_id): + # seq.status = SequenceStatus.FINISHED_STOPPED + # return + + #def _finalize_sequence(self, seq: Sequence, + # sampling_params: SamplingParams, + # stop_string: str) -> None: + # if sampling_params.include_stop_str_in_output: + # return + + # if stop_string and seq.output_text.endswith(stop_string): + # # Truncate the output text so that the stop string is + # # not included in the output. + # seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 5f823b5c5c72..c9ded1171151 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -39,61 +39,19 @@ def __init__( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) - def _check_beam_search_early_stopping( - self, - early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. 
- highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: @@ -148,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self._check_stop(seq, seq_group.sampling_params) + self.stop_checker.check_stop(seq, seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: @@ -268,54 +226,46 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, seq_group.remove(seq.seq_id) self.scheduler.free_seq(seq) - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return + def _check_beam_search_early_stopping( + self, + early_stopping: Union[bool, str], + sampling_params: SamplingParams, + best_running_seq: Sequence, + current_worst_seq: Sequence, + ) -> bool: + assert sampling_params.use_beam_search + length_penalty = sampling_params.length_penalty + if early_stopping is True: + return True - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] + current_worst_score = current_worst_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=current_worst_seq.eos_token_id) + if early_stopping is False: + highest_attainable_score = best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id) + else: + assert early_stopping == "never" + if length_penalty > 0.0: + # If length_penalty > 0.0, beam search will prefer longer + # sequences. 
The highest attainable score calculation is + # based on the longest possible sequence length in this case. + max_possible_length = max( + best_running_seq.get_prompt_len() + + sampling_params.max_tokens, + self.scheduler_config.max_model_len) + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id, + seq_len=max_possible_length)) + else: + # Otherwise, beam search will prefer shorter sequences. The + # highest attainable score calculation is based on the current + # sequence length. + highest_attainable_score = ( + best_running_seq.get_beam_search_score( + length_penalty=length_penalty, + eos_token_id=best_running_seq.eos_token_id)) + return current_worst_score >= highest_attainable_score diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index f11520d3a7e9..90ad03df32dd 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -39,12 +39,14 @@ def __init__( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq + self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) @@ -129,58 +131,7 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, # output_token_ids) # TODO pass output token ids - self._check_stop(seq, seq_group.sampling_params) + self.stop_checker.check_stop(seq, seq_group.sampling_params) + if seq.is_finished(): self.scheduler.free_seq(seq) - - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() >= int(sampling_params.max_tokens): - # TODO should cap block - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - if sampling_params.detokenize: - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if the sequence has generated the EOS token. 
- if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def _finalize_sequence(self, seq: Sequence, - sampling_params: SamplingParams, - stop_string: str) -> None: - if sampling_params.include_stop_str_in_output: - return - - if stop_string and seq.output_text.endswith(stop_string): - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_string)] diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4d1da960dc41..d2368fc811a0 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -11,6 +11,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ): if scheduler_config.num_lookahead_slots == 0: from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor @@ -20,6 +21,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ) else: from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor @@ -29,6 +31,7 @@ def create_output_processor( scheduler, seq_counter, get_tokenizer_for_seq, + stop_checker, ) @abstractmethod diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py new file mode 100644 index 000000000000..feeef1c0f24a --- /dev/null +++ b/vllm/engine/output_processor/stop_checker.py @@ -0,0 +1,89 @@ +import time +from typing import Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer + +import vllm +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, SpeculativeConfig, + VisionLanguageConfig) +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, + SequenceGroup, SequenceGroupOutput, SequenceOutput, + SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) +from vllm.utils import Counter +from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor + +logger = init_logger(__name__) +_LOCAL_LOGGING_INTERVAL_SEC = 5 + +class StopChecker: + + def __init__(self, scheduler, get_tokenizer_for_seq): + self.scheduler = scheduler + self.get_tokenizer_for_seq = get_tokenizer_for_seq + + def check_stop(self, seq: Sequence, + sampling_params: SamplingParams) -> None: + """Stop the finished sequences.""" + # Check if the sequence has reached max_model_len. + if seq.get_len() > self.scheduler_config.max_model_len: + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the sequence has reached max_tokens. 
+ if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): + # TODO should cap block + seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED + return + + # Check if the minimum number of tokens has been generated yet; + # skip the stop string/token checks if not + if seq.get_output_len() < sampling_params.min_tokens: + return + + if sampling_params.detokenize: + for stop_str in sampling_params.stop: + if seq.output_text.endswith(stop_str): + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = stop_str + return + last_token_id = seq.get_last_token_id() + if last_token_id in sampling_params.stop_token_ids: + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( + last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) + seq.status = SequenceStatus.FINISHED_STOPPED + seq.stop_reason = last_token_id + return + + # Check if the sequence has generated the EOS token. + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): + seq.status = SequenceStatus.FINISHED_STOPPED + return + + def _finalize_sequence(self, seq: Sequence, + sampling_params: SamplingParams, + stop_string: str) -> None: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): + # Truncate the output text so that the stop string is + # not included in the output. + seq.output_text = seq.output_text[:-len(stop_string)] From 06e7c01d3867439289e8f5958cf1bc00be0c305a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:55:20 -0700 Subject: [PATCH 072/165] wip --- vllm/engine/llm_engine.py | 6 +++++- vllm/engine/output_processor/stop_checker.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 570b5eff581d..036709a414c2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -188,7 +188,11 @@ def __init__( self.scheduler, self.seq_counter, self.get_tokenizer_for_seq, - stop_checker=StopChecker(scheduler, self.get_tokenizer_for_seq), + stop_checker=StopChecker( + self.scheduler, + self.scheduler_config, + self.get_tokenizer_for_seq, + ), ) def _initialize_kv_caches(self) -> None: diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index feeef1c0f24a..cc6655b7aaa7 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -32,8 +32,9 @@ class StopChecker: - def __init__(self, scheduler, get_tokenizer_for_seq): + def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler = scheduler + self.scheduler_config = scheduler_config self.get_tokenizer_for_seq = get_tokenizer_for_seq def check_stop(self, seq: Sequence, From 184a52c166ec6eeb75dfedbb544c65188322ece7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:56:18 -0700 Subject: [PATCH 073/165] del --- vllm/engine/llm_engine.py | 352 -------------------------------------- 1 file changed, 352 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 036709a414c2..2be4a260f164 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -420,224 +420,6 @@ def has_unfinished_requests(self) -> bool: """Returns True if there are unfinished requests.""" return self.scheduler.has_unfinished_seqs() - def _check_beam_search_early_stopping( - self, - early_stopping: 
Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=current_worst_seq.eos_token_id) - if early_stopping is False: - highest_attainable_score = best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=best_running_seq.eos_token_id)) - return current_worst_score >= highest_attainable_score - - #def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - # outputs: SequenceGroupOutput) -> None: - - # # Process prompt logprobs - # prompt_logprobs = outputs.prompt_logprobs - # if prompt_logprobs is not None and seq_group.sampling_params.detokenize: - # self.detokenizer.decode_prompt_logprobs_inplace( - # seq_group, prompt_logprobs) - # seq_group.prompt_logprobs = prompt_logprobs - - # # Process samples - # samples = outputs.samples - # parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - # existing_finished_seqs = seq_group.get_finished_seqs() - # parent_child_dict = { - # parent_seq.seq_id: [] - # for parent_seq in parent_seqs - # } - # for sample in samples: - # parent_child_dict[sample.parent_seq_id].append(sample) - # # List of (child, parent) - # child_seqs: List[Tuple[Sequence, Sequence]] = [] - - # # Process the child samples for each parent sequence - # for parent in parent_seqs: - # child_samples: List[SequenceOutput] = parent_child_dict[ - # parent.seq_id] - # if len(child_samples) == 0: - # # This parent sequence has no children samples. Remove - # # the parent sequence from the sequence group since it will - # # not be used in the future iterations. - # parent.status = SequenceStatus.FINISHED_ABORTED - # seq_group.remove(parent.seq_id) - # self.scheduler.free_seq(parent) - # continue - # # Fork the parent sequence if there are multiple child samples. - # for child_sample in child_samples[:-1]: - # new_child_seq_id = next(self.seq_counter) - # child = parent.fork(new_child_seq_id) - # child.append_token_id(child_sample.output_token, - # child_sample.logprobs) - # child_seqs.append((child, parent)) - # # Continue the parent sequence for the last child sample. - # # We reuse the parent sequence here to reduce redundant memory - # # copies, especially when using non-beam search sampling methods. 
- # last_child_sample = child_samples[-1] - # parent.append_token_id(last_child_sample.output_token, - # last_child_sample.logprobs) - # child_seqs.append((parent, parent)) - - # for seq, _ in child_seqs: - # if seq_group.sampling_params.detokenize: - # self.detokenizer.decode_sequence_inplace( - # seq, seq_group.sampling_params) - # self._check_stop(seq, seq_group.sampling_params) - - # # Non-beam search case - # if not seq_group.sampling_params.use_beam_search: - # # For newly created child sequences, add them to the sequence group - # # and fork them in block manager if they are not finished. - # for seq, parent in child_seqs: - # if seq is not parent: - # seq_group.add(seq) - # if not seq.is_finished(): - # self.scheduler.fork_seq(parent, seq) - - # # Free the finished and selected parent sequences' memory in block - # # manager. Keep them in the sequence group as candidate output. - # # NOTE: we need to fork the new sequences before freeing the - # # old sequences. - # for seq, parent in child_seqs: - # if seq is parent and seq.is_finished(): - # self.scheduler.free_seq(seq) - # return - - # # Beam search case - # # Select the child sequences to keep in the sequence group. - # selected_child_seqs = [] - # unselected_child_seqs = [] - # beam_width = seq_group.sampling_params.best_of - # length_penalty = seq_group.sampling_params.length_penalty - - # # Select the newly finished sequences with the highest scores - # # to replace existing finished sequences. - # # Tuple of (seq, parent, is_new) - # existing_finished_seqs = [(seq, None, False) - # for seq in existing_finished_seqs] - # new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - # if seq.is_finished()] - # all_finished_seqs = existing_finished_seqs + new_finished_seqs - # # Sort the finished sequences by their scores. - # all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - # reverse=True) - # for seq, parent, is_new in all_finished_seqs[:beam_width]: - # if is_new: - # # A newly generated child sequence finishes and has a high - # # score, so we will add it into the sequence group. - # selected_child_seqs.append((seq, parent)) - # for seq, parent, is_new in all_finished_seqs[beam_width:]: - # if is_new: - # # A newly generated child sequence finishes but has a low - # # score, so we will not add it into the sequence group. - # # Additionally, if this sequence is a continuation of a - # # parent sequence, we will need remove the parent sequence - # # from the sequence group. - # unselected_child_seqs.append((seq, parent)) - # else: - # # An existing finished sequence has a low score, so we will - # # remove it from the sequence group. - # seq_group.remove(seq.seq_id) - - # # select the top beam_width sequences from the running - # # sequences for the next iteration to continue the beam - # # search. - # running_child_seqs = [(seq, parent) for seq, parent in child_seqs - # if not seq.is_finished()] - # # Sort the running sequences by their scores. - # running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - # length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), - # reverse=True) - - # # Check if we can stop the beam search. - # if len(running_child_seqs) == 0: - # # No running sequences, stop the beam search. - # stop_beam_search = True - # elif len(all_finished_seqs) < beam_width: - # # Not enough finished sequences, continue the beam search. 
- # stop_beam_search = False - # else: - # # Check the early stopping criteria - # best_running_seq = running_child_seqs[0][0] - # current_worst_seq = all_finished_seqs[beam_width - 1][0] - # stop_beam_search = self._check_beam_search_early_stopping( - # seq_group.sampling_params.early_stopping, - # seq_group.sampling_params, best_running_seq, current_worst_seq) - - # if stop_beam_search: - # # Stop the beam search and remove all the running sequences from - # # the sequence group. - # unselected_child_seqs.extend(running_child_seqs) - # else: - # # Continue the beam search and select the top beam_width sequences - # # to continue the beam search. - # selected_child_seqs.extend(running_child_seqs[:beam_width]) - # # The remaining running sequences will not be used in the next - # # iteration. Again, if these sequences are continuations of - # # parent sequences, we will need to remove the parent sequences - # # from the sequence group. - # unselected_child_seqs.extend(running_child_seqs[beam_width:]) - - # # For newly created child sequences, add them to the sequence group - # # and fork them in block manager if they are not finished. - # for seq, parent in selected_child_seqs: - # if seq is not parent: - # seq_group.add(seq) - # if not seq.is_finished(): - # self.scheduler.fork_seq(parent, seq) - - # # Free the finished and selected parent sequences' memory in block - # # manager. Keep them in the sequence group as candidate output. - # for seq, parent in selected_child_seqs: - # if seq is parent and seq.is_finished(): - # self.scheduler.free_seq(seq) - - # # Remove the unselected parent sequences from the sequence group and - # # free their memory in block manager. - # for seq, parent in unselected_child_seqs: - # if seq is parent: - # # Remove the parent sequence if it is not selected for next - # # iteration - # seq_group.remove(seq.seq_id) - # self.scheduler.free_seq(seq) - def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: @@ -696,89 +478,6 @@ def _process_model_outputs( self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs - #def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - # seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - # assert seqs - # #if not seqs: - # # return [] - - # assert len(seqs) == 1, ("Beam search not supported in speculative " - # "decoding.") - # seq = seqs[0] - - # # Since there's only one sequence per sequence group, we can take the - # # first sample. - # samples = [outputs[step].samples[0] for step in range(len(outputs))] - - # # -1 means the output token is not valid (eg. due to spec decode - # # rejecting tokens). - # valid_samples = [ - # sample for sample in samples if sample.output_token != -1 - # ] - - # # Draft target worker pads all outputs with -1 to have same length. - # output_token_ids = [sample.output_token for sample in valid_samples] - # #successes = [sample.success for sample in samples] - - # ## Truncate to max_tokens if necessary. - # #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # # seq.get_output_len() + len(output_token_ids)) - # #if remaining_tokens < 0: - # # valid_samples = valid_samples[:remaining_tokens] - # # output_token_ids = output_token_ids[:remaining_tokens] - - # ## Truncate any tokens after EOS. This is required as spec decode - # ## generates tokens in fixed blocks, which may go beyond the EOS token. 
- # #if not seq_group.sampling_params.ignore_eos: - # # eos_token_id = self.tokenizer.get_lora_tokenizer( - # # seq.lora_request).eos_token_id - # # # Avoiding .index calls as exception throwing in the happy path - # # # is expensive. - # # for i in range(len(output_token_ids)): - # # if output_token_ids[i] == eos_token_id: - # # output_token_ids = output_token_ids[:i + 1] - # # valid_samples = valid_samples[:i + 1] - # # break - - # #output_logprobs = [sample.logprobs for sample in valid_samples] - - # ## Use the last sample for the sequence as it will have - # ## the speculation and num_unprocessed_tokens for all the - # ## previous samples (they are cumulative when it comes - # ## to those two attributes). - # #speculation = valid_samples[-1].speculation - # #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - - # for output_token_id in output_token_ids: - # from vllm.sequence import Logprob - # seq.append_token_id( - # token_id=output_token_id, - # logprobs={output_token_id: Logprob(0.0)}, - # ) - - # #seq.append_token_ids(output_token_ids, - # # output_logprobs, - # # ) - # # #num_unprocessed_tokens=num_unprocessed_tokens) - # ##seq.set_last_speculation(speculation) - - # #if not all(successes): - # # seq.set_status_to_failed() - - # #if decode: - # # self._decode_sequence(seq, - # # seq_group.sampling_params, - # # token_ids=seq.get_token_ids(), - # # unseen_token_ids=output_token_ids, - # # prefix_offset=seq.prefix_offset, - # # read_offset=seq.read_offset) - # #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # # output_token_ids) - # # TODO pass output token ids - # self._check_stop(seq, seq_group.sampling_params) - # if seq.is_finished(): - # self.scheduler.free_seq(seq) def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -923,57 +622,6 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - #def _check_stop(self, seq: Sequence, - # sampling_params: SamplingParams) -> None: - # """Stop the finished sequences.""" - # # Check if the sequence has reached max_model_len. - # if seq.get_len() > self.scheduler_config.max_model_len: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the sequence has reached max_tokens. - # if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # # TODO should cap block - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the minimum number of tokens has been generated yet; - # # skip the stop string/token checks if not - # if seq.get_output_len() < sampling_params.min_tokens: - # return - - # if sampling_params.detokenize: - # for stop_str in sampling_params.stop: - # if seq.output_text.endswith(stop_str): - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_str - # return - # last_token_id = seq.get_last_token_id() - # if last_token_id in sampling_params.stop_token_ids: - # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - # last_token_id) - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = last_token_id - # return - - # # Check if the sequence has generated the EOS token. 
- # if ((not sampling_params.ignore_eos) - # and seq.get_last_token_id() == seq.eos_token_id): - # seq.status = SequenceStatus.FINISHED_STOPPED - # return - - #def _finalize_sequence(self, seq: Sequence, - # sampling_params: SamplingParams, - # stop_string: str) -> None: - # if sampling_params.include_stop_str_in_output: - # return - - # if stop_string and seq.output_text.endswith(stop_string): - # # Truncate the output text so that the stop string is - # # not included in the output. - # seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) From 34468fe8af84d0a2bd313e9b4dc06582e17c1458 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 16:57:51 -0700 Subject: [PATCH 074/165] rename --- vllm/engine/output_processor/beam_search.py | 2 +- vllm/engine/output_processor/block_decode.py | 2 +- vllm/engine/output_processor/stop_checker.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index c9ded1171151..829c5ecd7839 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -106,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.check_stop(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 90ad03df32dd..44b4efba6372 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -131,7 +131,7 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, # output_token_ids) # TODO pass output token ids - self.stop_checker.check_stop(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index cc6655b7aaa7..82973e304202 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -37,7 +37,7 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler_config = scheduler_config self.get_tokenizer_for_seq = get_tokenizer_for_seq - def check_stop(self, seq: Sequence, + def maybe_stop_sequence(self, seq: Sequence, sampling_params: SamplingParams) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. 
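The commits that follow route multi-step results through the engine: a single scheduler iteration may now produce several SamplerOutput objects (one per decoded step), which must be regrouped per sequence group before output processing. Below is a minimal, self-contained sketch of that regrouping using illustrative toy types rather than vLLM's real SamplerOutput/SequenceGroupOutput classes; the real helper, create_output_by_sequence_group in vllm/engine/output_processor/util.py, is introduced in the next commit.

# Sketch only: regroup per-step outputs so each sequence group sees the full
# list of its outputs across all steps. Toy types stand in for vLLM classes.
from typing import List


class ToyGroupOutput:

    def __init__(self, token: int) -> None:
        self.token = token


def regroup_by_sequence_group(
        steps: List[List[ToyGroupOutput]],
        num_seq_groups: int) -> List[List[ToyGroupOutput]]:
    # steps[s][g] is the output for sequence group g at decode step s.
    grouped: List[List[ToyGroupOutput]] = [[] for _ in range(num_seq_groups)]
    for step in steps:
        for i, group_output in enumerate(step):
            grouped[i].append(group_output)
    return grouped


if __name__ == "__main__":
    # Two sequence groups, three decode steps (as in block/speculative decode).
    steps = [[ToyGroupOutput(100 * s + g) for g in range(2)] for s in range(3)]
    grouped = regroup_by_sequence_group(steps, num_seq_groups=2)
    assert [o.token for o in grouped[0]] == [0, 100, 200]
    assert [o.token for o in grouped[1]] == [1, 101, 201]
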
From 208c4671593534e9a2f9ed7f64da80c5a74a4fb4 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:10:05 -0700 Subject: [PATCH 075/165] wip --- vllm/engine/llm_engine.py | 23 +++++------------------ vllm/engine/output_processor/util.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 18 deletions(-) create mode 100644 vllm/engine/output_processor/util.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2be4a260f164..86ba02023627 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -27,6 +27,7 @@ from vllm.utils import Counter from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.engine.output_processor.util import create_output_by_sequence_group logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -424,6 +425,9 @@ def _process_model_outputs( self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + now = time.time() + + # TODO if self.speculative_config is None: all_output = [output] else: @@ -431,34 +435,17 @@ def _process_model_outputs( scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # Organize list of sampler output by sequence group. - output_by_sequence_group: List[List[SequenceGroupOutputs]] = [ - [] for _ in scheduled_seq_groups - ] - for step in all_output: - for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) - - now = time.time() + output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): - seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) self.output_processor.process_outputs(seq_group, outputs) - #assert len(outputs) > 0 - ## TODO can spec decode go through second path? - #if len(outputs) > 1: - # self._process_sequence_group_outputs_multi_step( - # seq_group, outputs) - #else: - # self._process_sequence_group_outputs(seq_group, outputs[0]) - # Free the finished sequence groups. 
self.scheduler.free_finished_seq_groups() diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py new file mode 100644 index 000000000000..1fcd651deef1 --- /dev/null +++ b/vllm/engine/output_processor/util.py @@ -0,0 +1,12 @@ +from vllm.sequence import SequenceGroupOutput, SamplerOutput +from typing import List + +def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): + output_by_sequence_group = [ + [] for _ in range(num_seq_groups) + ] + for step in sampler_outputs: + for i, sequence_group_output in enumerate(step): + output_by_sequence_group[i].append(sequence_group_output) + + return output_by_sequence_group From 3c6abcc564bafc242316797ccbed1e10db54dff7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:14:22 -0700 Subject: [PATCH 076/165] wip --- vllm/engine/llm_engine.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 86ba02023627..72af9c3da9f7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -423,7 +423,8 @@ def has_unfinished_requests(self) -> bool: def _process_model_outputs( self, output: SamplerOutput, - scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: + scheduled_seq_groups: List[SequenceGroup], + ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() @@ -433,8 +434,6 @@ def _process_model_outputs( else: all_output = output - scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. @@ -456,13 +455,9 @@ def _process_model_outputs( seq_group.maybe_set_first_token_time(now) request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - for seq_group in scheduler_outputs.ignored_seq_groups: + for seq_group in ignored_seq_groups: request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - - # Log stats. - if self.log_stats: - self.stat_logger.log(self._get_stats(scheduler_outputs)) return request_outputs @@ -529,7 +524,13 @@ def step(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs) + request_outputs = self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + + # Log stats. 
+ if self.log_stats: + self.stat_logger.log(self._get_stats(scheduler_outputs)) + + return request_outputs def do_log_stats(self) -> None: """Forced log when no requests active.""" From bbbcef70d603ab791ecc62336a56ef25b1566d33 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:27:24 -0700 Subject: [PATCH 077/165] wip --- tests/spec_decode/e2e/test_correctness.py | 2 +- vllm/engine/llm_engine.py | 11 +++-------- vllm/executor/cpu_executor.py | 2 +- vllm/executor/executor_base.py | 5 +++-- vllm/executor/gpu_executor.py | 2 +- vllm/spec_decode/multi_step_worker.py | 2 ++ vllm/spec_decode/spec_decode_worker.py | 2 ++ vllm/worker/cpu_worker.py | 8 +++++--- vllm/worker/neuron_worker.py | 9 ++++++--- vllm/worker/worker.py | 9 ++++++--- vllm/worker/worker_base.py | 5 +++-- 11 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d8b09ce5b77a..eb6d1e1c5ddd 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -89,7 +89,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): { # Expect failure as spec decode not supported by # Ray backend. - "tensor_parallel_size": 2, + "worker_use_ray": True, }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 72af9c3da9f7..bce36ddccc81 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -422,19 +422,14 @@ def has_unfinished_requests(self) -> bool: return self.scheduler.has_unfinished_seqs() def _process_model_outputs( - self, output: SamplerOutput, + self, + output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() - # TODO - if self.speculative_config is None: - all_output = [output] - else: - all_output = output - - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=all_output, num_seq_groups=len(scheduled_seq_groups)) + output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. 
for scheduled_seq_group, outputs in zip(scheduled_seq_groups, diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 835ba18ab756..f308f9149475 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -81,7 +81,7 @@ def execute_model(self, blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], - num_lookahead_slots: int) -> SamplerOutput: + num_lookahead_slots: int) -> List[SamplerOutput]: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index c18edd75d7a4..23927c113744 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -58,8 +58,9 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - """Executes one model step on the given sequences.""" + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: + """Executes at least one model step on the given sequences.""" raise NotImplementedError @abstractmethod diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ac445cd51a7e..90a534dc1271 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -157,7 +157,7 @@ def execute_model( blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], num_lookahead_slots: int, - ) -> SamplerOutput: + ) -> List[SamplerOutput]: output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 0ac189a7bacc..4cdbe0923455 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -70,6 +70,8 @@ def execute_model_multi_step( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) + assert (len(model_output) == 1), "composing multistep workers not supported" + model_output = model_output[0] self._append_new_tokens(model_output, copied_seq_group_metadata_list) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 3e33371edadf..894377c9421e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,6 +196,8 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) + assert len(sampler_output) == 1, "expected single output from scorer worker" + sampler_output = sampler_output[0] # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. 
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bd67f9f8850a..09a37c25783a 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -257,7 +257,7 @@ def execute_model( blocks_to_swap_in: Optional[Dict[int, int]] = None, blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) @@ -280,11 +280,13 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.cpu_cache) - return output + + # CPU worker only supports single-step execution. + return [output] def init_distributed_environment(self) -> None: """Initialize the distributed environment.""" diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 6136d50d0c06..d0f01b893bc6 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -73,15 +73,18 @@ def initialize_cache(self, num_gpu_blocks: int, def execute_model( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: num_seq_groups = len(seq_group_metadata_list) # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list) - return output + + # Neuron worker only supports single-step output. Wrap the output in a + # list to conform to interface. + return [output] def get_cache_block_size_bytes(self) -> int: """Determine the size in bytes of a cache block. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index cb30f658482b..95e62b9e6a75 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -206,7 +206,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, - ) -> Optional[SamplerOutput]: + ) -> List[SamplerOutput]: if self.is_driver_worker: assert seq_group_metadata_list is not None @@ -232,11 +232,14 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) - return output + + # Worker only supports single-step execution. Wrap the output in a list + # to conform to interface. 
+ return [output] def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e3027c406ffe..1481a4c2eef4 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -44,8 +44,9 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - """Executes one model step on the given sequences.""" + blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: + """Executes at least one model step on the given sequences, unless no + sequences are provided.""" raise NotImplementedError @abstractmethod From b58762d4fa0f64eb29af5a649650d6293c5d988f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:29:07 -0700 Subject: [PATCH 078/165] fix --- vllm/spec_decode/batch_expansion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index bba3c4733e4f..f7bac45861a7 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -86,6 +86,8 @@ def score_proposals( blocks_to_copy=blocks_to_copy, #return_python_output=False ) + assert len(target_sampler_output) == 1, "expected single-step output" + target_sampler_output = target_sampler_output[0] all_tokens, all_probs = self._contract_batch( original_bs=len(seq_group_metadata_list), From 8b500d404b81b10857f75503e312ecf44ee9dd9f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 17:43:04 -0700 Subject: [PATCH 079/165] wip --- vllm/engine/output_processor/block_decode.py | 67 ++++++-------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 44b4efba6372..3fb2b7ee3235 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -18,7 +18,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) + SequenceStatus, Logprob) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -49,17 +49,10 @@ def __init__( self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: - return self._process_sequence_group_outputs_multi_step(sequence_group, outputs) + seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - - assert seqs - #if not seqs: - # return [] - - assert len(seqs) == 1, ("Beam search not supported in speculative " - "decoding.") + assert seqs, "expected running sequences" + assert len(seqs) == 1, ("Beam search not supported in block decoding.") seq = seqs[0] # Since there's only one sequence per sequence group, we can take the @@ -71,21 +64,23 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): valid_samples = [ sample for sample in samples if sample.output_token != -1 ] + assert valid_samples + + self._process_seq_outputs(seq, valid_samples, sequence_group.sampling_params) - # Draft target worker pads all outputs with 
-1 to have same length. + def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput], sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] - #successes = [sample.success for sample in samples] - ## Truncate to max_tokens if necessary. - #remaining_tokens = seq_group.sampling_params.max_tokens - ( - # seq.get_output_len() + len(output_token_ids)) - #if remaining_tokens < 0: - # valid_samples = valid_samples[:remaining_tokens] - # output_token_ids = output_token_ids[:remaining_tokens] + # Truncate to max_tokens if necessary. + remaining_tokens = sampling_params.max_tokens - ( + seq.get_output_len() + len(output_token_ids)) + if remaining_tokens < 0: + valid_samples = valid_samples[:remaining_tokens] + output_token_ids = output_token_ids[:remaining_tokens] ## Truncate any tokens after EOS. This is required as spec decode ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not seq_group.sampling_params.ignore_eos: + #if not sampling_params.ignore_eos: # eos_token_id = self.tokenizer.get_lora_tokenizer( # seq.lora_request).eos_token_id # # Avoiding .index calls as exception throwing in the happy path @@ -96,42 +91,16 @@ def _process_sequence_group_outputs_multi_step(self, seq_group, outputs): # valid_samples = valid_samples[:i + 1] # break - #output_logprobs = [sample.logprobs for sample in valid_samples] - - ## Use the last sample for the sequence as it will have - ## the speculation and num_unprocessed_tokens for all the - ## previous samples (they are cumulative when it comes - ## to those two attributes). - #speculation = valid_samples[-1].speculation - #num_unprocessed_tokens = valid_samples[-1].num_unprocessed_tokens - for output_token_id in output_token_ids: - from vllm.sequence import Logprob seq.append_token_id( token_id=output_token_id, + # TODO emit logprobs in block decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) - #seq.append_token_ids(output_token_ids, - # output_logprobs, - # ) - # #num_unprocessed_tokens=num_unprocessed_tokens) - ##seq.set_last_speculation(speculation) - - #if not all(successes): - # seq.set_status_to_failed() - - #if decode: - # self._decode_sequence(seq, - # seq_group.sampling_params, - # token_ids=seq.get_token_ids(), - # unseen_token_ids=output_token_ids, - # prefix_offset=seq.prefix_offset, - # read_offset=seq.read_offset) - #self._check_stop(seq, seq_group.sampling_params, seq.lora_request, - # output_token_ids) + # TODO detokenize # TODO pass output token ids - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, sampling_params) if seq.is_finished(): self.scheduler.free_seq(seq) From 782ce22d604291a64ac6dce3efbb9b4c662c0557 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:26:30 -0700 Subject: [PATCH 080/165] unit tests for block decode --- tests/core/utils.py | 16 +- .../output_processor/test_block_decode.py | 238 ++++++++++++++++++ vllm/engine/output_processor/beam_search.py | 2 - vllm/engine/output_processor/block_decode.py | 27 +- vllm/engine/output_processor/interfaces.py | 5 +- 5 files changed, 262 insertions(+), 26 deletions(-) create mode 100644 tests/engine/output_processor/test_block_decode.py diff --git a/tests/core/utils.py b/tests/core/utils.py index fbbdb07cb8e6..d9d2eeaee1b9 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional, Tuple +from typing import Optional, Tuple, Iterable from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -31,14 +31,18 @@ def create_dummy_prompt( def create_seq_group( - seq_prompt_len=1024, - seq_output_lens=(128, ), - request_id='0', - seq_id_start=0, + seq_prompt_len: int=1024, + seq_output_lens: Iterable[int]=(128, ), + request_id: str='0', + seq_id_start: int=0, + sampling_params: Optional[SamplingParams] = None ) -> SequenceGroup: assert len(seq_output_lens) > 0 + if sampling_params is None: + sampling_params = SamplingParams() + prompt_token_ids = [0] * seq_prompt_len seqs = [] @@ -60,7 +64,7 @@ def create_seq_group( seq_group = SequenceGroup( request_id=request_id, seqs=seqs, - sampling_params=SamplingParams(), + sampling_params=sampling_params, arrival_time=time.time(), ) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py new file mode 100644 index 000000000000..aae184c16447 --- /dev/null +++ b/tests/engine/output_processor/test_block_decode.py @@ -0,0 +1,238 @@ +import pytest +from unittest.mock import MagicMock +import random + +from transformers import PreTrainedTokenizer + +from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.utils import Counter +from vllm.sequence import SequenceStatus, SequenceGroupOutput, SequenceOutput, Logprob +from vllm.sampling_params import SamplingParams +from tests.core.utils import create_seq_group + +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [1, 12]) +@pytest.mark.skip_global_cleanup +def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = 
MagicMock(spec=StopChecker) + seq_counter = Counter() + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=1024, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + max_tokens=seq_output_len + num_new_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids + output_processor.process_outputs(seq_group, outputs) + assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) +@pytest.mark.parametrize("max_tokens", [128 + 3]) +@pytest.mark.skip_global_cleanup +def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, max_tokens: int): + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + max_tokens=max_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to not go over max tokens in len. + assert seq.get_len() == seq_prompt_len + max_tokens + + # Expect the correct tokens were appended. 
+ expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [12]) +@pytest.mark.parametrize("seed", list(range(6))) +@pytest.mark.skip_global_cleanup +def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + random.seed(seed) + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + eos_token_id = 100 + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + # Ensure enough space. + max_tokens=seq_output_len + num_new_tokens, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + assert eos_token_id not in new_token_ids + eos_index = random.randint(0, len(new_token_ids) - 1) + new_token_ids[eos_index] = eos_token_id + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to not go beyond provided eos. + assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) + + # Expect the correct tokens were appended. + expected_appended_tokens = new_token_ids[:eos_index+1] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +@pytest.mark.parametrize("seq_prompt_len", [1024]) +@pytest.mark.parametrize("seq_output_len", [128]) +@pytest.mark.parametrize("num_new_tokens", [12]) +@pytest.mark.parametrize("seed", list(range(6))) +@pytest.mark.skip_global_cleanup +def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + random.seed(seed) + detokenizer = MagicMock(spec=Detokenizer) + scheduler = MagicMock(spec=Scheduler) + stop_checker = MagicMock(spec=StopChecker) + seq_counter = Counter() + + eos_token_id = 100 + + output_processor = BlockDecodeOutputProcessor( + detokenizer=detokenizer, + scheduler=scheduler, + seq_counter=seq_counter, + get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), + stop_checker=stop_checker, + ) + + seq_group = create_seq_group( + seq_prompt_len=seq_prompt_len, + seq_output_lens=[seq_output_len], + sampling_params=SamplingParams( + # Ensure enough space. 
+ max_tokens=seq_output_len + num_new_tokens, + ignore_eos=True, + ), + ) + + seq = seq_group.get_seqs()[0] + seq.status = SequenceStatus.RUNNING + + new_token_ids = list(range(num_new_tokens)) + assert eos_token_id not in new_token_ids + eos_index = random.randint(0, len(new_token_ids) - 1) + new_token_ids[eos_index] = eos_token_id + + outputs = [SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids] + + assert seq.get_len() == seq_prompt_len + seq_output_len + output_processor.process_outputs(seq_group, outputs) + + # Expect the processed sequence to go beyond eos. + assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens + + # Expect the correct tokens were appended. + expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - seq_output_len] + assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + +def mock_tokenizer(eos_token_id=1000): + tokenizer = MagicMock(spec=PreTrainedTokenizer) + tokenizer.eos_token_id = eos_token_id + return tokenizer diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 829c5ecd7839..827142bd4bf5 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -38,14 +38,12 @@ def __init__( detokenizer, scheduler, seq_counter, - get_tokenizer_for_seq, stop_checker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 3fb2b7ee3235..06d3ee9306ef 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -34,21 +34,19 @@ class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): def __init__( self, - scheduler_config: SchedulerConfig, detokenizer, scheduler, seq_counter, get_tokenizer_for_seq, stop_checker, ): - self.scheduler_config = scheduler_config self.detokenizer = detokenizer self.scheduler = scheduler self.seq_counter = seq_counter self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" @@ -78,18 +76,17 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput valid_samples = valid_samples[:remaining_tokens] output_token_ids = output_token_ids[:remaining_tokens] - ## Truncate any tokens after EOS. This is required as spec decode - ## generates tokens in fixed blocks, which may go beyond the EOS token. - #if not sampling_params.ignore_eos: - # eos_token_id = self.tokenizer.get_lora_tokenizer( - # seq.lora_request).eos_token_id - # # Avoiding .index calls as exception throwing in the happy path - # # is expensive. 
- # for i in range(len(output_token_ids)): - # if output_token_ids[i] == eos_token_id: - # output_token_ids = output_token_ids[:i + 1] - # valid_samples = valid_samples[:i + 1] - # break + # Truncate any tokens after EOS. This is required as spec decode + # generates tokens in fixed blocks, which may go beyond the EOS token. + if not sampling_params.ignore_eos: + eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id + # Avoiding .index calls as exception throwing in the happy path + # is expensive. + for i in range(len(output_token_ids)): + if output_token_ids[i] == eos_token_id: + output_token_ids = output_token_ids[:i + 1] + valid_samples = valid_samples[:i + 1] + break for output_token_id in output_token_ids: seq.append_token_id( diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index d2368fc811a0..8a7e27645b4d 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from vllm.config import SchedulerConfig from vllm.sequence import SequenceGroup, SequenceGroupOutput +from typing import List class SequenceGroupOutputProcessor(ABC): @@ -20,13 +21,11 @@ def create_output_processor( detokenizer, scheduler, seq_counter, - get_tokenizer_for_seq, stop_checker, ) else: from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor return BlockDecodeOutputProcessor( - scheduler_config, detokenizer, scheduler, seq_counter, @@ -35,5 +34,5 @@ def create_output_processor( ) @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, outputs: SequenceGroupOutput) -> None: + def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: pass From 3062e1cbeb11d66a8904d05c6ef935784caf44ef Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:34:53 -0700 Subject: [PATCH 081/165] stop token ids --- vllm/engine/output_processor/beam_search.py | 2 +- vllm/engine/output_processor/block_decode.py | 3 +-- vllm/engine/output_processor/stop_checker.py | 20 ++++++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 827142bd4bf5..2b5657d37ccd 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -104,7 +104,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params, [seq.get_last_token_id()]) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 06d3ee9306ef..e218fa99b0e6 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -96,8 +96,7 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput ) # TODO detokenize - # TODO pass output token ids - self.stop_checker.maybe_stop_sequence(seq, sampling_params) + self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py 
b/vllm/engine/output_processor/stop_checker.py index 82973e304202..4d8f3730e9f6 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -38,7 +38,7 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, - sampling_params: SamplingParams) -> None: + sampling_params: SamplingParams, new_token_ids: List[int]) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: @@ -46,8 +46,7 @@ def maybe_stop_sequence(self, seq: Sequence, return # Check if the sequence has reached max_tokens. - if (sampling_params.max_tokens is not None) and (seq.get_output_len() >= sampling_params.max_tokens): - # TODO should cap block + if seq.get_output_len() == sampling_params.max_tokens: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return @@ -63,18 +62,23 @@ def maybe_stop_sequence(self, seq: Sequence, seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = stop_str return - last_token_id = seq.get_last_token_id() - if last_token_id in sampling_params.stop_token_ids: + + # Determine if any stop_token_ids are in new_token_ids. + intersection = set(new_token_ids).intersection(sampling_params.stop_token_ids) + if intersection: + # Get arbitrary token id that caused the stop. + stop_token_id = next(iter(intersection)) + stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) + stop_token_id) self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id + seq.stop_reason = stop_token_id return # Check if the sequence has generated the EOS token. 
if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): + and seq.eos_token_id in new_token_ids): seq.status = SequenceStatus.FINISHED_STOPPED return From fba3b300f66e047750eb3a392e0b2f3aee0e0cd8 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:35:16 -0700 Subject: [PATCH 082/165] format --- tests/core/utils.py | 11 +- .../output_processor/test_block_decode.py | 136 ++++++++++-------- vllm/engine/llm_engine.py | 14 +- vllm/engine/output_processor/beam_search.py | 12 +- vllm/engine/output_processor/block_decode.py | 20 ++- vllm/engine/output_processor/interfaces.py | 6 +- vllm/engine/output_processor/stop_checker.py | 7 +- vllm/engine/output_processor/util.py | 10 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/spec_decode/multi_step_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/worker/worker_base.py | 10 +- 12 files changed, 134 insertions(+), 101 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index d9d2eeaee1b9..39f8e507d0f1 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -31,12 +31,11 @@ def create_dummy_prompt( def create_seq_group( - seq_prompt_len: int=1024, - seq_output_lens: Iterable[int]=(128, ), - request_id: str='0', - seq_id_start: int=0, - sampling_params: Optional[SamplingParams] = None -) -> SequenceGroup: + seq_prompt_len: int = 1024, + seq_output_lens: Iterable[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: assert len(seq_output_lens) > 0 diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index aae184c16447..f426f1d32d7a 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -13,6 +13,7 @@ from vllm.sampling_params import SamplingParams from tests.core.utils import create_seq_group + @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup @@ -33,37 +34,40 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): seq_group = create_seq_group( seq_prompt_len=1024, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - max_tokens=seq_output_len + num_new_tokens, - ), + sampling_params=SamplingParams(max_tokens=seq_output_len + + num_new_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING new_token_ids = list(range(num_new_tokens)) - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids output_processor.process_outputs(seq_group, outputs) assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) @pytest.mark.parametrize("max_tokens", [128 + 3]) @pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, 
max_tokens: int): +def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, max_tokens: int): detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -80,26 +84,26 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_outpu seq_group = create_seq_group( seq_prompt_len=seq_prompt_len, seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - max_tokens=max_tokens, - ), + sampling_params=SamplingParams(max_tokens=max_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING new_token_ids = list(range(num_new_tokens)) - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -109,14 +113,17 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_outpu # Expect the correct tokens were appended. expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): +def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, seed: int): random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -138,10 +145,9 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out seq_output_lens=[seq_output_len], sampling_params=SamplingParams( # Ensure enough space. 
- max_tokens=seq_output_len + num_new_tokens, - ), + max_tokens=seq_output_len + num_new_tokens, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING @@ -150,16 +156,18 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out eos_index = random.randint(0, len(new_token_ids) - 1) new_token_ids[eos_index] = eos_token_id - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -168,15 +176,18 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_out assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:eos_index+1] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[:eos_index + 1] + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + @pytest.mark.parametrize("seq_prompt_len", [1024]) @pytest.mark.parametrize("seq_output_len", [128]) @pytest.mark.parametrize("num_new_tokens", [12]) @pytest.mark.parametrize("seed", list(range(6))) @pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): +def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, + seq_output_len: int, seed: int): random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -202,7 +213,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp ignore_eos=True, ), ) - + seq = seq_group.get_seqs()[0] seq.status = SequenceStatus.RUNNING @@ -211,16 +222,18 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp eos_index = random.randint(0, len(new_token_ids) - 1) new_token_ids[eos_index] = eos_token_id - outputs = [SequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids] + outputs = [ + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq.seq_id, + output_token=output_token, + logprobs={output_token: Logprob(0.0)}, + ) + ], + prompt_logprobs=None, + ) for output_token in new_token_ids + ] assert seq.get_len() == seq_prompt_len + seq_output_len output_processor.process_outputs(seq_group, outputs) @@ -229,8 +242,11 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_outp assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens # Expect the correct tokens were appended. 
- expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - seq_output_len] - assert seq.get_token_ids()[-len(expected_appended_tokens):] == expected_appended_tokens + expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - + seq_output_len] + assert seq.get_token_ids( + )[-len(expected_appended_tokens):] == expected_appended_tokens + def mock_tokenizer(eos_token_id=1000): tokenizer = MagicMock(spec=PreTrainedTokenizer) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index bce36ddccc81..9936eb18c032 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -422,14 +422,14 @@ def has_unfinished_requests(self) -> bool: return self.scheduler.has_unfinished_seqs() def _process_model_outputs( - self, - output: List[SamplerOutput], + self, output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: now = time.time() - output_by_sequence_group = create_output_by_sequence_group(sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) + output_by_sequence_group = create_output_by_sequence_group( + sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, @@ -437,7 +437,7 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - + self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. @@ -455,7 +455,6 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs - def step(self) -> List[RequestOutput]: """Performs one decoding iteration and returns newly generated results. @@ -519,7 +518,9 @@ def step(self) -> List[RequestOutput]: else: output = [] - request_outputs = self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + request_outputs = self._process_model_outputs( + output, scheduler_outputs.scheduled_seq_groups, + scheduler_outputs.ignored_seq_groups) # Log stats. 
if self.log_stats: @@ -605,7 +606,6 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) - def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 2b5657d37ccd..94af809e2673 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -31,7 +31,7 @@ class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): - + def __init__( self, scheduler_config: SchedulerConfig, @@ -46,8 +46,10 @@ def __init__( self.seq_counter = seq_counter self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: - assert (len(outputs) == 1), f"{type(self)} does not support multiple outputs per step" + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: + assert (len(outputs) == 1 + ), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -104,7 +106,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if seq_group.sampling_params.detokenize: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) - self.stop_checker.maybe_stop_sequence(seq, seq_group.sampling_params, [seq.get_last_token_id()]) + self.stop_checker.maybe_stop_sequence(seq, + seq_group.sampling_params, + [seq.get_last_token_id()]) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index e218fa99b0e6..3b6a60e857fa 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -31,7 +31,7 @@ class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): - + def __init__( self, detokenizer, @@ -46,7 +46,8 @@ def __init__( self.get_tokenizer_for_seq = get_tokenizer_for_seq self.stop_checker = stop_checker - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" @@ -64,14 +65,17 @@ def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceG ] assert valid_samples - self._process_seq_outputs(seq, valid_samples, sequence_group.sampling_params) + self._process_seq_outputs(seq, valid_samples, + sequence_group.sampling_params) - def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput], sampling_params: SamplingParams) -> None: + def _process_seq_outputs(self, seq: Sequence, + valid_samples: List[SequenceOutput], + sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] # Truncate to max_tokens if necessary. 
- remaining_tokens = sampling_params.max_tokens - ( - seq.get_output_len() + len(output_token_ids)) + remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + + len(output_token_ids)) if remaining_tokens < 0: valid_samples = valid_samples[:remaining_tokens] output_token_ids = output_token_ids[:remaining_tokens] @@ -96,7 +100,9 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples: List[SequenceOutput ) # TODO detokenize - self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) + self.stop_checker.maybe_stop_sequence(seq, + sampling_params, + new_token_ids=output_token_ids) if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 8a7e27645b4d..2b931a0b2f41 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -3,8 +3,9 @@ from vllm.sequence import SequenceGroup, SequenceGroupOutput from typing import List + class SequenceGroupOutputProcessor(ABC): - + @staticmethod def create_output_processor( scheduler_config: SchedulerConfig, @@ -34,5 +35,6 @@ def create_output_processor( ) @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + def process_outputs(self, sequence_group: SequenceGroup, + outputs: List[SequenceGroupOutput]) -> None: pass diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 4d8f3730e9f6..3f03373f2698 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -30,6 +30,7 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 + class StopChecker: def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): @@ -38,7 +39,8 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, - sampling_params: SamplingParams, new_token_ids: List[int]) -> None: + sampling_params: SamplingParams, + new_token_ids: List[int]) -> None: """Stop the finished sequences.""" # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: @@ -64,7 +66,8 @@ def maybe_stop_sequence(self, seq: Sequence, return # Determine if any stop_token_ids are in new_token_ids. - intersection = set(new_token_ids).intersection(sampling_params.stop_token_ids) + intersection = set(new_token_ids).intersection( + sampling_params.stop_token_ids) if intersection: # Get arbitrary token id that caused the stop. 
stop_token_id = next(iter(intersection)) diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 1fcd651deef1..b49bbb2fab32 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,12 +1,12 @@ from vllm.sequence import SequenceGroupOutput, SamplerOutput from typing import List -def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): - output_by_sequence_group = [ - [] for _ in range(num_seq_groups) - ] + +def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], + num_seq_groups: int): + output_by_sequence_group = [[] for _ in range(num_seq_groups)] for step in sampler_outputs: for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) + output_by_sequence_group[i].append(sequence_group_output) return output_by_sequence_group diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index bed915faf3fb..be970e56b611 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -79,7 +79,8 @@ def forward( prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) + return _build_sampler_output(sample_results, sampling_metadata, + prompt_logprobs, sample_logprobs) def _get_bin_counts_and_mask( diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4cdbe0923455..85060ccf2b15 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -70,7 +70,8 @@ def execute_model_multi_step( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert (len(model_output) == 1), "composing multistep workers not supported" + assert (len(model_output) == 1 + ), "composing multistep workers not supported" model_output = model_output[0] self._append_new_tokens(model_output, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 894377c9421e..b9824937a944 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,7 +196,8 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert len(sampler_output) == 1, "expected single output from scorer worker" + assert len( + sampler_output) == 1, "expected single output from scorer worker" sampler_output = sampler_output[0] # Clear device tensors from sampler output. 
This reduces communication diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1481a4c2eef4..d5d3ffda1f43 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -40,11 +40,11 @@ def initialize_cache(self, num_gpu_blocks: int, raise NotImplementedError @abstractmethod - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: + def execute_model( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, + int], + blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]: """Executes at least one model step on the given sequences, unless no sequences are provided.""" raise NotImplementedError From bda141fe4dca51b53edf0bafb97882155b2b6839 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 18:56:05 -0700 Subject: [PATCH 083/165] fixing spec tests --- tests/spec_decode/test_multi_step_worker.py | 5 +++-- tests/spec_decode/test_spec_decode_worker.py | 16 +++++++++++----- tests/spec_decode/utils.py | 4 ++-- vllm/engine/async_llm_engine.py | 2 +- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/multi_step_worker.py | 5 +++-- vllm/spec_decode/spec_decode_worker.py | 3 +-- vllm/spec_decode/util.py | 17 ++++++++++------- 8 files changed, 33 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index f4d44108b47c..f9840d6157c3 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -125,7 +125,7 @@ def test_same_output_for_single_step(): zero_kv_cache(worker.cache_engine) set_random_seed(seed) expected_output = worker.execute_model( - **single_step_execute_model_data.to_dict(), ) + **single_step_execute_model_data.to_dict(), )[0] actual_token_ids = [ output.samples[0].output_token for output in actual_output @@ -219,7 +219,7 @@ def test_same_output_for_multi_step(): continuations=continuations, final_seq_lens=final_seq_lens)) - single_step_output.append( + single_step_output.extend( worker.execute_model(**execute_model_data.to_dict(), )) # Append output tokens to new sequence data. @@ -352,6 +352,7 @@ def test_draft_proposals_no_speculations(): @torch.inference_mode() +#@pytest.skip("Broken because output is padded.") def test_draft_proposals_mixed_k(): """Verify DraftModelTop1Proposer correctly handles case some sequences can speculate and some can't. 
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 3725924ea89c..889712fb9360 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -12,6 +12,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) +from vllm.sequence import SamplerOutput from .utils import (ExecuteModelData, create_batch, create_sampler_output_list, mock_worker) @@ -191,7 +192,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] exception_secret = 'artifical stop' rejection_sampler.side_effect = ValueError(exception_secret) @@ -271,7 +272,7 @@ def test_correctly_formats_output(k: int, batch_size: int): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] rejection_sampler_output = torch.randint(low=0, high=vocab_size, @@ -340,6 +341,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -383,7 +385,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): target_output = create_sampler_output_list(target_token_ids, target_token_probs) - target_worker.execute_model.return_value = target_output[0] + target_worker.execute_model.return_value = [target_output[0]] rejection_sampler_output = torch.randint(low=0, high=vocab_size, @@ -426,6 +428,8 @@ def test_k_equals_zero(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -446,7 +450,7 @@ def test_k_equals_zero(k: int, batch_size: int): 0].sampled_tokens is None, "expect gpu tensor references to be None" draft_worker.execute_model.assert_called_once_with( - **execute_model_data.to_dict(), return_python_output=False) + **execute_model_data.to_dict()) target_worker.execute_model.assert_called_once_with( **execute_model_data.to_dict()) @@ -465,6 +469,8 @@ def test_empty_input_batch(k: int, batch_size: int): rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) + target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] + draft_worker.device = 'cuda' target_worker.device = 'cuda' @@ -485,7 +491,7 @@ def test_empty_input_batch(k: int, batch_size: int): 0].sampled_tokens is None, "expect gpu tensor references to be None" draft_worker.execute_model.assert_called_once_with( - **execute_model_data.to_dict(), return_python_output=False) + **execute_model_data.to_dict()) target_worker.execute_model.assert_called_once_with( **execute_model_data.to_dict()) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 4637826f254d..3914af945eff 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -10,7 +10,7 @@ 
from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceGroupOutput, - SequenceOutput) + SequenceOutput, Logprob) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker @@ -211,7 +211,7 @@ def create_sampler_output_list( SequenceOutput( output_token=token_id, parent_seq_id=seq_ids[seq_index], - logprobs={token_id: 0}, + logprobs={token_id: Logprob(0)}, ) ], prompt_logprobs=None, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index f61049513512..378484510247 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -217,7 +217,7 @@ async def step_async(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs) + return self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) async def encode_request_async( self, diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index f7bac45861a7..1011dd970ebc 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -9,7 +9,7 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, sampler_output_to_torch, split_batch_by_proposal_len, - mock_device_tensors) + maybe_mock_device_tensors) from vllm.worker.worker import Worker SeqId = int @@ -147,7 +147,7 @@ def _contract_batch(self, original_bs: int, sequences. """ - mock_device_tensors( + maybe_mock_device_tensors( sampler_output=target_sampler_output, batch_size=len(non_spec_indices) + num_scoring_tokens, vocab_size=self._vocab_size, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 85060ccf2b15..4182b8758465 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -7,7 +7,7 @@ from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.util import (sampler_output_to_torch, - mock_device_tensors) + maybe_mock_device_tensors) from vllm.worker.worker import Worker @@ -346,7 +346,7 @@ def _merge_outputs( sampler_output = maybe_sampler_output for step_output in sampler_output: - mock_device_tensors( + maybe_mock_device_tensors( sampler_output=step_output, batch_size=len(proposal_lens), vocab_size=self._vocab_size, @@ -364,6 +364,7 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b9824937a944..c221f0421f53 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -196,8 +196,7 @@ def _run_no_spec( blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, ) - assert len( - sampler_output) == 1, "expected single output from scorer worker" + assert len(sampler_output) == 1 sampler_output = sampler_output[0] # Clear device tensors from sampler output. 
This reduces communication diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 7129f47d65f6..c47d5b878153 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -82,19 +82,22 @@ def sampler_output_to_torch( return sampled_token_ids, sampled_token_probs -def mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, +def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: - assert sampler_output.sampled_token_probs is None - assert sampler_output.sampled_token_ids is None + values = [sampler_output.sampled_token_probs, sampler_output.sampled_token_ids] + assert all(v is None for v in values) or not any(v is None for v in values) + if not any(v is None for v in values): + return sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) + sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, ), - dtype=torch.long, - device=device) + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) @contextmanager From 49865fba9be8aeb19735b3b08ec9a830bf9caee7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:05:55 -0700 Subject: [PATCH 084/165] lint --- vllm/engine/async_llm_engine.py | 4 +++- vllm/spec_decode/multi_step_worker.py | 2 +- vllm/spec_decode/util.py | 16 +++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 378484510247..4bab116dcb14 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -217,7 +217,9 @@ async def step_async(self) -> List[RequestOutput]: else: output = [] - return self._process_model_outputs(output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups) + return self._process_model_outputs( + output, scheduler_outputs.scheduled_seq_groups, + scheduler_outputs.ignored_seq_groups) async def encode_request_async( self, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4182b8758465..c79d79930a18 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -364,7 +364,7 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) - + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index c47d5b878153..efc54c4de4cf 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -83,8 +83,10 @@ def sampler_output_to_torch( def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, - vocab_size: int, device: str) -> None: - values = [sampler_output.sampled_token_probs, sampler_output.sampled_token_ids] + vocab_size: int, device: str) -> None: + values = [ + sampler_output.sampled_token_probs, sampler_output.sampled_token_ids + ] assert all(v is None for v in values) or not any(v is None for v in values) if not any(v is None for v in values): return @@ -92,12 +94,12 @@ def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - + sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, - 
size=(batch_size, ), - dtype=torch.long, - device=device) + high=vocab_size, + size=(batch_size, ), + dtype=torch.long, + device=device) @contextmanager From 1a17ed14a57c13def30b6d7e99236ffa92cdfb61 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:15:03 -0700 Subject: [PATCH 085/165] clean up gpu executor --- vllm/executor/gpu_executor.py | 70 +++++++++++--------------- vllm/spec_decode/spec_decode_worker.py | 9 ++++ 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 90a534dc1271..18be6da10ce9 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -47,18 +47,37 @@ def _init_worker(self): else: self._init_spec_worker() + def _init_non_spec_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.worker import Worker + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + def _init_spec_worker(self): from vllm.worker.worker import Worker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker - #from vllm.worker.multi_step_worker import MultiStepWorker # pylint: disable=import-outside-toplevel - #from vllm.worker.single_tp_worker import SingleTpWorker # pylint: disable=import-outside-toplevel - #from vllm.worker.draft_target_worker import DraftTargetWorker # pylint: disable=import-outside-toplevel - - #scheduler_config: "SchedulerConfig" = worker_kwargs.pop( - # "scheduler_config") - distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -76,7 +95,6 @@ def _init_spec_worker(self): is_driver_worker=True, ) - from vllm.spec_decode.multi_step_worker import MultiStepWorker draft_worker = MultiStepWorker( model_config=self.speculative_config.draft_model_config, parallel_config=self.speculative_config.draft_parallel_config, @@ -91,47 +109,15 @@ def _init_spec_worker(self): is_driver_worker=True, ) - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker - from vllm.model_executor.layers.rejection_sampler import RejectionSampler - spec_decode_worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - rejection_sampler=RejectionSampler(strict_mode=True), - ) + spec_decode_worker = SpecDecodeWorker.from_workers(proposer_worker=draft_worker, scorer_worker=target_worker) assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") self.driver_worker = spec_decode_worker + # Load model handled in spec decode worker. 
self.driver_worker.init_device() - #self.driver_worker.load_model() - - def _init_non_spec_worker(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker - - assert self.parallel_config.world_size == 1, ( - "GPUExecutor only supports single GPU.") - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - is_driver_worker=True, - ) - self.driver_worker.init_device() - self.driver_worker.load_model() def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available KV blocks by invoking the diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index c221f0421f53..91bc530084e7 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -48,6 +48,15 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. """ + @classmethod + def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBase) -> "SpecDecodeWorker": + return SpecDecodeWorker( + proposer_worker, + scorer_worker, + # TODO(cade) disable strict mode for speedup. + rejection_sampler=RejectionSampler(strict_mode=True), + ) + def __init__( self, proposer_worker: MultiStepWorker, From dea67bbd6fb1f0278ee4c605d8be77991c8657ae Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:16:16 -0700 Subject: [PATCH 086/165] wip --- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/spec_decode_worker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 1011dd970ebc..4dc34f1ab7c7 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -10,7 +10,7 @@ sampler_output_to_torch, split_batch_by_proposal_len, maybe_mock_device_tensors) -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerBase SeqId = int TargetSeqId = int @@ -32,7 +32,7 @@ class BatchExpansionTop1Scorer(SpeculativeScorer): of topk/tree. 
""" - def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, device: str, vocab_size: int): self._scorer_worker = scorer_worker self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 91bc530084e7..e5b493c46c6c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -14,7 +14,7 @@ from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase from vllm.logger import init_logger logger = init_logger(__name__) @@ -60,7 +60,7 @@ def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBas def __init__( self, proposer_worker: MultiStepWorker, - scorer_worker: Worker, + scorer_worker: WorkerBase, rejection_sampler: RejectionSampler, metrics_collector: Optional[AsyncMetricsCollector] = None, ): From 189d7ebab4a783cb651fb339b2fba88fd8b1f019 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:17:59 -0700 Subject: [PATCH 087/165] fix --- tests/spec_decode/e2e/test_correctness.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index eb6d1e1c5ddd..6b01936e8178 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -62,8 +62,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) - # TODO(cadedaniel) check for equality once block truncation is implemented. - assert all(len(token_ids) >= output_len for token_ids in batch_token_ids) + assert all(len(token_ids) == output_len for token_ids in batch_token_ids) @pytest.mark.parametrize( From a70a0408b12631ca00a78e7cbbcf1db7ef211f33 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:18:47 -0700 Subject: [PATCH 088/165] wip --- vllm/executor/gpu_executor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 18be6da10ce9..22cd2797282e 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -35,9 +35,6 @@ def __init__( self.vision_language_config = vision_language_config self.speculative_config = speculative_config - #assert (not speculative_config - # ), "Speculative decoding not yet supported for GPU backend" - # Instantiate the worker and load the model to GPU. 
self._init_worker() From 3e1b8f5c17e8ac0a96a1ddc05300b4eeb1996e66 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:42:50 -0700 Subject: [PATCH 089/165] detokenization --- tests/spec_decode/e2e/test_correctness.py | 20 ++++++++++++++++---- vllm/engine/output_processor/block_decode.py | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 6b01936e8178..d2f07f729f5a 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,13 +1,16 @@ import pytest from itertools import cycle +from typing import Tuple, List from vllm import SamplingParams +from transformers import AutoTokenizer @pytest.mark.parametrize( "common_llm_kwargs", [{ # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", # Skip real loading for fast test. @@ -55,15 +58,23 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + batch_tokens, batch_token_ids = get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) + # Expect each generation to have expected number of tokens (note + # ignore_eos=True). assert all(len(token_ids) == output_len for token_ids in batch_token_ids) + # Expect detokenized string to match. + tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") + for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): + expected_tokens = tok.decode(actual_token_ids) + assert actual_tokens == expected_tokens + @pytest.mark.parametrize( "common_llm_kwargs", @@ -109,14 +120,15 @@ def test_spec_decode_xfail(test_llm_generator): with pytest.raises(AssertionError, match="Speculative decoding not yet supported for "): - get_token_ids_from_llm_generator(test_llm_generator, prompts, + get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) -def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): +def get_output_from_llm_generator(llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] + tokens = [output.outputs[0].text for output in outputs] del llm - return token_ids + return tokens, token_ids diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 3b6a60e857fa..99963111e219 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -98,8 +98,8 @@ def _process_seq_outputs(self, seq: Sequence, # TODO emit logprobs in block decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) + self.detokenizer.decode_sequence_inplace(seq, sampling_params) - # TODO detokenize self.stop_checker.maybe_stop_sequence(seq, sampling_params, new_token_ids=output_token_ids) From b9777a6ea80e4d0340e406dfe0748a32d5d34138 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 19:48:20 -0700 Subject: [PATCH 090/165] lint --- tests/core/utils.py | 2 +- .../output_processor/test_block_decode.py | 16 +++++---- tests/spec_decode/e2e/test_correctness.py | 18 +++++----- tests/spec_decode/test_spec_decode_worker.py | 2 +- tests/spec_decode/utils.py | 2 +- vllm/engine/llm_engine.py | 33 ++++++++++--------- vllm/engine/output_processor/beam_search.py | 31 ++++------------- vllm/engine/output_processor/block_decode.py | 32 ++++-------------- vllm/engine/output_processor/interfaces.py | 11 +++++-- vllm/engine/output_processor/stop_checker.py | 27 ++------------- vllm/engine/output_processor/util.py | 3 +- vllm/executor/gpu_executor.py | 7 ++-- vllm/spec_decode/batch_expansion.py | 10 +++--- vllm/spec_decode/multi_step_worker.py | 4 +-- vllm/spec_decode/spec_decode_worker.py | 12 +++---- vllm/worker/neuron_worker.py | 2 +- 16 files changed, 81 insertions(+), 131 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 39f8e507d0f1..22c1d3826dff 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,5 @@ import time -from typing import Optional, Tuple, Iterable +from typing import Iterable, Optional, Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index f426f1d32d7a..87f451da7c29 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -1,17 +1,19 @@ -import pytest -from unittest.mock import MagicMock import random +from unittest.mock import MagicMock +import pytest from transformers import PreTrainedTokenizer -from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor +from tests.core.utils import create_seq_group +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.block_decode import ( + BlockDecodeOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.sampling_params import SamplingParams +from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, + SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.core.scheduler import Scheduler from vllm.utils import Counter -from vllm.sequence import SequenceStatus, SequenceGroupOutput, SequenceOutput, Logprob -from vllm.sampling_params import SamplingParams -from tests.core.utils import create_seq_group @pytest.mark.parametrize("seq_output_len", [128]) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d2f07f729f5a..fe543dfda552 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,10 +1,11 @@ -import pytest from itertools import cycle -from typing import Tuple, List +from typing import List, Tuple -from vllm import SamplingParams +import pytest from transformers import AutoTokenizer +from vllm import SamplingParams + @pytest.mark.parametrize( "common_llm_kwargs", @@ -58,9 +59,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): temperature=temperature, ) - batch_tokens, batch_token_ids = 
get_output_from_llm_generator(test_llm_generator, - prompts, - sampling_params) + batch_tokens, batch_token_ids = get_output_from_llm_generator( + test_llm_generator, prompts, sampling_params) # Expect a generation for each prompt in the batch. assert len(batch_token_ids) == len(prompts) @@ -121,10 +121,12 @@ def test_spec_decode_xfail(test_llm_generator): with pytest.raises(AssertionError, match="Speculative decoding not yet supported for "): get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) + sampling_params) -def get_output_from_llm_generator(llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: +def get_output_from_llm_generator( + llm_generator, prompts, + sampling_params) -> Tuple[List[str], List[List[int]]]: for llm in llm_generator: outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 889712fb9360..4470cee78eed 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -6,13 +6,13 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from vllm.sequence import SamplerOutput from .utils import (ExecuteModelData, create_batch, create_sampler_output_list, mock_worker) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 3914af945eff..c428c4258c14 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -10,7 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceGroupOutput, - SequenceOutput, Logprob) + SequenceOutput) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9936eb18c032..8c3786354f40 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,5 +1,5 @@ import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import Iterable, List, Optional, Type, Union from transformers import PreTrainedTokenizer @@ -10,6 +10,10 @@ from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics import StatLogger, Stats +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.engine.ray_utils import initialize_ray_cluster from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger @@ -17,17 +21,13 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) + SequenceGroup) from 
vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -183,18 +183,19 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) - self.output_processor = SequenceGroupOutputProcessor.create_output_processor( - self.scheduler_config, - self.detokenizer, - self.scheduler, - self.seq_counter, - self.get_tokenizer_for_seq, - stop_checker=StopChecker( - self.scheduler, + self.output_processor = ( + SequenceGroupOutputProcessor.create_output_processor( self.scheduler_config, + self.detokenizer, + self.scheduler, + self.seq_counter, self.get_tokenizer_for_seq, - ), - ) + stop_checker=StopChecker( + self.scheduler, + self.scheduler_config, + self.get_tokenizer_for_seq, + ), + )) def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 94af809e2673..885a241f7b2d 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,31 +1,12 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import List, Tuple, Union -from transformers import PreTrainedTokenizer - -import vllm -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase +from vllm.config import SchedulerConfig +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, + SequenceOutput, SequenceStatus) logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 99963111e219..f63ce7d0ef41 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,31 +1,11 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union - -from transformers import PreTrainedTokenizer - -import vllm -from 
vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase +from typing import List + +from vllm.engine.output_processor.interfaces import ( + SequenceGroupOutputProcessor) from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus, Logprob) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import (Logprob, Sequence, SequenceGroup, + SequenceGroupOutput, SequenceOutput, SequenceStatus) logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 2b931a0b2f41..5596bc3f3d67 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from typing import List + from vllm.config import SchedulerConfig from vllm.sequence import SequenceGroup, SequenceGroupOutput -from typing import List class SequenceGroupOutputProcessor(ABC): @@ -16,7 +17,9 @@ def create_output_processor( stop_checker, ): if scheduler_config.num_lookahead_slots == 0: - from vllm.engine.output_processor.beam_search import BeamSearchOutputProcessor + # Importing here to avoid cycle. + from vllm.engine.output_processor.beam_search import ( + BeamSearchOutputProcessor) return BeamSearchOutputProcessor( scheduler_config, detokenizer, @@ -25,7 +28,9 @@ def create_output_processor( stop_checker, ) else: - from vllm.engine.output_processor.block_decode import BlockDecodeOutputProcessor + # Importing here to avoid cycle. 
+ from vllm.engine.output_processor.block_decode import ( + BlockDecodeOutputProcessor) return BlockDecodeOutputProcessor( detokenizer, scheduler, diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 3f03373f2698..b55e47ab3c12 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,31 +1,8 @@ -import time -from typing import Iterable, List, Optional, Tuple, Type, Union +from typing import List -from transformers import PreTrainedTokenizer - -import vllm -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, SpeculativeConfig, - VisionLanguageConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import initialize_ray_cluster -from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, - get_tokenizer_group) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter -from vllm.engine.output_processor.interfaces import SequenceGroupOutputProcessor +from vllm.sequence import Sequence, SequenceStatus logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index b49bbb2fab32..e4939b9be445 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,6 +1,7 @@ -from vllm.sequence import SequenceGroupOutput, SamplerOutput from typing import List +from vllm.sequence import SamplerOutput + def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 22cd2797282e..b5e64843213a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -71,9 +71,9 @@ def _init_non_spec_worker(self): self.driver_worker.load_model() def _init_spec_worker(self): - from vllm.worker.worker import Worker - from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.multi_step_worker import MultiStepWorker + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + from vllm.worker.worker import Worker distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) @@ -106,7 +106,8 @@ def _init_spec_worker(self): is_driver_worker=True, ) - spec_decode_worker = SpecDecodeWorker.from_workers(proposer_worker=draft_worker, scorer_worker=target_worker) + spec_decode_worker = SpecDecodeWorker.from_workers( + proposer_worker=draft_worker, scorer_worker=target_worker) assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 4dc34f1ab7c7..6945877fbf34 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -6,10 +6,9 @@ from vllm.sequence import 
SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, - sampler_output_to_torch, - split_batch_by_proposal_len, - maybe_mock_device_tensors) +from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors, + nvtx_range, sampler_output_to_torch, + split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase SeqId = int @@ -32,7 +31,8 @@ class BatchExpansionTop1Scorer(SpeculativeScorer): of topk/tree. """ - def __init__(self, scorer_worker: WorkerBase, device: str, vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, device: str, + vocab_size: int): self._scorer_worker = scorer_worker self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index c79d79930a18..6fdc3b294295 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,8 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (sampler_output_to_torch, - maybe_mock_device_tensors) +from vllm.spec_decode.util import (maybe_mock_device_tensors, + sampler_output_to_torch) from vllm.worker.worker import Worker diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index e5b493c46c6c..84aa562eba50 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -3,9 +3,10 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, - SequenceGroupOutput, SequenceOutput, Logprob) +from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -13,9 +14,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, split_batch_by_proposal_len) -from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase -from vllm.logger import init_logger logger = init_logger(__name__) @@ -49,7 +48,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): """ @classmethod - def from_workers(cls, proposer_worker: MultiStepWorker, scorer_worker: WorkerBase) -> "SpecDecodeWorker": + def from_workers(cls, proposer_worker: MultiStepWorker, + scorer_worker: WorkerBase) -> "SpecDecodeWorker": return SpecDecodeWorker( proposer_worker, scorer_worker, @@ -238,7 +238,7 @@ def _run_speculative_decoding_step( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - logger.info(f"score proposals") + logger.info("score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index d0f01b893bc6..7472a795fb51 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,5 +1,5 @@ """A Neuron worker class.""" -from typing import List, Optional +from typing import List import torch import torch.distributed 
From 29b4f12dc07a1c4d5238d9e5cc6fe9211d57b4d9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:21:51 -0700 Subject: [PATCH 091/165] docstrings --- .../output_processor/test_block_decode.py | 17 ++++++++- tests/spec_decode/e2e/test_correctness.py | 7 +++- tests/spec_decode/test_multi_step_worker.py | 1 - tests/spec_decode/test_spec_decode_worker.py | 1 - vllm/core/scheduler.py | 6 --- vllm/engine/llm_engine.py | 9 ++++- vllm/engine/output_processor/beam_search.py | 28 +++++++++++--- vllm/engine/output_processor/block_decode.py | 36 +++++++++++++++--- vllm/engine/output_processor/interfaces.py | 37 +++++++++++++++---- vllm/engine/output_processor/stop_checker.py | 14 ++++--- vllm/engine/output_processor/util.py | 3 ++ vllm/executor/gpu_executor.py | 2 + vllm/model_executor/layers/sampler.py | 4 -- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/multi_step_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/spec_decode/util.py | 6 +++ 17 files changed, 137 insertions(+), 43 deletions(-) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_block_decode.py index 87f451da7c29..c4a88d67cabc 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_block_decode.py @@ -20,6 +20,11 @@ @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): + """Verify block decoding appends token ids correctly. + + We append token ids and verify all the token ids were appended correctly. + Note that ignore_eos=True. + """ detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -37,7 +42,8 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): seq_prompt_len=1024, seq_output_lens=[seq_output_len], sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, ), + num_new_tokens, + ignore_eos=True), ) seq = seq_group.get_seqs()[0] @@ -70,6 +76,9 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): @pytest.mark.skip_global_cleanup def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, max_tokens: int): + """Verify tokens after max_tokens are dropped and not appended to the + sequence. + """ detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) stop_checker = MagicMock(spec=StopChecker) @@ -126,6 +135,9 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.skip_global_cleanup def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + """Verify the eos token id is included in the sequence, but subsequent + tokens are dropped (not appended to sequence). + """ random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) @@ -190,6 +202,9 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, @pytest.mark.skip_global_cleanup def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, seq_output_len: int, seed: int): + """When sampling parameters dictate that we should ignore the eos token id, + ensure all token ids are appended even if the eos token id is emitted. 
+ """ random.seed(seed) detokenizer = MagicMock(spec=Detokenizer) scheduler = MagicMock(spec=Scheduler) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fe543dfda552..160510e6c0c0 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -38,8 +38,9 @@ @pytest.mark.parametrize("batch_size", [1, 10]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): - """Run generation with speculative decoding on a batch. Verify the number - of output tokens is equal to the expected number. + """Run generation with speculative decoding on a batch. Verify the engine + generates the correct number of tokens (via ignore_eos=True), and that the + detokenization matches HF transformers. """ output_len = 128 temperature = 0.0 @@ -105,6 +106,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_xfail(test_llm_generator): + """Verify that speculative decoding with Ray fails. + """ output_len = 128 temperature = 0.0 diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index f9840d6157c3..d6edbab579af 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -352,7 +352,6 @@ def test_draft_proposals_no_speculations(): @torch.inference_mode() -#@pytest.skip("Broken because output is padded.") def test_draft_proposals_mixed_k(): """Verify DraftModelTop1Proposer correctly handles case some sequences can speculate and some can't. diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 4470cee78eed..0a3110775e2d 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -341,7 +341,6 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' target_worker.device = 'cuda' diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e176848c0490..db48a1f7f0d2 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -754,9 +754,6 @@ def _schedule_default(self) -> SchedulerOutputs: swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, num_lookahead_slots=running_scheduled.num_lookahead_slots, - #num_lookahead_slots=(prefills.num_lookahead_slots + - # running_scheduled.num_lookahead_slots + - # swapped_in.num_lookahead_slots), ) def _schedule_chunked_prefill(self): @@ -844,9 +841,6 @@ def _schedule_chunked_prefill(self): swapped_in.blocks_to_copy), ignored_seq_groups=prefills.ignored_seq_groups, num_lookahead_slots=running_scheduled.num_lookahead_slots, - #num_lookahead_slots=(prefills.num_lookahead_slots + - # running_scheduled.num_lookahead_slots + - # swapped_in.num_lookahead_slots), ) def _schedule(self) -> SchedulerOutputs: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c3786354f40..e6e75ee59c76 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -183,6 +183,8 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) + # Create sequence output processor, e.g. 
for beam search or + # speculative decoding. self.output_processor = ( SequenceGroupOutputProcessor.create_output_processor( self.scheduler_config, @@ -426,9 +428,15 @@ def _process_model_outputs( self, output: List[SamplerOutput], scheduled_seq_groups: List[SequenceGroup], ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]: + """Apply the model output to the sequences in the scheduled seq groups. + + Returns RequestOutputs that can be returned to the client. + """ now = time.time() + # Organize outputs by [sequence group][step] instead of + # [step][sequence group]. output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) @@ -438,7 +446,6 @@ def _process_model_outputs( seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 885a241f7b2d..330eeced21cf 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, Union, Iterable from vllm.config import SchedulerConfig from vllm.engine.output_processor.interfaces import ( @@ -7,19 +7,31 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles logic related to beam search + sequence management and coupled logic like detokenization and stop logic. + + This class is in charge of sorting out which sequences survive after beam + sampling. It manages forking and freeing of sequences. + + It does not support lookahead decoding, e.g. where the model generates >1 + token per scheduling invocation. + """ def __init__( self, scheduler_config: SchedulerConfig, - detokenizer, - scheduler, - seq_counter, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + stop_checker: StopChecker, ): self.scheduler_config = scheduler_config self.detokenizer = detokenizer @@ -29,6 +41,12 @@ def __init__( def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Append all new tokens to sequences in the sequence group. Fork any + surviving beam candidates; free any unsurviving ones. + + Invokes detokenizer to detokenize new tokens, and also marks sequences + as finished if they meet stop conditions. 
+ """ assert (len(outputs) == 1 ), f"{type(self)} does not support multiple outputs per step" return self._process_sequence_group_outputs(sequence_group, outputs[0]) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index f63ce7d0ef41..8c9b3e25598f 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,24 +1,39 @@ -from typing import List +from typing import List, Iterable, Callable from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.core.scheduler import Scheduler +from vllm.transformers_utils.detokenizer import Detokenizer +from transformers import PreTrainedTokenizer logger = init_logger(__name__) class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles logic related to + detokenization and stopping conditions. Besides not supporting beam search, + this differs from BeamSearchOutputProcessor in that it supports lookahead + scheduling (where the model may generate >1 token per scheduler invocation). + + This allows it to support speculative decoding and cases where the model + runs more than once. We generalize these cases as "block decoding", where + the model emits a block of tokens at the same time. In this case, this class + is responsible for correctly appending all token ids to sequences and + detokenizing new token ids. + """ def __init__( self, - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer], + stop_checker: StopChecker, ): self.detokenizer = detokenizer self.scheduler = scheduler @@ -28,6 +43,15 @@ def __init__( def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Append new tokens in the outputs to sequences in the sequence group. + + This only supports sequence groups of size 1. It supports greater than + one new token per sequence. + + This applies logic like stop condition checking and detokenization, + including freeing finished sequences. It also handles cases where there + are tokens emitted after the EOS token. 
+ """ seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 5596bc3f3d67..1f940f292406 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,21 +1,40 @@ from abc import ABC, abstractmethod -from typing import List +from typing import List, Callable, Iterable from vllm.config import SchedulerConfig -from vllm.sequence import SequenceGroup, SequenceGroupOutput +from vllm.sequence import SequenceGroup, SequenceGroupOutput, Sequence +from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.core.scheduler import Scheduler +from vllm.engine.output_processor.stop_checker import StopChecker class SequenceGroupOutputProcessor(ABC): + """Interface for logic that processes new token ids in sequence groups, + managing detokenization, stop checking, and freeing/forking sequences with + the scheduler. + + This is highly coupled with the LLMEngine and should be seen as an extension + of it. The logic is separated out to simplify the LLMEngine class and to + allow a beam search implementation (which handles forking, etc) and a block + decode implementation (which handles decoding >1 token per step). + """ @staticmethod def create_output_processor( scheduler_config: SchedulerConfig, - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, + detokenizer: Detokenizer, + scheduler: Scheduler, + seq_counter: Iterable[int], + get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer], + stop_checker: "StopChecker", ): + """Create an output processor. + + This returns an output processor compatible with beam search if the + scheduler is not configured to scheduler lookahead slots. Otherwise, it + returns an output processor that is incompatible with beam search but + which supports decoding more than one token per scheduling invocation. + """ if scheduler_config.num_lookahead_slots == 0: # Importing here to avoid cycle. from vllm.engine.output_processor.beam_search import ( @@ -42,4 +61,8 @@ def create_output_processor( @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Process new token ids for the sequence group. Handles logic such as + detokenization, stop checking, and freeing/forking sequences in the + scheduler. + """ pass diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index b55e47ab3c12..2a6c79d2dc02 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,14 +1,15 @@ from typing import List -from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus -logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5 - class StopChecker: + """LLMEngine helper class which separates out the logic involving stop + checking. This checks things such as: whether the eos token was emitted, + whether the max_tokens has been consumed, whether a stop string has been + emitted, or if we have exceeded the max model len. 
+ """ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): self.scheduler = scheduler @@ -18,7 +19,9 @@ def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): def maybe_stop_sequence(self, seq: Sequence, sampling_params: SamplingParams, new_token_ids: List[int]) -> None: - """Stop the finished sequences.""" + """Check if the sequences should be stopped. If so, mark it as finished. + """ + # Check if the sequence has reached max_model_len. if seq.get_len() > self.scheduler_config.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED @@ -36,6 +39,7 @@ def maybe_stop_sequence(self, seq: Sequence, if sampling_params.detokenize: for stop_str in sampling_params.stop: + # TODO(cade) Fix this for speculative decoding. if seq.output_text.endswith(stop_str): self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index e4939b9be445..5fbb09a857a4 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -5,6 +5,9 @@ def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput], num_seq_groups: int): + """Helper method which transforms a 2d list organized by + [step][sequence group] into [sequence group][step]. + """ output_by_sequence_group = [[] for _ in range(num_seq_groups)] for step in sampler_outputs: for i, sequence_group_output in enumerate(step): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index b5e64843213a..9330d754d5d7 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -71,6 +71,8 @@ def _init_non_spec_worker(self): self.driver_worker.load_model() def _init_spec_worker(self): + """Initialize a SpecDecodeWorker, using a draft model for proposals. + """ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.worker.worker import Worker diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index be970e56b611..cb1480de03e3 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -78,7 +78,6 @@ def forward( # Get the logprobs query results. 
prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs) @@ -669,8 +668,6 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], - sampled_token_ids: Optional[torch.Tensor] = None, - sampled_token_probs: Optional[torch.Tensor] = None, ) -> SamplerOutput: sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, @@ -687,5 +684,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput(outputs=sampler_output) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6945877fbf34..88af1dd36015 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -84,7 +84,6 @@ def score_proposals( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - #return_python_output=False ) assert len(target_sampler_output) == 1, "expected single-step output" target_sampler_output = target_sampler_output[0] @@ -147,6 +146,8 @@ def _contract_batch(self, original_bs: int, sequences. """ + # We mock the device tensors until PR 7/9 is merged (e2e correctness). + # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer maybe_mock_device_tensors( sampler_output=target_sampler_output, batch_size=len(non_spec_indices) + num_scoring_tokens, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 6fdc3b294295..ce63c329a40a 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -345,6 +345,8 @@ def _merge_outputs( sampler_output = maybe_sampler_output + # We mock the device tensors until PR 7/9 is merged (e2e correctness). + # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer for step_output in sampler_output: maybe_mock_device_tensors( sampler_output=step_output, @@ -364,7 +366,6 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) - entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens entire_proposal_probs = torch.zeros(batch_size, *proposal_probs.shape[1:], diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 84aa562eba50..be3af7be9386 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -99,7 +99,7 @@ def init_device(self) -> None: self.scorer_worker.init_device() self.proposer_worker.init_device() - # TODO separate from init_device? + # NOTE(cade): load_model is not part of the WorkerBase interface. 
self.scorer_worker.load_model() self.proposer_worker.load_model() @@ -195,7 +195,6 @@ def _run_no_spec( blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy, - #return_python_output=False ) logger.info("run target worker no spec") diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index efc54c4de4cf..85aee137dcbc 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -84,13 +84,19 @@ def sampler_output_to_torch( def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, vocab_size: int, device: str) -> None: + """Helper method which mocks out the GPU tensors in SamplerOutput with dummy + values. This will be removed in PR 7/9. + https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer + """ values = [ sampler_output.sampled_token_probs, sampler_output.sampled_token_ids ] assert all(v is None for v in values) or not any(v is None for v in values) if not any(v is None for v in values): + # Do nothing if the tensors are already created (usually in unit tests). return + # Softmax to ensure valid probs. sampler_output.sampled_token_probs = torch.nn.functional.softmax( torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) From 42aa0bc45900b49ca5ae7878f90e371a123e0e66 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:30:23 -0700 Subject: [PATCH 092/165] fix --- vllm/engine/output_processor/beam_search.py | 6 +++--- vllm/engine/output_processor/block_decode.py | 7 ++++--- vllm/engine/output_processor/interfaces.py | 8 +++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/beam_search.py index 330eeced21cf..b0c0246b9935 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/beam_search.py @@ -1,15 +1,15 @@ -from typing import List, Tuple, Union, Iterable +from typing import Iterable, List, Tuple, Union from vllm.config import SchedulerConfig +from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) +from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.stop_checker import StopChecker logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/block_decode.py index 8c9b3e25598f..e309b57af6de 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/block_decode.py @@ -1,5 +1,8 @@ -from typing import List, Iterable, Callable +from typing import Callable, Iterable, List +from transformers import PreTrainedTokenizer + +from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker @@ -7,9 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) -from vllm.core.scheduler import Scheduler from vllm.transformers_utils.detokenizer import Detokenizer 
-from transformers import PreTrainedTokenizer logger = init_logger(__name__) diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 1f940f292406..26ec982cc13f 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,11 +1,13 @@ from abc import ABC, abstractmethod -from typing import List, Callable, Iterable +from typing import Callable, Iterable, List + +from transformers import PreTrainedTokenizer from vllm.config import SchedulerConfig -from vllm.sequence import SequenceGroup, SequenceGroupOutput, Sequence -from vllm.transformers_utils.detokenizer import Detokenizer from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput +from vllm.transformers_utils.detokenizer import Detokenizer class SequenceGroupOutputProcessor(ABC): From 0ebd93b98f1c334aca3f4f4f6b651a7301a4f427 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:31:51 -0700 Subject: [PATCH 093/165] more spec test --- tests/spec_decode/e2e/test_correctness.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 160510e6c0c0..c9665ee5bbc2 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,6 +26,10 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + }, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 33a3d7230b1e6f6a699b3863046494ecf5aca365 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:37:05 -0700 Subject: [PATCH 094/165] remove --- tests/spec_decode/e2e/test_correctness.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c9665ee5bbc2..160510e6c0c0 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,10 +26,6 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - }, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 15c942dfc8a49e294d803a1088bd8776bfd69aa2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 20:37:29 -0700 Subject: [PATCH 095/165] wip --- tests/spec_decode/e2e/test_correctness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 160510e6c0c0..ac79f977ce39 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,6 +26,7 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ + # TODO(cade) handle output { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, From 063e34b370e0dcd8080faa3e397f303f0e4d3795 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 21:24:25 -0700 Subject: [PATCH 096/165] strip --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index ac79f977ce39..173f96c4de60 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -75,7 
+75,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) - assert actual_tokens == expected_tokens + assert actual_tokens.strip() == expected_tokens.strip() @pytest.mark.parametrize( From 672a855bb1ca4a074a9158d79eb99253fe3b2540 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 7 Apr 2024 22:57:11 -0700 Subject: [PATCH 097/165] print --- tests/spec_decode/e2e/test_correctness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 173f96c4de60..d76dbc50c872 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -75,6 +75,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) + print(f"{actual_token_ids=}") assert actual_tokens.strip() == expected_tokens.strip() From 8021b38ab38f85e187c6462fa804f8e55a18f8c2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 15:25:49 -0700 Subject: [PATCH 098/165] fix flaky test --- tests/spec_decode/e2e/test_correctness.py | 16 +++++++++++++--- vllm/spec_decode/util.py | 4 ++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d76dbc50c872..1041a5ddac12 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,17 +26,25 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - # TODO(cade) handle output { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + }, { # No spec decode. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1, 10]) +@pytest.mark.parametrize("batch_size", [1]) +# NOTE: We should run more permutations of this test (more BS, more seeds). But +# because our spec decode generates gibberish token ids, the likelihood of +# emitting an invalid token combination is nontrivial. This causes divergence in +# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf- +# start" bytes are emitted. @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): """Run generation with speculative decoding on a batch. 
Verify the engine @@ -59,6 +67,8 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): max_tokens=output_len, ignore_eos=True, temperature=temperature, + skip_special_tokens=True, + spaces_between_special_tokens=False, ) batch_tokens, batch_token_ids = get_output_from_llm_generator( @@ -76,7 +86,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) print(f"{actual_token_ids=}") - assert actual_tokens.strip() == expected_tokens.strip() + assert actual_tokens == expected_tokens @pytest.mark.parametrize( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 85aee137dcbc..eb6d4ca1da8e 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -101,8 +101,8 @@ def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), dim=-1) - sampler_output.sampled_token_ids = torch.randint(low=0, - high=vocab_size, + sampler_output.sampled_token_ids = torch.randint(low=10, + high=100, size=(batch_size, ), dtype=torch.long, device=device) From 8e93fff38628411da884e35290f547f42c6f3d27 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 15:55:21 -0700 Subject: [PATCH 099/165] reduce output len --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 1041a5ddac12..c8b6cf0d7df7 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -51,7 +51,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): generates the correct number of tokens (via ignore_eos=True), and that the detokenization matches HF transformers. """ - output_len = 128 + output_len = 32 temperature = 0.0 prompts = [ From d06e9a482125150d7d94ac8095203e86481c4c55 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 16:44:05 -0700 Subject: [PATCH 100/165] strip --- tests/spec_decode/e2e/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index c8b6cf0d7df7..a8ebd66841eb 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -86,7 +86,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): expected_tokens = tok.decode(actual_token_ids) print(f"{actual_token_ids=}") - assert actual_tokens == expected_tokens + assert actual_tokens.strip() == expected_tokens.strip() @pytest.mark.parametrize( From ca516aa614db261075c24780490e8b3d9767efed Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 8 Apr 2024 21:41:16 -0700 Subject: [PATCH 101/165] pr feedback --- vllm/worker/cpu_worker.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bd67f9f8850a..9debe3f0dfd1 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -207,11 +207,17 @@ def initialize_cache(self, num_gpu_blocks: int, # Note: To reuse the cache management procedure, # use cpu cache as 'gpu cache'. 
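To make the intent of the mocked sampler tensors above concrete, here is a small self-contained sketch (illustrative only, not taken from the patch): a softmax over random values always forms valid probability rows, and drawing the fake token ids from a narrow range such as 10..100 is what sidesteps the vLLM-vs-HF detokenization divergence described in the test comment above.

    import torch

    batch_size, vocab_size = 4, 32000
    # A softmax over random values is a valid distribution (rows sum to 1).
    sampled_token_probs = torch.nn.functional.softmax(
        torch.rand(batch_size, vocab_size, dtype=torch.float32), dim=-1)
    # Restricting ids to a small range avoids special or partial-byte tokens.
    sampled_token_ids = torch.randint(low=10, high=100, size=(batch_size, ),
                                      dtype=torch.long)
    assert torch.allclose(sampled_token_probs.sum(dim=-1),
                          torch.ones(batch_size))
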
num_cpu_blocks = num_gpu_blocks - del num_gpu_blocks + self._validate_num_cpu_blocks(num_cpu_blocks) self.cache_config.num_gpu_blocks = num_cpu_blocks self.cache_config.num_cpu_blocks = 0 + # Initialize the cache. + self._init_cache_engine() + + def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: + """Raise errors if the num_cpu_blocks is invalid. + """ if num_cpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. " "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " @@ -226,9 +232,6 @@ def initialize_cache(self, num_gpu_blocks: int, "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " "initializing the engine.") - # Initialize the cache. - self._init_cache_engine() - def _init_cache_engine(self) -> None: self.cache_engine = CPUCacheEngine(self.cache_config, self.model_config, From f6c7b2ecded9a7b7e9575aec2ca405d7ae3dd9a7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 11:59:09 -0700 Subject: [PATCH 102/165] Zhuohan offline pr feedback --- ...est_block_decode.py => test_multi_step.py} | 13 ++++---- vllm/engine/output_processor/interfaces.py | 25 ++++++++-------- .../{block_decode.py => multi_step.py} | 30 +++++++++++-------- .../{beam_search.py => single_step.py} | 19 +++++++----- 4 files changed, 46 insertions(+), 41 deletions(-) rename tests/engine/output_processor/{test_block_decode.py => test_multi_step.py} (96%) rename vllm/engine/output_processor/{block_decode.py => multi_step.py} (79%) rename vllm/engine/output_processor/{beam_search.py => single_step.py} (94%) diff --git a/tests/engine/output_processor/test_block_decode.py b/tests/engine/output_processor/test_multi_step.py similarity index 96% rename from tests/engine/output_processor/test_block_decode.py rename to tests/engine/output_processor/test_multi_step.py index c4a88d67cabc..6da3da091db7 100644 --- a/tests/engine/output_processor/test_block_decode.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -6,8 +6,7 @@ from tests.core.utils import create_seq_group from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.block_decode import ( - BlockDecodeOutputProcessor) +from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor from vllm.engine.output_processor.stop_checker import StopChecker from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput, @@ -20,7 +19,7 @@ @pytest.mark.parametrize("num_new_tokens", [1, 12]) @pytest.mark.skip_global_cleanup def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify block decoding appends token ids correctly. + """Verify multi-step decoding appends token ids correctly. We append token ids and verify all the token ids were appended correctly. Note that ignore_eos=True. 
@@ -30,7 +29,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): stop_checker = MagicMock(spec=StopChecker) seq_counter = Counter() - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -84,7 +83,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, stop_checker = MagicMock(spec=StopChecker) seq_counter = Counter() - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -146,7 +145,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, eos_token_id = 100 - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, @@ -213,7 +212,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, eos_token_id = 100 - output_processor = BlockDecodeOutputProcessor( + output_processor = MultiStepOutputProcessor( detokenizer=detokenizer, scheduler=scheduler, seq_counter=seq_counter, diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 26ec982cc13f..9ddac7a04cb3 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -16,9 +16,10 @@ class SequenceGroupOutputProcessor(ABC): the scheduler. This is highly coupled with the LLMEngine and should be seen as an extension - of it. The logic is separated out to simplify the LLMEngine class and to - allow a beam search implementation (which handles forking, etc) and a block - decode implementation (which handles decoding >1 token per step). + of it. The logic is separated to simplify the LLMEngine class and allow + separate implementations for single-step decoding (which supports beam + search sequence forking) and multi-step decoding (which does not support + beam search, but does support speculative decoding). """ @staticmethod @@ -32,16 +33,14 @@ def create_output_processor( ): """Create an output processor. - This returns an output processor compatible with beam search if the - scheduler is not configured to scheduler lookahead slots. Otherwise, it - returns an output processor that is incompatible with beam search but - which supports decoding more than one token per scheduling invocation. + This returns a single-step output processor if num_lookahead_slots is + zero, else returns a multi-step output processor. """ if scheduler_config.num_lookahead_slots == 0: # Importing here to avoid cycle. - from vllm.engine.output_processor.beam_search import ( - BeamSearchOutputProcessor) - return BeamSearchOutputProcessor( + from vllm.engine.output_processor.single_step import ( + SingleStepOutputProcessor) + return SingleStepOutputProcessor( scheduler_config, detokenizer, scheduler, @@ -50,9 +49,9 @@ def create_output_processor( ) else: # Importing here to avoid cycle. 
- from vllm.engine.output_processor.block_decode import ( - BlockDecodeOutputProcessor) - return BlockDecodeOutputProcessor( + from vllm.engine.output_processor.multi_step import ( + MultiStepOutputProcessor) + return MultiStepOutputProcessor( detokenizer, scheduler, seq_counter, diff --git a/vllm/engine/output_processor/block_decode.py b/vllm/engine/output_processor/multi_step.py similarity index 79% rename from vllm/engine/output_processor/block_decode.py rename to vllm/engine/output_processor/multi_step.py index e309b57af6de..6b01a94f59e4 100644 --- a/vllm/engine/output_processor/block_decode.py +++ b/vllm/engine/output_processor/multi_step.py @@ -15,17 +15,18 @@ logger = init_logger(__name__) -class BlockDecodeOutputProcessor(SequenceGroupOutputProcessor): +class MultiStepOutputProcessor(SequenceGroupOutputProcessor): """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. Besides not supporting beam search, - this differs from BeamSearchOutputProcessor in that it supports lookahead - scheduling (where the model may generate >1 token per scheduler invocation). - - This allows it to support speculative decoding and cases where the model - runs more than once. We generalize these cases as "block decoding", where - the model emits a block of tokens at the same time. In this case, this class - is responsible for correctly appending all token ids to sequences and - detokenizing new token ids. + detokenization and stopping conditions. It specializes to "multi-step + decoding", where vLLM's worker may generate multiple tokens per invocation. + This is currently mutually exclusive with advanced sampling techniques like + beam search, which motivates the separation of this logic from the single + step output processor. + + This class is responsible for things such as correctly appending all new + token ids to their sequence, detokenizing new token ids, truncating new + output tokens after an eos token, and correctly handling the case where the + number of new output tokens per sequence differs in a single batch. """ def __init__( @@ -56,7 +57,8 @@ def process_outputs(self, sequence_group: SequenceGroup, seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) assert seqs, "expected running sequences" - assert len(seqs) == 1, ("Beam search not supported in block decoding.") + assert len(seqs) == 1, ( + "Beam search not supported in multi-step decoding.") seq = seqs[0] # Since there's only one sequence per sequence group, we can take the @@ -86,7 +88,9 @@ def _process_seq_outputs(self, seq: Sequence, output_token_ids = output_token_ids[:remaining_tokens] # Truncate any tokens after EOS. This is required as spec decode - # generates tokens in fixed blocks, which may go beyond the EOS token. + # generates a fixed number of tokens without evaluating stopping + # conditions within the block. This can cause an eos token to be + # unintentionally ignored. if not sampling_params.ignore_eos: eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id # Avoiding .index calls as exception throwing in the happy path @@ -100,7 +104,7 @@ def _process_seq_outputs(self, seq: Sequence, for output_token_id in output_token_ids: seq.append_token_id( token_id=output_token_id, - # TODO emit logprobs in block decoding. + # TODO emit logprobs in multi-step decoding. 
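The truncation behaviour described above (first respect the remaining max_tokens budget, then cut the speculative block at the first EOS token unless ignore_eos is set, keeping the EOS itself) can be sketched on plain lists; the helper name below is made up for illustration and is not part of the patch.

    def truncate_block(new_token_ids, remaining_tokens, eos_token_id,
                       ignore_eos):
        # Respect the max_tokens budget first.
        new_token_ids = new_token_ids[:remaining_tokens]
        # Then drop everything after the first EOS, unless EOS is ignored.
        if not ignore_eos and eos_token_id in new_token_ids:
            new_token_ids = new_token_ids[:new_token_ids.index(eos_token_id) +
                                          1]
        return new_token_ids

    assert truncate_block([5, 6, 2, 7], remaining_tokens=3, eos_token_id=2,
                          ignore_eos=False) == [5, 6, 2]
    assert truncate_block([5, 2, 7, 8], remaining_tokens=3, eos_token_id=2,
                          ignore_eos=True) == [5, 2, 7]
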
logprobs={output_token_id: Logprob(0.0)}, ) self.detokenizer.decode_sequence_inplace(seq, sampling_params) diff --git a/vllm/engine/output_processor/beam_search.py b/vllm/engine/output_processor/single_step.py similarity index 94% rename from vllm/engine/output_processor/beam_search.py rename to vllm/engine/output_processor/single_step.py index b0c0246b9935..a642070dce60 100644 --- a/vllm/engine/output_processor/beam_search.py +++ b/vllm/engine/output_processor/single_step.py @@ -14,15 +14,18 @@ logger = init_logger(__name__) -class BeamSearchOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to beam search - sequence management and coupled logic like detokenization and stop logic. +class SingleStepOutputProcessor(SequenceGroupOutputProcessor): + """SequenceGroupOutputProcessor which handles "output processing" logic, + which happens after the model returns generated token ids and before + scheduling of the next batch. Output processing logic includes + detokenization, and determining if a sequence is finished (e.g. via max len + or eos token). - This class is in charge of sorting out which sequences survive after beam - sampling. It manages forking and freeing of sequences. - - It does not support lookahead decoding, e.g. where the model generates >1 - token per scheduling invocation. + The SingleStepOutputProcessor is specialized to the case where the model + emits at most a single token per invocation, which precludes configurations + such as speculative decoding or multi-step decoding. This enables beam + search sampling, which requires forking/finishing/freeing sequences in a way + that is currently difficult to schedule multiple steps ahead of time. """ def __init__( From 96f81c4abdb4157b68bd33db3ff07a7825e6695e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 12:18:07 -0700 Subject: [PATCH 103/165] lint --- vllm/spec_decode/spec_decode_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 29144f70ff6f..be3af7be9386 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -18,6 +18,7 @@ logger = init_logger(__name__) + class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. From a573e2c6de994c90f203907cbc1bb267d55ba8dc Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 15:12:46 -0700 Subject: [PATCH 104/165] sampler output gpu tensor --- tests/spec_decode/e2e/test_correctness.py | 2 +- vllm/model_executor/layers/sampler.py | 206 +++++++++++++--------- 2 files changed, 119 insertions(+), 89 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index a8ebd66841eb..9c53c344c4b0 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -39,7 +39,7 @@ }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize("batch_size", [2]) # NOTE: We should run more permutations of this test (more BS, more seeds). But # because our spec decode generates gibberish token ids, the likelihood of # emitting an invalid token combination is nontrivial. 
This causes divergence in diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cb1480de03e3..ec40e016e18f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -29,6 +29,10 @@ class Sampler(nn.Module): parameters (e.g., sampling method, temperature, top-p, top-k, etc.). """ + def __init__(self): + super().__init__() + self._include_gpu_probs_tensor = True + def forward( self, logits: torch.Tensor, @@ -73,13 +77,13 @@ def forward( logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) # Sample the next tokens. - sample_results = _sample(probs, logprobs, sampling_metadata, + sample_results, sampled_tokens_tensor = _sample(probs, logprobs, sampling_metadata, sampling_tensors) # Get the logprobs query results. prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs) + prompt_logprobs, sample_logprobs, (probs, sampled_tokens_tensor)) def _get_bin_counts_and_mask( @@ -354,6 +358,8 @@ def _sample_with_torch( sample_metadata = {} multinomial_samples = {} + sampled_token_ids_tensor = torch.empty(logprobs.shape[0], 1, dtype=torch.long, device=logprobs.device) + # Counterintiutively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. for sampling_type in SamplingType: @@ -367,8 +373,17 @@ def _sample_with_torch( sample_metadata[sampling_type] = (seq_group_ids, seq_groups, is_prompts, sample_indices) if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[sample_indices.long()], - dim=-1) + s_i = sample_indices.long() + greedy_samples = torch.argmax(logprobs[s_i], dim=-1) + + # TODO clean up + # self._include_gpu_probs_tensor + logprobs[s_i, :] = -float('inf') + logprobs[s_i, greedy_samples] = 0.0 + probs[s_i, :] = 0 + probs[s_i, greedy_samples] = 1.0 + sampled_token_ids_tensor[s_i] = greedy_samples.unsqueeze(-1) + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_best_of_in_batch = 1 for seq_group, is_prompt in zip(seq_groups, is_prompts): @@ -380,9 +395,15 @@ def _sample_with_torch( "seq_groups": seq_groups, "generators": sampling_metadata.generators, } - multinomial_samples[sampling_type] = _multinomial( - probs[sample_indices.long()], max_best_of_in_batch, + + s_i = sample_indices.long() + + mn_samples = _multinomial( + probs[s_i], max_best_of_in_batch, **seeded_args) + multinomial_samples[sampling_type] = mn_samples + + sampled_token_ids_tensor[s_i] = mn_samples elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: @@ -396,6 +417,7 @@ def _sample_with_torch( seq_group_ids, seq_groups, is_prompts, sample_indices = sample_metadata[ sampling_type] if sampling_type == SamplingType.GREEDY: + # This merely serializes the samples. 
sample_results = _greedy_sample(seq_groups, greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): sample_results = _random_sample(seq_groups, is_prompts, @@ -410,87 +432,87 @@ def _sample_with_torch( sample_results_dict[i] for i in range(len(sampling_metadata.seq_groups)) ] - return sample_results - - -def _sample_with_triton_kernel( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, -) -> List[Tuple[List[int], List[int]]]: - categorized_seq_group_ids = {t: [] for t in SamplingType} - categorized_sample_indices = sampling_metadata.categorized_sample_indices - for i, seq_group in enumerate(sampling_metadata.seq_groups): - _, sampling_params = seq_group - sampling_type = sampling_params.sampling_type - categorized_seq_group_ids[sampling_type].append(i) - - sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - max_best_of_in_batch = 1 - - # Counterintiutively, having two loops here is actually faster. - # The first loop can run without waiting on GPU<->CPU sync. - for sampling_type in SamplingType: - sample_indices = categorized_sample_indices[sampling_type][:, 0] - sampled_token_indices = categorized_sample_indices[sampling_type][:, 1] - num_tokens = len(sample_indices) - if num_tokens == 0: - continue - seq_group_ids = categorized_seq_group_ids[sampling_type] - seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids] - is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] - sample_metadata[sampling_type] = (seq_group_ids, seq_groups, - is_prompts, sample_indices, - sampled_token_indices) - if sampling_type in (SamplingType.GREEDY, SamplingType.RANDOM, - SamplingType.RANDOM_SEED): - for seq_group, is_prompt in zip(seq_groups, is_prompts): - if is_prompt: - _, sampling_params = seq_group - max_best_of_in_batch = max(max_best_of_in_batch, - sampling_params.best_of) - elif sampling_type == SamplingType.BEAM: - beam_search_logprobs = logprobs[sample_indices] - else: - raise ValueError(f"Unsupported sampling type: {sampling_type}") - - sampled_tokens, _, _ = sample_triton( - probs=probs, - seeds=sampling_tensors.sampling_seeds, - max_best_of=max_best_of_in_batch, - sample_indices=sampling_tensors.sample_indices, - logprobs=logprobs, - # don't save logprobs because we have logic for that below - # TODO: use this instead of the CPU-based logic below - save_logprobs=False, - ) - - # GPU<->CPU sync happens in the loop below. 
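The greedy branch added above rewrites the probability rows in place so that, when the sampler's GPU tensors are propagated for speculative decoding, the greedily chosen token carries probability 1.0. A minimal sketch of that one-hot rewrite (illustrative only; the tensor values are made up, and the full batch is used where the patch indexes a subset of rows):

    import torch

    probs = torch.tensor([[0.1, 0.7, 0.2],
                          [0.6, 0.1, 0.3]])
    greedy_samples = torch.argmax(probs, dim=-1)   # tensor([1, 0])
    rows = torch.arange(probs.shape[0])
    probs[rows, :] = 0.0                           # zero every selected row...
    probs[rows, greedy_samples] = 1.0              # ...then mark the argmax
    assert torch.equal(probs, torch.tensor([[0.0, 1.0, 0.0],
                                            [1.0, 0.0, 0.0]]))
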
- - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - (seq_group_ids, seq_groups, is_prompts, sample_indices, - sampled_token_indices) = sample_metadata[sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample( - seq_groups, sampled_tokens[sampled_token_indices][:, 0]) - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample( - seq_groups, is_prompts, sampled_tokens[sampled_token_indices]) - elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, is_prompts, - sampling_metadata.seq_data, - beam_search_logprobs) - sample_results_dict.update(zip(seq_group_ids, sample_results)) - - sample_results = [ - sample_results_dict[i] - for i in range(len(sampling_metadata.seq_groups)) - ] - return sample_results + return sample_results, sampled_token_ids_tensor + + +#def _sample_with_triton_kernel( +# probs: torch.Tensor, +# logprobs: torch.Tensor, +# sampling_metadata: SamplingMetadata, +# sampling_tensors: SamplingTensors, +#) -> List[Tuple[List[int], List[int]]]: +# categorized_seq_group_ids = {t: [] for t in SamplingType} +# categorized_sample_indices = sampling_metadata.categorized_sample_indices +# for i, seq_group in enumerate(sampling_metadata.seq_groups): +# _, sampling_params = seq_group +# sampling_type = sampling_params.sampling_type +# categorized_seq_group_ids[sampling_type].append(i) +# +# sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} +# sample_metadata = {} +# max_best_of_in_batch = 1 +# +# # Counterintiutively, having two loops here is actually faster. +# # The first loop can run without waiting on GPU<->CPU sync. +# for sampling_type in SamplingType: +# sample_indices = categorized_sample_indices[sampling_type][:, 0] +# sampled_token_indices = categorized_sample_indices[sampling_type][:, 1] +# num_tokens = len(sample_indices) +# if num_tokens == 0: +# continue +# seq_group_ids = categorized_seq_group_ids[sampling_type] +# seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids] +# is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] +# sample_metadata[sampling_type] = (seq_group_ids, seq_groups, +# is_prompts, sample_indices, +# sampled_token_indices) +# if sampling_type in (SamplingType.GREEDY, SamplingType.RANDOM, +# SamplingType.RANDOM_SEED): +# for seq_group, is_prompt in zip(seq_groups, is_prompts): +# if is_prompt: +# _, sampling_params = seq_group +# max_best_of_in_batch = max(max_best_of_in_batch, +# sampling_params.best_of) +# elif sampling_type == SamplingType.BEAM: +# beam_search_logprobs = logprobs[sample_indices] +# else: +# raise ValueError(f"Unsupported sampling type: {sampling_type}") +# +# sampled_tokens, _, _ = sample_triton( +# probs=probs, +# seeds=sampling_tensors.sampling_seeds, +# max_best_of=max_best_of_in_batch, +# sample_indices=sampling_tensors.sample_indices, +# logprobs=logprobs, +# # don't save logprobs because we have logic for that below +# # TODO: use this instead of the CPU-based logic below +# save_logprobs=False, +# ) +# +# # GPU<->CPU sync happens in the loop below. 
+# +# for sampling_type in SamplingType: +# if sampling_type not in sample_metadata: +# continue +# (seq_group_ids, seq_groups, is_prompts, sample_indices, +# sampled_token_indices) = sample_metadata[sampling_type] +# if sampling_type == SamplingType.GREEDY: +# sample_results = _greedy_sample( +# seq_groups, sampled_tokens[sampled_token_indices][:, 0]) +# elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): +# sample_results = _random_sample( +# seq_groups, is_prompts, sampled_tokens[sampled_token_indices]) +# elif sampling_type == SamplingType.BEAM: +# sample_results = _beam_search_sample(seq_groups, is_prompts, +# sampling_metadata.seq_data, +# beam_search_logprobs) +# sample_results_dict.update(zip(seq_group_ids, sample_results)) +# +# sample_results = [ +# sample_results_dict[i] +# for i in range(len(sampling_metadata.seq_groups)) +# ] +# return sample_results def _sample( @@ -668,6 +690,7 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], + spec_decode_data, ) -> SamplerOutput: sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, @@ -684,4 +707,11 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return SamplerOutput(outputs=sampler_output) + + + probs, token_ids = spec_decode_data + return SamplerOutput( + outputs=sampler_output, + sampled_token_probs=probs, + sampled_token_ids=token_ids, + ) From bb36081d218d6fe491beb3b8a4983e4f16f30ac2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 15:14:55 -0700 Subject: [PATCH 105/165] remove mock in multi step worker --- vllm/spec_decode/multi_step_worker.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index ce63c329a40a..44605fc0ec58 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -348,12 +348,13 @@ def _merge_outputs( # We mock the device tensors until PR 7/9 is merged (e2e correctness). # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer for step_output in sampler_output: - maybe_mock_device_tensors( - sampler_output=step_output, - batch_size=len(proposal_lens), - vocab_size=self._vocab_size, - device=self._device, - ) + pass + #maybe_mock_device_tensors( + # sampler_output=step_output, + # batch_size=len(proposal_lens), + # vocab_size=self._vocab_size, + # device=self._device, + #) proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) From 4c40eaf65bc715df9cf5ea2eb1b043a375c9d10a Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 15:17:48 -0700 Subject: [PATCH 106/165] remove mock tensors from target worker --- vllm/spec_decode/batch_expansion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 88af1dd36015..353a8a584ebc 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -148,12 +148,12 @@ def _contract_batch(self, original_bs: int, # We mock the device tensors until PR 7/9 is merged (e2e correctness). 
# https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer - maybe_mock_device_tensors( - sampler_output=target_sampler_output, - batch_size=len(non_spec_indices) + num_scoring_tokens, - vocab_size=self._vocab_size, - device=self._device, - ) + #maybe_mock_device_tensors( + # sampler_output=target_sampler_output, + # batch_size=len(non_spec_indices) + num_scoring_tokens, + # vocab_size=self._vocab_size, + # device=self._device, + #) (target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs) = self._split_scoring_output( From 22220179c48ac5ffc6cede345a5bb58905aba365 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 15:43:26 -0700 Subject: [PATCH 107/165] bs1 correctness test passes --- tests/spec_decode/e2e/conftest.py | 8 +-- tests/spec_decode/e2e/test_correctness.py | 63 ++++++++++++++++++++++- vllm/worker/worker.py | 11 ++-- 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 1d99cb5d3221..dbe2661d93ed 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -36,6 +36,8 @@ def generator_inner(): del llm cleanup() - for llm in generator_inner(): - yield llm - del llm + def generator_outer(): + for llm in generator_inner(): + yield llm + del llm + return generator_outer diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 9c53c344c4b0..cae2f03972d7 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -39,7 +39,7 @@ }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("batch_size", [1]) # NOTE: We should run more permutations of this test (more BS, more seeds). But # because our spec decode generates gibberish token ids, the likelihood of # emitting an invalid token combination is nontrivial. This causes divergence in @@ -88,6 +88,63 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): print(f"{actual_token_ids=}") assert actual_tokens.strip() == expected_tokens.strip() +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. 
+ "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + } +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): + output_len = 32 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + _, spec_batch_token_ids = get_output_from_llm_generator( + test_llm_generator, prompts, sampling_params) + + _, baseline_batch_token_ids = get_output_from_llm_generator( + test_llm_generator, prompts, sampling_params) + + assert len(baseline_batch_token_ids) == len(prompts) + assert len(spec_batch_token_ids) == len(prompts) + assert [len(token_ids) for token_ids in baseline_batch_token_ids + spec_batch_token_ids] == [output_len] * (batch_size * 2) + + for i, (baseline_token_ids, spec_token_ids) in enumerate(zip(baseline_batch_token_ids, spec_batch_token_ids)): + print(f'{i=} {baseline_batch_token_ids=}') + print(f'{i=} {spec_batch_token_ids=}') + assert baseline_token_ids == spec_token_ids + @pytest.mark.parametrize( "common_llm_kwargs", @@ -142,7 +199,9 @@ def test_spec_decode_xfail(test_llm_generator): def get_output_from_llm_generator( llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: - for llm in llm_generator: + tokens = [] + token_ids = [] + for llm in llm_generator(): outputs = llm.generate(prompts, sampling_params, use_tqdm=True) token_ids = [output.outputs[0].token_ids for output in outputs] tokens = [output.outputs[0].text for output in outputs] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9add1bd59ef6..9fbb763831f7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -135,9 +135,14 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") + try: + assert peak_memory > 0, ( + "Error in memory profiling. 
This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + except AssertionError as e: + print(e) + breakpoint() + raise cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( From 51a7eaed47c32a88cec24ffa3aeb5c7cab5dbfb0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 15:45:10 -0700 Subject: [PATCH 108/165] bs32 passes --- tests/spec_decode/e2e/test_correctness.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index cae2f03972d7..4003119e5be9 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -109,9 +109,9 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): } ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): +def test_spec_decode_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, batch_size: int): output_len = 32 temperature = 0.0 @@ -138,11 +138,10 @@ def test_spec_decode_e2e_greedy_correctness_bs1(baseline_llm_generator, test_llm assert len(baseline_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts) - assert [len(token_ids) for token_ids in baseline_batch_token_ids + spec_batch_token_ids] == [output_len] * (batch_size * 2) for i, (baseline_token_ids, spec_token_ids) in enumerate(zip(baseline_batch_token_ids, spec_batch_token_ids)): - print(f'{i=} {baseline_batch_token_ids=}') - print(f'{i=} {spec_batch_token_ids=}') + print(f'{i=} {baseline_token_ids=}') + print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids From 1153cbee21b188b0f402b5cd790e740c79fa73b7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 16:13:33 -0700 Subject: [PATCH 109/165] adding more correctness tests --- tests/spec_decode/e2e/conftest.py | 12 ++- tests/spec_decode/e2e/test_correctness.py | 119 +++++++++++++++++++++- 2 files changed, 121 insertions(+), 10 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index dbe2661d93ed..6dab4c9747dc 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -6,28 +6,30 @@ @pytest.fixture -def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, +def baseline_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, + return create_llm_generator("baseline", request, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, seed) @pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, +def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs, test_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, + return create_llm_generator("test", request, common_llm_kwargs, per_test_common_llm_kwargs, test_llm_kwargs, seed) -def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, +def create_llm_generator(baseline_or_test, request, common_llm_kwargs, per_test_common_llm_kwargs, distinct_llm_kwargs, seed): kwargs = { **common_llm_kwargs, 
**per_test_common_llm_kwargs, **distinct_llm_kwargs, } + test_name = request.node.name def generator_inner(): + print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}') llm = LLM(**kwargs) set_random_seed(seed) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 4003119e5be9..ea9f03032ee5 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -91,9 +91,80 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): @pytest.mark.parametrize( "common_llm_kwargs", [{ - # Use a small model for a fast test. - # Note this is repeated in the test body; to initialize a tokenizer. + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { "model": "JackFram/llama-68m", + }, + { + "model": "JackFram/llama-160m", + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } +]) +@pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): + run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model": "JackFram/llama-68m", + }, + { + "model": "JackFram/llama-160m", + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } +]) +@pytest.mark.parametrize("batch_size", [32]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(baseline_llm_generator, test_llm_generator, batch_size: int): + run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # A "real" model (not tiny). + "model": "meta-llama/Llama-2-7b-chat-hf", # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -102,16 +173,54 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): "use_v2_block_manager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ + # Try two different num spec tokens. 
{ "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, } ]) +@pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_real_model_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): + run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # A "real" model (not tiny). + "model": "meta-llama/Llama-2-7b-chat-hf", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("test_llm_kwargs", [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } +]) +@pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, batch_size: int): +def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(baseline_llm_generator, test_llm_generator, batch_size: int): + run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) + +def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size): output_len = 32 temperature = 0.0 @@ -134,7 +243,7 @@ def test_spec_decode_e2e_greedy_correctness(baseline_llm_generator, test_llm_gen test_llm_generator, prompts, sampling_params) _, baseline_batch_token_ids = get_output_from_llm_generator( - test_llm_generator, prompts, sampling_params) + baseline_llm_generator, prompts, sampling_params) assert len(baseline_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts) From 68072af3441e7c0a21f4a2cbd702cfec0287463d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 16:38:32 -0700 Subject: [PATCH 110/165] test with ignore_eos=False --- tests/spec_decode/e2e/conftest.py | 19 +- tests/spec_decode/e2e/test_correctness.py | 279 ++++++++++++++++------ 2 files changed, 212 insertions(+), 86 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 6dab4c9747dc..b9f9001511ec 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -6,21 +6,25 @@ @pytest.fixture -def baseline_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed): - return create_llm_generator("baseline", request, common_llm_kwargs, per_test_common_llm_kwargs, +def baseline_llm_generator(request, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + seed): + return create_llm_generator("baseline", request, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, seed) @pytest.fixture def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs, test_llm_kwargs, seed): - return create_llm_generator("test", request, common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed) + return create_llm_generator("test", request, common_llm_kwargs, + per_test_common_llm_kwargs, test_llm_kwargs, + seed) -def create_llm_generator(baseline_or_test, request, 
common_llm_kwargs, per_test_common_llm_kwargs, - distinct_llm_kwargs, seed): +def create_llm_generator(baseline_or_test, request, common_llm_kwargs, + per_test_common_llm_kwargs, distinct_llm_kwargs, + seed): kwargs = { **common_llm_kwargs, **per_test_common_llm_kwargs, @@ -42,4 +46,5 @@ def generator_outer(): for llm in generator_inner(): yield llm del llm + return generator_outer diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index ea9f03032ee5..246f540fa2df 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -88,6 +88,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): print(f"{actual_token_ids=}") assert actual_tokens.strip() == expected_tokens.strip() + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -97,32 +98,49 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model": "JackFram/llama-68m", - }, - { - "model": "JackFram/llama-160m", - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model": "JackFram/llama-68m", + }, + { + "model": "JackFram/llama-160m", + }, + ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } -]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } + ]) +@pytest.mark.parametrize( + "output_len", + [ + # Use long output len for the small model test. + 1536, + ]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): - run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) +def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", @@ -133,32 +151,99 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(baseline_llm_generato # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model": "JackFram/llama-68m", - }, - { - "model": "JackFram/llama-160m", - }, -]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. 
+ { + "model": "JackFram/llama-68m", + }, + { + "model": "JackFram/llama-160m", + }, + ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } + ]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 256, + ]) +@pytest.mark.parametrize("batch_size", [32]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model": "JackFram/llama-68m", + }, + { + "model": "JackFram/llama-160m", + }, + ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } + ]) +@pytest.mark.parametrize("max_output_len", [ + 256, ]) @pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(baseline_llm_generator, test_llm_generator, batch_size: int): - run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) +def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( + baseline_llm_generator, test_llm_generator, batch_size: int, + max_output_len: int): + run_greedy_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len, + force_output_len=False) + @pytest.mark.parametrize( "common_llm_kwargs", @@ -172,23 +257,38 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(baseline_llm_gen # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } -]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. 
+ { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } + ]) @pytest.mark.parametrize("batch_size", [1]) +@pytest.mark.parametrize( + "output_len", + [ + # Use decently long output len for a high quality test. + 256, + ]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_real_model_bs1(baseline_llm_generator, test_llm_generator, batch_size: int): - run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) +def test_spec_decode_e2e_greedy_correctness_real_model_bs1( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", @@ -202,26 +302,42 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(baseline_llm_generato # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } -]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + } + ]) @pytest.mark.parametrize("batch_size", [32]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 64, + ]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(baseline_llm_generator, test_llm_generator, batch_size: int): - run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size) - -def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batch_size): - output_len = 32 +def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, + batch_size, max_output_len, + force_output_len: bool): temperature = 0.0 prompts = [ @@ -233,9 +349,13 @@ def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batc prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + # If the test requires that we generated max_output_len tokens, then set the + # sampling params to ignore eos token. 
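    # A hedged aside on why exact token-id equality is the oracle used in
    # these tests: rejection sampling is designed to leave the target model's
    # output distribution unchanged, and with temperature=0.0 that
    # distribution is a point mass on the argmax token. A draft token is then
    # either accepted because it matches the argmax or replaced by it on
    # rejection, so the speculative and baseline runs should emit identical
    # ids; forcing the full output length below just keeps the two runs
    # comparable token-for-token.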
+ ignore_eos = force_output_len + sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, + max_tokens=max_output_len, + ignore_eos=ignore_eos, temperature=temperature, ) @@ -248,7 +368,8 @@ def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, batc assert len(baseline_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts) - for i, (baseline_token_ids, spec_token_ids) in enumerate(zip(baseline_batch_token_ids, spec_batch_token_ids)): + for i, (baseline_token_ids, spec_token_ids) in enumerate( + zip(baseline_batch_token_ids, spec_batch_token_ids)): print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids From 71860e16a0e0cddc0f827e3fde17b23c5d7569f1 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 16:38:53 -0700 Subject: [PATCH 111/165] lint --- vllm/model_executor/layers/sampler.py | 19 ++++++++++--------- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/multi_step_worker.py | 3 +-- vllm/worker/worker.py | 11 +++-------- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index ec40e016e18f..db36bb898a3f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -5,7 +5,6 @@ import torch import torch.nn as nn -from vllm.model_executor.layers.ops.sample import sample as sample_triton from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType @@ -77,13 +76,14 @@ def forward( logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) # Sample the next tokens. - sample_results, sampled_tokens_tensor = _sample(probs, logprobs, sampling_metadata, - sampling_tensors) + sample_results, sampled_tokens_tensor = _sample( + probs, logprobs, sampling_metadata, sampling_tensors) # Get the logprobs query results. prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs, (probs, sampled_tokens_tensor)) + prompt_logprobs, sample_logprobs, + (probs, sampled_tokens_tensor)) def _get_bin_counts_and_mask( @@ -358,7 +358,10 @@ def _sample_with_torch( sample_metadata = {} multinomial_samples = {} - sampled_token_ids_tensor = torch.empty(logprobs.shape[0], 1, dtype=torch.long, device=logprobs.device) + sampled_token_ids_tensor = torch.empty(logprobs.shape[0], + 1, + dtype=torch.long, + device=logprobs.device) # Counterintiutively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. 
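
For context on the shape bookkeeping this diff is juggling, a minimal
self-contained sketch (illustrative sizes and row indices, not the vLLM
sampler itself): the preallocated id tensor is (num_tokens, 1); greedy argmax
yields a 1-D tensor that needs a trailing dimension, while
torch.multinomial(..., num_samples=1) already returns a column.

import torch

vocab_size, num_tokens = 32000, 4
probs = torch.rand(num_tokens, vocab_size)
probs /= probs.sum(dim=-1, keepdim=True)

sampled_token_ids = torch.empty(num_tokens, 1, dtype=torch.long)

greedy_rows = torch.tensor([0, 1])
random_rows = torch.tensor([2, 3])

# argmax gives shape (2,), so a trailing dim is added before assigning.
sampled_token_ids[greedy_rows] = probs[greedy_rows].argmax(dim=-1).unsqueeze(-1)
# multinomial with num_samples=1 gives shape (2, 1), already column-shaped.
sampled_token_ids[random_rows] = torch.multinomial(probs[random_rows],
                                                   num_samples=1)

assert sampled_token_ids.shape == (num_tokens, 1)
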
@@ -398,9 +401,8 @@ def _sample_with_torch( s_i = sample_indices.long() - mn_samples = _multinomial( - probs[s_i], max_best_of_in_batch, - **seeded_args) + mn_samples = _multinomial(probs[s_i], max_best_of_in_batch, + **seeded_args) multinomial_samples[sampling_type] = mn_samples sampled_token_ids_tensor[s_i] = mn_samples @@ -708,7 +710,6 @@ def _build_sampler_output( sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - probs, token_ids = spec_decode_data return SamplerOutput( outputs=sampler_output, diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 353a8a584ebc..e01cf224a5ae 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -6,8 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors, - nvtx_range, sampler_output_to_torch, +from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, + sampler_output_to_torch, split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 44605fc0ec58..0ca970c8f5ff 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,8 +6,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (maybe_mock_device_tensors, - sampler_output_to_torch) +from vllm.spec_decode.util import sampler_output_to_torch from vllm.worker.worker import Worker diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9fbb763831f7..9add1bd59ef6 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -135,14 +135,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - try: - assert peak_memory > 0, ( - "Error in memory profiling. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - except AssertionError as e: - print(e) - breakpoint() - raise + assert peak_memory > 0, ( + "Error in memory profiling. 
This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") cache_block_size = self.get_cache_block_size_bytes() num_gpu_blocks = int( From 4b2470bdd17cd3a2dc5b92c5bd989e1eb950c691 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 16:41:04 -0700 Subject: [PATCH 112/165] fix multinomial sampling --- vllm/model_executor/layers/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index db36bb898a3f..dbf161538937 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -405,7 +405,7 @@ def _sample_with_torch( **seeded_args) multinomial_samples[sampling_type] = mn_samples - sampled_token_ids_tensor[s_i] = mn_samples + sampled_token_ids_tensor[s_i] = mn_samples.unsqueeze(-1) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: From 5e9dafbdb509a8ec92fb43b228b3ddffeeb3cc45 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 17:08:14 -0700 Subject: [PATCH 113/165] more tests, unfix examples test --- tests/spec_decode/e2e/test_correctness.py | 32 +++++++++-------------- vllm/model_executor/layers/sampler.py | 2 +- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 246f540fa2df..309fbb95f58b 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -6,6 +6,9 @@ from vllm import SamplingParams +# TODO test preemption +# TODO test integration (cuda graph, tp) +# TODO test smoke (sampling params) @pytest.mark.parametrize( "common_llm_kwargs", @@ -14,8 +17,8 @@ # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", - # Skip real loading for fast test. - "load_format": "dummy", + ## Skip real loading for fast test. + # "load_format": "dummy", # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -30,21 +33,12 @@ "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - }, { # No spec decode. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1]) -# NOTE: We should run more permutations of this test (more BS, more seeds). But -# because our spec decode generates gibberish token ids, the likelihood of -# emitting an invalid token combination is nontrivial. This causes divergence in -# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf- -# start" bytes are emitted. +@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): """Run generation with speculative decoding on a batch. 
Verify the engine @@ -67,8 +61,6 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): max_tokens=output_len, ignore_eos=True, temperature=temperature, - skip_special_tokens=True, - spaces_between_special_tokens=False, ) batch_tokens, batch_token_ids = get_output_from_llm_generator( @@ -135,7 +127,7 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - run_greedy_correctness_test(baseline_llm_generator, + run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, @@ -188,7 +180,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - run_greedy_correctness_test(baseline_llm_generator, + run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, @@ -238,7 +230,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( baseline_llm_generator, test_llm_generator, batch_size: int, max_output_len: int): - run_greedy_correctness_test(baseline_llm_generator, + run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len, @@ -283,7 +275,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( def test_spec_decode_e2e_greedy_correctness_real_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - run_greedy_correctness_test(baseline_llm_generator, + run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, @@ -328,14 +320,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - run_greedy_correctness_test(baseline_llm_generator, + run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, force_output_len=True) -def run_greedy_correctness_test(baseline_llm_generator, test_llm_generator, +def run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len, force_output_len: bool): temperature = 0.0 diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index dbf161538937..db36bb898a3f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -405,7 +405,7 @@ def _sample_with_torch( **seeded_args) multinomial_samples[sampling_type] = mn_samples - sampled_token_ids_tensor[s_i] = mn_samples.unsqueeze(-1) + sampled_token_ids_tensor[s_i] = mn_samples elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: From 9f42d5afc41aa5c94fae44a50d1ee7d68de397c7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 17:08:47 -0700 Subject: [PATCH 114/165] lint --- tests/spec_decode/e2e/test_correctness.py | 48 ++++++++++++----------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 309fbb95f58b..8ab26ecf23dc 100644 
--- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -10,6 +10,7 @@ # TODO test integration (cuda graph, tp) # TODO test smoke (sampling params) + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -128,10 +129,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) @pytest.mark.parametrize( @@ -181,10 +182,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) @pytest.mark.parametrize( @@ -231,10 +232,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( baseline_llm_generator, test_llm_generator, batch_size: int, max_output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len, - force_output_len=False) + test_llm_generator, + batch_size, + max_output_len, + force_output_len=False) @pytest.mark.parametrize( @@ -276,10 +277,10 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) @pytest.mark.parametrize( @@ -321,15 +322,16 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) -def run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, - batch_size, max_output_len, - force_output_len: bool): +def run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, batch_size, + max_output_len, + force_output_len: bool): temperature = 0.0 prompts = [ From 399e7ddc045231fd3159b30123f0797499364720 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 17:12:20 -0700 Subject: [PATCH 115/165] clean --- tests/spec_decode/e2e/test_correctness.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 8ab26ecf23dc..215ec65c8c47 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -9,6 +9,7 @@ # TODO test preemption # TODO test integration (cuda graph, tp) # TODO test smoke (sampling params) +# TODO investigate token with 68m/68m k=5 temp=1.0 @pytest.mark.parametrize( @@ -70,9 +71,10 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): # Expect a generation for each prompt in the batch. 
assert len(batch_token_ids) == len(prompts) - # Expect each generation to have expected number of tokens (note - # ignore_eos=True). - assert all(len(token_ids) == output_len for token_ids in batch_token_ids) + # Expect each generation to have expected number of tokens (note ignore_eos + # is True). + assert [len(token_ids) + for token_ids in batch_token_ids] == ([output_len] * batch_size) # Expect detokenized string to match. tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") From 44c8195eef0ef4b4e4e8feb1e3c5331a57631ed7 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 17:29:27 -0700 Subject: [PATCH 116/165] failing test --- tests/spec_decode/e2e/test_correctness.py | 54 +++++++++++++++++++++++ vllm/config.py | 6 ++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 215ec65c8c47..558d09e60947 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -10,6 +10,9 @@ # TODO test integration (cuda graph, tp) # TODO test smoke (sampling params) # TODO investigate token with 68m/68m k=5 temp=1.0 +# TODO test for when sequences skip speculation +# TODO test different block sizes +# TODO validate acceptance rate @pytest.mark.parametrize( @@ -328,6 +331,57 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( batch_size, max_output_len=output_len, force_output_len=True) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + #"block_size": 8, + # 2 for small prompt, 256//8 for generated. + #"num_gpu_blocks_override": 2 + 256//8, + #"max_model_len": (2 + 256//8)*8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + #"model": "JackFram/llama-160m", + "model": "meta-llama/Llama-2-7b-chat-hf", + }, + ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + ]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 256, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_with_preemption( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + """ + NOTE(cade): this test fails, unclear why + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + def run_greedy_equality_correctness_test(baseline_llm_generator, diff --git a/vllm/config.py b/vllm/config.py index 753fc33e9b71..ce72d4f70320 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -675,7 +675,6 @@ def maybe_create_spec_config( draft_revision = None draft_code_revision = None draft_quantization = None - draft_max_model_len = None draft_model_config = ModelConfig( model=speculative_model, @@ -689,7 +688,7 @@ def maybe_create_spec_config( revision=draft_revision, code_revision=draft_code_revision, tokenizer_revision=target_model_config.tokenizer_revision, - max_model_len=draft_max_model_len, + max_model_len=None, quantization=draft_quantization, enforce_eager=target_model_config.enforce_eager, max_context_len_to_capture=target_model_config. 
@@ -697,6 +696,9 @@ def maybe_create_spec_config( max_logprobs=target_model_config.max_logprobs, ) + # TODO docs + #draft_model_config.max_model_len = min(target_model_config.max_model_len, draft_model_config.max_model_len) + draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( target_parallel_config)) From a931ed02ed7c8c83a65ae9d206e8854416b4c0a5 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 18:15:45 -0700 Subject: [PATCH 117/165] many bs test --- tests/spec_decode/e2e/test_correctness.py | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 558d09e60947..0aef13d12ff4 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -192,6 +192,52 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( max_output_len=output_len, force_output_len=True) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + "model": "JackFram/llama-160m", + }, + { + "model": "meta-llama/Llama-2-7b-chat-hf", + }, + ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + # Try two different num spec tokens. + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + ]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 256, + ]) +@pytest.mark.parametrize("batch_size", [2, 4, 5, 9, 13, 27, 29]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_e2e_greedy_correctness_tiny_model_many_distinct_bs( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", From e36b35222667b35dbf0afa70a4ba0af196bca9fa Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 9 Apr 2024 18:42:18 -0700 Subject: [PATCH 118/165] remove logs --- vllm/spec_decode/spec_decode_worker.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index be3af7be9386..5e11e44b53bd 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -156,7 +156,7 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") - logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + #logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. @@ -188,7 +188,7 @@ def _run_no_spec( proposer and scorer model so that the KV cache is consistent between the two. 
""" - logger.info("run proposer worker no spec") + #logger.info("run proposer worker no spec") self.proposer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, @@ -197,7 +197,7 @@ def _run_no_spec( blocks_to_copy=blocks_to_copy, ) - logger.info("run target worker no spec") + #logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -231,13 +231,13 @@ def _run_speculative_decoding_step( sequence. """ - logger.info("get spec proposals") + #logger.info("get spec proposals") # Generate proposals using draft worker. proposals = self.proposer_worker.get_spec_proposals( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - logger.info("score proposals") + #logger.info("score proposals") proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, @@ -247,11 +247,11 @@ def _run_speculative_decoding_step( proposals, ) - logger.info("verify proposals") + #logger.info("verify proposals") accepted_token_ids = self._verify_tokens(seq_group_metadata_list, proposal_scores, proposals, k) - logger.info("create output list") + #logger.info("create output list") return self._create_output_sampler_list(seq_group_metadata_list, accepted_token_ids, k) From de1691929e58af704c72b329c9e608d06f2d8320 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 10 Apr 2024 14:13:32 -0700 Subject: [PATCH 119/165] pr feedback --- vllm/engine/llm_engine.py | 3 +-- vllm/engine/output_processor/stop_checker.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e6e75ee59c76..59add1faba44 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -193,8 +193,7 @@ def __init__( self.seq_counter, self.get_tokenizer_for_seq, stop_checker=StopChecker( - self.scheduler, - self.scheduler_config, + self.scheduler_config.max_model_len, self.get_tokenizer_for_seq, ), )) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 2a6c79d2dc02..37d53fa3c7fa 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,6 @@ -from typing import List +from typing import Callable, List + +from transformers import PreTrainedTokenizer from vllm.sampling_params import SamplingParams from vllm.sequence import Sequence, SequenceStatus @@ -11,9 +13,10 @@ class StopChecker: emitted, or if we have exceeded the max model len. """ - def __init__(self, scheduler, scheduler_config, get_tokenizer_for_seq): - self.scheduler = scheduler - self.scheduler_config = scheduler_config + def __init__(self, max_model_len: int, + get_tokenizer_for_seq: Callable[[Sequence], + PreTrainedTokenizer]): + self.max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq def maybe_stop_sequence(self, seq: Sequence, @@ -23,7 +26,7 @@ def maybe_stop_sequence(self, seq: Sequence, """ # Check if the sequence has reached max_model_len. 
- if seq.get_len() > self.scheduler_config.max_model_len: + if seq.get_len() > self.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return From d30c4a5d9033cff9a62b5816030431cb42dbe880 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 10 Apr 2024 18:49:55 -0700 Subject: [PATCH 120/165] test larger bs, remove many distinct test due to numerical instability --- tests/spec_decode/e2e/test_correctness.py | 48 +------------------ .../layers/rejection_sampler.py | 1 + 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 0aef13d12ff4..abfb206356f8 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -181,7 +181,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( # Use small output len for fast test. 256, ]) -@pytest.mark.parametrize("batch_size", [32]) +@pytest.mark.parametrize("batch_size", [64]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, @@ -192,52 +192,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( max_output_len=output_len, force_output_len=True) -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - "model": "JackFram/llama-160m", - }, - { - "model": "meta-llama/Llama-2-7b-chat-hf", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [2, 4, 5, 9, 13, 27, 29]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_greedy_correctness_tiny_model_many_distinct_bs( - baseline_llm_generator, test_llm_generator, batch_size: int, - output_len: int): - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) - @pytest.mark.parametrize( "common_llm_kwargs", diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index ecd2bd0fce3a..61c2009b07c2 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -144,6 +144,7 @@ def _batch_modified_rejection_sampling( recovered_probs = self._get_recovered_probs( target_probs, draft_probs).reshape(batch_size * k, vocab_size) + # NOTE: the recovered_probs are overwritten by this method. 
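        # A hedged sketch of what "recovered" means here (illustrative, not
        # the library code): on rejection, sampling falls back to the residual
        # probability mass where the target model exceeds the draft model,
        # renormalized per position:
        #
        #     recovered = (target_probs - draft_probs).clamp(min=0)
        #     recovered = recovered / recovered.sum(dim=-1, keepdim=True)
        #
        # The overwrite noted above is presumably the multinomial helper
        # reusing its input buffer in place.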
recovered_token_ids = _multinomial(recovered_probs, num_samples=1).reshape( batch_size, k) From d8f8d1f21d813e09619ef0d462cef00659c41ce3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 10 Apr 2024 18:59:42 -0700 Subject: [PATCH 121/165] lint --- tests/spec_decode/e2e/test_correctness.py | 17 ++++++++--------- vllm/config.py | 3 ++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index abfb206356f8..3f56a387f70b 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -331,6 +331,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( batch_size, max_output_len=output_len, force_output_len=True) + + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -354,14 +356,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize( "output_len", [ @@ -383,7 +383,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( force_output_len=True) - def run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, max_output_len, diff --git a/vllm/config.py b/vllm/config.py index ce72d4f70320..8773d7aeb067 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -697,7 +697,8 @@ def maybe_create_spec_config( ) # TODO docs - #draft_model_config.max_model_len = min(target_model_config.max_model_len, draft_model_config.max_model_len) + #draft_model_config.max_model_len = min(target_model_config.max_model_len, + # draft_model_config.max_model_len) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( From 2bbc12c17c9ca17a3f89ee0e6523ab2861464e6f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 11 Apr 2024 12:38:05 -0700 Subject: [PATCH 122/165] wip validate acceptance rate --- tests/spec_decode/e2e/test_correctness.py | 14 +++++++------- vllm/model_executor/layers/rejection_sampler.py | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 3f56a387f70b..9accd4511c96 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -104,9 +104,9 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): { "model": "JackFram/llama-68m", }, - { - "model": "JackFram/llama-160m", - }, + #{ + # "model": "JackFram/llama-160m", + #}, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( @@ -117,10 +117,10 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } + #{ + # "speculative_model": "JackFram/llama-68m", + # "num_speculative_tokens": 1, + #} ]) @pytest.mark.parametrize( "output_len", diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 61c2009b07c2..925934d2197f 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ 
-316,6 +316,10 @@ def _create_output( self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() self.num_draft_tokens += batch_size * k + print(f'{self.num_accepted_tokens=}') + print(f'{self.num_emitted_tokens=}') + print(f'{self.num_draft_tokens=}') + return output_with_bonus_tokens def _raise_if_incorrect_shape( From 2d6112bc7a1a1ec47a8eec42136c9dd563f5641d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 00:27:43 -0700 Subject: [PATCH 123/165] WIP chunked prefill work --- tests/spec_decode/e2e/test_correctness.py | 17 +++++---- vllm/core/scheduler.py | 46 ++++++++++++++++++++++- vllm/engine/llm_engine.py | 32 +++++++++++++++- vllm/sequence.py | 5 +++ 4 files changed, 90 insertions(+), 10 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index a8ebd66841eb..e5f0e70e5b3b 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -26,17 +26,20 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ + #{ + # "speculative_model": "JackFram/llama-68m", + # "num_speculative_tokens": 5, + #}, { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - { + #"enable_chunked_prefill": True, + #"max_num_batched_tokens": 2, + #"max_num_seqs": 2, "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 1, }, - { - # No spec decode. - }, + #{ + # # No spec decode. + #}, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [1]) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2403093202a7..75272e224bec 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -100,6 +100,27 @@ class ScheduledSequenceGroup: token_chunk_size: int + """ + What should happen here? + - each sequence in a sequence group can produce one output token, or multiple. + - the "role" of the output token could be prefill, or decode. + - even with one output token, it could be prefill or decode. + - with >1 output token, it could be prefill or decode. + - technically, it could even be a mix of both prefill and decode -- first N are ignored, latter M are sent to user. + + so we need to track how many of the tokens have been computed + we need to track how many new tokens have been computed + we need to track num decode tokens + + why can't token_chunk_size just be num_prefill_tokens? then any output after + token_chunk_size is a decode token + + - s/token_chunk_size/prefill_chunk_size/g + - make optional (None during decode) + - how to handle the last chunk, where a token is emitted? 
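
    A worked example may make the distinction concrete (numbers illustrative
    only): prompt_len=7, token budget=4, k=1 lookahead slot. Step 1 schedules
    a prefill chunk of 4 prompt tokens and emits no output. Step 2 schedules
    the remaining 3 prompt tokens; that chunk completes prefill and emits 1
    output token. Every later step is pure decode: 1 new token is computed per
    sequence (plus k lookahead slots when speculating), and every sampled
    token is surfaced to the user. "Prefill chunk size" and "number of
    decode/lookahead tokens" are therefore different quantities, which is what
    the rename proposed above is getting at.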
+ """ + + @dataclass class SchedulerOutputs: """The scheduling decision made from a scheduler.""" @@ -437,7 +458,8 @@ def _schedule_running( else: decode_seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=1)) + token_chunk_size=1)) + #token_chunk_size=self._get_num_lookahead_slots(is_prefill=False))) budget.add_num_batched_tokens(seq_group.request_id, num_running_tokens) budget.add_num_seqs(seq_group.request_id, num_running_seqs) @@ -654,7 +676,7 @@ def _schedule_prefills( if curr_loras is not None and lora_int_id > 0: curr_loras.add(lora_int_id) waiting_queue.popleft() - self._allocate_and_set_running(seq_group, num_new_tokens) + self._allocate_and_set_running(seq_group, num_new_tokens) # num_new_tokens not required here seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -1096,10 +1118,30 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup, seqs = seq_group.get_seqs(status=status) for seq in seqs: num_new_tokens += seq.get_num_new_tokens() + # + self._get_num_lookahead_slots(is_prefill=seq_group.is_prefill()) # Chunk if a running request cannot fit in. # If number of seq > 1, it means it is doing beam search in a # decode phase. Do not chunk in that case. if enable_chunking and len(seqs) == 1: num_new_tokens = min(num_new_tokens, budget.remaining_token_budget()) + + return num_new_tokens + +""" +1 Generalize get_num_new_tokens to distinguish between prefill, decode. + essentially, the number of query tokens, _not_ the number of output tokens. + + token_chunk_size --> rename to prefill only (None for decode) + token budget distinguishes between prefill and decode (_computed_ tokens, not "new" output tokens) + + currently we assume 1:1 map between new tokens and computed tokens; we need to break this 1:1 mapping + +2 Set num_new_tokens to num_lookahead_tokens in spec decode + will break when budget is exhausted + breaks for some other reason? + +3 Fork chunked prefill vs spec decode budget calculations + +""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6d154296f29e..6152687c0fc2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -438,18 +438,48 @@ def _process_model_outputs( # [step][sequence group]. output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) + + print(f'_process_model_outputs') # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): seq_group = scheduled_seq_group.seq_group + + print(f'{scheduled_seq_group.token_chunk_size=}') + print(f'before {seq_group.get_num_uncomputed_tokens()=}') + + output_token_ids = [sgo.samples[0].output_token for sgo in outputs] + print(f'{output_token_ids=}') + + + # Num computed tokens tracks how many of the prefill tokens have + # been computed. + # + # The token chunk size is always 1 in normal decoding. So + # we have 1 uncomputed token, then zero after this update. + # + # Then, process_outputs runs when it's zero. + # + # So, the problem is likely that the scheduler updates token_chunk_size + # to num_lookahead_slots. Let's confirm. + + # These are only prefill computed tokens. seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - # If uncomputed tokens > 0, it means prefill is chunked. + + # token chunk size -- is this one? 
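            # A small illustrative trace of the accounting questioned above
            # (values assumed, not from a real run), for a 6-token prompt
            # scheduled as chunks of 4 then 2:
            #
            #   step  token_chunk_size  computed  uncomputed  outputs surfaced?
            #   1     4                 4         2           no
            #   2     2                 6         0           yes
            #   3+    1 (decode)        +1/step   0           yes
            #
            # i.e. process_outputs should run exactly when no uncomputed
            # prefill tokens remain, so setting the chunk size to the number
            # of lookahead slots (as hypothesized above) would break this
            # check.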
+ + print(f'after {seq_group.get_num_uncomputed_tokens()=}') + + # If uncomputed tokens > 0, it means prefill is chunked and prefill is not complete. # We don't need to process outputs in that case. if seq_group.get_num_uncomputed_tokens() == 0: self.output_processor.process_outputs(seq_group, outputs) + # s/token_chunk_size/prefill_chunk_size/g + # s/num_uncomputed_tokens/num_uncomputed_prefill_tokens/g + # Free the finished sequence groups. self.scheduler.free_finished_seq_groups() diff --git a/vllm/sequence.py b/vllm/sequence.py index 1cabc9e9f482..c9f54c48541e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -144,11 +144,14 @@ def get_num_computed_tokens(self) -> int: def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" + print(f'seq_data.update_num_computed_tokens({num_new_computed_tokens=})') self._num_computed_tokens += num_new_computed_tokens assert self._num_computed_tokens <= self.get_len(), ( self._num_computed_tokens, self.get_len()) + # If all tokens are computed, it means it is in decoding phase. if self.get_num_uncomputed_tokens() == 0: + # define a property _stage; return DECODE if num_uncomputed == 0; else PREFILL. self._stage = SequenceStage.DECODE def reset_state_for_recompute(self) -> None: @@ -494,6 +497,8 @@ def get_finished_seqs(self) -> List[Sequence]: def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" for seq in self.seqs_dict.values(): + # TODO does this not handle swapped seqs? --> we swap out/in sequences + # by group, so it's fine.. if not seq.is_finished(): seq.data.update_num_computed_tokens(num_new_computed_tokens) From b7887bc8d4c3efc5a515befecaa32859ba1b1f48 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 01:39:25 -0700 Subject: [PATCH 124/165] block manager v2 + chunked prefill test --- tests/core/block/e2e/test_correctness.py | 65 +++++++++++++++++++++++ tests/spec_decode/e2e/test_correctness.py | 17 ++++-- vllm/core/block/block_table.py | 1 - 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 94b65401e1dd..a403d442d7af 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -229,6 +229,71 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.parametrize("common_llm_kwargs", [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, +]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [ + { + "use_v2_block_manager": False, + }, +]) +@pytest.mark.parametrize( + "test_llm_kwargs", [ + { + "use_v2_block_manager": True, + "num_lookahead_slots": 0, + }, + { + "use_v2_block_manager": True, + "num_lookahead_slots": 5, + }, +]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): + output_len = 32 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with BlockManagerV1') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with BlockManagerV2') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids + def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index e5f0e70e5b3b..f34874f6f1b5 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -31,13 +31,20 @@ # "num_speculative_tokens": 5, #}, { - #"enable_chunked_prefill": True, - #"max_num_batched_tokens": 2, - #"max_num_seqs": 2, - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + #"speculative_model": "JackFram/llama-68m", + #"num_speculative_tokens": 1, }, #{ + # #"enable_chunked_prefill": True, + # #"max_num_batched_tokens": 2, + # #"max_num_seqs": 2, + # "speculative_model": "JackFram/llama-68m", + # "num_speculative_tokens": 1, + #}, + #{ # # No spec decode. #}, ]) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ba061bbc4fbc..560267e55ea3 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -104,7 +104,6 @@ def append_token_ids(self, token_ids (List[int]): The sequence of token IDs to be appended. 
""" assert self._is_allocated - assert token_ids, "can't append empty token ids" self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + num_lookahead_slots) From 84de321563e6108448fe129c4e682ecbcd66f934 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 02:23:34 -0700 Subject: [PATCH 125/165] check stages instead of num uncomputed --- tests/core/block/e2e/test_correctness.py | 2 +- vllm/core/scheduler.py | 6 +++--- vllm/engine/llm_engine.py | 16 +++++++++++++++- vllm/sequence.py | 7 ++++++- vllm/worker/model_runner.py | 2 +- 5 files changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index a403d442d7af..a68d52fb0a91 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -259,7 +259,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, "num_lookahead_slots": 5, }, ]) -@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("seed", [1]) def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): output_len = 32 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 75272e224bec..713171856a1b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -97,7 +97,7 @@ class ScheduledSequenceGroup: # The total chunk size (number of tokens) to process for next iteration. # 1 for decoding. Same as prompt tokens for prefill, but if prefill is # chunked, it can be smaller than that. - token_chunk_size: int + token_chunk_size: int # TODO docs """ @@ -458,7 +458,7 @@ def _schedule_running( else: decode_seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=1)) + token_chunk_size=0)) #token_chunk_size=self._get_num_lookahead_slots(is_prefill=False))) budget.add_num_batched_tokens(seq_group.request_id, num_running_tokens) @@ -562,7 +562,7 @@ def _schedule_swapped( else: assert num_new_tokens == 1 decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=1)) + ScheduledSequenceGroup(seq_group, token_chunk_size=0)) budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) budget.add_num_seqs(seq_group.request_id, num_new_seqs) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6152687c0fc2..c287d6617a7b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -463,7 +463,15 @@ def _process_model_outputs( # # So, the problem is likely that the scheduler updates token_chunk_size # to num_lookahead_slots. Let's confirm. + + from vllm.sequence import SequenceStage + + stages = [seq.data._stage for seq in seq_group.get_unfinished_seqs()] + equal_decode = [stage == SequenceStage.DECODE for stage in stages] + print(f'before {stages=}') + + #breakpoint() # These are only prefill computed tokens. seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) @@ -472,9 +480,15 @@ def _process_model_outputs( print(f'after {seq_group.get_num_uncomputed_tokens()=}') + stages = [seq.data._stage for seq in seq_group.get_unfinished_seqs()] + equal_decode = [stage == SequenceStage.DECODE for stage in stages] + print(f'after {stages=}') + # If uncomputed tokens > 0, it means prefill is chunked and prefill is not complete. # We don't need to process outputs in that case. 
- if seq_group.get_num_uncomputed_tokens() == 0: + #if seq_group.get_num_uncomputed_tokens() == 0: + + if all(equal_decode): self.output_processor.process_outputs(seq_group, outputs) # s/token_chunk_size/prefill_chunk_size/g diff --git a/vllm/sequence.py b/vllm/sequence.py index c9f54c48541e..03b47bcf8be4 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -591,7 +591,12 @@ def __init__( if is_prompt: self._token_chunk_size = list(seq_data.values())[0].get_len() else: - self._token_chunk_size = 1 + self._token_chunk_size = 0 + + if is_prompt: + assert self._token_chunk_size >= 1 + else: + assert self._token_chunk_size == 0 @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 47ad8f0c9b78..c8191d7c00b4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -429,7 +429,7 @@ def _prepare_decode( for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 + assert seq_group_metadata.token_chunk_size == 0 seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id From a9ad5ed38984ecbff4be2d35f52f28b0403a5055 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 02:27:18 -0700 Subject: [PATCH 126/165] spec decode test passes? --- tests/spec_decode/e2e/test_correctness.py | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index f34874f6f1b5..415619bb24cf 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -30,20 +30,20 @@ # "speculative_model": "JackFram/llama-68m", # "num_speculative_tokens": 5, #}, - { - "enable_chunked_prefill": True, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - #"speculative_model": "JackFram/llama-68m", - #"num_speculative_tokens": 1, - }, #{ - # #"enable_chunked_prefill": True, - # #"max_num_batched_tokens": 2, - # #"max_num_seqs": 2, - # "speculative_model": "JackFram/llama-68m", - # "num_speculative_tokens": 1, + # "enable_chunked_prefill": True, + # "max_num_batched_tokens": 2, + # "max_num_seqs": 2, + # #"speculative_model": "JackFram/llama-68m", + # #"num_speculative_tokens": 1, #}, + { + #"enable_chunked_prefill": True, + #"max_num_batched_tokens": 2, + #"max_num_seqs": 2, + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 1, + }, #{ # # No spec decode. #}, From 2a19f5e58f36efb090434adb57e55a411144669b Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 01:39:25 -0700 Subject: [PATCH 127/165] allow append empty tokens in block table --- tests/core/block/e2e/test_correctness.py | 65 ++++++++++++++++++++++++ vllm/core/block/block_table.py | 1 - 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 94b65401e1dd..a403d442d7af 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -229,6 +229,71 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.parametrize("common_llm_kwargs", [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, +]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [ + { + "use_v2_block_manager": False, + }, +]) +@pytest.mark.parametrize( + "test_llm_kwargs", [ + { + "use_v2_block_manager": True, + "num_lookahead_slots": 0, + }, + { + "use_v2_block_manager": True, + "num_lookahead_slots": 5, + }, +]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): + output_len = 32 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + print('Getting token ids with BlockManagerV1') + baseline_token_ids = get_token_ids_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) + + print('Getting token ids with BlockManagerV2') + test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, + prompts, sampling_params) + + for expected_token_ids, actual_token_ids in zip(baseline_token_ids, + test_token_ids): + assert expected_token_ids == actual_token_ids + + assert baseline_token_ids == test_token_ids + def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): for llm in llm_generator: diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ba061bbc4fbc..560267e55ea3 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -104,7 +104,6 @@ def append_token_ids(self, token_ids (List[int]): The sequence of token IDs to be appended. """ assert self._is_allocated - assert token_ids, "can't append empty token ids" self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + num_lookahead_slots) From b6e9e826604123654224a5d598fd140c1cfedde5 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 02:58:43 -0700 Subject: [PATCH 128/165] rebase on stop string fixes --- vllm/engine/output_processor/multi_step.py | 15 +++-- vllm/engine/output_processor/stop_checker.py | 63 +------------------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 6b01a94f59e4..bae903acda66 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -101,17 +101,24 @@ def _process_seq_outputs(self, seq: Sequence, valid_samples = valid_samples[:i + 1] break + # Incrementally append tokens to the sequence, as if we had only one new + # token. for output_token_id in output_token_ids: seq.append_token_id( token_id=output_token_id, # TODO emit logprobs in multi-step decoding. 
logprobs={output_token_id: Logprob(0.0)}, ) - self.detokenizer.decode_sequence_inplace(seq, sampling_params) - self.stop_checker.maybe_stop_sequence(seq, - sampling_params, - new_token_ids=output_token_ids) + new_char_count = 0 + if sampling_params.detokenize: + new_char_count = self.detokenizer.decode_sequence_inplace(seq, sampling_params) + + self.stop_checker.maybe_stop_sequence(seq, + new_char_count=new_char_count, + sampling_params=sampling_params) + if seq.is_finished(): + break if seq.is_finished(): self.scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index f259b818748e..93e2fe6ac17c 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, List +from typing import Callable, List, Optional from transformers import PreTrainedTokenizer @@ -61,7 +61,7 @@ def maybe_stop_sequence(self, seq: Sequence, return # Check if the sequence has reached max_model_len. - if seq.get_len() > self.scheduler_config.max_model_len: + if seq.get_len() > self.max_model_len: seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED return @@ -101,62 +101,3 @@ def _check_stop_strings(seq: Sequence, new_char_count: int, seq.output_text = seq.output_text[:stop_index] return stop_str return None - # TODO spec decode - ## - # """Check if the sequences should be stopped. If so, mark it as finished. - # """ - - # # Check if the sequence has reached max_model_len. - # if seq.get_len() > self.max_model_len: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the sequence has reached max_tokens. - # if seq.get_output_len() == sampling_params.max_tokens: - # seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - # return - - # # Check if the minimum number of tokens has been generated yet; - # # skip the stop string/token checks if not - # if seq.get_output_len() < sampling_params.min_tokens: - # return - - # if sampling_params.detokenize: - # for stop_str in sampling_params.stop: - # # TODO(cade) Fix this for speculative decoding. - # if seq.output_text.endswith(stop_str): - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_str - # return - - # # Determine if any stop_token_ids are in new_token_ids. - # intersection = set(new_token_ids).intersection( - # sampling_params.stop_token_ids) - # if intersection: - # # Get arbitrary token id that caused the stop. - # stop_token_id = next(iter(intersection)) - - # stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - # stop_token_id) - # self._finalize_sequence(seq, sampling_params, stop_str) - # seq.status = SequenceStatus.FINISHED_STOPPED - # seq.stop_reason = stop_token_id - # return - - # # Check if the sequence has generated the EOS token. - # if ((not sampling_params.ignore_eos) - # and seq.eos_token_id in new_token_ids): - # seq.status = SequenceStatus.FINISHED_STOPPED - # return - - #def _finalize_sequence(self, seq: Sequence, - # sampling_params: SamplingParams, - # stop_string: str) -> None: - # if sampling_params.include_stop_str_in_output: - # return - - # if stop_string and seq.output_text.endswith(stop_string): - # # Truncate the output text so that the stop string is - # # not included in the output. 
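# As the multi-step output processor above shows, speculative decoding can add
# several tokens to a sequence in one step, so stop conditions are re-checked
# after each appended token and the remaining speculative tokens are dropped
# once the sequence finishes. A simplified sketch; eos_token_id and
# max_output_len stand in for the full set of stop conditions.
def append_with_stop_check(output_token_ids: list, new_token_ids: list,
                           eos_token_id: int, max_output_len: int) -> bool:
    """Append new_token_ids one at a time; return True once the sequence
    should stop (tokens after the stopping one are discarded)."""
    for token_id in new_token_ids:
        output_token_ids.append(token_id)
        if token_id == eos_token_id:
            return True
        if len(output_token_ids) >= max_output_len:
            return True
    return False


out = []
finished = append_with_stop_check(out, [5, 7, 2, 9], eos_token_id=2,
                                  max_output_len=16)
assert finished and out == [5, 7, 2]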
- # seq.output_text = seq.output_text[:-len(stop_string)] From bf0c37cbbd2f0f034edbd77a6292d9ba3509bf19 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:00:13 -0700 Subject: [PATCH 129/165] test spec --- vllm/executor/gpu_executor.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 4fd9735669fd..9268b646a18a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -13,13 +13,6 @@ class GPUExecutor(ExecutorBase): def _init_executor(self) -> None: - assert (not self.speculative_config - ), "Speculative decoding not yet supported for GPU backend" - - # Instantiate the worker and load the model to GPU. - self._init_worker() - - def _init_worker(self): if self.speculative_config is None: self._init_non_spec_worker() else: From a158256acb08f0c954feaf953590b0668d6f8904 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:07:16 -0700 Subject: [PATCH 130/165] lint & mypy --- tests/core/block/e2e/test_correctness.py | 34 +++++++++++--------- vllm/engine/output_processor/multi_step.py | 10 +++--- vllm/engine/output_processor/single_step.py | 3 +- vllm/engine/output_processor/stop_checker.py | 6 ++-- vllm/executor/gpu_executor.py | 2 ++ vllm/executor/neuron_executor.py | 5 ++- 6 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index a403d442d7af..1015892b67a4 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -229,27 +229,28 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.parametrize("common_llm_kwargs", [ - { - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - }, -]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize( + "common_llm_kwargs", + [ + { + # Use a small model for a fast test. + "model": "facebook/opt-125m", + + # skip cuda graph creation for fast test. 
+ "enforce_eager": True, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 2, + "max_num_seqs": 2, + }, + ]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [ { "use_v2_block_manager": False, }, ]) -@pytest.mark.parametrize( - "test_llm_kwargs", [ +@pytest.mark.parametrize("test_llm_kwargs", [ { "use_v2_block_manager": True, "num_lookahead_slots": 0, @@ -261,7 +262,8 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): +def test_chunked_prefill_block_manager_v2(baseline_llm_generator, + test_llm_generator, batch_size): output_len = 32 temperature = 0.0 diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index bae903acda66..50da0d35fcec 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -112,11 +112,13 @@ def _process_seq_outputs(self, seq: Sequence, new_char_count = 0 if sampling_params.detokenize: - new_char_count = self.detokenizer.decode_sequence_inplace(seq, sampling_params) + new_char_count = self.detokenizer.decode_sequence_inplace( + seq, sampling_params) - self.stop_checker.maybe_stop_sequence(seq, - new_char_count=new_char_count, - sampling_params=sampling_params) + self.stop_checker.maybe_stop_sequence( + seq, + new_char_count=new_char_count, + sampling_params=sampling_params) if seq.is_finished(): break diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 3ded72db3092..1b7eb014f802 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -110,7 +110,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, seq, seq_group.sampling_params) else: new_char_count = 0 - self.stop_checker.maybe_stop_sequence(seq, new_char_count, seq_group.sampling_params) + self.stop_checker.maybe_stop_sequence(seq, new_char_count, + seq_group.sampling_params) # Non-beam search case if not seq_group.sampling_params.use_beam_search: diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 93e2fe6ac17c..66deb9b59174 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import Callable, Optional from transformers import PreTrainedTokenizer @@ -19,10 +19,8 @@ def __init__(self, max_model_len: int, self.max_model_len = max_model_len self.get_tokenizer_for_seq = get_tokenizer_for_seq - def maybe_stop_sequence(self, seq: Sequence, - new_char_count: int, + def maybe_stop_sequence(self, seq: Sequence, new_char_count: int, sampling_params: SamplingParams) -> None: - """Stop the finished sequences. new_char_count is the number of chars added to the diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 9268b646a18a..b7ab9481eb9f 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -48,6 +48,8 @@ def _init_non_spec_worker(self): def _init_spec_worker(self): """Initialize a SpecDecodeWorker, using a draft model for proposals. 
""" + assert self.speculative_config is not None + from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.worker.worker import Worker diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 273b17a927ef..7cc187e297c9 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -48,10 +48,13 @@ def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + blocks_to_copy: Dict[int, List[int]], + num_lookahead_slots: int) -> List[SamplerOutput]: assert (blocks_to_swap_in == {} and blocks_to_swap_out == {} and blocks_to_copy == {}), ( "Cache operations are not supported for Neuron backend.") + assert num_lookahead_slots == 0, ( + "lookahead not supported for Neuron backend.") output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list) From 5a69f6c25ad51515fcc9d1e5ecc9d43fea3af89c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 03:15:31 -0700 Subject: [PATCH 131/165] doc --- tests/core/block/e2e/test_correctness.py | 3 +++ vllm/executor/gpu_executor.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 1015892b67a4..0ee78a9b0a8e 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -264,6 +264,9 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, @pytest.mark.parametrize("seed", [1]) def test_chunked_prefill_block_manager_v2(baseline_llm_generator, test_llm_generator, batch_size): + """Verify that chunked prefill works with BlockManagerV2, with and without + lookahead scheduling. + """ output_len = 32 temperature = 0.0 diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index b7ab9481eb9f..962cac585bb2 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -13,6 +13,11 @@ class GPUExecutor(ExecutorBase): def _init_executor(self) -> None: + """Initialize the worker and load the model. + + If speculative decoding is enabled, we instead create the speculative + worker. 
+ """ if self.speculative_config is None: self._init_non_spec_worker() else: From 16c3ec424f0350e896aacd3c460cc7c4c167d276 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 12:14:09 -0700 Subject: [PATCH 132/165] remove --- vllm/config.py | 4 +-- vllm/core/scheduler.py | 49 +++---------------------------------- vllm/engine/llm_engine.py | 48 ++---------------------------------- vllm/sequence.py | 11 +-------- vllm/worker/model_runner.py | 2 +- 5 files changed, 10 insertions(+), 104 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 728339a5faee..9bb9190d2b14 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -714,8 +714,8 @@ def maybe_create_spec_config( ) # TODO docs - #draft_model_config.max_model_len = min(target_model_config.max_model_len, - # draft_model_config.max_model_len) + #draft_model_config.max_model_len = min( + # target_model_config.max_model_len, draft_model_config.max_model_len) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 70a5d8e0d699..bc55a3899035 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -97,28 +97,7 @@ class ScheduledSequenceGroup: # The total chunk size (number of tokens) to process for next iteration. # 1 for decoding. Same as prompt tokens for prefill, but if prefill is # chunked, it can be smaller than that. - token_chunk_size: int # TODO docs - - - """ - What should happen here? - - each sequence in a sequence group can produce one output token, or multiple. - - the "role" of the output token could be prefill, or decode. - - even with one output token, it could be prefill or decode. - - with >1 output token, it could be prefill or decode. - - technically, it could even be a mix of both prefill and decode -- first N are ignored, latter M are sent to user. - - so we need to track how many of the tokens have been computed - we need to track how many new tokens have been computed - we need to track num decode tokens - - why can't token_chunk_size just be num_prefill_tokens? then any output after - token_chunk_size is a decode token - - - s/token_chunk_size/prefill_chunk_size/g - - make optional (None during decode) - - how to handle the last chunk, where a token is emitted? 
- """ + token_chunk_size: int @dataclass @@ -459,8 +438,7 @@ def _schedule_running( else: decode_seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=0)) - #token_chunk_size=self._get_num_lookahead_slots(is_prefill=False))) + token_chunk_size=1)) budget.add_num_batched_tokens(seq_group.request_id, num_running_tokens) budget.add_num_seqs(seq_group.request_id, num_running_seqs) @@ -565,7 +543,7 @@ def _schedule_swapped( else: assert num_new_tokens == 1 decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=0)) + ScheduledSequenceGroup(seq_group, token_chunk_size=1)) budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) budget.add_num_seqs(seq_group.request_id, num_new_seqs) @@ -681,7 +659,7 @@ def _schedule_prefills( if curr_loras is not None and lora_int_id > 0: curr_loras.add(lora_int_id) waiting_queue.popleft() - self._allocate_and_set_running(seq_group, num_new_tokens) # num_new_tokens not required here + self._allocate_and_set_running(seq_group, num_new_tokens) seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -1123,30 +1101,11 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup, seqs = seq_group.get_seqs(status=status) for seq in seqs: num_new_tokens += seq.get_num_new_tokens() - # + self._get_num_lookahead_slots(is_prefill=seq_group.is_prefill()) # Chunk if a running request cannot fit in. # If number of seq > 1, it means it is doing beam search in a # decode phase. Do not chunk in that case. if enable_chunking and len(seqs) == 1: num_new_tokens = min(num_new_tokens, budget.remaining_token_budget()) - return num_new_tokens - -""" -1 Generalize get_num_new_tokens to distinguish between prefill, decode. - essentially, the number of query tokens, _not_ the number of output tokens. - - token_chunk_size --> rename to prefill only (None for decode) - token budget distinguishes between prefill and decode (_computed_ tokens, not "new" output tokens) - - currently we assume 1:1 map between new tokens and computed tokens; we need to break this 1:1 mapping - -2 Set num_new_tokens to num_lookahead_tokens in spec decode - will break when budget is exhausted - breaks for some other reason? - -3 Fork chunked prefill vs spec decode budget calculations - -""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4bacc4c39a97..f2a6660ca388 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -448,62 +448,18 @@ def _process_model_outputs( # [step][sequence group]. output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) - - print(f'_process_model_outputs') # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): seq_group = scheduled_seq_group.seq_group - - print(f'{scheduled_seq_group.token_chunk_size=}') - print(f'before {seq_group.get_num_uncomputed_tokens()=}') - - output_token_ids = [sgo.samples[0].output_token for sgo in outputs] - print(f'{output_token_ids=}') - - - # Num computed tokens tracks how many of the prefill tokens have - # been computed. - # - # The token chunk size is always 1 in normal decoding. So - # we have 1 uncomputed token, then zero after this update. - # - # Then, process_outputs runs when it's zero. - # - # So, the problem is likely that the scheduler updates token_chunk_size - # to num_lookahead_slots. Let's confirm. 
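# num_lookahead_slots shows up both in the scheduler notes removed here and in
# BlockTable.append_token_ids earlier: KV slots are reserved ahead of time for
# tokens a speculative step will write, which is also why the "can't append
# empty token ids" assert was dropped. A rough sketch of that reservation,
# not the real BlockTable.
def extra_slots_to_allocate(num_free_slots: int, new_token_ids: list,
                            num_lookahead_slots: int) -> int:
    """How many more KV slots must be allocated before this step."""
    needed = len(new_token_ids) + num_lookahead_slots
    return max(0, needed - num_free_slots)


# A pure-decode step with 5 speculative tokens and no tokens appended yet.
assert extra_slots_to_allocate(3, [], num_lookahead_slots=5) == 2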
- - from vllm.sequence import SequenceStage - - stages = [seq.data._stage for seq in seq_group.get_unfinished_seqs()] - equal_decode = [stage == SequenceStage.DECODE for stage in stages] - - print(f'before {stages=}') - - #breakpoint() - # These are only prefill computed tokens. seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - - # token chunk size -- is this one? - - print(f'after {seq_group.get_num_uncomputed_tokens()=}') - - stages = [seq.data._stage for seq in seq_group.get_unfinished_seqs()] - equal_decode = [stage == SequenceStage.DECODE for stage in stages] - print(f'after {stages=}') - - # If uncomputed tokens > 0, it means prefill is chunked and prefill is not complete. + # If uncomputed tokens > 0, it means prefill is chunked. # We don't need to process outputs in that case. - #if seq_group.get_num_uncomputed_tokens() == 0: - - if all(equal_decode): + if seq_group.get_num_uncomputed_tokens() == 0: self.output_processor.process_outputs(seq_group, outputs) - # s/token_chunk_size/prefill_chunk_size/g - # s/num_uncomputed_tokens/num_uncomputed_prefill_tokens/g - # Free the finished sequence groups. self.scheduler.free_finished_seq_groups() diff --git a/vllm/sequence.py b/vllm/sequence.py index f6936dcc1f7e..9202e0bb69bf 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -144,14 +144,12 @@ def get_num_computed_tokens(self) -> int: def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" - print(f'seq_data.update_num_computed_tokens({num_new_computed_tokens=})') self._num_computed_tokens += num_new_computed_tokens assert self._num_computed_tokens <= self.get_len(), ( self._num_computed_tokens, self.get_len()) # If all tokens are computed, it means it is in decoding phase. if self.get_num_uncomputed_tokens() == 0: - # define a property _stage; return DECODE if num_uncomputed == 0; else PREFILL. self._stage = SequenceStage.DECODE def reset_state_for_recompute(self) -> None: @@ -503,8 +501,6 @@ def get_finished_seqs(self) -> List[Sequence]: def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" for seq in self.seqs_dict.values(): - # TODO does this not handle swapped seqs? --> we swap out/in sequences - # by group, so it's fine.. 
if not seq.is_finished(): seq.data.update_num_computed_tokens(num_new_computed_tokens) @@ -597,12 +593,7 @@ def __init__( if is_prompt: self._token_chunk_size = list(seq_data.values())[0].get_len() else: - self._token_chunk_size = 0 - - if is_prompt: - assert self._token_chunk_size >= 1 - else: - assert self._token_chunk_size == 0 + self._token_chunk_size = 1 @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 14cfb9fc5bd7..7dbe14ead097 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -434,7 +434,7 @@ def _prepare_decode( for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 0 + assert seq_group_metadata.token_chunk_size == 1 seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id From ce07d3dbb47dee992c0c07d5b8139aca223e629b Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 14:51:48 -0700 Subject: [PATCH 133/165] lint --- vllm/executor/gpu_executor.py | 2 +- vllm/spec_decode/batch_expansion.py | 4 ++-- vllm/spec_decode/multi_step_worker.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 8d8a1b0a2a51..fc4ce7aa228a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -83,7 +83,7 @@ def _init_spec_worker(self): scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, - load_config=self.load_config, # TODO get from spec + load_config=self.load_config, # TODO get from spec local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 353a8a584ebc..e01cf224a5ae 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -6,8 +6,8 @@ from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (get_all_seq_ids, maybe_mock_device_tensors, - nvtx_range, sampler_output_to_torch, +from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range, + sampler_output_to_torch, split_batch_by_proposal_len) from vllm.worker.worker_base import WorkerBase diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 44605fc0ec58..0ca970c8f5ff 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -6,8 +6,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) -from vllm.spec_decode.util import (maybe_mock_device_tensors, - sampler_output_to_torch) +from vllm.spec_decode.util import sampler_output_to_torch from vllm.worker.worker import Worker From f7938d20a0d0322bf26d23c05cc1b00631c036cd Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:04:48 -0700 Subject: [PATCH 134/165] trimming tests --- tests/spec_decode/e2e/test_correctness.py | 51 ++--------------------- 1 file changed, 3 insertions(+), 48 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 833f14725401..aa8c52cbf982 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ 
-6,15 +6,6 @@ from vllm import SamplingParams -# TODO test preemption -# TODO test integration (cuda graph, tp) -# TODO test smoke (sampling params) -# TODO investigate token with 68m/68m k=5 temp=1.0 -# TODO test for when sequences skip speculation -# TODO test different block sizes -# TODO validate acceptance rate - - @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -22,9 +13,6 @@ # Note this is repeated in the test body; to initialize a tokenizer. "model": "JackFram/llama-68m", - ## Skip real loading for fast test. - # "load_format": "dummy", - # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -34,17 +22,6 @@ @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - #{ - # "speculative_model": "JackFram/llama-68m", - # "num_speculative_tokens": 5, - #}, - #{ - # "enable_chunked_prefill": True, - # "max_num_batched_tokens": 2, - # "max_num_seqs": 2, - # #"speculative_model": "JackFram/llama-68m", - # #"num_speculative_tokens": 1, - #}, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, @@ -115,9 +92,9 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): { "model": "JackFram/llama-68m", }, - #{ - # "model": "JackFram/llama-160m", - #}, + { + "model": "JackFram/llama-160m", + }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( @@ -128,10 +105,6 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - #{ - # "speculative_model": "JackFram/llama-68m", - # "num_speculative_tokens": 1, - #} ]) @pytest.mark.parametrize( "output_len", @@ -181,10 +154,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } ]) @pytest.mark.parametrize( "output_len", @@ -229,15 +198,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( @pytest.mark.parametrize( "test_llm_kwargs", [ - # Try two different num spec tokens. { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } ]) @pytest.mark.parametrize("max_output_len", [ 256, @@ -271,15 +235,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( @pytest.mark.parametrize( "test_llm_kwargs", [ - # Try two different num spec tokens. 
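# The e2e tests above all follow one recipe: run identical prompts greedily
# through a baseline config and a test config (e.g. spec decode enabled) and
# require token-for-token equality. Condensed sketch; get_token_ids stands in
# for the get_token_ids_from_llm_generator helper used by these tests.
def check_greedy_equality(baseline_llm, test_llm, prompts, max_output_len,
                          get_token_ids):
    # temperature=0.0 keeps sampling deterministic, and ignore_eos forces both
    # configs to emit exactly max_output_len tokens per prompt.
    params = dict(max_tokens=max_output_len, ignore_eos=True, temperature=0.0)
    baseline_ids = get_token_ids(baseline_llm, prompts, params)
    test_ids = get_token_ids(test_llm, prompts, params)
    for expected, actual in zip(baseline_ids, test_ids):
        assert expected == actual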
{ "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } ]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize( @@ -321,10 +280,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 1, - } ]) @pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize( From aa4b56233e45d5f0af9fa744fdbb6fee3e8abe37 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:09:49 -0700 Subject: [PATCH 135/165] move cpu/amd tests to after wait --- .buildkite/test-template.j2 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 3ed23c62c005..0e1acc9777d4 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -3,13 +3,6 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: - - label: "AMD Test" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh - - - label: "CPU Test" - command: bash .buildkite/run-cpu-test.sh - label: ":docker: build image" commands: @@ -23,6 +16,14 @@ steps: limit: 5 - wait + - label: "AMD Test" + agents: + queue: amd + command: bash .buildkite/run-amd-test.sh + + - label: "CPU Test" + command: bash .buildkite/run-cpu-test.sh + {% for step in steps %} - label: "{{ step.label }}" agents: From e831854266c8dd607948f393864b71354b53ee9c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:23:21 -0700 Subject: [PATCH 136/165] spec decode + preemption test --- tests/spec_decode/e2e/test_correctness.py | 51 ++++++++++------------- vllm/config.py | 5 ++- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index aa8c52cbf982..3066061004b0 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -6,6 +6,7 @@ from vllm import SamplingParams + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -195,14 +196,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize("max_output_len", [ 256, ]) @@ -232,14 +231,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize( "output_len", @@ -302,10 +299,10 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( @pytest.mark.parametrize( "common_llm_kwargs", [{ - #"block_size": 8, + "block_size": 8, # 2 for small prompt, 256//8 for generated. 
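# The preemption test above sizes the KV cache so that one sequence fits but a
# batch does not, forcing preemption. Reproducing the arithmetic from the
# config comments ("2 for small prompt, 256//8 for generated"):
block_size = 8
prompt_blocks = 2                      # enough for the short test prompts
output_len = 256
gen_blocks = output_len // block_size  # 32 blocks for generated tokens

num_gpu_blocks_override = prompt_blocks + gen_blocks   # 34
max_model_len = num_gpu_blocks_override * block_size   # 272

batch_size = 4
blocks_needed = batch_size * (prompt_blocks + gen_blocks)  # 136
assert blocks_needed > num_gpu_blocks_override  # so sequences get preempted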
- #"num_gpu_blocks_override": 2 + 256//8, - #"max_model_len": (2 + 256//8)*8, + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, # Skip cuda graph recording for fast test. "enforce_eager": True, @@ -313,14 +310,11 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - #"model": "JackFram/llama-160m", - "model": "meta-llama/Llama-2-7b-chat-hf", - }, - ]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "model": "JackFram/llama-160m", + }, +]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { @@ -339,9 +333,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( def test_spec_decode_e2e_greedy_correctness_with_preemption( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - """ - NOTE(cade): this test fails, unclear why - """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, diff --git a/vllm/config.py b/vllm/config.py index 9404652611de..5e2fa3a8a2dc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -709,8 +709,9 @@ def maybe_create_spec_config( ) # TODO docs - #draft_model_config.max_model_len = min( - # target_model_config.max_model_len, draft_model_config.max_model_len) + draft_model_config.max_model_len = min( + target_model_config.max_model_len, + draft_model_config.max_model_len) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( From f23ea7fc169ef142d19c871cf14063926f28ecaa Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:33:56 -0700 Subject: [PATCH 137/165] try different block sizes --- tests/spec_decode/e2e/test_correctness.py | 46 ++++++++++++++++++++++- vllm/engine/arg_utils.py | 2 +- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 3066061004b0..fb921b4f533f 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -272,7 +272,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( @pytest.mark.parametrize( "test_llm_kwargs", [ - # Try two different num spec tokens. { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, @@ -339,6 +338,51 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( max_output_len=output_len, force_output_len=True) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-160m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "block_size": 8, + }, + { + "block_size": 32, + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. 
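# block_size is the number of tokens held by one KV-cache block, so the same
# sequence maps to a different number of physical blocks in each of the
# parametrizations above (seq_len of 100 is just an example):
import math

seq_len = 100
for block_size in (8, 16, 32):
    print(block_size, math.ceil(seq_len / block_size))  # 13, 7 and 4 blocks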
+ 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_many_block_size( + baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + def run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c61c0cc67d7a..63ca8622ebf3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -225,7 +225,7 @@ def add_cli_args( parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32, 128], + choices=[8, 16, 32], help='token block size') parser.add_argument('--enable-prefix-caching', From d66ce83935793f64ca046e88dd7c96100f9e814e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:35:34 -0700 Subject: [PATCH 138/165] fix --- tests/spec_decode/e2e/test_correctness.py | 50 ++++++++++++----------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fb921b4f533f..d71cc4efb039 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -269,14 +269,12 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize( "output_len", @@ -338,6 +336,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( max_output_len=output_len, force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -349,23 +348,28 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( # Required for spec decode. "use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "block_size": 8, - }, - { - "block_size": 32, - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize( - "test_llm_kwargs", + "per_test_common_llm_kwargs", [ + # As of this writing, vLLM only compiles with these 3 block sizes by + # default. 
{ - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, + "block_size": 8, + }, + { + "block_size": 16, + }, + { + "block_size": 32, }, ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize( "output_len", @@ -374,9 +378,9 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_many_block_size( - baseline_llm_generator, test_llm_generator, batch_size: int, - output_len: int): +def test_spec_decode_different_block_size(baseline_llm_generator, + test_llm_generator, batch_size: int, + output_len: int): run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, From ec0adf8e89b525c0367e56c3162d60d3c60914d9 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Tue, 16 Apr 2024 15:48:48 -0700 Subject: [PATCH 139/165] skip speculation test does not crash --- tests/spec_decode/e2e/test_correctness.py | 43 +++++++++++++++ vllm/config.py | 2 + .../layers/rejection_sampler.py | 24 +++++---- vllm/model_executor/models/llama.py | 3 ++ vllm/spec_decode/batch_expansion.py | 52 ++++++++++++++----- vllm/spec_decode/interfaces.py | 4 +- vllm/spec_decode/multi_step_worker.py | 24 +++++++-- vllm/spec_decode/spec_decode_worker.py | 30 ++++++----- vllm/worker/model_runner.py | 17 ++++++ 9 files changed, 160 insertions(+), 39 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d71cc4efb039..faac5b8f55b1 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -387,6 +387,44 @@ def test_spec_decode_different_block_size(baseline_llm_generator, max_output_len=output_len, force_output_len=True) +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-160m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("output_len", + [ + # Use smaller output len for fast test. + 512, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_skip_speculation(baseline_llm_generator, + test_llm_generator, batch_size: int, + output_len: int): + """Verify correct output when we skip speculation. + Test skip 1, skip >1, skip all. 
+ """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + def run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -399,6 +437,11 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, "The president of the United States is", "The capital of France is", "The future of AI is", + + "Mark Zuckerberg loves to dance, and", + "Ray is a framework for", + "Chevelle is a heavy-metal band that", + "Park is a common surname from the country of", ] prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] diff --git a/vllm/config.py b/vllm/config.py index 5e2fa3a8a2dc..e17642d2691e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -713,6 +713,8 @@ def maybe_create_spec_config( target_model_config.max_model_len, draft_model_config.max_model_len) + draft_model_config.max_model_len = 32 + draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( target_parallel_config)) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 925934d2197f..812ad97ccd1f 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -335,16 +335,20 @@ def _raise_if_incorrect_shape( draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs - - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens + try: + assert draft_batch_size == target_batch_size + assert num_draft_probs == num_target_probs + assert (draft_vocab_size == target_vocab_size + ), f"{draft_vocab_size=} {target_vocab_size=}" + + assert draft_token_ids_batch_size == draft_batch_size + assert num_draft_token_ids == num_draft_probs + + assert bonus_batch_size == target_batch_size + assert num_bonus_tokens == self._num_bonus_tokens + except: + breakpoint() + raise def _raise_if_incorrect_dtype( self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 016e3b039d1e..1cf416f80b02 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -266,6 +266,9 @@ def __init__( self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + #print(f'get_input_embeddings {input_ids.shape=} {input_ids=}') + #if input_ids.shape[0] == 43: + # breakpoint() return self.embed_tokens(input_ids) def forward( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e01cf224a5ae..b876c16730a7 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -127,8 +127,24 @@ def _expand_batch( proposal_lens_list, select_proposal_len_zero=True) + # TODO clean up + filtered = [] + for p in proposal_token_ids_list: + if -1 in p: + assert all([x == -1 for x in p]) + continue + filtered.append(p) + + target_seq_group_metadata_list = self._create_scoring_model_input( - spec_seqs, proposal_token_ids_list) + seq_group_metadata_list=spec_seqs, + 
#proposal_token_ids=proposal_token_ids_list, + proposal_token_ids=filtered, + target_seq_ids_iter=self._create_target_seq_id_iterator( + seq_ids=get_all_seq_ids(seq_group_metadata_list) + ), + ) + num_scoring_tokens = len(target_seq_group_metadata_list) target_seq_group_metadata_list.extend(non_spec_seqs) @@ -161,12 +177,20 @@ def _contract_batch(self, original_bs: int, # Map distinct sequences used to score each token # of shape [batch_size * k + 1] back to [batch_size, k + 1]. - batch_size, k = proposals.proposal_token_ids.shape - - target_token_ids = target_token_ids.squeeze().reshape( - batch_size, k + 1) - target_probs = target_probs.squeeze().reshape(batch_size, k + 1, - self._vocab_size) + full_batch_size, k = proposals.proposal_token_ids.shape + non_spec_batch_size, _ = non_spec_target_token_ids.shape + speculated_batch_size = full_batch_size - non_spec_batch_size + # TODO clean up + + try: + target_token_ids = target_token_ids.squeeze().reshape( + speculated_batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(speculated_batch_size, k + 1, + self._vocab_size) + except Exception as e: + print(e) + breakpoint() + raise all_tokens = torch.full(size=(original_bs, k + 1), fill_value=-1, @@ -179,8 +203,12 @@ def _contract_batch(self, original_bs: int, dtype=torch.float32) if non_spec_indices: - all_tokens[non_spec_indices, 0] = non_spec_target_token_ids - all_probs[non_spec_indices, :1, :] = non_spec_target_probs + try: + all_tokens[non_spec_indices, :1] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + except: + breakpoint() + raise if spec_indices: all_tokens[spec_indices] = target_token_ids @@ -192,17 +220,17 @@ def _create_scoring_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + target_seq_ids_iter: Iterator[TargetSeqId], ) -> List[SequenceGroupMetadata]: """Given the original input sequences and proposed tokens from the draft model, create a list of target sequences that can be used for scoring. + + TODO docs on target_seq_ids_iter """ if not seq_group_metadata_list: return [] - target_seq_ids_iter = self._create_target_seq_id_iterator( - get_all_seq_ids(seq_group_metadata_list)) - target_seq_group_metadata = list( chain.from_iterable( self._create_target_seq_group_metadata( diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 2a72974d01bd..b78fcd960ea9 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -24,9 +24,9 @@ class SpeculativeProposals: def __repr__(self): return (f"SpeculativeProposals(" - f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_token_ids={self.proposal_token_ids}, " f"proposal_probs={self.proposal_probs.shape}, " - f"proposal_lens={self.proposal_lens.shape})") + f"proposal_lens={self.proposal_lens})") @dataclass diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 0ca970c8f5ff..4e421a0f2dc8 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -255,6 +255,11 @@ def get_proposals( nonzero_proposal_len_indices) = self._split_by_max_model_len( seq_group_metadata_list, max_proposal_len) + print(f'{proposal_lens=}') + if 0 in proposal_lens: + pass + #breakpoint() + if nonzero_proposal_len_seqs: # Speculate tokens using the draft worker for the speculative # sequences. 
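# A compact illustration of the contraction above: target-model tokens for the
# expanded batch are reshaped back to [num_spec_seqs, k + 1] and scattered into
# a [batch_size, k + 1] tensor, with -1 padding for positions holding no token
# (non-speculative rows only fill the first column). Toy values throughout.
import torch

batch_size, k = 4, 3
spec_indices, non_spec_indices = [0, 2], [1, 3]

spec_target_tokens = torch.arange(len(spec_indices) * (k + 1))
non_spec_target_tokens = torch.tensor([[100], [200]])  # one token per row

all_tokens = torch.full((batch_size, k + 1), -1, dtype=torch.long)
all_tokens[spec_indices] = spec_target_tokens.reshape(len(spec_indices), k + 1)
all_tokens[non_spec_indices, :1] = non_spec_target_tokens
print(all_tokens)
# tensor([[  0,   1,   2,   3],
#         [100,  -1,  -1,  -1],
#         [  4,   5,   6,   7],
#         [200,  -1,  -1,  -1]])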
@@ -269,6 +274,10 @@ def get_proposals( # If no sequences can be speculated, set sampler output to None. maybe_sampler_output = None + if 0 in proposal_lens: + pass + #breakpoint() + # Combine speculative- and non-speculative sequences into the same # representation. proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( @@ -328,11 +337,20 @@ def _merge_outputs( if maybe_sampler_output is None: # If no speculative tokens, the sampler output will be None. # In this case we return empty tensors. - proposal_tokens = torch.zeros(0, - max_proposal_len, + #proposal_tokens = torch.zeros(0, + # max_proposal_len, + # dtype=torch.long, + # device=self._device) + proposal_tokens = torch.full(size=(batch_size, max_proposal_len,), + fill_value=-1, dtype=torch.long, device=self._device) - proposal_probs = torch.zeros(0, + #proposal_probs = torch.zeros(0, + # max_proposal_len, + # self._vocab_size, + # dtype=torch.float32, + # device=self._device) + proposal_probs = torch.zeros(batch_size, max_proposal_len, self._vocab_size, dtype=torch.float32, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 5e11e44b53bd..0d1e8b167d58 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -156,7 +156,7 @@ def execute_model( "speculative decoding " "requires non-None seq_group_metadata_list") - #logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") + logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") # If no spec tokens, call the proposer and scorer workers normally. # Used for prefill. @@ -188,7 +188,7 @@ def _run_no_spec( proposer and scorer model so that the KV cache is consistent between the two. """ - #logger.info("run proposer worker no spec") + logger.info("run proposer worker no spec") self.proposer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, @@ -197,7 +197,7 @@ def _run_no_spec( blocks_to_copy=blocks_to_copy, ) - #logger.info("run target worker no spec") + logger.info("run target worker no spec") sampler_output = self.scorer_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, @@ -231,13 +231,14 @@ def _run_speculative_decoding_step( sequence. """ - #logger.info("get spec proposals") + logger.info("get spec proposals") # Generate proposals using draft worker. 
proposals = self.proposer_worker.get_spec_proposals( seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy, k) - #logger.info("score proposals") + logger.info("score proposals") + print(f'{proposals=}') proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, @@ -247,11 +248,11 @@ def _run_speculative_decoding_step( proposals, ) - #logger.info("verify proposals") + logger.info("verify proposals") accepted_token_ids = self._verify_tokens(seq_group_metadata_list, proposal_scores, proposals, k) - #logger.info("create output list") + logger.info("create output list") return self._create_output_sampler_list(seq_group_metadata_list, accepted_token_ids, k) @@ -282,15 +283,20 @@ def _verify_tokens( select_proposal_len_zero=True) original_indices = spec_indices + non_spec_indices - proposal_probs = proposal_scores.probs[spec_indices, :-1] + proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1] bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + proposal_probs = proposals.proposal_probs[spec_indices] + proposal_token_ids = proposals.proposal_token_ids[spec_indices] non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + #if -1 in proposals.proposal_token_ids: + # breakpoint() + accepted_token_ids = self.rejection_sampler( - proposal_probs, - bonus_token_ids, - proposals.proposal_probs, - proposals.proposal_token_ids, + target_probs=proposal_verifier_probs, + bonus_token_ids=bonus_token_ids, + draft_probs=proposal_probs, + draft_token_ids=proposal_token_ids, ) # Append output tokens from non-speculative sequences to diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 42c06a1b1936..3d9507072ddb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -812,6 +812,7 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -837,9 +838,19 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) + # When 4 speculations x 6 speculated tokens + 4 non speculations. + if len(seq_group_metadata_list) == 28: + print('after model execute') + breakpoint() + # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) + # When 4 speculations x 6 speculated tokens + 4 non speculations. + if len(seq_group_metadata_list) == 28: + print('after compute logits') + breakpoint() + # Only perform sampling in the driver worker. if not sampling_metadata.perform_sampling: return None @@ -849,6 +860,12 @@ def execute_model( logits=logits, sampling_metadata=sampling_metadata, ) + + # When 4 speculations x 6 speculated tokens + 4 non speculations. 
+ if len(seq_group_metadata_list) == 28: + print('after sampling') + breakpoint() + return output @torch.inference_mode() From 7ee67f94a4a7ef1e500415be0a8a35aac13cea12 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 16:53:20 -0700 Subject: [PATCH 140/165] speculative_max_model_len --- tests/spec_decode/e2e/test_correctness.py | 1 + vllm/config.py | 11 +++++++---- vllm/engine/arg_utils.py | 12 ++++++++++-- vllm/worker/model_runner.py | 12 ------------ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index faac5b8f55b1..d0789348a499 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -404,6 +404,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "speculative_max_model_len": 32, }, ]) @pytest.mark.parametrize("batch_size", [8]) diff --git a/vllm/config.py b/vllm/config.py index e17642d2691e..0902c7de4437 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -649,6 +649,7 @@ def maybe_create_spec_config( target_dtype: str, speculative_model: Optional[str], num_speculative_tokens: Optional[int], + speculative_max_model_len: Optional[int], ) -> Optional["SpeculativeConfig"]: """Create a SpeculativeConfig if possible, else return None. @@ -666,6 +667,7 @@ def maybe_create_spec_config( model, if provided. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. + TODO speculative_max_model_len Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if @@ -709,11 +711,12 @@ def maybe_create_spec_config( ) # TODO docs - draft_model_config.max_model_len = min( - target_model_config.max_model_len, - draft_model_config.max_model_len) + #draft_model_config.max_model_len = min( + # target_model_config.max_model_len, + # draft_model_config.max_model_len) - draft_model_config.max_model_len = 32 + if speculative_max_model_len is not None: + draft_model_config.max_model_len = speculative_max_model_len draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 63ca8622ebf3..d5ce507ec9e4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,6 +71,7 @@ class EngineArgs: # Speculative decoding configuration. 
speculative_model: Optional[str] = None num_speculative_tokens: Optional[int] = None + speculative_max_model_len: Optional[int] = None def __post_init__(self): if self.tokenizer is None: @@ -406,17 +407,23 @@ def add_cli_args( parser.add_argument( '--speculative-model', type=str, - default=None, + default=EngineArgs.speculative_model, help= 'The name of the draft model to be used in speculative decoding.') parser.add_argument( '--num-speculative-tokens', type=int, - default=None, + default=EngineArgs.num_speculative_tokens, help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding') + parser.add_argument( + '--speculative-max-model-len', + type=str, + default=EngineArgs.speculative_max_model_len, + help='TODO') + parser.add_argument('--model-loader-extra-config', type=str, default=EngineArgs.model_loader_extra_config, @@ -467,6 +474,7 @@ def create_engine_config(self, ) -> EngineConfig: target_dtype=self.dtype, speculative_model=self.speculative_model, num_speculative_tokens=self.num_speculative_tokens, + speculative_max_model_len=self.speculative_max_model_len, ) scheduler_config = SchedulerConfig( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3d9507072ddb..952cc299786f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -838,18 +838,10 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) - # When 4 speculations x 6 speculated tokens + 4 non speculations. - if len(seq_group_metadata_list) == 28: - print('after model execute') - breakpoint() # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) - # When 4 speculations x 6 speculated tokens + 4 non speculations. - if len(seq_group_metadata_list) == 28: - print('after compute logits') - breakpoint() # Only perform sampling in the driver worker. if not sampling_metadata.perform_sampling: @@ -861,10 +853,6 @@ def execute_model( sampling_metadata=sampling_metadata, ) - # When 4 speculations x 6 speculated tokens + 4 non speculations. - if len(seq_group_metadata_list) == 28: - print('after sampling') - breakpoint() return output From 8c1e2a7564895012cf13497d499b0fbc0850419d Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 16:53:54 -0700 Subject: [PATCH 141/165] lint --- tests/spec_decode/e2e/test_correctness.py | 14 +++++++------- vllm/engine/arg_utils.py | 9 ++++----- vllm/spec_decode/batch_expansion.py | 18 ++++++++---------- vllm/spec_decode/multi_step_worker.py | 11 +++++++---- vllm/worker/model_runner.py | 3 --- 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index d0789348a499..9c4467928ca1 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -387,6 +387,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, max_output_len=output_len, force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -398,25 +399,25 @@ def test_spec_decode_different_block_size(baseline_llm_generator, # Required for spec decode. 
"use_v2_block_manager": True }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs",[{}]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - "speculative_max_model_len": 32, + "speculative_max_model_len": 32, # TODO expect fail ? }, ]) @pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("output_len", +@pytest.mark.parametrize( + "output_len", [ # Use smaller output len for fast test. 512, ]) @pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(baseline_llm_generator, - test_llm_generator, batch_size: int, - output_len: int): +def test_skip_speculation(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): """Verify correct output when we skip speculation. Test skip 1, skip >1, skip all. """ @@ -438,7 +439,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, "The president of the United States is", "The capital of France is", "The future of AI is", - "Mark Zuckerberg loves to dance, and", "Ray is a framework for", "Chevelle is a heavy-metal band that", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d5ce507ec9e4..c8f3c9d20b2d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -418,11 +418,10 @@ def add_cli_args( help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding') - parser.add_argument( - '--speculative-max-model-len', - type=str, - default=EngineArgs.speculative_max_model_len, - help='TODO') + parser.add_argument('--speculative-max-model-len', + type=str, + default=EngineArgs.speculative_max_model_len, + help='TODO') parser.add_argument('--model-loader-extra-config', type=str, diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index b876c16730a7..8a5b8d764281 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -135,14 +135,12 @@ def _expand_batch( continue filtered.append(p) - target_seq_group_metadata_list = self._create_scoring_model_input( seq_group_metadata_list=spec_seqs, #proposal_token_ids=proposal_token_ids_list, proposal_token_ids=filtered, target_seq_ids_iter=self._create_target_seq_id_iterator( - seq_ids=get_all_seq_ids(seq_group_metadata_list) - ), + seq_ids=get_all_seq_ids(seq_group_metadata_list)), ) num_scoring_tokens = len(target_seq_group_metadata_list) @@ -181,12 +179,12 @@ def _contract_batch(self, original_bs: int, non_spec_batch_size, _ = non_spec_target_token_ids.shape speculated_batch_size = full_batch_size - non_spec_batch_size # TODO clean up - + try: target_token_ids = target_token_ids.squeeze().reshape( speculated_batch_size, k + 1) - target_probs = target_probs.squeeze().reshape(speculated_batch_size, k + 1, - self._vocab_size) + target_probs = target_probs.squeeze().reshape( + speculated_batch_size, k + 1, self._vocab_size) except Exception as e: print(e) breakpoint() @@ -217,10 +215,10 @@ def _contract_batch(self, original_bs: int, return all_tokens, all_probs def _create_scoring_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] - target_seq_ids_iter: Iterator[TargetSeqId], + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + target_seq_ids_iter: 
Iterator[TargetSeqId], ) -> List[SequenceGroupMetadata]: """Given the original input sequences and proposed tokens from the draft model, create a list of target sequences that can be used for scoring. diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4e421a0f2dc8..292ba64160af 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -341,10 +341,13 @@ def _merge_outputs( # max_proposal_len, # dtype=torch.long, # device=self._device) - proposal_tokens = torch.full(size=(batch_size, max_proposal_len,), - fill_value=-1, - dtype=torch.long, - device=self._device) + proposal_tokens = torch.full(size=( + batch_size, + max_proposal_len, + ), + fill_value=-1, + dtype=torch.long, + device=self._device) #proposal_probs = torch.zeros(0, # max_proposal_len, # self._vocab_size, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 952cc299786f..3124650b3e17 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -838,11 +838,9 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) hidden_states = model_executable(**execute_model_kwargs) - # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) - # Only perform sampling in the driver worker. if not sampling_metadata.perform_sampling: return None @@ -853,7 +851,6 @@ def execute_model( sampling_metadata=sampling_metadata, ) - return output @torch.inference_mode() From 519f5aa4416e4f5562125365ca2dacb1fcf2ac43 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 17:46:30 -0700 Subject: [PATCH 142/165] fix --- tests/spec_decode/e2e/test_correctness.py | 47 ++++++++++++++--------- vllm/config.py | 3 +- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 9c4467928ca1..4b6935247d6f 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -405,7 +405,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - "speculative_max_model_len": 32, # TODO expect fail ? + "speculative_max_model_len": 32, }, ]) @pytest.mark.parametrize("batch_size", [8]) @@ -413,7 +413,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, "output_len", [ # Use smaller output len for fast test. - 512, + 64, ]) @pytest.mark.parametrize("seed", [1]) def test_skip_speculation(baseline_llm_generator, test_llm_generator, @@ -421,17 +421,22 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, """Verify correct output when we skip speculation. Test skip 1, skip >1, skip all. 
""" - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) + run_greedy_equality_correctness_test( + baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True, + print_tokens=True, + ) def run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, batch_size, + test_llm_generator, + batch_size, max_output_len, - force_output_len: bool): + force_output_len: bool, + print_tokens: bool = False): temperature = 0.0 prompts = [ @@ -439,10 +444,10 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, "The president of the United States is", "The capital of France is", "The future of AI is", - "Mark Zuckerberg loves to dance, and", - "Ray is a framework for", - "Chevelle is a heavy-metal band that", - "Park is a common surname from the country of", + "San Francisco is know for its", + "Facebook was created in 2004 by", + "Curious George is a", + "Python 3.11 brings improvements to its", ] prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] @@ -457,17 +462,23 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, temperature=temperature, ) - _, spec_batch_token_ids = get_output_from_llm_generator( + spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator( test_llm_generator, prompts, sampling_params) - _, baseline_batch_token_ids = get_output_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) + (baseline_batch_tokens, + baseline_batch_token_ids) = get_output_from_llm_generator( + baseline_llm_generator, prompts, sampling_params) assert len(baseline_batch_token_ids) == len(prompts) assert len(spec_batch_token_ids) == len(prompts) - for i, (baseline_token_ids, spec_token_ids) in enumerate( - zip(baseline_batch_token_ids, spec_batch_token_ids)): + for i, (baseline_token_ids, baseline_tokens, spec_token_ids, + spec_tokens) in enumerate( + zip(baseline_batch_token_ids, baseline_batch_tokens, + spec_batch_token_ids, spec_batch_tokens)): + if print_tokens: + print(f'{i=} {baseline_tokens=}') + print(f'{i=} {spec_tokens=}') print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids diff --git a/vllm/config.py b/vllm/config.py index 0902c7de4437..8b90eb251602 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -716,7 +716,8 @@ def maybe_create_spec_config( # draft_model_config.max_model_len) if speculative_max_model_len is not None: - draft_model_config.max_model_len = speculative_max_model_len + draft_model_config.max_model_len = min( + speculative_max_model_len, draft_model_config.max_model_len) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( From 0439b7928f6c1845eb4655d9aa8efdb18f6c4bb6 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 17:57:46 -0700 Subject: [PATCH 143/165] test_many_k --- tests/spec_decode/e2e/test_correctness.py | 42 +++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 4b6935247d6f..62be102c382d 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -426,8 +426,46 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, test_llm_generator, batch_size, max_output_len=output_len, - force_output_len=True, - print_tokens=True, + 
force_output_len=True + ) + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + } + # Try a range of common k, as well as large speculation. + for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_many_k(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + run_greedy_equality_correctness_test( + baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True ) From 58ba3b6f725cb3a94c02599cff49d99d460283d2 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 18:21:08 -0700 Subject: [PATCH 144/165] fixes --- tests/spec_decode/e2e/test_correctness.py | 113 +++++++++++++------ tests/spec_decode/test_multi_step_worker.py | 4 +- tests/spec_decode/test_spec_decode_worker.py | 14 +-- vllm/config.py | 11 +- vllm/engine/llm_engine.py | 3 + 5 files changed, 99 insertions(+), 46 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 62be102c382d..1c100de5b8e5 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -28,13 +28,15 @@ "num_speculative_tokens": 5, }, { - # No spec decode. + # Verify the detokenizer assertions in the test work when spec + # decode is disabled. }, ]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) -def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): +def test_spec_decode_e2e_with_detokenization(test_llm_generator, + batch_size: int): """Run generation with speculative decoding on a batch. Verify the engine generates the correct number of tokens (via ignore_eos=True), and that the detokenization matches HF transformers. @@ -98,15 +100,12 @@ def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int): }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - # Try two different num spec tokens. - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize( "output_len", [ @@ -421,13 +420,12 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, """Verify correct output when we skip speculation. Test skip 1, skip >1, skip all. 
""" - run_greedy_equality_correctness_test( - baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True - ) + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + @pytest.mark.parametrize( "common_llm_kwargs", @@ -442,31 +440,78 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + } + # Try a range of common k, as well as large speculation. + for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, + output_len: int): + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + # Try two different tiny base models. + # Note that one is equal to the draft model, another isn't. + { + "model": "JackFram/llama-68m", + }, + #{ + # "model": "JackFram/llama-160m", + #}, + ]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [ { "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": k, - } - # Try a range of common k, as well as large speculation. - for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] + "num_speculative_tokens": 5, + }, ]) -@pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize( "output_len", [ - # Use smaller output len for fast test. - 32, + # Use long output len for the small model test. 
+ 1536, ]) +@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("seed", [1]) -def test_many_k(baseline_llm_generator, test_llm_generator, - batch_size: int, output_len: int): - run_greedy_equality_correctness_test( - baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True - ) +@pytest.mark.skip("used for local testing (cade to remove)") +def test_wip_validate_acceptance_rate(baseline_llm_generator, + test_llm_generator, batch_size: int, + output_len: int): + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) def run_greedy_equality_correctness_test(baseline_llm_generator, diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index d6edbab579af..e7aaa1ff4eff 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -344,8 +344,8 @@ def test_draft_proposals_no_speculations(): assert torch.is_tensor(proposals.proposal_token_ids) assert torch.is_tensor(proposals.proposal_probs) - assert proposals.proposal_token_ids.shape == torch.Size([0, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) assert proposals.proposal_lens.shape == torch.Size([batch_size]) assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 0a3110775e2d..edda1d0aaecc 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,4 +1,5 @@ import random +from types import SimpleNamespace from unittest.mock import MagicMock import pytest @@ -202,17 +203,16 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): num_lookahead_slots=k) assert len(rejection_sampler.call_args_list) == 1 - args, _ = rejection_sampler.call_args_list[0] - (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, - actual_proposal_token_ids) = args + _, kwargs = rejection_sampler.call_args_list[0] + actual = SimpleNamespace(**kwargs) - assert torch.equal(actual_bonus_token_ids, + assert torch.equal(actual.bonus_token_ids, target_token_ids.reshape(batch_size, k + 1)[:, -1:]) assert torch.equal( - actual_proposal_scores, + actual.target_probs, target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) - assert torch.equal(actual_proposal_token_ids, proposal_token_ids) - assert torch.equal(actual_proposal_probs, proposal_probs) + assert torch.equal(actual.draft_token_ids, proposal_token_ids) + assert torch.equal(actual.draft_probs, proposal_probs) @pytest.mark.parametrize('k', [1, 2, 6]) diff --git a/vllm/config.py b/vllm/config.py index 8b90eb251602..50363d3b055a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -715,9 +715,14 @@ def maybe_create_spec_config( # target_model_config.max_model_len, # draft_model_config.max_model_len) - if speculative_max_model_len is not None: - draft_model_config.max_model_len = min( - speculative_max_model_len, draft_model_config.max_model_len) + max_model_lens = [ + speculative_max_model_len, draft_model_config.max_model_len, + target_model_config.max_model_len + ] + draft_model_config.max_model_len = min([ + max_model_len for max_model_len in max_model_lens + if 
max_model_len is not None + ]) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c3de57e249ff..959244887136 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -446,6 +446,9 @@ def _process_model_outputs( output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) + if output and output[0].spec_decode_worker_metrics is not None: + print(f'{output[0].spec_decode_worker_metrics}') + # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): From be38a3347a5b4f0aacfd49f9beb81aaf938b1406 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 23:52:05 -0700 Subject: [PATCH 145/165] fix llmengine issue --- tests/spec_decode/e2e/test_correctness.py | 5 ++-- vllm/engine/llm_engine.py | 9 ++++++- .../layers/rejection_sampler.py | 26 ++++++++++++++----- vllm/spec_decode/batch_expansion.py | 4 +-- vllm/spec_decode/spec_decode_worker.py | 2 ++ 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 1c100de5b8e5..e1759c297fa2 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -499,11 +499,12 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, "output_len", [ # Use long output len for the small model test. - 1536, + #1536, + 128, ]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("seed", [1]) -@pytest.mark.skip("used for local testing (cade to remove)") +#@pytest.mark.skip("used for local testing (cade to remove)") def test_wip_validate_acceptance_rate(baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 959244887136..51925d859bbe 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -457,7 +457,14 @@ def _process_model_outputs( scheduled_seq_group.token_chunk_size) # If uncomputed tokens > 0, it means prefill is chunked. # We don't need to process outputs in that case. - if seq_group.get_num_uncomputed_tokens() == 0: + + from vllm.sequence import SequenceStage + stages = [seq.data._stage for seq in seq_group.seqs_dict.values()] + all_decode = all([stage == SequenceStage.DECODE for stage in stages]) + + print(f'{seq_group.get_num_uncomputed_tokens()=}') + print(f'{all_decode=} {stages=}') + if all_decode: self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. 
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 812ad97ccd1f..72b6dc801754 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -312,13 +312,25 @@ def _create_output( output.mul_(~after_false_mask).add_( recovered_token_ids.mul(after_false_mask)) - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k + da = accepted.sum() + de = (output_with_bonus_tokens != -1).sum() + dd = batch_size * k - print(f'{self.num_accepted_tokens=}') - print(f'{self.num_emitted_tokens=}') - print(f'{self.num_draft_tokens=}') + self.num_accepted_tokens += da + self.num_emitted_tokens += de + self.num_draft_tokens += dd + + #self.num_accepted_tokens += accepted.sum() + #self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + #self.num_draft_tokens += batch_size * k + + print(f'delta num_accepted={da}') + print(f'delta num_emitted_tokens={de}') + print(f'delta num_draft_tokens={dd}') + + print(f'cumulative {self.num_accepted_tokens=}') + print(f'cumulative {self.num_emitted_tokens=}') + print(f'cumulative {self.num_draft_tokens=}') return output_with_bonus_tokens @@ -347,7 +359,7 @@ def _raise_if_incorrect_shape( assert bonus_batch_size == target_batch_size assert num_bonus_tokens == self._num_bonus_tokens except: - breakpoint() + #breakpoint() raise def _raise_if_incorrect_dtype( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 8a5b8d764281..421b483a2524 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -187,7 +187,7 @@ def _contract_batch(self, original_bs: int, speculated_batch_size, k + 1, self._vocab_size) except Exception as e: print(e) - breakpoint() + #breakpoint() raise all_tokens = torch.full(size=(original_bs, k + 1), @@ -205,7 +205,7 @@ def _contract_batch(self, original_bs: int, all_tokens[non_spec_indices, :1] = non_spec_target_token_ids all_probs[non_spec_indices, :1, :] = non_spec_target_probs except: - breakpoint() + #breakpoint() raise if spec_indices: diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 0d1e8b167d58..ff7e0f280653 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -299,6 +299,8 @@ def _verify_tokens( draft_token_ids=proposal_token_ids, ) + print(f'{accepted_token_ids=}') + # Append output tokens from non-speculative sequences to # the accepted token ids tensor. 
non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + From 36c741ab373c32b1684e2d38879e023250f27c04 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 23:58:36 -0700 Subject: [PATCH 146/165] wip metrics --- vllm/spec_decode/metrics.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 5df8fc4316d4..ef6bf705780c 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -169,6 +169,25 @@ def _collect_rejsample_metrics( @staticmethod def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + + """ + SpecDecodeWorkerMetrics( + draft_acceptance_rate=0.888, + system_efficiency=29.5, + draft_tokens=125, + emitted_tokens=118, + accepted_tokens=111, + num_spec_tokens=5, + ) + + accepted / draft ~= 88% + 125/5 25 spec steps + grep says 27 verify steps + ok 25 makes sense (metrics lag) + + 118/(25 * (5+1)) = 118/150 = 0.78666666666 + """ + # Divide by k since batch size can be variable. total_num_spec_seqs = draft_tokens / k num_accepted_per_seq_if_all_accepted = k + 1 From 7297bae9979df4b5bcfa13544aaa8f05b986d521 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 17 Apr 2024 23:59:07 -0700 Subject: [PATCH 147/165] lint --- vllm/engine/llm_engine.py | 7 ++++--- vllm/spec_decode/metrics.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 51925d859bbe..390d09e70b2a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -457,11 +457,12 @@ def _process_model_outputs( scheduled_seq_group.token_chunk_size) # If uncomputed tokens > 0, it means prefill is chunked. # We don't need to process outputs in that case. - + from vllm.sequence import SequenceStage stages = [seq.data._stage for seq in seq_group.seqs_dict.values()] - all_decode = all([stage == SequenceStage.DECODE for stage in stages]) - + all_decode = all( + [stage == SequenceStage.DECODE for stage in stages]) + print(f'{seq_group.get_num_uncomputed_tokens()=}') print(f'{all_decode=} {stages=}') if all_decode: diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index ef6bf705780c..ca7a3504d83a 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -169,7 +169,6 @@ def _collect_rejsample_metrics( @staticmethod def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: - """ SpecDecodeWorkerMetrics( draft_acceptance_rate=0.888, From 140f198f38f85918990bea9c7c7f0b0a013d6376 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 18 Apr 2024 13:37:50 -0700 Subject: [PATCH 148/165] fix system efficiency metric --- tests/spec_decode/e2e/test_correctness.py | 1 - vllm/spec_decode/metrics.py | 22 ++-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index e1759c297fa2..fca8f40801d2 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -504,7 +504,6 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, ]) @pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("seed", [1]) -#@pytest.mark.skip("used for local testing (cade to remove)") def test_wip_validate_acceptance_rate(baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index ca7a3504d83a..2117bc5b5a7c 100644 --- 
a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -169,25 +169,7 @@ def _collect_rejsample_metrics( @staticmethod def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: - """ - SpecDecodeWorkerMetrics( - draft_acceptance_rate=0.888, - system_efficiency=29.5, - draft_tokens=125, - emitted_tokens=118, - accepted_tokens=111, - num_spec_tokens=5, - ) - - accepted / draft ~= 88% - 125/5 25 spec steps - grep says 27 verify steps - ok 25 makes sense (metrics lag) - - 118/(25 * (5+1)) = 118/150 = 0.78666666666 - """ - # Divide by k since batch size can be variable. - total_num_spec_seqs = draft_tokens / k + total_num_spec_seqs = int(draft_tokens / k) num_accepted_per_seq_if_all_accepted = k + 1 - return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted) + return total_num_spec_seqs * num_accepted_per_seq_if_all_accepted From f9b8a6802c4663256d62c1a83eff7b91a01b3eb8 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Thu, 18 Apr 2024 17:02:11 -0700 Subject: [PATCH 149/165] debugging prints --- tests/spec_decode/e2e/test_correctness.py | 1 + .../layers/rejection_sampler.py | 28 +++++++++++++++++++ vllm/model_executor/layers/sampler.py | 12 +++++--- vllm/model_executor/models/llama.py | 13 +++++++++ vllm/spec_decode/multi_step_worker.py | 4 ++- vllm/spec_decode/spec_decode_worker.py | 3 ++ vllm/worker/model_runner.py | 7 +++++ vllm/worker/worker.py | 5 ++++ 8 files changed, 68 insertions(+), 5 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fca8f40801d2..4a450d29340a 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -483,6 +483,7 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, # Note that one is equal to the draft model, another isn't. { "model": "JackFram/llama-68m", + "gpu_memory_utilization": 0.5, }, #{ # "model": "JackFram/llama-160m", diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 72b6dc801754..caabd526a6d8 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -116,6 +116,7 @@ def forward( draft_token_ids, bonus_token_ids, ) + return output_token_ids def _batch_modified_rejection_sampling( @@ -137,6 +138,17 @@ def _batch_modified_rejection_sampling( batch_size, k, vocab_size = draft_probs.shape + target_probs_unmodified = target_probs.clone() + draft_probs_unmodified = draft_probs.clone() + + target_greedy_bois = target_probs.argmax(dim=-1) + draft_greedy_bois = draft_probs.argmax(dim=-1) + + target_probs[:] = 0 + target_probs[:, torch.arange(k), target_greedy_bois] = 1 + draft_probs[:] = 0 + draft_probs[:, torch.arange(k), draft_greedy_bois] = 1 + # shape [batch_size, k] accepted = self._get_accepted(target_probs, draft_probs, draft_token_ids) @@ -144,10 +156,24 @@ def _batch_modified_rejection_sampling( recovered_probs = self._get_recovered_probs( target_probs, draft_probs).reshape(batch_size * k, vocab_size) + recovered_probs_clone = recovered_probs.clone() + # NOTE: the recovered_probs are overwritten by this method. 
recovered_token_ids = _multinomial(recovered_probs, num_samples=1).reshape( batch_size, k) + + if False in accepted: + print(f'{accepted=}') + print(f'{target_greedy_bois=}') + print(f'{draft_greedy_bois=}') + toks = [target_greedy_bois[0, 1].item(), draft_greedy_bois[0, 1].item()] + print(f'{toks=}') + print(f'{target_probs_unmodified[0, 1, toks]=}') + print(f'{draft_probs_unmodified[0, 1, toks]=}') + # [1, 15043, 29892, 590, 1024, 338, 590, 1024, 29889, 306, 626, 263, 29871] + #breakpoint() + return accepted, recovered_token_ids def _get_accepted( @@ -307,6 +333,8 @@ def _create_output( # with causal acceptance. output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, bonus_token_ids, -1) + print(f'disabling bonus token') + output_with_bonus_tokens[:, -1] = -1 # Fill the recovered token ids. output.mul_(~after_false_mask).add_( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index e97941f01ac9..c2832fc0bbb0 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -398,10 +398,14 @@ def _sample_with_torch( # TODO clean up # self._include_gpu_probs_tensor - logprobs[s_i, :] = -float('inf') - logprobs[s_i, greedy_samples] = 0.0 - probs[s_i, :] = 0 - probs[s_i, greedy_samples] = 1.0 + # modify greedy probs + modify_greedy_probs = False + if modify_greedy_probs: + logprobs[s_i, :] = -float('inf') + logprobs[s_i, greedy_samples] = 0.0 + probs[s_i, :] = 0 + probs[s_i, greedy_samples] = 1.0 + sampled_token_ids_tensor[s_i] = greedy_samples.unsqueeze(-1) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1cf416f80b02..9dc728755cf6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -285,6 +285,13 @@ def forward( hidden_states = self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): + def xform(kv_cache, blocks): + if kv_cache is None: + return None + return torch.abs(kv_cache[:, :blocks]).sum(-1) + + uniq_hs_val_0 = torch.abs(hidden_states).sum(dim=-1) + uniq_kv_val_0 = xform(kv_caches[i], blocks=2) layer = self.layers[i] hidden_states, residual = layer( positions, @@ -293,6 +300,12 @@ def forward( attn_metadata, residual, ) + + uniq_hs_val_1 = torch.abs(hidden_states).sum(dim=-1) + uniq_kv_val_1 = xform(kv_caches[i], blocks=2) + print(f'uniq_val hs {positions=} {i=} {uniq_hs_val_0=} -> {uniq_hs_val_1}=') + print(f'uniq_val kv {positions=} {i=} {uniq_kv_val_0=} -> {uniq_kv_val_1}=') + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 292ba64160af..d5a3fc990534 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -62,7 +62,9 @@ def execute_model_multi_step( # Run model num_steps times. 
model_outputs = [] - for _ in range(num_steps): + for i in range(num_steps): + print(f'multi step worker running step {i}/{num_steps}') + model_output = super().execute_model( seq_group_metadata_list=copied_seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index ff7e0f280653..751e9c6d06e2 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -292,6 +292,9 @@ def _verify_tokens( #if -1 in proposals.proposal_token_ids: # breakpoint() + self.rejection_sampler.seq_group_metadata_list = seq_group_metadata_list + self.rejection_sampler.spec_indices = spec_indices + accepted_token_ids = self.rejection_sampler( target_probs=proposal_verifier_probs, bonus_token_ids=bonus_token_ids, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3124650b3e17..fe4957f3c502 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -510,6 +510,12 @@ def _prepare_decode( else: max_block_table_len = max( len(block_table) for block_table in block_tables) + print(f'model runner {input_tokens=}') + print(f'model runner {input_positions=}') + print(f'model runner {context_lens=}') + print(f'model runner {slot_mapping=}') + print(f'model runner {max_block_table_len=}') + print(f'model runner {block_tables=}') block_tables = make_tensor_with_pad( block_tables, max_len=max_block_table_len, @@ -518,6 +524,7 @@ def _prepare_decode( device=self.device, ) + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, prompt_lens=None, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e2b47530d41e..d775c2bcc8de 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -212,6 +212,11 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> List[SamplerOutput]: + + print(f'worker execute model') + for seq_group in seq_group_metadata_list: + for seq_data in seq_group.seq_data.values(): + print(f'{seq_data.get_token_ids()}=') if self.is_driver_worker: assert seq_group_metadata_list is not None From c5af09e91319aa96d558f39ea314157ff025dbef Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 19 Apr 2024 11:47:17 -0700 Subject: [PATCH 150/165] Revert "debugging prints" This reverts commit f9b8a6802c4663256d62c1a83eff7b91a01b3eb8. --- tests/spec_decode/e2e/test_correctness.py | 1 - .../layers/rejection_sampler.py | 28 ------------------- vllm/model_executor/layers/sampler.py | 12 +++----- vllm/model_executor/models/llama.py | 13 --------- vllm/spec_decode/multi_step_worker.py | 4 +-- vllm/spec_decode/spec_decode_worker.py | 3 -- vllm/worker/model_runner.py | 7 ----- vllm/worker/worker.py | 5 ---- 8 files changed, 5 insertions(+), 68 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 4a450d29340a..fca8f40801d2 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -483,7 +483,6 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, # Note that one is equal to the draft model, another isn't. 
{ "model": "JackFram/llama-68m", - "gpu_memory_utilization": 0.5, }, #{ # "model": "JackFram/llama-160m", diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index caabd526a6d8..72b6dc801754 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -116,7 +116,6 @@ def forward( draft_token_ids, bonus_token_ids, ) - return output_token_ids def _batch_modified_rejection_sampling( @@ -138,17 +137,6 @@ def _batch_modified_rejection_sampling( batch_size, k, vocab_size = draft_probs.shape - target_probs_unmodified = target_probs.clone() - draft_probs_unmodified = draft_probs.clone() - - target_greedy_bois = target_probs.argmax(dim=-1) - draft_greedy_bois = draft_probs.argmax(dim=-1) - - target_probs[:] = 0 - target_probs[:, torch.arange(k), target_greedy_bois] = 1 - draft_probs[:] = 0 - draft_probs[:, torch.arange(k), draft_greedy_bois] = 1 - # shape [batch_size, k] accepted = self._get_accepted(target_probs, draft_probs, draft_token_ids) @@ -156,24 +144,10 @@ def _batch_modified_rejection_sampling( recovered_probs = self._get_recovered_probs( target_probs, draft_probs).reshape(batch_size * k, vocab_size) - recovered_probs_clone = recovered_probs.clone() - # NOTE: the recovered_probs are overwritten by this method. recovered_token_ids = _multinomial(recovered_probs, num_samples=1).reshape( batch_size, k) - - if False in accepted: - print(f'{accepted=}') - print(f'{target_greedy_bois=}') - print(f'{draft_greedy_bois=}') - toks = [target_greedy_bois[0, 1].item(), draft_greedy_bois[0, 1].item()] - print(f'{toks=}') - print(f'{target_probs_unmodified[0, 1, toks]=}') - print(f'{draft_probs_unmodified[0, 1, toks]=}') - # [1, 15043, 29892, 590, 1024, 338, 590, 1024, 29889, 306, 626, 263, 29871] - #breakpoint() - return accepted, recovered_token_ids def _get_accepted( @@ -333,8 +307,6 @@ def _create_output( # with causal acceptance. output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, bonus_token_ids, -1) - print(f'disabling bonus token') - output_with_bonus_tokens[:, -1] = -1 # Fill the recovered token ids. 
output.mul_(~after_false_mask).add_( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2832fc0bbb0..e97941f01ac9 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -398,14 +398,10 @@ def _sample_with_torch( # TODO clean up # self._include_gpu_probs_tensor - # modify greedy probs - modify_greedy_probs = False - if modify_greedy_probs: - logprobs[s_i, :] = -float('inf') - logprobs[s_i, greedy_samples] = 0.0 - probs[s_i, :] = 0 - probs[s_i, greedy_samples] = 1.0 - + logprobs[s_i, :] = -float('inf') + logprobs[s_i, greedy_samples] = 0.0 + probs[s_i, :] = 0 + probs[s_i, greedy_samples] = 1.0 sampled_token_ids_tensor[s_i] = greedy_samples.unsqueeze(-1) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9dc728755cf6..1cf416f80b02 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -285,13 +285,6 @@ def forward( hidden_states = self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): - def xform(kv_cache, blocks): - if kv_cache is None: - return None - return torch.abs(kv_cache[:, :blocks]).sum(-1) - - uniq_hs_val_0 = torch.abs(hidden_states).sum(dim=-1) - uniq_kv_val_0 = xform(kv_caches[i], blocks=2) layer = self.layers[i] hidden_states, residual = layer( positions, @@ -300,12 +293,6 @@ def xform(kv_cache, blocks): attn_metadata, residual, ) - - uniq_hs_val_1 = torch.abs(hidden_states).sum(dim=-1) - uniq_kv_val_1 = xform(kv_caches[i], blocks=2) - print(f'uniq_val hs {positions=} {i=} {uniq_hs_val_0=} -> {uniq_hs_val_1}=') - print(f'uniq_val kv {positions=} {i=} {uniq_kv_val_0=} -> {uniq_kv_val_1}=') - hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d5a3fc990534..292ba64160af 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -62,9 +62,7 @@ def execute_model_multi_step( # Run model num_steps times. 
model_outputs = [] - for i in range(num_steps): - print(f'multi step worker running step {i}/{num_steps}') - + for _ in range(num_steps): model_output = super().execute_model( seq_group_metadata_list=copied_seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 751e9c6d06e2..ff7e0f280653 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -292,9 +292,6 @@ def _verify_tokens( #if -1 in proposals.proposal_token_ids: # breakpoint() - self.rejection_sampler.seq_group_metadata_list = seq_group_metadata_list - self.rejection_sampler.spec_indices = spec_indices - accepted_token_ids = self.rejection_sampler( target_probs=proposal_verifier_probs, bonus_token_ids=bonus_token_ids, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fe4957f3c502..3124650b3e17 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -510,12 +510,6 @@ def _prepare_decode( else: max_block_table_len = max( len(block_table) for block_table in block_tables) - print(f'model runner {input_tokens=}') - print(f'model runner {input_positions=}') - print(f'model runner {context_lens=}') - print(f'model runner {slot_mapping=}') - print(f'model runner {max_block_table_len=}') - print(f'model runner {block_tables=}') block_tables = make_tensor_with_pad( block_tables, max_len=max_block_table_len, @@ -524,7 +518,6 @@ def _prepare_decode( device=self.device, ) - attn_metadata = self.attn_backend.make_metadata( is_prompt=False, prompt_lens=None, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d775c2bcc8de..e2b47530d41e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -212,11 +212,6 @@ def execute_model( blocks_to_copy: Optional[Dict[int, List[int]]] = None, num_lookahead_slots: int = 0, ) -> List[SamplerOutput]: - - print(f'worker execute model') - for seq_group in seq_group_metadata_list: - for seq_data in seq_group.seq_data.values(): - print(f'{seq_data.get_token_ids()}=') if self.is_driver_worker: assert seq_group_metadata_list is not None From da95e2210767e8ba102fddfc2cdec4877a95591c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 19 Apr 2024 11:49:20 -0700 Subject: [PATCH 151/165] disable bonus token --- vllm/model_executor/layers/rejection_sampler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 72b6dc801754..2dce1a07b4bf 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -307,6 +307,12 @@ def _create_output( # with causal acceptance. output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, bonus_token_ids, -1) + + # We disable bonus tokens because it causes corrupt KV cache for + # proposal methods that require KV cache. We can fix it by "prefilling" + # the bonus token in the proposer. + # https://github.com/vllm-project/vllm/issues/4212 + output_with_bonus_tokens[:, -1] = -1 # Fill the recovered token ids. 
output.mul_(~after_false_mask).add_( From 9f8ff5634d053a11f6951bd125b3ba5bf8cd0e54 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 21:32:34 -0700 Subject: [PATCH 152/165] cleanup --- tests/spec_decode/e2e/test_correctness.py | 24 ++++-- vllm/config.py | 45 ++++++++--- vllm/core/scheduler.py | 1 - vllm/engine/arg_utils.py | 4 +- vllm/engine/llm_engine.py | 29 ++++--- vllm/executor/gpu_executor.py | 3 +- .../layers/rejection_sampler.py | 22 +---- vllm/model_executor/models/llama.py | 3 - vllm/sequence.py | 1 - vllm/spec_decode/batch_expansion.py | 81 ++++++++----------- vllm/spec_decode/metrics.py | 27 +++++-- vllm/spec_decode/multi_step_worker.py | 32 +------- vllm/spec_decode/spec_decode_worker.py | 17 ++-- vllm/worker/model_runner.py | 2 - 14 files changed, 139 insertions(+), 152 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index fca8f40801d2..7b50046e8d20 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -85,7 +85,10 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, "enforce_eager": True, # Required for spec decode. - "use_v2_block_manager": True + "use_v2_block_manager": True, + + # Print spec metrics. + "disable_log_stats": False, }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -131,7 +134,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( "enforce_eager": True, # Required for spec decode. - "use_v2_block_manager": True + "use_v2_block_manager": True, + + # Print spec metrics. + "disable_log_stats": False, }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -226,7 +232,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( "enforce_eager": True, # Required for spec decode. - "use_v2_block_manager": True + "use_v2_block_manager": True, + + # Print spec metrics. + "disable_log_stats": False, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -264,7 +273,10 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( "enforce_eager": True, # Required for spec decode. - "use_v2_block_manager": True + "use_v2_block_manager": True, + + # Print spec metrics. + "disable_log_stats": False, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @@ -474,7 +486,9 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, "enforce_eager": True, # Required for spec decode. - "use_v2_block_manager": True + "use_v2_block_manager": True, + + "disable_log_stats": False, }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", diff --git a/vllm/config.py b/vllm/config.py index 50363d3b055a..177e9c68283c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -667,7 +667,9 @@ def maybe_create_spec_config( model, if provided. num_speculative_tokens (Optional[int]): The number of speculative tokens, if provided. - TODO speculative_max_model_len + speculative_max_model_len (Optional[int]): The maximum model len of + the speculative model. Used when testing the ability to skip + speculation for some sequences. 
Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if @@ -710,19 +712,10 @@ def maybe_create_spec_config( max_logprobs=target_model_config.max_logprobs, ) - # TODO docs - #draft_model_config.max_model_len = min( - # target_model_config.max_model_len, - # draft_model_config.max_model_len) + draft_model_config.max_model_len = (SpeculativeConfig._maybe_override_draft_max_model_len( + speculative_max_model_len, draft_model_config.max_model_len, target_model_config.max_model_len, + )) - max_model_lens = [ - speculative_max_model_len, draft_model_config.max_model_len, - target_model_config.max_model_len - ] - draft_model_config.max_model_len = min([ - max_model_len for max_model_len in max_model_lens - if max_model_len is not None - ]) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( @@ -734,6 +727,32 @@ def maybe_create_spec_config( num_speculative_tokens, ) + @staticmethod + def _maybe_override_draft_max_model_len( + speculative_max_model_len: Optional[int], + draft_max_model_len: int, + target_max_model_len: int, + ) -> int: + """Determine the max sequence len for the draft model. This is usually + the draft_max_model_len, but may be the target_max_model_len if it is + less than the draft_max_model_len, or may be speculative_max_model_len + if it is specified. + + This is necessary so that sequences do not exceed the capacity of the + draft model or the target model. + + speculative_max_model_len is mainly used for testing that sequences can + skip speculation. + """ + + if speculative_max_model_len is not None: + return speculative_max_model_len + + return min( + draft_max_model_len, + target_max_model_len, + ) + @staticmethod def create_draft_parallel_config( target_parallel_config: ParallelConfig) -> ParallelConfig: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index bc55a3899035..419855062103 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1107,5 +1107,4 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup, if enable_chunking and len(seqs) == 1: num_new_tokens = min(num_new_tokens, budget.remaining_token_budget()) - return num_new_tokens diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c8f3c9d20b2d..fe1f205d84e1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -421,7 +421,9 @@ def add_cli_args( parser.add_argument('--speculative-max-model-len', type=str, default=EngineArgs.speculative_max_model_len, - help='TODO') + help='The maximum sequence length supported by the ' + 'draft model. 
Sequences over this length will skip ' + 'speculation.') parser.add_argument('--model-loader-extra-config', type=str, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 390d09e70b2a..2c7cc4c916c4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,7 +22,7 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (MultiModalData, SamplerOutput, Sequence, - SequenceGroup) + SequenceGroup, SequenceStage) from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) @@ -446,26 +446,18 @@ def _process_model_outputs( output_by_sequence_group = create_output_by_sequence_group( sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups)) - if output and output[0].spec_decode_worker_metrics is not None: - print(f'{output[0].spec_decode_worker_metrics}') - # Update the scheduled sequence groups with the model outputs. for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output_by_sequence_group): seq_group = scheduled_seq_group.seq_group seq_group.update_num_computed_tokens( scheduled_seq_group.token_chunk_size) - # If uncomputed tokens > 0, it means prefill is chunked. - # We don't need to process outputs in that case. - from vllm.sequence import SequenceStage + # If all sequences in the sequence group are in DECODE, then we can + # process the output tokens. Otherwise, they are (chunked) prefill + # samples and should not be processed. stages = [seq.data._stage for seq in seq_group.seqs_dict.values()] - all_decode = all( - [stage == SequenceStage.DECODE for stage in stages]) - - print(f'{seq_group.get_num_uncomputed_tokens()=}') - print(f'{all_decode=} {stages=}') - if all_decode: + if all(stage == SequenceStage.DECODE for stage in stages): self.output_processor.process_outputs(seq_group, outputs) # Free the finished sequence groups. @@ -553,6 +545,17 @@ def step(self) -> List[RequestOutput]: # Log stats. if self.log_stats: self.stat_logger.log(self._get_stats(scheduler_outputs)) + if output and output[0].spec_decode_worker_metrics is not None: + # TODO Integrate speculative metrics with Prometheus/stdout + # stats logger. + metrics = output[0].spec_decode_worker_metrics + logger.info("Speculative metrics: " + f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " + f"System efficiency: {metrics.system_efficiency:.3f}, " + f"Number of speculative tokens: {metrics.num_spec_tokens}, " + f"Number of accepted tokens: {metrics.accepted_tokens}, " + f"Number of draft tokens tokens: {metrics.draft_tokens}, " + f"Number of emitted tokens tokens: {metrics.emitted_tokens}.") return request_outputs diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index fc4ce7aa228a..d413a7d27ff3 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -83,7 +83,8 @@ def _init_spec_worker(self): scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, - load_config=self.load_config, # TODO get from spec + # TODO allow draft-model specific load config. 
+ load_config=self.load_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 2dce1a07b4bf..d1521c589e21 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -318,25 +318,9 @@ def _create_output( output.mul_(~after_false_mask).add_( recovered_token_ids.mul(after_false_mask)) - da = accepted.sum() - de = (output_with_bonus_tokens != -1).sum() - dd = batch_size * k - - self.num_accepted_tokens += da - self.num_emitted_tokens += de - self.num_draft_tokens += dd - - #self.num_accepted_tokens += accepted.sum() - #self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - #self.num_draft_tokens += batch_size * k - - print(f'delta num_accepted={da}') - print(f'delta num_emitted_tokens={de}') - print(f'delta num_draft_tokens={dd}') - - print(f'cumulative {self.num_accepted_tokens=}') - print(f'cumulative {self.num_emitted_tokens=}') - print(f'cumulative {self.num_draft_tokens=}') + self.num_accepted_tokens += accepted.sum() + self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + self.num_draft_tokens += batch_size * k return output_with_bonus_tokens diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1cf416f80b02..016e3b039d1e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -266,9 +266,6 @@ def __init__( self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - #print(f'get_input_embeddings {input_ids.shape=} {input_ids=}') - #if input_ids.shape[0] == 43: - # breakpoint() return self.embed_tokens(input_ids) def forward( diff --git a/vllm/sequence.py b/vllm/sequence.py index 9202e0bb69bf..92362a9a5d2a 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -147,7 +147,6 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int): self._num_computed_tokens += num_new_computed_tokens assert self._num_computed_tokens <= self.get_len(), ( self._num_computed_tokens, self.get_len()) - # If all tokens are computed, it means it is in decoding phase. if self.get_num_uncomputed_tokens() == 0: self._stage = SequenceStage.DECODE diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 421b483a2524..0720b7f2c101 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -72,10 +72,13 @@ def score_proposals( proposal_lens_list = proposals.proposal_lens.tolist() proposal_token_ids_list = proposals.proposal_token_ids.tolist() + # Filter the list to ignore -1 proposals. 
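+        # For example (illustrative values only): [[5, 3, 9], [-1, -1, -1], [2, 2, 7]]
+        # becomes [[5, 3, 9], [2, 2, 7]]; a sequence that skipped speculation
+        # proposes all -1s and must not be expanded for scoring.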
+ proposal_token_ids_list_without_skips = [proposals for proposals in proposal_token_ids_list if -1 not in proposals] + (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) = self._expand_batch( seq_group_metadata_list=seq_group_metadata_list, - proposal_token_ids_list=proposal_token_ids_list, + proposal_token_ids_list=proposal_token_ids_list_without_skips, proposal_lens_list=proposal_lens_list, ) @@ -89,7 +92,7 @@ def score_proposals( target_sampler_output = target_sampler_output[0] all_tokens, all_probs = self._contract_batch( - original_bs=len(seq_group_metadata_list), + contracted_bs=len(seq_group_metadata_list), target_sampler_output=target_sampler_output, proposals=proposals, num_scoring_tokens=num_scoring_tokens, @@ -127,18 +130,11 @@ def _expand_batch( proposal_lens_list, select_proposal_len_zero=True) - # TODO clean up - filtered = [] - for p in proposal_token_ids_list: - if -1 in p: - assert all([x == -1 for x in p]) - continue - filtered.append(p) - target_seq_group_metadata_list = self._create_scoring_model_input( seq_group_metadata_list=spec_seqs, - #proposal_token_ids=proposal_token_ids_list, - proposal_token_ids=filtered, + proposal_token_ids=proposal_token_ids_list, + # NOTE: We determine the seq ids in the expanded batch using the + # full seq_group_metadata_list, instead of only spec_seqs. target_seq_ids_iter=self._create_target_seq_id_iterator( seq_ids=get_all_seq_ids(seq_group_metadata_list)), ) @@ -149,7 +145,7 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) - def _contract_batch(self, original_bs: int, + def _contract_batch(self, contracted_bs: int, target_sampler_output: List[SamplerOutput], proposals: SpeculativeProposals, num_scoring_tokens: int, non_spec_indices: List[int], @@ -158,55 +154,42 @@ def _contract_batch(self, original_bs: int, """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. - """ - - # We mock the device tensors until PR 7/9 is merged (e2e correctness). - # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer - #maybe_mock_device_tensors( - # sampler_output=target_sampler_output, - # batch_size=len(non_spec_indices) + num_scoring_tokens, - # vocab_size=self._vocab_size, - # device=self._device, - #) + contracted_bs is the original batch size, and the batch size that the + target_sampler_output will be contracted to. + """ (target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs) = self._split_scoring_output( target_sampler_output, num_scoring_tokens) # Map distinct sequences used to score each token # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
- full_batch_size, k = proposals.proposal_token_ids.shape - non_spec_batch_size, _ = non_spec_target_token_ids.shape - speculated_batch_size = full_batch_size - non_spec_batch_size - # TODO clean up - - try: - target_token_ids = target_token_ids.squeeze().reshape( - speculated_batch_size, k + 1) - target_probs = target_probs.squeeze().reshape( - speculated_batch_size, k + 1, self._vocab_size) - except Exception as e: - print(e) - #breakpoint() - raise - - all_tokens = torch.full(size=(original_bs, k + 1), + expanded_batch_size, k = proposals.proposal_token_ids.shape + + # The number of tokens in the expanded batch used for speculation is + # equal to the total expanded batch size minus the number of samples for + # non-speculative sequences. + non_spec_expanded_bs, _ = non_spec_target_token_ids.shape + spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs + + target_token_ids = target_token_ids.squeeze().reshape( + spec_expanded_bs, k + 1) + target_probs = target_probs.squeeze().reshape( + spec_expanded_bs, k + 1, self._vocab_size) + + all_tokens = torch.full(size=(contracted_bs, k + 1), fill_value=-1, device=self._device, dtype=torch.long) - all_probs = torch.zeros(original_bs, + all_probs = torch.zeros(contracted_bs, k + 1, self._vocab_size, device=self._device, dtype=torch.float32) if non_spec_indices: - try: - all_tokens[non_spec_indices, :1] = non_spec_target_token_ids - all_probs[non_spec_indices, :1, :] = non_spec_target_probs - except: - #breakpoint() - raise + all_tokens[non_spec_indices, :1] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs if spec_indices: all_tokens[spec_indices] = target_token_ids @@ -223,7 +206,9 @@ def _create_scoring_model_input( """Given the original input sequences and proposed tokens from the draft model, create a list of target sequences that can be used for scoring. - TODO docs on target_seq_ids_iter + target_seq_ids_iter provides sequence ids for the expanded batch, + fulfilling the requirement that no seq id in the expanded batch is equal + to the seq id in the original batch. """ if not seq_group_metadata_list: @@ -244,7 +229,7 @@ def _create_scoring_model_input( def _create_target_seq_group_metadata( self, input_seq_group_metadata: SequenceGroupMetadata, - proposal_token_ids: List[TokenId], # shape: [batch_size, k] + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] batch_index: int, target_seq_ids_iter: Iterator[TargetSeqId], ) -> List[SequenceGroupMetadata]: diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 2117bc5b5a7c..9b17a12cc173 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -146,15 +146,15 @@ def _collect_rejsample_metrics( emitted_tokens = self._aggregate_num_emitted_tokens.item() draft_tokens = self._aggregate_num_draft_tokens - num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + max_num_emitted_tokens = self.get_max_num_emitted_tokens(draft_tokens, k) if draft_tokens > 0: draft_acceptance_rate = accepted_tokens / draft_tokens else: draft_acceptance_rate = float("nan") - if num_possible_tokens > 0: - system_efficiency = emitted_tokens / num_possible_tokens + if max_num_emitted_tokens > 0: + system_efficiency = emitted_tokens / max_num_emitted_tokens else: system_efficiency = float("nan") @@ -168,8 +168,21 @@ def _collect_rejsample_metrics( ) @staticmethod - def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: - # Divide by k since batch size can be variable. 
+ def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int: + """Calculate the number of emitted tokens, assuming all tokens are + accepted. + + This is equal to the number of sequences that have been speculated on, + times (speculation len + 1). The +1 comes from the bonus token. + """ + # Determine the number of sequences that have been speculated on. Since + # the batch size can be variable, we divide by k. total_num_spec_seqs = int(draft_tokens / k) - num_accepted_per_seq_if_all_accepted = k + 1 - return total_num_spec_seqs * num_accepted_per_seq_if_all_accepted + + # A single sequence may emit k accepted tokens and one bonus token in + # the best case. + num_emitted_per_seq_if_all_accepted = k + 1 + + # The max num of emitted tokens is the number of speculated sequences + # times the max emitted per seq. + return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 292ba64160af..2954898ea93f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -255,11 +255,6 @@ def get_proposals( nonzero_proposal_len_indices) = self._split_by_max_model_len( seq_group_metadata_list, max_proposal_len) - print(f'{proposal_lens=}') - if 0 in proposal_lens: - pass - #breakpoint() - if nonzero_proposal_len_seqs: # Speculate tokens using the draft worker for the speculative # sequences. @@ -274,10 +269,6 @@ def get_proposals( # If no sequences can be speculated, set sampler output to None. maybe_sampler_output = None - if 0 in proposal_lens: - pass - #breakpoint() - # Combine speculative- and non-speculative sequences into the same # representation. proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( @@ -336,11 +327,7 @@ def _merge_outputs( """ if maybe_sampler_output is None: # If no speculative tokens, the sampler output will be None. - # In this case we return empty tensors. - #proposal_tokens = torch.zeros(0, - # max_proposal_len, - # dtype=torch.long, - # device=self._device) + # In this case we return empty proposals. proposal_tokens = torch.full(size=( batch_size, max_proposal_len, @@ -348,11 +335,6 @@ def _merge_outputs( fill_value=-1, dtype=torch.long, device=self._device) - #proposal_probs = torch.zeros(0, - # max_proposal_len, - # self._vocab_size, - # dtype=torch.float32, - # device=self._device) proposal_probs = torch.zeros(batch_size, max_proposal_len, self._vocab_size, @@ -364,18 +346,6 @@ def _merge_outputs( return proposal_tokens, proposal_probs, proposal_lens sampler_output = maybe_sampler_output - - # We mock the device tensors until PR 7/9 is merged (e2e correctness). 
- # https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer - for step_output in sampler_output: - pass - #maybe_mock_device_tensors( - # sampler_output=step_output, - # batch_size=len(proposal_lens), - # vocab_size=self._vocab_size, - # device=self._device, - #) - proposal_tokens, proposal_probs = sampler_output_to_torch( sampler_output) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index ff7e0f280653..517d17d04f95 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -238,7 +238,6 @@ def _run_speculative_decoding_step( blocks_to_copy, k) logger.info("score proposals") - print(f'{proposals=}') proposal_scores = self.scorer.score_proposals( seq_group_metadata_list, blocks_to_swap_in, @@ -283,14 +282,20 @@ def _verify_tokens( select_proposal_len_zero=True) original_indices = spec_indices + non_spec_indices + # Get probabilities of target model, excluding bonus token. proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1] + + # Get non-speculative sampled tokens from target model. + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + # Get bonus tokens from target model. bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + + # Get probabilities according to proposal method. proposal_probs = proposals.proposal_probs[spec_indices] + + # Get proposed tokens. proposal_token_ids = proposals.proposal_token_ids[spec_indices] - non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] - - #if -1 in proposals.proposal_token_ids: - # breakpoint() accepted_token_ids = self.rejection_sampler( target_probs=proposal_verifier_probs, @@ -299,8 +304,6 @@ def _verify_tokens( draft_token_ids=proposal_token_ids, ) - print(f'{accepted_token_ids=}') - # Append output tokens from non-speculative sequences to # the accepted token ids tensor. 
non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3124650b3e17..42c06a1b1936 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -812,7 +812,6 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -850,7 +849,6 @@ def execute_model( logits=logits, sampling_metadata=sampling_metadata, ) - return output @torch.inference_mode() From a937a49a8659fc57a16c441bace250c1288ca662 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 21:34:22 -0700 Subject: [PATCH 153/165] lint --- tests/spec_decode/e2e/test_correctness.py | 1 - vllm/config.py | 10 ++++++---- vllm/engine/arg_utils.py | 13 +++++++------ vllm/engine/llm_engine.py | 6 ++++-- vllm/model_executor/layers/rejection_sampler.py | 2 +- vllm/spec_decode/batch_expansion.py | 11 +++++++---- vllm/spec_decode/metrics.py | 3 ++- vllm/spec_decode/spec_decode_worker.py | 2 +- 8 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 7b50046e8d20..103866a9755d 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -487,7 +487,6 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, # Required for spec decode. "use_v2_block_manager": True, - "disable_log_stats": False, }]) @pytest.mark.parametrize( diff --git a/vllm/config.py b/vllm/config.py index 177e9c68283c..801a94a003de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -712,10 +712,12 @@ def maybe_create_spec_config( max_logprobs=target_model_config.max_logprobs, ) - draft_model_config.max_model_len = (SpeculativeConfig._maybe_override_draft_max_model_len( - speculative_max_model_len, draft_model_config.max_model_len, target_model_config.max_model_len, - )) - + draft_model_config.max_model_len = ( + SpeculativeConfig._maybe_override_draft_max_model_len( + speculative_max_model_len, + draft_model_config.max_model_len, + target_model_config.max_model_len, + )) draft_parallel_config = ( SpeculativeConfig.create_draft_parallel_config( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fe1f205d84e1..11b710f599bc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -418,12 +418,13 @@ def add_cli_args( help='The number of speculative tokens to sample from ' 'the draft model in speculative decoding') - parser.add_argument('--speculative-max-model-len', - type=str, - default=EngineArgs.speculative_max_model_len, - help='The maximum sequence length supported by the ' - 'draft model. Sequences over this length will skip ' - 'speculation.') + parser.add_argument( + '--speculative-max-model-len', + type=str, + default=EngineArgs.speculative_max_model_len, + help='The maximum sequence length supported by the ' + 'draft model. 
Sequences over this length will skip ' + 'speculation.') parser.add_argument('--model-loader-extra-config', type=str, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2c7cc4c916c4..44d2650f1b84 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -549,13 +549,15 @@ def step(self) -> List[RequestOutput]: # TODO Integrate speculative metrics with Prometheus/stdout # stats logger. metrics = output[0].spec_decode_worker_metrics - logger.info("Speculative metrics: " + logger.info( + "Speculative metrics: " f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " f"System efficiency: {metrics.system_efficiency:.3f}, " f"Number of speculative tokens: {metrics.num_spec_tokens}, " f"Number of accepted tokens: {metrics.accepted_tokens}, " f"Number of draft tokens tokens: {metrics.draft_tokens}, " - f"Number of emitted tokens tokens: {metrics.emitted_tokens}.") + f"Number of emitted tokens tokens: {metrics.emitted_tokens}." + ) return request_outputs diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index d1521c589e21..c2c33d1340ed 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -307,7 +307,7 @@ def _create_output( # with causal acceptance. output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, bonus_token_ids, -1) - + # We disable bonus tokens because it causes corrupt KV cache for # proposal methods that require KV cache. We can fix it by "prefilling" # the bonus token in the proposer. diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 0720b7f2c101..3c77f34f5817 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -73,7 +73,10 @@ def score_proposals( proposal_token_ids_list = proposals.proposal_token_ids.tolist() # Filter the list to ignore -1 proposals. - proposal_token_ids_list_without_skips = [proposals for proposals in proposal_token_ids_list if -1 not in proposals] + proposal_token_ids_list_without_skips = [ + proposals for proposals in proposal_token_ids_list + if -1 not in proposals + ] (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) = self._expand_batch( @@ -165,7 +168,7 @@ def _contract_batch(self, contracted_bs: int, # Map distinct sequences used to score each token # of shape [batch_size * k + 1] back to [batch_size, k + 1]. expanded_batch_size, k = proposals.proposal_token_ids.shape - + # The number of tokens in the expanded batch used for speculation is # equal to the total expanded batch size minus the number of samples for # non-speculative sequences. 
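As an aside, a minimal sketch of the expand/contract step described in the comments above (standalone toy code with assumed shapes, not the vLLM implementation): each speculative sequence is scored as k + 1 independent rows, and contraction is just a reshape back to [batch, k + 1].

    import torch

    spec_bs, k, vocab_size = 2, 3, 11
    # The expanded batch has 2 * (3 + 1) = 8 rows, one per (sequence, position).
    flat_token_ids = torch.arange(spec_bs * (k + 1))                 # shape [8]
    flat_probs = torch.rand(spec_bs * (k + 1), vocab_size)           # shape [8, 11]
    # Contract back to per-sequence tensors.
    target_token_ids = flat_token_ids.reshape(spec_bs, k + 1)        # shape [2, 4]
    target_probs = flat_probs.reshape(spec_bs, k + 1, vocab_size)    # shape [2, 4, 11]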
@@ -174,8 +177,8 @@ def _contract_batch(self, contracted_bs: int, target_token_ids = target_token_ids.squeeze().reshape( spec_expanded_bs, k + 1) - target_probs = target_probs.squeeze().reshape( - spec_expanded_bs, k + 1, self._vocab_size) + target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1, + self._vocab_size) all_tokens = torch.full(size=(contracted_bs, k + 1), fill_value=-1, diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 9b17a12cc173..7926d75be72a 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -146,7 +146,8 @@ def _collect_rejsample_metrics( emitted_tokens = self._aggregate_num_emitted_tokens.item() draft_tokens = self._aggregate_num_draft_tokens - max_num_emitted_tokens = self.get_max_num_emitted_tokens(draft_tokens, k) + max_num_emitted_tokens = self.get_max_num_emitted_tokens( + draft_tokens, k) if draft_tokens > 0: draft_acceptance_rate = accepted_tokens / draft_tokens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 517d17d04f95..a5f49783f98e 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -293,7 +293,7 @@ def _verify_tokens( # Get probabilities according to proposal method. proposal_probs = proposals.proposal_probs[spec_indices] - + # Get proposed tokens. proposal_token_ids = proposals.proposal_token_ids[spec_indices] From 87e412891ed694b7192cecfe70d665915bd2a1f0 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 21:51:38 -0700 Subject: [PATCH 154/165] metrics log --- vllm/engine/llm_engine.py | 40 ++++++++++++++++++++++----------------- vllm/engine/metrics.py | 22 ++++++++++++++++++++- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 44d2650f1b84..540b421bf24d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -544,20 +544,8 @@ def step(self) -> List[RequestOutput]: # Log stats. if self.log_stats: - self.stat_logger.log(self._get_stats(scheduler_outputs)) - if output and output[0].spec_decode_worker_metrics is not None: - # TODO Integrate speculative metrics with Prometheus/stdout - # stats logger. - metrics = output[0].spec_decode_worker_metrics - logger.info( - "Speculative metrics: " - f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " - f"System efficiency: {metrics.system_efficiency:.3f}, " - f"Number of speculative tokens: {metrics.num_spec_tokens}, " - f"Number of accepted tokens: {metrics.accepted_tokens}, " - f"Number of draft tokens tokens: {metrics.draft_tokens}, " - f"Number of emitted tokens tokens: {metrics.emitted_tokens}." - ) + self.stat_logger.log( + self._get_stats(scheduler_outputs, model_output=output)) return request_outputs @@ -566,9 +554,18 @@ def do_log_stats(self) -> None: if self.log_stats: self.stat_logger.log(self._get_stats(scheduler_outputs=None)) - def _get_stats(self, - scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: - """Get Stats to be Logged to Prometheus.""" + def _get_stats( + self, + scheduler_outputs: Optional[SchedulerOutputs], + model_output: Optional[List[SamplerOutput]] = None) -> Stats: + """Get Stats to be Logged to Prometheus. + + Args: + scheduler_outputs: Optional, used to populate metrics related to + the scheduled batch, + model_output: Optional, used to emit speculative decoding metrics + which are created by the workers. + """ now = time.time() # KV Cache Usage in %. 
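For intuition, a small worked example of the speculative-decoding metrics these stats carry (toy numbers, assumed purely for illustration; the formulas mirror get_max_num_emitted_tokens and _collect_rejsample_metrics in vllm/spec_decode/metrics.py):

    k = 5
    draft_tokens, accepted_tokens, emitted_tokens = 50, 30, 38
    # 50 draft tokens at k=5 means 10 speculated sequences; each could emit at
    # most k accepted tokens plus one bonus token.
    max_num_emitted_tokens = (draft_tokens // k) * (k + 1)        # 10 * 6 = 60
    draft_acceptance_rate = accepted_tokens / draft_tokens        # 0.60
    system_efficiency = emitted_tokens / max_num_emitted_tokens   # ~0.63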
@@ -625,6 +622,14 @@ def _get_stats(self, time_to_first_tokens = time_last_iters if prompt_run else [] time_per_output_tokens = [] if prompt_run else time_last_iters + # Spec decode, if enabled, emits specialized metrics from the worker in + # sampler output. + if model_output and (model_output[0].spec_decode_worker_metrics + is not None): + spec_decode_metrics = model_output[0].spec_decode_worker_metrics + else: + spec_decode_metrics = None + return Stats( now=now, num_running=num_running, @@ -637,6 +642,7 @@ def _get_stats(self, time_to_first_tokens=time_to_first_tokens, time_per_output_tokens=time_per_output_tokens, time_e2e_requests=time_e2e_requests, + spec_decode_metrics=spec_decode_metrics, ) def add_lora(self, lora_request: LoRARequest) -> bool: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 04e27e69ce0f..66fe12ab77d3 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,6 +1,6 @@ import time from dataclasses import dataclass -from typing import Dict, List, Protocol +from typing import TYPE_CHECKING, Dict, List, Optional, Protocol import numpy as np from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, @@ -8,6 +8,9 @@ from vllm.logger import init_logger +if TYPE_CHECKING: + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + logger = init_logger(__name__) disable_created_metrics() @@ -118,6 +121,8 @@ class Stats: time_per_output_tokens: List[float] time_e2e_requests: List[float] + spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + class SupportsMetricsInfo(Protocol): @@ -235,3 +240,18 @@ def log(self, stats: Stats) -> None: self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now + + if stats.spec_decode_metrics is not None: + logger.info( + self._format_spec_decode_metrics_str( + stats.spec_decode_metrics)) + + def _format_spec_decode_metrics_str( + self, metrics: Optional["SpecDecodeWorkerMetrics"]) -> str: + return ("Speculative metrics: " + f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " + f"System efficiency: {metrics.system_efficiency:.3f}, " + f"Number of speculative tokens: {metrics.num_spec_tokens}, " + f"Number of accepted tokens: {metrics.accepted_tokens}, " + f"Number of draft tokens tokens: {metrics.draft_tokens}, " + f"Number of emitted tokens tokens: {metrics.emitted_tokens}.") From 75b271efebd2a770f74b0d172a3eae4ada906c3c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 21:55:40 -0700 Subject: [PATCH 155/165] fix --- tests/spec_decode/test_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 36e91672069d..312878804b86 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -119,7 +119,7 @@ def test_initial_metrics_has_correct_values(has_data: bool): num_draft_tokens = 0 k = 5 - num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens( num_draft_tokens, k) rej_sampler = MagicMock() @@ -153,7 +153,7 @@ def test_initial_metrics_has_correct_values(has_data: bool): assert (metrics.draft_acceptance_rate == num_accepted_tokens / num_draft_tokens) assert (metrics.system_efficiency == num_emitted_tokens / - num_possible_tokens) + max_num_emitted_tokens) else: assert math.isnan(metrics.draft_acceptance_rate) assert math.isnan(metrics.system_efficiency) From f274ed78e7781b3072633ea0f41068cebfd800fd Mon 
Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 22:35:05 -0700 Subject: [PATCH 156/165] sampler cleanup --- tests/spec_decode/test_spec_decode_worker.py | 26 +- tests/spec_decode/utils.py | 7 +- vllm/model_executor/layers/sampler.py | 309 ++++++++++++------- vllm/spec_decode/spec_decode_worker.py | 26 ++ 4 files changed, 244 insertions(+), 124 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index edda1d0aaecc..d24d726c9c0c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -63,8 +63,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int): """Verify SpecDecodeWorker calls the target model with correct inputs. Everything else is mocked out. """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() + draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) + target_worker = mock_worker(use_spec=False) rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) @@ -145,8 +145,10 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): """ vocab_size = 32_000 - draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) - target_worker = mock_worker(vocab_size=vocab_size) + draft_worker = mock_worker(cls=MultiStepWorker, + vocab_size=vocab_size, + use_spec=False) + target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) @@ -224,8 +226,10 @@ def test_correctly_formats_output(k: int, batch_size: int): """ vocab_size = 32_000 - draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) - target_worker = mock_worker(vocab_size=vocab_size) + draft_worker = mock_worker(cls=MultiStepWorker, + vocab_size=vocab_size, + use_spec=False) + target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) @@ -336,8 +340,10 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): """ vocab_size = 32_000 - draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) - target_worker = mock_worker(vocab_size=vocab_size) + draft_worker = mock_worker(cls=MultiStepWorker, + vocab_size=vocab_size, + use_spec=False) + target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) @@ -500,8 +506,8 @@ def test_init_device(): """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as well as other GPU initialization. 
""" - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() + draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) + target_worker = mock_worker(use_spec=False) rejection_sampler = MagicMock(spec=RejectionSampler) rejection_sampler.token_id_dtype = torch.int64 metrics_collector = MagicMock(spec=AsyncMetricsCollector) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d04b6029493f..4f8295d25cf4 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -63,11 +63,14 @@ def create_execute_model_data( def mock_worker(cls=None, vocab_size: int = 30_000, max_model_len: int = 2048, - rank: int = 0) -> MagicMock: + rank: int = 0, + use_spec: bool = True) -> MagicMock: if cls is None: cls = Worker - worker = MagicMock(spec=cls) + spec = cls if use_spec else None + + worker = MagicMock(spec=spec) worker.vocab_size = vocab_size worker.max_model_len = max_model_len worker.rank = rank diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index e97941f01ac9..0fa01047ec36 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn +from vllm.model_executor.layers.ops.sample import sample as sample_triton from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType @@ -36,7 +37,11 @@ class Sampler(nn.Module): def __init__(self): super().__init__() - self._include_gpu_probs_tensor = True + + # Whether or not the SamplerOutput should have on-device tensors + # containing the sampled token ids and probabilities. This is used by + # speculative decoding. + self.include_gpu_probs_tensor = False def forward( self, @@ -82,14 +87,45 @@ def forward( logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) # Sample the next tokens. - sample_results, sampled_tokens_tensor = _sample( - probs, logprobs, sampling_metadata, sampling_tensors) + sample_results, maybe_sampled_tokens_tensor = _sample( + probs, + logprobs, + sampling_metadata, + sampling_tensors, + include_gpu_probs_tensor=self.include_gpu_probs_tensor, + modify_greedy_probs=self._should_modify_greedy_probs_inplace, + ) + + if self.include_gpu_probs_tensor: + assert maybe_sampled_tokens_tensor is not None + sampled_tokens_tensor = maybe_sampled_tokens_tensor + on_device_tensors = (probs, sampled_tokens_tensor) + else: + on_device_tensors = None + # Get the logprobs query results. prompt_logprobs, sample_logprobs = _get_logprobs( logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs, - (probs, sampled_tokens_tensor)) + return _build_sampler_output(sample_results, + sampling_metadata, + prompt_logprobs, + sample_logprobs, + on_device_tensors=on_device_tensors) + + @property + def _should_modify_greedy_probs_inplace(self) -> bool: + """Whether or not the sampler should modify the probability distribution + of greedily-sampled tokens such that multinomial sampling would sample + the greedily-sampled token. + + In other words, if True then we set the probability of the greedily- + sampled token to 1. + + This is used by speculative decoding, which requires that the sampling + method be encoded into the probability distribution. + """ + # Modify greedy probs if include_gpu_probs_tensor is set. 
+ return self.include_gpu_probs_tensor def _get_bin_counts_and_mask( @@ -363,7 +399,9 @@ def _sample_with_torch( probs: torch.Tensor, logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, -) -> List[Tuple[List[int], List[int]]]: + include_gpu_probs_tensor: bool, + modify_greedy_probs: bool, +) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]: categorized_seq_group_ids = {t: [] for t in SamplingType} categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): @@ -375,10 +413,14 @@ def _sample_with_torch( sample_metadata = {} multinomial_samples = {} - sampled_token_ids_tensor = torch.empty(logprobs.shape[0], - 1, - dtype=torch.long, - device=logprobs.device) + # Create output tensor for sampled token ids. + if include_gpu_probs_tensor: + sampled_token_ids_tensor = torch.empty(logprobs.shape[0], + 1, + dtype=torch.long, + device=logprobs.device) + else: + sampled_token_ids_tensor = None # Counterintiutively, having two loops here is actually faster. # The first loop can run without waiting on GPU<->CPU sync. @@ -392,17 +434,24 @@ def _sample_with_torch( is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] sample_metadata[sampling_type] = (seq_group_ids, seq_groups, is_prompts, sample_indices) - if sampling_type == SamplingType.GREEDY: - s_i = sample_indices.long() - greedy_samples = torch.argmax(logprobs[s_i], dim=-1) + long_sample_indices = sample_indices.long() - # TODO clean up - # self._include_gpu_probs_tensor - logprobs[s_i, :] = -float('inf') - logprobs[s_i, greedy_samples] = 0.0 - probs[s_i, :] = 0 - probs[s_i, greedy_samples] = 1.0 - sampled_token_ids_tensor[s_i] = greedy_samples.unsqueeze(-1) + if sampling_type == SamplingType.GREEDY: + greedy_samples = torch.argmax(logprobs[long_sample_indices], + dim=-1) + + if include_gpu_probs_tensor: + # Store sampled tokens in output tensor. + sampled_token_ids_tensor[ + long_sample_indices] = greedy_samples.unsqueeze(-1) + + if modify_greedy_probs: + # If required, modify the probabilities such that sampling from + # the modified distribution would always sample the argmax + # token id. + _modify_greedy_probs_inplace(logprobs, probs, + long_sample_indices, + greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_best_of_in_batch = 1 @@ -416,19 +465,22 @@ def _sample_with_torch( "generators": sampling_metadata.generators, } - s_i = sample_indices.long() + multinomial_samples[sampling_type] = _multinomial( + probs[long_sample_indices], max_best_of_in_batch, + **seeded_args) - mn_samples = _multinomial(probs[s_i], max_best_of_in_batch, - **seeded_args) - multinomial_samples[sampling_type] = mn_samples + if include_gpu_probs_tensor: + # Store sampled tokens in output tensor. + sampled_token_ids_tensor[ + long_sample_indices] = multinomial_samples[sampling_type] - sampled_token_ids_tensor[s_i] = mn_samples elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: raise ValueError(f"Unsupported sampling type: {sampling_type}") # GPU<->CPU sync happens in the loop below. + # This also converts the sample output to Python objects. 
for sampling_type in SamplingType: if sampling_type not in sample_metadata: @@ -454,93 +506,98 @@ def _sample_with_torch( return sample_results, sampled_token_ids_tensor -#def _sample_with_triton_kernel( -# probs: torch.Tensor, -# logprobs: torch.Tensor, -# sampling_metadata: SamplingMetadata, -# sampling_tensors: SamplingTensors, -#) -> List[Tuple[List[int], List[int]]]: -# categorized_seq_group_ids = {t: [] for t in SamplingType} -# categorized_sample_indices = sampling_metadata.categorized_sample_indices -# for i, seq_group in enumerate(sampling_metadata.seq_groups): -# _, sampling_params = seq_group -# sampling_type = sampling_params.sampling_type -# categorized_seq_group_ids[sampling_type].append(i) -# -# sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} -# sample_metadata = {} -# max_best_of_in_batch = 1 -# -# # Counterintiutively, having two loops here is actually faster. -# # The first loop can run without waiting on GPU<->CPU sync. -# for sampling_type in SamplingType: -# sample_indices = categorized_sample_indices[sampling_type][:, 0] -# sampled_token_indices = categorized_sample_indices[sampling_type][:, 1] -# num_tokens = len(sample_indices) -# if num_tokens == 0: -# continue -# seq_group_ids = categorized_seq_group_ids[sampling_type] -# seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids] -# is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] -# sample_metadata[sampling_type] = (seq_group_ids, seq_groups, -# is_prompts, sample_indices, -# sampled_token_indices) -# if sampling_type in (SamplingType.GREEDY, SamplingType.RANDOM, -# SamplingType.RANDOM_SEED): -# for seq_group, is_prompt in zip(seq_groups, is_prompts): -# if is_prompt: -# _, sampling_params = seq_group -# max_best_of_in_batch = max(max_best_of_in_batch, -# sampling_params.best_of) -# elif sampling_type == SamplingType.BEAM: -# beam_search_logprobs = logprobs[sample_indices] -# else: -# raise ValueError(f"Unsupported sampling type: {sampling_type}") -# -# sampled_tokens, _, _ = sample_triton( -# probs=probs, -# seeds=sampling_tensors.sampling_seeds, -# max_best_of=max_best_of_in_batch, -# sample_indices=sampling_tensors.sample_indices, -# logprobs=logprobs, -# # don't save logprobs because we have logic for that below -# # TODO: use this instead of the CPU-based logic below -# save_logprobs=False, -# ) -# -# # GPU<->CPU sync happens in the loop below. 
-# -# for sampling_type in SamplingType: -# if sampling_type not in sample_metadata: -# continue -# (seq_group_ids, seq_groups, is_prompts, sample_indices, -# sampled_token_indices) = sample_metadata[sampling_type] -# if sampling_type == SamplingType.GREEDY: -# sample_results = _greedy_sample( -# seq_groups, sampled_tokens[sampled_token_indices][:, 0]) -# elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): -# sample_results = _random_sample( -# seq_groups, is_prompts, sampled_tokens[sampled_token_indices]) -# elif sampling_type == SamplingType.BEAM: -# sample_results = _beam_search_sample(seq_groups, is_prompts, -# sampling_metadata.seq_data, -# beam_search_logprobs) -# sample_results_dict.update(zip(seq_group_ids, sample_results)) -# -# sample_results = [ -# sample_results_dict[i] -# for i in range(len(sampling_metadata.seq_groups)) -# ] -# return sample_results - - -def _sample( +def _sample_with_triton_kernel( probs: torch.Tensor, logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors, ) -> List[Tuple[List[int], List[int]]]: - return _sample_with_torch(probs, logprobs, sampling_metadata) + categorized_seq_group_ids = {t: [] for t in SamplingType} + categorized_sample_indices = sampling_metadata.categorized_sample_indices + for i, seq_group in enumerate(sampling_metadata.seq_groups): + _, sampling_params = seq_group + sampling_type = sampling_params.sampling_type + categorized_seq_group_ids[sampling_type].append(i) + + sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} + sample_metadata = {} + max_best_of_in_batch = 1 + + # Counterintiutively, having two loops here is actually faster. + # The first loop can run without waiting on GPU<->CPU sync. + for sampling_type in SamplingType: + sample_indices = categorized_sample_indices[sampling_type][:, 0] + sampled_token_indices = categorized_sample_indices[sampling_type][:, 1] + num_tokens = len(sample_indices) + if num_tokens == 0: + continue + seq_group_ids = categorized_seq_group_ids[sampling_type] + seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids] + is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] + sample_metadata[sampling_type] = (seq_group_ids, seq_groups, + is_prompts, sample_indices, + sampled_token_indices) + if sampling_type in (SamplingType.GREEDY, SamplingType.RANDOM, + SamplingType.RANDOM_SEED): + for seq_group, is_prompt in zip(seq_groups, is_prompts): + if is_prompt: + _, sampling_params = seq_group + max_best_of_in_batch = max(max_best_of_in_batch, + sampling_params.best_of) + elif sampling_type == SamplingType.BEAM: + beam_search_logprobs = logprobs[sample_indices] + else: + raise ValueError(f"Unsupported sampling type: {sampling_type}") + + sampled_tokens, _, _ = sample_triton( + probs=probs, + seeds=sampling_tensors.sampling_seeds, + max_best_of=max_best_of_in_batch, + sample_indices=sampling_tensors.sample_indices, + logprobs=logprobs, + # don't save logprobs because we have logic for that below + # TODO: use this instead of the CPU-based logic below + save_logprobs=False, + ) + + # GPU<->CPU sync happens in the loop below. 
+ + for sampling_type in SamplingType: + if sampling_type not in sample_metadata: + continue + (seq_group_ids, seq_groups, is_prompts, sample_indices, + sampled_token_indices) = sample_metadata[sampling_type] + if sampling_type == SamplingType.GREEDY: + sample_results = _greedy_sample( + seq_groups, sampled_tokens[sampled_token_indices][:, 0]) + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): + sample_results = _random_sample( + seq_groups, is_prompts, sampled_tokens[sampled_token_indices]) + elif sampling_type == SamplingType.BEAM: + sample_results = _beam_search_sample(seq_groups, is_prompts, + sampling_metadata.seq_data, + beam_search_logprobs) + sample_results_dict.update(zip(seq_group_ids, sample_results)) + + sample_results = [ + sample_results_dict[i] + for i in range(len(sampling_metadata.seq_groups)) + ] + return sample_results + + +def _sample( + probs: torch.Tensor, logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors, + include_gpu_probs_tensor: bool, modify_greedy_probs: bool +) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]: + return _sample_with_torch( + probs, + logprobs, + sampling_metadata, + include_gpu_probs_tensor=include_gpu_probs_tensor, + modify_greedy_probs=modify_greedy_probs, + ) # TODO: Enable once Triton kernel & associated code is faster. # return _sample_with_triton_kernel(probs, logprobs, sampling_metadata, @@ -704,13 +761,36 @@ def _get_logprobs( return result_prompt_logprobs, result_sample_logprobs +def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, + sample_indices: torch.Tensor, + greedy_samples: torch.Tensor) -> None: + """Modify the probability distributions of the greedily-sampled tokens such + that each sampled token has a "probability" of 1.0. This is required by + speculative decoding, which depends on the sampling method being encoded + within the probability distribution for correctness. + """ + logprobs[sample_indices, :] = -float('inf') + logprobs[sample_indices, greedy_samples] = 0.0 + probs[sample_indices, :] = 0 + probs[sample_indices, greedy_samples] = 1.0 + + def _build_sampler_output( sample_results: List[Tuple[List[int], List[int]]], sampling_metadata: SamplingMetadata, prompt_logprobs: List[Optional[PromptLogprobs]], sample_logprobs: List[SampleLogprobs], - spec_decode_data, + on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor]], ) -> SamplerOutput: + """Construct Python objects with the output of sampling. + + Args: + on_device_tensors: Tuple containing on-device tensors with the + probabilities used in sampling and the sampled token ids. This + allows post-processing without copies to CPU/serialization, e.g. in + speculative decoding rejection sampling. + """ + sampler_output = [] for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, @@ -727,9 +807,14 @@ def _build_sampler_output( sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - probs, token_ids = spec_decode_data + # If not specified, store None values in SamplerOutput. 
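+    # (on_device_tensors is None on the default, non-speculative path, where
+    # include_gpu_probs_tensor is left False and only the Python-object outputs
+    # above are returned.)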
+ if on_device_tensors is not None: + sampled_token_probs, sampled_token_ids = on_device_tensors + else: + sampled_token_probs, sampled_token_ids = (None, None) + return SamplerOutput( outputs=sampler_output, - sampled_token_probs=probs, - sampled_token_ids=token_ids, + sampled_token_probs=sampled_token_probs, + sampled_token_ids=sampled_token_ids, ) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a5f49783f98e..38e529faba3d 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -110,6 +110,32 @@ def init_device(self) -> None: device=self.device, vocab_size=self._vocab_size) + self._configure_model_sampler_for_spec_decode() + + def _configure_model_sampler_for_spec_decode(self): + """Configure model sampler to emit GPU tensors. This allows spec decode + to keep data on device without transferring to CPU and serializing, + which significantly reduces overhead of rejection sampling. + + NOTE(cade): This breaks abstraction boundaries pretty badly. The better + design is to have the "move to CPU and serialize" sampling decision be + done outside of the model/sampler; this way the "last-mile" worker + object which interfaces with the scheduler can serialize and incur the + performance hit as necessary. This allows us to run the worker several + iterations in a row without incurring the "move to CPU and serialize" + performance penalty. + + Since this requires a large change to vLLM, we defer it to later and + temporarily accept this broken abstraction boundary. + + NOTE(cade): This will require a special check if the proposer worker + does not have a sampler (e.g. ngram speculation). + """ + (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor + ) = True + (self.proposer_worker.model_runner.model.sampler. + include_gpu_probs_tensor) = True + def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of cache blocks to use. From e9c627147d48873658fa7d9e7185f578f63f8c18 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 23:16:39 -0700 Subject: [PATCH 157/165] clean --- .../layers/rejection_sampler.py | 22 ++++++++----------- vllm/model_executor/layers/sampler.py | 1 - vllm/spec_decode/metrics.py | 3 ++- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index c2c33d1340ed..5edbbf2c70a4 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -310,7 +310,7 @@ def _create_output( # We disable bonus tokens because it causes corrupt KV cache for # proposal methods that require KV cache. We can fix it by "prefilling" - # the bonus token in the proposer. + # the bonus token in the proposer. The following issue tracks the fix. 
# https://github.com/vllm-project/vllm/issues/4212 output_with_bonus_tokens[:, -1] = -1 @@ -337,20 +337,16 @@ def _raise_if_incorrect_shape( draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - try: - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" + assert draft_batch_size == target_batch_size + assert num_draft_probs == num_target_probs + assert (draft_vocab_size == target_vocab_size + ), f"{draft_vocab_size=} {target_vocab_size=}" - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs + assert draft_token_ids_batch_size == draft_batch_size + assert num_draft_token_ids == num_draft_probs - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - except: - #breakpoint() - raise + assert bonus_batch_size == target_batch_size + assert num_bonus_tokens == self._num_bonus_tokens def _raise_if_incorrect_dtype( self, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 0fa01047ec36..b6199137aa3a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -488,7 +488,6 @@ def _sample_with_torch( seq_group_ids, seq_groups, is_prompts, sample_indices = sample_metadata[ sampling_type] if sampling_type == SamplingType.GREEDY: - # This merely serializes the samples. sample_results = _greedy_sample(seq_groups, greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): sample_results = _random_sample(seq_groups, is_prompts, diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 7926d75be72a..c175279c18c5 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -178,7 +178,8 @@ def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int: """ # Determine the number of sequences that have been speculated on. Since # the batch size can be variable, we divide by k. - total_num_spec_seqs = int(draft_tokens / k) + assert draft_tokens % k == 0 + total_num_spec_seqs = draft_tokens // k # A single sequence may emit k accepted tokens and one bonus token in # the best case. From fa2ff3e8e86c6b23d71692e675ac778c593a851f Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 23:44:06 -0700 Subject: [PATCH 158/165] some docs on testing methodology --- tests/spec_decode/e2e/test_correctness.py | 120 ++++++++++++---------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 103866a9755d..43a63d072367 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -1,3 +1,33 @@ +"""The tests in this file verify end-to-end speculative decoding correctness. + +This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. 
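+For illustration, the greedy-equality check used throughout this file boils
+down to the following sketch (a hypothetical standalone helper, assuming each
+generator yields a (token_ids, text) pair per prompt; the real tests call
+run_greedy_equality_correctness_test):
+
+    def assert_greedy_equality(baseline_outputs, spec_outputs):
+        for (base_ids, _), (spec_ids, _) in zip(baseline_outputs, spec_outputs):
+            assert base_ids == spec_ids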
This gives us good coverage of temp=0. + +For temp>0, we rely on unit tests on the rejection sampler to verify that the +output distribution is the same with spec decode vs. no spec decode (this would +be prohibitively expensive to run with a real model). + +NOTE: Speculative decoding's distribution equality requires that the measured +distributions of the target model and proposal model be deterministic given the +same input. vLLM largely guarantees this. + +@cadedaniel has seen cases where the output probabilities of a draft/target +model change slightly with certain batch sizes or prompts, even with Torch +determinism flags set. It is unclear if this is a bug in vLLM, due to non- +determinism in on-device batched operations, a bug in vLLM's spec decode +implementation, or the "hardware numerics" limitations. Either way, rejection +sampling ensures the output distribution matches the target model, but it breaks +greedy-equality tests for those batch sizes/prompts. +""" + from itertools import cycle from typing import List, Tuple @@ -120,6 +150,11 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality on a tiny model with batch size of one. + + Since this test is cheaper than other e2e correctness tests, we generate + with a higher output_len. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -155,7 +190,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( @pytest.mark.parametrize( "test_llm_kwargs", [ - # Try two different num spec tokens. { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, @@ -172,6 +206,8 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality on a tiny model and large batch size. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -215,6 +251,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( baseline_llm_generator, test_llm_generator, batch_size: int, max_output_len: int): + """Verify greedy equality on a tiny model, with a large batch size, and when + sampling respects the EOS token. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -256,6 +295,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( def test_spec_decode_e2e_greedy_correctness_real_model_bs1( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality on a "real" model and batch size of 1. This is + separate from large BS tests to make identifying the source of bugs easier. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -297,6 +339,9 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality with a "real" model on a nontrivial batch size. + This is the closest test to a real production workload. 
+ """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -341,6 +386,9 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( def test_spec_decode_e2e_greedy_correctness_with_preemption( baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -392,6 +440,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( def test_spec_decode_different_block_size(baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): + """Verify greedy equality over different block sizes. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -416,6 +466,9 @@ def test_spec_decode_different_block_size(baseline_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + + # Artificially limit the draft model max model len; this forces vLLM to + # skip speculation once the sequences grow beyond 32-k tokens. "speculative_max_model_len": 32, }, ]) @@ -423,14 +476,18 @@ def test_spec_decode_different_block_size(baseline_llm_generator, @pytest.mark.parametrize( "output_len", [ - # Use smaller output len for fast test. + # This must be a good bit larger than speculative_max_model_len so that + # we can test the case where all seqs are skipped, but still small to + # ensure fast test. 64, ]) @pytest.mark.parametrize("seed", [1]) def test_skip_speculation(baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - """Verify correct output when we skip speculation. - Test skip 1, skip >1, skip all. + """Verify greedy equality when some (or all) sequences skip speculation. + We do this by setting the max model len of the draft model to an + artificially low value, such that when the sequences grow beyond it, they + are skipped in speculative decoding. """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, @@ -472,54 +529,9 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("seed", [1]) def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int, output_len: int): - run_greedy_equality_correctness_test(baseline_llm_generator, - test_llm_generator, - batch_size, - max_output_len=output_len, - force_output_len=True) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True, - "disable_log_stats": False, - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model": "JackFram/llama-68m", - }, - #{ - # "model": "JackFram/llama-160m", - #}, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use long output len for the small model test. 
- #1536, - 128, - ]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize("seed", [1]) -def test_wip_validate_acceptance_rate(baseline_llm_generator, - test_llm_generator, batch_size: int, - output_len: int): + """Verify that speculative decoding produces exact equality to without spec + decode with many different values of k. + """ run_greedy_equality_correctness_test(baseline_llm_generator, test_llm_generator, batch_size, @@ -533,6 +545,10 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, max_output_len, force_output_len: bool, print_tokens: bool = False): + """Helper method that compares the outputs of both the baseline LLM and + the test LLM. It asserts greedy equality, e.g. that the outputs are exactly + the same when temperature is zero. + """ temperature = 0.0 prompts = [ From 1676607932b374a6c6572e86a35199f00a475b8c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Sun, 21 Apr 2024 23:47:40 -0700 Subject: [PATCH 159/165] fix --- tests/samplers/test_sampler.py | 3 +- tests/spec_decode/e2e/test_correctness.py | 34 +++++++++++------------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index dbbe13b8da06..52a2b0ca52aa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -636,7 +636,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): def mock_sample(probs, *args, **kwargs): nonlocal sample_probs sample_probs = probs - return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs] + return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] + for prob in probs], None) with patch("vllm.model_executor.layers.sampler._sample", mock_sample): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 43a63d072367..643dde18340f 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -187,14 +187,12 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - ]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) @pytest.mark.parametrize( "output_len", [ @@ -462,16 +460,18 @@ def test_spec_decode_different_block_size(baseline_llm_generator, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, - # Artificially limit the draft model max model len; this forces vLLM to - # skip speculation once the sequences grow beyond 32-k tokens. - "speculative_max_model_len": 32, - }, -]) + # Artificially limit the draft model max model len; this forces vLLM + # to skip speculation once the sequences grow beyond 32-k tokens. 
+ "speculative_max_model_len": 32, + }, + ]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize( "output_len", From 5a51b82dabaf141c717167a126652ca951fd9380 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 00:13:53 -0700 Subject: [PATCH 160/165] mypy fix --- vllm/engine/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 66fe12ab77d3..25e96f6c7eaf 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -247,7 +247,8 @@ def log(self, stats: Stats) -> None: stats.spec_decode_metrics)) def _format_spec_decode_metrics_str( - self, metrics: Optional["SpecDecodeWorkerMetrics"]) -> str: + self, metrics: "SpecDecodeWorkerMetrics") -> str: + return ("Speculative metrics: " f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " f"System efficiency: {metrics.system_efficiency:.3f}, " From c3d619e25dbcf341b76b7079dbd4e00232ce8862 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 00:41:02 -0700 Subject: [PATCH 161/165] rejection sampler test fix --- tests/samplers/test_rejection_sampler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index d2c3a798d308..13b5b80cccfd 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -91,12 +91,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, bonus_token_ids, ) + # Bonus tokens are currently disabled. Verify they're set to -1. + # See https://github.com/vllm-project/vllm/issues/4212 + expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1 + if which_tokens_accepted == "all_tokens_accepted": # Expect all tokens to be equal to draft tokens. assert torch.equal(output_token_ids[:, :-1], draft_token_ids) # Expect all bonus tokens to be included. - assert torch.equal(output_token_ids[:, -1:], bonus_token_ids) + assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids) elif which_tokens_accepted == "no_tokens_accepted": # Expect first token to be equal to recovered tokens. assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0]) @@ -106,7 +110,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, torch.ones_like(output_token_ids[:, 1:]) * -1) elif which_tokens_accepted == "some_tokens_accepted": recovered_plus_bonus = torch.cat( - (recovered_token_ids, bonus_token_ids), dim=-1) + (recovered_token_ids, expected_bonus_token_ids), dim=-1) # Assert first rejected token is a recovered token or bonus token. assert torch.equal( recovered_plus_bonus[torch.arange(0, batch_size), From 7bfe6dd22b791af967fd9a681da11649ab38808c Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 13:34:43 -0700 Subject: [PATCH 162/165] pr feedback --- vllm/config.py | 15 +++++++++++ vllm/engine/arg_utils.py | 1 + vllm/model_executor/layers/sampler.py | 37 +++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 96f960e1182e..1e2005792e4a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -656,6 +656,7 @@ def maybe_create_spec_config( speculative_model: Optional[str], num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], + enable_chunked_prefill: bool, ) -> Optional["SpeculativeConfig"]: """Create a SpeculativeConfig if possible, else return None. 
@@ -694,6 +695,11 @@ def maybe_create_spec_config( assert (speculative_model is not None and num_speculative_tokens is not None) + if enable_chunked_prefill: + raise ValueError( + "Speculative decoding and chunked prefill are " + "currently mutually exclusive ({enable_chunked_prefill=}).") + # TODO: The user should be able to specify revision/quantization/max # model len for the draft model. It is not currently supported. draft_revision = None @@ -754,6 +760,15 @@ def _maybe_override_draft_max_model_len( """ if speculative_max_model_len is not None: + + if speculative_max_model_len > draft_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {draft_max_model_len=}") + + if speculative_max_model_len > target_max_model_len: + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {target_max_model_len=}") + return speculative_max_model_len return min( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2f1b492f7fd5..215f3023bc6c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -491,6 +491,7 @@ def create_engine_config(self, ) -> EngineConfig: speculative_model=self.speculative_model, num_speculative_tokens=self.num_speculative_tokens, speculative_max_model_len=self.speculative_max_model_len, + enable_chunked_prefill=self.enable_chunked_prefill, ) scheduler_config = SchedulerConfig( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b6199137aa3a..c4b11cb33a67 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -767,6 +767,43 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, that each sampled token has a "probability" of 1.0. This is required by speculative decoding, which depends on the sampling method being encoded within the probability distribution for correctness. + + # Why do we only need to do this for greedy sampling? + + vLLM's sampler performs the following steps for greedy or multinomial + (random) sampling: + 1. Get logits from model. + 2. Modify logits according to per-sequence sampling parameters. + - Multiply by temperature, top-k and top-p masking, penalize tokens + according to their frequency, etc. + 3. Sample a token. + - Random sampling simply samples from the modified probability + distribution. + - Greedy sampling performs `argmax` to obtain the token with the + highest likelihood. + + Ignoring greedy sampling for a moment, we find that the computed probability + distribution has the following property: we can sample from it independently + and find that the token sampled by the Sampler has a frequency corresponding + to how often we see it in our sampling. In other words, for tokens sampled + with vLLM's random SamplingType, the computed probability distribution + encodes the sampling methodology completely. + + Greedy sampling does not normally have this property. vLLM modifies logits + according to sampling params, then performs `argmax`, then returns the + sampled token and the computed probability distribution. If we sample from + the distribution, we'll find the likelihood of the greedily-sampled token + is not always 1.0. + + Since lossless speculative decoding requires that the sampling methodology + be encoded within the probability distribution, we are motivated to modify + the probability distribution such that the sampled token has probability 1 + when speculative decoding is used. 
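+
+    A minimal sketch of the rejection-sampling acceptance rule that motivates
+    this (illustrative only; the names p and q, and the use of torch, are
+    assumptions for the sketch rather than vLLM symbols):
+
+        # p, q: probabilities the target and draft models assign to each
+        # draft token (1D torch tensors of equal length). A draft token is
+        # accepted with probability min(1, p / q). With a one-hot p, every
+        # non-argmax draft token has p == 0 and is always rejected, so the
+        # final output still matches greedy decoding of the target model.
+        accept_prob = torch.clamp(p / q, max=1.0)
+        accepted = torch.rand_like(p) < accept_prob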
+ + NOTE: Alternatively, we could use an extremely low temperature to achieve + greedy sampling using multinomial computation and unite the codepaths. This + has implications on the overall design of the sampler, e.g. how to record + accurate logprobs for the user, so this improvement is deferred to later. """ logprobs[sample_indices, :] = -float('inf') logprobs[sample_indices, greedy_samples] = 0.0 From c38aa973f4c1e1b05355ba57165c9cb8e77a6559 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 14:17:33 -0700 Subject: [PATCH 163/165] break compatibility tests into own file --- tests/spec_decode/e2e/__init__.py | 0 tests/spec_decode/e2e/conftest.py | 16 ++ tests/spec_decode/e2e/test_compatibility.py | 199 ++++++++++++++++++++ tests/spec_decode/e2e/test_correctness.py | 67 +------ vllm/config.py | 14 +- vllm/engine/arg_utils.py | 1 + 6 files changed, 231 insertions(+), 66 deletions(-) create mode 100644 tests/spec_decode/e2e/__init__.py create mode 100644 tests/spec_decode/e2e/test_compatibility.py diff --git a/tests/spec_decode/e2e/__init__.py b/tests/spec_decode/e2e/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9f9001511ec..59fb8311fc5b 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,3 +1,5 @@ +from typing import List, Tuple + import pytest from tests.conftest import cleanup @@ -48,3 +50,17 @@ def generator_outer(): del llm return generator_outer + + +def get_output_from_llm_generator( + llm_generator, prompts, + sampling_params) -> Tuple[List[str], List[List[int]]]: + tokens = [] + token_ids = [] + for llm in llm_generator(): + outputs = llm.generate(prompts, sampling_params, use_tqdm=True) + token_ids = [output.outputs[0].token_ids for output in outputs] + tokens = [output.outputs[0].text for output in outputs] + del llm + + return tokens, token_ids diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py new file mode 100644 index 000000000000..b35d08bcb71b --- /dev/null +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -0,0 +1,199 @@ +import pytest + +from vllm import SamplingParams + +from .conftest import get_output_from_llm_generator + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + # Expect failure as spec decode not supported by + # Ray backend. + "worker_use_ray": True, + }, + ]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_ray(test_llm_generator): + """Verify that speculative decoding with Ray fails. + """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(AssertionError, + match="Speculative decoding not yet supported for "): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. 
+ "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "enable_chunked_prefill": True, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_chunked_prefill(test_llm_generator): + """Verify that speculative decoding with chunked prefill fails. + """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(ValueError, + match="Speculative decoding and chunked prefill"): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "meta-llama/Llama-2-7b-chat-hf", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True + }]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [ + { + # Speculative max model len > overridden max model len should raise. + "max_model_len": 128, + "speculative_max_model_len": 129, + }, + { + # Speculative max model len > draft max model len should raise. + # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12 + "speculative_max_model_len": 2048 + 1, + }, + { + # Speculative max model len > target max model len should raise. + # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12 + "speculative_max_model_len": 4096 + 1, + }, + ]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): + """Verify that speculative decoding validates speculative_max_model_len. + """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(ValueError, match="cannot be larger than"): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + + # Skip real loading for fast test. + "load_format": "dummy", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_block_manager_v1(test_llm_generator): + """Verify that speculative decoding with block manager v1 fails. 
+ """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(ValueError, + match="Speculative decoding requires usage of the V2"): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py index 643dde18340f..0536cc4ecde7 100644 --- a/tests/spec_decode/e2e/test_correctness.py +++ b/tests/spec_decode/e2e/test_correctness.py @@ -29,13 +29,14 @@ """ from itertools import cycle -from typing import List, Tuple import pytest from transformers import AutoTokenizer from vllm import SamplingParams +from .conftest import get_output_from_llm_generator + @pytest.mark.parametrize( "common_llm_kwargs", @@ -594,67 +595,3 @@ def run_greedy_equality_correctness_test(baseline_llm_generator, print(f'{i=} {baseline_token_ids=}') print(f'{i=} {spec_token_ids=}') assert baseline_token_ids == spec_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - - # Skip real loading for fast test. - "load_format": "dummy", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Required for spec decode. - "use_v2_block_manager": True - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - # Expect failure as spec decode not supported by - # Ray backend. - "worker_use_ray": True, - }, - ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail(test_llm_generator): - """Verify that speculative decoding with Ray fails. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(AssertionError, - match="Speculative decoding not yet supported for "): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) - - -def get_output_from_llm_generator( - llm_generator, prompts, - sampling_params) -> Tuple[List[str], List[List[int]]]: - tokens = [] - token_ids = [] - for llm in llm_generator(): - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - token_ids = [output.outputs[0].token_ids for output in outputs] - tokens = [output.outputs[0].text for output in outputs] - del llm - - return tokens, token_ids diff --git a/vllm/config.py b/vllm/config.py index 1e2005792e4a..2ff42de08f8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -657,6 +657,7 @@ def maybe_create_spec_config( num_speculative_tokens: Optional[int], speculative_max_model_len: Optional[int], enable_chunked_prefill: bool, + use_v2_block_manager: bool, ) -> Optional["SpeculativeConfig"]: """Create a SpeculativeConfig if possible, else return None. @@ -677,6 +678,12 @@ def maybe_create_spec_config( speculative_max_model_len (Optional[int]): The maximum model len of the speculative model. Used when testing the ability to skip speculation for some sequences. + enable_chunked_prefill (bool): Whether vLLM is configured to use + chunked prefill or not. Used for raising an error since its not + yet compatible with spec decode. + use_v2_block_manager (bool): Whether vLLM is configured to use the + v2 block manager or not. 
Used for raising an error since the v2 + block manager is required with spec decode. Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if @@ -698,7 +705,12 @@ def maybe_create_spec_config( if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " - "currently mutually exclusive ({enable_chunked_prefill=}).") + f"currently mutually exclusive ({enable_chunked_prefill=}).") + + if not use_v2_block_manager: + raise ValueError( + "Speculative decoding requires usage of the V2 " + "block manager. Enable it with --use-v2-block-manager.") # TODO: The user should be able to specify revision/quantization/max # model len for the draft model. It is not currently supported. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 215f3023bc6c..6a6ac49ae321 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -492,6 +492,7 @@ def create_engine_config(self, ) -> EngineConfig: num_speculative_tokens=self.num_speculative_tokens, speculative_max_model_len=self.speculative_max_model_len, enable_chunked_prefill=self.enable_chunked_prefill, + use_v2_block_manager=self.use_v2_block_manager, ) scheduler_config = SchedulerConfig( From f300f08bf2ee6c1cacf4e91c328fedf5acdc5b29 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 14:20:59 -0700 Subject: [PATCH 164/165] remove unnecessary flags --- tests/spec_decode/e2e/test_compatibility.py | 28 --------------------- 1 file changed, 28 deletions(-) diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index b35d08bcb71b..b7a2ce46a686 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -8,17 +8,10 @@ @pytest.mark.parametrize( "common_llm_kwargs", [{ - # Use a small model for a fast test. "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - # Skip real loading for fast test. - "load_format": "dummy", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # Required for spec decode. "use_v2_block_manager": True }]) @@ -58,17 +51,10 @@ def test_spec_decode_xfail_ray(test_llm_generator): @pytest.mark.parametrize( "common_llm_kwargs", [{ - # Use a small model for a fast test. "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - # Skip real loading for fast test. - "load_format": "dummy", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # Required for spec decode. "use_v2_block_manager": True }]) @@ -104,17 +90,10 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): @pytest.mark.parametrize( "common_llm_kwargs", [{ - # Use a small model for a fast test. "model": "meta-llama/Llama-2-7b-chat-hf", "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - # Skip real loading for fast test. - "load_format": "dummy", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # Required for spec decode. "use_v2_block_manager": True }]) @@ -163,16 +142,9 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): @pytest.mark.parametrize( "common_llm_kwargs", [{ - # Use a small model for a fast test. "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, - - # Skip real loading for fast test. - "load_format": "dummy", - - # Skip cuda graph recording for fast test. 
- "enforce_eager": True, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) From 5434d905a6dd8828c34b1ff19c00a2c01ca687d3 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Mon, 22 Apr 2024 14:22:18 -0700 Subject: [PATCH 165/165] lint --- tests/spec_decode/e2e/test_compatibility.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index b7a2ce46a686..fde950c14382 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -139,13 +139,11 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): sampling_params) -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }]) +@pytest.mark.parametrize("common_llm_kwargs", [{ + "model": "JackFram/llama-68m", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, +}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("seed", [1])