|
3 | 3 | from dataclasses import dataclass |
4 | 4 | from typing import Any, List, NamedTuple, Optional, Tuple |
5 | 5 |
|
| 6 | +from vllm.config import VllmConfig |
6 | 7 | from vllm.logger import init_logger |
| 8 | +from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec, |
| 9 | + KVCacheTensor) |
7 | 10 | from vllm.v1.request import Request |
8 | 11 |
|
9 | 12 | logger = init_logger(__name__) |
@@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int, |
305 | 308 | ret.append(block_hash) |
306 | 309 | parent_block_hash_value = block_hash.hash_value |
307 | 310 | return ret |
| 311 | + |
| 312 | + |
def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                                 kv_cache_spec: KVCacheSpec,
                                 available_memory: int):
    """
    Checks whether `available_memory` is enough for the KV cache to hold at
    least one request with the model's max_model_len.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of the model
        available_memory: Memory available for KV cache in bytes.

    Raises:
        ValueError: If there is not enough memory available for the KV cache.
    """

    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")

    max_model_len = vllm_config.model_config.max_model_len
    # Memory needed to hold one max-length request: sum of every layer's
    # per-request footprint.
    needed_memory = sum(
        layer_spec.bytes_for_tokens(max_model_len)
        for layer_spec in kv_cache_spec.values())

    if needed_memory > available_memory:
        # Fixed message: "models's" -> "model's", and removed the stray
        # unmatched "(" that preceded the needed-memory figure.
        raise ValueError(
            f"To serve at least one request with the model's max seq len "
            f"({max_model_len}), {needed_memory/1024/1024/1024:.2f} GB KV "
            f"cache is needed, which is larger than the available KV cache "
            f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
            f"increasing `gpu_memory_utilization` or decreasing "
            f"`max_model_len` when initializing the engine.")
| 347 | + |
| 348 | + |
def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
    """
    Check whether every layer in the given KVCacheSpec shares a single KV
    cache type.

    Args:
        kv_cache_spec: The KVCacheSpec of the model

    Returns:
        True if all layers have the same type, False otherwise.
    """
    distinct_type_ids = {layer.type_id for layer in kv_cache_spec.values()}
    return len(distinct_type_ids) == 1
| 362 | + |
| 363 | + |
def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
                                      kv_cache_spec: KVCacheSpec,
                                      available_memory: int) -> KVCacheConfig:
    """
    Build the KV cache configuration for a model whose layers all share one
    type of KV cache, dividing the available memory equally among all layers.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of the model
        available_memory: Memory available for KV cache in bytes.

    Returns:
        The generated KVCacheConfig
    """

    # Uniform-type precondition: every layer must report the same page size.
    unique_page_sizes = {
        layer.page_size_bytes
        for layer in kv_cache_spec.values()
    }
    assert len(unique_page_sizes) == 1
    page_size = unique_page_sizes.pop()

    # Split memory evenly across layers; clamp at zero so a tiny budget
    # never yields a negative block count.
    num_blocks = max(
        int(available_memory // page_size // len(kv_cache_spec)), 0)

    override = vllm_config.cache_config.num_gpu_blocks_override
    if override is not None:
        logger.info(
            "Overriding num_gpu_blocks=%d with "
            "num_gpu_blocks_override=%d", num_blocks, override)
        num_blocks = override

    logger.info("# GPU blocks: %d", num_blocks)

    per_layer_size = page_size * num_blocks
    return KVCacheConfig(
        num_blocks=num_blocks,
        tensors={
            layer_name: KVCacheTensor(size=per_layer_size)
            for layer_name in kv_cache_spec
        },
        # One group holding every layer, since all layers are uniform.
        groups=[list(kv_cache_spec)],
        kv_cache_spec=kv_cache_spec)
| 408 | + |
| 409 | + |
def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec,
                        available_memory: int) -> KVCacheConfig:
    """
    Generates the KV cache configuration for a model.
    TODO: support hybrid models with more than one type of KV cache.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of the model
        available_memory: Memory available for KV cache in bytes.

    Returns:
        The generated KVCacheConfig
    """
    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)

    # Guard clause: hybrid models (multiple KV cache types) are unsupported.
    if not is_kv_cache_type_uniform(kv_cache_spec):
        raise NotImplementedError

    # All layers share one KV cache type (true for most models), so each
    # layer receives an equal share of the memory.
    return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
                                             available_memory)
0 commit comments