From 0a551cbd546d67d35fb40451731c1763ca9cb86d Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Tue, 22 Oct 2024 02:21:01 +0000
Subject: [PATCH 1/2] Remove evictor_v1, which existed only for the now-removed
 block manager v1

---
 examples/offline_inference.py           |   2 +-
 vllm/core/block/prefix_caching_block.py |   2 +-
 vllm/core/{evictor_v2.py => evictor.py} |   0
 vllm/core/evictor_v1.py                 | 106 ------------------------
 4 files changed, 2 insertions(+), 108 deletions(-)
 rename vllm/core/{evictor_v2.py => evictor.py} (100%)
 delete mode 100644 vllm/core/evictor_v1.py

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 9b758fa2479f..92895382459e 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -11,7 +11,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 7c8a2bc49351..57527e39b9bd 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -7,7 +7,7 @@
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
                                          NaiveBlockAllocator)
-from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
+from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
 
 PrefixHash = int
 
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor.py
similarity index 100%
rename from vllm/core/evictor_v2.py
rename to vllm/core/evictor.py
diff --git a/vllm/core/evictor_v1.py b/vllm/core/evictor_v1.py
deleted file mode 100644
index 5db5a08a5bb6..000000000000
--- a/vllm/core/evictor_v1.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import enum
-from abc import ABC, abstractmethod
-from typing import OrderedDict
-
-from vllm.block import PhysicalTokenBlock
-
-
-class EvictionPolicy(enum.Enum):
-    """Enum for eviction policy used by make_evictor to instantiate the correct
-    Evictor subclass.
-    """
-    LRU = enum.auto()
-
-
-class Evictor(ABC):
-    """The Evictor subclasses should be used by the BlockAllocator class to
-    handle eviction of freed PhysicalTokenBlocks.
-    """
-
-    @abstractmethod
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def __contains__(self, block_hash: int) -> bool:
-        pass
-
-    @abstractmethod
-    def evict(self) -> PhysicalTokenBlock:
-        """Runs the eviction algorithm and returns the evicted block"""
-        pass
-
-    @abstractmethod
-    def add(self, block: PhysicalTokenBlock):
-        """Adds block to the evictor, making it a candidate for eviction"""
-        pass
-
-    @abstractmethod
-    def remove(self, block_hash: int) -> PhysicalTokenBlock:
-        """Simply removes the block with the hash value block_hash from the
-        evictor. Caller is responsible for making sure that block_hash is
-        contained in the evictor before calling remove. Should be used to
-        "bring back" blocks that have been freed but not evicted yet.
-        """
-        pass
-
-    @property
-    @abstractmethod
-    def num_blocks(self) -> int:
-        pass
-
-
-class LRUEvictor(Evictor):
-    """Evicts in a least-recently-used order using the last_accessed timestamp
-    that's recorded in the PhysicalTokenBlock. If there are multiple blocks with
-    the same last_accessed time, then the one with the largest num_hashed_tokens
-    will be evicted. If two blocks each have the lowest last_accessed time and
-    highest num_hashed_tokens value, then one will be chose arbitrarily
-    """
-
-    def __init__(self):
-        self.free_table: OrderedDict[int, PhysicalTokenBlock] = OrderedDict()
-
-    def __contains__(self, block_hash: int) -> bool:
-        return block_hash in self.free_table
-
-    def evict(self) -> PhysicalTokenBlock:
-        if len(self.free_table) == 0:
-            raise ValueError("No usable cache memory left")
-
-        evicted_block = next(iter(self.free_table.values()))
-        # The blocks with the lowest timestamps should be placed consecutively
-        # at the start of OrderedDict. Loop through all these blocks to
-        # find the one with maximum number of hashed tokens.
-        for _, block in self.free_table.items():
-            if evicted_block.last_accessed < block.last_accessed:
-                break
-            if evicted_block.num_hashed_tokens < block.num_hashed_tokens:
-                evicted_block = block
-
-        self.free_table.pop(evicted_block.block_hash)
-
-        evicted_block.computed = False
-        return evicted_block
-
-    def add(self, block: PhysicalTokenBlock):
-        self.free_table[block.block_hash] = block
-
-    def remove(self, block_hash: int) -> PhysicalTokenBlock:
-        if block_hash not in self.free_table:
-            raise ValueError(
-                "Attempting to remove block that's not in the evictor")
-        block: PhysicalTokenBlock = self.free_table[block_hash]
-        self.free_table.pop(block_hash)
-        return block
-
-    @property
-    def num_blocks(self) -> int:
-        return len(self.free_table)
-
-
-def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
-    if eviction_policy == EvictionPolicy.LRU:
-        return LRUEvictor()
-    else:
-        raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")

From 87d19e6c9e5d43851e2b547ea84cc5a07cb0432e Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Tue, 22 Oct 2024 02:24:05 +0000
Subject: [PATCH 2/2] Revert the temporary example change that was used for
 testing

---
 examples/offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 92895382459e..9b758fa2479f 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -11,7 +11,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
+llm = LLM(model="facebook/opt-125m")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
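
Note: the LRU tie-breaking rule described in the removed LRUEvictor docstring is
easy to misread from the loop alone. Below is a minimal, self-contained Python
sketch of that eviction order (oldest last_accessed first, ties broken by the
largest num_hashed_tokens). FreeBlock and pick_victim are illustrative names
invented for this sketch, not part of the vLLM API.

# Illustrative sketch only -- not vLLM code. It mirrors the eviction order of
# the removed LRUEvictor: evict the block with the lowest last_accessed
# timestamp, breaking ties by the largest num_hashed_tokens.
from collections import OrderedDict
from dataclasses import dataclass


@dataclass
class FreeBlock:  # hypothetical stand-in for PhysicalTokenBlock
    block_hash: int
    last_accessed: float
    num_hashed_tokens: int


def pick_victim(free_table: "OrderedDict[int, FreeBlock]") -> FreeBlock:
    if not free_table:
        raise ValueError("No usable cache memory left")
    victim = next(iter(free_table.values()))
    # Blocks with the lowest timestamp sit consecutively at the front of the
    # OrderedDict; scan only that run for the block with the most hashed tokens.
    for block in free_table.values():
        if victim.last_accessed < block.last_accessed:
            break
        if victim.num_hashed_tokens < block.num_hashed_tokens:
            victim = block
    free_table.pop(victim.block_hash)
    return victim


if __name__ == "__main__":
    table: "OrderedDict[int, FreeBlock]" = OrderedDict()
    for blk in (FreeBlock(1, 10.0, 4), FreeBlock(2, 10.0, 16),
                FreeBlock(3, 11.0, 32)):
        table[blk.block_hash] = blk
    # Hashes 1 and 2 share the oldest timestamp; 2 wins the tie on tokens.
    assert pick_victim(table).block_hash == 2

Keeping free blocks in an OrderedDict means insertion order doubles as access
order, so the scan above only walks the front run of equal timestamps rather
than the whole table.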