From 0a551cbd546d67d35fb40451731c1763ca9cb86d Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Tue, 22 Oct 2024 02:21:01 +0000
Subject: [PATCH 1/2] Remove evictor_v1, which existed only for the now-removed
 block manager v1

---
 examples/offline_inference.py           |   2 +-
 vllm/core/block/prefix_caching_block.py |   2 +-
 vllm/core/{evictor_v2.py => evictor.py} |   0
 vllm/core/evictor_v1.py                 | 106 ------------------------
 4 files changed, 2 insertions(+), 108 deletions(-)
 rename vllm/core/{evictor_v2.py => evictor.py} (100%)
 delete mode 100644 vllm/core/evictor_v1.py

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 9b758fa2479f..92895382459e 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -11,7 +11,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 7c8a2bc49351..57527e39b9bd 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -7,7 +7,7 @@
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
                                          NaiveBlockAllocator)
-from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
+from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
 
 PrefixHash = int
 
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor.py
similarity index 100%
rename from vllm/core/evictor_v2.py
rename to vllm/core/evictor.py
diff --git a/vllm/core/evictor_v1.py b/vllm/core/evictor_v1.py
deleted file mode 100644
index 5db5a08a5bb6..000000000000
--- a/vllm/core/evictor_v1.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import enum
-from abc import ABC, abstractmethod
-from typing import OrderedDict
-
-from vllm.block import PhysicalTokenBlock
-
-
-class EvictionPolicy(enum.Enum):
-    """Enum for eviction policy used by make_evictor to instantiate the correct
-    Evictor subclass.
-    """
-    LRU = enum.auto()
-
-
-class Evictor(ABC):
-    """The Evictor subclasses should be used by the BlockAllocator class to
-    handle eviction of freed PhysicalTokenBlocks.
-    """
-
-    @abstractmethod
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def __contains__(self, block_hash: int) -> bool:
-        pass
-
-    @abstractmethod
-    def evict(self) -> PhysicalTokenBlock:
-        """Runs the eviction algorithm and returns the evicted block"""
-        pass
-
-    @abstractmethod
-    def add(self, block: PhysicalTokenBlock):
-        """Adds block to the evictor, making it a candidate for eviction"""
-        pass
-
-    @abstractmethod
-    def remove(self, block_hash: int) -> PhysicalTokenBlock:
-        """Simply removes the block with the hash value block_hash from the
-        evictor. Caller is responsible for making sure that block_hash is
-        contained in the evictor before calling remove. Should be used to
-        "bring back" blocks that have been freed but not evicted yet.
-        """
-        pass
-
-    @property
-    @abstractmethod
-    def num_blocks(self) -> int:
-        pass
-
-
-class LRUEvictor(Evictor):
-    """Evicts in a least-recently-used order using the last_accessed timestamp
-    that's recorded in the PhysicalTokenBlock. If there are multiple blocks with
-    the same last_accessed time, then the one with the largest num_hashed_tokens
-    will be evicted. If two blocks each have the lowest last_accessed time and
-    highest num_hashed_tokens value, then one will be chose arbitrarily
-    """
-
-    def __init__(self):
-        self.free_table: OrderedDict[int, PhysicalTokenBlock] = OrderedDict()
-
-    def __contains__(self, block_hash: int) -> bool:
-        return block_hash in self.free_table
-
-    def evict(self) -> PhysicalTokenBlock:
-        if len(self.free_table) == 0:
-            raise ValueError("No usable cache memory left")
-
-        evicted_block = next(iter(self.free_table.values()))
-        # The blocks with the lowest timestamps should be placed consecutively
-        # at the start of OrderedDict. Loop through all these blocks to
-        # find the one with maximum number of hashed tokens.
-        for _, block in self.free_table.items():
-            if evicted_block.last_accessed < block.last_accessed:
-                break
-            if evicted_block.num_hashed_tokens < block.num_hashed_tokens:
-                evicted_block = block
-
-        self.free_table.pop(evicted_block.block_hash)
-
-        evicted_block.computed = False
-        return evicted_block
-
-    def add(self, block: PhysicalTokenBlock):
-        self.free_table[block.block_hash] = block
-
-    def remove(self, block_hash: int) -> PhysicalTokenBlock:
-        if block_hash not in self.free_table:
-            raise ValueError(
-                "Attempting to remove block that's not in the evictor")
-        block: PhysicalTokenBlock = self.free_table[block_hash]
-        self.free_table.pop(block_hash)
-        return block
-
-    @property
-    def num_blocks(self) -> int:
-        return len(self.free_table)
-
-
-def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
-    if eviction_policy == EvictionPolicy.LRU:
-        return LRUEvictor()
-    else:
-        raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")

From 87d19e6c9e5d43851e2b547ea84cc5a07cb0432e Mon Sep 17 00:00:00 2001
From: KuntaiDu
Date: Tue, 22 Oct 2024 02:24:05 +0000
Subject: [PATCH 2/2] Revert the temporary example change that was used for
 testing

---
 examples/offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 92895382459e..9b758fa2479f 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -11,7 +11,7 @@
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
+llm = LLM(model="facebook/opt-125m")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
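
Note: the LRU tie-breaking rule described in the removed LRUEvictor docstring is
easy to misread from the loop alone. Below is a minimal, self-contained Python
sketch of that eviction order (oldest last_accessed first, ties broken by the
largest num_hashed_tokens). FreeBlock and pick_victim are illustrative names
invented for this sketch, not part of the vLLM API.

# Illustrative sketch only -- not vLLM code. It mirrors the eviction order of
# the removed LRUEvictor: evict the block with the lowest last_accessed
# timestamp, breaking ties by the largest num_hashed_tokens.
from collections import OrderedDict
from dataclasses import dataclass


@dataclass
class FreeBlock:  # hypothetical stand-in for PhysicalTokenBlock
    block_hash: int
    last_accessed: float
    num_hashed_tokens: int


def pick_victim(free_table: "OrderedDict[int, FreeBlock]") -> FreeBlock:
    if not free_table:
        raise ValueError("No usable cache memory left")
    victim = next(iter(free_table.values()))
    # Blocks with the lowest timestamp sit consecutively at the front of the
    # OrderedDict; scan only that run for the block with the most hashed tokens.
    for block in free_table.values():
        if victim.last_accessed < block.last_accessed:
            break
        if victim.num_hashed_tokens < block.num_hashed_tokens:
            victim = block
    free_table.pop(victim.block_hash)
    return victim


if __name__ == "__main__":
    table: "OrderedDict[int, FreeBlock]" = OrderedDict()
    for blk in (FreeBlock(1, 10.0, 4), FreeBlock(2, 10.0, 16),
                FreeBlock(3, 11.0, 32)):
        table[blk.block_hash] = blk
    # Hashes 1 and 2 share the oldest timestamp; 2 wins the tie on tokens.
    assert pick_victim(table).block_hash == 2

Keeping free blocks in an OrderedDict means insertion order doubles as access
order, so the scan above only walks the front run of equal timestamps rather
than the whole table.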