Commit 4cbf286

[V1] Remove cache from StructuredOutputManager (#14622)
Signed-off-by: Russell Bryant <[email protected]>
1 parent c6e14a6 commit 4cbf286

2 files changed: +12 -48 lines changed

vllm/v1/engine/core.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def add_request(self, request: EngineCoreRequest):
         req = Request.from_engine_core_request(request)
         if req.use_structured_output:
             # Start grammar compilation asynchronously
-            self.structured_output_manager.populate_cache(req)
+            self.structured_output_manager.grammar_init(req)

         self.scheduler.add_request(req)
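
Note that grammar_init is fire-and-forget at this call site: it submits compilation to the manager's thread pool and returns immediately, so add_request is not blocked while the grammar compiles. A minimal sketch of that pattern follows; the compile_grammar helper and the literal spec are illustrative stand-ins, not code from this commit.

from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor

# A small pool is enough: grammar compilation is CPU-bound, so the
# default of CPUs * 5 workers would be far too many.
executor = ThreadPoolExecutor(max_workers=2)

def compile_grammar(spec: str) -> str:
    # Stand-in for an expensive, CPU-bound grammar compilation step.
    return f"compiled<{spec}>"

def add_request(spec: str) -> Future[str]:
    # Kick compilation off in the background; the caller does not wait.
    return executor.submit(compile_grammar, spec)

future = add_request('{"type": "object"}')
# Whoever actually needs the finished grammar resolves the Future later.
print(future.result())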

vllm/v1/structured_output/__init__.py

Lines changed: 11 additions & 47 deletions
@@ -1,18 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

-import copy
 import multiprocessing
-from collections import OrderedDict
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING, Optional

 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.utils import LazyLoader
-from vllm.v1.structured_output.grammar import (Grammar, StructuredOutputKey,
-                                               StructuredOutputOptions)
+from vllm.v1.structured_output.grammar import Grammar, StructuredOutputOptions

 if TYPE_CHECKING:
     import numpy as np
@@ -29,7 +26,7 @@

 class StructuredOutputManager:

-    def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
+    def __init__(self, vllm_config: VllmConfig):
         tokenizer_group = init_tokenizer_from_configs(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
@@ -44,10 +41,6 @@ def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
             tokenizer, vocab_size=self.vocab_size)
         self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)

-        self.max_cache_size = max_cache_size
-        self.request_key_to_grammar: OrderedDict[StructuredOutputKey,
-                                                 Grammar] = OrderedDict()
-
         # The default max_workers if not specified is the number of CPUs * 5,
         # which is way too high since these tasks are CPU-bound, not I/O bound.
         # We also know we would never dominate CPU usage with just grammar
@@ -56,51 +49,22 @@ def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
         self.executor = ThreadPoolExecutor(max_workers=max_workers)
         self._grammar_bitmask: Optional[torch.Tensor] = None

-    def __getitem__(self, key: StructuredOutputKey) -> Optional[Grammar]:
-        # We need to pop and re-insert the grammar here for LRU cache
-        # of request_key_to_grammar
-        if key in self.request_key_to_grammar:
-            # Move accessed item to the end (most recently used)
-            value = self.request_key_to_grammar.pop(key)
-            if value is not None:
-                self.request_key_to_grammar[key] = value
-            return value
-        return None
-
-    def populate_cache(self, request: Request) -> None:
+    def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return

-        grammar = self.request_key_to_grammar.get(
-            request.structured_output_request.structured_output_key)
-        if grammar:
-            request.structured_output_request.grammar = copy.copy(grammar)
-            return
-        request.structured_output_request.grammar = self.cache(request)
-
-    def cache(self, request: Request):
-        return self.executor.submit(self._executor_loop, request)
+        grammar: Future[Grammar] = self.executor.submit(
+            self._async_create_grammar, request)
+        request.structured_output_request.grammar = grammar  # type: ignore[assignment]

-    def _executor_loop(self, request: Request) -> Grammar:
-        # NOTE: The structured_output_request should never be
-        # None in this case, but mypy can't infer this
-        # correctly, so we need to ignore the error here.
+    def _async_create_grammar(self, request: Request) -> Grammar:
         key = request.structured_output_request.structured_output_key  # type: ignore[union-attr]
-        grammar = self.request_key_to_grammar.get(key)
-        if grammar is not None:
-            return copy.copy(grammar)
-        grammar = self.initialize_grammar(key)
-        # If cache is full, remove the least recently used item
-        if len(self.request_key_to_grammar) >= self.max_cache_size:
-            self.request_key_to_grammar.popitem(last=False)
-        self.request_key_to_grammar[key] = grammar
-        return copy.copy(grammar)
-
-    def initialize_grammar(self, key: StructuredOutputKey) -> Grammar:
+
         # Note that the request was validated in the engine core client,
         # so at this point we know it is a supported type of request.
         #
-        # TODO: we still need to handle xgrammar compilation failures
+        # TODO: we still need to handle xgrammar compilation failures,
+        # though it should be unlikely as we test that up front as well.
         request_type, grammar_spec = key

         if request_type == StructuredOutputOptions.JSON:
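
With the cache gone, request.structured_output_request.grammar now holds a Future[Grammar] rather than a compiled Grammar (hence the type: ignore[assignment]), and every structured-output request triggers its own compilation. Downstream code therefore has to resolve the Future before using the grammar. The sketch below shows one way a consumer might do that, using hypothetical Grammar and StructuredOutputRequest stand-ins; the actual resolution logic in vLLM's scheduler is not part of this diff.

from __future__ import annotations

from concurrent.futures import Future
from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class Grammar:
    # Hypothetical stand-in for the compiled Grammar object.
    spec: str


@dataclass
class StructuredOutputRequest:
    # Starts out holding a pending Future and is swapped for the resolved
    # Grammar once compilation has finished.
    grammar: Union[Future[Grammar], Grammar, None] = None


def resolve_grammar(req: StructuredOutputRequest) -> Optional[Grammar]:
    # Non-blocking: only unwrap the Future once compilation is done.
    if isinstance(req.grammar, Future):
        if not req.grammar.done():
            return None  # still compiling; try again on a later step
        req.grammar = req.grammar.result()
    return req.grammar

In this sketch, a request whose grammar is still compiling is simply skipped and retried on a later step rather than blocking the loop, which matches the intent of starting compilation as early as possible.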
