@@ -1,18 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import copy
 import multiprocessing
-from collections import OrderedDict
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING, Optional
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.utils import LazyLoader
-from vllm.v1.structured_output.grammar import (Grammar, StructuredOutputKey,
-                                               StructuredOutputOptions)
+from vllm.v1.structured_output.grammar import Grammar, StructuredOutputOptions
 
 if TYPE_CHECKING:
     import numpy as np
@@ -29,7 +26,7 @@
 
 class StructuredOutputManager:
 
-    def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
+    def __init__(self, vllm_config: VllmConfig):
         tokenizer_group = init_tokenizer_from_configs(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
@@ -44,10 +41,6 @@ def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
             tokenizer, vocab_size=self.vocab_size)
         self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
 
-        self.max_cache_size = max_cache_size
-        self.request_key_to_grammar: OrderedDict[StructuredOutputKey,
-                                                 Grammar] = OrderedDict()
-
         # The default max_workers if not specified is the number of CPUs * 5,
         # which is way too high since these tasks are CPU-bound, not I/O bound.
         # We also know we would never dominate CPU usage with just grammar
@@ -56,51 +49,22 @@ def __init__(self, vllm_config: VllmConfig, max_cache_size: int = 500):
         self.executor = ThreadPoolExecutor(max_workers=max_workers)
         self._grammar_bitmask: Optional[torch.Tensor] = None
 
-    def __getitem__(self, key: StructuredOutputKey) -> Optional[Grammar]:
-        # We need to pop and re-insert the grammar here for LRU cache
-        # of request_key_to_grammar
-        if key in self.request_key_to_grammar:
-            # Move accessed item to the end (most recently used)
-            value = self.request_key_to_grammar.pop(key)
-            if value is not None:
-                self.request_key_to_grammar[key] = value
-            return value
-        return None
-
-    def populate_cache(self, request: Request) -> None:
+    def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return
 
-        grammar = self.request_key_to_grammar.get(
-            request.structured_output_request.structured_output_key)
-        if grammar:
-            request.structured_output_request.grammar = copy.copy(grammar)
-            return
-        request.structured_output_request.grammar = self.cache(request)
-
-    def cache(self, request: Request):
-        return self.executor.submit(self._executor_loop, request)
+        grammar: Future[Grammar] = self.executor.submit(
+            self._async_create_grammar, request)
+        request.structured_output_request.grammar = grammar  # type: ignore[assignment]
 
-    def _executor_loop(self, request: Request) -> Grammar:
-        # NOTE: The structured_output_request should never be
-        # None in this case, but mypy can't infer this
-        # correctly, so we need to ignore the error here.
+    def _async_create_grammar(self, request: Request) -> Grammar:
         key = request.structured_output_request.structured_output_key  # type: ignore[union-attr]
-        grammar = self.request_key_to_grammar.get(key)
-        if grammar is not None:
-            return copy.copy(grammar)
-        grammar = self.initialize_grammar(key)
-        # If cache is full, remove the least recently used item
-        if len(self.request_key_to_grammar) >= self.max_cache_size:
-            self.request_key_to_grammar.popitem(last=False)
-        self.request_key_to_grammar[key] = grammar
-        return copy.copy(grammar)
-
-    def initialize_grammar(self, key: StructuredOutputKey) -> Grammar:
+
         # Note that the request was validated in the engine core client,
         # so at this point we know it is a supported type of request.
         #
-        # TODO: we still need to handle xgrammar compilation failures
+        # TODO: we still need to handle xgrammar compilation failures,
+        # though it should be unlikely as we test that up front as well.
         request_type, grammar_spec = key
 
         if request_type == StructuredOutputOptions.JSON:
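
The change above replaces the shared LRU grammar cache with per-request asynchronous compilation: grammar_init submits _async_create_grammar to the thread pool and stores the resulting concurrent.futures.Future on the request. The following is a minimal, standard-library-only sketch of that producer/consumer pattern, not part of the commit; compile_grammar and FakeGrammar are illustrative stand-ins, not vLLM APIs.

from __future__ import annotations

import time
from concurrent.futures import Future, ThreadPoolExecutor


class FakeGrammar:
    """Illustrative stand-in for a compiled Grammar object."""

    def __init__(self, spec: str) -> None:
        self.spec = spec


def compile_grammar(spec: str) -> FakeGrammar:
    # Simulate a CPU-bound compilation step.
    time.sleep(0.1)
    return FakeGrammar(spec)


executor = ThreadPoolExecutor(max_workers=4)

# submit() returns immediately; compilation runs on a worker thread,
# mirroring how grammar_init stores a Future on the request.
grammar_future: Future[FakeGrammar] = executor.submit(
    compile_grammar, '{"type": "object"}')

# A consumer can poll readiness without blocking ...
if not grammar_future.done():
    print("grammar still compiling; defer this request")

# ... and block for the compiled grammar only when it is actually needed.
grammar = grammar_future.result()
print("compiled:", grammar.spec)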