@@ -3,11 +3,18 @@
 import pytest
 from transformers import AutoTokenizer
 
+from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor
 
 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
+TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
+                                              VLLM_CONFIG.scheduler_config,
+                                              VLLM_CONFIG.parallel_config,
+                                              VLLM_CONFIG.lora_config)
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
 
 FULL_STRINGS = [
@@ -66,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]:
6673 "request_output_kind" ,
6774 [RequestOutputKind .DELTA , RequestOutputKind .FINAL_ONLY ])
6875def test_incremental_detokenization (request_output_kind : RequestOutputKind ):
69- detokenizer = Detokenizer ( TOKENIZER_NAME )
76+ output_processor = OutputProcessor ( TOKENIZER_GROUP , log_stats = False )
7077 engine_core = MockEngineCore (GENERATION_TOKENS )
7178
7279 # Make N requests.
@@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     gen_tokens = {}
@@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         assert len(requests_to_abort) == 0
 
         # Update tracking.
@@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
         assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
         assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
 
 
 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 def test_stop_string(include_stop_str_in_output: bool):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)
 
     # Make N requests.
@@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool):
 
     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)
 
     gen_strings = {}
     aborted = []
@@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool):
             break
 
         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         for request_output in request_outputs:
             # If aborted, we should not get a request output.
             assert request_output.request_id not in aborted
@@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool):
             assert gen_str == ref_str_exc_stop, (
                 f"{gen_str=}, {ref_str_exc_stop=}")
 
-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
+
+
+def test_iteration_stats():
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make N requests.
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(),
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add all requests except one to the OutputProcessor.
+    num_active = len(GENERATION_TOKENS) - 1
+    for request in requests[:num_active]:
+        output_processor.add_request(request)
+    inactive_request = requests[num_active]
+
+    # First iteration has 2 prefills.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new request - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active