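"""Tests for the vLLM v1 Detokenizer, exercised with a mocked EngineCore."""
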
from typing import List

import pytest
from transformers import AutoTokenizer

from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest

TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

FULL_STRINGS = [
    "My name is Robert from Neural Magic and I love working on vLLM so much!",
    "Red Hat is the best open source company by far across Linux, K8s, and AI.",
    "Nick is the name of my brother in addition to my colleague from Red Hat.",
]

STOP_STRINGS = ["I love working on", "company by far", "brother in"]

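# Split each full string into a PROMPT_LEN-token prompt and the remaining
# generation tokens/text that the Detokenizer is expected to reproduce.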
FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS]
PROMPT_LEN = 5
PROMPT_TOKENS = [
    tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS
]
GENERATION_TOKENS = [
    tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS
]
PROMPT_STRINGS = [
    tokenizer.decode(prompt_tokens, skip_special_tokens=True)
    for prompt_tokens in PROMPT_TOKENS
]
PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS]
GENERATION_STRINGS = [
    text[prompt_len:]
    for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN)
]


class MockEngineCore:
    """Mock EngineCore that produces outputs from premade token lists."""

    def __init__(self, tokens_list: List[List[int]]):
        self.tokens_list = tokens_list
        self.current_idx = 0

    def get_outputs(self) -> List[EngineCoreOutput]:
        token_idx = self.current_idx
        self.current_idx += 1

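        # Emit at most one new token per request on each call.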
        outputs = []
        for req_idx, token_ids in enumerate(self.tokens_list):
            if len(token_ids) > token_idx:
                output = EngineCoreOutput(request_id=f"request-{req_idx}",
                                          new_token_ids=[token_ids[token_idx]],
                                          finished=False)
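                # Mark the request finished once its final token is emitted.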
                if token_idx == len(token_ids) - 1:
                    output.finished = True
                    output.finish_reason = "stopped"
                outputs.append(output)

        return outputs


@pytest.mark.parametrize(
    "request_output_kind",
    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind):
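    """Detokenized text and tokens should exactly match the references."""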
    detokenizer = Detokenizer(TOKENIZER_NAME)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests.
    requests = [
        DetokenizerRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=request_output_kind,
            stop=[],
            include_stop_str_in_output=False,
        ) for idx, (
            prompt,
            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
        detokenizer.add_request(request)

    gen_strings = {}
    gen_tokens = {}
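    # Feed one new token per request per step until all requests finish.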
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        if len(outputs) == 0:
            break

        # Step the Detokenizer.
        request_outputs, requests_to_abort = detokenizer.step(outputs)
        assert len(requests_to_abort) == 0

        # Update tracking.
        for request_output in request_outputs:
            request_id = request_output.request_id
            new_text = request_output.outputs[0].text
            new_tokens = request_output.outputs[0].token_ids
            if request_id not in gen_strings:
                gen_strings[request_id] = new_text
                gen_tokens[request_id] = new_tokens
            else:
                gen_strings[request_id] += new_text
                gen_tokens[request_id].extend(new_tokens)

    # Confirm that the tracked values match what we expect.
    for idx, (ref_gen_str, ref_gen_toks) in enumerate(
            zip(GENERATION_STRINGS, GENERATION_TOKENS)):
        gen_str = gen_strings[f"request-{idx}"]
        gen_toks = gen_tokens[f"request-{idx}"]

        assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
        assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"

    assert detokenizer.get_num_unfinished_requests() == 0
    assert not detokenizer.has_unfinished_requests()


@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
def test_stop_string(include_stop_str_in_output: bool):
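    """Generation should stop at the stop string, which is included in the
    output only when include_stop_str_in_output is True."""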
    detokenizer = Detokenizer(TOKENIZER_NAME)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests.
    requests = [
        DetokenizerRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=RequestOutputKind.DELTA,
            stop=STOP_STRINGS,
            include_stop_str_in_output=include_stop_str_in_output,
        ) for idx, (
            prompt,
            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
        detokenizer.add_request(request)

    gen_strings = {}
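    # Request ids the Detokenizer returns for abort after a stop string hit.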
    aborted = []
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        if len(outputs) == 0:
            break

        # Step the Detokenizer.
        request_outputs, requests_to_abort = detokenizer.step(outputs)
        for request_output in request_outputs:
            # If aborted, we should not get a request output.
            assert request_output.request_id not in aborted
        aborted.extend(requests_to_abort)

        # Update tracking.
        for request_output in request_outputs:
            if request_output.finished:
                assert request_output.outputs[0].finish_reason == "stop"

            request_id = request_output.request_id
            new_text = request_output.outputs[0].text
            if request_id not in gen_strings:
                gen_strings[request_id] = new_text
            else:
                gen_strings[request_id] += new_text

    # Confirm that the tracked values match what we expect.
    for idx, (ref_gen_str,
              stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)):

        # Request should be aborted.
        request_id = f"request-{idx}"
        assert request_id in aborted

        # Collect the string that was generated for this request.
        gen_str = gen_strings[request_id]

        # Construct reference strings.
        stop_str_idx = ref_gen_str.find(stop_str)
        ref_str_exc_stop = ref_gen_str[:stop_str_idx]
        ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str

        if include_stop_str_in_output:
            assert gen_str == ref_str_inc_stop, (
                f"{gen_str=}, {ref_str_inc_stop=}")
        else:
            assert gen_str == ref_str_exc_stop, (
                f"{gen_str=}, {ref_str_exc_stop=}")

    assert detokenizer.get_num_unfinished_requests() == 0
    assert not detokenizer.has_unfinished_requests()