6 changes: 3 additions & 3 deletions benchmarks/benchmark_throughput.py
@@ -6,7 +6,7 @@
 import random
 import time
 from functools import cache
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union

 import torch
 import uvloop
@@ -171,7 +171,7 @@ def run_vllm(
     llm = LLM(**dataclasses.asdict(engine_args))

     # Add the requests to the engine.
-    prompts: List[TextPrompt | TokensPrompt] = []
+    prompts: List[Union[TextPrompt, TokensPrompt]] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
         prompts.append(
@@ -232,7 +232,7 @@ async def run_vllm_async(
             engine_args, disable_frontend_multiprocessing) as llm:

         # Add the requests to the engine.
-        prompts: List[TextPrompt | TokensPrompt] = []
+        prompts: List[Union[TextPrompt, TokensPrompt]] = []
         sampling_params: List[SamplingParams] = []
         lora_requests: List[Optional[LoRARequest]] = []
         for request in requests:
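Note on the typing change above: PEP 604's `X | Y` union syntax only works at runtime on Python 3.10+, so `typing.Union` is the portable spelling for the older interpreters vLLM still supports. A minimal sketch of the difference, using stand-in classes rather than the real vLLM prompt types:

from typing import List, Union

class TextPrompt:      # stand-in for vllm.inputs.TextPrompt
    pass

class TokensPrompt:    # stand-in for vllm.inputs.TokensPrompt
    pass

# Portable: typing.Union constructs the union object on Python 3.8+.
prompts: List[Union[TextPrompt, TokensPrompt]] = []

# At module or class scope this annotation is evaluated eagerly, and on
# Python < 3.10 it raises:
#   TypeError: unsupported operand type(s) for |: 'type' and 'type'
# prompts: List[TextPrompt | TokensPrompt] = []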
4 changes: 3 additions & 1 deletion vllm/engine/arg_utils.py
@@ -255,7 +255,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--skip-tokenizer-init',
             action='store_true',
-            help='Skip initialization of tokenizer and detokenizer.')
+            help='Skip tokenizer/detokenizer use during model inference. '
+            'Use of the tokenizer/detokenizer increases host overhead '
+            'and might lead to GPU underutilization.')
         parser.add_argument(
             '--revision',
             type=nullable_str,
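For context on the reworded flag: with --skip-tokenizer-init the engine loads no tokenizer or detokenizer, so requests must carry pre-tokenized prompts and outputs come back as raw token IDs. A sketch of how that mode is exercised through the offline LLM API (the model name and token IDs below are placeholders, not from this PR):

from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

# skip_tokenizer_init mirrors the --skip-tokenizer-init CLI flag: no
# tokenizer/detokenizer is loaded, avoiding that host-side overhead.
llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)

# Placeholder token IDs; in a real run these come from an external tokenizer.
prompt = TokensPrompt(prompt_token_ids=[1, 3087, 1722, 264])

# detokenize=False stops the engine from trying to turn output IDs back
# into text, which it cannot do without a detokenizer.
params = SamplingParams(max_tokens=16, detokenize=False)

for output in llm.generate([prompt], params):
    print(output.outputs[0].token_ids)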