Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vllm_omni/benchmarks/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def calculate_metrics(
total_input += input_requests[i].prompt_len
tpot = 0
if output_len > 1:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
latency_minus_ttft = outputs[i].text_latency - outputs[i].ttft
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

Copy link
Contributor Author

@kechengliu97 kechengliu97 Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This attribute is pre-declared on the dataclass with a default value of 0.

tpot = latency_minus_ttft / (output_len - 1)
tpots.append(tpot)
# Note: if output_len <= 1, we regard tpot as 0 for goodput
Expand Down
6 changes: 4 additions & 2 deletions vllm_omni/benchmarks/patch/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class MixRequestFuncOutput(RequestFuncOutput):
audio_duration: float = 0.0
audio_frames: int = 0
audio_rtf: float = 0.0
text_latency: float = 0.0


async def async_request_openai_chat_omni_completions(
Expand Down Expand Up @@ -148,6 +149,7 @@ async def async_request_openai_chat_omni_completions(
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
most_recent_timestamp = timestamp
output.text_latency = timestamp - st
elif modality == "audio":
if output.audio_ttfp == 0.0:
output.audio_ttfp = timestamp - st
Expand All @@ -161,8 +163,8 @@ async def async_request_openai_chat_omni_completions(
else:
generated_audio = generated_audio + seg

elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
if metrics := data.get("metrics"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set default values to avoid possible error

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This expression is valid: the walrus operator `:=` first evaluates `data.get("metrics")` and then binds the result to the `metrics` variable. If the key is absent, the result is `None` (falsy), so the `if` body is simply skipped.

output.output_tokens = metrics.get("num_tokens_out")

output.latency = timestamp - st
output.generated_text = generated_text
Expand Down