10 changes: 5 additions & 5 deletions benchmark/aiperf/README.md
@@ -20,9 +20,9 @@ Instead of manually running AIPerf multiple times with different parameters, you
 ### Prerequisites
 
 These steps have been tested with Python 3.11.11.
-To use the provided configurations, you need to create accounts at https://build.nvidia.com/ and [Huggingface](https://huggingface.co/).
-* The provided configurations use models hosted at https://build.nvidia.com/, you'll need to create a Personal API Key to access the models.
-* The provided AIperf configurations require the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token-counts.
+To use the provided configurations, you need to create accounts at <https://build.nvidia.com/> and [Huggingface](https://huggingface.co/).
+- The provided configurations use models hosted at <https://build.nvidia.com/>; you'll need to create a Personal API Key to access the models.
+- The provided AIPerf configurations require the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token counts.
 
 1. **Create a virtual environment in which to install AIPerf**
 
@@ -37,13 +37,13 @@ To use the provided configurations, you need to create accounts at https://build
    $ pip install aiperf huggingface_hub typer
    ```
 
-3. ** Login to Hugging Face:**
+3. **Login to Hugging Face:**
 
    ```bash
   huggingface-cli login
    ```
 
-4. ** Set NVIDIA API Key:**
+4. **Set NVIDIA API Key:**
 
   The provided configs use models hosted on [build.nvidia.com](https://build.nvidia.com/).
   To access these, [create an account](https://build.nvidia.com/), and create a Personal API Key.
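If you prefer to authenticate from Python rather than the CLI, the `huggingface_hub` package installed in step 2 also exposes a `login()` helper. A minimal sketch; the token string is a placeholder for your own Hugging Face access token:

```python
# Programmatic alternative to `huggingface-cli login` (optional).
# The token below is a placeholder; substitute your own access token.
from huggingface_hub import login

login(token="hf_xxxxxxxxxxxxxxxxxxxx")  # caches the token for later Hub downloads
```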
197 changes: 190 additions & 7 deletions benchmark/mock_llm_server/api.py
@@ -17,28 +17,36 @@
 import asyncio
 import logging
 import time
-from typing import Annotated, Union
+from typing import Annotated, AsyncGenerator, Union
 
 from fastapi import Depends, FastAPI, HTTPException, Request
+from fastapi.responses import StreamingResponse
 
 from benchmark.mock_llm_server.config import ModelSettings, get_settings
 from benchmark.mock_llm_server.models import (
     ChatCompletionChoice,
     ChatCompletionRequest,
     ChatCompletionResponse,
+    ChatCompletionStreamChoice,
+    ChatCompletionStreamResponse,
     CompletionChoice,
     CompletionRequest,
     CompletionResponse,
+    CompletionStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
     Message,
     Model,
     ModelsResponse,
     Usage,
 )
 from benchmark.mock_llm_server.response_data import (
     calculate_tokens,
+    generate_chunk_latencies,
     generate_id,
     get_latency_seconds,
     get_response,
+    split_response_into_chunks,
 )
 
 # Create a console logging handler
@@ -120,8 +128,149 @@ async def list_models(config: ModelSettingsDep):
     return response
 
 
-@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
-async def chat_completions(request: ChatCompletionRequest, config: ModelSettingsDep) -> ChatCompletionResponse:
+async def stream_chat_completion(
+    completion_id: str,
+    model: str,
+    response_content: str,
+    config: ModelSettings,
+    n_choices: int = 1,
+) -> AsyncGenerator[str, None]:
+    """Generate Server-Sent Events for streaming chat completions.
+
+    Args:
+        completion_id: Unique ID for this completion
+        model: Model name
+        response_content: Full response text to stream
+        config: Model settings for latency configuration
+        n_choices: Number of choices to generate
+    """
+    created_timestamp = int(time.time())
+    chunks = split_response_into_chunks(response_content)
+    latencies = generate_chunk_latencies(config, len(chunks))
+
+    # First chunk with role
+    for i in range(n_choices):
+        first_response = ChatCompletionStreamResponse(
+            id=completion_id,
+            object="chat.completion.chunk",
+            created=created_timestamp,
+            model=model,
+            choices=[
+                ChatCompletionStreamChoice(
+                    index=i,
+                    delta=DeltaMessage(role="assistant", content=""),
+                    finish_reason=None,
+                )
+            ],
+        )
+        yield f"data: {first_response.model_dump_json(exclude_none=True)}\n\n"
+
+    # Stream content chunks
+    for chunk_idx, chunk in enumerate(chunks):
+        await asyncio.sleep(latencies[chunk_idx])
+
+        for i in range(n_choices):
+            chunk_response = ChatCompletionStreamResponse(
+                id=completion_id,
+                object="chat.completion.chunk",
+                created=created_timestamp,
+                model=model,
+                choices=[
+                    ChatCompletionStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=chunk),
+                        finish_reason=None,
+                    )
+                ],
+            )
+            yield f"data: {chunk_response.model_dump_json(exclude_none=True)}\n\n"
+
+    # Final chunk with finish_reason
+    for i in range(n_choices):
+        final_response = ChatCompletionStreamResponse(
+            id=completion_id,
+            object="chat.completion.chunk",
+            created=created_timestamp,
+            model=model,
+            choices=[
+                ChatCompletionStreamChoice(
+                    index=i,
+                    delta=DeltaMessage(),
+                    finish_reason="stop",
+                )
+            ],
+        )
+        yield f"data: {final_response.model_dump_json(exclude_none=True)}\n\n"
+
+    yield "data: [DONE]\n\n"
+
+
+async def stream_completion(
+    completion_id: str,
+    model: str,
+    response_text: str,
+    config: ModelSettings,
+    n: int = 1,
+) -> AsyncGenerator[str, None]:
+    """Generate Server-Sent Events for streaming text completions.
+
+    Args:
+        completion_id: Unique ID for this completion
+        model: Model name
+        response_text: Full response text to stream
+        config: Model settings for latency configuration
+        n: Number of choices to generate
+    """
+    created_timestamp = int(time.time())
+    chunks = split_response_into_chunks(response_text)
+    latencies = generate_chunk_latencies(config, len(chunks))
+
+    # Stream content chunks
+    for chunk_idx, chunk in enumerate(chunks):
+        await asyncio.sleep(latencies[chunk_idx])
+
+        for i in range(n):
+            chunk_response = CompletionStreamResponse(
+                id=completion_id,
+                object="text_completion",
+                created=created_timestamp,
+                model=model,
+                choices=[
+                    CompletionStreamChoice(
+                        text=chunk,
+                        index=i,
+                        logprobs=None,
+                        finish_reason=None,
+                    )
+                ],
+            )
+            yield f"data: {chunk_response.model_dump_json(exclude_none=True)}\n\n"
+
+    # Final chunk with finish_reason
+    for i in range(n):
+        final_response = CompletionStreamResponse(
+            id=completion_id,
+            object="text_completion",
+            created=created_timestamp,
+            model=model,
+            choices=[
+                CompletionStreamChoice(
+                    text="",
+                    index=i,
+                    logprobs=None,
+                    finish_reason="stop",
+                )
+            ],
+        )
+        yield f"data: {final_response.model_dump_json(exclude_none=True)}\n\n"
+
+    yield "data: [DONE]\n\n"
+
+
+@app.post("/v1/chat/completions", response_model=None)
+async def chat_completions(
+    request: ChatCompletionRequest, config: ModelSettingsDep
+) -> Union[ChatCompletionResponse, StreamingResponse]:
     """Create a chat completion."""
 
     log.debug("/v1/chat/completions request: %s", request)
@@ -131,6 +280,23 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings
 
     # Generate dummy response
     response_content = get_response(config)
+    completion_id = generate_id("chatcmpl")
+
+    # Handle streaming response
+    if request.stream:
+        log.debug("/v1/chat/completions streaming response for id: %s", completion_id)
+        return StreamingResponse(
+            stream_chat_completion(
+                completion_id=completion_id,
+                model=request.model,
+                response_content=response_content,
+                config=config,
+                n_choices=request.n or 1,
+            ),
+            media_type="text/event-stream",
+        )
+
+    # Non-streaming response
     response_latency_seconds = get_latency_seconds(config)
 
     # Calculate token usage
@@ -139,7 +305,6 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings
     completion_tokens = calculate_tokens(response_content)
 
     # Create response
-    completion_id = generate_id("chatcmpl")
     created_timestamp = int(time.time())
 
     choices = []
@@ -168,8 +333,10 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings
     return response
 
 
-@app.post("/v1/completions", response_model=CompletionResponse)
-async def completions(request: CompletionRequest, config: ModelSettingsDep) -> CompletionResponse:
+@app.post("/v1/completions", response_model=None)
+async def completions(
+    request: CompletionRequest, config: ModelSettingsDep
+) -> Union[CompletionResponse, StreamingResponse]:
     """Create a text completion."""
 
     log.debug("/v1/completions request: %s", request)
@@ -185,14 +352,30 @@ async def completions(request: CompletionRequest, config: ModelSettingsDep) -> C
 
     # Generate dummy response
     response_text = get_response(config)
+    completion_id = generate_id("cmpl")
+
+    # Handle streaming response
+    if request.stream:
+        log.debug("/v1/completions streaming response for id: %s", completion_id)
+        return StreamingResponse(
+            stream_completion(
+                completion_id=completion_id,
+                model=request.model,
+                response_text=response_text,
+                config=config,
+                n=request.n or 1,
+            ),
+            media_type="text/event-stream",
+        )
+
+    # Non-streaming response
     response_latency_seconds = get_latency_seconds(config)
 
     # Calculate token usage
     prompt_tokens = calculate_tokens(prompt_text)
     completion_tokens = calculate_tokens(response_text)
 
     # Create response
-    completion_id = generate_id("cmpl")
     created_timestamp = int(time.time())
 
     choices = []
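To exercise the new streaming path end to end, the endpoints can be consumed with any OpenAI-compatible client. A minimal sketch, assuming the mock server is reachable at `http://localhost:8000` (host, port, and API key value are placeholders; the model name is taken from the example configs further down):

```python
# Stream a chat completion from the mock server and print deltas as they arrive.
# Assumes the `openai` Python package is installed and the server runs locally;
# adjust base_url to match your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used-by-the-mock")

stream = client.chat.completions.create(
    model="meta/llama-3.3-70b-instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # the role-only first chunk and the final stop chunk carry no text
        print(delta, end="", flush=True)
print()
```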
22 changes: 18 additions & 4 deletions benchmark/mock_llm_server/config.py
@@ -37,10 +37,24 @@ class ModelSettings(BaseSettings):
     # Config with default values
     # Latency sampled from a truncated-normal distribution.
     # Plain Normal distributions have infinite support, and can be negative
-    latency_min_seconds: float = Field(default=0.1, description="Minimum latency in seconds")
-    latency_max_seconds: float = Field(default=5, description="Maximum latency in seconds")
-    latency_mean_seconds: float = Field(default=0.5, description="The average response time in seconds")
-    latency_std_seconds: float = Field(default=0.1, description="Standard deviation of response time")
+    e2e_latency_min_seconds: float = Field(default=0.1, description="Minimum latency in seconds")
+    e2e_latency_max_seconds: float = Field(default=5, description="Maximum latency in seconds")
+    e2e_latency_mean_seconds: float = Field(default=0.5, description="The average response time in seconds")
+    e2e_latency_std_seconds: float = Field(default=0.1, description="Standard deviation of response time")
+
+    # Streaming latency: Time to First Token (TTFT)
+    # https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html#time-to-first-token-ttft
+    ttft_min_seconds: float = Field(default=0.05, description="Minimum TTFT in seconds")
+    ttft_max_seconds: float = Field(default=0.5, description="Maximum TTFT in seconds")
+    ttft_mean_seconds: float = Field(default=0.1, description="Average TTFT in seconds")
+    ttft_std_seconds: float = Field(default=0.02, description="Standard deviation of TTFT")
+
+    # Streaming latency: Inter-Token Latency (ITL)
+    # https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html#inter-token-latency-itl
+    itl_min_seconds: float = Field(default=0.01, description="Minimum ITL in seconds")
+    itl_max_seconds: float = Field(default=0.1, description="Maximum ITL in seconds")
+    itl_mean_seconds: float = Field(default=0.03, description="Average ITL in seconds")
+    itl_std_seconds: float = Field(default=0.01, description="Standard deviation of ITL")
 
     model_config = SettingsConfigDict(env_file=CONFIG_FILE)
 
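The actual samplers (`get_latency_seconds`, `generate_chunk_latencies`) live in `response_data.py`, which this diff does not show. As a rough sketch of what sampling a truncated normal within the configured `[min, max]` bounds could look like, using `scipy.stats.truncnorm` (an assumption; the project may implement it differently):

```python
# Illustrative only: the real samplers are in response_data.py (not in this diff).
# scipy's truncnorm takes the truncation bounds in standard-deviation units.
from scipy.stats import truncnorm


def sample_truncated_normal(mean: float, std: float, lo: float, hi: float, size: int = 1):
    """Draw `size` latencies from Normal(mean, std) truncated to [lo, hi]."""
    if std <= 0:  # degenerate case used by the fixed-latency example configs below
        return [mean] * size
    a, b = (lo - mean) / std, (hi - mean) / std
    return truncnorm.rvs(a, b, loc=mean, scale=std, size=size).tolist()


# e.g. one TTFT draw and a batch of per-chunk ITL draws, using the defaults above
ttft = sample_truncated_normal(0.1, 0.02, 0.05, 0.5)[0]
itls = sample_truncated_normal(0.03, 0.01, 0.01, 0.1, size=16)
```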
19 changes: 15 additions & 4 deletions benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env
@@ -2,7 +2,18 @@ MODEL="meta/llama-3.3-70b-instruct"
 UNSAFE_PROBABILITY=0.0
 UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
 SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
-LATENCY_MIN_SECONDS=4.0
-LATENCY_MAX_SECONDS=4.0
-LATENCY_MEAN_SECONDS=4.0
-LATENCY_STD_SECONDS=0.0
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=4.0
+E2E_LATENCY_MAX_SECONDS=4.0
+E2E_LATENCY_MEAN_SECONDS=4.0
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.3
+TTFT_MAX_SECONDS=0.3
+TTFT_MEAN_SECONDS=0.3
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Inter-Token Latency (ITL)
+ITL_MIN_SECONDS=0.015
+ITL_MAX_SECONDS=0.015
+ITL_MEAN_SECONDS=0.015
+ITL_STD_SECONDS=0.0
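With these fixed values the timing of a streamed response is easy to estimate, assuming the first chunk is delayed by the TTFT and each subsequent chunk by one ITL (the exact scheduling is decided by `generate_chunk_latencies`, which is not shown in this diff). A back-of-the-envelope sketch:

```python
# Rough timing estimate for this config (values copied from the env file above).
ttft_seconds = 0.3
itl_seconds = 0.015
num_chunks = 100  # hypothetical chunk count; depends on how the mock splits its canned text

streamed_duration = ttft_seconds + (num_chunks - 1) * itl_seconds  # ≈ 1.785 s
non_streamed_duration = 4.0  # E2E_LATENCY_* pins the blocking path at 4 s

print(f"streaming ≈ {streamed_duration:.3f}s vs non-streaming = {non_streamed_duration}s")
```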
@@ -2,7 +2,18 @@ MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
 UNSAFE_PROBABILITY=0.03
 UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
 SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
-LATENCY_MIN_SECONDS=0.5
-LATENCY_MAX_SECONDS=0.5
-LATENCY_MEAN_SECONDS=0.5
-LATENCY_STD_SECONDS=0.0
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=0.5
+E2E_LATENCY_MAX_SECONDS=0.5
+E2E_LATENCY_MEAN_SECONDS=0.5
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.2
+TTFT_MAX_SECONDS=0.2
+TTFT_MEAN_SECONDS=0.2
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Inter-Token Latency (ITL)
+ITL_MIN_SECONDS=0.015
+ITL_MAX_SECONDS=0.015
+ITL_MEAN_SECONDS=0.015
+ITL_STD_SECONDS=0.0
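These env files are consumed through pydantic-settings: field names in `ModelSettings` are matched to the upper-case variables case-insensitively, so `E2E_LATENCY_MEAN_SECONDS` populates `e2e_latency_mean_seconds`. A minimal sketch of loading one config directly; the relative path assumes the repository root as the working directory, and normally the server selects the file via `CONFIG_FILE` in `config.py`:

```python
# Load one of the example configs and inspect the parsed latency settings.
from benchmark.mock_llm_server.config import ModelSettings

settings = ModelSettings(_env_file="benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env")
print(settings.e2e_latency_mean_seconds, settings.ttft_mean_seconds, settings.itl_mean_seconds)
```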