diff --git a/benchmark/aiperf/README.md b/benchmark/aiperf/README.md
index 9a3608927..5c39fa78d 100644
--- a/benchmark/aiperf/README.md
+++ b/benchmark/aiperf/README.md
@@ -20,9 +20,9 @@ Instead of manually running AIPerf multiple times with different parameters, you
 ### Prerequisites
 
 These steps have been tested with Python 3.11.11.
-To use the provided configurations, you need to create accounts at https://build.nvidia.com/ and [Huggingface](https://huggingface.co/).
-* The provided configurations use models hosted at https://build.nvidia.com/, you'll need to create a Personal API Key to access the models.
-* The provided AIperf configurations require the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token-counts.
+To use the provided configurations, you need to create accounts at [build.nvidia.com](https://build.nvidia.com/) and [Huggingface](https://huggingface.co/).
+- The provided configurations use models hosted at [build.nvidia.com](https://build.nvidia.com/); you'll need to create a Personal API Key to access the models.
+- The provided AIPerf configurations require the [Meta Llama 3.3 70B Instruct tokenizer](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) to calculate token counts.
 
 1. **Create a virtual environment in which to install AIPerf**
@@ -37,13 +37,13 @@ To use the provided configurations, you need to create accounts at https://build
    $ pip install aiperf huggingface_hub typer
    ```
 
-3. ** Login to Hugging Face:**
+3. **Login to Hugging Face:**
 
    ```bash
    huggingface-cli login
    ```
 
-4. ** Set NVIDIA API Key:**
+4. **Set NVIDIA API Key:**
 
   The provided configs use models hosted on [build.nvidia.com](https://build.nvidia.com/). To access these, [create an account](https://build.nvidia.com/), and create a Personal API Key.
diff --git a/benchmark/mock_llm_server/api.py b/benchmark/mock_llm_server/api.py
index 1dac1c2ee..6e3a2d85e 100644
--- a/benchmark/mock_llm_server/api.py
+++ b/benchmark/mock_llm_server/api.py
@@ -17,18 +17,24 @@
 import asyncio
 import logging
 import time
-from typing import Annotated, Union
+from typing import Annotated, AsyncGenerator, Union
 
 from fastapi import Depends, FastAPI, HTTPException, Request
+from fastapi.responses import StreamingResponse
 
 from benchmark.mock_llm_server.config import ModelSettings, get_settings
 from benchmark.mock_llm_server.models import (
     ChatCompletionChoice,
     ChatCompletionRequest,
     ChatCompletionResponse,
+    ChatCompletionStreamChoice,
+    ChatCompletionStreamResponse,
     CompletionChoice,
     CompletionRequest,
     CompletionResponse,
+    CompletionStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
     Message,
     Model,
     ModelsResponse,
@@ -36,9 +42,11 @@
 )
 from benchmark.mock_llm_server.response_data import (
     calculate_tokens,
+    generate_chunk_latencies,
     generate_id,
     get_latency_seconds,
     get_response,
+    split_response_into_chunks,
 )
 
 # Create a console logging handler
@@ -89,7 +97,7 @@ async def log_http_duration(request: Request, call_next):
     response_time = time.time()
     duration_seconds = response_time - request_time
 
-    log.info(
+    log.debug(
         "Request finished: %s, took %.3f seconds",
         response.status_code,
         duration_seconds,
@@ -120,8 +128,149 @@ async def list_models(config: ModelSettingsDep):
     return response
 
 
-@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
-async def chat_completions(request: ChatCompletionRequest, config: ModelSettingsDep) -> ChatCompletionResponse:
+async def stream_chat_completion(
+    completion_id: str,
+    model: str,
+    response_content: str,
+    config: ModelSettings,
+    n_choices: int = 1,
+) -> AsyncGenerator[str,
None]: + """Generate Server-Sent Events for streaming chat completions. + + Args: + completion_id: Unique ID for this completion + model: Model name + response_content: Full response text to stream + config: Model settings for latency configuration + n_choices: Number of choices to generate + """ + created_timestamp = int(time.time()) + chunks = split_response_into_chunks(response_content) + latencies = generate_chunk_latencies(config, len(chunks)) + + # First chunk with role + for i in range(n_choices): + first_response = ChatCompletionStreamResponse( + id=completion_id, + object="chat.completion.chunk", + created=created_timestamp, + model=model, + choices=[ + ChatCompletionStreamChoice( + index=i, + delta=DeltaMessage(role="assistant", content=""), + finish_reason=None, + ) + ], + ) + yield f"data: {first_response.model_dump_json(exclude_none=True)}\n\n" + + # Stream content chunks + for chunk_idx, chunk in enumerate(chunks): + await asyncio.sleep(latencies[chunk_idx]) + + for i in range(n_choices): + chunk_response = ChatCompletionStreamResponse( + id=completion_id, + object="chat.completion.chunk", + created=created_timestamp, + model=model, + choices=[ + ChatCompletionStreamChoice( + index=i, + delta=DeltaMessage(content=chunk), + finish_reason=None, + ) + ], + ) + yield f"data: {chunk_response.model_dump_json(exclude_none=True)}\n\n" + + # Final chunk with finish_reason + for i in range(n_choices): + final_response = ChatCompletionStreamResponse( + id=completion_id, + object="chat.completion.chunk", + created=created_timestamp, + model=model, + choices=[ + ChatCompletionStreamChoice( + index=i, + delta=DeltaMessage(), + finish_reason="stop", + ) + ], + ) + yield f"data: {final_response.model_dump_json(exclude_none=True)}\n\n" + + yield "data: [DONE]\n\n" + + +async def stream_completion( + completion_id: str, + model: str, + response_text: str, + config: ModelSettings, + n: int = 1, +) -> AsyncGenerator[str, None]: + """Generate Server-Sent Events for streaming text completions. 
+ + Args: + completion_id: Unique ID for this completion + model: Model name + response_text: Full response text to stream + config: Model settings for latency configuration + n: Number of choices to generate + """ + created_timestamp = int(time.time()) + chunks = split_response_into_chunks(response_text) + latencies = generate_chunk_latencies(config, len(chunks)) + + # Stream content chunks + for chunk_idx, chunk in enumerate(chunks): + await asyncio.sleep(latencies[chunk_idx]) + + for i in range(n): + chunk_response = CompletionStreamResponse( + id=completion_id, + object="text_completion", + created=created_timestamp, + model=model, + choices=[ + CompletionStreamChoice( + text=chunk, + index=i, + logprobs=None, + finish_reason=None, + ) + ], + ) + yield f"data: {chunk_response.model_dump_json(exclude_none=True)}\n\n" + + # Final chunk with finish_reason + for i in range(n): + final_response = CompletionStreamResponse( + id=completion_id, + object="text_completion", + created=created_timestamp, + model=model, + choices=[ + CompletionStreamChoice( + text="", + index=i, + logprobs=None, + finish_reason="stop", + ) + ], + ) + yield f"data: {final_response.model_dump_json(exclude_none=True)}\n\n" + + yield "data: [DONE]\n\n" + + +@app.post("/v1/chat/completions", response_model=None) +async def chat_completions( + request: ChatCompletionRequest, config: ModelSettingsDep +) -> Union[ChatCompletionResponse, StreamingResponse]: """Create a chat completion.""" log.debug("/v1/chat/completions request: %s", request) @@ -131,6 +280,23 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings # Generate dummy response response_content = get_response(config) + completion_id = generate_id("chatcmpl") + + # Handle streaming response + if request.stream: + log.debug("/v1/chat/completions streaming response for id: %s", completion_id) + return StreamingResponse( + stream_chat_completion( + completion_id=completion_id, + model=request.model, + response_content=response_content, + config=config, + n_choices=request.n or 1, + ), + media_type="text/event-stream", + ) + + # Non-streaming response response_latency_seconds = get_latency_seconds(config) # Calculate token usage @@ -139,7 +305,6 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings completion_tokens = calculate_tokens(response_content) # Create response - completion_id = generate_id("chatcmpl") created_timestamp = int(time.time()) choices = [] @@ -168,8 +333,10 @@ async def chat_completions(request: ChatCompletionRequest, config: ModelSettings return response -@app.post("/v1/completions", response_model=CompletionResponse) -async def completions(request: CompletionRequest, config: ModelSettingsDep) -> CompletionResponse: +@app.post("/v1/completions", response_model=None) +async def completions( + request: CompletionRequest, config: ModelSettingsDep +) -> Union[CompletionResponse, StreamingResponse]: """Create a text completion.""" log.debug("/v1/completions request: %s", request) @@ -185,6 +352,23 @@ async def completions(request: CompletionRequest, config: ModelSettingsDep) -> C # Generate dummy response response_text = get_response(config) + completion_id = generate_id("cmpl") + + # Handle streaming response + if request.stream: + log.debug("/v1/completions streaming response for id: %s", completion_id) + return StreamingResponse( + stream_completion( + completion_id=completion_id, + model=request.model, + response_text=response_text, + config=config, + n=request.n or 1, + ), + 
media_type="text/event-stream", + ) + + # Non-streaming response response_latency_seconds = get_latency_seconds(config) # Calculate token usage @@ -192,7 +376,6 @@ async def completions(request: CompletionRequest, config: ModelSettingsDep) -> C completion_tokens = calculate_tokens(response_text) # Create response - completion_id = generate_id("cmpl") created_timestamp = int(time.time()) choices = [] diff --git a/benchmark/mock_llm_server/config.py b/benchmark/mock_llm_server/config.py index 0bb60edc6..2aa5e36bc 100644 --- a/benchmark/mock_llm_server/config.py +++ b/benchmark/mock_llm_server/config.py @@ -37,10 +37,24 @@ class ModelSettings(BaseSettings): # Config with default values # Latency sampled from a truncated-normal distribution. # Plain Normal distributions have infinite support, and can be negative - latency_min_seconds: float = Field(default=0.1, description="Minimum latency in seconds") - latency_max_seconds: float = Field(default=5, description="Maximum latency in seconds") - latency_mean_seconds: float = Field(default=0.5, description="The average response time in seconds") - latency_std_seconds: float = Field(default=0.1, description="Standard deviation of response time") + e2e_latency_min_seconds: float = Field(default=0.1, description="Minimum latency in seconds") + e2e_latency_max_seconds: float = Field(default=5, description="Maximum latency in seconds") + e2e_latency_mean_seconds: float = Field(default=0.5, description="The average response time in seconds") + e2e_latency_std_seconds: float = Field(default=0.1, description="Standard deviation of response time") + + # Streaming latency: Time to First Token (TTFT) + # https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html#time-to-first-token-ttft + ttft_min_seconds: float = Field(default=0.05, description="Minimum TTFT in seconds") + ttft_max_seconds: float = Field(default=0.5, description="Maximum TTFT in seconds") + ttft_mean_seconds: float = Field(default=0.1, description="Average TTFT in seconds") + ttft_std_seconds: float = Field(default=0.02, description="Standard deviation of TTFT") + + # Streaming latency: Chunk Latency + # https://docs.nvidia.com/nim/benchmarking/llm/latest/metrics.html#inter-token-latency-itl + chunk_latency_min_seconds: float = Field(default=0.01, description="Minimum chunk latency in seconds") + chunk_latency_max_seconds: float = Field(default=0.1, description="Maximum chunk latency in seconds") + chunk_latency_mean_seconds: float = Field(default=0.03, description="Average chunk latency in seconds") + chunk_latency_std_seconds: float = Field(default=0.01, description="Standard deviation of chunk latency") model_config = SettingsConfigDict(env_file=CONFIG_FILE) diff --git a/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env b/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env index d1cc35477..0cec095b3 100644 --- a/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env +++ b/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env @@ -2,7 +2,18 @@ MODEL="meta/llama-3.3-70b-instruct" UNSAFE_PROBABILITY=0.0 UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?" SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." 
-LATENCY_MIN_SECONDS=4.0 -LATENCY_MAX_SECONDS=4.0 -LATENCY_MEAN_SECONDS=4.0 -LATENCY_STD_SECONDS=0.0 +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=4.0 +E2E_LATENCY_MAX_SECONDS=4.0 +E2E_LATENCY_MEAN_SECONDS=4.0 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.3 +TTFT_MAX_SECONDS=0.3 +TTFT_MEAN_SECONDS=0.3 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env b/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env index 2ae46d578..8d11bbaa7 100644 --- a/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env +++ b/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env @@ -2,7 +2,18 @@ MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety" UNSAFE_PROBABILITY=0.03 UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}" SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}" -LATENCY_MIN_SECONDS=0.5 -LATENCY_MAX_SECONDS=0.5 -LATENCY_MEAN_SECONDS=0.5 -LATENCY_STD_SECONDS=0.0 +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=0.5 +E2E_LATENCY_MAX_SECONDS=0.5 +E2E_LATENCY_MEAN_SECONDS=0.5 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.2 +TTFT_MAX_SECONDS=0.2 +TTFT_MEAN_SECONDS=0.2 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/benchmark/mock_llm_server/models.py b/benchmark/mock_llm_server/models.py index 804171a39..ae17c693b 100644 --- a/benchmark/mock_llm_server/models.py +++ b/benchmark/mock_llm_server/models.py @@ -78,6 +78,21 @@ class ChatCompletionChoice(BaseModel): finish_reason: str = Field(..., description="The reason the model stopped generating") +class DeltaMessage(BaseModel): + """Delta message for streaming responses.""" + + role: Optional[str] = Field(default=None, description="The role of the message author") + content: Optional[str] = Field(default=None, description="The content delta") + + +class ChatCompletionStreamChoice(BaseModel): + """Chat completion streaming choice - https://platform.openai.com/docs/api-reference/chat/streaming""" + + index: int = Field(..., description="The index of this choice") + delta: DeltaMessage = Field(..., description="The delta message content") + finish_reason: Optional[str] = Field(None, description="The reason the model stopped generating") + + class CompletionChoice(BaseModel): """Text completion choice.""" @@ -87,6 +102,15 @@ class CompletionChoice(BaseModel): finish_reason: str = Field(..., description="The reason the model stopped generating") +class CompletionStreamChoice(BaseModel): + """Text completion streaming choice.""" + + text: str = Field(..., description="The generated text delta") + index: int = Field(..., description="The index of this choice") + logprobs: Optional[dict[str, Any]] = Field(None, description="Log probability information") + finish_reason: Optional[str] = Field(None, description="The reason the model stopped generating") + + class ChatCompletionResponse(BaseModel): """Chat completion response - 
https://platform.openai.com/docs/api-reference/chat/object""" @@ -98,6 +122,16 @@ class ChatCompletionResponse(BaseModel): usage: Usage = Field(..., description="Token usage information") +class ChatCompletionStreamResponse(BaseModel): + """Chat completion streaming response chunk - https://platform.openai.com/docs/api-reference/chat/streaming""" + + id: str = Field(..., description="Unique identifier for the completion") + object: str = Field("chat.completion.chunk", description="Object type") + created: int = Field(..., description="Unix timestamp when the completion was created") + model: str = Field(..., description="The model used for completion") + choices: list[ChatCompletionStreamChoice] = Field(..., description="List of completion choices") + + class CompletionResponse(BaseModel): """Text completion response. https://platform.openai.com/docs/api-reference/completions/object""" @@ -109,6 +143,16 @@ class CompletionResponse(BaseModel): usage: Usage = Field(..., description="Token usage information") +class CompletionStreamResponse(BaseModel): + """Text completion streaming response chunk.""" + + id: str = Field(..., description="Unique identifier for the completion") + object: str = Field("text_completion", description="Object type") + created: int = Field(..., description="Unix timestamp when the completion was created") + model: str = Field(..., description="The model used for completion") + choices: list[CompletionStreamChoice] = Field(..., description="List of completion choices") + + class Model(BaseModel): """Model information.""" diff --git a/benchmark/mock_llm_server/response_data.py b/benchmark/mock_llm_server/response_data.py index 27f035266..6868c408b 100644 --- a/benchmark/mock_llm_server/response_data.py +++ b/benchmark/mock_llm_server/response_data.py @@ -41,20 +41,24 @@ def get_response(config: ModelSettings, seed: Optional[int] = None) -> str: def get_latency_seconds(config: ModelSettings, seed: Optional[int] = None) -> float: - """Sample latency for this request using the model's config - Very inefficient to generate each sample singly rather than in batch + """Sample end-to-end latency for this request using the model's config. + Very inefficient to generate each sample singly rather than in batch. """ if seed: np.random.seed(seed) # Sample from the normal distribution using model config - latency_seconds = np.random.normal(loc=config.latency_mean_seconds, scale=config.latency_std_seconds, size=1) + latency_seconds = np.random.normal( + loc=config.e2e_latency_mean_seconds, + scale=config.e2e_latency_std_seconds, + size=1, + ) # Truncate distribution's support using min and max config values latency_seconds = np.clip( latency_seconds, - a_min=config.latency_min_seconds, - a_max=config.latency_max_seconds, + a_min=config.e2e_latency_min_seconds, + a_max=config.e2e_latency_max_seconds, ) return float(latency_seconds[0]) @@ -68,3 +72,73 @@ def is_unsafe(config: ModelSettings, seed: Optional[int] = None) -> bool: refusal = np.random.binomial(n=1, p=config.unsafe_probability, size=1) return bool(refusal[0]) + + +def split_response_into_chunks(response_text: str) -> list[str]: + """Split response text by whitespace into chunks for streaming. + + Each word (and any attached punctuation) becomes a separate chunk. + Whitespace is preserved by appending a space after each chunk except the last. 
+ + Args: + response_text: The full response text to split + + Returns: + List of text chunks to stream back + """ + words = response_text.split() + if not words: + return [] + + # Add space after each word except the last to preserve original spacing + chunks = [word + " " for word in words[:-1]] + chunks.append(words[-1]) # Last word without trailing space + return chunks + + +def generate_chunk_latencies( + config: ModelSettings, + num_chunks: int, + seed: Optional[int] = None, +) -> np.ndarray: + """Generate latencies for each streaming chunk. + + Uses TTFT (Time to First Token) for the first chunk and ITL (Inter-Token Latency) + for subsequent chunks. Both are sampled from truncated normal distributions. + + Args: + config: Model settings containing TTFT and ITL parameters + num_chunks: Number of chunks to generate latencies for + seed: Optional random seed for reproducibility + + Returns: + Numpy array of latencies in seconds, one for each chunk + """ + if num_chunks <= 0: + return np.array([]) + + if seed: + np.random.seed(seed) + + latencies = np.zeros(num_chunks) + + # First chunk uses TTFT + ttft = np.random.normal(loc=config.ttft_mean_seconds, scale=config.ttft_std_seconds, size=1) + ttft = np.clip(ttft, a_min=config.ttft_min_seconds, a_max=config.ttft_max_seconds) + latencies[0] = ttft[0] + + # Remaining chunks use Inter Token Latencies + if num_chunks > 1: + inter_token_latencies = np.random.normal( + loc=config.chunk_latency_mean_seconds, + scale=config.chunk_latency_std_seconds, + size=num_chunks - 1, + ) + inter_token_latencies = np.clip( + inter_token_latencies, + a_min=config.chunk_latency_min_seconds, + a_max=config.chunk_latency_max_seconds, + ) + latencies[1:] = inter_token_latencies + + return latencies diff --git a/benchmark/tests/test_mock_api.py b/benchmark/tests/test_mock_api.py index 96d665dca..bed4c259c 100644 --- a/benchmark/tests/test_mock_api.py +++ b/benchmark/tests/test_mock_api.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import time import pytest @@ -21,17 +22,29 @@ from benchmark.mock_llm_server.api import app from benchmark.mock_llm_server.config import ModelSettings, get_settings +UNSAFE_RESPONSE = "I cannot help with that request" +SAFE_RESPONSE = "This is a safe response, with many words to test streaming where we split on whitespace" + def get_test_settings(): return ModelSettings( model="gpt-3.5-turbo", - unsafe_probability=0.1, - unsafe_text="I cannot help with that request", - safe_text="This is a safe response", - latency_min_seconds=0, - latency_max_seconds=0, - latency_mean_seconds=0, - latency_std_seconds=0, + unsafe_probability=0.0, + unsafe_text=UNSAFE_RESPONSE, + safe_text=SAFE_RESPONSE, + e2e_latency_min_seconds=0, + e2e_latency_max_seconds=0, + e2e_latency_mean_seconds=0, + e2e_latency_std_seconds=0, + # Streaming latency settings (set to 0 for fast tests) + ttft_min_seconds=0, + ttft_max_seconds=0, + ttft_mean_seconds=0, + ttft_std_seconds=0, + chunk_latency_min_seconds=0, + chunk_latency_max_seconds=0, + chunk_latency_mean_seconds=0, + chunk_latency_std_seconds=0, ) @@ -359,6 +372,66 @@ def test_validate_request_model_invalid(self, client): assert "gpt-3.5-turbo" in response.json()["detail"] +def get_safe_only_settings(): + """Settings with unsafe_probability=0 (always safe response).""" + return ModelSettings( + model="gpt-3.5-turbo", + unsafe_probability=0.0, + unsafe_text=UNSAFE_RESPONSE, + safe_text=SAFE_RESPONSE, + e2e_latency_min_seconds=0, + e2e_latency_max_seconds=0, + e2e_latency_mean_seconds=0, + e2e_latency_std_seconds=0, + ttft_min_seconds=0, + ttft_max_seconds=0, + ttft_mean_seconds=0, + ttft_std_seconds=0, + chunk_latency_min_seconds=0, + chunk_latency_max_seconds=0, + chunk_latency_mean_seconds=0, + chunk_latency_std_seconds=0, + ) + + +def get_unsafe_only_settings(): + """Settings with unsafe_probability=1.0 (always unsafe response).""" + return ModelSettings( + model="gpt-3.5-turbo", + unsafe_probability=1.0, + unsafe_text=UNSAFE_RESPONSE, + safe_text=SAFE_RESPONSE, + e2e_latency_min_seconds=0, + e2e_latency_max_seconds=0, + e2e_latency_mean_seconds=0, + e2e_latency_std_seconds=0, + ttft_min_seconds=0, + ttft_max_seconds=0, + ttft_mean_seconds=0, + ttft_std_seconds=0, + chunk_latency_min_seconds=0, + chunk_latency_max_seconds=0, + chunk_latency_mean_seconds=0, + chunk_latency_std_seconds=0, + ) + + +@pytest.fixture +def safe_client(): + """Create a test client that always returns safe responses.""" + app.dependency_overrides[get_settings] = get_safe_only_settings + yield TestClient(app) + app.dependency_overrides[get_settings] = get_test_settings + + +@pytest.fixture +def unsafe_client(): + """Create a test client that always returns unsafe responses.""" + app.dependency_overrides[get_settings] = get_unsafe_only_settings + yield TestClient(app) + app.dependency_overrides[get_settings] = get_test_settings + + class TestResponseContent: """Test that responses contain expected content.""" @@ -373,7 +446,7 @@ def test_chat_response_content_type(self, client): content = data["choices"][0]["message"]["content"] # Should be one of the configured responses - assert content in ["This is a safe response", "I cannot help with that request"] + assert content == SAFE_RESPONSE def test_completion_response_content_type(self, client): """Test that completion response contains expected text.""" @@ -386,4 +459,297 @@ def test_completion_response_content_type(self, client): text = data["choices"][0]["text"] # Should be one of the configured responses - assert text in ["This 
is a safe response", "I cannot help with that request"] + assert text in {SAFE_RESPONSE, UNSAFE_RESPONSE} + + def test_chat_completions_safe_response_when_probability_zero(self, safe_client): + """Test that chat completions returns safe response when unsafe_probability=0.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Test"}], + } + response = safe_client.post("/v1/chat/completions", json=payload) + data = response.json() + + content = data["choices"][0]["message"]["content"] + assert content == SAFE_RESPONSE + + def test_chat_completions_unsafe_response_when_probability_one(self, unsafe_client): + """Test that chat completions returns unsafe response when unsafe_probability=1.0.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Test"}], + } + response = unsafe_client.post("/v1/chat/completions", json=payload) + data = response.json() + + content = data["choices"][0]["message"]["content"] + assert content == UNSAFE_RESPONSE + + def test_completions_safe_response_when_probability_zero(self, safe_client): + """Test that completions returns safe response when unsafe_probability=0.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + } + response = safe_client.post("/v1/completions", json=payload) + data = response.json() + + text = data["choices"][0]["text"] + assert text == SAFE_RESPONSE + + def test_completions_unsafe_response_when_probability_one(self, unsafe_client): + """Test that completions returns unsafe response when unsafe_probability=1.0.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + } + response = unsafe_client.post("/v1/completions", json=payload) + data = response.json() + + text = data["choices"][0]["text"] + assert text == UNSAFE_RESPONSE + + +class TestChatCompletionsStreaming: + """Test the /v1/chat/completions endpoint with streaming.""" + + def test_chat_completions_streaming_returns_sse(self, client): + """Test that streaming returns Server-Sent Events format.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + assert response.status_code == 200 + assert response.headers["content-type"] == "text/event-stream; charset=utf-8" + + def test_chat_completions_streaming_chunks_format(self, client): + """Test that streaming chunks have correct SSE format.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + # Each chunk should start with "data: " + lines = [line for line in content.split("\n") if line.strip()] + for line in lines: + assert line.startswith("data: ") + + # Should end with [DONE] + assert "data: [DONE]" in content + + def test_chat_completions_streaming_first_chunk_has_role(self, client): + """Test that first streaming chunk contains role.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + # Get first data chunk (skip empty lines) + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + first_chunk = json.loads(lines[0].replace("data: ", "")) + + assert first_chunk["object"] == "chat.completion.chunk" + assert first_chunk["choices"][0]["delta"]["role"] 
== "assistant" + assert first_chunk["choices"][0]["delta"]["content"] == "" + + def test_chat_completions_streaming_content_chunks(self, client): + """Test that content chunks only have content field.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + # Skip first chunk (role) and last chunk (finish_reason) + content_chunks = lines[1:-1] + for line in content_chunks: + chunk = json.loads(line.replace("data: ", "")) + delta = chunk["choices"][0]["delta"] + # Should only have content, no role + assert "content" in delta + assert "role" not in delta + + def test_chat_completions_streaming_final_chunk(self, client): + """Test that final chunk has finish_reason and empty delta.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + last_chunk = json.loads(lines[-1].replace("data: ", "")) + + assert last_chunk["choices"][0]["finish_reason"] == "stop" + assert last_chunk["choices"][0]["delta"] == {} + + def test_chat_completions_streaming_reconstructs_response(self, client): + """Test that concatenating chunks reconstructs the full response.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + # Reconstruct content from all chunks + full_content = "" + for line in lines: + chunk = json.loads(line.replace("data: ", "")) + delta = chunk["choices"][0]["delta"] + if "content" in delta and delta["content"]: + full_content += delta["content"] + + # Should be one of the configured responses + assert full_content == SAFE_RESPONSE + + def test_chat_completions_streaming_consistent_id(self, client): + """Test that all chunks have the same ID.""" + payload = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "stream": True, + } + response = client.post("/v1/chat/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + ids = set() + for line in lines: + chunk = json.loads(line.replace("data: ", "")) + ids.add(chunk["id"]) + + # All chunks should have the same ID + assert len(ids) == 1 + # ID should have correct prefix + assert list(ids)[0].startswith("chatcmpl-") + + +class TestCompletionsStreaming: + """Test the /v1/completions endpoint with streaming.""" + + def test_completions_streaming_returns_sse(self, client): + """Test that streaming returns Server-Sent Events format.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Once upon a time", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + assert response.status_code == 200 + assert response.headers["content-type"] == "text/event-stream; charset=utf-8" + + def test_completions_streaming_chunks_format(self, client): + """Test that streaming chunks have correct SSE format.""" + payload = { + 
"model": "gpt-3.5-turbo", + "prompt": "Test", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + content = response.text + + # Each chunk should start with "data: " + lines = [line for line in content.split("\n") if line.strip()] + for line in lines: + assert line.startswith("data: ") + + # Should end with [DONE] + assert "data: [DONE]" in content + + def test_completions_streaming_content_chunks(self, client): + """Test that content chunks have text field.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + # All chunks except last should have text content + content_chunks = lines[:-1] + for line in content_chunks: + chunk = json.loads(line.replace("data: ", "")) + assert chunk["object"] == "text_completion" + assert "text" in chunk["choices"][0] + + def test_completions_streaming_final_chunk(self, client): + """Test that final chunk has finish_reason.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + last_chunk = json.loads(lines[-1].replace("data: ", "")) + + assert last_chunk["choices"][0]["finish_reason"] == "stop" + + def test_completions_streaming_reconstructs_response(self, client): + """Test that concatenating chunks reconstructs the full response.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + # Reconstruct content from all chunks + full_content = "" + for line in lines: + chunk = json.loads(line.replace("data: ", "")) + text = chunk["choices"][0]["text"] + if text: + full_content += text + + # Should be one of the configured responses + assert full_content == SAFE_RESPONSE + + def test_completions_streaming_consistent_id(self, client): + """Test that all chunks have the same ID.""" + payload = { + "model": "gpt-3.5-turbo", + "prompt": "Test", + "stream": True, + } + response = client.post("/v1/completions", json=payload) + content = response.text + + lines = [line for line in content.split("\n") if line.startswith("data: ") and line != "data: [DONE]"] + + ids = set() + for line in lines: + chunk = json.loads(line.replace("data: ", "")) + ids.add(chunk["id"]) + + # All chunks should have the same ID + assert len(ids) == 1 + # ID should have correct prefix + assert list(ids)[0].startswith("cmpl-") diff --git a/benchmark/tests/test_mock_config.py b/benchmark/tests/test_mock_config.py index 4b9ac6231..62343df00 100644 --- a/benchmark/tests/test_mock_config.py +++ b/benchmark/tests/test_mock_config.py @@ -32,10 +32,10 @@ def test_app_model_config_with_defaults(self): ) # Check defaults assert config.unsafe_probability == 0.1 - assert config.latency_min_seconds == 0.1 - assert config.latency_max_seconds == 5 - assert config.latency_mean_seconds == 0.5 - assert config.latency_std_seconds == 0.1 + assert config.e2e_latency_min_seconds == 0.1 + assert config.e2e_latency_max_seconds == 5 + assert config.e2e_latency_mean_seconds == 0.5 + assert config.e2e_latency_std_seconds == 
0.1 def test_app_model_config_missing_required_field(self): """Test that missing required fields raise validation error.""" diff --git a/benchmark/tests/test_mock_response_data.py b/benchmark/tests/test_mock_response_data.py index cd6704193..688655828 100644 --- a/benchmark/tests/test_mock_response_data.py +++ b/benchmark/tests/test_mock_response_data.py @@ -22,10 +22,12 @@ from benchmark.mock_llm_server.config import ModelSettings from benchmark.mock_llm_server.response_data import ( calculate_tokens, + generate_chunk_latencies, generate_id, get_latency_seconds, get_response, is_unsafe, + split_response_into_chunks, ) @@ -102,10 +104,10 @@ def model_settings() -> ModelSettings: unsafe_probability=0.5, unsafe_text="Sorry Dave, I'm afraid I can't do that.", safe_text="I'm an AI assistant and am happy to help", - latency_min_seconds=0.2, - latency_max_seconds=1.0, - latency_mean_seconds=0.5, - latency_std_seconds=0.1, + e2e_latency_min_seconds=0.2, + e2e_latency_max_seconds=1.0, + e2e_latency_mean_seconds=0.5, + e2e_latency_std_seconds=0.1, ) return settings @@ -182,22 +184,22 @@ def test_get_response_unsafe(model_settings: ModelSettings): def test_get_latency_seconds_mocks_no_seed(mock_clip, mock_normal, mock_seed, model_settings: ModelSettings): """Check we call the correct numpy functions (not including seed)""" - mock_normal.return_value = np.array([model_settings.latency_mean_seconds]) - mock_clip.return_value = np.array([model_settings.latency_max_seconds]) + mock_normal.return_value = np.array([model_settings.e2e_latency_mean_seconds]) + mock_clip.return_value = np.array([model_settings.e2e_latency_max_seconds]) result = get_latency_seconds(model_settings) assert result == mock_clip.return_value assert mock_seed.call_count == 0 mock_normal.assert_called_once_with( - loc=model_settings.latency_mean_seconds, - scale=model_settings.latency_std_seconds, + loc=model_settings.e2e_latency_mean_seconds, + scale=model_settings.e2e_latency_std_seconds, size=1, ) mock_clip.assert_called_once_with( mock_normal.return_value, - a_min=model_settings.latency_min_seconds, - a_max=model_settings.latency_max_seconds, + a_min=model_settings.e2e_latency_min_seconds, + a_max=model_settings.e2e_latency_max_seconds, ) @@ -209,20 +211,263 @@ def test_get_latency_seconds_mocks_with_seed( ): """Check we call the correct numpy functions (not including seed)""" - mock_normal.return_value = np.array([model_settings.latency_mean_seconds]) - mock_clip.return_value = np.array([model_settings.latency_max_seconds]) + mock_normal.return_value = np.array([model_settings.e2e_latency_mean_seconds]) + mock_clip.return_value = np.array([model_settings.e2e_latency_max_seconds]) result = get_latency_seconds(model_settings, seed=random_seed) assert result == mock_clip.return_value mock_seed.assert_called_once_with(random_seed) mock_normal.assert_called_once_with( - loc=model_settings.latency_mean_seconds, - scale=model_settings.latency_std_seconds, + loc=model_settings.e2e_latency_mean_seconds, + scale=model_settings.e2e_latency_std_seconds, size=1, ) mock_clip.assert_called_once_with( mock_normal.return_value, - a_min=model_settings.latency_min_seconds, - a_max=model_settings.latency_max_seconds, + a_min=model_settings.e2e_latency_min_seconds, + a_max=model_settings.e2e_latency_max_seconds, ) + + +class TestSplitResponseIntoChunks: + """Test the split_response_into_chunks function.""" + + def test_split_simple_sentence(self): + """Test splitting a simple sentence into word chunks.""" + text = "Hello world" + chunks = 
split_response_into_chunks(text) + assert chunks == ["Hello ", "world"] + + def test_split_multiple_words(self): + """Test splitting multiple words preserves spacing.""" + text = "I can help you" + chunks = split_response_into_chunks(text) + assert chunks == ["I ", "can ", "help ", "you"] + + def test_split_empty_string(self): + """Test splitting an empty string returns empty list.""" + text = "" + chunks = split_response_into_chunks(text) + assert chunks == [] + + def test_split_single_word(self): + """Test splitting a single word returns it without trailing space.""" + text = "Hello" + chunks = split_response_into_chunks(text) + assert chunks == ["Hello"] + + def test_split_preserves_punctuation(self): + """Test that punctuation stays attached to words.""" + text = "Hello, world!" + chunks = split_response_into_chunks(text) + assert chunks == ["Hello, ", "world!"] + + def test_split_reconstructs_original(self): + """Test that joining chunks reconstructs the original text.""" + text = "I can provide information and help with a wide range of topics." + chunks = split_response_into_chunks(text) + reconstructed = "".join(chunks) + assert reconstructed == text + + def test_split_whitespace_only(self): + """Test splitting whitespace-only string returns empty list.""" + text = " " + chunks = split_response_into_chunks(text) + assert chunks == [] + + +class TestGenerateChunkLatencies: + """Test the generate_chunk_latencies function.""" + + @pytest.fixture + def streaming_settings(self) -> ModelSettings: + """Generate config data with streaming latency settings. + Each value should be unique to make sure we pass the correct configs to numpy functions + """ + return ModelSettings( + model="test-model", + unsafe_text="Unsafe", + safe_text="Safe", + ttft_min_seconds=0.1, + ttft_max_seconds=4.0, + ttft_mean_seconds=1.3, + ttft_std_seconds=0.96, + chunk_latency_min_seconds=0.01, + chunk_latency_max_seconds=0.12, + chunk_latency_mean_seconds=0.05, + chunk_latency_std_seconds=0.02, + ) + + def test_generate_latencies_zero_chunks(self, streaming_settings: ModelSettings): + """Test generating latencies for zero chunks returns empty array.""" + latencies = generate_chunk_latencies(streaming_settings, 0) + assert len(latencies) == 0 + assert isinstance(latencies, np.ndarray) + + def test_generate_latencies_negative_chunks(self, streaming_settings: ModelSettings): + """Test generating latencies for negative chunks returns empty array.""" + latencies = generate_chunk_latencies(streaming_settings, -1) + assert len(latencies) == 0 + + @patch("benchmark.mock_llm_server.response_data.np.random.seed") + @patch("benchmark.mock_llm_server.response_data.np.random.normal") + @patch("benchmark.mock_llm_server.response_data.np.clip") + def test_generate_latencies_single_chunk_no_seed( + self, + mock_clip: MagicMock, + mock_normal: MagicMock, + mock_seed: MagicMock, + streaming_settings: ModelSettings, + ): + """Test single chunk calls TTFT normal and clip, no seed call.""" + mock_normal.return_value = np.array([streaming_settings.ttft_mean_seconds]) + mock_clip.return_value = np.array([streaming_settings.ttft_mean_seconds]) + + latencies = generate_chunk_latencies(streaming_settings, 1) + + assert len(latencies) == 1 + assert mock_seed.call_count == 0 + mock_normal.assert_called_once_with( + loc=streaming_settings.ttft_mean_seconds, + scale=streaming_settings.ttft_std_seconds, + size=1, + ) + mock_clip.assert_called_once_with( + mock_normal.return_value, + a_min=streaming_settings.ttft_min_seconds, + 
a_max=streaming_settings.ttft_max_seconds, + ) + + @patch("benchmark.mock_llm_server.response_data.np.random.seed") + @patch("benchmark.mock_llm_server.response_data.np.random.normal") + @patch("benchmark.mock_llm_server.response_data.np.clip") + def test_generate_latencies_single_chunk_with_seed( + self, + mock_clip: MagicMock, + mock_normal: MagicMock, + mock_seed: MagicMock, + streaming_settings: ModelSettings, + ): + """Test single chunk with seed calls np.random.seed.""" + mock_normal.return_value = np.array([streaming_settings.ttft_mean_seconds]) + mock_clip.return_value = np.array([streaming_settings.ttft_min_seconds]) + seed_value = 42 + + latencies = generate_chunk_latencies(streaming_settings, 1, seed=seed_value) + + assert len(latencies) == 1 + mock_seed.assert_called_once_with(seed_value) + mock_normal.assert_called_once_with( + loc=streaming_settings.ttft_mean_seconds, + scale=streaming_settings.ttft_std_seconds, + size=1, + ) + mock_clip.assert_called_once_with( + mock_normal.return_value, + a_min=streaming_settings.ttft_min_seconds, + a_max=streaming_settings.ttft_max_seconds, + ) + + @patch("benchmark.mock_llm_server.response_data.np.random.seed") + @patch("benchmark.mock_llm_server.response_data.np.random.normal") + @patch("benchmark.mock_llm_server.response_data.np.clip") + def test_generate_latencies_multiple_chunks_no_seed( + self, + mock_clip: MagicMock, + mock_normal: MagicMock, + mock_seed: MagicMock, + streaming_settings: ModelSettings, + ): + """Test multiple chunks calls TTFT then ITL normal and clip.""" + num_chunks = 5 + ttft_value = np.array([streaming_settings.ttft_mean_seconds]) + chunk_values = np.array([0.01, 0.02, 0.03, 0.04]) + + # First call returns TTFT, second call returns ITL values + mock_normal.side_effect = [ttft_value, chunk_values] + mock_clip.side_effect = [ttft_value, chunk_values] + + latencies = generate_chunk_latencies(streaming_settings, num_chunks) + + assert len(latencies) == num_chunks + assert mock_seed.call_count == 0 + assert mock_normal.call_count == 2 + assert mock_clip.call_count == 2 + + # Check the TTFT Normal distribution + ttft_normal_call_args, ttft_normal_call_kwargs = mock_normal.call_args_list[0] + assert ttft_normal_call_args == () # All arguments are passed as kwargs, so args list is empty + assert ttft_normal_call_kwargs["loc"] == streaming_settings.ttft_mean_seconds + assert ttft_normal_call_kwargs["scale"] == streaming_settings.ttft_std_seconds + assert ttft_normal_call_kwargs["size"] == 1 + + # Check the ITL Normal distribution call (for all but the first chunk) + chunk_normal_call_args, chunk_normal_call_kwargs = mock_normal.call_args_list[1] + assert chunk_normal_call_args == () # All arguments are passed as kwargs, so args list is empty + assert chunk_normal_call_kwargs["loc"] == streaming_settings.chunk_latency_mean_seconds + assert chunk_normal_call_kwargs["scale"] == streaming_settings.chunk_latency_std_seconds + assert chunk_normal_call_kwargs["size"] == num_chunks - 1 + + # Check TTFT clip calls + ttft_clip_call_args, ttft_clip_call_kwargs = mock_clip.call_args_list[0] + assert ttft_clip_call_args[0] == ttft_value + assert ttft_clip_call_kwargs["a_max"] == streaming_settings.ttft_max_seconds + assert ttft_clip_call_kwargs["a_min"] == streaming_settings.ttft_min_seconds + + # Check ITL clip calls + chunk_clip_call_args, chunk_clip_call_kwargs = mock_clip.call_args_list[1] + np.testing.assert_array_equal(chunk_clip_call_args[0], chunk_values) + assert chunk_clip_call_kwargs["a_max"] == 
streaming_settings.chunk_latency_max_seconds
+        assert chunk_clip_call_kwargs["a_min"] == streaming_settings.chunk_latency_min_seconds
+
+    @patch("benchmark.mock_llm_server.response_data.np.random.seed")
+    @patch("benchmark.mock_llm_server.response_data.np.random.normal")
+    @patch("benchmark.mock_llm_server.response_data.np.clip")
+    def test_generate_latencies_multiple_chunks_with_seed(
+        self,
+        mock_clip: MagicMock,
+        mock_normal: MagicMock,
+        mock_seed: MagicMock,
+        streaming_settings: ModelSettings,
+    ):
+        """Test multiple chunks with seed calls np.random.seed once."""
+        num_chunks = 3
+        seed_value = 12345
+        ttft_value = np.array([streaming_settings.ttft_mean_seconds])
+        chunk_values = np.array([streaming_settings.chunk_latency_mean_seconds] * (num_chunks - 1))
+
+        mock_normal.side_effect = [ttft_value, chunk_values]
+        mock_clip.side_effect = [ttft_value, chunk_values]
+
+        latencies = generate_chunk_latencies(streaming_settings, num_chunks, seed=seed_value)
+
+        assert len(latencies) == num_chunks
+        mock_seed.assert_called_once_with(seed_value)
+
+        # Exact call arguments are tested in test_generate_latencies_multiple_chunks_no_seed, no need to retest
+        assert mock_normal.call_count == 2
+        assert mock_clip.call_count == 2
+
+    @patch("benchmark.mock_llm_server.response_data.np.random.seed")
+    @patch("benchmark.mock_llm_server.response_data.np.random.normal")
+    @patch("benchmark.mock_llm_server.response_data.np.clip")
+    def test_generate_latencies_returns_correct_values(
+        self,
+        mock_clip: MagicMock,
+        mock_normal: MagicMock,
+        mock_seed: MagicMock,  # keeps the three patch decorators aligned so streaming_settings stays the fixture
+        streaming_settings: ModelSettings,
+    ):
+        """Test that returned latencies contain the clipped values."""
+        num_chunks = 3
+        ttft_clipped = np.array([0.25])
+        chunk_clipped = np.array([0.04, 0.06])
+
+        mock_normal.side_effect = [np.array([0.3]), np.array([0.05, 0.07])]
+        mock_clip.side_effect = [ttft_clipped, chunk_clipped]
+
+        latencies = generate_chunk_latencies(streaming_settings, num_chunks)
+
+        assert len(latencies) == num_chunks
+        assert latencies[0] == ttft_clipped[0]
+        np.testing.assert_array_equal(latencies[1:], chunk_clipped)
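
Taken together, the streaming path sleeps once per chunk: a TTFT sample before the first content chunk and a chunk-latency (ITL) sample before each subsequent one, so a response that splits into W whitespace chunks streams in roughly TTFT + (W - 1) × chunk latency. Below is a minimal sketch of that arithmetic using the helpers added in `response_data.py`; the settings mirror `configs/meta-llama-3.3-70b-instruct.env` (std = 0 and min = max = mean, so the samples are deterministic), while the `safe_text`/`unsafe_text` strings are illustrative placeholders rather than the real config values.

```python
from benchmark.mock_llm_server.config import ModelSettings
from benchmark.mock_llm_server.response_data import (
    generate_chunk_latencies,
    split_response_into_chunks,
)

# Deterministic streaming latencies, mirroring configs/meta-llama-3.3-70b-instruct.env
settings = ModelSettings(
    model="meta/llama-3.3-70b-instruct",
    unsafe_text="I can't help with that.",  # placeholder
    safe_text="I can provide information and help with a wide range of topics.",  # placeholder
    ttft_min_seconds=0.3,
    ttft_max_seconds=0.3,
    ttft_mean_seconds=0.3,
    ttft_std_seconds=0.0,
    chunk_latency_min_seconds=0.015,
    chunk_latency_max_seconds=0.015,
    chunk_latency_mean_seconds=0.015,
    chunk_latency_std_seconds=0.0,
)

chunks = split_response_into_chunks(settings.safe_text)  # one chunk per whitespace-separated word
latencies = generate_chunk_latencies(settings, len(chunks))  # [TTFT, ITL, ITL, ...]

# The server sleeps latencies[i] before yielding chunk i, so the expected
# stream duration is simply the sum: TTFT + (len(chunks) - 1) * chunk latency.
print(f"{len(chunks)} chunks, expected stream duration ~= {latencies.sum():.3f} s")
```

For the 12-word placeholder above that comes to 0.3 + 11 × 0.015 = 0.465 seconds of simulated latency.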
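
The same SSE contract the streaming tests assert can also be exercised end to end against a running server. The sketch below is illustrative only and not part of the patch: the uvicorn command, base URL, and use of the `requests` package are assumptions, and the model name presumes the server was started with the `meta-llama-3.3-70b-instruct.env` config.

```python
import json
import time

import requests  # assumed dependency; any HTTP client with response streaming works

# Assumes the mock server is running locally, e.g.:
#   uvicorn benchmark.mock_llm_server.api:app --port 8000
# (run command, host, and port are assumptions, not part of this patch)
BASE_URL = "http://localhost:8000"

payload = {
    "model": "meta/llama-3.3-70b-instruct",  # must match the MODEL the server was configured with
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}

start = time.perf_counter()
first_content_at = None
pieces = []

with requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # Each SSE event is a 'data: {...}' line; the stream ends with 'data: [DONE]'.
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        content = delta.get("content")
        if content:  # skips the initial role-only chunk and the empty final chunk
            if first_content_at is None:
                first_content_at = time.perf_counter()
            pieces.append(content)

print("approx TTFT (s):", None if first_content_at is None else round(first_content_at - start, 3))
print("reconstructed response:", "".join(pieces))
```

Joining the `content` deltas should reproduce the configured SAFE_TEXT or UNSAFE_TEXT, mirroring `test_chat_completions_streaming_reconstructs_response` above.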