From 9fc6fc46132e72b57cb5cc79a864b9b644bafb12 Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Wed, 4 Jun 2025 15:21:04 -0700
Subject: [PATCH 1/3] bugfix(OAI): Fix image_data processing for jinja chat
 templates in OpenAI API adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using custom .jinja chat templates, image_data was being stripped from
messages before apply_chat_template(), causing the model not getting any image
inputs

Changes:
- Keep full multimodal content in messages passed to apply_chat_template()
- Extract image_data/audio_data for SGLang's multimodal processing pipeline
- Normalize OpenAI content types (image_url→image, audio_url→audio) for template compatibility
- Handle text simplification differently for multimodal vs non-multimodal models
- Update llama4 jinja template to support both 'image' and 'image_url' content types
- Add detailed TODO comments about future template content format detection
---
 python/sglang/srt/openai_api/adapter.py | 71 ++++++++++++++++++++-----
 1 file changed, 59 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 9daca75881e2..fb39e176a5e8 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -1000,20 +1000,67 @@ def v1_chat_generate_request(
 
             if chat_template_name is None:
                 openai_compatible_messages = []
+                image_data = []
+                audio_data = []
+                modalities = []
 
                 for message in request.messages:
                     if message.content is None:
                         message.content = ""
-                    msg_dict = message.dict()
+                    msg_dict = message.model_dump()
                     if isinstance(msg_dict.get("content"), list):
-                        for chunk in msg_dict["content"]:
-                            if isinstance(chunk, dict) and chunk.get("type") == "text":
-                                new_msg = msg_dict.copy()
-                                new_msg["content"] = chunk["text"]
-                                new_msg = {
-                                    k: v for k, v in new_msg.items() if v is not None
-                                }
-                                openai_compatible_messages.append(new_msg)
+                        if is_multimodal:
+                            # Extract image and audio data for mm processing while keeping full content for jinja template
+                            # NOTE: We assume common conventions: content['type'] == 'image'/'audio'/'text' but templates may vary.
+                            # This assumes the jinja template accepts message["content"] is a list. And it tries to parse
+                            # the content into a list of dictionaries with the following keys: "type", "image_url", "audio_url", "text".
+                            # TODO: Need to auto-detect template requirements by parsing jinja code or use template metadata.
+                            # Better modularization needed to handle different template content type expectations.
+                            processed_content_parts = []
+                            for chunk in msg_dict["content"]:
+                                if isinstance(chunk, dict):
+                                    chunk_type = chunk.get("type")
+
+                                    # Extract data and create normalized content dictionary
+                                    if chunk_type == "image_url":
+                                        image_data.append(chunk["image_url"]["url"])
+                                        if chunk.get("modalities"):
+                                            modalities.append(chunk.get("modalities"))
+                                        processed_content_parts.append(
+                                            {"type": "image"}
+                                        )
+                                    elif chunk_type == "audio_url":
+                                        audio_data.append(chunk["audio_url"]["url"])
+                                        processed_content_parts.append(
+                                            {"type": "audio"}
+                                        )
+                                    else:
+                                        processed_content_parts.append(chunk)
+
+                            # Create one message with all processed content list
+                            new_msg = {
+                                k: v
+                                for k, v in msg_dict.items()
+                                if v is not None and k != "content"
+                            }
+                            new_msg["content"] = processed_content_parts
+                            openai_compatible_messages.append(new_msg)
+                        else:
+                            # For non-multimodal models, we assume the content should be a string, such as
+                            # deeoseek-v3's jinja chat stemplate.
+                            for chunk in msg_dict["content"]:
+                                if (
+                                    isinstance(chunk, dict)
+                                    and chunk.get("type") == "text"
+                                ):
+                                    new_msg = msg_dict.copy()
+                                    new_msg["content"] = chunk["text"]
+                                    new_msg = {
+                                        k: v
+                                        for k, v in new_msg.items()
+                                        if v is not None
+                                    }
+                                    openai_compatible_messages.append(new_msg)
                     else:
                         msg_dict = {k: v for k, v in msg_dict.items() if v is not None}
                         openai_compatible_messages.append(msg_dict)
@@ -1070,9 +1117,9 @@ def v1_chat_generate_request(
                 if is_multimodal:
                     prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
                 stop = request.stop
-                image_data = None
-                audio_data = None
-                modalities = []
+                image_data = image_data if image_data else None
+                audio_data = audio_data if audio_data else None
+                modalities = modalities if modalities else []
             else:
                 conv = generate_chat_conv(request, chat_template_name)
                 # If we should continue the final assistant message, adjust the conversation.

From d918fb3d8a192392f60890d84d1c3d62dcda6ede Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Wed, 4 Jun 2025 21:17:16 -0700
Subject: [PATCH 2/3] Refactor: process message content according to a jinja
 template content format detection

---
 python/sglang/srt/openai_api/adapter.py |  93 +++++--------
 python/sglang/srt/openai_api/utils.py   | 173 ++++++++++++++++++++++++
 2 files changed, 210 insertions(+), 56 deletions(-)
 create mode 100644 python/sglang/srt/openai_api/utils.py

diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index fb39e176a5e8..719205b36d93 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -75,6 +75,10 @@
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.openai_api.utils import (
+    detect_template_content_format,
+    process_content_for_template_format,
+)
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
@@ -82,6 +86,11 @@
 
 chat_template_name = None
 
+# Global cache for template content format detection (one model/template per instance)
+# NOTE: A better approach would be to initialize the chat template format when the endpoint is created
+_cached_chat_template = None
+_cached_template_format = None
+
 
 class FileMetadata:
     def __init__(self, filename: str, purpose: str):
@@ -1004,66 +1013,38 @@ def v1_chat_generate_request(
                 audio_data = []
                 modalities = []
 
+                # Detect template content format by analyzing the jinja template (cached globally)
+                global _cached_chat_template, _cached_template_format
+                current_template = tokenizer_manager.tokenizer.chat_template
+
+                if current_template != _cached_chat_template:
+                    # Template changed or first time - analyze it
+                    _cached_chat_template = current_template
+                    _cached_template_format = detect_template_content_format(
+                        current_template
+                    )
+                    logger.info(
+                        f"Detected chat template content format: {_cached_template_format}"
+                    )
+
+                template_content_format = _cached_template_format
+
                 for message in request.messages:
                     if message.content is None:
                         message.content = ""
                     msg_dict = message.model_dump()
-                    if isinstance(msg_dict.get("content"), list):
-                        if is_multimodal:
-                            # Extract image and audio data for mm processing while keeping full content for jinja template
-                            # NOTE: We assume common conventions: content['type'] == 'image'/'audio'/'text' but templates may vary.
-                            # This assumes the jinja template accepts message["content"] is a list. And it tries to parse
-                            # the content into a list of dictionaries with the following keys: "type", "image_url", "audio_url", "text".
-                            # TODO: Need to auto-detect template requirements by parsing jinja code or use template metadata.
-                            # Better modularization needed to handle different template content type expectations.
-                            processed_content_parts = []
-                            for chunk in msg_dict["content"]:
-                                if isinstance(chunk, dict):
-                                    chunk_type = chunk.get("type")
-
-                                    # Extract data and create normalized content dictionary
-                                    if chunk_type == "image_url":
-                                        image_data.append(chunk["image_url"]["url"])
-                                        if chunk.get("modalities"):
-                                            modalities.append(chunk.get("modalities"))
-                                        processed_content_parts.append(
-                                            {"type": "image"}
-                                        )
-                                    elif chunk_type == "audio_url":
-                                        audio_data.append(chunk["audio_url"]["url"])
-                                        processed_content_parts.append(
-                                            {"type": "audio"}
-                                        )
-                                    else:
-                                        processed_content_parts.append(chunk)
-
-                            # Create one message with all processed content list
-                            new_msg = {
-                                k: v
-                                for k, v in msg_dict.items()
-                                if v is not None and k != "content"
-                            }
-                            new_msg["content"] = processed_content_parts
-                            openai_compatible_messages.append(new_msg)
-                        else:
-                            # For non-multimodal models, we assume the content should be a string, such as
-                            # deeoseek-v3's jinja chat stemplate.
-                            for chunk in msg_dict["content"]:
-                                if (
-                                    isinstance(chunk, dict)
-                                    and chunk.get("type") == "text"
-                                ):
-                                    new_msg = msg_dict.copy()
-                                    new_msg["content"] = chunk["text"]
-                                    new_msg = {
-                                        k: v
-                                        for k, v in new_msg.items()
-                                        if v is not None
-                                    }
-                                    openai_compatible_messages.append(new_msg)
-                    else:
-                        msg_dict = {k: v for k, v in msg_dict.items() if v is not None}
-                        openai_compatible_messages.append(msg_dict)
+
+                    # Process content based on detected template format
+                    processed_msg = process_content_for_template_format(
+                        msg_dict,
+                        template_content_format,
+                        image_data,
+                        audio_data,
+                        modalities,
+                    )
+                    openai_compatible_messages.append(processed_msg)
+
+                # Handle assistant prefix for continue_final_message
                 if (
                     openai_compatible_messages
                     and openai_compatible_messages[-1]["role"] == "assistant"
diff --git a/python/sglang/srt/openai_api/utils.py b/python/sglang/srt/openai_api/utils.py
new file mode 100644
index 000000000000..2cd353c7e6de
--- /dev/null
+++ b/python/sglang/srt/openai_api/utils.py
@@ -0,0 +1,173 @@
+"""
+Utility functions for OpenAI API adapter.
+"""
+
+import logging
+from typing import Dict, List
+
+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+# 
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+    
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+    
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+            
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+                
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict, 
+    content_format: str,
+    image_data: list,
+    audio_data: list,
+    modalities: list
+) -> dict:
+    """
+    Process message content based on detected template format.
+    
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+        
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+    
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+                
+                if chunk_type == "image_url":
+                    image_data.append(chunk["image_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+        
+        new_msg = {
+            k: v for k, v in msg_dict.items() 
+            if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+        
+    else:  # content_format == "string"
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+            # Note: For string format, we ignore images/audio since the template
+            # doesn't expect structured content - multimodal placeholders would
+            # need to be inserted differently
+        
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg

From cb83632dbefc284f30c511096ea07aa70e692dfe Mon Sep 17 00:00:00 2001
From: Chang Su <chang.s.su@oracle.com>
Date: Wed, 4 Jun 2025 21:25:39 -0700
Subject: [PATCH 3/3] Add UT for the new changes in openai_api/utils.py

---
 python/sglang/srt/openai_api/utils.py |  31 ++--
 test/srt/run_suite.py                 |   1 +
 test/srt/test_openai_adapter.py       | 225 ++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 test/srt/test_openai_adapter.py

diff --git a/python/sglang/srt/openai_api/utils.py b/python/sglang/srt/openai_api/utils.py
index 2cd353c7e6de..610251aff54a 100644
--- a/python/sglang/srt/openai_api/utils.py
+++ b/python/sglang/srt/openai_api/utils.py
@@ -13,7 +13,7 @@
 # ============================================================================
 # JINJA TEMPLATE CONTENT FORMAT DETECTION
 # ============================================================================
-# 
+#
 # This adapts vLLM's approach for detecting chat template content format:
 # https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
 # - Analyzes Jinja template AST to detect content iteration patterns
@@ -78,10 +78,10 @@ def _try_extract_ast(chat_template: str):
 def detect_template_content_format(chat_template: str) -> str:
     """
     Detect whether a chat template expects 'string' or 'openai' content format.
-    
+
     - 'string': content is a simple string (like DeepSeek templates)
     - 'openai': content is a list of structured dicts (like Llama4 templates)
-    
+
     Detection logic:
     - If template has loops like {%- for content in message['content'] -%} → 'openai'
     - Otherwise → 'string'
@@ -94,11 +94,11 @@ def detect_template_content_format(chat_template: str) -> str:
         # Look for patterns like: {%- for content in message['content'] -%}
         for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
             loop_iter = loop_ast.iter
-            
+
             # Check if iterating over message['content'] or similar
             if _is_var_or_elems_access(loop_iter, "message", "content"):
                 return "openai"  # Found content iteration → openai format
-                
+
         return "string"  # No content loops found → string format
     except Exception as e:
         logger.debug(f"Error when parsing AST of Jinja template: {e}")
@@ -106,36 +106,36 @@ def detect_template_content_format(chat_template: str) -> str:
 
 
 def process_content_for_template_format(
-    msg_dict: dict, 
+    msg_dict: dict,
     content_format: str,
     image_data: list,
     audio_data: list,
-    modalities: list
+    modalities: list,
 ) -> dict:
     """
     Process message content based on detected template format.
-    
+
     Args:
         msg_dict: Message dictionary with content
         content_format: 'string' or 'openai' (detected via AST analysis)
         image_data: List to append extracted image URLs
         audio_data: List to append extracted audio URLs
         modalities: List to append modalities
-        
+
     Returns:
         Processed message dictionary
     """
     if not isinstance(msg_dict.get("content"), list):
         # Already a string or None, no processing needed
         return {k: v for k, v in msg_dict.items() if v is not None}
-    
+
     if content_format == "openai":
         # OpenAI format: preserve structured content list, normalize types
         processed_content_parts = []
         for chunk in msg_dict["content"]:
             if isinstance(chunk, dict):
                 chunk_type = chunk.get("type")
-                
+
                 if chunk_type == "image_url":
                     image_data.append(chunk["image_url"]["url"])
                     if chunk.get("modalities"):
@@ -149,14 +149,13 @@ def process_content_for_template_format(
                 else:
                     # Keep other content as-is (text, etc.)
                     processed_content_parts.append(chunk)
-        
+
         new_msg = {
-            k: v for k, v in msg_dict.items() 
-            if v is not None and k != "content"
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
         }
         new_msg["content"] = processed_content_parts
         return new_msg
-        
+
     else:  # content_format == "string"
         # String format: flatten to text only (for templates like DeepSeek)
         text_parts = []
@@ -166,7 +165,7 @@ def process_content_for_template_format(
             # Note: For string format, we ignore images/audio since the template
             # doesn't expect structured content - multimodal placeholders would
             # need to be inserted differently
-        
+
         new_msg = msg_dict.copy()
         new_msg["content"] = " ".join(text_parts) if text_parts else ""
         new_msg = {k: v for k, v in new_msg.items() if v is not None}
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 197cf33496ca..83fde313ff06 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -56,6 +56,7 @@ class TestFile:
         TestFile("test_mla_fp8.py", 93),
         TestFile("test_no_chunked_prefill.py", 108),
         TestFile("test_no_overlap_scheduler.py", 234),
+        TestFile("test_openai_adapter.py", 1),
         TestFile("test_openai_function_calling.py", 60),
         TestFile("test_openai_server.py", 149),
         TestFile("test_penalty.py", 41),
diff --git a/test/srt/test_openai_adapter.py b/test/srt/test_openai_adapter.py
new file mode 100644
index 000000000000..598ddfd49147
--- /dev/null
+++ b/test/srt/test_openai_adapter.py
@@ -0,0 +1,225 @@
+"""
+Unit tests for OpenAI adapter utils.
+"""
+
+import unittest
+from unittest.mock import patch
+
+from sglang.srt.openai_api.utils import (
+    detect_template_content_format,
+    process_content_for_template_format,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestTemplateContentFormatDetection(CustomTestCase):
+    """Test template content format detection functionality."""
+
+    def test_detect_llama4_openai_format(self):
+        """Test detection of llama4-style template (should be 'openai' format)."""
+        llama4_pattern = """
+{%- for message in messages %}
+    {%- if message['content'] is string %}
+        {{- message['content'] }}
+    {%- else %}
+        {%- for content in message['content'] %}
+            {%- if content['type'] == 'image' %}
+                {{- '<|image|>' }}
+            {%- elif content['type'] == 'text' %}
+                {{- content['text'] | trim }}
+            {%- endif %}
+        {%- endfor %}
+    {%- endif %}
+{%- endfor %}
+        """
+
+        result = detect_template_content_format(llama4_pattern)
+        self.assertEqual(result, "openai")
+
+    def test_detect_deepseek_string_format(self):
+        """Test detection of deepseek-style template (should be 'string' format)."""
+        deepseek_pattern = """
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {{- '<|User|>' + message['content'] + '<|Assistant|>' }}
+    {%- endif %}
+{%- endfor %}
+        """
+
+        result = detect_template_content_format(deepseek_pattern)
+        self.assertEqual(result, "string")
+
+    def test_detect_invalid_template(self):
+        """Test handling of invalid template (should default to 'string')."""
+        invalid_pattern = "{{{{ invalid jinja syntax }}}}"
+
+        result = detect_template_content_format(invalid_pattern)
+        self.assertEqual(result, "string")
+
+    def test_detect_empty_template(self):
+        """Test handling of empty template (should default to 'string')."""
+        result = detect_template_content_format("")
+        self.assertEqual(result, "string")
+
+    def test_process_content_openai_format(self):
+        """Test content processing for openai format."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Look at this image:"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                },
+                {"type": "text", "text": "What do you see?"},
+            ],
+        }
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, audio_data, modalities
+        )
+
+        # Check that image_data was extracted
+        self.assertEqual(len(image_data), 1)
+        self.assertEqual(image_data[0], "http://example.com/image.jpg")
+
+        # Check that content was normalized
+        expected_content = [
+            {"type": "text", "text": "Look at this image:"},
+            {"type": "image"},  # normalized from image_url
+            {"type": "text", "text": "What do you see?"},
+        ]
+        self.assertEqual(result["content"], expected_content)
+        self.assertEqual(result["role"], "user")
+
+    def test_process_content_string_format(self):
+        """Test content processing for string format."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                },
+                {"type": "text", "text": "world"},
+            ],
+        }
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "string", image_data, audio_data, modalities
+        )
+
+        # For string format, should flatten to text only
+        self.assertEqual(result["content"], "Hello world")
+        self.assertEqual(result["role"], "user")
+
+        # Image data should not be extracted for string format
+        self.assertEqual(len(image_data), 0)
+
+    def test_process_content_with_audio(self):
+        """Test content processing with audio content."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Listen to this:"},
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": "http://example.com/audio.mp3"},
+                },
+            ],
+        }
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, audio_data, modalities
+        )
+
+        # Check that audio_data was extracted
+        self.assertEqual(len(audio_data), 1)
+        self.assertEqual(audio_data[0], "http://example.com/audio.mp3")
+
+        # Check that content was normalized
+        expected_content = [
+            {"type": "text", "text": "Listen to this:"},
+            {"type": "audio"},  # normalized from audio_url
+        ]
+        self.assertEqual(result["content"], expected_content)
+
+    def test_process_content_already_string(self):
+        """Test processing content that's already a string."""
+        msg_dict = {"role": "user", "content": "Hello world"}
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, audio_data, modalities
+        )
+
+        # Should pass through unchanged
+        self.assertEqual(result["content"], "Hello world")
+        self.assertEqual(result["role"], "user")
+        self.assertEqual(len(image_data), 0)
+
+    def test_process_content_with_modalities(self):
+        """Test content processing with modalities field."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                    "modalities": ["vision"],
+                }
+            ],
+        }
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, audio_data, modalities
+        )
+
+        # Check that modalities was extracted
+        self.assertEqual(len(modalities), 1)
+        self.assertEqual(modalities[0], ["vision"])
+
+    def test_process_content_filter_none_values(self):
+        """Test that None values are filtered out of processed messages."""
+        msg_dict = {
+            "role": "user",
+            "content": "Hello",
+            "name": None,
+            "tool_call_id": None,
+        }
+
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "string", image_data, audio_data, modalities
+        )
+
+        # None values should be filtered out
+        expected_keys = {"role", "content"}
+        self.assertEqual(set(result.keys()), expected_keys)
+
+
+if __name__ == "__main__":
+    unittest.main()