From 55eafa202529c2b0ba40a1c35524371363de0247 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 2 Dec 2025 16:22:18 -0800
Subject: [PATCH 1/8] allow reasoning details to pass through

---
 eval_protocol/mcp/execution/base_policy.py | 17 +++++++
 eval_protocol/mcp/execution/policy.py      | 53 +++++++++++++---------
 2 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
index bdced48a..78489f8a 100644
--- a/eval_protocol/mcp/execution/base_policy.py
+++ b/eval_protocol/mcp/execution/base_policy.py
@@ -59,6 +59,20 @@ def __init__(
         # Initialize conversation state tracking for proper OpenAI trajectories
         self.initialized = False
 
+    def _supports_reasoning_details(self) -> bool:
+        """
+        Returns True if this policy is configured for a provider/model that expects
+        top-level reasoning_details to be preserved (e.g., Gemini 3 via OpenRouter).
+        """
+        model_id = getattr(self, "model_id", "") or ""
+        base_url = getattr(self, "base_url", "") or ""
+
+        if isinstance(model_id, str) and "openrouter" in model_id:
+            return True
+        if isinstance(base_url, str) and "openrouter.ai" in base_url:
+            return True
+        return False
+
     @abstractmethod
     async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
         """
@@ -199,6 +213,9 @@ async def _generate_live_tool_calls(
         if message.get("tool_calls"):
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
+        if message.get("reasoning_details") and self._supports_reasoning_details():
+            assistant_message_for_history["reasoning_details"] = message["reasoning_details"]
+
         # Add to actual conversation history
         conversation_history.append(assistant_message_for_history)
 
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 777c4f7e..c64055fb 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -148,6 +148,9 @@ def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]:
         # Standard OpenAI message fields
         allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"}
 
+        if self._supports_reasoning_details():
+            allowed_fields.add("reasoning_details")
+
         clean_messages = []
         for msg in messages:
             # Only keep allowed fields
@@ -217,31 +220,37 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 logger.debug(f"🔄 API call for model: {self.model_id}")
 
             # LiteLLM already returns OpenAI-compatible format
+            message_obj = getattr(response.choices[0], "message", object())
+
+            message_dict: Dict[str, Any] = {
+                "role": getattr(message_obj, "role", "assistant"),
+                "content": getattr(message_obj, "content", None),
+                "tool_calls": (
+                    [
+                        {
+                            "id": getattr(tc, "id", None),
+                            "type": getattr(tc, "type", "function"),
+                            "function": {
+                                "name": getattr(getattr(tc, "function", None), "name", "tool"),
+                                "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
+                            },
+                        }
+                        for tc in (getattr(message_obj, "tool_calls", []) or [])
+                    ]
+                    if getattr(message_obj, "tool_calls", None)
+                    else []
+                ),
+            }
+
+            if self._supports_reasoning_details():
+                rd = getattr(message_obj, "reasoning_details", None)
+                if rd is not None:
+                    message_dict["reasoning_details"] = rd
+
             return {
                 "choices": [
                     {
-                        "message": {
-                            "role": getattr(getattr(response.choices[0], "message", object()), "role", "assistant"),
-                            "content": getattr(getattr(response.choices[0], "message", object()), "content", None),
-                            "tool_calls": (
-                                [
-                                    {
-                                        "id": getattr(tc, "id", None),
-                                        "type": getattr(tc, "type", "function"),
-                                        "function": {
-                                            "name": getattr(getattr(tc, "function", None), "name", "tool"),
-                                            "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
-                                        },
-                                    }
-                                    for tc in (
-                                        getattr(getattr(response.choices[0], "message", object()), "tool_calls", [])
-                                        or []
-                                    )
-                                ]
-                                if getattr(getattr(response.choices[0], "message", object()), "tool_calls", None)
-                                else []
-                            ),
-                        },
+                        "message": message_dict,
                         "finish_reason": getattr(response.choices[0], "finish_reason", None),
                     }
                 ],

From a8c103d73d10c8d921561d731d53657fc78d2697 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 2 Dec 2025 16:31:49 -0800
Subject: [PATCH 2/8] keep reasoning_details whenever it is not None

---
 eval_protocol/mcp/execution/base_policy.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
index 78489f8a..c5c31805 100644
--- a/eval_protocol/mcp/execution/base_policy.py
+++ b/eval_protocol/mcp/execution/base_policy.py
@@ -213,8 +213,9 @@ async def _generate_live_tool_calls(
         if message.get("tool_calls"):
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
-        if message.get("reasoning_details") and self._supports_reasoning_details():
-            assistant_message_for_history["reasoning_details"] = message["reasoning_details"]
+        rd = message.get("reasoning_details", None)
+        if rd is not None and self._supports_reasoning_details():
+            assistant_message_for_history["reasoning_details"] = rd
 
         # Add to actual conversation history
         conversation_history.append(assistant_message_for_history)

From 1d630d46fdc3517d5ac6a0ce2debb4568587b26c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 2 Dec 2025 16:49:02 -0800
Subject: [PATCH 3/8] address comments: drop OpenRouter gating for reasoning_details

---
 eval_protocol/mcp/execution/base_policy.py | 16 +---------------
 eval_protocol/mcp/execution/policy.py      | 12 ++++--------
 2 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
index c5c31805..eba49dc7 100644
--- a/eval_protocol/mcp/execution/base_policy.py
+++ b/eval_protocol/mcp/execution/base_policy.py
@@ -59,20 +59,6 @@ def __init__(
         # Initialize conversation state tracking for proper OpenAI trajectories
         self.initialized = False
 
-    def _supports_reasoning_details(self) -> bool:
-        """
-        Returns True if this policy is configured for a provider/model that expects
-        top-level reasoning_details to be preserved (e.g., Gemini 3 via OpenRouter).
-        """
-        model_id = getattr(self, "model_id", "") or ""
-        base_url = getattr(self, "base_url", "") or ""
-
-        if isinstance(model_id, str) and "openrouter" in model_id:
-            return True
-        if isinstance(base_url, str) and "openrouter.ai" in base_url:
-            return True
-        return False
-
     @abstractmethod
     async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
         """
@@ -214,7 +200,7 @@ async def _generate_live_tool_calls(
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
         rd = message.get("reasoning_details", None)
-        if rd is not None and self._supports_reasoning_details():
+        if rd is not None:
             assistant_message_for_history["reasoning_details"] = rd
 
         # Add to actual conversation history
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index c64055fb..b293fa0e 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -146,10 +146,7 @@ def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]:
             Clean messages with only OpenAI API compatible fields
         """
         # Standard OpenAI message fields
-        allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"}
-
-        if self._supports_reasoning_details():
-            allowed_fields.add("reasoning_details")
+        allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name", "reasoning_details"}
 
         clean_messages = []
         for msg in messages:
@@ -242,10 +239,9 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 ),
             }
 
-            if self._supports_reasoning_details():
-                rd = getattr(message_obj, "reasoning_details", None)
-                if rd is not None:
-                    message_dict["reasoning_details"] = rd
+            rd = getattr(message_obj, "reasoning_details", None)
+            if rd is not None:
+                message_dict["reasoning_details"] = rd
 
             return {
                 "choices": [

From 8366a853581cc0d0a5f2336f52b8f21cc9d91ed6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 2 Dec 2025 18:37:27 -0800
Subject: [PATCH 4/8] make generic

---
 eval_protocol/mcp/execution/policy.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index b293fa0e..6f46a6b2 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -239,9 +239,11 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 ),
             }
 
-            rd = getattr(message_obj, "reasoning_details", None)
-            if rd is not None:
-                message_dict["reasoning_details"] = rd
+            provider_specific = getattr(message_obj, "provider_specific_fields", None)
+            if isinstance(provider_specific, dict):
+                for key, value in provider_specific.items():
+                    if value is not None and key not in message_dict:
+                        message_dict[key] = value
 
             return {
                 "choices": [

From 4bc58f7a1b9f6cc680c7b214f18bf280db78efd7 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 2 Dec 2025 21:38:32 -0800
Subject: [PATCH 5/8] use truthiness check for reasoning_details; rename provider_specific

---
 eval_protocol/mcp/execution/base_policy.py | 5 ++---
 eval_protocol/mcp/execution/policy.py      | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
index eba49dc7..efe57dfa 100644
--- a/eval_protocol/mcp/execution/base_policy.py
+++ b/eval_protocol/mcp/execution/base_policy.py
@@ -199,9 +199,8 @@ async def _generate_live_tool_calls(
         if message.get("tool_calls"):
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
-        rd = message.get("reasoning_details", None)
-        if rd is not None:
-            assistant_message_for_history["reasoning_details"] = rd
+        if message.get("reasoning_details"):
+            assistant_message_for_history["reasoning_details"] = message["reasoning_details"]
 
         # Add to actual conversation history
         conversation_history.append(assistant_message_for_history)
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 6f46a6b2..5d70179c 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -239,9 +239,9 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 ),
             }
 
-            provider_specific = getattr(message_obj, "provider_specific_fields", None)
-            if isinstance(provider_specific, dict):
-                for key, value in provider_specific.items():
+            provider_specific_fields = getattr(message_obj, "provider_specific_fields", None)
+            if isinstance(provider_specific_fields, dict):
+                for key, value in provider_specific_fields.items():
                     if value is not None and key not in message_dict:
                         message_dict[key] = value
 

From 4ebc8c4a08781d012dac27de43473bf17d1a78df Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 3 Dec 2025 00:41:14 -0800
Subject: [PATCH 6/8] more generic storing of provider_specific_fields

---
 eval_protocol/mcp/execution/base_policy.py | 8 ++++++--
 eval_protocol/mcp/execution/policy.py      | 7 +------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
index efe57dfa..9d44d02c 100644
--- a/eval_protocol/mcp/execution/base_policy.py
+++ b/eval_protocol/mcp/execution/base_policy.py
@@ -199,8 +199,12 @@ async def _generate_live_tool_calls(
         if message.get("tool_calls"):
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
-        if message.get("reasoning_details"):
-            assistant_message_for_history["reasoning_details"] = message["reasoning_details"]
+        # Preserve specific fields from provider_specific_fields if present
+        if message.get("provider_specific_fields"):
+            if message["provider_specific_fields"].get("reasoning_details"):
+                assistant_message_for_history["reasoning_details"] = message["provider_specific_fields"][
+                    "reasoning_details"
+                ]
 
         # Add to actual conversation history
         conversation_history.append(assistant_message_for_history)
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 5d70179c..47495d2f 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -222,6 +222,7 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
             message_dict: Dict[str, Any] = {
                 "role": getattr(message_obj, "role", "assistant"),
                 "content": getattr(message_obj, "content", None),
+                "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None),
                 "tool_calls": (
                     [
                         {
@@ -239,12 +240,6 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 ),
             }
 
-            provider_specific_fields = getattr(message_obj, "provider_specific_fields", None)
-            if isinstance(provider_specific_fields, dict):
-                for key, value in provider_specific_fields.items():
-                    if value is not None and key not in message_dict:
-                        message_dict[key] = value
-
             return {
                 "choices": [
                     {

From 15ff290f05a5ad53f4fbd927349e695767a5b1bd Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 3 Dec 2025 00:44:19 -0800
Subject: [PATCH 7/8] add a test

---
 tests/test_litellm_policy_provider_fields.py | 95 ++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 tests/test_litellm_policy_provider_fields.py

diff --git a/tests/test_litellm_policy_provider_fields.py b/tests/test_litellm_policy_provider_fields.py
new file mode 100644
index 00000000..6812ef9b
--- /dev/null
+++ b/tests/test_litellm_policy_provider_fields.py
@@ -0,0 +1,95 @@
+import types
+
+import pytest
+
+import eval_protocol.mcp.execution.policy as policy_mod
+from eval_protocol.mcp.execution.policy import LiteLLMPolicy
+
+
+@pytest.mark.asyncio
+async def test_litellm_policy_surfaces_provider_specific_reasoning_details(monkeypatch):
+    """
+    Ensure that provider_specific_fields from the LiteLLM message object are
+    preserved on the returned message dict from LiteLLMPolicy._make_llm_call.
+    """
+
+    # Fake ModelResponse base class; installed via monkeypatch so the module is restored after the test
+    class FakeModelResponseBase: ...
+
+    monkeypatch.setattr(policy_mod, "ModelResponse", FakeModelResponseBase, raising=False)
+
+    async def fake_acompletion(*args, **kwargs):
+        # This mimics the LiteLLM Message object shape we rely on in policy._make_llm_call
+        message_obj = types.SimpleNamespace(
+            role="assistant",
+            content="",
+            tool_calls=[
+                types.SimpleNamespace(
+                    id="tool_get_reservation_details_123",
+                    type="function",
+                    function=types.SimpleNamespace(
+                        name="get_reservation_details",
+                        arguments='{"reservation_id":"EHGLP3"}',
+                    ),
+                )
+            ],
+            provider_specific_fields={
+                "reasoning_details": [{"id": "tool_get_reservation_details_123", "type": "reasoning.encrypted"}],
+                "custom_field": "keep_me",
+            },
+        )
+
+        class FakeModelResponse(FakeModelResponseBase):
+            def __init__(self) -> None:
+                self.choices = [
+                    types.SimpleNamespace(
+                        finish_reason="tool_calls",
+                        index=0,
+                        message=message_obj,
+                    )
+                ]
+                self.usage = types.SimpleNamespace(
+                    prompt_tokens=10,
+                    completion_tokens=5,
+                    total_tokens=15,
+                )
+
+        return FakeModelResponse()
+
+    # Patch acompletion so we don't hit the network
+    monkeypatch.setattr(policy_mod, "acompletion", fake_acompletion)
+
+    # Use a concrete policy instance; base_url/model_id values don't matter for this unit test
+    policy = LiteLLMPolicy(model_id="openrouter/google/gemini-3-pro-preview", use_caching=False)
+
+    messages = [
+        {
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [
+                {
+                    "id": "tool_get_reservation_details_123",
+                    "type": "function",
+                    "function": {"name": "get_reservation_details", "arguments": '{"reservation_id":"EHGLP3"}'},
+                }
+            ],
+        }
+    ]
+
+    # No tools are needed for this test – we only care about the returned message shape
+    result = await policy._make_llm_call(messages, tools=[])
+
+    assert "choices" in result
+    assert len(result["choices"]) == 1
+    msg = result["choices"][0]["message"]
+
+    # Core fields should be present
+    assert msg["role"] == "assistant"
+    assert isinstance(msg.get("tool_calls"), list)
+
+    # provider_specific_fields should be preserved on the message
+    ps = msg.get("provider_specific_fields")
+    assert isinstance(ps, dict)
+    assert ps["reasoning_details"] == [{"id": "tool_get_reservation_details_123", "type": "reasoning.encrypted"}]
+    # Non-core provider_specific_fields should also be preserved
+    assert ps.get("custom_field") == "keep_me"

From 16ffbac0a2077b398cf59efb1421e4a276623bfb Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 3 Dec 2025 00:48:40 -0800
Subject: [PATCH 8/8] put message back in dict

---
 eval_protocol/mcp/execution/policy.py | 42 +++++++++++++--------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 47495d2f..0b4aac4e 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -219,31 +219,29 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
             # LiteLLM already returns OpenAI-compatible format
             message_obj = getattr(response.choices[0], "message", object())
 
-            message_dict: Dict[str, Any] = {
-                "role": getattr(message_obj, "role", "assistant"),
-                "content": getattr(message_obj, "content", None),
-                "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None),
-                "tool_calls": (
-                    [
-                        {
-                            "id": getattr(tc, "id", None),
-                            "type": getattr(tc, "type", "function"),
-                            "function": {
-                                "name": getattr(getattr(tc, "function", None), "name", "tool"),
-                                "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
-                            },
-                        }
-                        for tc in (getattr(message_obj, "tool_calls", []) or [])
-                    ]
-                    if getattr(message_obj, "tool_calls", None)
-                    else []
-                ),
-            }
-
             return {
                 "choices": [
                     {
-                        "message": message_dict,
+                        "message": {
+                            "role": getattr(message_obj, "role", "assistant"),
+                            "content": getattr(message_obj, "content", None),
+                            "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None),
+                            "tool_calls": (
+                                [
+                                    {
+                                        "id": getattr(tc, "id", None),
+                                        "type": getattr(tc, "type", "function"),
+                                        "function": {
+                                            "name": getattr(getattr(tc, "function", None), "name", "tool"),
+                                            "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
+                                        },
+                                    }
+                                    for tc in (getattr(message_obj, "tool_calls", []) or [])
+                                ]
+                                if getattr(message_obj, "tool_calls", None)
+                                else []
+                            ),
+                        },
                         "finish_reason": getattr(response.choices[0], "finish_reason", None),
                     }
                 ],