From 55eafa202529c2b0ba40a1c35524371363de0247 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 2 Dec 2025 16:22:18 -0800 Subject: [PATCH 1/8] allow reasoning details to pass through --- eval_protocol/mcp/execution/base_policy.py | 17 +++++++ eval_protocol/mcp/execution/policy.py | 53 +++++++++++++--------- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index bdced48a..78489f8a 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -59,6 +59,20 @@ def __init__( # Initialize conversation state tracking for proper OpenAI trajectories self.initialized = False + def _supports_reasoning_details(self) -> bool: + """ + Returns True if this policy is configured for a provider/model that expects + top-level reasoning_details to be preserved (e.g., Gemini 3 via OpenRouter). + """ + model_id = getattr(self, "model_id", "") or "" + base_url = getattr(self, "base_url", "") or "" + + if isinstance(model_id, str) and "openrouter" in model_id: + return True + if isinstance(base_url, str) and "openrouter.ai" in base_url: + return True + return False + @abstractmethod async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict: """ @@ -199,6 +213,9 @@ async def _generate_live_tool_calls( if message.get("tool_calls"): assistant_message_for_history["tool_calls"] = message["tool_calls"] + if message.get("reasoning_details") and self._supports_reasoning_details(): + assistant_message_for_history["reasoning_details"] = message["reasoning_details"] + # Add to actual conversation history conversation_history.append(assistant_message_for_history) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index 777c4f7e..c64055fb 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -148,6 +148,9 @@ def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]: # Standard OpenAI message fields allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"} + if self._supports_reasoning_details(): + allowed_fields.add("reasoning_details") + clean_messages = [] for msg in messages: # Only keep allowed fields @@ -217,31 +220,37 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ logger.debug(f"🔄 API call for model: {self.model_id}") # LiteLLM already returns OpenAI-compatible format + message_obj = getattr(response.choices[0], "message", object()) + + message_dict: Dict[str, Any] = { + "role": getattr(message_obj, "role", "assistant"), + "content": getattr(message_obj, "content", None), + "tool_calls": ( + [ + { + "id": getattr(tc, "id", None), + "type": getattr(tc, "type", "function"), + "function": { + "name": getattr(getattr(tc, "function", None), "name", "tool"), + "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"), + }, + } + for tc in (getattr(message_obj, "tool_calls", []) or []) + ] + if getattr(message_obj, "tool_calls", None) + else [] + ), + } + + if self._supports_reasoning_details(): + rd = getattr(message_obj, "reasoning_details", None) + if rd is not None: + message_dict["reasoning_details"] = rd + return { "choices": [ { - "message": { - "role": getattr(getattr(response.choices[0], "message", object()), "role", "assistant"), - "content": getattr(getattr(response.choices[0], "message", object()), "content", None), - "tool_calls": ( - [ - { - "id": getattr(tc, "id", None), - "type": getattr(tc, "type", "function"), - "function": { - "name": getattr(getattr(tc, "function", None), "name", "tool"), - "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"), - }, - } - for tc in ( - getattr(getattr(response.choices[0], "message", object()), "tool_calls", []) - or [] - ) - ] - if getattr(getattr(response.choices[0], "message", object()), "tool_calls", None) - else [] - ), - }, + "message": message_dict, "finish_reason": getattr(response.choices[0], "finish_reason", None), } ], From a8c103d73d10c8d921561d731d53657fc78d2697 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 2 Dec 2025 16:31:49 -0800 Subject: [PATCH 2/8] update --- eval_protocol/mcp/execution/base_policy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index 78489f8a..c5c31805 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -213,8 +213,9 @@ async def _generate_live_tool_calls( if message.get("tool_calls"): assistant_message_for_history["tool_calls"] = message["tool_calls"] - if message.get("reasoning_details") and self._supports_reasoning_details(): - assistant_message_for_history["reasoning_details"] = message["reasoning_details"] + rd = message.get("reasoning_details", None) + if rd is not None and self._supports_reasoning_details(): + assistant_message_for_history["reasoning_details"] = rd # Add to actual conversation history conversation_history.append(assistant_message_for_history) From 1d630d46fdc3517d5ac6a0ce2debb4568587b26c Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 2 Dec 2025 16:49:02 -0800 Subject: [PATCH 3/8] address comments --- eval_protocol/mcp/execution/base_policy.py | 16 +--------------- eval_protocol/mcp/execution/policy.py | 12 ++++-------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index c5c31805..eba49dc7 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -59,20 +59,6 @@ def __init__( # Initialize conversation state tracking for proper OpenAI trajectories self.initialized = False - def _supports_reasoning_details(self) -> bool: - """ - Returns True if this policy is configured for a provider/model that expects - top-level reasoning_details to be preserved (e.g., Gemini 3 via OpenRouter). - """ - model_id = getattr(self, "model_id", "") or "" - base_url = getattr(self, "base_url", "") or "" - - if isinstance(model_id, str) and "openrouter" in model_id: - return True - if isinstance(base_url, str) and "openrouter.ai" in base_url: - return True - return False - @abstractmethod async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict: """ @@ -214,7 +200,7 @@ async def _generate_live_tool_calls( assistant_message_for_history["tool_calls"] = message["tool_calls"] rd = message.get("reasoning_details", None) - if rd is not None and self._supports_reasoning_details(): + if rd is not None: assistant_message_for_history["reasoning_details"] = rd # Add to actual conversation history diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index c64055fb..b293fa0e 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -146,10 +146,7 @@ def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]: Clean messages with only OpenAI API compatible fields """ # Standard OpenAI message fields - allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"} - - if self._supports_reasoning_details(): - allowed_fields.add("reasoning_details") + allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name", "reasoning_details"} clean_messages = [] for msg in messages: @@ -242,10 +239,9 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ ), } - if self._supports_reasoning_details(): - rd = getattr(message_obj, "reasoning_details", None) - if rd is not None: - message_dict["reasoning_details"] = rd + rd = getattr(message_obj, "reasoning_details", None) + if rd is not None: + message_dict["reasoning_details"] = rd return { "choices": [ From 8366a853581cc0d0a5f2336f52b8f21cc9d91ed6 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 2 Dec 2025 18:37:27 -0800 Subject: [PATCH 4/8] make generic --- eval_protocol/mcp/execution/policy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index b293fa0e..6f46a6b2 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -239,9 +239,11 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ ), } - rd = getattr(message_obj, "reasoning_details", None) - if rd is not None: - message_dict["reasoning_details"] = rd + provider_specific = getattr(message_obj, "provider_specific_fields", None) + if isinstance(provider_specific, dict): + for key, value in provider_specific.items(): + if value is not None and key not in message_dict: + message_dict[key] = value return { "choices": [ From 4bc58f7a1b9f6cc680c7b214f18bf280db78efd7 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 2 Dec 2025 21:38:32 -0800 Subject: [PATCH 5/8] update --- eval_protocol/mcp/execution/base_policy.py | 5 ++--- eval_protocol/mcp/execution/policy.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index eba49dc7..efe57dfa 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -199,9 +199,8 @@ async def _generate_live_tool_calls( if message.get("tool_calls"): assistant_message_for_history["tool_calls"] = message["tool_calls"] - rd = message.get("reasoning_details", None) - if rd is not None: - assistant_message_for_history["reasoning_details"] = rd + if message.get("reasoning_details"): + assistant_message_for_history["reasoning_details"] = message["reasoning_details"] # Add to actual conversation history conversation_history.append(assistant_message_for_history) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index 6f46a6b2..5d70179c 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -239,9 +239,9 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ ), } - provider_specific = getattr(message_obj, "provider_specific_fields", None) - if isinstance(provider_specific, dict): - for key, value in provider_specific.items(): + provider_specific_fields = getattr(message_obj, "provider_specific_fields", None) + if isinstance(provider_specific_fields, dict): + for key, value in provider_specific_fields.items(): if value is not None and key not in message_dict: message_dict[key] = value From 4ebc8c4a08781d012dac27de43473bf17d1a78df Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 3 Dec 2025 00:41:14 -0800 Subject: [PATCH 6/8] more generic storing of provider_specific_fields --- eval_protocol/mcp/execution/base_policy.py | 8 ++++++-- eval_protocol/mcp/execution/policy.py | 7 +------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index efe57dfa..9d44d02c 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -199,8 +199,12 @@ async def _generate_live_tool_calls( if message.get("tool_calls"): assistant_message_for_history["tool_calls"] = message["tool_calls"] - if message.get("reasoning_details"): - assistant_message_for_history["reasoning_details"] = message["reasoning_details"] + # Preserve specific fields from provider_specific_fields if present + if message.get("provider_specific_fields"): + if message["provider_specific_fields"].get("reasoning_details"): + assistant_message_for_history["reasoning_details"] = message["provider_specific_fields"][ + "reasoning_details" + ] # Add to actual conversation history conversation_history.append(assistant_message_for_history) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index 5d70179c..47495d2f 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -222,6 +222,7 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ message_dict: Dict[str, Any] = { "role": getattr(message_obj, "role", "assistant"), "content": getattr(message_obj, "content", None), + "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None), "tool_calls": ( [ { @@ -239,12 +240,6 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ ), } - provider_specific_fields = getattr(message_obj, "provider_specific_fields", None) - if isinstance(provider_specific_fields, dict): - for key, value in provider_specific_fields.items(): - if value is not None and key not in message_dict: - message_dict[key] = value - return { "choices": [ { From 15ff290f05a5ad53f4fbd927349e695767a5b1bd Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 3 Dec 2025 00:44:19 -0800 Subject: [PATCH 7/8] add a test --- tests/test_litellm_policy_provider_fields.py | 95 ++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/test_litellm_policy_provider_fields.py diff --git a/tests/test_litellm_policy_provider_fields.py b/tests/test_litellm_policy_provider_fields.py new file mode 100644 index 00000000..6812ef9b --- /dev/null +++ b/tests/test_litellm_policy_provider_fields.py @@ -0,0 +1,95 @@ +import types + +import pytest + +import eval_protocol.mcp.execution.policy as policy_mod +from eval_protocol.mcp.execution.policy import LiteLLMPolicy + + +@pytest.mark.asyncio +async def test_litellm_policy_surfaces_provider_specific_reasoning_details(monkeypatch): + """ + Ensure that provider_specific_fields from the LiteLLM message object are + preserved on the returned message dict from LiteLLMPolicy._make_llm_call. + """ + + # Define a fake ModelResponse base class and patch the module's ModelResponse + class FakeModelResponseBase: ... + + policy_mod.ModelResponse = FakeModelResponseBase + + async def fake_acompletion(*args, **kwargs): + # This mimics the LiteLLM Message object shape we rely on in policy._make_llm_call + message_obj = types.SimpleNamespace( + role="assistant", + content="", + tool_calls=[ + types.SimpleNamespace( + id="tool_get_reservation_details_123", + type="function", + function=types.SimpleNamespace( + name="get_reservation_details", + arguments='{"reservation_id":"EHGLP3"}', + ), + ) + ], + provider_specific_fields={ + "reasoning_details": [{"id": "tool_get_reservation_details_123", "type": "reasoning.encrypted"}], + "custom_field": "keep_me", + }, + ) + + class FakeModelResponse(FakeModelResponseBase): + def __init__(self) -> None: + self.choices = [ + types.SimpleNamespace( + finish_reason="tool_calls", + index=0, + message=message_obj, + ) + ] + self.usage = types.SimpleNamespace( + prompt_tokens=10, + completion_tokens=5, + total_tokens=15, + ) + + return FakeModelResponse() + + # Patch acompletion so we don't hit the network + monkeypatch.setattr(policy_mod, "acompletion", fake_acompletion) + + # Use a concrete policy instance; base_url/model_id values don't matter for this unit test + policy = LiteLLMPolicy(model_id="openrouter/google/gemini-3-pro-preview", use_caching=False) + + messages = [ + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "tool_get_reservation_details_123", + "type": "function", + "function": {"name": "get_reservation_details", "arguments": '{"reservation_id":"EHGLP3"}'}, + } + ], + } + ] + + # No tools are needed for this test – we only care about the returned message shape + result = await policy._make_llm_call(messages, tools=[]) + + assert "choices" in result + assert len(result["choices"]) == 1 + msg = result["choices"][0]["message"] + + # Core fields should be present + assert msg["role"] == "assistant" + assert isinstance(msg.get("tool_calls"), list) + + # provider_specific_fields should be preserved on the message + ps = msg.get("provider_specific_fields") + assert isinstance(ps, dict) + assert ps["reasoning_details"] == [{"id": "tool_get_reservation_details_123", "type": "reasoning.encrypted"}] + # Non-core provider_specific_fields should also be preserved + assert ps.get("custom_field") == "keep_me" From 16ffbac0a2077b398cf59efb1421e4a276623bfb Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 3 Dec 2025 00:48:40 -0800 Subject: [PATCH 8/8] put message back in dict --- eval_protocol/mcp/execution/policy.py | 42 +++++++++++++-------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index 47495d2f..0b4aac4e 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -219,31 +219,29 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ # LiteLLM already returns OpenAI-compatible format message_obj = getattr(response.choices[0], "message", object()) - message_dict: Dict[str, Any] = { - "role": getattr(message_obj, "role", "assistant"), - "content": getattr(message_obj, "content", None), - "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None), - "tool_calls": ( - [ - { - "id": getattr(tc, "id", None), - "type": getattr(tc, "type", "function"), - "function": { - "name": getattr(getattr(tc, "function", None), "name", "tool"), - "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"), - }, - } - for tc in (getattr(message_obj, "tool_calls", []) or []) - ] - if getattr(message_obj, "tool_calls", None) - else [] - ), - } - return { "choices": [ { - "message": message_dict, + "message": { + "role": getattr(message_obj, "role", "assistant"), + "content": getattr(message_obj, "content", None), + "provider_specific_fields": getattr(message_obj, "provider_specific_fields", None), + "tool_calls": ( + [ + { + "id": getattr(tc, "id", None), + "type": getattr(tc, "type", "function"), + "function": { + "name": getattr(getattr(tc, "function", None), "name", "tool"), + "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"), + }, + } + for tc in (getattr(message_obj, "tool_calls", []) or []) + ] + if getattr(message_obj, "tool_calls", None) + else [] + ), + }, "finish_reason": getattr(response.choices[0], "finish_reason", None), } ],