From f1c6fa55fa1199c57fde86875c8197bf4824390d Mon Sep 17 00:00:00 2001 From: Aymeric Date: Fri, 16 May 2025 13:10:20 +0200 Subject: [PATCH 01/28] Start work on run results --- src/smolagents/agents.py | 64 +++++++++++++++++++++++++++++++----- src/smolagents/monitoring.py | 15 ++++++++- tests/test_monitoring.py | 19 +++++++++++ 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index a276018f5..4bd6a16ca 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -62,6 +62,7 @@ AgentLogger, LogLevel, Monitor, + RunResult, ) from .remote_executors import DockerExecutor, E2BExecutor from .tools import Tool @@ -204,6 +205,7 @@ def __init__( description: str | None = None, provide_run_summary: bool = False, final_answer_checks: list[Callable] | None = None, + return_full_results: bool = False, logger: AgentLogger | None = None, ): self.agent_name = self.__class__.__name__ @@ -230,6 +232,7 @@ def __init__( self.description = description self.provide_run_summary = provide_run_summary self.final_answer_checks = final_answer_checks + self.return_full_results = return_full_results self._setup_managed_agents(managed_agents) self._setup_tools(tools, add_base_tools) @@ -347,8 +350,41 @@ def run( if stream: # The steps are returned as they are executed through a generator to iterate on. return self._run_stream(task=self.task, max_steps=max_steps, images=images) + run_start_time = time.time() # Outputs are returned only at the end. We only look at the last step. - return list(self._run_stream(task=self.task, max_steps=max_steps, images=images))[-1].final_answer + try: + steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) + result = steps[-1].final_answer + state = "success" + except Exception: + run_duration = time.time() - run_start_time + raise + else: + run_duration = time.time() - run_start_time + + if self.return_full_results: + token_usage = None + try: + token_usage = self.monitor.get_total_token_counts() + except Exception: + token_usage = None + + if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): + state = "max_steps" + elif self.memory.steps and getattr(self.memory.steps[-1], "error", None) is not None: + state = "error" + + messages = self.memory.get_full_steps() + + return RunResult( + result=result, + token_usage=token_usage, + messages=messages, + duration=run_duration, + state=state, + ) + + return result def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None @@ -362,11 +398,25 @@ def _run_stream( if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): + planning_start = step_start_time + planning_step = None for element in self._generate_planning_step( task, is_first_step=(self.step_number == 1), step=self.step_number ): yield element - self.memory.steps.append(element) + planning_step = element + if planning_step is not None: + planning_step.end_time = time.time() + planning_step.duration = planning_step.end_time - planning_start + if getattr(self.model, "last_input_token_count", None) is not None: + planning_step.input_token_count = self.model.last_input_token_count + planning_step.output_token_count = self.model.last_output_token_count + self.memory.steps.append(planning_step) + for callback in self.step_callbacks: + callback(planning_step) if len(inspect.signature(callback).parameters) == 1 else callback( + 
planning_step, agent=self + ) + step_start_time = time.time() action_step = ActionStep( step_number=self.step_number, start_time=step_start_time, observations_images=images ) @@ -411,6 +461,9 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep, step_start_time: float): memory_step.end_time = time.time() memory_step.duration = memory_step.end_time - step_start_time + if getattr(self.model, "last_input_token_count", None) is not None: + memory_step.input_token_count = self.model.last_input_token_count + memory_step.output_token_count = self.model.last_output_token_count for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( @@ -423,13 +476,8 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger) ) final_memory_step.action_output = final_answer - final_memory_step.end_time = time.time() - final_memory_step.duration = final_memory_step.end_time - step_start_time + self._finalize_step(final_memory_step, step_start_time) self.memory.steps.append(final_memory_step) - for callback in self.step_callbacks: - callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( - final_memory_step, agent=self - ) return final_answer def _generate_planning_step( diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d827a95e..90f846fe8 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,7 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json +from dataclasses import dataclass from enum import IntEnum +from typing import Any from rich import box from rich.console import Console, Group @@ -29,7 +31,18 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor"] +__all__ = ["AgentLogger", "LogLevel", "Monitor", "RunResult"] + + +@dataclass +class RunResult: + """Holds extended information about an agent run.""" + + result: Any + token_usage: dict[str, int] | None + messages: list[dict] + duration: float + state: str class Monitor: diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index c7f6b9a64..b571a233a 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -186,3 +186,22 @@ def generate(self, prompt, **kwargs): final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIn("Malformed call", final_message.content) + + def test_run_return_full_results(self): + agent = CodeAgent( + tools=[], + model=FakeLLMModel(), + max_steps=1, + return_full_results=True, + ) + + result = agent.run("Fake task") + + from smolagents import RunResult + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.result, "This is the final answer.") + self.assertEqual(result.state, "success") + self.assertEqual(result.token_usage, {"input": 10, "output": 20}) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.duration, 0) From 46eb7c8645f0df4a7295da7dcb512e938583d746 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Fri, 16 May 2025 16:31:21 +0200 Subject: [PATCH 02/28] Create Timing and Usage objects --- src/smolagents/agents.py | 91 ++++++++++++++++++++++-------------- src/smolagents/memory.py | 20 ++++++-- src/smolagents/monitoring.py | 15 +----- 3 files changed, 74 insertions(+), 52 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 4bd6a16ca..90d9d1f05 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -24,6 +24,7 @@ import time from abc import ABC, abstractmethod from collections.abc import Callable, Generator +from dataclasses import dataclass from logging import getLogger from pathlib import Path from typing import TYPE_CHECKING, Any, TypedDict @@ -54,7 +55,9 @@ PlanningStep, SystemPromptStep, TaskStep, + Timing, ToolCall, + Usage, ) from .models import ChatMessage, ChatMessageStreamDelta, MessageRole, Model, parse_json_if_needed from .monitoring import ( @@ -62,7 +65,6 @@ AgentLogger, LogLevel, Monitor, - RunResult, ) from .remote_executors import DockerExecutor, E2BExecutor from .tools import Tool @@ -167,6 +169,17 @@ class PromptTemplates(TypedDict): ) +@dataclass +class RunResult: + """Holds extended information about an agent run.""" + + result: Any + token_usage: dict[str, int] | None + messages: list[dict] + duration: float + state: str + + class MultiStepAgent(ABC): """ Agent class that solves the given task step by step, using the ReAct framework: @@ -213,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in 
prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -261,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -394,31 +407,34 @@ def _run_stream( while final_answer is None and self.step_number <= max_steps: if self.interrupt_switch: raise AgentError("Agent interrupted.", self.logger) - step_start_time = time.time() if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): - planning_start = step_start_time + planning_start_time = time.time() planning_step = None for element in self._generate_planning_step( task, is_first_step=(self.step_number == 1), step=self.step_number ): yield element planning_step = element - if planning_step is not None: - planning_step.end_time = time.time() - planning_step.duration = planning_step.end_time - planning_start - if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.input_token_count = self.model.last_input_token_count - planning_step.output_token_count = self.model.last_output_token_count - self.memory.steps.append(planning_step) - for callback in self.step_callbacks: - callback(planning_step) if len(inspect.signature(callback).parameters) == 1 else callback( - planning_step, agent=self - ) - step_start_time = time.time() + assert isinstance(planning_step, PlanningStep) + self.memory.steps.append(planning_step) + if getattr(self.model, "last_input_token_count", None) is not None: + planning_step.usage = Usage( + input_tokens=self.model.last_input_token_count, + output_tokens=self.model.last_output_token_count, + ) + planning_end_time = time.time() + planning_step.timing = Timing( + start_time=planning_start_time, + end_time=planning_end_time, + duration=planning_end_time - planning_start_time, + ) + action_step_start_time = time.time() action_step = ActionStep( - step_number=self.step_number, start_time=step_start_time, observations_images=images + step_number=self.step_number, + timing=Timing(start_time=action_step_start_time), + observations_images=images, ) try: for el in self._execute_step(action_step): @@ -431,13 +447,13 @@ def _run_stream( # Other AgentError types are caused by the Model, so we should log them and iterate. 
action_step.error = e finally: - self._finalize_step(action_step, step_start_time) + self._finalize_step(action_step) self.memory.steps.append(action_step) yield action_step self.step_number += 1 if final_answer is None and self.step_number == max_steps + 1: - final_answer = self._handle_max_steps_reached(task, images, step_start_time) + final_answer = self._handle_max_steps_reached(task, images) yield action_step yield FinalAnswerStep(handle_agent_output_types(final_answer)) @@ -458,25 +474,30 @@ def _validate_final_answer(self, final_answer: Any): except Exception as e: raise AgentError(f"Check {check_function.__name__} failed with error: {e}", self.logger) - def _finalize_step(self, memory_step: ActionStep, step_start_time: float): - memory_step.end_time = time.time() - memory_step.duration = memory_step.end_time - step_start_time + def _finalize_step(self, memory_step: ActionStep): + memory_step.timing.end_time = time.time() + memory_step.timing.duration = memory_step.timing.end_time - memory_step.timing.start_time if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.input_token_count = self.model.last_input_token_count - memory_step.output_token_count = self.model.last_output_token_count + memory_step.usage = Usage( + input_tokens=self.model.last_input_token_count, + output_tokens=self.model.last_output_token_count, + ) for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( memory_step, agent=self ) - def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], step_start_time: float) -> Any: + def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) -> Any: + action_step_start_time = time.time() final_answer = self.provide_final_answer(task, images) final_memory_step = ActionStep( - step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger) + step_number=self.step_number, + error=AgentMaxStepsError("Reached max steps.", self.logger), + timing=Timing(start_time=action_step_start_time), ) final_memory_step.action_output = final_answer - self._finalize_step(final_memory_step, step_start_time) + self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) return final_answer diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 38fa9e1e9..4b0ac992e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -48,20 +48,32 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: raise NotImplementedError +@dataclass +class Usage: + input_tokens: int + output_tokens: int + + +@dataclass +class Timing: + start_time: float + end_time: float | None = None + duration: float | None = None + + @dataclass class ActionStep(MemoryStep): model_input_messages: list[Message] | None = None tool_calls: list[ToolCall] | None = None - start_time: float | None = None - end_time: float | None = None + timing: Timing | None = None step_number: int | None = None error: AgentError | None = None - duration: float | None = None model_output_message: ChatMessage | None = None model_output: str | None = None observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None + usage: Usage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually @@ -145,6 +157,8 @@ class PlanningStep(MemoryStep): 
model_input_messages: list[Message] model_output_message: ChatMessage plan: str + timing: Timing | None = None + usage: Usage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 90f846fe8..0d827a95e 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,9 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json -from dataclasses import dataclass from enum import IntEnum -from typing import Any from rich import box from rich.console import Console, Group @@ -31,18 +29,7 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor", "RunResult"] - - -@dataclass -class RunResult: - """Holds extended information about an agent run.""" - - result: Any - token_usage: dict[str, int] | None - messages: list[dict] - duration: float - state: str +__all__ = ["AgentLogger", "LogLevel", "Monitor"] class Monitor: From f4c8acb1607f46b80a15ad5a668e2f30d64c77ed Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:04:57 +0200 Subject: [PATCH 03/28] Improve usage class, add property caculations --- examples/agent_from_any_llm.py | 22 +++++---- examples/inspect_multiagent_run.py | 10 +++- examples/multi_llm_agent.py | 6 ++- src/smolagents/agents.py | 67 ++++++++++++------------- src/smolagents/gradio_ui.py | 2 +- src/smolagents/memory.py | 28 +++-------- src/smolagents/models.py | 14 +++--- src/smolagents/monitoring.py | 79 ++++++++++++++++++++++++++---- 8 files changed, 139 insertions(+), 89 deletions(-) diff --git a/examples/agent_from_any_llm.py b/examples/agent_from_any_llm.py index d5e33f0a1..593c61b2d 100644 --- a/examples/agent_from_any_llm.py +++ b/examples/agent_from_any_llm.py @@ -1,19 +1,23 @@ -from smolagents import InferenceClientModel, LiteLLMModel, OpenAIServerModel, TransformersModel, tool -from smolagents.agents import CodeAgent, ToolCallingAgent +from smolagents import ( + CodeAgent, + InferenceClientModel, + LiteLLMModel, + OpenAIServerModel, + ToolCallingAgent, + TransformersModel, + tool, +) # Choose which inference type to use! -available_inferences = ["hf_api", "hf_api_provider", "transformers", "ollama", "litellm", "openai"] -chosen_inference = "hf_api_provider" +available_inferences = ["inference_client", "transformers", "ollama", "litellm", "openai"] +chosen_inference = "inference_client" print(f"Chose model: '{chosen_inference}'") -if chosen_inference == "hf_api": - model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct") - -elif chosen_inference == "hf_api_provider": - model = InferenceClientModel(provider="together") +if chosen_inference == "inference_client": + model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct", provider="nebius") elif chosen_inference == "transformers": model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct", device_map="auto", max_new_tokens=1000) diff --git a/examples/inspect_multiagent_run.py b/examples/inspect_multiagent_run.py index c68dccb75..75bc0a07c 100644 --- a/examples/inspect_multiagent_run.py +++ b/examples/inspect_multiagent_run.py @@ -16,18 +16,24 @@ # Then we run the agentic part! 
-model = InferenceClientModel() +model = InferenceClientModel(provider="nebius") search_agent = ToolCallingAgent( tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", + return_full_result=True, ) manager_agent = CodeAgent( tools=[], model=model, managed_agents=[search_agent], + return_full_result=True, ) -manager_agent.run("If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?") +run_result = manager_agent.run( + "If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?" +) +print("Here is the token usage for the manager agent", run_result.token_usage) +print("Here are the timing informations for the manager agent:", run_result.timing) diff --git a/examples/multi_llm_agent.py b/examples/multi_llm_agent.py index 6f44ff8b4..5c002e1ea 100644 --- a/examples/multi_llm_agent.py +++ b/examples/multi_llm_agent.py @@ -39,6 +39,8 @@ model_list=llm_loadbalancer_model_list, client_kwargs={"routing_strategy": "simple-shuffle"}, ) -agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True) +agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True, return_full_results=True) -agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") +full_result = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") + +print(full_result) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 90d9d1f05..7d62c64df 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -56,8 +56,8 @@ SystemPromptStep, TaskStep, Timing, + TokenUsage, ToolCall, - Usage, ) from .models import ChatMessage, ChatMessageStreamDelta, MessageRole, Model, parse_json_if_needed from .monitoring import ( @@ -174,9 +174,9 @@ class RunResult: """Holds extended information about an agent run.""" result: Any - token_usage: dict[str, int] | None + token_usage: TokenUsage | None messages: list[dict] - duration: float + timing: Timing state: str @@ -218,7 +218,7 @@ def __init__( description: str | None = None, provide_run_summary: bool = False, final_answer_checks: list[Callable] | None = None, - return_full_results: bool = False, + return_full_result: bool = False, logger: AgentLogger | None = None, ): self.agent_name = self.__class__.__name__ @@ -226,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -245,7 +245,7 @@ def __init__( self.description = description self.provide_run_summary = provide_run_summary 
self.final_answer_checks = final_answer_checks - self.return_full_results = return_full_results + self.return_full_result = return_full_result self._setup_managed_agents(managed_agents) self._setup_tools(tools, add_base_tools) @@ -274,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" + ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -365,27 +365,18 @@ def run( return self._run_stream(task=self.task, max_steps=max_steps, images=images) run_start_time = time.time() # Outputs are returned only at the end. We only look at the last step. - try: - steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].final_answer - state = "success" - except Exception: - run_duration = time.time() - run_start_time - raise - else: - run_duration = time.time() - run_start_time - if self.return_full_results: + steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) + result = steps[-1].final_answer + + if self.return_full_result: token_usage = None - try: - token_usage = self.monitor.get_total_token_counts() - except Exception: - token_usage = None + token_usage = self.monitor.get_total_token_counts() if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): - state = "max_steps" - elif self.memory.steps and getattr(self.memory.steps[-1], "error", None) is not None: - state = "error" + state = "max_steps_error" + else: + state = "success" messages = self.memory.get_full_steps() @@ -393,7 +384,7 @@ def run( result=result, token_usage=token_usage, messages=messages, - duration=run_duration, + timing=Timing(start_time=run_start_time, end_time=time.time()), state=state, ) @@ -420,7 +411,7 @@ def _run_stream( assert isinstance(planning_step, PlanningStep) self.memory.steps.append(planning_step) if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.usage = Usage( + planning_step.usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -428,7 +419,6 @@ def _run_stream( planning_step.timing = Timing( start_time=planning_start_time, end_time=planning_end_time, - duration=planning_end_time - planning_start_time, ) action_step_start_time = time.time() action_step = ActionStep( @@ -476,9 +466,8 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() - memory_step.timing.duration = memory_step.timing.end_time - memory_step.timing.start_time if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.usage = Usage( + memory_step.usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -710,7 +699,11 @@ def __call__(self, task: str, **kwargs): self.prompt_templates["managed_agent"]["task"], variables=dict(name=self.name, task=task), ) - report = self.run(full_task, **kwargs) + result = self.run(full_task, **kwargs) + if isinstance(result, RunResult): + report = 
result.result + else: + report = result answer = populate_template( self.prompt_templates["managed_agent"]["report"], variables=dict(name=self.name, final_answer=report) ) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 384f50bc4..b85c1cddf 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -94,7 +94,7 @@ def _process_action_step(step_log: ActionStep, skip_model_outputs: bool = False) import gradio as gr # Output the step number - step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "Step" + step_number = f"Step {step_log.step_number}" if not skip_model_outputs: yield gr.ChatMessage(role="assistant", content=f"**{step_number}**", metadata={"status": "done"}) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 4b0ac992e..6410d823e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, TypedDict from smolagents.models import ChatMessage, MessageRole -from smolagents.monitoring import AgentLogger, LogLevel +from smolagents.monitoring import AgentLogger, LogLevel, Timing, TokenUsage from smolagents.utils import AgentError, make_json_serializable @@ -48,43 +48,29 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: raise NotImplementedError -@dataclass -class Usage: - input_tokens: int - output_tokens: int - - -@dataclass -class Timing: - start_time: float - end_time: float | None = None - duration: float | None = None - - @dataclass class ActionStep(MemoryStep): + step_number: int + timing: Timing model_input_messages: list[Message] | None = None tool_calls: list[ToolCall] | None = None - timing: Timing | None = None - step_number: int | None = None error: AgentError | None = None model_output_message: ChatMessage | None = None model_output: str | None = None observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None - usage: Usage | None = None + usage: TokenUsage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "start_time": self.start_time, - "end_time": self.end_time, + "timing": self.timing.dict(), + "usage": self.usage.dict() if self.usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "duration": self.duration, "model_output_message": self.model_output_message, "model_output": self.model_output, "observations": self.observations, @@ -158,7 +144,7 @@ class PlanningStep(MemoryStep): model_output_message: ChatMessage plan: str timing: Timing | None = None - usage: Usage | None = None + usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: diff --git a/src/smolagents/models.py b/src/smolagents/models.py index bba50ddc3..c0d5ac476 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -677,7 +677,7 @@ class TransformersModel(Model): Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. - For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. + For example, `"Qwen/Qwen3-32B"`. device_map (`str`, *optional*): The device_map to initialize your model with. 
torch_dtype (`str`, *optional*): @@ -695,7 +695,7 @@ class TransformersModel(Model): Example: ```python >>> engine = TransformersModel( - ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", + ... model_id="Qwen/Qwen3-32B", ... device="cuda", ... max_new_tokens=5000, ... ) @@ -1172,10 +1172,10 @@ class InferenceClientModel(ApiModel): Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more. Parameters: - model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`): + model_id (`str`, *optional*, default `"Qwen/Qwen3-32B"`): The Hugging Face model ID to be used for inference. This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint. - Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future. + Currently, it defaults to `"Qwen/Qwen3-32B"`, but this may change in the future. provider (`str`, *optional*): Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"openai"`, `"replicate"`, "sambanova"`, `"together"`, etc. Currently, it defaults to hf-inference (HF Inference API). @@ -1206,8 +1206,8 @@ class InferenceClientModel(ApiModel): Example: ```python >>> engine = InferenceClientModel( - ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", - ... provider="together", + ... model_id="Qwen/Qwen3-32B", + ... provider="nebius", ... token="your_hf_token_here", ... max_tokens=5000, ... ) @@ -1220,7 +1220,7 @@ class InferenceClientModel(ApiModel): def __init__( self, - model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", + model_id: str = "Qwen/Qwen3-32B", provider: str | None = None, token: str | None = None, timeout: int = 120, diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d827a95e..0d3a254c7 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json +from dataclasses import dataclass from enum import IntEnum from rich import box @@ -29,7 +30,61 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor"] +__all__ = ["AgentLogger", "LogLevel", "Monitor", "TokenUsage", "Timing"] + + +@dataclass +class TokenUsage: + """ + Contains the token usage information for a given step or run. + """ + + input_tokens: int + output_tokens: int + + def dict(self): + return { + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "total_tokens": self.total_tokens, + } + + @property + def total_tokens(self): + return self.input_tokens + self.output_tokens + + def __str__(self): + attributes = vars(self).copy() + attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed + return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + + +@dataclass +class Timing: + """ + Contains the timing information for a given step or run. 
+ """ + + start_time: float + end_time: float | None = None + + def dict(self): + return { + "start_time": self.start_time, + "end_time": self.end_time, + "duration": self.duration, + } + + @property + def duration(self): + if self.end_time is None: + return None + return self.end_time - self.start_time + + def __str__(self): + attributes = vars(self).copy() + attributes["duration"] = self.duration # This makes sure the duration is also printed + return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" class Monitor: @@ -41,11 +96,15 @@ def __init__(self, tracked_model, logger): self.total_input_token_count = 0 self.total_output_token_count = 0 - def get_total_token_counts(self): - return { - "input": self.total_input_token_count, - "output": self.total_output_token_count, - } + def get_total_token_counts(self) -> TokenUsage | None: + return ( + TokenUsage( + input_tokens=self.total_input_token_count, + output_tokens=self.total_output_token_count, + ) + if hasattr(self, "total_input_token_count") + else None + ) def reset(self): self.step_durations = [] @@ -58,13 +117,13 @@ def update_metrics(self, step_log): Args: step_log ([`MemoryStep`]): Step log to update the monitor with. """ - step_duration = step_log.duration + step_duration = step_log.timing.duration self.step_durations.append(step_duration) console_outputs = f"[Step {len(self.step_durations)}: Duration {step_duration:.2f} seconds" - if getattr(self.tracked_model, "last_input_token_count", None) is not None: - self.total_input_token_count += self.tracked_model.last_input_token_count - self.total_output_token_count += self.tracked_model.last_output_token_count + if step_log.usage is not None: + self.total_input_token_count += step_log.usage.input_tokens + self.total_output_token_count += step_log.usage.output_tokens console_outputs += ( f"| Input tokens: {self.total_input_token_count:,} | Output tokens: {self.total_output_token_count:,}" ) From bbf194d8d3eab59fe2bcda06aa085d1c64a3df7f Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:08:20 +0200 Subject: [PATCH 04/28] Revert deletion of step callbacks after max steps error --- src/smolagents/agents.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 7d62c64df..2103a2e37 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -226,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -274,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with 
proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -488,6 +488,10 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) final_memory_step.action_output = final_answer self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) + for callback in self.step_callbacks: + callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( + final_memory_step, agent=self + ) return final_answer def _generate_planning_step( From 962922fc468d1935106a719e60d5534bfdb8c383 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:11:08 +0200 Subject: [PATCH 05/28] Rename attributes to token_usage --- src/smolagents/agents.py | 4 ++-- src/smolagents/memory.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 2103a2e37..4108b9090 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -411,7 +411,7 @@ def _run_stream( assert isinstance(planning_step, PlanningStep) self.memory.steps.append(planning_step) if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.usage = TokenUsage( + planning_step.token_usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -467,7 +467,7 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.usage = TokenUsage( + memory_step.token_usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 6410d823e..0f24b3bce 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -60,7 +60,7 @@ class ActionStep(MemoryStep): observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None - usage: TokenUsage | None = None + token_usage: TokenUsage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually @@ -68,7 +68,7 @@ def dict(self): "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], "timing": self.timing.dict(), - "usage": self.usage.dict() if self.usage else None, + "token_usage": self.token_usage.dict() if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, "model_output_message": self.model_output_message, @@ -144,7 +144,7 @@ class PlanningStep(MemoryStep): model_output_message: ChatMessage plan: str timing: Timing | None = None - usage: TokenUsage | None = None + token_usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: From 8dd6730250048dc26f34cc16f98af9f6e68d8bc8 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:24:35 +0200 Subject: [PATCH 06/28] Update gradio UI for token 
usage --- src/smolagents/gradio_ui.py | 24 +++---- src/smolagents/models.py | 129 +++++++++++++++++++---------------- src/smolagents/monitoring.py | 6 +- 3 files changed, 83 insertions(+), 76 deletions(-) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index b85c1cddf..fce69d069 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -21,19 +21,21 @@ from smolagents.agent_types import AgentAudio, AgentImage, AgentText from smolagents.agents import MultiStepAgent, PlanningStep -from smolagents.memory import ActionStep, FinalAnswerStep, MemoryStep +from smolagents.memory import ActionStep, FinalAnswerStep from smolagents.models import ChatMessageStreamDelta from smolagents.utils import _is_package_available -def get_step_footnote_content(step_log: MemoryStep, step_name: str) -> str: +def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"): - token_str = f" | Input tokens: {step_log.input_token_count:,} | Output tokens: {step_log.output_token_count:,}" + if hasattr(step_log, "token_usage"): + token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" step_footnote += token_str - if hasattr(step_log, "duration"): - step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None + if hasattr(step_log, "timing"): + step_duration = ( + f" | Duration: {round(float(step_log.timing.duration), 2)}" if step_log.timing.duration else None + ) step_footnote += step_duration step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -222,7 +224,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: ) -def pull_messages_from_step(step_log: MemoryStep, skip_model_outputs: bool = False): +def pull_messages_from_step(step_log: ActionStep | PlanningStep | FinalAnswerStep, skip_model_outputs: bool = False): """Extract ChatMessage objects from agent steps with proper nesting. 
Args: @@ -260,13 +262,7 @@ def stream_to_gradio( for step_log in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): - # Track tokens if model provides them - if getattr(agent.model, "last_input_token_count", None) is not None: - if isinstance(step_log, (ActionStep, PlanningStep)): - step_log.input_token_count = agent.model.last_input_token_count - step_log.output_token_count = agent.model.last_output_token_count - - if isinstance(step_log, MemoryStep): + if isinstance(step_log, ActionStep | PlanningStep | FinalAnswerStep): intermediate_text = "" for message in pull_messages_from_step( step_log, diff --git a/src/smolagents/models.py b/src/smolagents/models.py index c0d5ac476..8870b3d55 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -24,6 +24,7 @@ from threading import Thread from typing import TYPE_CHECKING, Any +from .monitoring import TokenUsage from .tools import Tool from .utils import _is_package_available, encode_image_base64, make_image_url, parse_json_blob @@ -99,12 +100,13 @@ class ChatMessage: content: str | None = None tool_calls: list[ChatMessageToolCall] | None = None raw: Any | None = None # Stores the raw output from the API + token_usage: TokenUsage | None = None def model_dump_json(self): return json.dumps(get_dict_from_nested_dataclasses(self, ignore_key="raw")) @classmethod - def from_dict(cls, data: dict, raw: Any | None = None) -> "ChatMessage": + def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | None = None) -> "ChatMessage": if data.get("tool_calls"): tool_calls = [ ChatMessageToolCall( @@ -113,7 +115,13 @@ def from_dict(cls, data: dict, raw: Any | None = None) -> "ChatMessage": for tc in data["tool_calls"] ] data["tool_calls"] = tool_calls - return cls(role=data["role"], content=data.get("content"), tool_calls=data.get("tool_calls"), raw=raw) + return cls( + role=data["role"], + content=data.get("content"), + tool_calls=data.get("tool_calls"), + raw=raw, + token_usage=token_usage, + ) def dict(self): return json.dumps(get_dict_from_nested_dataclasses(self)) @@ -142,6 +150,7 @@ def parse_json_if_needed(arguments: str | dict) -> str | dict: class ChatMessageStreamDelta: content: str | None = None tool_calls: list[ChatMessageToolCall] | None = None + token_usage: TokenUsage | None = None class MessageRole(str, Enum): @@ -301,8 +310,6 @@ def __init__( self.tool_name_key = tool_name_key self.tool_arguments_key = tool_arguments_key self.kwargs = kwargs - self.last_input_token_count: int | None = None - self.last_output_token_count: int | None = None self.model_id: str | None = model_id def _prepare_completion_kwargs( @@ -359,14 +366,6 @@ def _prepare_completion_kwargs( return completion_kwargs - def get_token_counts(self) -> dict[str, int]: - if self.last_input_token_count is None or self.last_output_token_count is None: - raise ValueError("Token counts are not available") - return { - "input_token_count": self.last_input_token_count, - "output_token_count": self.last_output_token_count, - } - def generate( self, messages: list[dict[str, str | list[dict]]], @@ -416,8 +415,6 @@ def to_dict(self) -> dict: """ model_dictionary = { **self.kwargs, - "last_input_token_count": self.last_input_token_count, - "last_output_token_count": self.last_output_token_count, "model_id": self.model_id, } for attribute in [ @@ -446,16 +443,7 @@ def to_dict(self) -> dict: @classmethod def from_dict(cls, model_dictionary: dict[str, Any]) -> "Model": - model_instance = cls( - **{ - 
k: v - for k, v in model_dictionary.items() - if k not in ["last_input_token_count", "last_output_token_count"] - } - ) - model_instance.last_input_token_count = model_dictionary.pop("last_input_token_count", None) - model_instance.last_output_token_count = model_dictionary.pop("last_output_token_count", None) - return model_instance + return cls(**{k: v for k, v in model_dictionary.items()}) class VLLMModel(Model): @@ -554,12 +542,14 @@ def generate( sampling_params=sampling_params, ) output_text = out[0].outputs[0].text - self.last_input_token_count = len(out[0].prompt_token_ids) - self.last_output_token_count = len(out[0].outputs[0].token_ids) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, raw={"out": output_text, "completion_kwargs": completion_kwargs}, + token_usage=TokenUsage( + input_tokens=len(out[0].prompt_token_ids), + output_tokens=len(out[0].outputs[0].token_ids), + ), ) @@ -651,18 +641,23 @@ def generate( add_generation_prompt=True, ) - self.last_input_token_count = len(prompt_ids) - self.last_output_token_count = 0 + output_tokens = 0 text = "" for response in self.stream_generate(self.model, self.tokenizer, prompt=prompt_ids, **completion_kwargs): - self.last_output_token_count += 1 + output_tokens += 1 text += response.text if any((stop_index := text.rfind(stop)) != -1 for stop in stops): text = text[:stop_index] break return ChatMessage( - role=MessageRole.ASSISTANT, content=text, raw={"out": text, "completion_kwargs": completion_kwargs} + role=MessageRole.ASSISTANT, + content=text, + raw={"out": text, "completion_kwargs": completion_kwargs}, + token_usage=TokenUsage( + input_tokens=len(prompt_ids), + output_tokens=output_tokens, + ), ) @@ -870,8 +865,6 @@ def generate( output_text = self.processor.decode(generated_tokens, skip_special_tokens=True) else: output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) - self.last_input_token_count = count_prompt_tokens - self.last_output_token_count = len(generated_tokens) if stop_sequences is not None: output_text = remove_stop_sequences(output_text, stop_sequences) @@ -883,6 +876,10 @@ def generate( "out": output_text, "completion_kwargs": {key: value for key, value in generation_kwargs.items() if key != "inputs"}, }, + token_usage=TokenUsage( + input_tokens=count_prompt_tokens, + output_tokens=len(generated_tokens), + ), ) def generate_stream( @@ -905,14 +902,13 @@ def generate_stream( thread = Thread(target=self.model.generate, kwargs={"streamer": self.streamer, **generation_kwargs}) thread.start() - self.last_output_token_count = 0 - # Generate with streaming for new_text in self.streamer: - yield ChatMessageStreamDelta(content=new_text, tool_calls=None) - self.last_output_token_count += 1 - - self.last_input_token_count = count_prompt_tokens + yield ChatMessageStreamDelta( + content=new_text, + tool_calls=None, + token_usage=TokenUsage(input_tokens=count_prompt_tokens, output_tokens=1), + ) thread.join() @@ -1030,11 +1026,13 @@ def generate( response = self.client.completion(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), ) def generate_stream( @@ -1065,10 +1063,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( 
content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens class LiteLLMRouterModel(LiteLLMModel): @@ -1274,9 +1273,14 @@ def generate( ) response = self.client.chat_completion(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens - return ChatMessage.from_dict(asdict(response.choices[0].message), raw=response) + return ChatMessage.from_dict( + asdict(response.choices[0].message), + raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), + ) def generate_stream( self, @@ -1308,10 +1312,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens class HfApiModel(InferenceClientModel): @@ -1414,10 +1419,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens def generate( self, @@ -1438,12 +1444,14 @@ def generate( **kwargs, ) response = self.client.chat.completions.create(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), ) @@ -1665,13 +1673,16 @@ def generate( # self.client is created in ApiModel class response = self.client.converse(**completion_kwargs) - # Get usage - self.last_input_token_count = response["usage"]["inputTokens"] - self.last_output_token_count = response["usage"]["outputTokens"] - # Get first message response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"] - return ChatMessage.from_dict(response["output"]["message"], raw=response) + return ChatMessage.from_dict( + response["output"]["message"], + raw=response, + token_usage=TokenUsage( + input_tokens=response["usage"]["inputTokens"], + output_tokens=response["usage"]["outputTokens"], + ), + ) __all__ = [ diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d3a254c7..cfed370f3 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -121,9 +121,9 @@ def update_metrics(self, step_log): self.step_durations.append(step_duration) console_outputs = f"[Step {len(self.step_durations)}: Duration {step_duration:.2f} seconds" - if step_log.usage is not None: - self.total_input_token_count += step_log.usage.input_tokens - self.total_output_token_count += step_log.usage.output_tokens + if step_log.token_usage is not None: + 
self.total_input_token_count += step_log.token_usage.input_tokens + self.total_output_token_count += step_log.token_usage.output_tokens console_outputs += ( f"| Input tokens: {self.total_input_token_count:,} | Output tokens: {self.total_output_token_count:,}" ) From 795f76e18e66a3ec0dd62ffdccdda0dc7b8b7780 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 13:34:26 +0200 Subject: [PATCH 07/28] Fix gradio chatbot as much as possible --- examples/gradio_ui.py | 6 +- src/smolagents/agents.py | 107 ++++++++++++++++++++++++----------- src/smolagents/gradio_ui.py | 36 +++++------- src/smolagents/models.py | 16 ++++-- src/smolagents/monitoring.py | 19 +++---- 5 files changed, 110 insertions(+), 74 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index 87f532689..e82684202 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -3,13 +3,13 @@ agent = CodeAgent( tools=[], - model=InferenceClientModel(), + model=InferenceClientModel(provider="nebius"), verbosity_level=1, - planning_interval=3, + # planning_interval=3, name="example_agent", description="This is an example agent.", step_callbacks=[], - stream_outputs=False, + stream_outputs=True, ) GradioUI(agent, file_upload_folder="./data").launch() diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 1101bfa50..502272a31 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -99,6 +99,11 @@ def populate_template(template: str, variables: dict[str, Any]) -> str: raise Exception(f"Error during jinja template rendering: {type(e).__name__}: {e}") +@dataclass +class FinalOutput: + output: Any | None + + class PlanningPromptTemplate(TypedDict): """ Prompt templates for the planning step. @@ -173,7 +178,7 @@ class PromptTemplates(TypedDict): class RunResult: """Holds extended information about an agent run.""" - result: Any + output: Any | None token_usage: TokenUsage | None messages: list[dict] timing: Timing @@ -367,7 +372,7 @@ def run( # Outputs are returned only at the end. We only look at the last step. 
steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].final_answer + result = steps[-1].output if self.return_full_result: token_usage = None @@ -381,7 +386,7 @@ def run( messages = self.memory.get_full_steps() return RunResult( - result=result, + output=result, token_usage=token_usage, messages=messages, timing=Timing(start_time=run_start_time, end_time=time.time()), @@ -392,12 +397,14 @@ def run( def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None - ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep]: + ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep | ChatMessageStreamDelta]: final_answer = None self.step_number = 1 while final_answer is None and self.step_number <= max_steps: if self.interrupt_switch: raise AgentError("Agent interrupted.", self.logger) + + # Run a planning step if scheduled if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): @@ -408,18 +415,15 @@ def _run_stream( ): yield element planning_step = element - assert isinstance(planning_step, PlanningStep) + assert isinstance(planning_step, PlanningStep) # Last yielded element should be a PlanningStep self.memory.steps.append(planning_step) - if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.token_usage = TokenUsage( - input_tokens=self.model.last_input_token_count, - output_tokens=self.model.last_output_token_count, - ) planning_end_time = time.time() planning_step.timing = Timing( start_time=planning_start_time, end_time=planning_end_time, ) + + # Start action step! action_step_start_time = time.time() action_step = ActionStep( step_number=self.step_number, @@ -447,15 +451,17 @@ def _run_stream( yield action_step yield FinalAnswerStep(handle_agent_output_types(final_answer)) - def _execute_step(self, memory_step: ActionStep) -> Generator[Any]: + def _execute_step(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO) - final_answer = None for el in self._step_stream(memory_step): final_answer = el - yield el - if final_answer is not None and self.final_answer_checks: - self._validate_final_answer(final_answer) - yield final_answer + if isinstance(el, ChatMessageStreamDelta): + yield el + elif isinstance(el, FinalOutput): + final_answer = el.output + if self.final_answer_checks: + self._validate_final_answer(final_answer) + yield final_answer def _validate_final_answer(self, final_answer: Any): for check_function in self.final_answer_checks: @@ -496,7 +502,8 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) def _generate_planning_step( self, task, is_first_step: bool, step: int - ) -> Generator[ChatMessageStreamDelta, PlanningStep]: + ) -> Generator[ChatMessageStreamDelta | PlanningStep]: + start_time = time.time() if is_first_step: input_messages = [ { @@ -515,14 +522,23 @@ def _generate_planning_step( if self.stream_outputs and hasattr(self.model, "generate_stream"): plan_message_content = "" output_stream = self.model.generate_stream(input_messages, stop_sequences=[""]) # type: ignore + input_tokens, output_tokens = 0, 0 with Live("", console=self.logger.console, vertical_overflow="visible") as live: for event in output_stream: if event.content is not None: plan_message_content += event.content live.update(Markdown(plan_message_content)) + if event.token_usage: + 
output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens yield event else: - plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content + plan_message = self.model.generate(input_messages, stop_sequences=[""]) + plan_message_content = plan_message.content + input_tokens, output_tokens = ( + plan_message.token_usage.input_tokens, + plan_message.token_usage.output_tokens, + ) plan = textwrap.dedent( f"""Here are the facts I know and the plan of action that I will follow to solve the task:\n```\n{plan_message_content}\n```""" ) @@ -561,11 +577,25 @@ def _generate_planning_step( input_messages = [plan_update_pre] + memory_messages + [plan_update_post] if self.stream_outputs and hasattr(self.model, "generate_stream"): plan_message_content = "" - for completion_delta in self.model.generate_stream(input_messages, stop_sequences=[""]): # type: ignore - plan_message_content += completion_delta.content - yield completion_delta + input_tokens, output_tokens = 0, 0 + with Live("", console=self.logger.console, vertical_overflow="visible") as live: + for event in self.model.generate_stream( + input_messages, + stop_sequences=[""], + ): # type: ignore + if event.content is not None: + plan_message_content += event.content + live.update(Markdown(plan_message_content)) + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens + yield event else: - plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content + plan_message = self.model.generate(input_messages, stop_sequences=[""]) + plan_message_content = plan_message.content + input_tokens, output_tokens = ( + plan_message.token_usage.input_tokens, + plan_message.token_usage.output_tokens, + ) plan = textwrap.dedent( f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere are the facts I know and my new/updated plan of action to solve the task:\n```\n{plan_message_content}\n```""" ) @@ -575,6 +605,8 @@ def _generate_planning_step( model_input_messages=input_messages, plan=plan, model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content=plan_message_content), + token_usage=TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens), + timing=Timing(start_time=start_time, end_time=time.time()), ) @property @@ -607,7 +639,7 @@ def write_memory_to_messages( messages.extend(memory_step.to_messages(summary_mode=summary_mode)) return messages - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Yields either None if the step is not final, or the final answer. @@ -709,7 +741,7 @@ def __call__(self, task: str, **kwargs): ) result = self.run(full_task, **kwargs) if isinstance(result, RunResult): - report = result.result + report = result.output else: report = result answer = populate_template( @@ -1105,7 +1137,7 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Yields either None if the step is not final, or the final answer. 
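# --- Illustrative sketch (annotation, not part of the patch): with streaming enabled,
# `run(..., stream=True)` now yields memory steps *and* ChatMessageStreamDelta chunks,
# so consumers branch on the event type, much like `stream_to_gradio` further below.
# Agent construction and the task are placeholders.
from smolagents import CodeAgent, InferenceClientModel
from smolagents.memory import ActionStep, FinalAnswerStep, PlanningStep
from smolagents.models import ChatMessageStreamDelta

agent = CodeAgent(tools=[], model=InferenceClientModel(), stream_outputs=True)
live_text = ""
for event in agent.run("Plan and solve a small task", stream=True):
    if isinstance(event, ChatMessageStreamDelta):
        live_text += event.content or ""      # partial model output as it is generated
    elif isinstance(event, (ActionStep, PlanningStep)):
        live_text = ""                         # a completed step closes the current streamed text
    elif isinstance(event, FinalAnswerStep):
        print(event.final_answer)              # renamed to `.output` later in this series
# --- end sketch ---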
@@ -1118,7 +1150,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: memory_step.model_input_messages = input_messages try: - chat_message: ChatMessage = self.model( + chat_message: ChatMessage = self.model.generate( input_messages, stop_sequences=["Observation:", "Calling tools:"], tools_to_call_from=list(self.tools.values()), @@ -1179,7 +1211,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: ) memory_step.action_output = final_answer - yield final_answer + yield FinalOutput(output=final_answer) else: if tool_arguments is None: tool_arguments = {} @@ -1201,7 +1233,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: level=LogLevel.INFO, ) memory_step.observations = updated_information - yield None + yield FinalOutput(output=None) def _substitute_state_variables(self, arguments: dict[str, str] | str) -> dict[str, Any] | str: """Replace string values in arguments with their corresponding state values if they exist.""" @@ -1369,10 +1401,11 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. - Yields either None if the step is not final, or the final answer. + Yields ChatMessageStreamDelta during the run if streaming is enabled. + At the end, yields either None if the step is not final, or the final answer. """ memory_messages = self.write_memory_to_messages() @@ -1388,15 +1421,24 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: **additional_args, ) output_text = "" + input_tokens, output_tokens = 0, 0 with Live("", console=self.logger.console, vertical_overflow="visible") as live: for event in output_stream: if event.content is not None: output_text += event.content live.update(Markdown(output_text)) + if event.token_usage: + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens + assert isinstance(event, ChatMessageStreamDelta) yield event model_output = output_text - chat_message = ChatMessage(role="assistant", content=model_output) + chat_message = ChatMessage( + role="assistant", + content=model_output, + token_usage=TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens), + ) memory_step.model_output_message = chat_message model_output = chat_message.content else: @@ -1419,6 +1461,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: model_output += "" memory_step.model_output_message.content = model_output + memory_step.token_usage = chat_message.token_usage memory_step.model_output = model_output except Exception as e: raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e @@ -1480,7 +1523,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: ] self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO) memory_step.action_output = output - yield output if is_final_answer else None + yield FinalOutput(output=output if is_final_answer else None) def to_dict(self) -> dict[str, Any]: """Convert the agent to a dictionary representation. 
diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index fce69d069..89fb7aaa5 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -29,14 +29,11 @@ def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if hasattr(step_log, "token_usage"): + if getattr(step_log, "token_usage", None): token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" step_footnote += token_str - if hasattr(step_log, "timing"): - step_duration = ( - f" | Duration: {round(float(step_log.timing.duration), 2)}" if step_log.timing.duration else None - ) - step_footnote += step_duration + step_duration = f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else None + step_footnote += step_duration step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -252,26 +249,27 @@ def stream_to_gradio( task_images: list | None = None, reset_agent_memory: bool = False, additional_args: dict | None = None, -): +) -> Generator: """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" if not _is_package_available("gradio"): raise ModuleNotFoundError( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" - for step_log in agent.run( + for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): - if isinstance(step_log, ActionStep | PlanningStep | FinalAnswerStep): + if isinstance(event, ActionStep | PlanningStep | FinalAnswerStep): intermediate_text = "" for message in pull_messages_from_step( - step_log, + event, # If we're streaming model outputs, no need to display them twice skip_model_outputs=getattr(agent, "stream_outputs", False), ): yield message - elif isinstance(step_log, ChatMessageStreamDelta): - intermediate_text += step_log.content or "" + elif isinstance(event, ChatMessageStreamDelta): + print(event) + intermediate_text += event.content or "" yield intermediate_text @@ -306,19 +304,15 @@ def interact_with_agent(self, prompt, messages, session_state): if isinstance(msg, gr.ChatMessage): messages.append(msg) elif isinstance(msg, str): # Then it's only a completion delta - try: - if messages[-1].metadata["status"] == "pending": - messages[-1].content = msg - else: - messages.append( - gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}) - ) - except Exception as e: - raise e + if messages[-1].metadata["status"] == "pending": + messages[-1].content = msg + else: + messages.append(gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"})) yield messages yield messages except Exception as e: + raise e print(f"Error in interaction: {str(e)}") messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 714123f09..68f4fa090 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -368,7 +368,7 @@ def _prepare_completion_kwargs( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]]] | list[ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -377,7 
+377,7 @@ def generate( """Process the input messages and return the model's response. Parameters: - messages (`list[dict[str, str]]`): + messages (`list[dict[str, str | list[dict]]] | list[ChatMessage]`): A list of message dictionaries to be processed. Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`. stop_sequences (`List[str]`, *optional*): A list of strings that will stop the generation if encountered in the model's output. @@ -1318,12 +1318,16 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield ChatMessageStreamDelta( - content=event.choices[0].delta.content, - token_usage=TokenUsage( + if getattr(event, "usage", None): + token_usage = TokenUsage( input_tokens=event.usage.prompt_tokens, output_tokens=event.usage.completion_tokens, - ), + ) + else: + token_usage = None + yield ChatMessageStreamDelta( + content=event.choices[0].delta.content, + token_usage=token_usage, ) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index cfed370f3..0effcfcf4 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -92,18 +92,13 @@ def __init__(self, tracked_model, logger): self.step_durations = [] self.tracked_model = tracked_model self.logger = logger - if getattr(self.tracked_model, "last_input_token_count", "Not found") != "Not found": - self.total_input_token_count = 0 - self.total_output_token_count = 0 - - def get_total_token_counts(self) -> TokenUsage | None: - return ( - TokenUsage( - input_tokens=self.total_input_token_count, - output_tokens=self.total_output_token_count, - ) - if hasattr(self, "total_input_token_count") - else None + self.total_input_token_count = 0 + self.total_output_token_count = 0 + + def get_total_token_counts(self) -> TokenUsage: + return TokenUsage( + input_tokens=self.total_input_token_count, + output_tokens=self.total_output_token_count, ) def reset(self): From c7be43b8bad0a5fa4e7089998bde2499052faecc Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:03:52 +0200 Subject: [PATCH 08/28] Fix gradio chatbot by escaping HTML tags --- examples/gradio_ui.py | 4 ++-- src/smolagents/agents.py | 18 +++++++++--------- src/smolagents/gradio_ui.py | 8 ++++++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index e82684202..2b28ce109 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -3,9 +3,9 @@ agent = CodeAgent( tools=[], - model=InferenceClientModel(provider="nebius"), + model=InferenceClientModel(), verbosity_level=1, - # planning_interval=3, + planning_interval=3, name="example_agent", description="This is an example agent.", step_callbacks=[], diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 502272a31..834e3eb24 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in 
prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" + ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 89fb7aaa5..f0b0871bb 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -222,7 +222,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: def pull_messages_from_step(step_log: ActionStep | PlanningStep | FinalAnswerStep, skip_model_outputs: bool = False): - """Extract ChatMessage objects from agent steps with proper nesting. + """Extract Gradio ChatMessage objects from agent steps with proper nesting. Args: step_log: The step log to display as gr.ChatMessage objects. @@ -256,6 +256,8 @@ def stream_to_gradio( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" + import time + for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): @@ -268,7 +270,7 @@ def stream_to_gradio( ): yield message elif isinstance(event, ChatMessageStreamDelta): - print(event) + time.sleep(0.1) intermediate_text += event.content or "" yield intermediate_text @@ -302,8 +304,10 @@ def interact_with_agent(self, prompt, messages, session_state): for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False): if isinstance(msg, gr.ChatMessage): + messages[-1].metadata["status"] = "done" messages.append(msg) elif isinstance(msg, str): # Then it's only a completion delta + msg = msg.replace("<", r"\<").replace(">", r"\>") # HTML tags seem to break Gradio Chatbot if messages[-1].metadata["status"] == "pending": messages[-1].content = msg else: From 3446f7ead7c2cd6ae14d4d2b90a844caf6d9c5b1 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:28:55 +0200 Subject: [PATCH 09/28] Pass monitoring tests --- src/smolagents/agents.py | 41 +++++++++++++++--------------------- src/smolagents/gradio_ui.py | 2 +- src/smolagents/memory.py | 2 +- tests/test_monitoring.py | 42 ++++++++++++++----------------------- 4 files changed, 35 insertions(+), 52 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 834e3eb24..f568531eb 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not 
missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -472,11 +472,6 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() - if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.token_usage = TokenUsage( - input_tokens=self.model.last_input_token_count, - output_tokens=self.model.last_output_token_count, - ) for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( @@ -489,16 +484,13 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) final_memory_step = ActionStep( step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger), - timing=Timing(start_time=action_step_start_time), + timing=Timing(start_time=action_step_start_time, end_time=time.time()), + token_usage=final_answer.token_usage, ) - final_memory_step.action_output = final_answer + final_memory_step.action_output = final_answer.content self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) - for callback in self.step_callbacks: - callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( - final_memory_step, agent=self - ) - return final_answer + return final_answer.content def _generate_planning_step( self, task, is_first_step: bool, step: int @@ -674,7 +666,7 @@ def extract_action(self, model_output: str, split_token: str) -> tuple[str, str] ) return rationale.strip(), action.strip() - def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None = None) -> str: + def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None = None) -> ChatMessage: """ Provide the final answer to the task, based on the logs of the agent's interactions. 
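# --- Illustrative sketch (annotation, not part of the patch): `provide_final_answer`
# now returns the full ChatMessage rather than a bare string, so callers read
# `.content` (and may inspect `.token_usage`), as the updated tests do. Assumes an
# `agent` constructed as in the earlier sketch; the task string is a placeholder.
final_message = agent.provide_final_answer(task="Wrap up the analysis", images=None)
print(final_message.content)                   # the final answer text
if final_message.token_usage is not None:
    print(final_message.token_usage.input_tokens, final_message.token_usage.output_tokens)
# --- end sketch ---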
@@ -713,8 +705,8 @@ def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None } ] try: - chat_message: ChatMessage = self.model(messages) - return chat_message.content + chat_message: ChatMessage = self.model.generate(messages) + return chat_message except Exception as e: return f"Error in generating final LLM output:\n{e}" @@ -1181,6 +1173,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[FinalOutput]: tool_arguments = tool_call.function.arguments memory_step.model_output = str(f"Called Tool: '{tool_name}' with arguments: {tool_arguments}") memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)] + memory_step.token_usage = chat_message.token_usage # Execute self.logger.log( diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index f0b0871bb..a2b188c19 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -196,7 +196,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: """ import gradio as gr - final_answer = step_log.final_answer + final_answer = step_log.output if isinstance(final_answer, AgentText): yield gr.ChatMessage( role="assistant", diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 0f24b3bce..033ca261a 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -182,7 +182,7 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: @dataclass class FinalAnswerStep(MemoryStep): - final_answer: Any + output: Any class AgentMemory: diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index b571a233a..6b5eaac1a 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -20,6 +20,7 @@ from smolagents import ( AgentImage, CodeAgent, + RunResult, ToolCallingAgent, stream_to_gradio, ) @@ -28,14 +29,11 @@ ChatMessageToolCall, ChatMessageToolCallDefinition, Model, + TokenUsage, ) class FakeLLMModel(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, tools_to_call_from=None, **kwargs): if tools_to_call_from is not None: return ChatMessage( @@ -48,6 +46,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): function=ChatMessageToolCallDefinition(name="final_answer", arguments={"answer": "image"}), ) ], + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) else: return ChatMessage( @@ -57,6 +56,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): ```py final_answer('This is the final answer.') ```""", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) @@ -86,12 +86,12 @@ def test_toolcalling_agent_metrics(self): def test_code_agent_metrics_max_steps(self): class FakeLLMModelMalformedAnswer(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, **kwargs): - return ChatMessage(role="assistant", content="Malformed answer") + return ChatMessage( + role="assistant", + content="Malformed answer", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), + ) agent = CodeAgent( tools=[], @@ -106,13 +106,7 @@ def generate(self, prompt, **kwargs): def test_code_agent_metrics_generation_error(self): class FakeLLMModelGenerationException(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, **kwargs): - self.last_input_token_count = 10 - self.last_output_token_count = 0 raise Exception("Cannot generate") agent = 
CodeAgent( @@ -120,11 +114,9 @@ def generate(self, prompt, **kwargs): model=FakeLLMModelGenerationException(), max_steps=1, ) - with pytest.raises(Exception): + with pytest.raises(Exception) as e: agent.run("Fake task") - - self.assertEqual(agent.monitor.total_input_token_count, 10) # Should have done one monitoring callbacks - self.assertEqual(agent.monitor.total_output_token_count, 0) + assert "Cannot generate" in str(e.value) def test_streaming_agent_text_output(self): agent = CodeAgent( @@ -187,21 +179,19 @@ def generate(self, prompt, **kwargs): self.assertEqual(final_message.role, "assistant") self.assertIn("Malformed call", final_message.content) - def test_run_return_full_results(self): + def test_run_return_full_result(self): agent = CodeAgent( tools=[], model=FakeLLMModel(), max_steps=1, - return_full_results=True, + return_full_result=True, ) result = agent.run("Fake task") - from smolagents import RunResult - self.assertIsInstance(result, RunResult) - self.assertEqual(result.result, "This is the final answer.") + self.assertEqual(result.output, "This is the final answer.") self.assertEqual(result.state, "success") - self.assertEqual(result.token_usage, {"input": 10, "output": 20}) + self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) self.assertIsInstance(result.messages, list) - self.assertGreater(result.duration, 0) + self.assertGreater(result.timing.duration, 0) From 816466ac4334b6f5591a0091136e5221ebe3c541 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:38:25 +0200 Subject: [PATCH 10/28] Pass more tests --- src/smolagents/agents.py | 18 +++++++-------- tests/test_agents.py | 3 --- tests/test_gradio_ui.py | 47 +++++++++++++++++++++++++++------------- tests/test_memory.py | 13 +++++------ 4 files changed, 47 insertions(+), 34 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index f568531eb..6cc860110 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" 
+ ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): diff --git a/tests/test_agents.py b/tests/test_agents.py index a350dbff4..650c76faa 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -678,8 +678,6 @@ def generate(self, messages, stop_sequences=None): def test_step_number(self): fake_model = MagicMock() - fake_model.last_input_token_count = 10 - fake_model.last_output_token_count = 20 max_steps = 2 agent = CodeAgent(tools=[], model=fake_model, max_steps=max_steps) assert hasattr(agent, "step_number"), "step_number attribute should be defined" @@ -852,7 +850,6 @@ def test_provide_final_answer(self, images, expected_messages_list): def test_interrupt(self): fake_model = MagicMock() fake_model.return_value.content = "Model output." - fake_model.last_input_token_count = None def interrupt_callback(memory_step, agent): agent.interrupt() diff --git a/tests/test_gradio_ui.py b/tests/test_gradio_ui.py index f748bf1e0..4a527b5c4 100644 --- a/tests/test_gradio_ui.py +++ b/tests/test_gradio_ui.py @@ -25,6 +25,7 @@ from smolagents.gradio_ui import GradioUI, pull_messages_from_step, stream_to_gradio from smolagents.memory import ActionStep, FinalAnswerStep, PlanningStep, ToolCall from smolagents.models import ChatMessageStreamDelta +from smolagents.monitoring import Timing, TokenUsage class GradioUITester(unittest.TestCase): @@ -221,11 +222,9 @@ def test_action_step_basic( model_output="This is the model output", observations="Some execution logs", error=None, - duration=2.5, + timing=Timing(start_time=1.0, end_time=3.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), ) - # Set in stream_to_gradio: - step.input_token_count = 100 - step.output_token_count = 50 messages = list(pull_messages_from_step(step)) assert len(messages) == 5 # step number, model_output, logs, footnote, divider for message, expected_content in zip( @@ -246,7 +245,8 @@ def test_action_step_with_tool_calls(self): step_number=2, tool_calls=[ToolCall(name="test_tool", arguments={"answer": "Test answer"}, id="tool_call_1")], observations="Tool execution logs", - duration=1.5, + timing=Timing(start_time=1.0, end_time=2.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), ) messages = list(pull_messages_from_step(step)) assert len(messages) == 5 # step, tool call, logs, footnote, divider @@ -266,7 +266,12 @@ def test_action_step_tool_call_formats(self, tool_name, args, expected): tool_call = Mock() tool_call.name = tool_name tool_call.arguments = args - step = ActionStep(step_number=1, tool_calls=[tool_call], duration=1.5) + step = ActionStep( + step_number=1, + tool_calls=[tool_call], + timing=Timing(start_time=1.0, end_time=2.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), + ) messages = list(pull_messages_from_step(step)) tool_message = next( msg @@ -281,7 +286,12 @@ def test_action_step_tool_call_formats(self, tool_name, args, expected): def test_action_step_with_error(self): """Test ActionStep with error.""" - step = ActionStep(step_number=3, error="This is an error message", duration=1.0) + step = ActionStep( + step_number=3, + error="This is an error message", + timing=Timing(start_time=1.0, end_time=2.0), + token_usage=TokenUsage(input_tokens=100, output_tokens=200), + ) messages = list(pull_messages_from_step(step)) error_message = next((m for m in messages if "error" in str(m.content).lower()), None) assert error_message is not None @@ -289,7 +299,12 @@ def test_action_step_with_error(self): def 
test_action_step_with_images(self): """Test ActionStep with observation images.""" - step = ActionStep(step_number=4, observations_images=["image1.png", "image2.jpg"], duration=1.0) + step = ActionStep( + step_number=4, + observations_images=["image1.png", "image2.jpg"], + token_usage=TokenUsage(input_tokens=100, output_tokens=200), + timing=Timing(start_time=1.0, end_time=2.0), + ) with patch("smolagents.gradio_ui.AgentImage") as mock_agent_image: mock_agent_image.return_value.to_string.side_effect = lambda: "path/to/image.png" messages = list(pull_messages_from_step(step)) @@ -301,11 +316,11 @@ def test_action_step_with_images(self): def test_planning_step(self, skip_model_outputs, expected_messages_length): """Test PlanningStep processing.""" step = PlanningStep( - plan="1. First step\n2. Second step", model_input_messages=Mock(), model_output_message=Mock() + plan="1. First step\n2. Second step", + model_input_messages=Mock(), + model_output_message=Mock(), + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) - # Set in stream_to_gradio: - step.input_token_count = 80 - step.output_token_count = 30 messages = list(pull_messages_from_step(step, skip_model_outputs=skip_model_outputs)) assert len(messages) == expected_messages_length # [header, plan,] footnote, divider expected_contents = [ @@ -331,7 +346,9 @@ def test_final_answer_step(self, answer_type, answer_value, expected_content): except TypeError: with patch.object(answer_type, "to_string", return_value=answer_value): final_answer = answer_type(answer_value) - step = FinalAnswerStep(final_answer=final_answer) + step = FinalAnswerStep( + output=final_answer, + ) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content == expected_content @@ -339,7 +356,7 @@ def test_final_answer_step(self, answer_type, answer_value, expected_content): def test_final_answer_step_image(self): """Test FinalAnswerStep with image answer.""" with patch.object(AgentImage, "to_string", return_value="path/to/image.png"): - step = FinalAnswerStep(final_answer=AgentImage("path/to/image.png")) + step = FinalAnswerStep(output=AgentImage("path/to/image.png")) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content["path"] == "path/to/image.png" @@ -348,7 +365,7 @@ def test_final_answer_step_image(self): def test_final_answer_step_audio(self): """Test FinalAnswerStep with audio answer.""" with patch.object(AgentAudio, "to_string", return_value="path/to/audio.wav"): - step = FinalAnswerStep(final_answer=AgentAudio("path/to/audio.wav")) + step = FinalAnswerStep(output=AgentAudio("path/to/audio.wav")) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content["path"] == "path/to/audio.wav" diff --git a/tests/test_memory.py b/tests/test_memory.py index 04c6b7f47..7990698f6 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -12,6 +12,7 @@ SystemPromptStep, TaskStep, ) +from smolagents.monitoring import Timing, TokenUsage class TestAgentMemory: @@ -43,16 +44,15 @@ def test_action_step_to_messages(): tool_calls=[ ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}), ], - start_time=0.0, - end_time=1.0, + timing=Timing(start_time=0.0, end_time=1.0), step_number=1, error=None, - duration=1.0, model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"), model_output="Hi", observations="This is a nice observation", observations_images=["image1.png"], action_output="Output", + 
token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) messages = action_step.to_messages() assert len(messages) == 4 @@ -93,16 +93,15 @@ def test_action_step_to_messages_no_tool_calls_with_observations(): action_step = ActionStep( model_input_messages=None, tool_calls=None, - start_time=None, - end_time=None, - step_number=None, + timing=Timing(start_time=0.0, end_time=1.0), + step_number=1, error=None, - duration=None, model_output_message=None, model_output=None, observations="This is an observation.", observations_images=None, action_output=None, + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) messages = action_step.to_messages() assert len(messages) == 1 From 934361130bfa863207d653a1d12b658bd3c8331a Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 16:47:06 +0200 Subject: [PATCH 11/28] Revert default LLM upgrade --- examples/open_deep_research/run_gaia.py | 2 +- src/smolagents/models.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 9c7bacd4e..9a22e7174 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -183,7 +183,7 @@ def answer_single_question( else: model_params["max_tokens"] = 4096 model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) + # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="novita", max_tokens=4096) document_inspection_tool = TextInspectorTool(model, 100000) agent = create_agent_team(model) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 68f4fa090..77b2f26eb 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -672,7 +672,7 @@ class TransformersModel(Model): Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. - For example, `"Qwen/Qwen3-32B"`. + For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. device_map (`str`, *optional*): The device_map to initialize your model with. torch_dtype (`str`, *optional*): @@ -690,7 +690,7 @@ class TransformersModel(Model): Example: ```python >>> engine = TransformersModel( - ... model_id="Qwen/Qwen3-32B", + ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", ... device="cuda", ... max_new_tokens=5000, ... ) @@ -1173,10 +1173,10 @@ class InferenceClientModel(ApiModel): Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more. Parameters: - model_id (`str`, *optional*, default `"Qwen/Qwen3-32B"`): + model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`): The Hugging Face model ID to be used for inference. This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint. - Currently, it defaults to `"Qwen/Qwen3-32B"`, but this may change in the future. + Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future. provider (`str`, *optional*): Name of the provider to use for inference. A list of supported providers can be found in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index#partners). Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order [here](https://hf.co/settings/inference-providers). 
@@ -1211,7 +1211,7 @@ class InferenceClientModel(ApiModel): Example: ```python >>> engine = InferenceClientModel( - ... model_id="Qwen/Qwen3-32B", + ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", ... provider="nebius", ... token="your_hf_token_here", ... max_tokens=5000, @@ -1225,7 +1225,7 @@ class InferenceClientModel(ApiModel): def __init__( self, - model_id: str = "Qwen/Qwen3-32B", + model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", provider: str | None = None, token: str | None = None, timeout: int = 120, From 7a944b319ed01dffdac6c7a55ea17f77be684712 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 16:49:11 +0200 Subject: [PATCH 12/28] Remove sleep --- src/smolagents/gradio_ui.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index a2b188c19..69957cbb8 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -256,7 +256,6 @@ def stream_to_gradio( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" - import time for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args @@ -270,7 +269,6 @@ def stream_to_gradio( ): yield message elif isinstance(event, ChatMessageStreamDelta): - time.sleep(0.1) intermediate_text += event.content or "" yield intermediate_text @@ -316,7 +314,6 @@ def interact_with_agent(self, prompt, messages, session_state): yield messages except Exception as e: - raise e print(f"Error in interaction: {str(e)}") messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages From 4de35e9c01b4644b4b15995b85ee2b8e364aeca8 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 11:11:21 +0200 Subject: [PATCH 13/28] Re-add last_input_token_count attribute for Model --- src/smolagents/agents.py | 16 ++++++++++---- src/smolagents/models.py | 47 +++++++++++++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 6cc860110..20a815e8d 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -27,7 +27,7 @@ from dataclasses import dataclass from logging import getLogger from pathlib import Path -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING, Any, Literal, TypedDict import jinja2 import yaml @@ -176,13 +176,21 @@ class PromptTemplates(TypedDict): @dataclass class RunResult: - """Holds extended information about an agent run.""" + """Holds extended information about an agent run. + + Attributes: + output (Any | None): The final output of the agent run, if available. + state (Literal["success", "max_steps_error"]): The final state of the agent after the run. + messages (list[dict]): The agent's memory, as a list of messages. + token_usage (TokenUsage | None): Count of tokens used during the run. + timing (Timing): Timing details of the agent run: start time, end time, duration. 
+ """ output: Any | None - token_usage: TokenUsage | None + state: Literal["success", "max_steps_error"] messages: list[dict] + token_usage: TokenUsage | None timing: Timing - state: str class MultiStepAgent(ABC): diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 77b2f26eb..1bf76f548 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -310,8 +310,24 @@ def __init__( self.tool_name_key = tool_name_key self.tool_arguments_key = tool_arguments_key self.kwargs = kwargs + self._last_input_token_count: int | None = None + self._last_output_token_count: int | None = None self.model_id: str | None = model_id + @property + def last_input_token_count(self) -> int | None: + logger.warning( + "The last_input_token_count attribute is deprecated and will be removed in a future version.", + ) + return self._last_input_token_count + + @property + def last_output_token_count(self) -> int | None: + logger.warning( + "The last_output_token_count attribute is deprecated and will be removed in a future version.", + ) + return self._last_output_token_count + def _prepare_completion_kwargs( self, messages: list[dict[str, str | list[dict]]], @@ -368,7 +384,7 @@ def _prepare_completion_kwargs( def generate( self, - messages: list[dict[str, str | list[dict]]] | list[ChatMessage], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -496,7 +512,7 @@ def cleanup(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -542,6 +558,8 @@ def generate( sampling_params=sampling_params, ) output_text = out[0].outputs[0].text + self._last_input_token_count = len(out[0].prompt_token_ids) + self._last_output_token_count = len(out[0].outputs[0].token_ids) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, @@ -617,7 +635,7 @@ def __init__( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -650,6 +668,8 @@ def generate( text = text[:stop_index] break + self._last_input_token_count = len(prompt_ids) + self._last_output_token_count = output_tokens return ChatMessage( role=MessageRole.ASSISTANT, content=text, @@ -843,7 +863,7 @@ def _prepare_completion_args( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -869,6 +889,8 @@ def generate( if stop_sequences is not None: output_text = remove_stop_sequences(output_text, stop_sequences) + self._last_input_token_count = count_prompt_tokens + self._last_output_token_count = len(generated_tokens) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, @@ -1005,7 +1027,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1026,6 +1048,8 @@ def generate( response = 
self.client.completion(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, @@ -1264,7 +1288,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1281,6 +1305,8 @@ def generate( ) response = self.client.chat_completion(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( asdict(response.choices[0].message), raw=response, @@ -1439,7 +1465,7 @@ def generate_stream( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1457,6 +1483,8 @@ def generate( ) response = self.client.chat.completions.create(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, @@ -1668,7 +1696,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1687,6 +1715,9 @@ def generate( # Get first message response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"] + + self._last_input_token_count = response["usage"]["inputTokens"] + self._last_output_token_count = response["usage"]["outputTokens"] return ChatMessage.from_dict( response["output"]["message"], raw=response, From d47642087a1d6b10451b9f46327544cd9002b35a Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 11:58:58 +0200 Subject: [PATCH 14/28] Add tests --- examples/gradio_ui.py | 1 + src/smolagents/agents.py | 29 +++++++++++++----- src/smolagents/gradio_ui.py | 11 +++---- src/smolagents/memory.py | 2 +- src/smolagents/models.py | 2 ++ tests/test_gradio_ui.py | 15 +++++++--- tests/test_monitoring.py | 60 +++++++++++++++++++++++++++++++++++-- 7 files changed, 99 insertions(+), 21 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index 2b28ce109..fb69481ec 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -10,6 +10,7 @@ description="This is an example agent.", step_callbacks=[], stream_outputs=True, + return_full_result=True, ) GradioUI(agent, file_upload_folder="./data").launch() diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 20a815e8d..9006db22d 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -380,11 +380,25 @@ def run( # Outputs are returned only at the end. We only look at the last step. 
steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].output + assert isinstance(steps[-1], FinalAnswerStep) + output = steps[-1].output if self.return_full_result: - token_usage = None - token_usage = self.monitor.get_total_token_counts() + total_input_tokens = 0 + total_output_tokens = 0 + correct_token_usage = True + for step in self.memory.steps: + if isinstance(step, (ActionStep, PlanningStep)): + if step.token_usage is None: + correct_token_usage = False + break + else: + total_input_tokens += step.token_usage.input_tokens + total_output_tokens += step.token_usage.output_tokens + if correct_token_usage: + token_usage = TokenUsage(input_tokens=total_input_tokens, output_tokens=total_output_tokens) + else: + token_usage = None if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): state = "max_steps_error" @@ -394,14 +408,14 @@ def run( messages = self.memory.get_full_steps() return RunResult( - output=result, + output=output, token_usage=token_usage, messages=messages, timing=Timing(start_time=run_start_time, end_time=time.time()), state=state, ) - return result + return output def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None @@ -586,8 +600,9 @@ def _generate_planning_step( if event.content is not None: plan_message_content += event.content live.update(Markdown(plan_message_content)) - output_tokens += event.token_usage.output_tokens - input_tokens = event.token_usage.input_tokens + if event.token_usage: + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens yield event else: plan_message = self.model.generate(input_messages, stop_sequences=[""]) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 69957cbb8..e5e6ede1c 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -29,11 +29,9 @@ def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if getattr(step_log, "token_usage", None): - token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" - step_footnote += token_str - step_duration = f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else None - step_footnote += step_duration + if step_log.token_usage is not None: + step_footnote += f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" + step_footnote += f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else "" step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -314,9 +312,8 @@ def interact_with_agent(self, prompt, messages, session_state): yield messages except Exception as e: - print(f"Error in interaction: {str(e)}") - messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages + raise gr.Error(f"Error in interaction: {str(e)}") def upload_file(self, file, file_uploads_log, allowed_file_types=None): """ diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 033ca261a..ca0075c5e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -143,7 +143,7 @@ class PlanningStep(MemoryStep): model_input_messages: list[Message] model_output_message: ChatMessage 
plan: str - timing: Timing | None = None + timing: Timing token_usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 1bf76f548..3bffedbb4 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -1339,6 +1339,8 @@ def generate_stream( for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): + if getattr(event, "usage", None): + print("EV:", event) if event.choices: if event.choices[0].delta is None: if not getattr(event.choices[0], "finish_reason", None): diff --git a/tests/test_gradio_ui.py b/tests/test_gradio_ui.py index 4a527b5c4..1e39a7343 100644 --- a/tests/test_gradio_ui.py +++ b/tests/test_gradio_ui.py @@ -312,26 +312,33 @@ def test_action_step_with_images(self): assert len(image_messages) == 2 assert "path/to/image.png" in str(image_messages[0]) - @pytest.mark.parametrize("skip_model_outputs, expected_messages_length", [(False, 4), (True, 2)]) - def test_planning_step(self, skip_model_outputs, expected_messages_length): + @pytest.mark.parametrize( + "skip_model_outputs, expected_messages_length, token_usage", + [(False, 4, TokenUsage(input_tokens=80, output_tokens=30)), (True, 2, None)], + ) + def test_planning_step(self, skip_model_outputs, expected_messages_length, token_usage): """Test PlanningStep processing.""" step = PlanningStep( plan="1. First step\n2. Second step", model_input_messages=Mock(), model_output_message=Mock(), - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=token_usage, + timing=Timing(start_time=1.0, end_time=2.0), ) messages = list(pull_messages_from_step(step, skip_model_outputs=skip_model_outputs)) assert len(messages) == expected_messages_length # [header, plan,] footnote, divider expected_contents = [ "**Planning step**", "1. First step\n2. 
Second step", - "Input tokens: 80 | Output tokens: 30", + "Input tokens: 80 | Output tokens: 30" if token_usage else "", "-----", ] for message, expected_content in zip(messages, expected_contents[-expected_messages_length:]): assert expected_content in message.content + if not token_usage: + assert "Input tokens: 80 | Output tokens: 30" not in message.content + @pytest.mark.parametrize( "answer_type, answer_value, expected_content", [ diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index 6b5eaac1a..6ebfa2982 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -34,6 +34,9 @@ class FakeLLMModel(Model): + def __init__(self, give_token_usage: bool = True): + self.give_token_usage = give_token_usage + def generate(self, prompt, tools_to_call_from=None, **kwargs): if tools_to_call_from is not None: return ChatMessage( @@ -46,7 +49,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): function=ChatMessageToolCallDefinition(name="final_answer", arguments={"answer": "image"}), ) ], - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=TokenUsage(input_tokens=10, output_tokens=20) if self.give_token_usage else None, ) else: return ChatMessage( @@ -56,7 +59,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): ```py final_answer('This is the final answer.') ```""", - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=TokenUsage(input_tokens=10, output_tokens=20) if self.give_token_usage else None, ) @@ -195,3 +198,56 @@ def test_run_return_full_result(self): self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) self.assertIsInstance(result.messages, list) self.assertGreater(result.timing.duration, 0) + + agent = ToolCallingAgent( + tools=[], + model=FakeLLMModel(), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "image") + self.assertEqual(result.state, "success") + self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) + + # Below 2 lines should be removed when the attributes are removed + assert agent.monitor.total_input_token_count == 10 + assert agent.monitor.total_output_token_count == 20 + + def test_run_result_no_token_usage(self): + agent = CodeAgent( + tools=[], + model=FakeLLMModel(give_token_usage=False), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "This is the final answer.") + self.assertEqual(result.state, "success") + self.assertIsNone(result.token_usage) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) + + agent = ToolCallingAgent( + tools=[], + model=FakeLLMModel(give_token_usage=False), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "image") + self.assertEqual(result.state, "success") + self.assertIsNone(result.token_usage) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) From f93846c3a74be39e5a0ebc3a04bcf8973cb593b4 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:56:18 +0200 Subject: [PATCH 15/28] Pass tests --- tests/test_agents.py | 29 ++++++++++++++++++++++++----- 1 file 
changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 650c76faa..a80b18921 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -678,6 +678,13 @@ def generate(self, messages, stop_sequences=None): def test_step_number(self): fake_model = MagicMock() + fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Model output.", + tool_calls=None, + raw="Model output.", + token_usage=None, + ) max_steps = 2 agent = CodeAgent(tools=[], model=fake_model, max_steps=max_steps) assert hasattr(agent, "step_number"), "step_number attribute should be defined" @@ -810,13 +817,19 @@ def test_planning_step(self, step, expected_messages_list): ) def test_provide_final_answer(self, images, expected_messages_list): fake_model = MagicMock() - fake_model.return_value.content = "Final answer." + fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Final answer.", + tool_calls=None, + raw="Final answer.", + token_usage=None, + ) agent = CodeAgent( tools=[], model=fake_model, ) task = "Test task" - final_answer = agent.provide_final_answer(task, images=images) + final_answer = agent.provide_final_answer(task, images=images).content expected_message_texts = { "FINAL_ANSWER_SYSTEM_PROMPT": agent.prompt_templates["final_answer"]["pre_messages"], "FINAL_ANSWER_USER_PROMPT": populate_template( @@ -830,8 +843,8 @@ def test_provide_final_answer(self, images, expected_messages_list): expected_content["text"] = expected_message_texts[expected_content["text"]] assert final_answer == "Final answer." # Test calls to model - assert len(fake_model.call_args_list) == 1 - for call_args, expected_messages in zip(fake_model.call_args_list, expected_messages_list): + assert len(fake_model.generate.call_args_list) == 1 + for call_args, expected_messages in zip(fake_model.generate.call_args_list, expected_messages_list): assert len(call_args.args) == 1 messages = call_args.args[0] assert isinstance(messages, list) @@ -849,7 +862,13 @@ def test_provide_final_answer(self, images, expected_messages_list): def test_interrupt(self): fake_model = MagicMock() - fake_model.return_value.content = "Model output." 
+ fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Model output.", + tool_calls=None, + raw="Model output.", + token_usage=None, + ) def interrupt_callback(memory_step, agent): agent.interrupt() From eefc8a8e4d6027dc9889b1428b5fb272614252a0 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:57:11 +0200 Subject: [PATCH 16/28] Pass memory test --- tests/test_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_memory.py b/tests/test_memory.py index 7990698f6..9896324a9 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -115,6 +115,7 @@ def test_planning_step_to_messages(): model_input_messages=[Message(role=MessageRole.USER, content="Hello")], model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Plan"), plan="This is a plan.", + timing=Timing(start_time=0.0, end_time=1.0), ) messages = planning_step.to_messages(summary_mode=False) assert len(messages) == 2 From 55bd6c7c5d636d870d3309f5693992d6a6df77bc Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:59:08 +0200 Subject: [PATCH 17/28] Pass agents test --- tests/test_agents.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index a80b18921..fb1fde593 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -1218,8 +1218,13 @@ def generate(self, messages, stop_sequences=None, grammar=None): def test_local_python_executor_with_custom_functions(self): model = MagicMock() - model.last_input_token_count = 10 - model.last_output_token_count = 5 + model.generate.return_value = ChatMessage( + role="assistant", + content="", + tool_calls=None, + raw="", + token_usage=None, + ) agent = CodeAgent(tools=[], model=model, executor_kwargs={"additional_functions": {"open": open}}) agent.run("Test run") assert "open" in agent.python_executor.static_tools From 3b796d9de71d51739feffeb96ac5bbad374e6afd Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 14:06:15 +0200 Subject: [PATCH 18/28] Revert model change in GAIA --- examples/open_deep_research/run_gaia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 9a22e7174..9c7bacd4e 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -183,7 +183,7 @@ def answer_single_question( else: model_params["max_tokens"] = 4096 model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="novita", max_tokens=4096) + # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) document_inspection_tool = TextInspectorTool(model, 100000) agent = create_agent_team(model) From 4e6b593f8c5995a17086c978d13b3f76343b8427 Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:03:09 +0200 Subject: [PATCH 19/28] Update src/smolagents/models.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 3bffedbb4..099bb043b 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -316,8 +316,10 @@ def __init__( @property def last_input_token_count(self) -> int | None: - logger.warning( - "The last_input_token_count attribute is 
deprecated and will be removed in a future version.", + warnings.warn( + "Attribute last_input_token_count is deprecated and will be removed in version 1.20. " + "Please use TokenUsage.input_tokens instead.", + FutureWarning, ) return self._last_input_token_count From 4c92bebf3f31e0d6a030356e58a92ac523d98b87 Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:07:05 +0200 Subject: [PATCH 20/28] Update src/smolagents/monitoring.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/monitoring.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0effcfcf4..6ba2d427f 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -67,24 +67,10 @@ class Timing: start_time: float end_time: float | None = None - - def dict(self): - return { - "start_time": self.start_time, - "end_time": self.end_time, - "duration": self.duration, - } - - @property - def duration(self): - if self.end_time is None: - return None - return self.end_time - self.start_time - - def __str__(self): - attributes = vars(self).copy() - attributes["duration"] = self.duration # This makes sure the duration is also printed - return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" + duration : float | None = field(init=False) + + def __post_init__(self): + self.duration = self.end_time - self.start_time if self.end_time else None class Monitor: From a87ef7049379c04bccc1c7ef95fe01615bd38d4d Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:07:12 +0200 Subject: [PATCH 21/28] Update src/smolagents/monitoring.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/monitoring.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 6ba2d427f..533a84308 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -41,22 +41,10 @@ class TokenUsage: input_tokens: int output_tokens: int - - def dict(self): - return { - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - "total_tokens": self.total_tokens, - } - - @property - def total_tokens(self): - return self.input_tokens + self.output_tokens - - def __str__(self): - attributes = vars(self).copy() - attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed - return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + total_tokens: int = field(init=False) + + def __post_init__(self): + self.total_tokens = self.input_tokens + self.output_tokens @dataclass From 18b1849b4be343de767d29d885cd070f9972496f Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:10:09 +0200 Subject: [PATCH 22/28] Revert suggestion to avoid None durations --- src/smolagents/models.py | 6 ++++-- src/smolagents/monitoring.py | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 099bb043b..582b0951d 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -325,8 +325,10 @@ def last_input_token_count(self) -> int | None: @property def last_output_token_count(self) -> 
int | None: - logger.warning( - "The last_output_token_count attribute is deprecated and will be removed in a future version.", + warnings.warn( + "Attribute last_output_token_count is deprecated and will be removed in version 1.20. " + "Please use TokenUsage.output_tokens instead.", + FutureWarning, ) return self._last_output_token_count diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 533a84308..417e51a31 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -41,10 +41,15 @@ class TokenUsage: input_tokens: int output_tokens: int - total_tokens: int = field(init=False) - - def __post_init__(self): - self.total_tokens = self.input_tokens + self.output_tokens + + @property + def total_tokens(self): + return self.input_tokens + self.output_tokens + + def __str__(self): + attributes = vars(self).copy() + attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed + return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" @dataclass @@ -55,10 +60,17 @@ class Timing: start_time: float end_time: float | None = None - duration : float | None = field(init=False) - - def __post_init__(self): - self.duration = self.end_time - self.start_time if self.end_time else None + + @property + def duration(self): + if self.end_time is None: + return None + return self.end_time - self.start_time + + def __str__(self): + attributes = vars(self).copy() + attributes["duration"] = self.duration # This makes sure the duration is also printed + return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" class Monitor: From 582a09e2948487cfb8ea8e4550c9236b9a3fdfb9 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:17:07 +0200 Subject: [PATCH 23/28] Use post-init suggestion for TokenUsage, property for Timing --- src/smolagents/monitoring.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 417e51a31..f05218cb0 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import IntEnum from rich import box @@ -41,15 +41,10 @@ class TokenUsage: input_tokens: int output_tokens: int + total_tokens: int = field(init=False) - @property - def total_tokens(self): - return self.input_tokens + self.output_tokens - - def __str__(self): - attributes = vars(self).copy() - attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed - return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + def __post_init__(self): + self.total_tokens = self.input_tokens + self.output_tokens @dataclass @@ -63,14 +58,10 @@ class Timing: @property def duration(self): - if self.end_time is None: - return None - return self.end_time - self.start_time - - def __str__(self): - attributes = vars(self).copy() - attributes["duration"] = self.duration # This makes sure the duration is also printed - return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" + return None if self.end_time is None else self.end_time - self.start_time + + def __repr__(self) -> str: + return f"Timing(start_time={self.start_time}, end_time={self.end_time}, duration={self.duration})" class Monitor: From e25715e6bee9a1b1a1dfb79e856d03143c012208 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:35:55 +0200 Subject: [PATCH 24/28] Re-add deprecated token count increment in stream methods --- src/smolagents/models.py | 54 ++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 582b0951d..26e96f86a 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -930,6 +930,8 @@ def generate_stream( # Generate with streaming for new_text in self.streamer: + self._last_input_token_count = count_prompt_tokens + self._last_output_token_count = 1 yield ChatMessageStreamDelta( content=new_text, tool_calls=None, @@ -1093,11 +1095,17 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, - token_usage=TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ), ) + if getattr(event, "usage", None): + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) class LiteLLMRouterModel(LiteLLMModel): @@ -1343,24 +1351,24 @@ def generate_stream( for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): - if getattr(event, "usage", None): - print("EV:", event) if event.choices: if event.choices[0].delta is None: if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - if getattr(event, "usage", None): - token_usage = TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ) - else: - token_usage = None yield ChatMessageStreamDelta( content=event.choices[0].delta.content, - token_usage=token_usage, ) + if getattr(event, "usage", None): + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + 
token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) class HfApiModel(InferenceClientModel): @@ -1461,13 +1469,17 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield ChatMessageStreamDelta( - content=event.choices[0].delta.content, - token_usage=TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ), - ) + yield ChatMessageStreamDelta(content=event.choices[0].delta.content) + if event.usage: + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) def generate( self, From 3a321e7fc41e0922573d45811ad475d3f39d3492 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:40:47 +0200 Subject: [PATCH 25/28] Fix dict conversion error --- src/smolagents/memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index ca0075c5e..806fa60c9 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -67,8 +67,8 @@ def dict(self): return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "timing": self.timing.dict(), - "token_usage": self.token_usage.dict() if self.token_usage else None, + "timing": asdict(self.timing), + "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, "model_output_message": self.model_output_message, From 6dc4c6d48ecba7e057af0bd3e5b206072ac5acfa Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:56:32 +0200 Subject: [PATCH 26/28] Test ActionStep.dict() --- src/smolagents/memory.py | 4 +-- src/smolagents/models.py | 2 +- src/smolagents/monitoring.py | 7 ++++ tests/test_agents.py | 3 ++ tests/test_memory.py | 63 ++++++++++++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 806fa60c9..0a196ed4c 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -67,11 +67,11 @@ def dict(self): return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "timing": asdict(self.timing), + "timing": self.timing.dict(), "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "model_output_message": self.model_output_message, + "model_output_message": self.model_output_message.dict(), "model_output": self.model_output, "observations": self.observations, "action_output": make_json_serializable(self.action_output), diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 26e96f86a..48a702b65 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -124,7 +124,7 @@ def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | ) def dict(self): - return json.dumps(get_dict_from_nested_dataclasses(self)) + return get_dict_from_nested_dataclasses(self) @classmethod def from_hf_api(cls, message: "ChatCompletionOutputMessage", raw) -> 
"ChatMessage": diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index f05218cb0..5f5c174da 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -60,6 +60,13 @@ class Timing: def duration(self): return None if self.end_time is None else self.end_time - self.start_time + def dict(self): + return { + "start_time": self.start_time, + "end_time": self.end_time, + "duration": self.duration, + } + def __repr__(self) -> str: return f"Timing(start_time={self.start_time}, end_time={self.end_time}, duration={self.duration})" diff --git a/tests/test_agents.py b/tests/test_agents.py index fb1fde593..5f8c4397e 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -573,7 +573,10 @@ def weather_api(location: str, celsius: bool = False) -> str: step_memory_dict = agent.memory.get_succinct_steps()[1] assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100 + assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" assert "model_input_messages" in agent.memory.get_full_steps()[1] + assert step_memory_dict["model_output_message"].token_usage.total_tokens > 100 + assert step_memory_dict["model_output_message"].timing.duration > 0.1 def test_final_answer_checks(self): def check_always_fails(final_answer, agent_memory): diff --git a/tests/test_memory.py b/tests/test_memory.py index 9896324a9..4bf4fbab7 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -38,6 +38,69 @@ def test_to_messages(self): step.to_messages() +def test_action_step_dict(): + action_step = ActionStep( + model_input_messages=[Message(role=MessageRole.USER, content="Hello")], + tool_calls=[ + ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}), + ], + timing=Timing(start_time=0.0, end_time=1.0), + step_number=1, + error=None, + model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"), + model_output="Hi", + observations="This is a nice observation", + observations_images=["image1.png"], + action_output="Output", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), + ) + action_step_dict = action_step.dict() + # Check each key individually for better test failure messages + assert "model_input_messages" in action_step_dict + assert action_step_dict["model_input_messages"] == [Message(role=MessageRole.USER, content="Hello")] + + assert "tool_calls" in action_step_dict + assert len(action_step_dict["tool_calls"]) == 1 + assert action_step_dict["tool_calls"][0] == { + "id": "id", + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + } + + assert "timing" in action_step_dict + assert action_step_dict["timing"] == {"start_time": 0.0, "end_time": 1.0, "duration": 1.0} + + assert "token_usage" in action_step_dict + assert action_step_dict["token_usage"] == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} + + assert "step" in action_step_dict + assert action_step_dict["step"] == 1 + + assert "error" in action_step_dict + assert action_step_dict["error"] is None + + assert "model_output_message" in action_step_dict + assert action_step_dict["model_output_message"] == { + "role": "assistant", + "content": "Hi", + "tool_calls": None, + "raw": None, + "token_usage": None, + } + + assert "model_output" in action_step_dict + assert action_step_dict["model_output"] == "Hi" + + assert "observations" in 
action_step_dict + assert action_step_dict["observations"] == "This is a nice observation" + + assert "action_output" in action_step_dict + assert action_step_dict["action_output"] == "Output" + + def test_action_step_to_messages(): action_step = ActionStep( model_input_messages=[Message(role=MessageRole.USER, content="Hello")], From 56117ad6c3f9db313540a5b1375d8191619239ed Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:56:59 +0200 Subject: [PATCH 27/28] Fix edge case --- src/smolagents/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 0a196ed4c..912bad6e9 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -71,7 +71,7 @@ def dict(self): "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "model_output_message": self.model_output_message.dict(), + "model_output_message": self.model_output_message.dict() if self.model_output_message else None, "model_output": self.model_output, "observations": self.observations, "action_output": make_json_serializable(self.action_output), From dbdb3fa2adc05677f6cb497270166065b81ea60c Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 16:07:20 +0200 Subject: [PATCH 28/28] Fix even more tests --- tests/test_agents.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 5f8c4397e..d6ace4471 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -571,12 +571,11 @@ def weather_api(location: str, celsius: bool = False) -> str: assert agent.memory.steps[0].task == task assert agent.memory.steps[1].tool_calls[0].name == "weather_api" step_memory_dict = agent.memory.get_succinct_steps()[1] - assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" - assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100 - assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" + assert step_memory_dict["model_output_message"]["tool_calls"][0]["function"]["name"] == "weather_api" + assert step_memory_dict["model_output_message"]["raw"]["completion_kwargs"]["max_new_tokens"] == 100 assert "model_input_messages" in agent.memory.get_full_steps()[1] - assert step_memory_dict["model_output_message"].token_usage.total_tokens > 100 - assert step_memory_dict["model_output_message"].timing.duration > 0.1 + assert step_memory_dict["token_usage"]["total_tokens"] > 100 + assert step_memory_dict["timing"]["duration"] > 0.1 def test_final_answer_checks(self): def check_always_fails(final_answer, agent_memory):