From f1c6fa55fa1199c57fde86875c8197bf4824390d Mon Sep 17 00:00:00 2001 From: Aymeric Date: Fri, 16 May 2025 13:10:20 +0200 Subject: [PATCH 01/28] Start work on run results --- src/smolagents/agents.py | 64 +++++++++++++++++++++++++++++++----- src/smolagents/monitoring.py | 15 ++++++++- tests/test_monitoring.py | 19 +++++++++++ 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index a276018f5..4bd6a16ca 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -62,6 +62,7 @@ AgentLogger, LogLevel, Monitor, + RunResult, ) from .remote_executors import DockerExecutor, E2BExecutor from .tools import Tool @@ -204,6 +205,7 @@ def __init__( description: str | None = None, provide_run_summary: bool = False, final_answer_checks: list[Callable] | None = None, + return_full_results: bool = False, logger: AgentLogger | None = None, ): self.agent_name = self.__class__.__name__ @@ -230,6 +232,7 @@ def __init__( self.description = description self.provide_run_summary = provide_run_summary self.final_answer_checks = final_answer_checks + self.return_full_results = return_full_results self._setup_managed_agents(managed_agents) self._setup_tools(tools, add_base_tools) @@ -347,8 +350,41 @@ def run( if stream: # The steps are returned as they are executed through a generator to iterate on. return self._run_stream(task=self.task, max_steps=max_steps, images=images) + run_start_time = time.time() # Outputs are returned only at the end. We only look at the last step. - return list(self._run_stream(task=self.task, max_steps=max_steps, images=images))[-1].final_answer + try: + steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) + result = steps[-1].final_answer + state = "success" + except Exception: + run_duration = time.time() - run_start_time + raise + else: + run_duration = time.time() - run_start_time + + if self.return_full_results: + token_usage = None + try: + token_usage = self.monitor.get_total_token_counts() + except Exception: + token_usage = None + + if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): + state = "max_steps" + elif self.memory.steps and getattr(self.memory.steps[-1], "error", None) is not None: + state = "error" + + messages = self.memory.get_full_steps() + + return RunResult( + result=result, + token_usage=token_usage, + messages=messages, + duration=run_duration, + state=state, + ) + + return result def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None @@ -362,11 +398,25 @@ def _run_stream( if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): + planning_start = step_start_time + planning_step = None for element in self._generate_planning_step( task, is_first_step=(self.step_number == 1), step=self.step_number ): yield element - self.memory.steps.append(element) + planning_step = element + if planning_step is not None: + planning_step.end_time = time.time() + planning_step.duration = planning_step.end_time - planning_start + if getattr(self.model, "last_input_token_count", None) is not None: + planning_step.input_token_count = self.model.last_input_token_count + planning_step.output_token_count = self.model.last_output_token_count + self.memory.steps.append(planning_step) + for callback in self.step_callbacks: + callback(planning_step) if len(inspect.signature(callback).parameters) == 1 else callback( + 
planning_step, agent=self + ) + step_start_time = time.time() action_step = ActionStep( step_number=self.step_number, start_time=step_start_time, observations_images=images ) @@ -411,6 +461,9 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep, step_start_time: float): memory_step.end_time = time.time() memory_step.duration = memory_step.end_time - step_start_time + if getattr(self.model, "last_input_token_count", None) is not None: + memory_step.input_token_count = self.model.last_input_token_count + memory_step.output_token_count = self.model.last_output_token_count for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( @@ -423,13 +476,8 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger) ) final_memory_step.action_output = final_answer - final_memory_step.end_time = time.time() - final_memory_step.duration = final_memory_step.end_time - step_start_time + self._finalize_step(final_memory_step, step_start_time) self.memory.steps.append(final_memory_step) - for callback in self.step_callbacks: - callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( - final_memory_step, agent=self - ) return final_answer def _generate_planning_step( diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d827a95e..90f846fe8 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,7 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json +from dataclasses import dataclass from enum import IntEnum +from typing import Any from rich import box from rich.console import Console, Group @@ -29,7 +31,18 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor"] +__all__ = ["AgentLogger", "LogLevel", "Monitor", "RunResult"] + + +@dataclass +class RunResult: + """Holds extended information about an agent run.""" + + result: Any + token_usage: dict[str, int] | None + messages: list[dict] + duration: float + state: str class Monitor: diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index c7f6b9a64..b571a233a 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -186,3 +186,22 @@ def generate(self, prompt, **kwargs): final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIn("Malformed call", final_message.content) + + def test_run_return_full_results(self): + agent = CodeAgent( + tools=[], + model=FakeLLMModel(), + max_steps=1, + return_full_results=True, + ) + + result = agent.run("Fake task") + + from smolagents import RunResult + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.result, "This is the final answer.") + self.assertEqual(result.state, "success") + self.assertEqual(result.token_usage, {"input": 10, "output": 20}) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.duration, 0) From 46eb7c8645f0df4a7295da7dcb512e938583d746 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Fri, 16 May 2025 16:31:21 +0200 Subject: [PATCH 02/28] Create Timing and Usage objects --- src/smolagents/agents.py | 91 ++++++++++++++++++++++-------------- src/smolagents/memory.py | 20 ++++++-- src/smolagents/monitoring.py | 15 +----- 3 files changed, 74 insertions(+), 52 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 4bd6a16ca..90d9d1f05 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -24,6 +24,7 @@ import time from abc import ABC, abstractmethod from collections.abc import Callable, Generator +from dataclasses import dataclass from logging import getLogger from pathlib import Path from typing import TYPE_CHECKING, Any, TypedDict @@ -54,7 +55,9 @@ PlanningStep, SystemPromptStep, TaskStep, + Timing, ToolCall, + Usage, ) from .models import ChatMessage, ChatMessageStreamDelta, MessageRole, Model, parse_json_if_needed from .monitoring import ( @@ -62,7 +65,6 @@ AgentLogger, LogLevel, Monitor, - RunResult, ) from .remote_executors import DockerExecutor, E2BExecutor from .tools import Tool @@ -167,6 +169,17 @@ class PromptTemplates(TypedDict): ) +@dataclass +class RunResult: + """Holds extended information about an agent run.""" + + result: Any + token_usage: dict[str, int] | None + messages: list[dict] + duration: float + state: str + + class MultiStepAgent(ABC): """ Agent class that solves the given task step by step, using the ReAct framework: @@ -213,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in 
prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -261,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -394,31 +407,34 @@ def _run_stream( while final_answer is None and self.step_number <= max_steps: if self.interrupt_switch: raise AgentError("Agent interrupted.", self.logger) - step_start_time = time.time() if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): - planning_start = step_start_time + planning_start_time = time.time() planning_step = None for element in self._generate_planning_step( task, is_first_step=(self.step_number == 1), step=self.step_number ): yield element planning_step = element - if planning_step is not None: - planning_step.end_time = time.time() - planning_step.duration = planning_step.end_time - planning_start - if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.input_token_count = self.model.last_input_token_count - planning_step.output_token_count = self.model.last_output_token_count - self.memory.steps.append(planning_step) - for callback in self.step_callbacks: - callback(planning_step) if len(inspect.signature(callback).parameters) == 1 else callback( - planning_step, agent=self - ) - step_start_time = time.time() + assert isinstance(planning_step, PlanningStep) + self.memory.steps.append(planning_step) + if getattr(self.model, "last_input_token_count", None) is not None: + planning_step.usage = Usage( + input_tokens=self.model.last_input_token_count, + output_tokens=self.model.last_output_token_count, + ) + planning_end_time = time.time() + planning_step.timing = Timing( + start_time=planning_start_time, + end_time=planning_end_time, + duration=planning_end_time - planning_start_time, + ) + action_step_start_time = time.time() action_step = ActionStep( - step_number=self.step_number, start_time=step_start_time, observations_images=images + step_number=self.step_number, + timing=Timing(start_time=action_step_start_time), + observations_images=images, ) try: for el in self._execute_step(action_step): @@ -431,13 +447,13 @@ def _run_stream( # Other AgentError types are caused by the Model, so we should log them and iterate. 
action_step.error = e finally: - self._finalize_step(action_step, step_start_time) + self._finalize_step(action_step) self.memory.steps.append(action_step) yield action_step self.step_number += 1 if final_answer is None and self.step_number == max_steps + 1: - final_answer = self._handle_max_steps_reached(task, images, step_start_time) + final_answer = self._handle_max_steps_reached(task, images) yield action_step yield FinalAnswerStep(handle_agent_output_types(final_answer)) @@ -458,25 +474,30 @@ def _validate_final_answer(self, final_answer: Any): except Exception as e: raise AgentError(f"Check {check_function.__name__} failed with error: {e}", self.logger) - def _finalize_step(self, memory_step: ActionStep, step_start_time: float): - memory_step.end_time = time.time() - memory_step.duration = memory_step.end_time - step_start_time + def _finalize_step(self, memory_step: ActionStep): + memory_step.timing.end_time = time.time() + memory_step.timing.duration = memory_step.timing.end_time - memory_step.timing.start_time if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.input_token_count = self.model.last_input_token_count - memory_step.output_token_count = self.model.last_output_token_count + memory_step.usage = Usage( + input_tokens=self.model.last_input_token_count, + output_tokens=self.model.last_output_token_count, + ) for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( memory_step, agent=self ) - def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], step_start_time: float) -> Any: + def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) -> Any: + action_step_start_time = time.time() final_answer = self.provide_final_answer(task, images) final_memory_step = ActionStep( - step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger) + step_number=self.step_number, + error=AgentMaxStepsError("Reached max steps.", self.logger), + timing=Timing(start_time=action_step_start_time), ) final_memory_step.action_output = final_answer - self._finalize_step(final_memory_step, step_start_time) + self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) return final_answer diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 38fa9e1e9..4b0ac992e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -48,20 +48,32 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: raise NotImplementedError +@dataclass +class Usage: + input_tokens: int + output_tokens: int + + +@dataclass +class Timing: + start_time: float + end_time: float | None = None + duration: float | None = None + + @dataclass class ActionStep(MemoryStep): model_input_messages: list[Message] | None = None tool_calls: list[ToolCall] | None = None - start_time: float | None = None - end_time: float | None = None + timing: Timing | None = None step_number: int | None = None error: AgentError | None = None - duration: float | None = None model_output_message: ChatMessage | None = None model_output: str | None = None observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None + usage: Usage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually @@ -145,6 +157,8 @@ class PlanningStep(MemoryStep): 
model_input_messages: list[Message] model_output_message: ChatMessage plan: str + timing: Timing | None = None + usage: Usage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 90f846fe8..0d827a95e 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,9 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json -from dataclasses import dataclass from enum import IntEnum -from typing import Any from rich import box from rich.console import Console, Group @@ -31,18 +29,7 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor", "RunResult"] - - -@dataclass -class RunResult: - """Holds extended information about an agent run.""" - - result: Any - token_usage: dict[str, int] | None - messages: list[dict] - duration: float - state: str +__all__ = ["AgentLogger", "LogLevel", "Monitor"] class Monitor: From f4c8acb1607f46b80a15ad5a668e2f30d64c77ed Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:04:57 +0200 Subject: [PATCH 03/28] Improve usage class, add property caculations --- examples/agent_from_any_llm.py | 22 +++++---- examples/inspect_multiagent_run.py | 10 +++- examples/multi_llm_agent.py | 6 ++- src/smolagents/agents.py | 67 ++++++++++++------------- src/smolagents/gradio_ui.py | 2 +- src/smolagents/memory.py | 28 +++-------- src/smolagents/models.py | 14 +++--- src/smolagents/monitoring.py | 79 ++++++++++++++++++++++++++---- 8 files changed, 139 insertions(+), 89 deletions(-) diff --git a/examples/agent_from_any_llm.py b/examples/agent_from_any_llm.py index d5e33f0a1..593c61b2d 100644 --- a/examples/agent_from_any_llm.py +++ b/examples/agent_from_any_llm.py @@ -1,19 +1,23 @@ -from smolagents import InferenceClientModel, LiteLLMModel, OpenAIServerModel, TransformersModel, tool -from smolagents.agents import CodeAgent, ToolCallingAgent +from smolagents import ( + CodeAgent, + InferenceClientModel, + LiteLLMModel, + OpenAIServerModel, + ToolCallingAgent, + TransformersModel, + tool, +) # Choose which inference type to use! -available_inferences = ["hf_api", "hf_api_provider", "transformers", "ollama", "litellm", "openai"] -chosen_inference = "hf_api_provider" +available_inferences = ["inference_client", "transformers", "ollama", "litellm", "openai"] +chosen_inference = "inference_client" print(f"Chose model: '{chosen_inference}'") -if chosen_inference == "hf_api": - model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct") - -elif chosen_inference == "hf_api_provider": - model = InferenceClientModel(provider="together") +if chosen_inference == "inference_client": + model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct", provider="nebius") elif chosen_inference == "transformers": model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct", device_map="auto", max_new_tokens=1000) diff --git a/examples/inspect_multiagent_run.py b/examples/inspect_multiagent_run.py index c68dccb75..75bc0a07c 100644 --- a/examples/inspect_multiagent_run.py +++ b/examples/inspect_multiagent_run.py @@ -16,18 +16,24 @@ # Then we run the agentic part! 
-model = InferenceClientModel() +model = InferenceClientModel(provider="nebius") search_agent = ToolCallingAgent( tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", + return_full_result=True, ) manager_agent = CodeAgent( tools=[], model=model, managed_agents=[search_agent], + return_full_result=True, ) -manager_agent.run("If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?") +run_result = manager_agent.run( + "If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?" +) +print("Here is the token usage for the manager agent", run_result.token_usage) +print("Here are the timing informations for the manager agent:", run_result.timing) diff --git a/examples/multi_llm_agent.py b/examples/multi_llm_agent.py index 6f44ff8b4..5c002e1ea 100644 --- a/examples/multi_llm_agent.py +++ b/examples/multi_llm_agent.py @@ -39,6 +39,8 @@ model_list=llm_loadbalancer_model_list, client_kwargs={"routing_strategy": "simple-shuffle"}, ) -agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True) +agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True, return_full_results=True) -agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") +full_result = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") + +print(full_result) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 90d9d1f05..7d62c64df 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -56,8 +56,8 @@ SystemPromptStep, TaskStep, Timing, + TokenUsage, ToolCall, - Usage, ) from .models import ChatMessage, ChatMessageStreamDelta, MessageRole, Model, parse_json_if_needed from .monitoring import ( @@ -174,9 +174,9 @@ class RunResult: """Holds extended information about an agent run.""" result: Any - token_usage: dict[str, int] | None + token_usage: TokenUsage | None messages: list[dict] - duration: float + timing: Timing state: str @@ -218,7 +218,7 @@ def __init__( description: str | None = None, provide_run_summary: bool = False, final_answer_checks: list[Callable] | None = None, - return_full_results: bool = False, + return_full_result: bool = False, logger: AgentLogger | None = None, ): self.agent_name = self.__class__.__name__ @@ -226,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -245,7 +245,7 @@ def __init__( self.description = description self.provide_run_summary = provide_run_summary 
self.final_answer_checks = final_answer_checks - self.return_full_results = return_full_results + self.return_full_result = return_full_result self._setup_managed_agents(managed_agents) self._setup_tools(tools, add_base_tools) @@ -274,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" + ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -365,27 +365,18 @@ def run( return self._run_stream(task=self.task, max_steps=max_steps, images=images) run_start_time = time.time() # Outputs are returned only at the end. We only look at the last step. - try: - steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].final_answer - state = "success" - except Exception: - run_duration = time.time() - run_start_time - raise - else: - run_duration = time.time() - run_start_time - if self.return_full_results: + steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) + result = steps[-1].final_answer + + if self.return_full_result: token_usage = None - try: - token_usage = self.monitor.get_total_token_counts() - except Exception: - token_usage = None + token_usage = self.monitor.get_total_token_counts() if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): - state = "max_steps" - elif self.memory.steps and getattr(self.memory.steps[-1], "error", None) is not None: - state = "error" + state = "max_steps_error" + else: + state = "success" messages = self.memory.get_full_steps() @@ -393,7 +384,7 @@ def run( result=result, token_usage=token_usage, messages=messages, - duration=run_duration, + timing=Timing(start_time=run_start_time, end_time=time.time()), state=state, ) @@ -420,7 +411,7 @@ def _run_stream( assert isinstance(planning_step, PlanningStep) self.memory.steps.append(planning_step) if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.usage = Usage( + planning_step.usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -428,7 +419,6 @@ def _run_stream( planning_step.timing = Timing( start_time=planning_start_time, end_time=planning_end_time, - duration=planning_end_time - planning_start_time, ) action_step_start_time = time.time() action_step = ActionStep( @@ -476,9 +466,8 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() - memory_step.timing.duration = memory_step.timing.end_time - memory_step.timing.start_time if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.usage = Usage( + memory_step.usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -710,7 +699,11 @@ def __call__(self, task: str, **kwargs): self.prompt_templates["managed_agent"]["task"], variables=dict(name=self.name, task=task), ) - report = self.run(full_task, **kwargs) + result = self.run(full_task, **kwargs) + if isinstance(result, RunResult): + report = 
result.result + else: + report = result answer = populate_template( self.prompt_templates["managed_agent"]["report"], variables=dict(name=self.name, final_answer=report) ) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 384f50bc4..b85c1cddf 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -94,7 +94,7 @@ def _process_action_step(step_log: ActionStep, skip_model_outputs: bool = False) import gradio as gr # Output the step number - step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "Step" + step_number = f"Step {step_log.step_number}" if not skip_model_outputs: yield gr.ChatMessage(role="assistant", content=f"**{step_number}**", metadata={"status": "done"}) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 4b0ac992e..6410d823e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, TypedDict from smolagents.models import ChatMessage, MessageRole -from smolagents.monitoring import AgentLogger, LogLevel +from smolagents.monitoring import AgentLogger, LogLevel, Timing, TokenUsage from smolagents.utils import AgentError, make_json_serializable @@ -48,43 +48,29 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: raise NotImplementedError -@dataclass -class Usage: - input_tokens: int - output_tokens: int - - -@dataclass -class Timing: - start_time: float - end_time: float | None = None - duration: float | None = None - - @dataclass class ActionStep(MemoryStep): + step_number: int + timing: Timing model_input_messages: list[Message] | None = None tool_calls: list[ToolCall] | None = None - timing: Timing | None = None - step_number: int | None = None error: AgentError | None = None model_output_message: ChatMessage | None = None model_output: str | None = None observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None - usage: Usage | None = None + usage: TokenUsage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "start_time": self.start_time, - "end_time": self.end_time, + "timing": self.timing.dict(), + "usage": self.usage.dict() if self.usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "duration": self.duration, "model_output_message": self.model_output_message, "model_output": self.model_output, "observations": self.observations, @@ -158,7 +144,7 @@ class PlanningStep(MemoryStep): model_output_message: ChatMessage plan: str timing: Timing | None = None - usage: Usage | None = None + usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: diff --git a/src/smolagents/models.py b/src/smolagents/models.py index bba50ddc3..c0d5ac476 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -677,7 +677,7 @@ class TransformersModel(Model): Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. - For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. + For example, `"Qwen/Qwen3-32B"`. device_map (`str`, *optional*): The device_map to initialize your model with. 
torch_dtype (`str`, *optional*): @@ -695,7 +695,7 @@ class TransformersModel(Model): Example: ```python >>> engine = TransformersModel( - ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", + ... model_id="Qwen/Qwen3-32B", ... device="cuda", ... max_new_tokens=5000, ... ) @@ -1172,10 +1172,10 @@ class InferenceClientModel(ApiModel): Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more. Parameters: - model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`): + model_id (`str`, *optional*, default `"Qwen/Qwen3-32B"`): The Hugging Face model ID to be used for inference. This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint. - Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future. + Currently, it defaults to `"Qwen/Qwen3-32B"`, but this may change in the future. provider (`str`, *optional*): Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"openai"`, `"replicate"`, "sambanova"`, `"together"`, etc. Currently, it defaults to hf-inference (HF Inference API). @@ -1206,8 +1206,8 @@ class InferenceClientModel(ApiModel): Example: ```python >>> engine = InferenceClientModel( - ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", - ... provider="together", + ... model_id="Qwen/Qwen3-32B", + ... provider="nebius", ... token="your_hf_token_here", ... max_tokens=5000, ... ) @@ -1220,7 +1220,7 @@ class InferenceClientModel(ApiModel): def __init__( self, - model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", + model_id: str = "Qwen/Qwen3-32B", provider: str | None = None, token: str | None = None, timeout: int = 120, diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d827a95e..0d3a254c7 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import json +from dataclasses import dataclass from enum import IntEnum from rich import box @@ -29,7 +30,61 @@ from smolagents.utils import escape_code_brackets -__all__ = ["AgentLogger", "LogLevel", "Monitor"] +__all__ = ["AgentLogger", "LogLevel", "Monitor", "TokenUsage", "Timing"] + + +@dataclass +class TokenUsage: + """ + Contains the token usage information for a given step or run. + """ + + input_tokens: int + output_tokens: int + + def dict(self): + return { + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "total_tokens": self.total_tokens, + } + + @property + def total_tokens(self): + return self.input_tokens + self.output_tokens + + def __str__(self): + attributes = vars(self).copy() + attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed + return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + + +@dataclass +class Timing: + """ + Contains the timing information for a given step or run. 
+ """ + + start_time: float + end_time: float | None = None + + def dict(self): + return { + "start_time": self.start_time, + "end_time": self.end_time, + "duration": self.duration, + } + + @property + def duration(self): + if self.end_time is None: + return None + return self.end_time - self.start_time + + def __str__(self): + attributes = vars(self).copy() + attributes["duration"] = self.duration # This makes sure the duration is also printed + return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" class Monitor: @@ -41,11 +96,15 @@ def __init__(self, tracked_model, logger): self.total_input_token_count = 0 self.total_output_token_count = 0 - def get_total_token_counts(self): - return { - "input": self.total_input_token_count, - "output": self.total_output_token_count, - } + def get_total_token_counts(self) -> TokenUsage | None: + return ( + TokenUsage( + input_tokens=self.total_input_token_count, + output_tokens=self.total_output_token_count, + ) + if hasattr(self, "total_input_token_count") + else None + ) def reset(self): self.step_durations = [] @@ -58,13 +117,13 @@ def update_metrics(self, step_log): Args: step_log ([`MemoryStep`]): Step log to update the monitor with. """ - step_duration = step_log.duration + step_duration = step_log.timing.duration self.step_durations.append(step_duration) console_outputs = f"[Step {len(self.step_durations)}: Duration {step_duration:.2f} seconds" - if getattr(self.tracked_model, "last_input_token_count", None) is not None: - self.total_input_token_count += self.tracked_model.last_input_token_count - self.total_output_token_count += self.tracked_model.last_output_token_count + if step_log.usage is not None: + self.total_input_token_count += step_log.usage.input_tokens + self.total_output_token_count += step_log.usage.output_tokens console_outputs += ( f"| Input tokens: {self.total_input_token_count:,} | Output tokens: {self.total_output_token_count:,}" ) From bbf194d8d3eab59fe2bcda06aa085d1c64a3df7f Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:08:20 +0200 Subject: [PATCH 04/28] Revert deletion of step callbacks after max steps error --- src/smolagents/agents.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 7d62c64df..2103a2e37 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -226,15 +226,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -274,9 +274,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with 
proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -488,6 +488,10 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) final_memory_step.action_output = final_answer self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) + for callback in self.step_callbacks: + callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( + final_memory_step, agent=self + ) return final_answer def _generate_planning_step( From 962922fc468d1935106a719e60d5534bfdb8c383 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:11:08 +0200 Subject: [PATCH 05/28] Rename attributes to token_usage --- src/smolagents/agents.py | 4 ++-- src/smolagents/memory.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 2103a2e37..4108b9090 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -411,7 +411,7 @@ def _run_stream( assert isinstance(planning_step, PlanningStep) self.memory.steps.append(planning_step) if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.usage = TokenUsage( + planning_step.token_usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) @@ -467,7 +467,7 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.usage = TokenUsage( + memory_step.token_usage = TokenUsage( input_tokens=self.model.last_input_token_count, output_tokens=self.model.last_output_token_count, ) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 6410d823e..0f24b3bce 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -60,7 +60,7 @@ class ActionStep(MemoryStep): observations: str | None = None observations_images: list["PIL.Image.Image"] | None = None action_output: Any = None - usage: TokenUsage | None = None + token_usage: TokenUsage | None = None def dict(self): # We overwrite the method to parse the tool_calls and action_output manually @@ -68,7 +68,7 @@ def dict(self): "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], "timing": self.timing.dict(), - "usage": self.usage.dict() if self.usage else None, + "token_usage": self.token_usage.dict() if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, "model_output_message": self.model_output_message, @@ -144,7 +144,7 @@ class PlanningStep(MemoryStep): model_output_message: ChatMessage plan: str timing: Timing | None = None - usage: TokenUsage | None = None + token_usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: if summary_mode: From 8dd6730250048dc26f34cc16f98af9f6e68d8bc8 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 12:24:35 +0200 Subject: [PATCH 06/28] Update gradio UI for token 
usage --- src/smolagents/gradio_ui.py | 24 +++---- src/smolagents/models.py | 129 +++++++++++++++++++---------------- src/smolagents/monitoring.py | 6 +- 3 files changed, 83 insertions(+), 76 deletions(-) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index b85c1cddf..fce69d069 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -21,19 +21,21 @@ from smolagents.agent_types import AgentAudio, AgentImage, AgentText from smolagents.agents import MultiStepAgent, PlanningStep -from smolagents.memory import ActionStep, FinalAnswerStep, MemoryStep +from smolagents.memory import ActionStep, FinalAnswerStep from smolagents.models import ChatMessageStreamDelta from smolagents.utils import _is_package_available -def get_step_footnote_content(step_log: MemoryStep, step_name: str) -> str: +def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"): - token_str = f" | Input tokens: {step_log.input_token_count:,} | Output tokens: {step_log.output_token_count:,}" + if hasattr(step_log, "token_usage"): + token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" step_footnote += token_str - if hasattr(step_log, "duration"): - step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None + if hasattr(step_log, "timing"): + step_duration = ( + f" | Duration: {round(float(step_log.timing.duration), 2)}" if step_log.timing.duration else None + ) step_footnote += step_duration step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -222,7 +224,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: ) -def pull_messages_from_step(step_log: MemoryStep, skip_model_outputs: bool = False): +def pull_messages_from_step(step_log: ActionStep | PlanningStep | FinalAnswerStep, skip_model_outputs: bool = False): """Extract ChatMessage objects from agent steps with proper nesting. 
Args: @@ -260,13 +262,7 @@ def stream_to_gradio( for step_log in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): - # Track tokens if model provides them - if getattr(agent.model, "last_input_token_count", None) is not None: - if isinstance(step_log, (ActionStep, PlanningStep)): - step_log.input_token_count = agent.model.last_input_token_count - step_log.output_token_count = agent.model.last_output_token_count - - if isinstance(step_log, MemoryStep): + if isinstance(step_log, ActionStep | PlanningStep | FinalAnswerStep): intermediate_text = "" for message in pull_messages_from_step( step_log, diff --git a/src/smolagents/models.py b/src/smolagents/models.py index c0d5ac476..8870b3d55 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -24,6 +24,7 @@ from threading import Thread from typing import TYPE_CHECKING, Any +from .monitoring import TokenUsage from .tools import Tool from .utils import _is_package_available, encode_image_base64, make_image_url, parse_json_blob @@ -99,12 +100,13 @@ class ChatMessage: content: str | None = None tool_calls: list[ChatMessageToolCall] | None = None raw: Any | None = None # Stores the raw output from the API + token_usage: TokenUsage | None = None def model_dump_json(self): return json.dumps(get_dict_from_nested_dataclasses(self, ignore_key="raw")) @classmethod - def from_dict(cls, data: dict, raw: Any | None = None) -> "ChatMessage": + def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | None = None) -> "ChatMessage": if data.get("tool_calls"): tool_calls = [ ChatMessageToolCall( @@ -113,7 +115,13 @@ def from_dict(cls, data: dict, raw: Any | None = None) -> "ChatMessage": for tc in data["tool_calls"] ] data["tool_calls"] = tool_calls - return cls(role=data["role"], content=data.get("content"), tool_calls=data.get("tool_calls"), raw=raw) + return cls( + role=data["role"], + content=data.get("content"), + tool_calls=data.get("tool_calls"), + raw=raw, + token_usage=token_usage, + ) def dict(self): return json.dumps(get_dict_from_nested_dataclasses(self)) @@ -142,6 +150,7 @@ def parse_json_if_needed(arguments: str | dict) -> str | dict: class ChatMessageStreamDelta: content: str | None = None tool_calls: list[ChatMessageToolCall] | None = None + token_usage: TokenUsage | None = None class MessageRole(str, Enum): @@ -301,8 +310,6 @@ def __init__( self.tool_name_key = tool_name_key self.tool_arguments_key = tool_arguments_key self.kwargs = kwargs - self.last_input_token_count: int | None = None - self.last_output_token_count: int | None = None self.model_id: str | None = model_id def _prepare_completion_kwargs( @@ -359,14 +366,6 @@ def _prepare_completion_kwargs( return completion_kwargs - def get_token_counts(self) -> dict[str, int]: - if self.last_input_token_count is None or self.last_output_token_count is None: - raise ValueError("Token counts are not available") - return { - "input_token_count": self.last_input_token_count, - "output_token_count": self.last_output_token_count, - } - def generate( self, messages: list[dict[str, str | list[dict]]], @@ -416,8 +415,6 @@ def to_dict(self) -> dict: """ model_dictionary = { **self.kwargs, - "last_input_token_count": self.last_input_token_count, - "last_output_token_count": self.last_output_token_count, "model_id": self.model_id, } for attribute in [ @@ -446,16 +443,7 @@ def to_dict(self) -> dict: @classmethod def from_dict(cls, model_dictionary: dict[str, Any]) -> "Model": - model_instance = cls( - **{ - 
k: v - for k, v in model_dictionary.items() - if k not in ["last_input_token_count", "last_output_token_count"] - } - ) - model_instance.last_input_token_count = model_dictionary.pop("last_input_token_count", None) - model_instance.last_output_token_count = model_dictionary.pop("last_output_token_count", None) - return model_instance + return cls(**{k: v for k, v in model_dictionary.items()}) class VLLMModel(Model): @@ -554,12 +542,14 @@ def generate( sampling_params=sampling_params, ) output_text = out[0].outputs[0].text - self.last_input_token_count = len(out[0].prompt_token_ids) - self.last_output_token_count = len(out[0].outputs[0].token_ids) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, raw={"out": output_text, "completion_kwargs": completion_kwargs}, + token_usage=TokenUsage( + input_tokens=len(out[0].prompt_token_ids), + output_tokens=len(out[0].outputs[0].token_ids), + ), ) @@ -651,18 +641,23 @@ def generate( add_generation_prompt=True, ) - self.last_input_token_count = len(prompt_ids) - self.last_output_token_count = 0 + output_tokens = 0 text = "" for response in self.stream_generate(self.model, self.tokenizer, prompt=prompt_ids, **completion_kwargs): - self.last_output_token_count += 1 + output_tokens += 1 text += response.text if any((stop_index := text.rfind(stop)) != -1 for stop in stops): text = text[:stop_index] break return ChatMessage( - role=MessageRole.ASSISTANT, content=text, raw={"out": text, "completion_kwargs": completion_kwargs} + role=MessageRole.ASSISTANT, + content=text, + raw={"out": text, "completion_kwargs": completion_kwargs}, + token_usage=TokenUsage( + input_tokens=len(prompt_ids), + output_tokens=output_tokens, + ), ) @@ -870,8 +865,6 @@ def generate( output_text = self.processor.decode(generated_tokens, skip_special_tokens=True) else: output_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True) - self.last_input_token_count = count_prompt_tokens - self.last_output_token_count = len(generated_tokens) if stop_sequences is not None: output_text = remove_stop_sequences(output_text, stop_sequences) @@ -883,6 +876,10 @@ def generate( "out": output_text, "completion_kwargs": {key: value for key, value in generation_kwargs.items() if key != "inputs"}, }, + token_usage=TokenUsage( + input_tokens=count_prompt_tokens, + output_tokens=len(generated_tokens), + ), ) def generate_stream( @@ -905,14 +902,13 @@ def generate_stream( thread = Thread(target=self.model.generate, kwargs={"streamer": self.streamer, **generation_kwargs}) thread.start() - self.last_output_token_count = 0 - # Generate with streaming for new_text in self.streamer: - yield ChatMessageStreamDelta(content=new_text, tool_calls=None) - self.last_output_token_count += 1 - - self.last_input_token_count = count_prompt_tokens + yield ChatMessageStreamDelta( + content=new_text, + tool_calls=None, + token_usage=TokenUsage(input_tokens=count_prompt_tokens, output_tokens=1), + ) thread.join() @@ -1030,11 +1026,13 @@ def generate( response = self.client.completion(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), ) def generate_stream( @@ -1065,10 +1063,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( 
content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens class LiteLLMRouterModel(LiteLLMModel): @@ -1274,9 +1273,14 @@ def generate( ) response = self.client.chat_completion(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens - return ChatMessage.from_dict(asdict(response.choices[0].message), raw=response) + return ChatMessage.from_dict( + asdict(response.choices[0].message), + raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), + ) def generate_stream( self, @@ -1308,10 +1312,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens class HfApiModel(InferenceClientModel): @@ -1414,10 +1419,11 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), ) - if getattr(event, "usage", None): - self.last_input_token_count = event.usage.prompt_tokens - self.last_output_token_count = event.usage.completion_tokens def generate( self, @@ -1438,12 +1444,14 @@ def generate( **kwargs, ) response = self.client.chat.completions.create(**completion_kwargs) - self.last_input_token_count = response.usage.prompt_tokens - self.last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, + token_usage=TokenUsage( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ), ) @@ -1665,13 +1673,16 @@ def generate( # self.client is created in ApiModel class response = self.client.converse(**completion_kwargs) - # Get usage - self.last_input_token_count = response["usage"]["inputTokens"] - self.last_output_token_count = response["usage"]["outputTokens"] - # Get first message response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"] - return ChatMessage.from_dict(response["output"]["message"], raw=response) + return ChatMessage.from_dict( + response["output"]["message"], + raw=response, + token_usage=TokenUsage( + input_tokens=response["usage"]["inputTokens"], + output_tokens=response["usage"]["outputTokens"], + ), + ) __all__ = [ diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0d3a254c7..cfed370f3 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -121,9 +121,9 @@ def update_metrics(self, step_log): self.step_durations.append(step_duration) console_outputs = f"[Step {len(self.step_durations)}: Duration {step_duration:.2f} seconds" - if step_log.usage is not None: - self.total_input_token_count += step_log.usage.input_tokens - self.total_output_token_count += step_log.usage.output_tokens + if step_log.token_usage is not None: + 
self.total_input_token_count += step_log.token_usage.input_tokens + self.total_output_token_count += step_log.token_usage.output_tokens console_outputs += ( f"| Input tokens: {self.total_input_token_count:,} | Output tokens: {self.total_output_token_count:,}" ) From 795f76e18e66a3ec0dd62ffdccdda0dc7b8b7780 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 13:34:26 +0200 Subject: [PATCH 07/28] Fix gradio chatbot as much as possible --- examples/gradio_ui.py | 6 +- src/smolagents/agents.py | 107 ++++++++++++++++++++++++----------- src/smolagents/gradio_ui.py | 36 +++++------- src/smolagents/models.py | 16 ++++-- src/smolagents/monitoring.py | 19 +++---- 5 files changed, 110 insertions(+), 74 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index 87f532689..e82684202 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -3,13 +3,13 @@ agent = CodeAgent( tools=[], - model=InferenceClientModel(), + model=InferenceClientModel(provider="nebius"), verbosity_level=1, - planning_interval=3, + # planning_interval=3, name="example_agent", description="This is an example agent.", step_callbacks=[], - stream_outputs=False, + stream_outputs=True, ) GradioUI(agent, file_upload_folder="./data").launch() diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 1101bfa50..502272a31 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -99,6 +99,11 @@ def populate_template(template: str, variables: dict[str, Any]) -> str: raise Exception(f"Error during jinja template rendering: {type(e).__name__}: {e}") +@dataclass +class FinalOutput: + output: Any | None + + class PlanningPromptTemplate(TypedDict): """ Prompt templates for the planning step. @@ -173,7 +178,7 @@ class PromptTemplates(TypedDict): class RunResult: """Holds extended information about an agent run.""" - result: Any + output: Any | None token_usage: TokenUsage | None messages: list[dict] timing: Timing @@ -367,7 +372,7 @@ def run( # Outputs are returned only at the end. We only look at the last step. 
steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].final_answer + result = steps[-1].output if self.return_full_result: token_usage = None @@ -381,7 +386,7 @@ def run( messages = self.memory.get_full_steps() return RunResult( - result=result, + output=result, token_usage=token_usage, messages=messages, timing=Timing(start_time=run_start_time, end_time=time.time()), @@ -392,12 +397,14 @@ def run( def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None - ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep]: + ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep | ChatMessageStreamDelta]: final_answer = None self.step_number = 1 while final_answer is None and self.step_number <= max_steps: if self.interrupt_switch: raise AgentError("Agent interrupted.", self.logger) + + # Run a planning step if scheduled if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): @@ -408,18 +415,15 @@ def _run_stream( ): yield element planning_step = element - assert isinstance(planning_step, PlanningStep) + assert isinstance(planning_step, PlanningStep) # Last yielded element should be a PlanningStep self.memory.steps.append(planning_step) - if getattr(self.model, "last_input_token_count", None) is not None: - planning_step.token_usage = TokenUsage( - input_tokens=self.model.last_input_token_count, - output_tokens=self.model.last_output_token_count, - ) planning_end_time = time.time() planning_step.timing = Timing( start_time=planning_start_time, end_time=planning_end_time, ) + + # Start action step! action_step_start_time = time.time() action_step = ActionStep( step_number=self.step_number, @@ -447,15 +451,17 @@ def _run_stream( yield action_step yield FinalAnswerStep(handle_agent_output_types(final_answer)) - def _execute_step(self, memory_step: ActionStep) -> Generator[Any]: + def _execute_step(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO) - final_answer = None for el in self._step_stream(memory_step): final_answer = el - yield el - if final_answer is not None and self.final_answer_checks: - self._validate_final_answer(final_answer) - yield final_answer + if isinstance(el, ChatMessageStreamDelta): + yield el + elif isinstance(el, FinalOutput): + final_answer = el.output + if self.final_answer_checks: + self._validate_final_answer(final_answer) + yield final_answer def _validate_final_answer(self, final_answer: Any): for check_function in self.final_answer_checks: @@ -496,7 +502,8 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) def _generate_planning_step( self, task, is_first_step: bool, step: int - ) -> Generator[ChatMessageStreamDelta, PlanningStep]: + ) -> Generator[ChatMessageStreamDelta | PlanningStep]: + start_time = time.time() if is_first_step: input_messages = [ { @@ -515,14 +522,23 @@ def _generate_planning_step( if self.stream_outputs and hasattr(self.model, "generate_stream"): plan_message_content = "" output_stream = self.model.generate_stream(input_messages, stop_sequences=[""]) # type: ignore + input_tokens, output_tokens = 0, 0 with Live("", console=self.logger.console, vertical_overflow="visible") as live: for event in output_stream: if event.content is not None: plan_message_content += event.content live.update(Markdown(plan_message_content)) + if event.token_usage: + 
output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens yield event else: - plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content + plan_message = self.model.generate(input_messages, stop_sequences=[""]) + plan_message_content = plan_message.content + input_tokens, output_tokens = ( + plan_message.token_usage.input_tokens, + plan_message.token_usage.output_tokens, + ) plan = textwrap.dedent( f"""Here are the facts I know and the plan of action that I will follow to solve the task:\n```\n{plan_message_content}\n```""" ) @@ -561,11 +577,25 @@ def _generate_planning_step( input_messages = [plan_update_pre] + memory_messages + [plan_update_post] if self.stream_outputs and hasattr(self.model, "generate_stream"): plan_message_content = "" - for completion_delta in self.model.generate_stream(input_messages, stop_sequences=[""]): # type: ignore - plan_message_content += completion_delta.content - yield completion_delta + input_tokens, output_tokens = 0, 0 + with Live("", console=self.logger.console, vertical_overflow="visible") as live: + for event in self.model.generate_stream( + input_messages, + stop_sequences=[""], + ): # type: ignore + if event.content is not None: + plan_message_content += event.content + live.update(Markdown(plan_message_content)) + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens + yield event else: - plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content + plan_message = self.model.generate(input_messages, stop_sequences=[""]) + plan_message_content = plan_message.content + input_tokens, output_tokens = ( + plan_message.token_usage.input_tokens, + plan_message.token_usage.output_tokens, + ) plan = textwrap.dedent( f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere are the facts I know and my new/updated plan of action to solve the task:\n```\n{plan_message_content}\n```""" ) @@ -575,6 +605,8 @@ def _generate_planning_step( model_input_messages=input_messages, plan=plan, model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content=plan_message_content), + token_usage=TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens), + timing=Timing(start_time=start_time, end_time=time.time()), ) @property @@ -607,7 +639,7 @@ def write_memory_to_messages( messages.extend(memory_step.to_messages(summary_mode=summary_mode)) return messages - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Yields either None if the step is not final, or the final answer. @@ -709,7 +741,7 @@ def __call__(self, task: str, **kwargs): ) result = self.run(full_task, **kwargs) if isinstance(result, RunResult): - report = result.result + report = result.output else: report = result answer = populate_template( @@ -1105,7 +1137,7 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Yields either None if the step is not final, or the final answer. 
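# --- Illustrative sketch (annotation, not part of the patch): with streaming enabled,
# `run(..., stream=True)` now yields memory steps *and* ChatMessageStreamDelta chunks,
# so consumers branch on the event type, much like `stream_to_gradio` further below.
# Agent construction and the task are placeholders.
from smolagents import CodeAgent, InferenceClientModel
from smolagents.memory import ActionStep, FinalAnswerStep, PlanningStep
from smolagents.models import ChatMessageStreamDelta

agent = CodeAgent(tools=[], model=InferenceClientModel(), stream_outputs=True)
live_text = ""
for event in agent.run("Plan and solve a small task", stream=True):
    if isinstance(event, ChatMessageStreamDelta):
        live_text += event.content or ""      # partial model output as it is generated
    elif isinstance(event, (ActionStep, PlanningStep)):
        live_text = ""                         # a completed step closes the current streamed text
    elif isinstance(event, FinalAnswerStep):
        print(event.final_answer)              # renamed to `.output` later in this series
# --- end sketch ---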
@@ -1118,7 +1150,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: memory_step.model_input_messages = input_messages try: - chat_message: ChatMessage = self.model( + chat_message: ChatMessage = self.model.generate( input_messages, stop_sequences=["Observation:", "Calling tools:"], tools_to_call_from=list(self.tools.values()), @@ -1179,7 +1211,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: ) memory_step.action_output = final_answer - yield final_answer + yield FinalOutput(output=final_answer) else: if tool_arguments is None: tool_arguments = {} @@ -1201,7 +1233,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: level=LogLevel.INFO, ) memory_step.observations = updated_information - yield None + yield FinalOutput(output=None) def _substitute_state_variables(self, arguments: dict[str, str] | str) -> dict[str, Any] | str: """Replace string values in arguments with their corresponding state values if they exist.""" @@ -1369,10 +1401,11 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + def _step_stream(self, memory_step: ActionStep) -> Generator[ChatMessageStreamDelta | FinalOutput]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. - Yields either None if the step is not final, or the final answer. + Yields ChatMessageStreamDelta during the run if streaming is enabled. + At the end, yields either None if the step is not final, or the final answer. """ memory_messages = self.write_memory_to_messages() @@ -1388,15 +1421,24 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: **additional_args, ) output_text = "" + input_tokens, output_tokens = 0, 0 with Live("", console=self.logger.console, vertical_overflow="visible") as live: for event in output_stream: if event.content is not None: output_text += event.content live.update(Markdown(output_text)) + if event.token_usage: + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens + assert isinstance(event, ChatMessageStreamDelta) yield event model_output = output_text - chat_message = ChatMessage(role="assistant", content=model_output) + chat_message = ChatMessage( + role="assistant", + content=model_output, + token_usage=TokenUsage(input_tokens=input_tokens, output_tokens=output_tokens), + ) memory_step.model_output_message = chat_message model_output = chat_message.content else: @@ -1419,6 +1461,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: model_output += "" memory_step.model_output_message.content = model_output + memory_step.token_usage = chat_message.token_usage memory_step.model_output = model_output except Exception as e: raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e @@ -1480,7 +1523,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: ] self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO) memory_step.action_output = output - yield output if is_final_answer else None + yield FinalOutput(output=output if is_final_answer else None) def to_dict(self) -> dict[str, Any]: """Convert the agent to a dictionary representation. 
diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index fce69d069..89fb7aaa5 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -29,14 +29,11 @@ def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if hasattr(step_log, "token_usage"): + if getattr(step_log, "token_usage", None): token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" step_footnote += token_str - if hasattr(step_log, "timing"): - step_duration = ( - f" | Duration: {round(float(step_log.timing.duration), 2)}" if step_log.timing.duration else None - ) - step_footnote += step_duration + step_duration = f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else None + step_footnote += step_duration step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -252,26 +249,27 @@ def stream_to_gradio( task_images: list | None = None, reset_agent_memory: bool = False, additional_args: dict | None = None, -): +) -> Generator: """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" if not _is_package_available("gradio"): raise ModuleNotFoundError( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" - for step_log in agent.run( + for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): - if isinstance(step_log, ActionStep | PlanningStep | FinalAnswerStep): + if isinstance(event, ActionStep | PlanningStep | FinalAnswerStep): intermediate_text = "" for message in pull_messages_from_step( - step_log, + event, # If we're streaming model outputs, no need to display them twice skip_model_outputs=getattr(agent, "stream_outputs", False), ): yield message - elif isinstance(step_log, ChatMessageStreamDelta): - intermediate_text += step_log.content or "" + elif isinstance(event, ChatMessageStreamDelta): + print(event) + intermediate_text += event.content or "" yield intermediate_text @@ -306,19 +304,15 @@ def interact_with_agent(self, prompt, messages, session_state): if isinstance(msg, gr.ChatMessage): messages.append(msg) elif isinstance(msg, str): # Then it's only a completion delta - try: - if messages[-1].metadata["status"] == "pending": - messages[-1].content = msg - else: - messages.append( - gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}) - ) - except Exception as e: - raise e + if messages[-1].metadata["status"] == "pending": + messages[-1].content = msg + else: + messages.append(gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"})) yield messages yield messages except Exception as e: + raise e print(f"Error in interaction: {str(e)}") messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 714123f09..68f4fa090 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -368,7 +368,7 @@ def _prepare_completion_kwargs( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]]] | list[ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -377,7 
+377,7 @@ def generate( """Process the input messages and return the model's response. Parameters: - messages (`list[dict[str, str]]`): + messages (`list[dict[str, str | list[dict]]] | list[ChatMessage]`): A list of message dictionaries to be processed. Each dictionary should have the structure `{"role": "user/system", "content": "message content"}`. stop_sequences (`List[str]`, *optional*): A list of strings that will stop the generation if encountered in the model's output. @@ -1318,12 +1318,16 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield ChatMessageStreamDelta( - content=event.choices[0].delta.content, - token_usage=TokenUsage( + if getattr(event, "usage", None): + token_usage = TokenUsage( input_tokens=event.usage.prompt_tokens, output_tokens=event.usage.completion_tokens, - ), + ) + else: + token_usage = None + yield ChatMessageStreamDelta( + content=event.choices[0].delta.content, + token_usage=token_usage, ) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index cfed370f3..0effcfcf4 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -92,18 +92,13 @@ def __init__(self, tracked_model, logger): self.step_durations = [] self.tracked_model = tracked_model self.logger = logger - if getattr(self.tracked_model, "last_input_token_count", "Not found") != "Not found": - self.total_input_token_count = 0 - self.total_output_token_count = 0 - - def get_total_token_counts(self) -> TokenUsage | None: - return ( - TokenUsage( - input_tokens=self.total_input_token_count, - output_tokens=self.total_output_token_count, - ) - if hasattr(self, "total_input_token_count") - else None + self.total_input_token_count = 0 + self.total_output_token_count = 0 + + def get_total_token_counts(self) -> TokenUsage: + return TokenUsage( + input_tokens=self.total_input_token_count, + output_tokens=self.total_output_token_count, ) def reset(self): From c7be43b8bad0a5fa4e7089998bde2499052faecc Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:03:52 +0200 Subject: [PATCH 08/28] Fix gradio chatbot by escaping HTML tags --- examples/gradio_ui.py | 4 ++-- src/smolagents/agents.py | 18 +++++++++--------- src/smolagents/gradio_ui.py | 8 ++++++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index e82684202..2b28ce109 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -3,9 +3,9 @@ agent = CodeAgent( tools=[], - model=InferenceClientModel(provider="nebius"), + model=InferenceClientModel(), verbosity_level=1, - # planning_interval=3, + planning_interval=3, name="example_agent", description="This is an example agent.", step_callbacks=[], diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 502272a31..834e3eb24 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in 
prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" + ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 89fb7aaa5..f0b0871bb 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -222,7 +222,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: def pull_messages_from_step(step_log: ActionStep | PlanningStep | FinalAnswerStep, skip_model_outputs: bool = False): - """Extract ChatMessage objects from agent steps with proper nesting. + """Extract Gradio ChatMessage objects from agent steps with proper nesting. Args: step_log: The step log to display as gr.ChatMessage objects. @@ -256,6 +256,8 @@ def stream_to_gradio( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" + import time + for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): @@ -268,7 +270,7 @@ def stream_to_gradio( ): yield message elif isinstance(event, ChatMessageStreamDelta): - print(event) + time.sleep(0.1) intermediate_text += event.content or "" yield intermediate_text @@ -302,8 +304,10 @@ def interact_with_agent(self, prompt, messages, session_state): for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False): if isinstance(msg, gr.ChatMessage): + messages[-1].metadata["status"] = "done" messages.append(msg) elif isinstance(msg, str): # Then it's only a completion delta + msg = msg.replace("<", r"\<").replace(">", r"\>") # HTML tags seem to break Gradio Chatbot if messages[-1].metadata["status"] == "pending": messages[-1].content = msg else: From 3446f7ead7c2cd6ae14d4d2b90a844caf6d9c5b1 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:28:55 +0200 Subject: [PATCH 09/28] Pass monitoring tests --- src/smolagents/agents.py | 41 +++++++++++++++--------------------- src/smolagents/gradio_ui.py | 2 +- src/smolagents/memory.py | 2 +- tests/test_monitoring.py | 42 ++++++++++++++----------------------- 4 files changed, 35 insertions(+), 52 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 834e3eb24..f568531eb 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert not missing_keys, ( - f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" - ) + assert ( + not 
missing_keys + ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( - f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" - ) + assert ( + key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) + ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all(agent.name and agent.description for agent in managed_agents), ( - "All managed agents need both a name and a description!" - ) + assert all( + agent.name and agent.description for agent in managed_agents + ), "All managed agents need both a name and a description!" self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): @@ -472,11 +472,6 @@ def _validate_final_answer(self, final_answer: Any): def _finalize_step(self, memory_step: ActionStep): memory_step.timing.end_time = time.time() - if getattr(self.model, "last_input_token_count", None) is not None: - memory_step.token_usage = TokenUsage( - input_tokens=self.model.last_input_token_count, - output_tokens=self.model.last_output_token_count, - ) for callback in self.step_callbacks: # For compatibility with old callbacks that don't take the agent as an argument callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( @@ -489,16 +484,13 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"]) final_memory_step = ActionStep( step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger), - timing=Timing(start_time=action_step_start_time), + timing=Timing(start_time=action_step_start_time, end_time=time.time()), + token_usage=final_answer.token_usage, ) - final_memory_step.action_output = final_answer + final_memory_step.action_output = final_answer.content self._finalize_step(final_memory_step) self.memory.steps.append(final_memory_step) - for callback in self.step_callbacks: - callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( - final_memory_step, agent=self - ) - return final_answer + return final_answer.content def _generate_planning_step( self, task, is_first_step: bool, step: int @@ -674,7 +666,7 @@ def extract_action(self, model_output: str, split_token: str) -> tuple[str, str] ) return rationale.strip(), action.strip() - def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None = None) -> str: + def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None = None) -> ChatMessage: """ Provide the final answer to the task, based on the logs of the agent's interactions. 
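# --- Illustrative sketch (annotation, not part of the patch): `provide_final_answer`
# now returns the full ChatMessage rather than a bare string, so callers read
# `.content` (and may inspect `.token_usage`), as the updated tests do. Assumes an
# `agent` constructed as in the earlier sketch; the task string is a placeholder.
final_message = agent.provide_final_answer(task="Wrap up the analysis", images=None)
print(final_message.content)                   # the final answer text
if final_message.token_usage is not None:
    print(final_message.token_usage.input_tokens, final_message.token_usage.output_tokens)
# --- end sketch ---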
@@ -713,8 +705,8 @@ def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None } ] try: - chat_message: ChatMessage = self.model(messages) - return chat_message.content + chat_message: ChatMessage = self.model.generate(messages) + return chat_message except Exception as e: return f"Error in generating final LLM output:\n{e}" @@ -1181,6 +1173,7 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[FinalOutput]: tool_arguments = tool_call.function.arguments memory_step.model_output = str(f"Called Tool: '{tool_name}' with arguments: {tool_arguments}") memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)] + memory_step.token_usage = chat_message.token_usage # Execute self.logger.log( diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index f0b0871bb..a2b188c19 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -196,7 +196,7 @@ def _process_final_answer_step(step_log: FinalAnswerStep) -> Generator: """ import gradio as gr - final_answer = step_log.final_answer + final_answer = step_log.output if isinstance(final_answer, AgentText): yield gr.ChatMessage( role="assistant", diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 0f24b3bce..033ca261a 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -182,7 +182,7 @@ def to_messages(self, summary_mode: bool = False) -> list[Message]: @dataclass class FinalAnswerStep(MemoryStep): - final_answer: Any + output: Any class AgentMemory: diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index b571a233a..6b5eaac1a 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -20,6 +20,7 @@ from smolagents import ( AgentImage, CodeAgent, + RunResult, ToolCallingAgent, stream_to_gradio, ) @@ -28,14 +29,11 @@ ChatMessageToolCall, ChatMessageToolCallDefinition, Model, + TokenUsage, ) class FakeLLMModel(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, tools_to_call_from=None, **kwargs): if tools_to_call_from is not None: return ChatMessage( @@ -48,6 +46,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): function=ChatMessageToolCallDefinition(name="final_answer", arguments={"answer": "image"}), ) ], + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) else: return ChatMessage( @@ -57,6 +56,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): ```py final_answer('This is the final answer.') ```""", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) @@ -86,12 +86,12 @@ def test_toolcalling_agent_metrics(self): def test_code_agent_metrics_max_steps(self): class FakeLLMModelMalformedAnswer(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, **kwargs): - return ChatMessage(role="assistant", content="Malformed answer") + return ChatMessage( + role="assistant", + content="Malformed answer", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), + ) agent = CodeAgent( tools=[], @@ -106,13 +106,7 @@ def generate(self, prompt, **kwargs): def test_code_agent_metrics_generation_error(self): class FakeLLMModelGenerationException(Model): - def __init__(self): - self.last_input_token_count = 10 - self.last_output_token_count = 20 - def generate(self, prompt, **kwargs): - self.last_input_token_count = 10 - self.last_output_token_count = 0 raise Exception("Cannot generate") agent = 
CodeAgent( @@ -120,11 +114,9 @@ def generate(self, prompt, **kwargs): model=FakeLLMModelGenerationException(), max_steps=1, ) - with pytest.raises(Exception): + with pytest.raises(Exception) as e: agent.run("Fake task") - - self.assertEqual(agent.monitor.total_input_token_count, 10) # Should have done one monitoring callbacks - self.assertEqual(agent.monitor.total_output_token_count, 0) + assert "Cannot generate" in str(e.value) def test_streaming_agent_text_output(self): agent = CodeAgent( @@ -187,21 +179,19 @@ def generate(self, prompt, **kwargs): self.assertEqual(final_message.role, "assistant") self.assertIn("Malformed call", final_message.content) - def test_run_return_full_results(self): + def test_run_return_full_result(self): agent = CodeAgent( tools=[], model=FakeLLMModel(), max_steps=1, - return_full_results=True, + return_full_result=True, ) result = agent.run("Fake task") - from smolagents import RunResult - self.assertIsInstance(result, RunResult) - self.assertEqual(result.result, "This is the final answer.") + self.assertEqual(result.output, "This is the final answer.") self.assertEqual(result.state, "success") - self.assertEqual(result.token_usage, {"input": 10, "output": 20}) + self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) self.assertIsInstance(result.messages, list) - self.assertGreater(result.duration, 0) + self.assertGreater(result.timing.duration, 0) From 816466ac4334b6f5591a0091136e5221ebe3c541 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 14:38:25 +0200 Subject: [PATCH 10/28] Pass more tests --- src/smolagents/agents.py | 18 +++++++-------- tests/test_agents.py | 3 --- tests/test_gradio_ui.py | 47 +++++++++++++++++++++++++++------------- tests/test_memory.py | 13 +++++------ 4 files changed, 47 insertions(+), 34 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index f568531eb..6cc860110 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -231,15 +231,15 @@ def __init__( self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES if prompt_templates is not None: missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys()) - assert ( - not missing_keys - ), f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + assert not missing_keys, ( + f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}" + ) for key, value in EMPTY_PROMPT_TEMPLATES.items(): if isinstance(value, dict): for subkey in value.keys(): - assert ( - key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()) - ), f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), ( + f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}" + ) self.max_steps = max_steps self.step_number = 0 @@ -279,9 +279,9 @@ def _setup_managed_agents(self, managed_agents: list | None = None) -> None: """Setup managed agents with proper logging.""" self.managed_agents = {} if managed_agents: - assert all( - agent.name and agent.description for agent in managed_agents - ), "All managed agents need both a name and a description!" + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" 
+ ) self.managed_agents = {agent.name: agent for agent in managed_agents} def _setup_tools(self, tools, add_base_tools): diff --git a/tests/test_agents.py b/tests/test_agents.py index a350dbff4..650c76faa 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -678,8 +678,6 @@ def generate(self, messages, stop_sequences=None): def test_step_number(self): fake_model = MagicMock() - fake_model.last_input_token_count = 10 - fake_model.last_output_token_count = 20 max_steps = 2 agent = CodeAgent(tools=[], model=fake_model, max_steps=max_steps) assert hasattr(agent, "step_number"), "step_number attribute should be defined" @@ -852,7 +850,6 @@ def test_provide_final_answer(self, images, expected_messages_list): def test_interrupt(self): fake_model = MagicMock() fake_model.return_value.content = "Model output." - fake_model.last_input_token_count = None def interrupt_callback(memory_step, agent): agent.interrupt() diff --git a/tests/test_gradio_ui.py b/tests/test_gradio_ui.py index f748bf1e0..4a527b5c4 100644 --- a/tests/test_gradio_ui.py +++ b/tests/test_gradio_ui.py @@ -25,6 +25,7 @@ from smolagents.gradio_ui import GradioUI, pull_messages_from_step, stream_to_gradio from smolagents.memory import ActionStep, FinalAnswerStep, PlanningStep, ToolCall from smolagents.models import ChatMessageStreamDelta +from smolagents.monitoring import Timing, TokenUsage class GradioUITester(unittest.TestCase): @@ -221,11 +222,9 @@ def test_action_step_basic( model_output="This is the model output", observations="Some execution logs", error=None, - duration=2.5, + timing=Timing(start_time=1.0, end_time=3.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), ) - # Set in stream_to_gradio: - step.input_token_count = 100 - step.output_token_count = 50 messages = list(pull_messages_from_step(step)) assert len(messages) == 5 # step number, model_output, logs, footnote, divider for message, expected_content in zip( @@ -246,7 +245,8 @@ def test_action_step_with_tool_calls(self): step_number=2, tool_calls=[ToolCall(name="test_tool", arguments={"answer": "Test answer"}, id="tool_call_1")], observations="Tool execution logs", - duration=1.5, + timing=Timing(start_time=1.0, end_time=2.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), ) messages = list(pull_messages_from_step(step)) assert len(messages) == 5 # step, tool call, logs, footnote, divider @@ -266,7 +266,12 @@ def test_action_step_tool_call_formats(self, tool_name, args, expected): tool_call = Mock() tool_call.name = tool_name tool_call.arguments = args - step = ActionStep(step_number=1, tool_calls=[tool_call], duration=1.5) + step = ActionStep( + step_number=1, + tool_calls=[tool_call], + timing=Timing(start_time=1.0, end_time=2.5), + token_usage=TokenUsage(input_tokens=100, output_tokens=50), + ) messages = list(pull_messages_from_step(step)) tool_message = next( msg @@ -281,7 +286,12 @@ def test_action_step_tool_call_formats(self, tool_name, args, expected): def test_action_step_with_error(self): """Test ActionStep with error.""" - step = ActionStep(step_number=3, error="This is an error message", duration=1.0) + step = ActionStep( + step_number=3, + error="This is an error message", + timing=Timing(start_time=1.0, end_time=2.0), + token_usage=TokenUsage(input_tokens=100, output_tokens=200), + ) messages = list(pull_messages_from_step(step)) error_message = next((m for m in messages if "error" in str(m.content).lower()), None) assert error_message is not None @@ -289,7 +299,12 @@ def test_action_step_with_error(self): def 
test_action_step_with_images(self): """Test ActionStep with observation images.""" - step = ActionStep(step_number=4, observations_images=["image1.png", "image2.jpg"], duration=1.0) + step = ActionStep( + step_number=4, + observations_images=["image1.png", "image2.jpg"], + token_usage=TokenUsage(input_tokens=100, output_tokens=200), + timing=Timing(start_time=1.0, end_time=2.0), + ) with patch("smolagents.gradio_ui.AgentImage") as mock_agent_image: mock_agent_image.return_value.to_string.side_effect = lambda: "path/to/image.png" messages = list(pull_messages_from_step(step)) @@ -301,11 +316,11 @@ def test_action_step_with_images(self): def test_planning_step(self, skip_model_outputs, expected_messages_length): """Test PlanningStep processing.""" step = PlanningStep( - plan="1. First step\n2. Second step", model_input_messages=Mock(), model_output_message=Mock() + plan="1. First step\n2. Second step", + model_input_messages=Mock(), + model_output_message=Mock(), + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) - # Set in stream_to_gradio: - step.input_token_count = 80 - step.output_token_count = 30 messages = list(pull_messages_from_step(step, skip_model_outputs=skip_model_outputs)) assert len(messages) == expected_messages_length # [header, plan,] footnote, divider expected_contents = [ @@ -331,7 +346,9 @@ def test_final_answer_step(self, answer_type, answer_value, expected_content): except TypeError: with patch.object(answer_type, "to_string", return_value=answer_value): final_answer = answer_type(answer_value) - step = FinalAnswerStep(final_answer=final_answer) + step = FinalAnswerStep( + output=final_answer, + ) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content == expected_content @@ -339,7 +356,7 @@ def test_final_answer_step(self, answer_type, answer_value, expected_content): def test_final_answer_step_image(self): """Test FinalAnswerStep with image answer.""" with patch.object(AgentImage, "to_string", return_value="path/to/image.png"): - step = FinalAnswerStep(final_answer=AgentImage("path/to/image.png")) + step = FinalAnswerStep(output=AgentImage("path/to/image.png")) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content["path"] == "path/to/image.png" @@ -348,7 +365,7 @@ def test_final_answer_step_image(self): def test_final_answer_step_audio(self): """Test FinalAnswerStep with audio answer.""" with patch.object(AgentAudio, "to_string", return_value="path/to/audio.wav"): - step = FinalAnswerStep(final_answer=AgentAudio("path/to/audio.wav")) + step = FinalAnswerStep(output=AgentAudio("path/to/audio.wav")) messages = list(pull_messages_from_step(step)) assert len(messages) == 1 assert messages[0].content["path"] == "path/to/audio.wav" diff --git a/tests/test_memory.py b/tests/test_memory.py index 04c6b7f47..7990698f6 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -12,6 +12,7 @@ SystemPromptStep, TaskStep, ) +from smolagents.monitoring import Timing, TokenUsage class TestAgentMemory: @@ -43,16 +44,15 @@ def test_action_step_to_messages(): tool_calls=[ ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}), ], - start_time=0.0, - end_time=1.0, + timing=Timing(start_time=0.0, end_time=1.0), step_number=1, error=None, - duration=1.0, model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"), model_output="Hi", observations="This is a nice observation", observations_images=["image1.png"], action_output="Output", + 
token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) messages = action_step.to_messages() assert len(messages) == 4 @@ -93,16 +93,15 @@ def test_action_step_to_messages_no_tool_calls_with_observations(): action_step = ActionStep( model_input_messages=None, tool_calls=None, - start_time=None, - end_time=None, - step_number=None, + timing=Timing(start_time=0.0, end_time=1.0), + step_number=1, error=None, - duration=None, model_output_message=None, model_output=None, observations="This is an observation.", observations_images=None, action_output=None, + token_usage=TokenUsage(input_tokens=10, output_tokens=20), ) messages = action_step.to_messages() assert len(messages) == 1 From 934361130bfa863207d653a1d12b658bd3c8331a Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 16:47:06 +0200 Subject: [PATCH 11/28] Revert default LLM upgrade --- examples/open_deep_research/run_gaia.py | 2 +- src/smolagents/models.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 9c7bacd4e..9a22e7174 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -183,7 +183,7 @@ def answer_single_question( else: model_params["max_tokens"] = 4096 model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) + # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="novita", max_tokens=4096) document_inspection_tool = TextInspectorTool(model, 100000) agent = create_agent_team(model) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 68f4fa090..77b2f26eb 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -672,7 +672,7 @@ class TransformersModel(Model): Parameters: model_id (`str`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. - For example, `"Qwen/Qwen3-32B"`. + For example, `"Qwen/Qwen2.5-Coder-32B-Instruct"`. device_map (`str`, *optional*): The device_map to initialize your model with. torch_dtype (`str`, *optional*): @@ -690,7 +690,7 @@ class TransformersModel(Model): Example: ```python >>> engine = TransformersModel( - ... model_id="Qwen/Qwen3-32B", + ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", ... device="cuda", ... max_new_tokens=5000, ... ) @@ -1173,10 +1173,10 @@ class InferenceClientModel(ApiModel): Providers include Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more. Parameters: - model_id (`str`, *optional*, default `"Qwen/Qwen3-32B"`): + model_id (`str`, *optional*, default `"Qwen/Qwen2.5-Coder-32B-Instruct"`): The Hugging Face model ID to be used for inference. This can be a model identifier from the Hugging Face model hub or a URL to a deployed Inference Endpoint. - Currently, it defaults to `"Qwen/Qwen3-32B"`, but this may change in the future. + Currently, it defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`, but this may change in the future. provider (`str`, *optional*): Name of the provider to use for inference. A list of supported providers can be found in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index#partners). Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order [here](https://hf.co/settings/inference-providers). 
@@ -1211,7 +1211,7 @@ class InferenceClientModel(ApiModel): Example: ```python >>> engine = InferenceClientModel( - ... model_id="Qwen/Qwen3-32B", + ... model_id="Qwen/Qwen2.5-Coder-32B-Instruct", ... provider="nebius", ... token="your_hf_token_here", ... max_tokens=5000, @@ -1225,7 +1225,7 @@ class InferenceClientModel(ApiModel): def __init__( self, - model_id: str = "Qwen/Qwen3-32B", + model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", provider: str | None = None, token: str | None = None, timeout: int = 120, From 7a944b319ed01dffdac6c7a55ea17f77be684712 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Mon, 19 May 2025 16:49:11 +0200 Subject: [PATCH 12/28] Remove sleep --- src/smolagents/gradio_ui.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index a2b188c19..69957cbb8 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -256,7 +256,6 @@ def stream_to_gradio( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) intermediate_text = "" - import time for event in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args @@ -270,7 +269,6 @@ def stream_to_gradio( ): yield message elif isinstance(event, ChatMessageStreamDelta): - time.sleep(0.1) intermediate_text += event.content or "" yield intermediate_text @@ -316,7 +314,6 @@ def interact_with_agent(self, prompt, messages, session_state): yield messages except Exception as e: - raise e print(f"Error in interaction: {str(e)}") messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages From 4de35e9c01b4644b4b15995b85ee2b8e364aeca8 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 11:11:21 +0200 Subject: [PATCH 13/28] Re-add last_input_token_count attribute for Model --- src/smolagents/agents.py | 16 ++++++++++---- src/smolagents/models.py | 47 +++++++++++++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 6cc860110..20a815e8d 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -27,7 +27,7 @@ from dataclasses import dataclass from logging import getLogger from pathlib import Path -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING, Any, Literal, TypedDict import jinja2 import yaml @@ -176,13 +176,21 @@ class PromptTemplates(TypedDict): @dataclass class RunResult: - """Holds extended information about an agent run.""" + """Holds extended information about an agent run. + + Attributes: + output (Any | None): The final output of the agent run, if available. + state (Literal["success", "max_steps_error"]): The final state of the agent after the run. + messages (list[dict]): The agent's memory, as a list of messages. + token_usage (TokenUsage | None): Count of tokens used during the run. + timing (Timing): Timing details of the agent run: start time, end time, duration. 
+ """ output: Any | None - token_usage: TokenUsage | None + state: Literal["success", "max_steps_error"] messages: list[dict] + token_usage: TokenUsage | None timing: Timing - state: str class MultiStepAgent(ABC): diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 77b2f26eb..1bf76f548 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -310,8 +310,24 @@ def __init__( self.tool_name_key = tool_name_key self.tool_arguments_key = tool_arguments_key self.kwargs = kwargs + self._last_input_token_count: int | None = None + self._last_output_token_count: int | None = None self.model_id: str | None = model_id + @property + def last_input_token_count(self) -> int | None: + logger.warning( + "The last_input_token_count attribute is deprecated and will be removed in a future version.", + ) + return self._last_input_token_count + + @property + def last_output_token_count(self) -> int | None: + logger.warning( + "The last_output_token_count attribute is deprecated and will be removed in a future version.", + ) + return self._last_output_token_count + def _prepare_completion_kwargs( self, messages: list[dict[str, str | list[dict]]], @@ -368,7 +384,7 @@ def _prepare_completion_kwargs( def generate( self, - messages: list[dict[str, str | list[dict]]] | list[ChatMessage], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -496,7 +512,7 @@ def cleanup(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -542,6 +558,8 @@ def generate( sampling_params=sampling_params, ) output_text = out[0].outputs[0].text + self._last_input_token_count = len(out[0].prompt_token_ids) + self._last_output_token_count = len(out[0].outputs[0].token_ids) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, @@ -617,7 +635,7 @@ def __init__( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -650,6 +668,8 @@ def generate( text = text[:stop_index] break + self._last_input_token_count = len(prompt_ids) + self._last_output_token_count = output_tokens return ChatMessage( role=MessageRole.ASSISTANT, content=text, @@ -843,7 +863,7 @@ def _prepare_completion_args( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -869,6 +889,8 @@ def generate( if stop_sequences is not None: output_text = remove_stop_sequences(output_text, stop_sequences) + self._last_input_token_count = count_prompt_tokens + self._last_output_token_count = len(generated_tokens) return ChatMessage( role=MessageRole.ASSISTANT, content=output_text, @@ -1005,7 +1027,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1026,6 +1048,8 @@ def generate( response = 
self.client.completion(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, @@ -1264,7 +1288,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1281,6 +1305,8 @@ def generate( ) response = self.client.chat_completion(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( asdict(response.choices[0].message), raw=response, @@ -1439,7 +1465,7 @@ def generate_stream( def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1457,6 +1483,8 @@ def generate( ) response = self.client.chat.completions.create(**completion_kwargs) + self._last_input_token_count = response.usage.prompt_tokens + self._last_output_token_count = response.usage.completion_tokens return ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}), raw=response, @@ -1668,7 +1696,7 @@ def create_client(self): def generate( self, - messages: list[dict[str, str | list[dict]]], + messages: list[dict[str, str | list[dict]] | ChatMessage], stop_sequences: list[str] | None = None, grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, @@ -1687,6 +1715,9 @@ def generate( # Get first message response["output"]["message"]["content"] = response["output"]["message"]["content"][0]["text"] + + self._last_input_token_count = response["usage"]["inputTokens"] + self._last_output_token_count = response["usage"]["outputTokens"] return ChatMessage.from_dict( response["output"]["message"], raw=response, From d47642087a1d6b10451b9f46327544cd9002b35a Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 11:58:58 +0200 Subject: [PATCH 14/28] Add tests --- examples/gradio_ui.py | 1 + src/smolagents/agents.py | 29 +++++++++++++----- src/smolagents/gradio_ui.py | 11 +++---- src/smolagents/memory.py | 2 +- src/smolagents/models.py | 2 ++ tests/test_gradio_ui.py | 15 +++++++--- tests/test_monitoring.py | 60 +++++++++++++++++++++++++++++++++++-- 7 files changed, 99 insertions(+), 21 deletions(-) diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py index 2b28ce109..fb69481ec 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -10,6 +10,7 @@ description="This is an example agent.", step_callbacks=[], stream_outputs=True, + return_full_result=True, ) GradioUI(agent, file_upload_folder="./data").launch() diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 20a815e8d..9006db22d 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -380,11 +380,25 @@ def run( # Outputs are returned only at the end. We only look at the last step. 
steps = list(self._run_stream(task=self.task, max_steps=max_steps, images=images)) - result = steps[-1].output + assert isinstance(steps[-1], FinalAnswerStep) + output = steps[-1].output if self.return_full_result: - token_usage = None - token_usage = self.monitor.get_total_token_counts() + total_input_tokens = 0 + total_output_tokens = 0 + correct_token_usage = True + for step in self.memory.steps: + if isinstance(step, (ActionStep, PlanningStep)): + if step.token_usage is None: + correct_token_usage = False + break + else: + total_input_tokens += step.token_usage.input_tokens + total_output_tokens += step.token_usage.output_tokens + if correct_token_usage: + token_usage = TokenUsage(input_tokens=total_input_tokens, output_tokens=total_output_tokens) + else: + token_usage = None if self.memory.steps and isinstance(getattr(self.memory.steps[-1], "error", None), AgentMaxStepsError): state = "max_steps_error" @@ -394,14 +408,14 @@ def run( messages = self.memory.get_full_steps() return RunResult( - output=result, + output=output, token_usage=token_usage, messages=messages, timing=Timing(start_time=run_start_time, end_time=time.time()), state=state, ) - return result + return output def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None @@ -586,8 +600,9 @@ def _generate_planning_step( if event.content is not None: plan_message_content += event.content live.update(Markdown(plan_message_content)) - output_tokens += event.token_usage.output_tokens - input_tokens = event.token_usage.input_tokens + if event.token_usage: + output_tokens += event.token_usage.output_tokens + input_tokens = event.token_usage.input_tokens yield event else: plan_message = self.model.generate(input_messages, stop_sequences=[""]) diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 69957cbb8..e5e6ede1c 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -29,11 +29,9 @@ def get_step_footnote_content(step_log: ActionStep | PlanningStep, step_name: str) -> str: """Get a footnote string for a step log with duration and token information""" step_footnote = f"**{step_name}**" - if getattr(step_log, "token_usage", None): - token_str = f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" - step_footnote += token_str - step_duration = f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else None - step_footnote += step_duration + if step_log.token_usage is not None: + step_footnote += f" | Input tokens: {step_log.token_usage.input_tokens:,} | Output tokens: {step_log.token_usage.output_tokens:,}" + step_footnote += f" | Duration: {round(float(step_log.timing.duration), 2)}s" if step_log.timing.duration else "" step_footnote_content = f"""{step_footnote} """ return step_footnote_content @@ -314,9 +312,8 @@ def interact_with_agent(self, prompt, messages, session_state): yield messages except Exception as e: - print(f"Error in interaction: {str(e)}") - messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) yield messages + raise gr.Error(f"Error in interaction: {str(e)}") def upload_file(self, file, file_uploads_log, allowed_file_types=None): """ diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 033ca261a..ca0075c5e 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -143,7 +143,7 @@ class PlanningStep(MemoryStep): model_input_messages: list[Message] model_output_message: ChatMessage 
plan: str - timing: Timing | None = None + timing: Timing token_usage: TokenUsage | None = None def to_messages(self, summary_mode: bool = False) -> list[Message]: diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 1bf76f548..3bffedbb4 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -1339,6 +1339,8 @@ def generate_stream( for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): + if getattr(event, "usage", None): + print("EV:", event) if event.choices: if event.choices[0].delta is None: if not getattr(event.choices[0], "finish_reason", None): diff --git a/tests/test_gradio_ui.py b/tests/test_gradio_ui.py index 4a527b5c4..1e39a7343 100644 --- a/tests/test_gradio_ui.py +++ b/tests/test_gradio_ui.py @@ -312,26 +312,33 @@ def test_action_step_with_images(self): assert len(image_messages) == 2 assert "path/to/image.png" in str(image_messages[0]) - @pytest.mark.parametrize("skip_model_outputs, expected_messages_length", [(False, 4), (True, 2)]) - def test_planning_step(self, skip_model_outputs, expected_messages_length): + @pytest.mark.parametrize( + "skip_model_outputs, expected_messages_length, token_usage", + [(False, 4, TokenUsage(input_tokens=80, output_tokens=30)), (True, 2, None)], + ) + def test_planning_step(self, skip_model_outputs, expected_messages_length, token_usage): """Test PlanningStep processing.""" step = PlanningStep( plan="1. First step\n2. Second step", model_input_messages=Mock(), model_output_message=Mock(), - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=token_usage, + timing=Timing(start_time=1.0, end_time=2.0), ) messages = list(pull_messages_from_step(step, skip_model_outputs=skip_model_outputs)) assert len(messages) == expected_messages_length # [header, plan,] footnote, divider expected_contents = [ "**Planning step**", "1. First step\n2. 
Second step", - "Input tokens: 80 | Output tokens: 30", + "Input tokens: 80 | Output tokens: 30" if token_usage else "", "-----", ] for message, expected_content in zip(messages, expected_contents[-expected_messages_length:]): assert expected_content in message.content + if not token_usage: + assert "Input tokens: 80 | Output tokens: 30" not in message.content + @pytest.mark.parametrize( "answer_type, answer_value, expected_content", [ diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index 6b5eaac1a..6ebfa2982 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -34,6 +34,9 @@ class FakeLLMModel(Model): + def __init__(self, give_token_usage: bool = True): + self.give_token_usage = give_token_usage + def generate(self, prompt, tools_to_call_from=None, **kwargs): if tools_to_call_from is not None: return ChatMessage( @@ -46,7 +49,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): function=ChatMessageToolCallDefinition(name="final_answer", arguments={"answer": "image"}), ) ], - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=TokenUsage(input_tokens=10, output_tokens=20) if self.give_token_usage else None, ) else: return ChatMessage( @@ -56,7 +59,7 @@ def generate(self, prompt, tools_to_call_from=None, **kwargs): ```py final_answer('This is the final answer.') ```""", - token_usage=TokenUsage(input_tokens=10, output_tokens=20), + token_usage=TokenUsage(input_tokens=10, output_tokens=20) if self.give_token_usage else None, ) @@ -195,3 +198,56 @@ def test_run_return_full_result(self): self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) self.assertIsInstance(result.messages, list) self.assertGreater(result.timing.duration, 0) + + agent = ToolCallingAgent( + tools=[], + model=FakeLLMModel(), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "image") + self.assertEqual(result.state, "success") + self.assertEqual(result.token_usage, TokenUsage(input_tokens=10, output_tokens=20)) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) + + # Below 2 lines should be removed when the attributes are removed + assert agent.monitor.total_input_token_count == 10 + assert agent.monitor.total_output_token_count == 20 + + def test_run_result_no_token_usage(self): + agent = CodeAgent( + tools=[], + model=FakeLLMModel(give_token_usage=False), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "This is the final answer.") + self.assertEqual(result.state, "success") + self.assertIsNone(result.token_usage) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) + + agent = ToolCallingAgent( + tools=[], + model=FakeLLMModel(give_token_usage=False), + max_steps=1, + return_full_result=True, + ) + + result = agent.run("Fake task") + + self.assertIsInstance(result, RunResult) + self.assertEqual(result.output, "image") + self.assertEqual(result.state, "success") + self.assertIsNone(result.token_usage) + self.assertIsInstance(result.messages, list) + self.assertGreater(result.timing.duration, 0) From f93846c3a74be39e5a0ebc3a04bcf8973cb593b4 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:56:18 +0200 Subject: [PATCH 15/28] Pass tests --- tests/test_agents.py | 29 ++++++++++++++++++++++++----- 1 file 
changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 650c76faa..a80b18921 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -678,6 +678,13 @@ def generate(self, messages, stop_sequences=None): def test_step_number(self): fake_model = MagicMock() + fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Model output.", + tool_calls=None, + raw="Model output.", + token_usage=None, + ) max_steps = 2 agent = CodeAgent(tools=[], model=fake_model, max_steps=max_steps) assert hasattr(agent, "step_number"), "step_number attribute should be defined" @@ -810,13 +817,19 @@ def test_planning_step(self, step, expected_messages_list): ) def test_provide_final_answer(self, images, expected_messages_list): fake_model = MagicMock() - fake_model.return_value.content = "Final answer." + fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Final answer.", + tool_calls=None, + raw="Final answer.", + token_usage=None, + ) agent = CodeAgent( tools=[], model=fake_model, ) task = "Test task" - final_answer = agent.provide_final_answer(task, images=images) + final_answer = agent.provide_final_answer(task, images=images).content expected_message_texts = { "FINAL_ANSWER_SYSTEM_PROMPT": agent.prompt_templates["final_answer"]["pre_messages"], "FINAL_ANSWER_USER_PROMPT": populate_template( @@ -830,8 +843,8 @@ def test_provide_final_answer(self, images, expected_messages_list): expected_content["text"] = expected_message_texts[expected_content["text"]] assert final_answer == "Final answer." # Test calls to model - assert len(fake_model.call_args_list) == 1 - for call_args, expected_messages in zip(fake_model.call_args_list, expected_messages_list): + assert len(fake_model.generate.call_args_list) == 1 + for call_args, expected_messages in zip(fake_model.generate.call_args_list, expected_messages_list): assert len(call_args.args) == 1 messages = call_args.args[0] assert isinstance(messages, list) @@ -849,7 +862,13 @@ def test_provide_final_answer(self, images, expected_messages_list): def test_interrupt(self): fake_model = MagicMock() - fake_model.return_value.content = "Model output." 
+ fake_model.generate.return_value = ChatMessage( + role="assistant", + content="Model output.", + tool_calls=None, + raw="Model output.", + token_usage=None, + ) def interrupt_callback(memory_step, agent): agent.interrupt() From eefc8a8e4d6027dc9889b1428b5fb272614252a0 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:57:11 +0200 Subject: [PATCH 16/28] Pass memory test --- tests/test_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_memory.py b/tests/test_memory.py index 7990698f6..9896324a9 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -115,6 +115,7 @@ def test_planning_step_to_messages(): model_input_messages=[Message(role=MessageRole.USER, content="Hello")], model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Plan"), plan="This is a plan.", + timing=Timing(start_time=0.0, end_time=1.0), ) messages = planning_step.to_messages(summary_mode=False) assert len(messages) == 2 From 55bd6c7c5d636d870d3309f5693992d6a6df77bc Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 13:59:08 +0200 Subject: [PATCH 17/28] Pass agents test --- tests/test_agents.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index a80b18921..fb1fde593 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -1218,8 +1218,13 @@ def generate(self, messages, stop_sequences=None, grammar=None): def test_local_python_executor_with_custom_functions(self): model = MagicMock() - model.last_input_token_count = 10 - model.last_output_token_count = 5 + model.generate.return_value = ChatMessage( + role="assistant", + content="", + tool_calls=None, + raw="", + token_usage=None, + ) agent = CodeAgent(tools=[], model=model, executor_kwargs={"additional_functions": {"open": open}}) agent.run("Test run") assert "open" in agent.python_executor.static_tools From 3b796d9de71d51739feffeb96ac5bbad374e6afd Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 14:06:15 +0200 Subject: [PATCH 18/28] Revert model change in GAIA --- examples/open_deep_research/run_gaia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 9a22e7174..9c7bacd4e 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -183,7 +183,7 @@ def answer_single_question( else: model_params["max_tokens"] = 4096 model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="novita", max_tokens=4096) + # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) document_inspection_tool = TextInspectorTool(model, 100000) agent = create_agent_team(model) From 4e6b593f8c5995a17086c978d13b3f76343b8427 Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:03:09 +0200 Subject: [PATCH 19/28] Update src/smolagents/models.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 3bffedbb4..099bb043b 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -316,8 +316,10 @@ def __init__( @property def last_input_token_count(self) -> int | None: - logger.warning( - "The last_input_token_count attribute is 
deprecated and will be removed in a future version.", + warnings.warn( + "Attribute last_input_token_count is deprecated and will be removed in version 1.20. " + "Please use TokenUsage.input_tokens instead.", + FutureWarning, ) return self._last_input_token_count From 4c92bebf3f31e0d6a030356e58a92ac523d98b87 Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:07:05 +0200 Subject: [PATCH 20/28] Update src/smolagents/monitoring.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/monitoring.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 0effcfcf4..6ba2d427f 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -67,24 +67,10 @@ class Timing: start_time: float end_time: float | None = None - - def dict(self): - return { - "start_time": self.start_time, - "end_time": self.end_time, - "duration": self.duration, - } - - @property - def duration(self): - if self.end_time is None: - return None - return self.end_time - self.start_time - - def __str__(self): - attributes = vars(self).copy() - attributes["duration"] = self.duration # This makes sure the duration is also printed - return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" + duration : float | None = field(init=False) + + def __post_init__(self): + self.duration = self.end_time - self.start_time if self.end_time else None class Monitor: From a87ef7049379c04bccc1c7ef95fe01615bd38d4d Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Tue, 20 May 2025 15:07:12 +0200 Subject: [PATCH 21/28] Update src/smolagents/monitoring.py Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- src/smolagents/monitoring.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 6ba2d427f..533a84308 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -41,22 +41,10 @@ class TokenUsage: input_tokens: int output_tokens: int - - def dict(self): - return { - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - "total_tokens": self.total_tokens, - } - - @property - def total_tokens(self): - return self.input_tokens + self.output_tokens - - def __str__(self): - attributes = vars(self).copy() - attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed - return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + total_tokens: int = field(init=False) + + def __post_init__(self): + self.total_tokens = self.input_tokens + self.output_tokens @dataclass From 18b1849b4be343de767d29d885cd070f9972496f Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:10:09 +0200 Subject: [PATCH 22/28] Revert suggestion to avoid None durations --- src/smolagents/models.py | 6 ++++-- src/smolagents/monitoring.py | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 099bb043b..582b0951d 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -325,8 +325,10 @@ def last_input_token_count(self) -> int | None: @property def last_output_token_count(self) -> 
int | None: - logger.warning( - "The last_output_token_count attribute is deprecated and will be removed in a future version.", + warnings.warn( + "Attribute last_output_token_count is deprecated and will be removed in version 1.20. " + "Please use TokenUsage.output_tokens instead.", + FutureWarning, ) return self._last_output_token_count diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 533a84308..417e51a31 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -41,10 +41,15 @@ class TokenUsage: input_tokens: int output_tokens: int - total_tokens: int = field(init=False) - - def __post_init__(self): - self.total_tokens = self.input_tokens + self.output_tokens + + @property + def total_tokens(self): + return self.input_tokens + self.output_tokens + + def __str__(self): + attributes = vars(self).copy() + attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed + return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" @dataclass @@ -55,10 +60,17 @@ class Timing: start_time: float end_time: float | None = None - duration : float | None = field(init=False) - - def __post_init__(self): - self.duration = self.end_time - self.start_time if self.end_time else None + + @property + def duration(self): + if self.end_time is None: + return None + return self.end_time - self.start_time + + def __str__(self): + attributes = vars(self).copy() + attributes["duration"] = self.duration # This makes sure the duration is also printed + return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" class Monitor: From 582a09e2948487cfb8ea8e4550c9236b9a3fdfb9 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:17:07 +0200 Subject: [PATCH 23/28] Use post-init suggestion for TokenUsage, property for Timing --- src/smolagents/monitoring.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 417e51a31..f05218cb0 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import IntEnum from rich import box @@ -41,15 +41,10 @@ class TokenUsage: input_tokens: int output_tokens: int + total_tokens: int = field(init=False) - @property - def total_tokens(self): - return self.input_tokens + self.output_tokens - - def __str__(self): - attributes = vars(self).copy() - attributes["total_tokens"] = self.total_tokens # This makes sure the total tokens are also printed - return f"TokenUsage({', '.join(f'{key}={value}' for key, value in attributes.items())})" + def __post_init__(self): + self.total_tokens = self.input_tokens + self.output_tokens @dataclass @@ -63,14 +58,10 @@ class Timing: @property def duration(self): - if self.end_time is None: - return None - return self.end_time - self.start_time - - def __str__(self): - attributes = vars(self).copy() - attributes["duration"] = self.duration # This makes sure the duration is also printed - return f"Timing({', '.join(f'{key}={value}' for key, value in attributes.items())})" + return None if self.end_time is None else self.end_time - self.start_time + + def __repr__(self) -> str: + return f"Timing(start_time={self.start_time}, end_time={self.end_time}, duration={self.duration})" class Monitor: From e25715e6bee9a1b1a1dfb79e856d03143c012208 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:35:55 +0200 Subject: [PATCH 24/28] Re-add deprecated token count increment in stream methods --- src/smolagents/models.py | 54 ++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 582b0951d..26e96f86a 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -930,6 +930,8 @@ def generate_stream( # Generate with streaming for new_text in self.streamer: + self._last_input_token_count = count_prompt_tokens + self._last_output_token_count = 1 yield ChatMessageStreamDelta( content=new_text, tool_calls=None, @@ -1093,11 +1095,17 @@ def generate_stream( else: yield ChatMessageStreamDelta( content=event.choices[0].delta.content, - token_usage=TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ), ) + if getattr(event, "usage", None): + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) class LiteLLMRouterModel(LiteLLMModel): @@ -1343,24 +1351,24 @@ def generate_stream( for event in self.client.chat.completions.create( **completion_kwargs, stream=True, stream_options={"include_usage": True} ): - if getattr(event, "usage", None): - print("EV:", event) if event.choices: if event.choices[0].delta is None: if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - if getattr(event, "usage", None): - token_usage = TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ) - else: - token_usage = None yield ChatMessageStreamDelta( content=event.choices[0].delta.content, - token_usage=token_usage, ) + if getattr(event, "usage", None): + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + 
token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) class HfApiModel(InferenceClientModel): @@ -1461,13 +1469,17 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield ChatMessageStreamDelta( - content=event.choices[0].delta.content, - token_usage=TokenUsage( - input_tokens=event.usage.prompt_tokens, - output_tokens=event.usage.completion_tokens, - ), - ) + yield ChatMessageStreamDelta(content=event.choices[0].delta.content) + if event.usage: + self._last_input_token_count = event.usage.prompt_tokens + self._last_output_token_count = event.usage.completion_tokens + yield ChatMessageStreamDelta( + content="", + token_usage=TokenUsage( + input_tokens=event.usage.prompt_tokens, + output_tokens=event.usage.completion_tokens, + ), + ) def generate( self, From 3a321e7fc41e0922573d45811ad475d3f39d3492 Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:40:47 +0200 Subject: [PATCH 25/28] Fix dict conversion error --- src/smolagents/memory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index ca0075c5e..806fa60c9 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -67,8 +67,8 @@ def dict(self): return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "timing": self.timing.dict(), - "token_usage": self.token_usage.dict() if self.token_usage else None, + "timing": asdict(self.timing), + "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, "model_output_message": self.model_output_message, From 6dc4c6d48ecba7e057af0bd3e5b206072ac5acfa Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:56:32 +0200 Subject: [PATCH 26/28] Test ActionStep.dict() --- src/smolagents/memory.py | 4 +-- src/smolagents/models.py | 2 +- src/smolagents/monitoring.py | 7 ++++ tests/test_agents.py | 3 ++ tests/test_memory.py | 63 ++++++++++++++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 806fa60c9..0a196ed4c 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -67,11 +67,11 @@ def dict(self): return { "model_input_messages": self.model_input_messages, "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], - "timing": asdict(self.timing), + "timing": self.timing.dict(), "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "model_output_message": self.model_output_message, + "model_output_message": self.model_output_message.dict(), "model_output": self.model_output, "observations": self.observations, "action_output": make_json_serializable(self.action_output), diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 26e96f86a..48a702b65 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -124,7 +124,7 @@ def from_dict(cls, data: dict, raw: Any | None = None, token_usage: TokenUsage | ) def dict(self): - return json.dumps(get_dict_from_nested_dataclasses(self)) + return get_dict_from_nested_dataclasses(self) @classmethod def from_hf_api(cls, message: "ChatCompletionOutputMessage", raw) -> 
"ChatMessage": diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index f05218cb0..5f5c174da 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -60,6 +60,13 @@ class Timing: def duration(self): return None if self.end_time is None else self.end_time - self.start_time + def dict(self): + return { + "start_time": self.start_time, + "end_time": self.end_time, + "duration": self.duration, + } + def __repr__(self) -> str: return f"Timing(start_time={self.start_time}, end_time={self.end_time}, duration={self.duration})" diff --git a/tests/test_agents.py b/tests/test_agents.py index fb1fde593..5f8c4397e 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -573,7 +573,10 @@ def weather_api(location: str, celsius: bool = False) -> str: step_memory_dict = agent.memory.get_succinct_steps()[1] assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100 + assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" assert "model_input_messages" in agent.memory.get_full_steps()[1] + assert step_memory_dict["model_output_message"].token_usage.total_tokens > 100 + assert step_memory_dict["model_output_message"].timing.duration > 0.1 def test_final_answer_checks(self): def check_always_fails(final_answer, agent_memory): diff --git a/tests/test_memory.py b/tests/test_memory.py index 9896324a9..4bf4fbab7 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -38,6 +38,69 @@ def test_to_messages(self): step.to_messages() +def test_action_step_dict(): + action_step = ActionStep( + model_input_messages=[Message(role=MessageRole.USER, content="Hello")], + tool_calls=[ + ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}), + ], + timing=Timing(start_time=0.0, end_time=1.0), + step_number=1, + error=None, + model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"), + model_output="Hi", + observations="This is a nice observation", + observations_images=["image1.png"], + action_output="Output", + token_usage=TokenUsage(input_tokens=10, output_tokens=20), + ) + action_step_dict = action_step.dict() + # Check each key individually for better test failure messages + assert "model_input_messages" in action_step_dict + assert action_step_dict["model_input_messages"] == [Message(role=MessageRole.USER, content="Hello")] + + assert "tool_calls" in action_step_dict + assert len(action_step_dict["tool_calls"]) == 1 + assert action_step_dict["tool_calls"][0] == { + "id": "id", + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + } + + assert "timing" in action_step_dict + assert action_step_dict["timing"] == {"start_time": 0.0, "end_time": 1.0, "duration": 1.0} + + assert "token_usage" in action_step_dict + assert action_step_dict["token_usage"] == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} + + assert "step" in action_step_dict + assert action_step_dict["step"] == 1 + + assert "error" in action_step_dict + assert action_step_dict["error"] is None + + assert "model_output_message" in action_step_dict + assert action_step_dict["model_output_message"] == { + "role": "assistant", + "content": "Hi", + "tool_calls": None, + "raw": None, + "token_usage": None, + } + + assert "model_output" in action_step_dict + assert action_step_dict["model_output"] == "Hi" + + assert "observations" in 
action_step_dict + assert action_step_dict["observations"] == "This is a nice observation" + + assert "action_output" in action_step_dict + assert action_step_dict["action_output"] == "Output" + + def test_action_step_to_messages(): action_step = ActionStep( model_input_messages=[Message(role=MessageRole.USER, content="Hello")], From 56117ad6c3f9db313540a5b1375d8191619239ed Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 15:56:59 +0200 Subject: [PATCH 27/28] Fix edge case --- src/smolagents/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py index 0a196ed4c..912bad6e9 100644 --- a/src/smolagents/memory.py +++ b/src/smolagents/memory.py @@ -71,7 +71,7 @@ def dict(self): "token_usage": asdict(self.token_usage) if self.token_usage else None, "step": self.step_number, "error": self.error.dict() if self.error else None, - "model_output_message": self.model_output_message.dict(), + "model_output_message": self.model_output_message.dict() if self.model_output_message else None, "model_output": self.model_output, "observations": self.observations, "action_output": make_json_serializable(self.action_output), From dbdb3fa2adc05677f6cb497270166065b81ea60c Mon Sep 17 00:00:00 2001 From: Aymeric Date: Tue, 20 May 2025 16:07:20 +0200 Subject: [PATCH 28/28] Fix even more tests --- tests/test_agents.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 5f8c4397e..d6ace4471 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -571,12 +571,11 @@ def weather_api(location: str, celsius: bool = False) -> str: assert agent.memory.steps[0].task == task assert agent.memory.steps[1].tool_calls[0].name == "weather_api" step_memory_dict = agent.memory.get_succinct_steps()[1] - assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" - assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100 - assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" + assert step_memory_dict["model_output_message"]["tool_calls"][0]["function"]["name"] == "weather_api" + assert step_memory_dict["model_output_message"]["raw"]["completion_kwargs"]["max_new_tokens"] == 100 assert "model_input_messages" in agent.memory.get_full_steps()[1] - assert step_memory_dict["model_output_message"].token_usage.total_tokens > 100 - assert step_memory_dict["model_output_message"].timing.duration > 0.1 + assert step_memory_dict["token_usage"]["total_tokens"] > 100 + assert step_memory_dict["timing"]["duration"] > 0.1 def test_final_answer_checks(self): def check_always_fails(final_answer, agent_memory):