
Commit 9530472

Tavish9 authored and techkang committed
[rollout, tool] feat: export rollout rewards to total rewards (volcengine#3563)
### What does this PR do?

This PR exports rollout rewards, including tool-calling rewards and interaction rewards, to the `compute_score` fn. Currently, the rollout `reward_scores` is calculated but not used in the final `compute_score`:
https://github.com/volcengine/verl/blob/96e7071de1bc6a7c0e12f4999f97556da9310cc3/verl/workers/rollout/sglang_rollout/sglang_rollout.py#L1320-L1324

Fixes volcengine#3525

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,`, like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
1 parent 2b69c1a commit 9530472

File tree

5 files changed: 32 additions, 10 deletions


verl/experimental/agent_loop/agent_loop.py

Lines changed: 3 additions & 1 deletion
@@ -701,7 +701,9 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto:
         extra_fields = {}
         all_keys = set(key for input_item in inputs for key in input_item.extra_fields)
         for key in all_keys:
-            extra_fields[key] = np.array([input.extra_fields.get(key) for input in inputs], dtype=object)
+            temp_arr = np.empty(len(inputs), dtype=object)
+            temp_arr[:] = [input.extra_fields.get(key) for input in inputs]
+            extra_fields[key] = temp_arr

         non_tensor_batch.update(extra_fields)
         return DataProto(

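A note on the `agent_loop.py` change above: when every per-sample value is a list of the same length (as `turn_scores` or the new `tool_rewards` can be), `np.array(..., dtype=object)` silently merges them into a 2-D object array instead of keeping one list per sample. A small self-contained sketch of the difference, with illustrative values:

```python
import numpy as np

values = [[0.1, 0.2], [0.3, 0.4]]  # e.g. per-sample turn_scores of equal length

direct = np.array(values, dtype=object)
print(direct.shape)  # (2, 2) -- the rows are merged into a 2-D object array

safe = np.empty(len(values), dtype=object)
safe[:] = values
print(safe.shape)    # (2,)   -- a 1-D array holding one Python list per sample
```

The `np.empty` plus slice-assignment pattern keeps the per-sample lists intact regardless of their lengths.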
verl/experimental/agent_loop/tool_agent_loop.py

Lines changed: 17 additions & 7 deletions
@@ -68,6 +68,7 @@ def __init__(
         self.response_mask: list[int] = []
         self.response_logprobs: list[float] = []
         self.turn_scores: list[float] = []
+        self.tool_rewards: list[float] = []
         self.user_turns = 0
         self.assistant_turns = 0

@@ -175,7 +176,7 @@ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu
             metrics=agent_data.metrics,
             extra_fields={},
         )
-        output.extra_fields.update({"turn_scores": agent_data.turn_scores})
+        output.extra_fields.update({"turn_scores": agent_data.turn_scores, "tool_rewards": agent_data.tool_rewards})
         return output

     async def _handle_pending_state(self, agent_data: AgentData, sampling_params: dict[str, Any]) -> AgentState:
@@ -268,7 +269,7 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt

         # Process tool responses and update multi_modal_data
         # Removed: agent_data.new_images_this_turn = []
-        for tool_response in responses:
+        for tool_response, tool_reward, _ in responses:
             # Create message from tool response
             if tool_response.image or tool_response.video:
                 # Multi-modal content with structured format
@@ -321,6 +322,9 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt
                         "Multimedia type 'video' is not currently supported. Only 'image' is supported."
                     )

+            if tool_reward is not None:
+                agent_data.tool_rewards.append(tool_reward)
+
             # Update prompt with tool responses
             if self.processor is not None:
                 raw_tool_response = await self.loop.run_in_executor(
@@ -403,7 +407,9 @@ async def _handle_interacting_state(self, agent_data: AgentData) -> AgentState:
         else:
             return AgentState.GENERATING

-    async def _call_tool(self, tool_call: FunctionCall, tools_kwargs: dict[str, Any]) -> ToolResponse:
+    async def _call_tool(
+        self, tool_call: FunctionCall, tools_kwargs: dict[str, Any]
+    ) -> tuple[ToolResponse, float, dict]:
         """Call tool and return tool response."""
         tool, instance_id = None, None
         try:
@@ -413,11 +419,15 @@ async def _call_tool(self, tool_call: FunctionCall, tools_kwargs: dict[str, Any]
             tool = self.tools[tool_name]
             kwargs = tools_kwargs.get(tool_name, {})
             instance_id, _ = await tool.create(create_kwargs=kwargs.get("create_kwargs", {}))
-            tool_execution_response, _, _ = await tool.execute(instance_id, tool_args)
+            tool_execution_response, tool_reward, res = await tool.execute(instance_id, tool_args)
         except Exception as e:
             logger.warning(f"Error when executing tool: {e}")
-            return ToolResponse(
-                text=f"Error when executing tool: {e}",
+            return (
+                ToolResponse(
+                    text=f"Error when executing tool: {e}",
+                ),
+                0.0,
+                {},
             )
         finally:
             if tool and instance_id:
@@ -443,7 +453,7 @@ async def _call_tool(self, tool_call: FunctionCall, tools_kwargs: dict[str, Any]
             if attr_value is not None:
                 tool_response_kwargs[attr_name] = attr_value

-        return ToolResponse(**tool_response_kwargs)
+        return ToolResponse(**tool_response_kwargs), tool_reward, res

     @classmethod
     def _initialize_interactions(cls, interaction_config_file):

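To see where `tool_reward` originates, here is a hedged sketch of a tool whose `execute()` returns a per-call reward in the `(response, reward, metadata)` shape that `_call_tool` now unpacks. The class name, the `query` argument, and the import path are illustrative assumptions, not the exact verl tool interface.

```python
from verl.tools.schemas import ToolResponse  # assumed import path for ToolResponse


class EchoTool:
    """Toy tool; only the (response, reward, metadata) return shape mirrors the diff."""

    async def create(self, create_kwargs=None):
        # Return an instance id plus optional metadata, as _call_tool expects.
        return "echo-instance-0", None

    async def execute(self, instance_id, tool_args):
        query = tool_args.get("query", "")
        response = ToolResponse(text=f"echo: {query}")
        reward = 1.0 if query else 0.0  # collected into agent_data.tool_rewards
        return response, reward, {"instance_id": instance_id}
```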
verl/workers/reward_manager/batch.py

Lines changed: 5 additions & 1 deletion
@@ -61,7 +61,11 @@ def verify(self, data):

         ground_truths = [item.non_tensor_batch["reward_model"].get("ground_truth", None) for item in data]
         data_sources = data.non_tensor_batch[self.reward_fn_key]
-        extras = data.non_tensor_batch.get("extra_info", [None] * len(data))
+        rollout_reward_scores = data.non_tensor_batch.get("reward_scores", [{} for _ in range(len(data))])
+        extras = data.non_tensor_batch.get("extra_info", [{} for _ in range(len(data))])
+
+        for i in range(len(data)):
+            extras[i]["rollout_reward_scores"] = rollout_reward_scores[i]

         scores = self.compute_score(
             data_sources=data_sources,

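Unlike the per-sample managers below, `BatchRewardManager` passes one list per field, so each element of `extras` now carries its own `rollout_reward_scores`. A minimal sketch of a batch-level reward fn; apart from `data_sources`, the keyword names here are assumptions chosen for illustration, not the exact calling convention.

```python
def batch_compute_score(data_sources, solution_strs, ground_truths, extra_infos, **kwargs):
    """Illustrative batch reward fn: one score per sample, using rollout rewards."""
    scores = []
    for solution, truth, info in zip(solution_strs, ground_truths, extra_infos):
        rollout_scores = (info or {}).get("rollout_reward_scores", {})
        # Sum whatever numeric rewards the rollout attached for this sample.
        bonus = sum(v for v in rollout_scores.values() if isinstance(v, (int, float)))
        scores.append((1.0 if solution == truth else 0.0) + 0.1 * bonus)
    return scores
```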
verl/workers/reward_manager/dapo.py

Lines changed: 5 additions & 1 deletion
@@ -92,7 +92,11 @@ def __call__(self, data: DataProto, return_dict: bool = False):

             data_source = data_item.non_tensor_batch[self.reward_fn_key]

-            extra_info = data_item.non_tensor_batch.get("extra_info", None)
+            extra_info = data_item.non_tensor_batch.get("extra_info", {})
+
+            rollout_reward_scores = data_item.non_tensor_batch.get("reward_scores", {})
+
+            extra_info["rollout_reward_scores"] = rollout_reward_scores

             result = self.compute_score(
                 data_source=data_source,

verl/workers/reward_manager/naive.py

Lines changed: 2 additions & 0 deletions
@@ -82,7 +82,9 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor |
             data_source = data_item.non_tensor_batch[self.reward_fn_key]
             extra_info = data_item.non_tensor_batch.get("extra_info", {})
             num_turns = data_item.non_tensor_batch.get("__num_turns__", None)
+            rollout_reward_scores = data_item.non_tensor_batch.get("reward_scores", {})
             extra_info["num_turns"] = num_turns
+            extra_info["rollout_reward_scores"] = rollout_reward_scores

             score = self.compute_score(
                 data_source=data_source,

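With all three reward managers now injecting `rollout_reward_scores` into `extra_info`, a user-defined `compute_score` can fold the rollout-side rewards into the final score. A minimal sketch: the blending weight, the assumed shape of `rollout_reward_scores`, and the exact-match task reward are illustrative assumptions, not part of this PR.

```python
def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kwargs):
    """Illustrative custom reward fn that also uses rollout-side rewards."""
    extra_info = extra_info or {}
    # Injected by the reward managers above; assumed to be a (possibly empty)
    # mapping of reward sources (e.g. tools) to scalar or list-valued rewards.
    rollout_scores = extra_info.get("rollout_reward_scores") or {}

    # Placeholder task reward: exact match against the ground truth.
    task_reward = 1.0 if solution_str.strip() == str(ground_truth).strip() else 0.0

    # Flatten scalar or list-valued rewards and average them.
    flat = []
    for value in rollout_scores.values():
        flat.extend(value if isinstance(value, (list, tuple)) else [value])
    rollout_bonus = sum(flat) / len(flat) if flat else 0.0

    return task_reward + 0.1 * rollout_bonus  # 0.1 is an arbitrary example weight
```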