From 949ae4e2f082a0994fa1699cf7ed7e9053ad1a5c Mon Sep 17 00:00:00 2001
From: Kaican Li
Date: Wed, 1 Oct 2025 00:29:28 +0800
Subject: [PATCH] fix qwen2_vl position_ids bug

---
 verl/experimental/agent_loop/agent_loop.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 01ff2bc98b2..0d683d0a21d 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -589,7 +589,7 @@ async def _run_agent_loop(
             video_grid_thw = multi_modal_inputs.get("video_grid_thw")
             second_per_grid_ts = multi_modal_inputs.get("second_per_grid_ts")
 
-            position_ids = get_rope_index(
+            vision_position_ids = get_rope_index(
                 self.processor,
                 input_ids=input_ids.squeeze(0),
                 image_grid_thw=image_grid_thw,
@@ -597,6 +597,12 @@ async def _run_agent_loop(
                 second_per_grid_ts=second_per_grid_ts,
                 attention_mask=attention_mask.squeeze(0),
             ).unsqueeze(0)  # (1, 3, seq_len)
+
+            valid_mask = attention_mask[0].bool()
+            text_position_ids = torch.ones((1, len(input_ids[0])), dtype=torch.long)
+            text_position_ids[0, valid_mask] = torch.arange(valid_mask.sum().item())
+            text_position_ids = text_position_ids.unsqueeze(0)
+            position_ids = torch.cat((text_position_ids, vision_position_ids), dim=1)  # (1, 4, seq_length)
         else:
             position_ids = compute_position_id_with_mask(attention_mask)  # (1, seq_len)
         enable_async_reward = (
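
Note (not part of the patch): below is a minimal, self-contained sketch of the shape logic the added lines implement. A dummy (1, 3, seq_len) tensor stands in for the real get_rope_index output, since only the concatenation step is being illustrated; the goal is to show how prepending a 1-D text position row yields the (1, 4, seq_len) position_ids shape noted in the patch comment. Padded positions keep the fill value of 1, as in the patched code; only valid tokens receive consecutive indices.

import torch

# Hedged sketch, not part of the patch: dummy_vision_position_ids stands in for
# the (1, 3, seq_len) mRoPE tensor that get_rope_index returns in the real code.
seq_len = 8
attention_mask = torch.tensor([[0, 0, 1, 1, 1, 1, 1, 1]])  # (1, seq_len), two left-pad tokens
input_ids = torch.randint(0, 100, (1, seq_len))            # (1, seq_len), dummy token ids
dummy_vision_position_ids = torch.zeros((1, 3, seq_len), dtype=torch.long)

# Text position ids: 0..N-1 over the valid (non-padded) positions, as in the patch.
valid_mask = attention_mask[0].bool()
text_position_ids = torch.ones((1, seq_len), dtype=torch.long)
text_position_ids[0, valid_mask] = torch.arange(valid_mask.sum().item())
text_position_ids = text_position_ids.unsqueeze(0)          # (1, 1, seq_len)

# Prepend the text row to the three mRoPE rows along dim=1.
position_ids = torch.cat((text_position_ids, dummy_vision_position_ids), dim=1)
print(position_ids.shape)   # torch.Size([1, 4, 8])
print(position_ids[0, 0])   # tensor([1, 1, 0, 1, 2, 3, 4, 5])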