Commit e9065e7

fix position ids for latest transformers
1 parent e302711 commit e9065e7

File tree

5 files changed: +28 additions, -11 deletions
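Overview of the change: newer transformers releases (>= 4.54.0, see huggingface/transformers#39447) expect the plain text position ids to be concatenated with the 3-D mrope (temporal / height / width) position ids for Qwen2-VL, so every position-id tensor in the pipeline grows from 3 rows to 4. A minimal sketch of the resulting layout, assuming a dummy text-only sequence with no real image grid:

```python
import torch

seq_length = 16

# rows 1..3: the 3-D mrope (temporal / height / width) positions produced by
# get_rope_index; for a text-only prompt they all reduce to 0..seq_length-1
vision_position_ids = torch.arange(seq_length).unsqueeze(0).expand(3, -1)  # (3, seq_length)

# row 0: ordinary 1-D text positions, now prepended by the dataset
text_position_ids = torch.arange(seq_length).unsqueeze(0)  # (1, seq_length)

position_ids = torch.cat((text_position_ids, vision_position_ids), dim=0)
print(position_ids.shape)  # torch.Size([4, 16])
```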

tests/test_dataset.py

Lines changed: 7 additions & 5 deletions
@@ -12,16 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
 import torch
 from PIL.Image import Image
 
 from verl.utils.dataset import RLHFDataset
 from verl.utils.tokenizer import get_processor, get_tokenizer
 
 
-def test_image_dataset():
-    tokenizer = get_tokenizer("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
-    processor = get_processor("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
+@pytest.mark.parametrize("use_fast", [True, False])
+def test_image_dataset(use_fast: bool):
+    tokenizer = get_tokenizer("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=use_fast)
+    processor = get_processor("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=use_fast)
     dataset = RLHFDataset(
         data_path="hiyouga/geometry3k@test",
         tokenizer=tokenizer,
@@ -44,8 +46,8 @@ def test_image_dataset():
     }
     assert torch.all(dataset[0]["input_ids"] == torch.tensor(token_ids))
     assert torch.all(dataset[0]["attention_mask"] == torch.ones(16))
-    assert torch.all(dataset[0]["position_ids"] == torch.arange(16).unsqueeze(0).expand(3, -1))
-    assert list(dataset[0]["position_ids"].size()) == [3, 16]  # avoid fake positive caused by broadcasting
+    assert torch.all(dataset[0]["position_ids"] == torch.arange(16).unsqueeze(0).expand(4, -1))
+    assert list(dataset[0]["position_ids"].size()) == [4, 16]  # avoid fake positive caused by broadcasting
     assert dataset[0]["raw_prompt_ids"] == token_ids
     assert dataset[0]["ground_truth"] == "48"
     assert isinstance(dataset[0]["multi_modal_data"]["images"][0], Image)
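Note on the shape assertion above: `torch.all(a == b)` alone can pass even when the position-id tensor has the wrong number of rows, because `==` broadcasts. A small illustration (not part of the commit):

```python
import torch

expected = torch.arange(16).unsqueeze(0).expand(4, -1)  # (4, 16)
wrong = torch.arange(16).unsqueeze(0)                   # (1, 16) -- wrong row count

# broadcasting makes the elementwise comparison succeed anyway ...
assert torch.all(wrong == expected)

# ... so only the explicit size check catches the mismatch
assert list(wrong.size()) != [4, 16]
```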

verl/models/transformers/qwen2_vl.py

Lines changed: 15 additions & 2 deletions
@@ -147,6 +147,19 @@ def get_rope_index(
     return position_ids
 
 
+def process_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
+    if position_ids.dim() != 3 or position_ids.size(0) != 4:
+        # we concat the text position ids with the 3D vision position ids by default
+        # see https://github.com/huggingface/transformers/pull/39447
+        raise ValueError("position_ids should be a 3D tensor of shape (4, batch_size, seq_length).")
+
+    if not is_transformers_version_greater_than("4.54.0"):
+        # transformers < 4.54.0 only accepts vision position ids, so we discard the text position ids here
+        position_ids = position_ids[1:]
+
+    return position_ids
+
+
 def qwen2_vl_attn_forward(
     self: "Qwen2VLAttention",
     hidden_states: torch.Tensor,
@@ -272,7 +285,7 @@ def qwen2_vl_forward_old(
     outputs = self.model(
         input_ids=None,
         attention_mask=attention_mask,
-        position_ids=position_ids,
+        position_ids=process_position_ids(position_ids),
         inputs_embeds=inputs_embeds,
         **kwargs,
     )
@@ -306,7 +319,7 @@ def qwen2_vl_base_forward_new(
     )
     outputs = self.language_model(
         input_ids=None,
-        position_ids=position_ids,
+        position_ids=process_position_ids(position_ids),
         attention_mask=attention_mask,
         inputs_embeds=inputs_embeds,
         **kwargs,
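A rough, self-contained re-implementation of what the new helper does, for illustration only; the real `process_position_ids` consults `is_transformers_version_greater_than("4.54.0")` instead of taking a boolean flag:

```python
import torch

def process_position_ids_sketch(position_ids: torch.Tensor, transformers_ge_4_54: bool) -> torch.Tensor:
    # the pipeline always hands over text + 3-D vision positions stacked into 4 rows
    if position_ids.dim() != 3 or position_ids.size(0) != 4:
        raise ValueError("position_ids should be a 3D tensor of shape (4, batch_size, seq_length).")
    # older transformers only accept the three vision rows, so the text row is dropped
    return position_ids if transformers_ge_4_54 else position_ids[1:]

pos = torch.zeros(4, 2, 16, dtype=torch.long)            # (4, batch_size, seq_length)
print(process_position_ids_sketch(pos, True).shape)      # torch.Size([4, 2, 16])
print(process_position_ids_sketch(pos, False).shape)     # torch.Size([3, 2, 16])
```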

verl/utils/dataset.py

Lines changed: 3 additions & 1 deletion
@@ -266,14 +266,16 @@ def __getitem__(self, index):
 
         if self.processor is not None and "Qwen2VLImageProcessor" in self.processor.image_processor.__class__.__name__:
             # qwen2vl mrope
-            position_ids = get_rope_index(
+            vision_position_ids = get_rope_index(
                 self.processor,
                 input_ids=input_ids,
                 image_grid_thw=model_inputs.get("image_grid_thw", None),
                 video_grid_thw=model_inputs.get("video_grid_thw", None),
                 second_per_grid_ts=model_inputs.get("second_per_grid_ts", None),
                 attention_mask=attention_mask,
             )  # (3, seq_length)
+            text_position_ids = torch.arange(len(input_ids)).unsqueeze(0)  # (1, seq_length)
+            position_ids = torch.cat((text_position_ids, vision_position_ids), dim=0)  # (4, seq_length)
         else:
             position_ids = torch.clip(attention_mask.cumsum(dim=0) - 1, min=0, max=None)  # (seq_length,)
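For comparison, the unchanged `else` branch still emits a single row of positions; a quick illustration of the cumsum/clip trick on a left-padded attention mask (dummy values, not from the commit):

```python
import torch

attention_mask = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])  # left padding, then 5 real tokens
position_ids = torch.clip(attention_mask.cumsum(dim=0) - 1, min=0, max=None)
print(position_ids)  # tensor([0, 0, 0, 0, 1, 2, 3, 4])
```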

verl/workers/actor/dp_actor.py

Lines changed: 2 additions & 2 deletions
@@ -77,7 +77,7 @@ def _forward_micro_batch(self, micro_batch: dict[str, torch.Tensor], temperature
         responses = micro_batch["responses"]
         response_length = responses.size(-1)
         if position_ids.dim() == 3:  # qwen2vl mrope
-            position_ids = position_ids.transpose(0, 1)  # (bsz, 3, seqlen) -> (3, bsz, seqlen)
+            position_ids = position_ids.transpose(0, 1)  # (bsz, 4, seqlen) -> (4, bsz, seqlen)
 
         multi_modal_inputs = defaultdict(list)
         if "multi_modal_inputs" in micro_batch:
@@ -96,7 +96,7 @@ def _forward_micro_batch(self, micro_batch: dict[str, torch.Tensor], temperature
                     index_first_axis(rearrange(position_ids, "c b s ... -> (b s) c ..."), indices)
                     .transpose(0, 1)
                     .unsqueeze(1)
-                )  # (3, bsz, seqlen) -> (3, 1, bsz * seqlen)
+                )  # (4, bsz, seqlen) -> (4, 1, bsz * seqlen)
         else:
             position_ids_rmpad = index_first_axis(
                 rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), indices
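The actor's remove-padding path only needed its shape comments updated, but the flow is easy to misread; here is a sketch of the same reshaping with dummy shapes, using plain indexing in place of flash-attn's `index_first_axis` (the `indices` values are made up):

```python
import torch
from einops import rearrange

bsz, seqlen = 2, 8
position_ids = torch.zeros(bsz, 4, seqlen, dtype=torch.long)     # as stored in the micro batch
position_ids = position_ids.transpose(0, 1)                      # (bsz, 4, seqlen) -> (4, bsz, seqlen)

indices = torch.tensor([0, 1, 2, 9, 10])                          # hypothetical non-padding token slots
flat = rearrange(position_ids, "c b s ... -> (b s) c ...")        # (bsz * seqlen, 4)
position_ids_rmpad = flat[indices].transpose(0, 1).unsqueeze(1)   # (4, 1, num_tokens)
print(position_ids_rmpad.shape)                                    # torch.Size([4, 1, 5])
```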

verl/workers/rollout/vllm_rollout_spmd.py

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ def generate_sequences(self, prompts: DataProto) -> DataProto:
         delta_position_id = torch.arange(1, response_length + 1, device=position_ids.device)
         delta_position_id = delta_position_id.view(1, -1).expand(batch_size, -1)
         if position_ids.dim() == 3:  # qwen2vl mrope
-            delta_position_id = delta_position_id.view(batch_size, 1, -1).expand(batch_size, 3, -1)
+            delta_position_id = delta_position_id.view(batch_size, 1, -1).expand(batch_size, 4, -1)
 
         # prompt: left pad + response: right pad
         # attention_mask: [0,0,0,0,1,1,1,1 | 1,1,1,0,0,0,0,0]
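The rollout change follows the same logic: response positions are built by offsetting from the last prompt position, and that offset now has to broadcast over four rows instead of three. A sketch with made-up sizes:

```python
import torch

batch_size, response_length = 2, 5
prompt_position_ids = torch.zeros(batch_size, 4, 10, dtype=torch.long)  # (bsz, 4, prompt_len)

delta_position_id = torch.arange(1, response_length + 1)
delta_position_id = delta_position_id.view(1, -1).expand(batch_size, -1)                  # (bsz, resp_len)
delta_position_id = delta_position_id.view(batch_size, 1, -1).expand(batch_size, 4, -1)   # (bsz, 4, resp_len)

response_position_ids = prompt_position_ids[..., -1:] + delta_position_id
print(response_position_ids.shape)  # torch.Size([2, 4, 5])
```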
