huggingface · qgallouedec · Oct 21, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/docs/source/data_utils.md b/docs/source/data_utils.md
@@ -4,6 +4,10 @@
 
 [[autodoc]] prepare_multimodal_messages
 
+## prepare_multimodal_messages_vllm
+
+[[autodoc]] prepare_multimodal_messages_vllm
+
 ## is_conversational
 
 [[autodoc]] is_conversational

diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
@@ -19,7 +19,7 @@
 
 from datasets import Dataset, DatasetDict
 from parameterized import parameterized
-from transformers import AutoProcessor, AutoTokenizer
+from transformers import AutoProcessor, AutoTokenizer, is_vision_available
 
 from trl.data_utils import (
     apply_chat_template,
@@ -32,44 +32,66 @@
     maybe_unpair_preference_dataset,
     pack_dataset,
     prepare_multimodal_messages,
+    prepare_multimodal_messages_vllm,
     truncate_dataset,
     unpair_preference_dataset,
 )
 
-from .testing_utils import TrlTestCase
+from .testing_utils import TrlTestCase, require_vision
 
 
+if is_vision_available():
+    from PIL import Image
+
+
+@require_vision
 class TestPrepareMultimodalMessages:
     def test_basic_user_assistant_conversation(self):
         """Test basic conversation with user and assistant messages."""
         messages = [
             {"role": "user", "content": "What color is the sky?"},
             {"role": "assistant", "content": "It is blue."},
         ]
-
-        prepare_multimodal_messages(messages, num_images=1)
+        image = Image.new("RGB", (10, 10), color="blue")
+        messages = prepare_multimodal_messages(messages, images=[image])
 
         expected = [
-            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]},
-            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image}, {"type": "text", "text": "What color is the sky?"}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
         ]
 
         assert messages == expected
 
     def test_first_user_message_gets_image(self):
-        """Test that only the first user message gets an image placeholder."""
+        """Test that only the first user message gets an image."""
         messages = [
             {"role": "user", "content": "What color is the sky?"},
             {"role": "assistant", "content": "It is blue."},
             {"role": "user", "content": "How about the grass?"},
         ]
 
-        prepare_multimodal_messages(messages, num_images=1)
+        image = Image.new("RGB", (10, 10), color="blue")
+        messages = prepare_multimodal_messages(messages, images=[image])
 
         expected = [
-            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]},
-            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
-            {"role": "user", "content": [{"type": "text", "text": "How about the grass?"}]},
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image}, {"type": "text", "text": "What color is the sky?"}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": "How about the grass?"}],
+            },
         ]
 
         assert messages == expected
@@ -80,20 +102,23 @@ def test_multiple_images(self):
             {"role": "user", "content": "What color is the sky?"},
             {"role": "assistant", "content": "It is blue."},
         ]
-
-        prepare_multimodal_messages(messages, num_images=3)
+        images = [Image.new("RGB", (10, 10), color=color) for color in ["red", "green", "blue"]]
+        messages = prepare_multimodal_messages(messages, images=images)
 
         expected = [
             {
                 "role": "user",
                 "content": [
-                    {"type": "image"},
-                    {"type": "image"},
-                    {"type": "image"},
+                    {"type": "image", "image": images[0]},
+                    {"type": "image", "image": images[1]},
+                    {"type": "image", "image": images[2]},
                     {"type": "text", "text": "What color is the sky?"},
                 ],
             },
-            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
         ]
 
         assert messages == expected
@@ -105,11 +130,18 @@ def test_system_message_transformation(self):
             {"role": "user", "content": "What color is the sky?"},
         ]
 
-        prepare_multimodal_messages(messages, num_images=1)
+        image = Image.new("RGB", (10, 10), color="blue")
+        messages = prepare_multimodal_messages(messages, images=[image])
 
         expected = [
-            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant"}]},
-            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]},
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant"}],
+            },
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image}, {"type": "text", "text": "What color is the sky?"}],
+            },
         ]
 
         assert messages == expected
@@ -122,10 +154,25 @@ def test_already_prepared_messages_unchanged(self):
             {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
         ]
 
-        original = copy.deepcopy(messages)
-        prepare_multimodal_messages(messages, num_images=1)
+        image = Image.new("RGB", (10, 10), color="blue")
+        messages = prepare_multimodal_messages(messages, images=[image])
 
-        assert messages == original
+        expected = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant"}],
+            },
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image}, {"type": "text", "text": "What color is the sky?"}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
+        ]
+
+        assert messages == expected
 
     def test_mixed_prepared_and_unprepared_messages(self):
         """Test handling of mixed prepared and unprepared messages."""
@@ -135,17 +182,119 @@ def test_mixed_prepared_and_unprepared_messages(self):
             {"role": "user", "content": "What about the grass?"},
         ]
 
-        prepare_multimodal_messages(messages, num_images=1)
+        image = Image.new("RGB", (10, 10), color="blue")
+        messages = prepare_multimodal_messages(messages, images=[image])
 
         expected = [
-            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]},
-            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
-            {"role": "user", "content": [{"type": "text", "text": "What about the grass?"}]},
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image}, {"type": "text", "text": "What color is the sky?"}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": "What about the grass?"}],
+            },
         ]
 
         assert messages == expected
 
 
+@require_vision
+class TestPrepareMultimodalMessagesVLLM:
+    def test_single_image_conversion(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": Image.new("RGB", (10, 10), color="blue")},
+                    {"type": "text", "text": "What color is the sky?"},
+                ],
+            }
+        ]
+
+        result = prepare_multimodal_messages_vllm(messages)
+
+        # Original should remain unchanged (deepcopy test)
+        assert messages[0]["content"][0]["type"] == "image"
+
+        # Converted version should have correct structure
+        assert result[0]["content"][0]["type"] == "image_pil"
+        assert "image_pil" in result[0]["content"][0]
+        assert "image" not in result[0]["content"][0]
+        assert isinstance(result[0]["content"][0]["image_pil"], Image.Image)
+        assert result[0]["content"][1]["type"] == "text"
+
+    def test_mixed_content_conversion(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What color is the sky?"},
+                    {"type": "image", "image": Image.new("RGB", (10, 10), color="blue")},
+                ],
+            }
+        ]
+
+        result = prepare_multimodal_messages_vllm(messages)
+
+        # The image part should be converted, text should be unchanged
+        assert result[0]["content"][0]["type"] == "text"
+        assert result[0]["content"][1]["type"] == "image_pil"
+
+    def test_no_images(self):
+        messages = [{"role": "user", "content": [{"type": "text", "text": "What color is the sky?"}]}]
+
+        result = prepare_multimodal_messages_vllm(messages)
+
+        # Should be identical since there are no images
+        assert result == messages
+        # And a deepcopy — not the same object
+        assert result is not messages
+        assert result[0] is not messages[0]
+
+    def test_multiple_messages(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What color is the sky?"},
+                    {"type": "image", "image": Image.new("RGB", (10, 10), color="blue")},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "It is blue."}],
+            },
+        ]
+
+        result = prepare_multimodal_messages_vllm(messages)
+
+        assert result[0]["content"][1]["type"] == "image_pil"
+        assert result[1]["content"][0]["type"] == "text"
+        assert result[1]["content"][0]["text"] == "It is blue."
+
+    def test_deepcopy_integrity(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What color is the sky?"},
+                    {"type": "image", "image": Image.new("RGB", (10, 10), color="blue")},
+                ],
+            },
+        ]
+        original = copy.deepcopy(messages)
+
+        _ = prepare_multimodal_messages_vllm(messages)
+
+        # Original should not be mutated
+        assert messages == original
+
+
 class TestIsConversational(TrlTestCase):
     conversational_examples = [
         {  # Language modeling