2 changes: 2 additions & 0 deletions requirements/common.txt
@@ -47,3 +47,5 @@ opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding & Qwen2/2.5-VL
numba == 0.61.2; python_version > '3.9'
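
Note: the two numba pins above rely on PEP 508 environment markers, so pip installs exactly one of them per interpreter. A minimal sketch of the same selection logic in Python (the helper name is illustrative; the version strings come from the pins above):

import sys

def select_numba_pin() -> str:
    # Mirrors the markers above: numba 0.61 dropped Python 3.9 support,
    # so only 3.9 stays pinned to 0.60.0.
    if sys.version_info[:2] == (3, 9):
        return "numba==0.60.0"
    return "numba==0.61.2"  # python_version > '3.9'

print(select_numba_pin())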
3 changes: 0 additions & 3 deletions requirements/cuda.txt
@@ -1,9 +1,6 @@
# Common dependencies
-r common.txt

numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'

# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0
3 changes: 0 additions & 3 deletions requirements/rocm.txt
@@ -1,9 +1,6 @@
# Common dependencies
-r common.txt

numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'

# Dependencies for AMD GPUs
awscli
boto3
198 changes: 198 additions & 0 deletions tests/model_executor/test_qwen2_5_vl_window_index.py
@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch

from vllm.model_executor.models.qwen2_5_vl import (
    Qwen2_5_VisionAttentionScheduler)


def _assert_outputs_match(outputs_torch, outputs_numba, grid_thw):
    # Both implementations must agree on dtype, shape, and values for
    # every returned tensor.
    msg = f"mismatch at grid_thw={grid_thw}"
    for expected, actual in zip(outputs_torch, outputs_numba):
        assert expected.dtype == actual.dtype, msg
        assert expected.shape == actual.shape, msg
        assert torch.equal(expected, actual), msg


@pytest.mark.parametrize("window_size, patch_size, spatial_merge_size", [
(112, 14, 2),
(128, 16, 2),
])
def test_qwen2_5_vl_get_window_indices_correctness(window_size, patch_size,
spatial_merge_size):
scheduler = Qwen2_5_VisionAttentionScheduler(
spatial_merge_size=spatial_merge_size,
window_size=window_size,
patch_size=patch_size,
max_position_embeddings=32768,
device=torch.device("cpu"),
)

for t in range(1, 3):
for h in range(1, 50):
for w in range(1, 50):
grid_thw = torch.tensor(
[[t, h * spatial_merge_size, w * spatial_merge_size]],
dtype=torch.int64,
)

                outputs_torch = scheduler.generate_by_torch(grid_thw)
                outputs_numba = scheduler.generate_by_torch_with_numba(
                    grid_thw)

                _assert_outputs_match(outputs_torch, outputs_numba, grid_thw)


def _grid_thw_generator(t_range, h_range, w_range, spatial_merge_size):
for t in t_range:
for h in h_range:
for w in w_range:
yield torch.tensor(
[[t, h * spatial_merge_size, w * spatial_merge_size]],
dtype=torch.int64,
)


@pytest.mark.parametrize("window_size, patch_size, spatial_merge_size", [
(112, 14, 2),
(128, 16, 2),
])
def test_qwen2_5_vl_get_window_indices_multi_items_correctness(
window_size, patch_size, spatial_merge_size):
scheduler = Qwen2_5_VisionAttentionScheduler(
spatial_merge_size=spatial_merge_size,
window_size=window_size,
patch_size=patch_size,
max_position_embeddings=32768,
device=torch.device("cpu"),
)

for grid_thw1 in _grid_thw_generator(
range(1, 3),
range(1, 18, 3),
range(1, 18, 3),
spatial_merge_size,
):
for grid_thw2 in _grid_thw_generator(
range(1, 3),
range(1, 18, 3),
range(1, 18, 3),
spatial_merge_size,
):
grid_thw = torch.cat([grid_thw1, grid_thw2])

            outputs_torch = scheduler.generate_by_torch(grid_thw)
            outputs_numba = scheduler.generate_by_torch_with_numba(grid_thw)

            _assert_outputs_match(outputs_torch, outputs_numba, grid_thw)
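
For reference, a minimal standalone sketch of the API these tests exercise; the constructor arguments and method names mirror the tests above, while the grid values are illustrative:

import torch

from vllm.model_executor.models.qwen2_5_vl import (
    Qwen2_5_VisionAttentionScheduler)

scheduler = Qwen2_5_VisionAttentionScheduler(
    spatial_merge_size=2,
    window_size=112,
    patch_size=14,
    max_position_embeddings=32768,
    device=torch.device("cpu"),
)

# One image; h and w are given in patches and must be multiples of
# spatial_merge_size.
grid_thw = torch.tensor([[1, 8, 8]], dtype=torch.int64)

# The numba-accelerated path must reproduce the torch reference exactly.
(window_indices, reverse_indices, seqlens_full, seqlens_window,
 cu_seqlens_full, cu_seqlens_window) = scheduler.generate_by_torch(grid_thw)
outputs_numba = scheduler.generate_by_torch_with_numba(grid_thw)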
69 changes: 69 additions & 0 deletions tests/model_executor/test_qwen2_vl_rot_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch

from vllm.model_executor.models.qwen2_vl import Qwen2VLViTRotaryPosGenerator


@pytest.mark.parametrize("spatial_merge_size", [2, 3])
@pytest.mark.parametrize("impl_name, device", [
("generate_by_torch_fused", "cuda"),
("generate_by_numba", "cpu"),
])
def test_qwen2_vl_rot_pos_correctness(
spatial_merge_size,
impl_name,
device,
):
rot_pos_generator = Qwen2VLViTRotaryPosGenerator(
spatial_merge_size=spatial_merge_size,
max_position_embeddings=32768,
device=torch.device(device),
)

for t in range(1, 3):
for h in range(1, 32):
for w in range(1, 32):
                # One image, two identical images, and a batch mixing two
                # different image sizes.
                item = [t, h * spatial_merge_size, w * spatial_merge_size]
                small = [1, 2 * spatial_merge_size, 2 * spatial_merge_size]
                for grid_thw in [
                        torch.tensor([item], dtype=torch.int64),
                        torch.tensor([item] * 2, dtype=torch.int64),
                        torch.tensor([item, small], dtype=torch.int64),
                ]:
                    msg = f"mismatch at grid_thw={grid_thw}"
                    groundtruth = rot_pos_generator.generate_by_torch(grid_thw)
                    testing_impl = getattr(rot_pos_generator, impl_name)
                    actual = testing_impl(grid_thw)

                    assert actual.device.type == device, msg
                    assert groundtruth.dtype == actual.dtype, msg
                    assert groundtruth.shape == actual.shape, msg
                    assert torch.equal(groundtruth, actual.cpu()), msg
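
Likewise, a hedged sketch of the generator this test covers; names and arguments mirror the test, and the grid is illustrative:

import torch

from vllm.model_executor.models.qwen2_vl import Qwen2VLViTRotaryPosGenerator

rot_pos_generator = Qwen2VLViTRotaryPosGenerator(
    spatial_merge_size=2,
    max_position_embeddings=32768,
    device=torch.device("cpu"),
)

grid_thw = torch.tensor([[1, 8, 8]], dtype=torch.int64)

# The numba implementation must match the pure-torch reference exactly.
reference = rot_pos_generator.generate_by_torch(grid_thw)
fast = rot_pos_generator.generate_by_numba(grid_thw)
assert torch.equal(reference, fast.cpu())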