2 changes: 2 additions & 0 deletions requirements/common.txt
@@ -47,3 +47,5 @@ opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding & Qwen2/2.5-VL
numba == 0.61.2; python_version > '3.9'
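
Note: the two numba pins above rely on PEP 508 environment markers, so pip installs exactly one of them per interpreter. A minimal sketch of the same selection logic in Python (the helper name is illustrative; the version strings come from the pins above):

import sys

def select_numba_pin() -> str:
    # Mirrors the markers above: numba 0.61 dropped Python 3.9 support,
    # so only 3.9 stays pinned to 0.60.0.
    if sys.version_info[:2] == (3, 9):
        return "numba==0.60.0"
    return "numba==0.61.2"  # python_version > '3.9'

print(select_numba_pin())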
3 changes: 0 additions & 3 deletions requirements/cuda.txt
@@ -1,9 +1,6 @@
# Common dependencies
-r common.txt

numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'

# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0
3 changes: 0 additions & 3 deletions requirements/rocm.txt
@@ -1,9 +1,6 @@
# Common dependencies
-r common.txt

numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'

# Dependencies for AMD GPUs
awscli
boto3
198 changes: 198 additions & 0 deletions tests/model_executor/test_qwen2_5_vl_window_index.py
@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch

from vllm.model_executor.models.qwen2_5_vl import (
    Qwen2_5_VisionAttentionScheduler)


def _assert_outputs_match(outputs_torch, outputs_numba, grid_thw):
    # Both implementations must agree on dtype, shape, and values for
    # every returned tensor.
    msg = f"mismatch at grid_thw={grid_thw}"
    for expected, actual in zip(outputs_torch, outputs_numba):
        assert expected.dtype == actual.dtype, msg
        assert expected.shape == actual.shape, msg
        assert torch.equal(expected, actual), msg


@pytest.mark.parametrize("window_size, patch_size, spatial_merge_size", [
(112, 14, 2),
(128, 16, 2),
])
def test_qwen2_5_vl_get_window_indices_correctness(window_size, patch_size,
spatial_merge_size):
scheduler = Qwen2_5_VisionAttentionScheduler(
spatial_merge_size=spatial_merge_size,
window_size=window_size,
patch_size=patch_size,
max_position_embeddings=32768,
device=torch.device("cpu"),
)

for t in range(1, 3):
for h in range(1, 50):
for w in range(1, 50):
grid_thw = torch.tensor(
[[t, h * spatial_merge_size, w * spatial_merge_size]],
dtype=torch.int64,
)

                outputs_torch = scheduler.generate_by_torch(grid_thw)
                outputs_numba = scheduler.generate_by_torch_with_numba(
                    grid_thw)

                _assert_outputs_match(outputs_torch, outputs_numba, grid_thw)


def _grid_thw_generator(t_range, h_range, w_range, spatial_merge_size):
for t in t_range:
for h in h_range:
for w in w_range:
yield torch.tensor(
[[t, h * spatial_merge_size, w * spatial_merge_size]],
dtype=torch.int64,
)


@pytest.mark.parametrize("window_size, patch_size, spatial_merge_size", [
(112, 14, 2),
(128, 16, 2),
])
def test_qwen2_5_vl_get_window_indices_multi_items_correctness(
window_size, patch_size, spatial_merge_size):
scheduler = Qwen2_5_VisionAttentionScheduler(
spatial_merge_size=spatial_merge_size,
window_size=window_size,
patch_size=patch_size,
max_position_embeddings=32768,
device=torch.device("cpu"),
)

for grid_thw1 in _grid_thw_generator(
range(1, 3),
range(1, 18, 3),
range(1, 18, 3),
spatial_merge_size,
):
for grid_thw2 in _grid_thw_generator(
range(1, 3),
range(1, 18, 3),
range(1, 18, 3),
spatial_merge_size,
):
grid_thw = torch.cat([grid_thw1, grid_thw2])

            outputs_torch = scheduler.generate_by_torch(grid_thw)
            outputs_numba = scheduler.generate_by_torch_with_numba(grid_thw)

            _assert_outputs_match(outputs_torch, outputs_numba, grid_thw)
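
For reference, a minimal standalone sketch of the API these tests exercise; the constructor arguments and method names mirror the tests above, while the grid values are illustrative:

import torch

from vllm.model_executor.models.qwen2_5_vl import (
    Qwen2_5_VisionAttentionScheduler)

scheduler = Qwen2_5_VisionAttentionScheduler(
    spatial_merge_size=2,
    window_size=112,
    patch_size=14,
    max_position_embeddings=32768,
    device=torch.device("cpu"),
)

# One image; h and w are given in patches and must be multiples of
# spatial_merge_size.
grid_thw = torch.tensor([[1, 8, 8]], dtype=torch.int64)

# The numba-accelerated path must reproduce the torch reference exactly.
(window_indices, reverse_indices, seqlens_full, seqlens_window,
 cu_seqlens_full, cu_seqlens_window) = scheduler.generate_by_torch(grid_thw)
outputs_numba = scheduler.generate_by_torch_with_numba(grid_thw)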
69 changes: 69 additions & 0 deletions tests/model_executor/test_qwen2_vl_rot_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch

from vllm.model_executor.models.qwen2_vl import Qwen2VLViTRotaryPosGenerator


@pytest.mark.parametrize("spatial_merge_size", [2, 3])
@pytest.mark.parametrize("impl_name, device", [
("generate_by_torch_fused", "cuda"),
("generate_by_numba", "cpu"),
])
def test_qwen2_vl_rot_pos_correctness(
spatial_merge_size,
impl_name,
device,
):
rot_pos_generator = Qwen2VLViTRotaryPosGenerator(
spatial_merge_size=spatial_merge_size,
max_position_embeddings=32768,
device=torch.device(device),
)

for t in range(1, 3):
for h in range(1, 32):
for w in range(1, 32):
                # One image, two identical images, and a batch mixing two
                # different image sizes.
                item = [t, h * spatial_merge_size, w * spatial_merge_size]
                small = [1, 2 * spatial_merge_size, 2 * spatial_merge_size]
                for grid_thw in [
                        torch.tensor([item], dtype=torch.int64),
                        torch.tensor([item] * 2, dtype=torch.int64),
                        torch.tensor([item, small], dtype=torch.int64),
                ]:
                    msg = f"mismatch at grid_thw={grid_thw}"
                    groundtruth = rot_pos_generator.generate_by_torch(grid_thw)
                    testing_impl = getattr(rot_pos_generator, impl_name)
                    actual = testing_impl(grid_thw)

                    assert actual.device.type == device, msg
                    assert groundtruth.dtype == actual.dtype, msg
                    assert groundtruth.shape == actual.shape, msg
                    assert torch.equal(groundtruth, actual.cpu()), msg
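
Likewise, a hedged sketch of the generator this test covers; names and arguments mirror the test, and the grid is illustrative:

import torch

from vllm.model_executor.models.qwen2_vl import Qwen2VLViTRotaryPosGenerator

rot_pos_generator = Qwen2VLViTRotaryPosGenerator(
    spatial_merge_size=2,
    max_position_embeddings=32768,
    device=torch.device("cpu"),
)

grid_thw = torch.tensor([[1, 8, 8]], dtype=torch.int64)

# The numba implementation must match the pure-torch reference exactly.
reference = rot_pos_generator.generate_by_torch(grid_thw)
fast = rot_pos_generator.generate_by_numba(grid_thw)
assert torch.equal(reference, fast.cpu())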