From db4858883b8976f27fe30b097fc888c8620aa87d Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 17:47:35 +0800
Subject: [PATCH 1/6] [Bugfix] Initialize attention bias on the same device as
 Query/Key/Value for Qwen2VL Series

Same issue as https://github.com/vllm-project/vllm/pull/13468, but for
the Qwen-VL series when running under the vLLM GRPOTrainer.

---
 vllm/model_executor/models/qwen2_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index cb92fcbe9fa1..f43c31aea384 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -367,7 +367,7 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None)
+                                                       kv_seqlen=None, device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(
                 q, k, v, attn_bias=attn_bias, p=0, scale=None)

From dfa2cc953e9801fdb076b8c722fc92dcfe24c206 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 17:57:21 +0800
Subject: [PATCH 2/6] Update qwen2_5_vl.py

---
 vllm/model_executor/models/qwen2_5_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0dbff665b5d3..1b010b480108 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -323,7 +323,7 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None)
+                                                       kv_seqlen=None, device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(
                 q, k, v, attn_bias=attn_bias, p=0, scale=None)

From 57dfd839d1f513b85a4efb335fda7d42bdd06df3 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 21:20:05 +0800
Subject: [PATCH 3/6] Initialize attention bias on the same device

Fix linting.

---
 vllm/model_executor/models/qwen2_5_vl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 1b010b480108..50f11e9b85f8 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -323,7 +323,8 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None, device=q.device)
+                                                       kv_seqlen=None, 
+                                                       device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(
                 q, k, v, attn_bias=attn_bias, p=0, scale=None)

From 2cd9f3f1a04e314db671942187d9289519e4092e Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 21:20:45 +0800
Subject: [PATCH 4/6] Initialize attention bias on the same device

Fix linting problem.
---
 vllm/model_executor/models/qwen2_vl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index f43c31aea384..bc2bd23ef720 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -367,7 +367,8 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None, device=q.device)
+                                                       kv_seqlen=None, 
+                                                       device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(
                 q, k, v, attn_bias=attn_bias, p=0, scale=None)

From 89225b8f5fa309109a4f39c285837ea5d2f34af4 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 21:37:52 +0800
Subject: [PATCH 5/6] Update qwen2_5_vl.py

Quick fix for linting (trailing whitespace).

---
 vllm/model_executor/models/qwen2_5_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 50f11e9b85f8..ef3d28c8087d 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -323,7 +323,7 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None, 
+                                                       kv_seqlen=None,
                                                        device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(

From efd828032ebbe5aecabd8482dd464c1e89bcbc21 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 28 Feb 2025 21:38:18 +0800
Subject: [PATCH 6/6] Update qwen2_vl.py

Quick fix for linting (trailing whitespace).

---
 vllm/model_executor/models/qwen2_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index bc2bd23ef720..523b53d5ee41 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -367,7 +367,7 @@ def forward(
 
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
             attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
-                                                       kv_seqlen=None, 
+                                                       kv_seqlen=None,
                                                        device=q.device)
 
             context_layer = xops.memory_efficient_attention_forward(
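
Note: for context, below is a minimal standalone sketch of the failure mode
this series fixes. Without the device argument, xformers builds the
BlockDiagonalMask tensors on the default (CPU) device, which can mismatch
CUDA-resident q/k/v inside memory_efficient_attention_forward. The tensor
shapes, sequence lengths, and CUDA availability below are illustrative
assumptions, and the sketch assumes an xformers release whose
BlockDiagonalMask.from_seqlens accepts a `device` keyword (the same API
this patch relies on).

    # Illustrative sketch only; assumes CUDA and a recent xformers.
    import torch
    from xformers import ops as xops
    from xformers.ops.fmha.attn_bias import BlockDiagonalMask

    # Packed layout: two sequences of lengths 2 and 4 share one batch row,
    # shaped (1, total_tokens, num_heads, head_dim), resident on the GPU.
    q = torch.randn(1, 6, 4, 64, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)
    seqlens = [2, 4]

    # Pre-fix behavior: omitting `device` leaves the mask's tensors on the
    # CPU, so the forward call can fail with a device-mismatch error when
    # q/k/v live on CUDA. Passing device=q.device (the patched code) keeps
    # the attention bias co-located with the activations.
    attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens,
                                               kv_seqlen=None,
                                               device=q.device)
    out = xops.memory_efficient_attention_forward(
        q, k, v, attn_bias=attn_bias, p=0, scale=None)
    print(out.shape)  # torch.Size([1, 6, 4, 64])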