
Commit 01f5ac7

flash attn pytest marker (#41781)
* flash attn marker

* 111

---------

Co-authored-by: ydshieh <[email protected]>
1 parent: 2c5b888

File tree: 14 files changed (+29, −0 lines)


conftest.py (2 additions, 0 deletions)

@@ -87,6 +87,8 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
     config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
     config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
+    config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
+    config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")

     os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"
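Registering the two markers in pytest_configure keeps pytest from emitting PytestUnknownMarkWarning when they are applied, and makes them usable as collection-time filters. A minimal sketch of how a marked test can then be selected; the test below is hypothetical and not part of this commit:

import pytest

# Once registered, the marker is selectable from the command line, e.g.:
#   pytest -m flash_attn_test tests/            # run only the flash-attention tests
#   pytest -m "not flash_attn_test" tests/      # exclude them instead
@pytest.mark.flash_attn_test
def test_example_flash_attn():
    ...  # placeholder body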

tests/models/exaone4/test_modeling_exaone4.py (1 addition, 0 deletions)

@@ -120,6 +120,7 @@ def test_model_generation_sdpa(self):
         text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT, text)

+    @pytest.mark.flash_attn_test
     @slow
     @require_torch_accelerator
     @require_flash_attn

tests/models/idefics2/test_modeling_idefics2.py (1 addition, 0 deletions)

@@ -643,6 +643,7 @@ def test_integration_test_4bit_batch2(self):
         self.assertEqual(batched_generated_texts[0], generated_text_0[0])
         self.assertEqual(batched_generated_texts[1], generated_text_1[0])

+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_torch_gpu
     @require_bitsandbytes

tests/models/ministral/test_modeling_ministral.py (1 addition, 0 deletions)

@@ -208,6 +208,7 @@ def test_export_text_with_hybrid_cache(self):

         self.assertEqual(export_generated_text, eager_generated_text)

+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @slow
     def test_past_sliding_window_generation(self):

tests/models/mistral/test_modeling_mistral.py (1 addition, 0 deletions)

@@ -300,6 +300,7 @@ def test_compile_static_cache(self):
         static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)

+    @pytest.mark.flash_attn_test
     @parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)])
     @require_flash_attn
     @slow

tests/models/qwen2/test_modeling_qwen2.py (1 addition, 0 deletions)

@@ -274,6 +274,7 @@ def test_export_static_cache(self):
         ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)

+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @slow
     def test_3b_generation(self):

tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py (1 addition, 0 deletions)

@@ -816,6 +816,7 @@ def test_small_model_integration_test_w_audio(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-Omni-7B",

tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py (3 additions, 0 deletions)

@@ -17,6 +17,7 @@
 import tempfile
 import unittest

+import pytest
 import requests

 from transformers import (

@@ -630,6 +631,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-VL-7B-Instruct",

@@ -658,6 +660,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-VL-7B-Instruct",

tests/models/qwen2_vl/test_modeling_qwen2_vl.py (3 additions, 0 deletions)

@@ -18,6 +18,7 @@
 import tempfile
 import unittest

+import pytest
 import requests

 from transformers import (

@@ -562,6 +563,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2-VL-7B-Instruct",

@@ -589,6 +591,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2-VL-7B-Instruct",

tests/models/qwen3/test_modeling_qwen3.py (1 addition, 0 deletions)

@@ -266,6 +266,7 @@ def test_export_static_cache(self):

     @require_flash_attn
     @slow
+    @pytest.mark.flash_attn_test
     def test_600m_generation(self):
         model_id = "Qwen/Qwen3-0.6B-Base"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
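Every test-file change above follows the same pattern: the new collection-time marker is stacked on top of the existing runtime skip decorators. A condensed illustration of that stack, assuming the decorators come from transformers.testing_utils as they do in these files; the class name and test body are placeholders, not code from this commit:

import unittest

import pytest

from transformers.testing_utils import require_flash_attn, require_torch_gpu, slow


class ExampleModelIntegrationTest(unittest.TestCase):
    @pytest.mark.flash_attn_test  # lets pytest -m select or deselect this test at collection time
    @require_flash_attn  # skips at runtime if flash-attn is not installed
    @require_torch_gpu  # skips at runtime without a CUDA device
    @slow  # skips unless slow tests are enabled (RUN_SLOW=1)
    def test_generation_with_flash_attention_2(self):
        ...  # placeholder; the real assertions live in the diffs above

The marker complements the require_* decorators rather than replacing them: the decorators still guard against missing dependencies at runtime, while the marker lets a dedicated CI job target exactly the flash-attention suites with pytest -m.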
