diff --git a/conftest.py b/conftest.py
index 35b4cfb1ebfa..910e9fcc1766 100644
--- a/conftest.py
+++ b/conftest.py
@@ -87,6 +87,8 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
     config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
     config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
+    config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
+    config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
 
     os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"
 
diff --git a/tests/models/exaone4/test_modeling_exaone4.py b/tests/models/exaone4/test_modeling_exaone4.py
index 3bef0a07cae1..fb7754d652d9 100644
--- a/tests/models/exaone4/test_modeling_exaone4.py
+++ b/tests/models/exaone4/test_modeling_exaone4.py
@@ -120,6 +120,7 @@ def test_model_generation_sdpa(self):
         text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT, text)
 
+    @pytest.mark.flash_attn_test
     @slow
     @require_torch_accelerator
     @require_flash_attn
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
index a345f5f37574..4d958aff7007 100644
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -643,6 +643,7 @@ def test_integration_test_4bit_batch2(self):
         self.assertEqual(batched_generated_texts[0], generated_text_0[0])
         self.assertEqual(batched_generated_texts[1], generated_text_1[0])
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_torch_gpu
     @require_bitsandbytes
diff --git a/tests/models/ministral/test_modeling_ministral.py b/tests/models/ministral/test_modeling_ministral.py
index 012a8acf7637..1a41fecd582d 100644
--- a/tests/models/ministral/test_modeling_ministral.py
+++ b/tests/models/ministral/test_modeling_ministral.py
@@ -208,6 +208,7 @@ def test_export_text_with_hybrid_cache(self):
 
         self.assertEqual(export_generated_text, eager_generated_text)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @slow
     def test_past_sliding_window_generation(self):
diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index df3ec6bf7373..ac4797102a78 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -300,6 +300,7 @@ def test_compile_static_cache(self):
         static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)
 
+    @pytest.mark.flash_attn_test
     @parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)])
     @require_flash_attn
     @slow
diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py
index dc76f8b00fba..2c016d861da5 100644
--- a/tests/models/qwen2/test_modeling_qwen2.py
+++ b/tests/models/qwen2/test_modeling_qwen2.py
@@ -274,6 +274,7 @@ def test_export_static_cache(self):
         ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @slow
     def test_3b_generation(self):
diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
index 2b36f597de3b..b54bf3f6d514 100644
--- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
@@ -816,6 +816,7 @@ def test_small_model_integration_test_w_audio(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-Omni-7B",
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index 951000985a04..4a4950c5104f 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -17,6 +17,7 @@
 import tempfile
 import unittest
 
+import pytest
 import requests
 
 from transformers import (
@@ -630,6 +631,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-VL-7B-Instruct",
@@ -658,6 +660,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-VL-7B-Instruct",
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index a309b6ecdb1f..e06c2872ed5d 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -18,6 +18,7 @@
 import tempfile
 import unittest
 
+import pytest
 import requests
 
 from transformers import (
@@ -562,6 +563,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2-VL-7B-Instruct",
@@ -589,6 +591,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2-VL-7B-Instruct",
diff --git a/tests/models/qwen3/test_modeling_qwen3.py b/tests/models/qwen3/test_modeling_qwen3.py
index dc383fc69ab3..06223b474beb 100644
--- a/tests/models/qwen3/test_modeling_qwen3.py
+++ b/tests/models/qwen3/test_modeling_qwen3.py
@@ -266,6 +266,7 @@ def test_export_static_cache(self):
 
     @require_flash_attn
     @slow
+    @pytest.mark.flash_attn_test
     def test_600m_generation(self):
         model_id = "Qwen/Qwen3-0.6B-Base"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
diff --git a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
index ebdb16bc1714..e752406e0a2a 100644
--- a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
+++ b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
@@ -848,6 +848,7 @@ def test_small_model_integration_test_w_audio(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
             "Qwen/Qwen2.5-Omni-7B",
diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
index ccfbca9948e9..ea48ef000d42 100644
--- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
+++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -16,6 +16,8 @@
 import copy
 import unittest
 
+import pytest
+
 from transformers import (
     AutoProcessor,
     Qwen3VLMoeConfig,
@@ -513,6 +515,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
             "Qwen/Qwen3-VL-30B-A3B-Instruct",
@@ -545,6 +548,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
     @slow
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
             "Qwen/Qwen3-VL-30B-A3B-Instruct",
diff --git a/tests/models/video_llama_3/test_modeling_video_llama_3.py b/tests/models/video_llama_3/test_modeling_video_llama_3.py
index 90efb9a43fa6..5ad0cf9d7d4c 100644
--- a/tests/models/video_llama_3/test_modeling_video_llama_3.py
+++ b/tests/models/video_llama_3/test_modeling_video_llama_3.py
@@ -20,6 +20,7 @@
 import unittest
 
 import numpy as np
+import pytest
 import requests
 import torch.nn as nn
 from parameterized import parameterized
@@ -907,6 +908,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = VideoLlama3ForConditionalGeneration.from_pretrained(
             "lkhl/VideoLLaMA3-2B-Image-HF",
@@ -933,6 +935,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
 
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = VideoLlama3ForConditionalGeneration.from_pretrained(
             "lkhl/VideoLLaMA3-2B-Image-HF",
diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py
index 2dfa06585497..78c694a848fc 100644
--- a/tests/quantization/autoawq/test_awq.py
+++ b/tests/quantization/autoawq/test_awq.py
@@ -16,6 +16,8 @@
 import tempfile
 import unittest
 
+import pytest
+
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
 from transformers.testing_utils import (
     backend_empty_cache,
@@ -369,6 +371,7 @@ def test_fused_modules_to_not_convert(self):
     )
     @require_flash_attn
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
     def test_generation_fused(self):
         """
         Test generation quality for fused models - single batch case
@@ -391,6 +394,7 @@ def test_generation_fused(self):
 
         self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_torch_gpu
     @unittest.skipIf(
@@ -443,6 +447,7 @@ def test_generation_llava_fused(self):
 
         self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_torch_multi_gpu
     @unittest.skipIf(
@@ -484,6 +489,7 @@ def test_generation_custom_model(self):
         outputs = model.generate(**inputs, max_new_tokens=12)
         self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
 
+    @pytest.mark.flash_attn_test
     @require_flash_attn
     @require_torch_multi_gpu
     @unittest.skip(reason="Not enough GPU memory on CI runners")
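
Usage sketch (not part of the diff above): once `flash_attn_test` is registered via `config.addinivalue_line` in `conftest.py`, any test can opt in exactly as the hunks do, and CI can select or exclude the marked tests with pytest's `-m` expression. The test name and body below are hypothetical; only the marker name and the `-m` selection syntax come from this change and standard pytest.

```python
# Hypothetical example (not from this diff): opting a test in to the newly
# registered marker, and the pytest commands a CI job could use to filter on it.
import pytest


@pytest.mark.flash_attn_test
def test_uses_flash_attention_path():
    # Run only the marked tests:     pytest -m flash_attn_test
    # Skip them (e.g. on CPU-only):  pytest -m "not flash_attn_test"
    assert True
```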