1 change: 1 addition & 0 deletions conftest.py
@@ -87,6 +87,7 @@ def pytest_configure(config):
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")

os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"

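Note: with the marker registered in conftest.py above, the flash-attention tests tagged below can be selected or skipped via pytest's standard -m option. The commands here are only an illustration of how the marker is meant to be used, not part of this diff:

    pytest tests/ -m flash_attn_test
    pytest tests/ -m "not flash_attn_test"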
1 change: 1 addition & 0 deletions tests/models/exaone4/test_modeling_exaone4.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def test_model_generation_sdpa(self):
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT, text)

+@pytest.mark.flash_attn_test
@slow
@require_torch_accelerator
@require_flash_attn
1 change: 1 addition & 0 deletions tests/models/idefics2/test_modeling_idefics2.py
@@ -643,6 +643,7 @@ def test_integration_test_4bit_batch2(self):
self.assertEqual(batched_generated_texts[0], generated_text_0[0])
self.assertEqual(batched_generated_texts[1], generated_text_1[0])

+@pytest.mark.flash_attn_test
@require_flash_attn
@require_torch_gpu
@require_bitsandbytes
1 change: 1 addition & 0 deletions tests/models/ministral/test_modeling_ministral.py
@@ -208,6 +208,7 @@ def test_export_text_with_hybrid_cache(self):

self.assertEqual(export_generated_text, eager_generated_text)

+@pytest.mark.flash_attn_test
@require_flash_attn
@slow
def test_past_sliding_window_generation(self):
1 change: 1 addition & 0 deletions tests/models/mistral/test_modeling_mistral.py
@@ -300,6 +300,7 @@ def test_compile_static_cache(self):
static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)

+@pytest.mark.flash_attn_test
@parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)])
@require_flash_attn
@slow
1 change: 1 addition & 0 deletions tests/models/qwen2/test_modeling_qwen2.py
@@ -274,6 +274,7 @@ def test_export_static_cache(self):
ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)

+@pytest.mark.flash_attn_test
@require_flash_attn
@slow
def test_3b_generation(self):
1 change: 1 addition & 0 deletions tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
@@ -816,6 +816,7 @@ def test_small_model_integration_test_w_audio(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B",
3 changes: 3 additions & 0 deletions tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -17,6 +17,7 @@
import tempfile
import unittest

+import pytest
import requests

from transformers import (
@@ -630,6 +631,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct",
@@ -658,6 +660,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct",
3 changes: 3 additions & 0 deletions tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -18,6 +18,7 @@
import tempfile
import unittest

+import pytest
import requests

from transformers import (
@@ -562,6 +563,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct",
@@ -589,6 +591,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct",
1 change: 1 addition & 0 deletions tests/models/qwen3/test_modeling_qwen3.py
@@ -266,6 +266,7 @@ def test_export_static_cache(self):

@require_flash_attn
@slow
+@pytest.mark.flash_attn_test
def test_600m_generation(self):
model_id = "Qwen/Qwen3-0.6B-Base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
1 change: 1 addition & 0 deletions tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
@@ -848,6 +848,7 @@ def test_small_model_integration_test_w_audio(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-Omni-7B",
4 changes: 4 additions & 0 deletions tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -16,6 +16,8 @@
import copy
import unittest

+import pytest
+
from transformers import (
AutoProcessor,
Qwen3VLMoeConfig,
@@ -513,6 +515,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-30B-A3B-Instruct",
@@ -545,6 +548,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
@slow
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
"Qwen/Qwen3-VL-30B-A3B-Instruct",
3 changes: 3 additions & 0 deletions tests/models/video_llama_3/test_modeling_video_llama_3.py
@@ -20,6 +20,7 @@
import unittest

import numpy as np
+import pytest
import requests
import torch.nn as nn
from parameterized import parameterized
@@ -907,6 +908,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):

@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF",
@@ -933,6 +935,7 @@ def test_small_model_integration_test_batch_flashatt2(self):

@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF",
6 changes: 6 additions & 0 deletions tests/quantization/autoawq/test_awq.py
@@ -16,6 +16,8 @@
import tempfile
import unittest

+import pytest
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
@@ -369,6 +371,7 @@ def test_fused_modules_to_not_convert(self):
)
@require_flash_attn
@require_torch_gpu
+@pytest.mark.flash_attn_test
def test_generation_fused(self):
"""
Test generation quality for fused models - single batch case
@@ -391,6 +394,7 @@ def test_generation_fused(self):

self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)

+@pytest.mark.flash_attn_test
@require_flash_attn
@require_torch_gpu
@unittest.skipIf(
@@ -443,6 +447,7 @@ def test_generation_llava_fused(self):

self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)

+@pytest.mark.flash_attn_test
@require_flash_attn
@require_torch_multi_gpu
@unittest.skipIf(
@@ -484,6 +489,7 @@ def test_generation_custom_model(self):
outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)

+@pytest.mark.flash_attn_test
@require_flash_attn
@require_torch_multi_gpu
@unittest.skip(reason="Not enough GPU memory on CI runners")