From 40cd03c28df1552f0331d1e523b316a91e260ba0 Mon Sep 17 00:00:00 2001 From: Helena Date: Fri, 13 Mar 2026 18:03:56 +0100 Subject: [PATCH] Fixes for Windows and NPU support for GenAI test - Fix TemporaryDirectory issues on Windows - Compare model output tokens instead of tokenized outputs for LLMs - Initial NPU support - Use chat template for VLM test --- tests/openvino/test_genai.py | 347 +++++++++++++++++++++-------------- 1 file changed, 213 insertions(+), 134 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 2d075e7874..8d70a8c0aa 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -1,9 +1,23 @@ +""" +Test OpenVINO GenAI inference on models exported with optimum-intel + +- OpenVINO device can be set by environment variable OPENVINO_TEST_DEVICE + - For NPU, Text2Speech test is not supported; for LLM and VLM only a limited list of models is currently supported. + - For GPU, there are known failed tests on some GPUs: qwen2_moe, qwen3_moe, qwen2_vl, qwen2_5_vl, speecht5, qwen3_eagle3. + This is under investigation. +""" + +import gc import os +import shutil import tempfile +import traceback as traceback_mod import unittest +from pathlib import Path import numpy as np import openvino as ov +import pytest import requests import torch from openvino_genai import ( @@ -33,15 +47,76 @@ OVModelForTextToSpeechSeq2Seq, OVModelForVisualCausalLM, ) +from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING from optimum.intel.utils.import_utils import is_openvino_version from optimum.utils import is_transformers_version +# NPU does not support f32 inference +TEST_CONFIG = {"CACHE_DIR": ""} if OPENVINO_DEVICE == "NPU" else {**F32_CONFIG, "CACHE_DIR": ""} + os.environ["TOKENIZERS_PARALLELISM"] = "false" +_temp_dirs = [] # Collect temp dirs for batch cleanup after all tests finish + + +class _ClearFramesPlugin: + """Pytest plugin that clears traceback frames and deletes temp dirs after all tests finish. + + On Windows, when a test fails, pytest holds the exception traceback which keeps + references to all local variables in the test frame — including OpenVINO model + objects that hold file handles on temp directory contents. + + Clearing frames between tests can cause access violations in subsequent tests. Instead, all + tracebacks are collected and cleared once at session end, then temp dirs are deleted. + """ + + def __init__(self): + self._pending_tracebacks = [] + + @pytest.hookimpl(hookwrapper=True) + def pytest_runtest_makereport(self, item, call): + yield + if call.excinfo is not None and call.excinfo.value is not None: + tb = call.excinfo.value.__traceback__ + if tb is not None: + self._pending_tracebacks.append(tb) + + def pytest_sessionfinish(self, session, exitstatus): + for tb in self._pending_tracebacks: + traceback_mod.clear_frames(tb) + self._pending_tracebacks.clear() + gc.collect() + for tmp_path in _temp_dirs: + shutil.rmtree(tmp_path, ignore_errors=True) + _temp_dirs.clear() + + +_clear_frames_plugin = _ClearFramesPlugin() + + +@pytest.fixture(autouse=True) +def temp_dir_fixture(request): + """ + Provides a temporary directory as self.temp_dir, cleaned up after all tests finish. + + Immediate cleanup is attempted but may fail on Windows if OpenVINO model objects + still hold file handles. Failed directories are cleaned up at session end after + traceback frames are cleared and objects are garbage collected. 
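
    Note: the directory path is exposed as self.temp_dir via request.instance, so this autouse
    fixture assumes unittest.TestCase-style test classes, as used throughout this module.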
+ """ + if not request.config.pluginmanager.has_plugin("_clear_frames"): + request.config.pluginmanager.register(_clear_frames_plugin, "_clear_frames") + tmp_path = tempfile.mkdtemp() + request.instance.temp_dir = tmp_path + yield + try: + shutil.rmtree(tmp_path) + except (PermissionError, OSError): + _temp_dirs.append(tmp_path) + class LLMPipelineTestCase(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( + ALL_SUPPORTED_ARCHITECTURES = ( "gpt_bigcode", "bloom", "codegen", @@ -78,30 +153,36 @@ class LLMPipelineTestCase(unittest.TestCase): "granitemoe", ) - if is_transformers_version(">=", "4.48.0"): - SUPPORTED_ARCHITECTURES += ("cohere2",) + # to be expanded, other architectures work on NPU too + # qwen2, phi and phi3 tests are flaky on NPU, not including for now + NPU_SUPPORTED_ARCHITECTURES = ("gpt2", "glm", "opt", "qwen3_moe", "gpt_oss") + + # min versions if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") - if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek",) - if is_transformers_version("<", "4.56.0"): - SUPPORTED_ARCHITECTURES += ("qwen",) + ALL_SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") + if is_transformers_version(">=", "4.48.0"): + ALL_SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.49"): - SUPPORTED_ARCHITECTURES += ("gemma3_text",) + ALL_SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): - SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") + ALL_SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") if is_transformers_version(">=", "4.51.3"): - SUPPORTED_ARCHITECTURES += ("glm4",) + ALL_SUPPORTED_ARCHITECTURES += ("glm4",) if is_transformers_version(">=", "4.53.0"): - SUPPORTED_ARCHITECTURES += ("arcee",) + ALL_SUPPORTED_ARCHITECTURES += ("arcee",) if is_transformers_version(">=", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("exaone4",) + ALL_SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version(">=", "4.55.0"): - SUPPORTED_ARCHITECTURES += ("gpt_oss",) + ALL_SUPPORTED_ARCHITECTURES += ("gpt_oss",) + + # max versions if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("minicpm", "minicpm3", "arctic") + ALL_SUPPORTED_ARCHITECTURES += ("minicpm", "minicpm3", "arctic", "deepseek") if is_transformers_version("<", "4.56.0"): - SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4") + ALL_SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4", "qwen") + + # for now we do not test NPU with old transformers versions + SUPPORTED_ARCHITECTURES = NPU_SUPPORTED_ARCHITECTURES if OPENVINO_DEVICE == "NPU" else ALL_SUPPORTED_ARCHITECTURES REMOTE_CODE_MODELS = ( "chatglm", @@ -132,11 +213,6 @@ class LLMPipelineTestCase(unittest.TestCase): "jais", "qwen", ) - NO_ECHO_MODELS = ( # weird - "gpt_oss", - "orion", - "xglm", - ) GEN_KWARGS = { "max_new_tokens": 10, @@ -148,60 +224,53 @@ class LLMPipelineTestCase(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_outputs(self, model_arch): model_id = MODEL_NAMES[model_arch] - echo = model_arch not in self.NO_ECHO_MODELS use_cache = model_arch not in self.NO_CACHE_MODELS trust_remote_code = model_arch in self.REMOTE_CODE_MODELS set_seed(42) transformers_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=trust_remote_code).eval() - with tempfile.TemporaryDirectory() as tmpdirname: - set_seed(42) - main_export( - model_name_or_path=model_id, - task="text-generation-with-past", - 
trust_remote_code=trust_remote_code, - convert_tokenizer=True, - output=tmpdirname, - ) - optimum_model = OVModelForCausalLM.from_pretrained( - tmpdirname, trust_remote_code=trust_remote_code, device=OPENVINO_DEVICE, ov_config=F32_CONFIG - ) - genai_model = LLMPipeline(tmpdirname, device=OPENVINO_DEVICE, **F32_CONFIG) + set_seed(42) + main_export( + model_name_or_path=model_id, + task="text-generation-with-past", + trust_remote_code=trust_remote_code, + convert_tokenizer=True, + output=self.temp_dir, + ) + genai_model = LLMPipeline(self.temp_dir, device=OPENVINO_DEVICE, **TEST_CONFIG) prompt = "Paris is the capital of" tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) inputs = tokenizer(prompt, return_tensors="pt") + input_len = inputs["input_ids"].shape[-1] with torch.no_grad(): transformers_ids = transformers_model.generate(**inputs, use_cache=use_cache, **self.GEN_KWARGS) - transformers_output = tokenizer.decode(transformers_ids[0], skip_special_tokens=True) + transformers_ids = transformers_ids.squeeze()[input_len:] - optimum_ids = optimum_model.generate(**inputs, use_cache=use_cache, **self.GEN_KWARGS) - optimum_output = tokenizer.decode(optimum_ids[0], skip_special_tokens=True) + if OPENVINO_DEVICE != "NPU": + optimum_model = OVModelForCausalLM.from_pretrained( + self.temp_dir, trust_remote_code=trust_remote_code, device=OPENVINO_DEVICE, ov_config=TEST_CONFIG + ) + optimum_ids = optimum_model.generate(**inputs, use_cache=use_cache, **self.GEN_KWARGS) + optimum_ids = optimum_ids.squeeze()[input_len:] + self.assertEqual( + transformers_ids.squeeze().tolist(), + optimum_ids.squeeze().tolist(), + "Transformers ids and Optimum ids are not the same", + ) - genai_output = genai_model.generate( - prompt, echo=echo, apply_chat_template=False, ignore_eos=True, **self.GEN_KWARGS + genai_ids = genai_model( + ov.Tensor(inputs["input_ids"].numpy()), apply_chat_template=False, **self.GEN_KWARGS + ).tokens[0] + self.assertEqual( + transformers_ids.tolist(), genai_ids, "Transformers ids and OpenVINO GenAI ids are not the same" ) - if not echo: - # if echo is not supported, trim the prompt from the outputs and trim spaces - # NOTE: this is an approximation, as detokenize(prompt_ids + generated_ids) - prompt != detokenize(generated_ids) - transformers_output = transformers_output[len(prompt) :].strip() - optimum_output = optimum_output[len(prompt) :].strip() - - # assert they are not empty - self.assertTrue(transformers_output) - self.assertTrue(optimum_output) - self.assertTrue(genai_output) - - # compare outputs - self.assertEqual(transformers_output, optimum_output) - self.assertEqual(transformers_output, genai_output) - class VLMPipelineTestCase(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( + ALL_SUPPORTED_ARCHITECTURES = ( "llava", "llava_next", "llava_next_video", @@ -209,17 +278,22 @@ class VLMPipelineTestCase(unittest.TestCase): "qwen2_vl", ) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("llava_next_mistral",) + ALL_SUPPORTED_ARCHITECTURES += ("llava_next_mistral",) if is_transformers_version("<", "4.52.0"): - SUPPORTED_ARCHITECTURES += ("minicpmo",) + ALL_SUPPORTED_ARCHITECTURES += ("minicpmo",) if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("llava-qwen2", "phi3_v") + ALL_SUPPORTED_ARCHITECTURES += ("llava-qwen2", "phi3_v") if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) + ALL_SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", 
"4.54.0"): - SUPPORTED_ARCHITECTURES += ("phi4mm",) + ALL_SUPPORTED_ARCHITECTURES += ("phi4mm",) if is_transformers_version(">=", "4.49"): - SUPPORTED_ARCHITECTURES += ("gemma3",) + ALL_SUPPORTED_ARCHITECTURES += ("gemma3",) + + # for now we do not test NPU with old transformers versions + NPU_SUPPORTED_ARCHITECTURES = ("qwen2_vl", "qwen2_5_vl") + + SUPPORTED_ARCHITECTURES = NPU_SUPPORTED_ARCHITECTURES if OPENVINO_DEVICE == "NPU" else ALL_SUPPORTED_ARCHITECTURES REMOTE_CODE_MODELS = ( "minicpmv", @@ -278,55 +352,57 @@ def test_compare_outputs(self, model_arch): transformers_class = self._get_model_class(model_arch) transformers_model = transformers_class.from_pretrained(model_id, trust_remote_code=trust_remote_code).eval() - with tempfile.TemporaryDirectory() as tmpdirname: - set_seed(42) - main_export( - model_name_or_path=model_id, - trust_remote_code=trust_remote_code, - task="image-text-to-text", - convert_tokenizer=True, - output=tmpdirname, - ) - optimum_model = OVModelForVisualCausalLM.from_pretrained( - tmpdirname, device=OPENVINO_DEVICE, ov_config=F32_CONFIG, trust_remote_code=trust_remote_code - ) - genai_model = VLMPipeline(tmpdirname, device=OPENVINO_DEVICE, **F32_CONFIG) + set_seed(42) + main_export( + model_name_or_path=model_id, + trust_remote_code=trust_remote_code, + task="image-text-to-text", + convert_tokenizer=True, + output=self.temp_dir, + ) + genai_model = VLMPipeline(self.temp_dir, device=OPENVINO_DEVICE, **TEST_CONFIG) image = self.IMAGE prompt = "A photo of a cat sitting on a" config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code) processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code) - inputs = optimum_model.preprocess_inputs( + # On NPU, the optimum models cannot be loaded, so we use the preprocess_inputs method from the model class directly + model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type] + inputs = model_cls.preprocess_inputs( text=prompt, image=image, tokenizer=tokenizer, processor=processor, config=config ) + full_prompt = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) with torch.no_grad(): transformers_ids = transformers_model.generate(**inputs, **self.GEN_KWARGS) transformers_output = tokenizer.decode(transformers_ids[0], skip_special_tokens=True) + transformers_output = transformers_output[len(full_prompt) :].strip() - optimum_ids = optimum_model.generate(**inputs, **self.GEN_KWARGS) - optimum_output = tokenizer.decode(optimum_ids[0], skip_special_tokens=True) + if OPENVINO_DEVICE != "NPU": + optimum_model = OVModelForVisualCausalLM.from_pretrained( + self.temp_dir, device=OPENVINO_DEVICE, ov_config=TEST_CONFIG, trust_remote_code=trust_remote_code + ) + optimum_ids = optimum_model.generate(**inputs, **self.GEN_KWARGS) + optimum_output = tokenizer.decode(optimum_ids[0], skip_special_tokens=True) + optimum_output = optimum_output[len(full_prompt) :].strip() + self.assertTrue(optimum_output) + self.assertEqual(transformers_output, optimum_output, "Transformers and Optimum outputs are not the same") + # apply_chat_template is set to True because it is also set in preprocess_inputs() genai_output = genai_model.generate( - prompt, images=[ov.Tensor(np.array(image))], ignore_eos=True, **self.GEN_KWARGS + prompt, images=[ov.Tensor(np.array(image))], ignore_eos=True, apply_chat_template=True, **self.GEN_KWARGS ).texts[0] - full_prompt = tokenizer.decode(inputs["input_ids"][0], 
skip_special_tokens=True) - transformers_output = transformers_output[len(full_prompt) :].strip() - optimum_output = optimum_output[len(full_prompt) :].strip() - # assert they are not empty self.assertTrue(transformers_output) - self.assertTrue(optimum_output) self.assertTrue(genai_output) # compare outputs - self.assertEqual(transformers_output, optimum_output) - self.assertEqual(transformers_output, genai_output) + self.assertEqual(transformers_output, genai_output, "Transformers and OpenVINO GenAI outputs are not the same") -class Speeh2TextPipelineTestCase(unittest.TestCase): +class Speech2TextPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ("whisper",) GEN_KWARGS = { @@ -349,18 +425,15 @@ def test_compare_outputs(self, model_arch): set_seed(42) transformers_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).eval() - with tempfile.TemporaryDirectory() as tmpdirname: - set_seed(42) - main_export( - model_name_or_path=model_id, - task="automatic-speech-recognition-with-past", - convert_tokenizer=True, - output=tmpdirname, - ) - optimum_model = OVModelForSpeechSeq2Seq.from_pretrained( - tmpdirname, device=OPENVINO_DEVICE, ov_config=F32_CONFIG - ) - genai_model = WhisperPipeline(tmpdirname, device=OPENVINO_DEVICE, **F32_CONFIG) + set_seed(42) + main_export( + model_name_or_path=model_id, + task="automatic-speech-recognition-with-past", + convert_tokenizer=True, + output=self.temp_dir, + ) + + genai_model = WhisperPipeline(self.temp_dir, device=OPENVINO_DEVICE, **TEST_CONFIG) audio = self._get_audio() processor = AutoProcessor.from_pretrained(model_id) @@ -371,15 +444,20 @@ def test_compare_outputs(self, model_arch): transformers_ids = transformers_model.generate(**inputs, **self.GEN_KWARGS) transformers_output = tokenizer.decode(transformers_ids[0], skip_special_tokens=True) - optimum_ids = optimum_model.generate(**inputs, **self.GEN_KWARGS) - optimum_output = tokenizer.decode(optimum_ids[0], skip_special_tokens=True) + if OPENVINO_DEVICE != "NPU": + optimum_model = OVModelForSpeechSeq2Seq.from_pretrained( + self.temp_dir, device=OPENVINO_DEVICE, ov_config=TEST_CONFIG + ) + optimum_ids = optimum_model.generate(**inputs, **self.GEN_KWARGS) + optimum_output = tokenizer.decode(optimum_ids[0], skip_special_tokens=True) + self.assertEqual(transformers_output, optimum_output) genai_output = genai_model.generate(inputs["input_features"].flatten().tolist(), **self.GEN_KWARGS).texts[0] - self.assertEqual(transformers_output, optimum_output) self.assertEqual(transformers_output, genai_output) +@pytest.mark.skipif(OPENVINO_DEVICE == "NPU", reason="Text2Speech test is not yet supported on NPU") class Text2SpeechPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ("speecht5",) VOCODER = "fxmarty/speecht5-hifigan-tiny" @@ -412,19 +490,18 @@ def test_compare_outputs(self, model_arch): set_seed(42) transformers_model = AutoModelForTextToSpectrogram.from_pretrained(model_id).eval() - with tempfile.TemporaryDirectory() as tmpdirname: - set_seed(42) - main_export( - model_name_or_path=model_id, - task="text-to-audio-with-past", - model_kwargs={"vocoder": self.VOCODER}, - convert_tokenizer=True, - output=tmpdirname, - ) - optimum_model = OVModelForTextToSpeechSeq2Seq.from_pretrained( - tmpdirname, device=OPENVINO_DEVICE, ov_config=F32_CONFIG - ) - genai_model = Text2SpeechPipeline(tmpdirname, device=OPENVINO_DEVICE, **F32_CONFIG) + set_seed(42) + main_export( + model_name_or_path=model_id, + task="text-to-audio-with-past", + model_kwargs={"vocoder": self.VOCODER}, + 
convert_tokenizer=True, + output=self.temp_dir, + ) + optimum_model = OVModelForTextToSpeechSeq2Seq.from_pretrained( + self.temp_dir, device=OPENVINO_DEVICE, ov_config=TEST_CONFIG + ) + genai_model = Text2SpeechPipeline(self.temp_dir, device=OPENVINO_DEVICE, **TEST_CONFIG) text = "Hello, how are you?" processor = AutoProcessor.from_pretrained(model_id) @@ -443,10 +520,11 @@ def test_compare_outputs(self, model_arch): genai_output = genai_model.generate(text, **self.GEN_KWARGS).speeches[0] genai_output = torch.from_numpy(genai_output.data).squeeze(0) # collapse batch dimension (if any) - torch.testing.assert_close(transformers_output, optimum_output, rtol=1e-2, atol=1e-4) - torch.testing.assert_close(transformers_output, genai_output, rtol=1e-2, atol=1e-4) + torch.testing.assert_close(transformers_output, optimum_output, rtol=1e-2, atol=1e-3) + torch.testing.assert_close(transformers_output, genai_output, rtol=1e-2, atol=1e-3) +@pytest.mark.skipif(OPENVINO_DEVICE == "NPU", reason="Eagle3 test is not yet supported on NPU") class LLMPipelineWithEagle3TestCase(unittest.TestCase): GEN_KWARGS = { "max_new_tokens": 10, @@ -466,24 +544,25 @@ def test_compare_outputs(self, model_arch, model_pair): trust_remote_code = model_arch in REMOTE_CODE_MODELS # export main and draft eagle3 models and initialize OV LLM pipelines w/o Eagle3 - with tempfile.TemporaryDirectory() as draft_model_path, tempfile.TemporaryDirectory() as main_model_path: - main_export( - model_name_or_path=draft_model_id, - task="text-generation-with-past", - trust_remote_code=trust_remote_code, - convert_tokenizer=False, - output=draft_model_path, - ) - main_export( - model_name_or_path=target_model_id, - task="text-generation-with-past", - convert_tokenizer=True, - output=main_model_path, - ) + draft_model_path = Path(self.temp_dir) / "draft_model" + main_model_path = Path(self.temp_dir) / "main_model" + main_export( + model_name_or_path=draft_model_id, + task="text-generation-with-past", + trust_remote_code=trust_remote_code, + convert_tokenizer=False, + output=draft_model_path, + ) + main_export( + model_name_or_path=target_model_id, + task="text-generation-with-past", + convert_tokenizer=True, + output=main_model_path, + ) - ov_draft_model = draft_model(draft_model_path, "CPU") - ov_eagle3_pipe = LLMPipeline(main_model_path, OPENVINO_DEVICE, draft_model=ov_draft_model, **F32_CONFIG) - ov_pipe = LLMPipeline(main_model_path, OPENVINO_DEVICE, **F32_CONFIG) + ov_draft_model = draft_model(draft_model_path, "CPU") + ov_eagle3_pipe = LLMPipeline(main_model_path, OPENVINO_DEVICE, draft_model=ov_draft_model, **TEST_CONFIG) + ov_pipe = LLMPipeline(main_model_path, OPENVINO_DEVICE, **TEST_CONFIG) prompt = "Paris is the capital of" genai_eagle3_output = ov_eagle3_pipe.generate(