diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 896c5f8b43..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index ac3291acfd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index c75ba43bef..48f07b9396 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,15 +34,15 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] - name: Test run: | - HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 + HF_HOME=/tmp/ hf download hf-internal-testing/tiny-random-gpt2 HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - huggingface-cli download hf-internal-testing/tiny-random-gpt2 + hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv - HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv + pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv + HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index ace91222e7..48e3a7409b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest"] + transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -54,14 +54,29 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Install OpenVINO + run: | + uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 - if: ${{ matrix.transformers-version != 'latest' }} + name: Install transformers + run: | + uv pip install transformers==${{ matrix.transformers-version }} + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers + + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ 
matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index 90df6a2af3..ace0246329 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -97,7 +97,12 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 4b271d898b..580253a36a 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests,diffusers] transformers[testing] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older transformers diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..4ab826378b 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -131,6 +131,7 @@ Here is the list of the supported architectures : - Qwen2VL - Qwen2.5VL - Qwen3VL +- Qwen3.5 - ResNet - Roberta - Roformer diff --git a/optimum/exporters/openvino/_ov_ops.py b/optimum/exporters/openvino/_ov_ops.py new file mode 100644 index 0000000000..78e5b6d23b --- /dev/null +++ b/optimum/exporters/openvino/_ov_ops.py @@ -0,0 +1,113 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Conversion rule for the `RecurrentAttentionCellOp` operation in a Torch graph. +# The `RecurrentAttentionCellOp` appears in the Torch graph as a result of replacing +# the `torch.nn.Module` block `RecurrentAttentionCell` via a registered +# `ModuleExtension` for `RecurrentAttentionCell` in the OpenVINO PyTorch frontend. 
+import numpy as np + +import openvino as ov +import openvino.opset14 as ops + + +def convert_recurrent_attention_cell(context): + query = context.get_input(0) + key = context.get_input(1) + value = context.get_input(2) + g = context.get_input(3) + beta = context.get_input(4) + last_recurrent_state_old = context.get_input(5) + + value_shape = ops.shape_of(value) + const_zero = ops.constant(0, dtype=np.float32) + core_attn_out = ops.broadcast(const_zero, value_shape) + const_two_out = ops.constant(2, dtype=np.int32) + const_zero_out = ops.constant(0, dtype=np.int32) + seq_len = ops.gather(value_shape, const_two_out, const_zero_out) + + timestep_param = ops.parameter([], np.int32, "timestep") + q_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "q_t") + k_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "k_t") + v_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "v_t") + g_t_param = ops.parameter([-1, -1, 1], np.float32, "g_t") + beta_t_param = ops.parameter([-1, -1, 1], np.float32, "beta_t") + last_recurrent_state_t = ops.parameter([-1, -1, -1, -1], np.float32, "last_recurrent_state_t") + core_attn_out_t = ops.parameter([-1, -1, -1, -1], np.float32, "core_attn_out_t") + + const_two = ops.constant(2, dtype=np.int32) + q_t = ops.squeeze(q_t_param, const_two) + k_t = ops.squeeze(k_t_param, const_two) + v_t = ops.squeeze(v_t_param, const_two) + const_minus_one = ops.constant(-1, dtype=np.int32) + g_t = ops.unsqueeze(ops.exp(g_t_param), const_minus_one) + beta_t = beta_t_param + + last_recurrent_state_in = ops.multiply(last_recurrent_state_t, g_t) + const_minus_two = ops.constant(-2, dtype=np.int32) + kv_mem = ops.multiply(last_recurrent_state_in, ops.unsqueeze(k_t, const_minus_one)) + kv_mem = ops.reduce_sum(kv_mem, const_minus_two, False) + delta = ops.multiply(ops.subtract(v_t, kv_mem), beta_t) + last_recurrent_state_delta = ops.multiply( + ops.unsqueeze(k_t, const_minus_one), ops.unsqueeze(delta, const_minus_two) + ) + last_recurrent_state_in = ops.add(last_recurrent_state_in, last_recurrent_state_delta) + core_attn_update = ops.multiply(last_recurrent_state_in, ops.unsqueeze(q_t, const_minus_one)) + core_attn_update = ops.reduce_sum(core_attn_update, const_minus_two, True) + const_zero = ops.constant(0, dtype=np.int32) + timestep = ops.unsqueeze(timestep_param, const_zero) + + core_attn_out_res = ops.scatter_update(core_attn_out_t, timestep, core_attn_update, const_two) + last_recurrent_state_res = last_recurrent_state_in + + body_cond = ops.constant([True], dtype=bool) + + body_model = ov.Model( + [body_cond, last_recurrent_state_res, core_attn_out_res], + [ + timestep_param, + q_t_param, + k_t_param, + v_t_param, + g_t_param, + beta_t_param, + last_recurrent_state_t, + core_attn_out_t, + ], + "body_model", + ) + + seq_len = ops.convert(seq_len, "i32") + loop = ops.loop(seq_len, ops.constant(True, dtype="bool")) + loop.set_function(body_model) + + loop.set_sliced_input(q_t_param, query, 0, 1, 1, -1, 2) + loop.set_sliced_input(k_t_param, key, 0, 1, 1, -1, 2) + loop.set_sliced_input(v_t_param, value, 0, 1, 1, -1, 2) + loop.set_sliced_input(g_t_param, g, 0, 1, 1, -1, 2) + loop.set_sliced_input(beta_t_param, beta, 0, 1, 1, -1, 2) + loop.set_merged_input(last_recurrent_state_t, last_recurrent_state_old, last_recurrent_state_res.output(0)) + loop.set_merged_input(core_attn_out_t, core_attn_out.output(0), core_attn_out_res.output(0)) + loop.set_special_body_ports([0, 0]) + + core_attn_out_new = loop.get_iter_value(core_attn_out_res.output(0), -1) + last_recurrent_state_new = 
loop.get_iter_value(last_recurrent_state_res.output(0), -1) + + flatten_shape = ops.constant([-1], dtype=np.int32) + core_attn_out_new = ops.reshape(core_attn_out_new, flatten_shape, False) + last_recurrent_state_new = ops.reshape(last_recurrent_state_new, flatten_shape, False) + + final_output = ops.concat([core_attn_out_new, last_recurrent_state_new], 0) + + return [final_output.output(0)] diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index eda3c7e01b..cc46de53aa 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -696,20 +696,21 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - except (AttributeError, KeyError, TypeError): - misplaced_generation_parameters = {} - if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(model.generation_config, param_name, param_value) - setattr(model.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + except (AttributeError, KeyError, TypeError): + misplaced_generation_parameters = {} + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) # Saving the model config and preprocessor as this is needed sometimes. 
save_config(model.config, output) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7ffe158396..a1e77272ab 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -194,6 +194,8 @@ Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + Qwen3_5ModelPatcher, + Qwen3_5VisionEmbMergerPatcher, Qwen3MoeModelPatcher, Qwen3VLLanguageModelPatcher, Qwen3VLVisionEmbMergerPatcher, @@ -201,6 +203,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, + _get_model_attribute, ) @@ -237,38 +240,7 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} - TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( - "transformers", - "LlavaForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( - "transformers", - "LlavaNextForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( - "transformers", - "Qwen2VLForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( - "transformers", - "AutoModelForVision2Seq", - ) - TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( - "transformers", - "Gemma3ForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "idefics3", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) - TasksManager._CUSTOM_CLASSES[("pt", "smolvlm", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) + TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -282,15 +254,49 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) - TasksManager._CUSTOM_CLASSES[("pt", "llama4", "image-text-to-text")] = ( + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_5", "image-text-to-text")] = ( "transformers", "AutoModelForImageTextToText", ) + # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 + if is_transformers_version("<", "4.46"): + TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( + "transformers", + "LlavaForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( + "transformers", + "LlavaNextForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) + + # since transformers v4.50, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.50.0/src/transformers/models/auto/modeling_auto.py#L835 + if is_transformers_version("<", "4.50"): + TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( + "transformers", + "Gemma3ForConditionalGeneration", + ) + + # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText + # 
https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 + if is_transformers_version("<", "4.52"): + TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( + "transformers", + "AutoModelForVision2Seq", + ) + if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + if "text-to-image" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"] = {} TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana-sprint"] = "SanaSprintPipeline" if is_diffusers_available() and "text-to-video" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: @@ -329,6 +335,7 @@ class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) _MODEL_PATCHER = BaichuanModelPatcher + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -843,6 +850,7 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig): ) class BitnetOpenVINOConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.52.1" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = OVDecoderModelPatcher @@ -858,7 +866,7 @@ class BitnetOpenVINOConfig(LlamaOnnxConfig): library_name="transformers", ) class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): - pass + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -871,6 +879,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1030,6 +1040,7 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1039,7 +1050,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 - + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1092,6 +1103,8 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher @@ -1296,6 +1309,7 @@ def generate(self, input_name: str, framework: str = "pt", 
int_dtype: str = "int @register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) @@ -1305,6 +1319,7 @@ class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1314,6 +1329,7 @@ class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1336,6 +1352,7 @@ class CodeGenOpenVINOConfig(CodeGenOnnxConfig): ) class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", @@ -1355,6 +1372,7 @@ class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): ) class JaisOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator @@ -1523,6 +1541,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1881,14 +1900,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model if not hasattr(model, "lm_head") else model + return _get_model_attribute(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def 
patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1994,6 +2013,8 @@ class LlavaNextVideoConfigBehavior(str, enum.Enum): @register_in_tasks_manager("llava_next_video", *["image-text-to-text"], library_name="transformers") class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.42.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] def with_behavior( @@ -2075,11 +2096,16 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) text_embedding.config = model.language_model.config return text_embedding + if behavior == VLMConfigBehavior.LANGUAGE: + return model.language_model + return super().get_model_for_behavior(model, behavior) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" + def __init__( self, config: "PretrainedConfig", @@ -2148,14 +2174,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model + return _get_model_attribute(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_model_attribute(model, "language_model").get_input_embeddings() + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -2864,6 +2890,7 @@ class MiniCPMVConfigBehavior(str, enum.Enum): @register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") class MiniCPMVOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = () @@ -3681,18 +3708,20 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): return model if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = model.visual.patch_embed + vision_embeddings = _get_model_attribute(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = model.visual + vision_emb_merger = _get_model_attribute(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: text_embedding = ( - model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + model.model.embed_tokens + if hasattr(model.model, "embed_tokens") + else _get_model_attribute(model, "language_model").embed_tokens ) text_embedding.config = model.config return text_embedding @@ -3711,12 +3740,17 @@ def with_behavior( behavior = QwenVLConfigBehavior(behavior) if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return 
get_vlm_text_embeddings_config( + "qwen2", + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + ) if behavior == QwenVLConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "qwen2", - self._orig_config, + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Qwen2VLLanguageModelPatcher, @@ -3827,7 +3861,7 @@ def __init__( @staticmethod def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed + vision_emb_pos = _get_model_attribute(model, "visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos @@ -3999,7 +4033,8 @@ class T5OpenVINOConfig(T5OnnxConfig): library_name="transformers", ) class MT5OpenVINOConfig(T5OpenVINOConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4088,6 +4123,8 @@ class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): @register_in_tasks_manager("got_ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers") class GotOCR2OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4203,6 +4240,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class Idefics3OpenVINOConfig(BaseVLMOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsInputGenerator) MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4261,6 +4300,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("smolvlm", *["image-text-to-text"], library_name="transformers") class SmolVLMOpenVINOConfig(Idefics3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4325,6 +4366,8 @@ class PegasusOpenVINOConfig(PegasusOnnxConfig): ) class MarianOpenVINOConfig(MarianOnnxConfig): _MODEL_PATCHER = MarianModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DummySpeechT5OpenVINOInputGenerator(DummyInputGenerator): @@ -4528,6 +4571,8 @@ def with_behavior( ) class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator _MODEL_PATCHER = Llama4TextModelPatcher @@ -4538,6 +4583,8 @@ class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): ) class Llama4OpenVINOConfig(GotOCR2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -4779,6 +4826,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + 
MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): @@ -4976,7 +5025,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.99" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = AfmoeModelPatcher @@ -5005,7 +5054,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - pass + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) @@ -5050,7 +5099,8 @@ class MobileBertOpenVINOConfig(MobileBertOnnxConfig): @register_in_tasks_manager("xlm", *COMMON_TEXT_TASKS) class XLMOpenVINOConfig(XLMOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("xlm-roberta", *COMMON_TEXT_TASKS) @@ -5075,7 +5125,8 @@ class CamembertOpenVINOConfig(CamembertOnnxConfig): @register_in_tasks_manager("flaubert", *COMMON_TEXT_TASKS) class FlaubertOpenVINOConfig(FlaubertOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -5107,7 +5158,8 @@ class Data2VecAudioOpenVINOConfig(Data2VecAudioOnnxConfig): @register_in_tasks_manager("data2vec-text", *COMMON_TEXT_TASKS) class Data2VecTextOpenVINOConfig(Data2VecTextOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("data2vec-vision", *["feature-extraction", "image-classification"]) @@ -5314,3 +5366,261 @@ class HunyuanV1DenseOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.57.0" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator + + +class Qwen3_5DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + """ + Generates dummy cache_params inputs for Qwen3.5 architectures. 
+ """ + + SUPPORTED_INPUT_NAMES = ("cache_params",) + + def __init__( + self, + task: str, + normalized_config, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + **kwargs, + ) + + config = normalized_config.config + self.num_full_attn_layers = config.layer_types.count("full_attention") + self.num_linear_attn_layers = config.layer_types.count("linear_attention") + self.conv_kernel_size = config.linear_conv_kernel_dim + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.num_key_value_heads = config.num_key_value_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + cache_params = [] + + for idx in range(self.num_linear_attn_layers): + d_inner = self.num_k_heads * (2 * self.head_k_dim + self.head_v_dim * self.num_v_heads // self.num_k_heads) + conv_state_shape = ( + self.batch_size, + d_inner, + self.conv_kernel_size, + ) + conv_state = self.random_float_tensor(conv_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(conv_state) + num_heads = self.num_v_heads + recurrent_state_shape = (self.batch_size, num_heads, self.head_k_dim, self.head_v_dim) + recurrent_state = self.random_float_tensor(recurrent_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(recurrent_state) + + for idx in range(self.num_full_attn_layers): + kv_shape = (self.batch_size, self.num_key_value_heads, self.sequence_length, self.head_dim) + k = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + v = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + cache_params.append(k) + cache_params.append(v) + + return cache_params + + +@register_in_tasks_manager( + "qwen3_5_text", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class Qwen3_5TextOpenVINOConfig(Qwen3OpenVINOConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + MIN_TRANSFORMERS_VERSION = "4.57.0" + _MODEL_PATCHER = Qwen3_5ModelPatcher + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + cache_name_prefix = "cache_params.past" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + cache_name_prefix = "cache_params.present" + + self.num_full_attn_layers = self._normalized_config.layer_types.count("full_attention") + self.num_linear_attn_layers = self._normalized_config.layer_types.count("linear_attention") + + for i in range(self.num_linear_attn_layers): + inputs_or_outputs[f"{cache_name_prefix}.conv.{i}"] = {0: "batch_size"} + inputs_or_outputs[f"{cache_name_prefix}.ssm.{i}"] = {0: "batch_size"} + + for i in range(self.num_full_attn_layers): + 
inputs_or_outputs[f"{cache_name_prefix}.key.{i}"] = {0: "batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{cache_name_prefix}.value.{i}"] = {0: "batch_size", 2: decoder_sequence_name} + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + if self.use_past_in_inputs: + self.add_past_key_values(common_inputs, direction="inputs") + return common_inputs + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("cache_params")] + if self.use_past_in_inputs: + input_names.extend(["cache_params"]) + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' + ) + + return dummy_inputs + + +@register_in_tasks_manager( + "qwen3_5", + *["image-text-to-text"], + library_name="transformers", +) +class Qwen3_5OpenVINOConfig(Qwen2VLOpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = "4.57.0" + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: QwenVLConfigBehavior = QwenVLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + behavior=behavior, + ) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): + if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + vision_emb_pos = model.visual.pos_embed + vision_emb_pos.config = model.config.vision_config + return vision_emb_pos + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = model.model.language_model.embed_tokens + text_embedding.config = model.config + return text_embedding + + return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) + + def with_behavior( + self, + behavior: Union[str, QwenVLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. 
+ """ + if isinstance(behavior, str) and not isinstance(behavior, QwenVLConfigBehavior): + behavior = QwenVLConfigBehavior(behavior) + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ) + + if behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_5_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3_5ModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior in ( + QwenVLConfigBehavior.VISION_EMBEDDINGS, + QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER, + QwenVLConfigBehavior.VISION_EMBEDDINGS_POS, + ): + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen3_5VisionEmbMergerPatcher(self, model, model_kwargs) + if ( + self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS + or self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS + ): + return ModelPatcher(self, model, model_kwargs=model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return { + "input": {1: "sequence_length"}, + } + return super().inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: + return super().outputs + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return {"last_hidden_state": {0: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return {"inputs_embeds": {0: "batch_size", 1: "sequence_length"}} + if self._behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_internal_text_generation_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ).outputs + raise Exception("Unknown Qwen3.5 behavior type.") diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 557cd1f8d1..7321b2371d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -19,7 +19,7 @@ import math import types from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -80,6 +80,28 @@ logger = logging.getLogger(__name__) +def postprocess_past_key_values(past_key_values): + if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): + if hasattr(past_key_values, "to_legacy_cache"): + past_key_values = past_key_values.to_legacy_cache() + elif isinstance(past_key_values, DynamicCache): + past_key_values = [(lay.keys, lay.values) for lay in past_key_values.layers] + elif isinstance(past_key_values, EncoderDecoderCache): + past_key_values = 
[ + (self_lay.keys, self_lay.values, cross_lay.keys, cross_lay.values) + for self_lay, cross_lay in zip( + past_key_values.self_attention_cache.layers, + past_key_values.cross_attention_cache.layers, + ) + ] + return past_key_values + + +def _get_model_attribute(model, name): + target = getattr(model, "model", model) if is_transformers_version(">=", "5") else model + return getattr(target, name) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -309,18 +331,21 @@ def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class MixtralModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - - for layer in self._model.model.layers: - layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _mixtral_sparse_moe_block_forward, layer.block_sparse_moe - ) + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) + else: + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward class ArcticModelPatcher(MixtralModelPatcher): @@ -1364,7 +1389,11 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1437,7 +1466,7 @@ def phi3_442_forward( next_cache = None if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = postprocess_past_key_values(next_decoder_cache) if use_legacy_cache else next_decoder_cache if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -1621,16 +1650,22 @@ def _phi_moe_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class PhiMoEModelPatcher(Phi3ModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe + ) + else: + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = 
layer.block_sparse_moe._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward def _aquila_self_attn_sdpa_forward( @@ -2697,15 +2732,19 @@ def gptj_attn_forward( if output_attentions: self._attn = self._orig_attn + kwargs = {} + if is_transformers_version("<", "5"): + kwargs["head_mask"] = head_mask + return self._orig_forward( hidden_states, layer_past, attention_mask, position_ids, - head_mask, use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, + **kwargs, ) @@ -2936,7 +2975,11 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = DynamicCache.from_legacy_cache(legacy_pkv) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + else: + pkv = DynamicCache(legacy_pkv) + return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -2957,7 +3000,7 @@ def patched_forward(*args, **kwargs): outputs = self.orig_forward(*args, **kwargs) if return_legacy_cache: - outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + outputs.past_key_values = postprocess_past_key_values(outputs.past_key_values) return outputs @@ -3213,7 +3256,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + image_outputs = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3224,7 +3267,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = self.multi_modal_projector(selected_image_feature) + image_features = _get_model_attribute(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3232,7 +3275,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = self.vision_tower(pixel_values, output_hidden_states=True) + image_features = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -3265,7 +3308,6 @@ def __init__( ): model.__orig_forward = model.forward model.forward = types.MethodType(llava_vision_embed_forward, model) - super().__init__(config, model, model_kwargs) def __exit__(self, exc_type, exc_value, traceback): @@ -3298,8 +3340,9 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward + # TODO: use get_image_features instead and add image_sizes as input when exporting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L746 model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) - super().__init__(config, model, model_kwargs) def __exit__(self, exc_type, exc_value, traceback): @@ -4067,7 +4110,11 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + new_past_key_values = DynamicCache(past_key_values) + result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4077,7 +4124,7 @@ def forward_wrap( use_cache=use_cache, ) if past_key_values is not None: - result["past_key_values"] = result["past_key_values"].to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(result["past_key_values"]) return result model.forward = types.MethodType(forward_wrap, model) @@ -4108,9 +4155,11 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - from transformers.cache_utils import DynamicCache + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) - pkv = DynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -4123,7 +4172,7 @@ def lm_forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = 
model.forward model.forward = types.MethodType(lm_forward, model) @@ -4435,6 +4484,7 @@ def _granite_moe_parallel_experts_forward(self, inputs, expert_size): class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() + for layer in self._model.model.layers: block_sparse_moe = layer.block_sparse_moe block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward @@ -4452,6 +4502,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: block_sparse_moe = layer.block_sparse_moe block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward @@ -4492,10 +4543,14 @@ def patched_forward(*args, **kwargs): if pkv is not None: if isinstance(pkv, EncoderDecoderCache): - pkv = pkv.self_attention_cache.to_legacy_cache() + pkv = postprocess_past_key_values(pkv.self_attention_cache) else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = EncoderDecoderCache.from_legacy_cache(pkv) + + if is_transformers_version("<", "5"): + pkv = EncoderDecoderCache.from_legacy_cache(pkv) + else: + pkv = EncoderDecoderCache(DynamicCache(pkv), DynamicCache()) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4506,7 +4561,7 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} @@ -4602,18 +4657,31 @@ def __init__( model: "PreTrainedModel", model_kwargs: Dict[str, Any], ): - model.__orig_forward = model.forward - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if hasattr(model, "model") and hasattr(model.model, "get_image_features"): - model.forward = model.model.get_image_features - else: - model.forward = model.get_image_features super().__init__(config, model, model_kwargs) - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.forward = self._model.__orig_forward + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 + if ( + hasattr(self._model, "model") + and hasattr(self._model.model, "get_image_features") + and is_transformers_version("<", "5") + ): + get_image_features = self._model.model.get_image_features + else: + get_image_features = self._model.get_image_features + + outputs = get_image_features(*args, **kwargs) + + # we should be able to specify pooler_output as output_name, not supported here as pooler_output key does not exist + if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): + outputs = outputs.pooler_output + + output_names = 
list(config.outputs.keys()) + return {output_names[0]: outputs} + + self.patched_forward = patched_forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147 @@ -4680,7 +4748,10 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4706,7 +4777,7 @@ def forward( **forward_kwargs, ) upd_pkv = result["past_key_values"] - result["past_key_values"] = upd_pkv.to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(upd_pkv) return result if is_transformers_version("<", "4.53.0"): @@ -5045,8 +5116,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - from transformers.cache_utils import EncoderDecoderCache - """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer @@ -5264,14 +5333,18 @@ def _qwen2moe_sparse_block_forward(self, hidden_states: torch.Tensor) -> torch.T class Qwen2MoEPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.52.0"): + + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) + if is_transformers_version(">=", "5"): + self._model.set_experts_implementation("batched_mm") + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.52.0"): + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_unpatch(self._model, Qwen2MoeSparseMoeBlock) @@ -5594,7 +5667,10 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + else: + past_key_values = EncoderDecoderCache(DynamicCache(past_key_values), DynamicCache()) output_sequence = inputs_embeds output_cross_attentions = False @@ -5627,7 +5703,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: if isinstance(past_key_values, EncoderDecoderCache): - past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + past_key_values = postprocess_past_key_values(past_key_values.self_attention_cache) else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5678,7 +5754,11 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model 
def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = DynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5689,7 +5769,7 @@ def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_value hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -6260,6 +6340,15 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores +# Copied from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/masking_utils.py#L105 +# transformers.masking_utils._legacy_chunked_overlay deprecated since transformers v5 +def _legacy_chunked_overlay(chunk_size: int) -> Callable: + def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: + return kv_idx // chunk_size == q_idx // chunk_size + + return inner_mask + + class Llama4TextModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -6276,8 +6365,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - transformers.masking_utils.chunked_overlay = ( - lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: _legacy_chunked_overlay( + chunk_size ) def __exit__(self, exc_type, exc_value, traceback): @@ -6619,14 +6708,16 @@ class Qwen3MoeModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched + if is_transformers_version(">=", "5"): + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): Qwen3MoeSparseMoeBlock.forward = self.original_moe_forward @@ -7296,16 +7387,19 @@ class GptOssModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts self.original_gpt_oss_forward = GptOssExperts.forward GptOssExperts.forward = gpt_oss_forward + if is_transformers_version(">=", "5"): + self._model.set_experts_implementation("batched_mm") + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from 
transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts GptOssExperts.forward = self.original_gpt_oss_forward @@ -7494,10 +7588,12 @@ def patch_sparse_moe(sparse_moe_layer): super().__enter__() setattr(self._model, self.orig_forward_name, self.patched_forward) - self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask - self._model.model._update_causal_mask = types.MethodType( - granite_moe_hybrid_update_causal_mask, self._model.model - ) + if is_transformers_version("<", "5"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + granite_moe_hybrid_update_causal_mask, self._model.model + ) + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): patch_sparse_moe(layer.block_sparse_moe) @@ -7517,7 +7613,9 @@ def unpatch_sparse_moe(sparse_moe_layer): super().__exit__(exc_type, exc_value, traceback) setattr(self._model, self.orig_forward_name, self.model_orig_forward) - self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + if is_transformers_version("<", "5"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): unpatch_sparse_moe(layer.block_sparse_moe) @@ -7899,3 +7997,417 @@ def forward( hidden_states=outputs.hidden_states, d2t=d2t_out, ) + + +# Patched implementation of the gated delta rule in recurrent form. +# Adapted from: +# https://github.com/huggingface/transformers/blob/v4.57-release/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L522 +# +# To represent the for-loop that generates output embeddings, we use a module +# and the conversion extension mechanism. 
This is necessary because there is +# no known vectorized form of this loop that would allow it to be correctly +# traced with torch.jit.trace +def patched_recurrent_gated_delta_rule( + self, query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False +): + def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6): + """This function is intended to align with the l2norm implementation in the FLA library.""" + inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) + return x * inv_norm + + initial_dtype = query.dtype + if use_qk_l2norm_in_kernel: + query = l2norm(query, dim=-1, eps=1e-6) + key = l2norm(key, dim=-1, eps=1e-6) + query, key, value, beta, g = [ + x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) + ] + + batch_size, num_heads, sequence_length, k_head_dim = key.shape + v_head_dim = value.shape[-1] + scale = 1 / (query.shape[-1] ** 0.5) + query = query * scale + + last_recurrent_state = ( + torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) + if initial_state is None + else initial_state.to(value) + ) + + output_cell = self.recurrent_attention_cell( + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ) + + num_elems = value.numel() + core_attn_out = output_cell[:num_elems].reshape(value.shape) + last_recurrent_state = output_cell[num_elems:].reshape(last_recurrent_state.shape) + + if not output_final_state: + last_recurrent_state = None + core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) + return core_attn_out, last_recurrent_state + + +# The CausalConv1D block is overridden with a generic patch provided by `ov_causal_conv1d()`. +# The GatedDeltaNet block is overridden with a recurrent version of its implementation. +# +# To replace GatedDeltaNet with its recurrent form, patching uses the ModuleExtension +# approach, which replaces the GatedDeltaNet block with a single operation, +# `GatedDeltaNetOp`. OpenVINO then applies the `convert_recurrent_attention_cell()` +# conversion rule to this operation. 
+def qwen3_5_gated_delta_net_forward( + self, + hidden_states: torch.Tensor, + cache_params=None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, +): + def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + # NOTE: attention mask is a 2D boolean tensor + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + + # getting projected states from cache if it exists + layer_idx = None + recurrent_state = None + if cache_params is not None: + layer_idx = cache_params.linear_attn_mapping[self.layer_idx] + conv_state = cache_params.conv_states[layer_idx] + recurrent_state = cache_params.recurrent_states[layer_idx] + + mixed_qkv = self.in_proj_qkv(hidden_states) + mixed_qkv = mixed_qkv.transpose(1, 2) + + z = self.in_proj_z(hidden_states) + z = z.reshape(batch_size, seq_len, -1, self.head_v_dim) + + b = self.in_proj_b(hidden_states) + a = self.in_proj_a(hidden_states) + + if cache_params is not None: + new_mixed_qkv, new_conv_state = ov_causal_conv1d(conv_state, mixed_qkv, self.conv1d.weight, self.conv1d.bias) + mixed_qkv = F.silu(new_mixed_qkv) + cache_params.conv_states[layer_idx] = new_conv_state + else: + mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len]) + + mixed_qkv = mixed_qkv.transpose(1, 2) + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim, + self.key_dim, + self.value_dim, + ], + dim=-1, + ) + query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim) + key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim) + value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim) + + beta = b.sigmoid() + # If the model is loaded in fp16, without the .float() here, A might be -inf + g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) + if self.num_v_heads // self.num_k_heads > 1: + query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + + core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule( + self, + query, + key, + value, + g=g, + beta=beta, + initial_state=recurrent_state, + output_final_state=cache_params is not None, + use_qk_l2norm_in_kernel=True, + ) + + # Update cache + if cache_params is not None: + cache_params.recurrent_states[layer_idx] = last_recurrent_state + + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, self.head_v_dim) + z = z.reshape(-1, self.head_v_dim) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(batch_size, seq_len, -1) + + output = self.out_proj(core_attn_out) + return output + + +# This torch.nn.Module represents the GatedDeltaNet layer in its recurrent form. +# It is required for converting the GatedDeltaNet layer with OpenVINO using the ModuleExtension mechanism. 
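# Illustrative sketch (not part of the patch itself): a single step of the delta-rule
# recurrence that RecurrentAttentionCell below unrolls over the sequence, written as a
# standalone function. Shapes follow the comments in the class: q_t, k_t are (B, H, D1),
# v_t is (B, H, D2), g_t (log-decay) and beta_t are (B, H), state is (B, H, D1, D2).
import torch

def delta_rule_step(q_t, k_t, v_t, g_t, beta_t, state):
    state = state * g_t.exp()[..., None, None]                # decay the recurrent state
    kv_mem = (state * k_t.unsqueeze(-1)).sum(dim=-2)          # k_t^T state, shape (B, H, D2)
    delta = (v_t - kv_mem) * beta_t.unsqueeze(-1)             # gated delta-rule correction
    state = state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)   # rank-1 update of the state
    out_t = (state * q_t.unsqueeze(-1)).sum(dim=-2)           # read-out q_t^T state
    return out_t, state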
+class RecurrentAttentionCell(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward( + self, + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ): + _, _, sequence_length, _ = key.shape + core_attn_out = torch.zeros_like(value) + + for i in range(sequence_length): + q_t = query[:, :, i] + k_t = key[:, :, i] + v_t = value[:, :, i] + g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1) + beta_t = beta[:, :, i].unsqueeze(-1) + + last_recurrent_state = last_recurrent_state * g_t + kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + delta = (v_t - kv_mem) * beta_t + last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) + core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + + # This is a workaround to ensure a single output from the torch.nn.Module. + # The OpenVINO ModuleExtension mechanism has a limitation and expects + # the module to produce only one output. + output_cell = torch.cat([core_attn_out.flatten(), last_recurrent_state.flatten()], dim=0) + return output_cell + + +class Qwen3_5ModelPatcher(OVDecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5DynamicCache + + from openvino.frontend.pytorch import ConversionExtension, ModuleExtension + + from ._ov_ops import convert_recurrent_attention_cell + + super().__init__(config, model, model_kwargs) + + # Detect VLM vs text-only model + self._is_vlm = hasattr(self._model.model, "language_model") + if self._is_vlm: + self._text_model = self._model.model.language_model + self._text_config = self._model.config.text_config + else: + self._text_model = self._model.model + self._text_config = self._model.model.config + + class Qwen3_5DynamicCacheWrap(Qwen3_5DynamicCache): + def __init__(self, config, conv_states, recurrent_states, key_cache, value_cache): + # Call parent constructor with all required arguments + super().__init__(config=config) + + self.conv_states = conv_states + self.recurrent_states = recurrent_states + self.key_cache = key_cache + self.value_cache = value_cache + self.full_attn_mapping = {} + self.linear_attn_mapping = {} + full_attn_layer_idx = 0 + linear_attn_layer_idx = 0 + for i in range(len(config.layer_types)): + if self.layer_types[i] == "full_attention": + self.full_attn_mapping[i] = full_attn_layer_idx + full_attn_layer_idx += 1 + elif self.layer_types[i] == "linear_attention": + self.linear_attn_mapping[i] = linear_attn_layer_idx + linear_attn_layer_idx += 1 + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # map layer_idx to key_cache (value_cache) idx + layer_idx = self.full_attn_mapping[layer_idx] + if self.key_cache[layer_idx] is None: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + layer_idx = self.full_attn_mapping[layer_idx] + if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None: + return 0 + return self.key_cache[layer_idx].shape[-2] + + @property + def has_previous_state(self): + """We have a previous state if the last linear (conv) layer was already updated.""" + layer_idx = self.linear_attn_mapping[self.last_linear_layer] + return self.conv_states[layer_idx] is not None + + # the patch is needed to include KV-cache, Conv, and SSM states in the inputs and outputs. + def patched_forward( + input_ids=None, + attention_mask=None, + cache_params=None, + inputs_embeds=None, + position_ids=None, + ): + text_config = self._text_config + num_full_attn_layers = text_config.layer_types.count("full_attention") + num_linear_attn_layers = text_config.layer_types.count("linear_attention") + + use_cache = False + wrapped_cache_params = None + if cache_params is not None: + use_cache = True + conv_states = [] + recurrent_states = [] + key_cache = [] + value_cache = [] + + # decouple ssm_states, conv_states, keys and values from cache_params + for idx in range(num_linear_attn_layers): + conv_states.append(cache_params[2 * idx]) + recurrent_states.append(cache_params[2 * idx + 1]) + + for idx in range(num_full_attn_layers): + key_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx]) + value_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx + 1]) + + wrapped_cache_params = Qwen3_5DynamicCacheWrap( + text_config, conv_states, recurrent_states, key_cache, value_cache + ) + + if self._is_vlm: + # VLM case: call language model through the composite model + outputs_lm = self._text_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + hidden_states = outputs_lm[0] + logits = self._model.lm_head(hidden_states) + past_kv = outputs_lm.past_key_values + else: + causal_lm_output = self.model_orig_forward( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + logits = causal_lm_output.logits + past_kv = causal_lm_output.past_key_values + outputs = { + "logits": logits, + } + + if use_cache: + present_key_values = [] + for idx in range(num_linear_attn_layers): + present_key_values.append(past_kv.conv_states[idx]) + present_key_values.append(past_kv.recurrent_states[idx]) + + for idx in range(num_full_attn_layers): + present_key_values.append(past_kv.key_cache[idx]) + present_key_values.append(past_kv.value_cache[idx]) + + outputs["present_key_values"] = present_key_values + + return outputs + + self.patched_forward = patched_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = patched_forward + + self.module_extensions = { + RecurrentAttentionCell: ModuleExtension(RecurrentAttentionCell, "RecurrentAttentionCellOp"), + } + self.conversion_extensions = [ + ConversionExtension("RecurrentAttentionCellOp", convert_recurrent_attention_cell), + ] + + def __enter__(self): + super().__enter__() + setattr(self._model, self.orig_forward_name, self.patched_forward) + + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = 
decoder_layer.linear_attn + linear_attn_layer._orig_forward = linear_attn_layer.forward + linear_attn_layer.forward = types.MethodType(qwen3_5_gated_delta_net_forward, linear_attn_layer) + linear_attn_layer.recurrent_gated_delta_rule = patched_recurrent_gated_delta_rule + linear_attn_layer.recurrent_attention_cell = RecurrentAttentionCell() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + setattr(self._model, self.orig_forward_name, self.model_orig_forward) + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = decoder_layer.linear_attn + linear_attn_layer.forward = linear_attn_layer._orig_forward + + +class Qwen3_5VisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + # Adapted from Qwen3.5 VisionModel forward + # added attention_mask input instead of cu_seqlens for its internal calculation + # separated patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings) + return self.merger(hidden_states) + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + patch_qwen2vl_vision_blocks(self._model) + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 3d9a854e39..08011e44b2 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -295,6 +295,7 @@ def get_submodels(model): "qwen2_vl", "qwen2_5_vl", "qwen3_vl", + "qwen3_5", "got_ocr2", "gemma3", "idefics3", @@ -305,7 +306,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_5_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..28e39f0528 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -35,8 +35,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning) + +logger = logging.getLogger(__name__) + + if is_openvino_version("<", "2025.4.0"): - raise ImportError( + logger.warning( "Optimum-intel requires OpenVINO version 2025.4.0 or higher. " "Please upgrade OpenVINO to version 2025.4 or later. " f"The current version of OpenVINO is {_openvino_version}." 
@@ -51,8 +55,6 @@ ) -logger = logging.getLogger(__name__) - if is_nncf_available(): import nncf diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py index 214a4a7e8c..bd62e047bb 100644 --- a/optimum/intel/openvino/loaders.py +++ b/optimum/intel/openvino/loaders.py @@ -22,7 +22,7 @@ from openvino import Type from openvino import opset11 as ops from openvino.passes import Manager, Matcher, MatcherPass, WrapType -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizerBase from .utils import TEXTUAL_INVERSION_EMBEDDING_KEYS @@ -80,7 +80,7 @@ def load_textual_inversion( self, pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, - tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + tokenizer: Optional["PreTrainedTokenizerBase"] = None, # noqa: F821 text_encoder: Optional["openvino.Model"] = None, # noqa: F821 **kwargs, ): @@ -88,9 +88,9 @@ def load_textual_inversion( raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - elif not isinstance(self.tokenizer, PreTrainedTokenizer): + elif not isinstance(self.tokenizer, PreTrainedTokenizerBase): raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`" + f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizerBase` for calling `{self.load_textual_inversion.__name__}`" ) if not hasattr(self, "text_encoder"): diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a97416cea1..0d95cc233d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -25,16 +25,11 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import GenerationMixin -from transformers.utils import is_offline_mode from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig -from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel - -from ...exporters.openvino import export, main_export -from ..utils.import_utils import is_nncf_available -from ..utils.modeling_utils import _find_files_matching_pattern -from .configuration import ( +from optimum.exporters.openvino import export, main_export +from optimum.intel.openvino.configuration import ( _DEFAULT_4BIT_WQ_CONFIG, OVConfig, OVQuantizationConfigBase, @@ -43,7 +38,7 @@ _quantization_config_from_dict, get_default_quantization_config, ) -from .utils import ( +from optimum.intel.openvino.utils import ( ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, @@ -52,6 +47,15 @@ classproperty, model_has_dynamic_inputs, ) +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available, is_transformers_version +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern +from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel + + +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode core = Core() @@ -261,21 +265,21 @@ def __init__( if self.can_generate(): self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may 
have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) - + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) else: self.generation_config = None @@ -793,16 +797,14 @@ def _export( **kwargs, ): """ - Export a vanilla Transformers model into an ONNX model using `transformers.onnx.export_onnx`. + Load and export a model to the OpenVINO IR. Arguments: model_id (`str` or `Path`): The directory from which to load the model. Can be either: - The model id of a pretrained model hosted inside a model repo on huggingface.co. - - The path to a directory containing the model weights. save_dir (`str` or `Path`): - The directory where the exported ONNX model should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache directory for transformers. + - The path to a directory containing the model weights. token (Optional[Union[bool, str]], defaults to `None`): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3b95b5f276..66e036bc37 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,8 +1449,8 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid"]: - # LFM2 and GraniteMoeHybrid (Granite-4.0) require the attention mask + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_5_text"]: + # LFM2, GraniteMoeHybrid (Granite-4.0), and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask # for the decoding step after the first token so use attention mask of ones. 
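As a rough illustration of the masking rule described in the comment above, the decode-step behaviour can be sketched as follows; `decode_step_attention_mask` is a hypothetical helper used only for this example, not a function in optimum-intel:

import numpy as np

def decode_step_attention_mask(model_type: str, input_ids: np.ndarray, past_len: int) -> np.ndarray:
    # Hybrid-attention models need the mask to span the full context (past tokens plus the new ones),
    # so the default mask built by OVModelForCausalLM is kept as-is for them.
    if model_type in ("lfm2", "granitemoehybrid", "qwen3_5_text"):
        return np.ones((input_ids.shape[0], past_len + input_ids.shape[1]), dtype=np.int64)
    # Pure SSM models such as Mamba ignore the mask after the first token, so a mask of ones suffices.
    return np.ones_like(input_ids, dtype=np.int64)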
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index de5b7e1b39..dc33cbfa38 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -50,7 +50,7 @@ from huggingface_hub.utils import validate_hf_hub_args from openvino import Core from openvino._offline_transformations import compress_model_transformation -from transformers import CLIPFeatureExtractor, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTokenizer from transformers.modeling_outputs import ModelOutput from transformers.utils import http_user_agent @@ -170,7 +170,7 @@ def __init__( tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, tokenizer_3: Optional[CLIPTokenizer] = None, - feature_extractor: Optional[CLIPFeatureExtractor] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, requires_aesthetics_score: bool = False, diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index 2e2ee2d63c..952f23a225 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -31,16 +31,27 @@ from transformers.file_utils import add_start_docstrings from transformers.modeling_outputs import ModelOutput from transformers.models.clip.modeling_clip import CLIPOutput -from transformers.utils import is_offline_mode +from optimum.exporters.openvino import main_export from optimum.exporters.tasks import TasksManager +from optimum.intel.openvino.configuration import ( + OVConfig, + OVWeightQuantizationConfig, +) +from optimum.intel.openvino.modeling import MODEL_START_DOCSTRING, OVModel +from optimum.intel.openvino.modeling_base import OVModelHostMixin +from optimum.intel.openvino.utils import ( + TemporaryDirectory, + classproperty, +) +from optimum.intel.utils.import_utils import is_huggingface_hub_version +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification + -from ...exporters.openvino import main_export -from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification -from .configuration import OVConfig, OVWeightQuantizationConfig -from .modeling import MODEL_START_DOCSTRING, OVModel -from .modeling_base import OVModelHostMixin -from .utils import TemporaryDirectory, classproperty +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode logger = logging.getLogger(__name__) diff --git a/optimum/intel/openvino/modeling_sam.py b/optimum/intel/openvino/modeling_sam.py index 75e987557f..18f46661a3 100644 --- a/optimum/intel/openvino/modeling_sam.py +++ b/optimum/intel/openvino/modeling_sam.py @@ -403,7 +403,7 @@ def get_image_wide_positional_embeddings(self): x_embed = x_embed / size positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) - return positional_embedding.permute(2, 0, 1).unsqueeze(0) + return positional_embedding.permute(2, 0, 1).unsqueeze(0).detach() def get_image_features(self, pixel_values, *args, **kwargs): return torch.from_numpy(self.vision_encoder(pixel_values).image_embeddings) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index e6e99ffd56..cb8d6b7fa4 100644 --- 
a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -27,7 +27,6 @@ AutoConfig, AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, GenerationConfig, Pix2StructForConditionalGeneration, PretrainedConfig, @@ -56,6 +55,18 @@ ) +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + core = Core() logger = logging.getLogger(__name__) @@ -359,20 +370,21 @@ def __init__( generation_config = kwargs.get("generation_config", None) self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) self._openvino_config = None if quantization_config: @@ -1037,7 +1049,7 @@ def _reorder_cache( INPUTS_DOCSTRING, ) class OVModelForVision2Seq(OVModelForSeq2SeqLM): - auto_model_class = AutoModelForVision2Seq + auto_model_class = transformers_auto_class main_input_name = "pixel_values" export_feature = "image-to-text" diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 2fe8cb0ea0..ba002befde 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -190,7 +190,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl"]) and position_ids.ndim != 3: + if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5"]) and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3431,6 +3431,12 @@ def preprocess_inputs( Qwen3VLVisionModel, Qwen3VLVisionRotaryEmbedding, ) + + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5Model, + Qwen3_5VisionModel, + Qwen3_5VisionRotaryEmbedding, + ) else: class Qwen3VLModel: @@ -3439,6 +3445,12 @@ class Qwen3VLModel: class Qwen3VLVisionModel: pass + class Qwen3_5Model: + pass + + class Qwen3_5VisionModel: + pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 @@ -4802,6 +4814,355 @@ def preprocess_inputs( return inputs +# The inheritance from Qwen3_5Model is needed to get access to methods: +# get_placeholder_mask(), get_rope_index(), get_image_features(), get_video_features(), compute_3d_position_ids() +# +# and inheritance from Qwen3_5VisionModel is needed for accessing the following method: +# rot_pos_emb() +class _OVQwen3_5ForCausalLM(OVModelForVisualCausalLM, Qwen3_5Model, Qwen3_5VisionModel): + additional_parts = ["vision_embeddings_merger", "vision_embeddings_pos"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = None, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + if is_transformers_version("<", "4.57.0"): + raise Exception("Qwen3.5 is not supported in transformers versions earlier than 4.57.0.") + + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + self.rope_deltas = None # cache rope_deltas here + + self.num_grid_per_side = int(config.vision_config.num_position_embeddings**0.5) + self.spatial_merge_size = config.vision_config.spatial_merge_size + head_dim = 
config.vision_config.hidden_size // config.vision_config.num_heads + self.rotary_pos_emb = Qwen3_5VisionRotaryEmbedding(head_dim // 2) + + def __setattr__(self, name, value): + OVModelForVisualCausalLM.__setattr__(self, name, value) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif inputs_embeds is not None: + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if cache_position[0] != 0: + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + model_inputs = {"input_ids": input_ids, "inputs_embeds": None} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + "cache_position": cache_position, + } + ) + return model_inputs + + # Adapted from Qwen3_5VisionModel.fast_pos_embed_interpolate + # This method needs to be changed, as instead of running self.pos_embed of type nn.Embedding, openvino model needs to be inferred (self.vision_embeddings_pos) + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list) + weight_tensor = torch.tensor(weight_list) + pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] 
+ pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.vision_config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = torch.from_numpy(self.vision_embeddings(pixel_values)[0]) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + ) + return res[0] + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. + """ + image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) + split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. 
+ """ + return self.get_image_features(pixel_values_videos, video_grid_thw) + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + cache_position=None, + **kwargs, + ): + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + if pixel_values is not None and input_ids.shape[1] != 1: + image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0) + n_image_tokens = (input_ids == self.config.image_token_id).sum().item() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded.to(inputs_embeds.device) + + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None and input_ids.shape[1] != 1: + video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0) + n_video_tokens = (input_ids == self.config.video_token_id).sum().item() + n_video_features = video_embeds.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) + + mask = input_ids == self.config.video_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + video_mask = mask_expanded.to(inputs_embeds.device) + + video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + # Construct mm_token_type_ids from input_ids + mm_token_type_ids = torch.zeros_like(input_ids, dtype=torch.int) + mm_token_type_ids[input_ids == self.config.image_token_id] = 1 + mm_token_type_ids[input_ids == self.config.video_token_id] = 2 + position_ids, rope_deltas = self.get_rope_index( + input_ids, mm_token_type_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0 + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + return inputs_embeds, attention_mask, position_ids + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: 
Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + video: Optional["VideoInput"] = None, + audio: Optional[np.ndarray] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if audio is not None: + raise ValueError("Audio input is not supported") + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + if image is not None: + conversation[0]["content"].insert(0, {"type": "image"}) + if video is not None: + conversation[0]["content"].insert(0, {"type": "video"}) + + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt") + return inputs + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + def generate(self, *args, **kwargs): + # Clear cached rope delta from previous generations + self.rope_deltas = None + + return super().generate(*args, **kwargs) + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -4823,5 +5184,7 @@ def preprocess_inputs( "phi4_multimodal": _OVPhi4MMForCausalLM, "llama4": _OVLlama4ForCausalLM, "qwen3_vl": _OVQwen3VLForCausalLM, + "qwen3_5": _OVQwen3_5ForCausalLM, + "qwen3_5_text": _OVQwen3_5ForCausalLM, "minicpmo": _OVMiniCPMOForCausalLM, } diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 818eb41726..be6ac41d31 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -32,7 +32,6 @@ from openvino import Type as OVType from packaging.version import Version from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size from optimum.intel.utils.import_utils import is_torch_version @@ -239,18 +238,6 @@ def maybe_convert_tokenizer_to_fast( return hf_tokenizer -def use_external_data_format(num_parameters: int) -> bool: - """ - Returns whether or not the model requires using external data format for the ONNX export - Args: - num_parameters: Number of parameter on the model - Returns: - True if model.num_parameters() * size_of(float32) >= 2Gb False otherwise - """ - - return compute_serialized_parameters_size(num_parameters, ParameterFormat.Float) >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT - - def _is_timm_ov_dir(model_dir): config_file = None has_xml = False diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 7ea4102ec7..9ac24d06d7 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -13,15 +13,20 @@ # limitations under the License. 
import contextlib -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, Optional, Tuple import transformers.pipelines from transformers import AutoConfig +from optimum.intel.utils import ( + IPEX_IMPORT_ERROR, + OPENVINO_IMPORT_ERROR, + is_ipex_available, + is_openvino_available, + is_transformers_version, +) from optimum.utils.logging import get_logger -from ..utils import IPEX_IMPORT_ERROR, OPENVINO_IMPORT_ERROR, is_ipex_available, is_openvino_available - if TYPE_CHECKING: from transformers import PretrainedConfig @@ -138,12 +143,15 @@ def get_openvino_model_class( # a modified transformers.pipelines.base.infer_framework_load_model that loads OpenVINO models def openvino_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for OVModel - model_kwargs.pop("model_classes", None) ov_model_class = get_openvino_model_class(task, config, model, **model_kwargs) ov_model = ov_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, OVBaseModel): @@ -154,7 +162,10 @@ def openvino_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ov_model + if is_transformers_version("<", "5"): + return "pt", ov_model + + return ov_model def get_ipex_model_class(task: str, **model_kwargs): @@ -173,12 +184,15 @@ def get_ipex_model_class(task: str, **model_kwargs): # a modified transformers.pipelines.base.infer_framework_load_model that loads IPEX models def ipex_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for IPEXModel - model_kwargs.pop("model_classes", None) ipex_model_class = get_ipex_model_class(task, **model_kwargs) ipex_model = ipex_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, IPEXModel): @@ -189,27 +203,33 @@ def ipex_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ipex_model + if is_transformers_version("<", "5"): + return "pt", ipex_model + + return ipex_model @contextlib.contextmanager def patch_pipelines_to_load_accelerator_model(accelerator: str): - original_infer_framework_load_model = transformers.pipelines.infer_framework_load_model + target_fn = "infer_framework_load_model" if is_transformers_version("<", "5") else "load_model" + + original_infer_framework_load_model = getattr(transformers.pipelines, target_fn) if accelerator == "openvino": if not is_openvino_available(): raise ImportError(OPENVINO_IMPORT_ERROR.format("`accelerator=openvino`")) - transformers.pipelines.infer_framework_load_model = openvino_infer_framework_load_model + setattr(transformers.pipelines, target_fn, openvino_infer_framework_load_model) + elif accelerator == "ipex": if not is_ipex_available(): raise 
ImportError(IPEX_IMPORT_ERROR.format("`accelerator=ipex`")) - transformers.pipelines.infer_framework_load_model = ipex_infer_framework_load_model + setattr(transformers.pipelines, target_fn, ipex_infer_framework_load_model) else: raise ValueError(f"Accelerator '{accelerator}' is not supported. Only 'openvino' and 'ipex' are supported.") try: yield finally: - transformers.pipelines.infer_framework_load_model = original_infer_framework_load_model + setattr(transformers.pipelines, target_fn, original_infer_framework_load_model) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..d5e44d06d0 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -119,6 +119,15 @@ pass +_huggingface_hub_available = importlib.util.find_spec("huggingface_hub") is not None +_huggingface_hub_version = "N/A" +if _huggingface_hub_available: + try: + _huggingface_hub_version = importlib_metadata.version("huggingface_hub") + except importlib_metadata.PackageNotFoundError: + _huggingface_hub_available = False + + _safetensors_version = "N/A" _safetensors_available = importlib.util.find_spec("safetensors") is not None if _safetensors_available: @@ -486,6 +495,15 @@ def is_sentence_transformers_version(operation: str, version: str): return compare_versions(parse(_sentence_transformers_version), operation, version) +def is_huggingface_hub_version(operation: str, version: str): + """ + Compare the current huggingface_hub version to a given reference with an operation. + """ + if not _huggingface_hub_available: + return False + return compare_versions(parse(_huggingface_hub_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. 
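For clarity, the new `is_huggingface_hub_version` helper follows the same pattern as the existing `is_*_version` checks, and is what the `is_offline_mode` import fallback added earlier in this diff builds on; the intended usage is restated below:

from optimum.intel.utils.import_utils import is_huggingface_hub_version

# Per the version gate used in this diff: huggingface_hub >= 1.2.1 provides is_offline_mode
# directly, while older environments fall back to the transformers re-export.
if is_huggingface_hub_version(">=", "1.2.1"):
    from huggingface_hub import is_offline_mode
else:
    from transformers.utils import is_offline_mode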
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..69de1770ce 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -23,14 +23,18 @@ from typing import Dict, List, Optional, Type, Union import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfApi, get_token, hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.hf_api import file_exists from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager - -from .import_utils import is_diffusers_available, is_numa_available, is_open_clip_available, is_psutil_available +from optimum.intel.utils.import_utils import ( + is_diffusers_available, + is_numa_available, + is_open_clip_available, + is_psutil_available, +) if is_diffusers_available(): @@ -115,7 +119,7 @@ def _find_files_matching_pattern( model_path = Path(model_name_or_path) if not isinstance(model_name_or_path, Path) else model_name_or_path if isinstance(use_auth_token, bool): - token = HfFolder().get_token() + token = get_token() else: token = use_auth_token diff --git a/setup.py b/setup.py index b86c176463..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,10 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", - "transformers>=4.45,<4.58", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "transformers>=4.45,<5.1", "setuptools", + "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0", @@ -55,7 +56,7 @@ "sentence-transformers", "open_clip_torch>=2.26.1", "peft", - "datasets[audio]>=1.4.0,<4.0.0", + "datasets>=1.4.0,<4.0.0", "tbb", "langchain-huggingface", "hf_xet", @@ -69,10 +70,10 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers"], + "diffusers": ["diffusers", "transformers<5"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 91121023d8..c3a89fe3aa 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -23,10 +23,12 @@ ) from optimum.exporters.openvino.model_configs import ( + AfmoeOpenVINOConfig, BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, + Qwen3_5TextOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES @@ -46,21 +48,17 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2", - "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", "blenderbot", "blenderbot-small", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neo", "gpt_neox", "llama", - "marian", "mistral", "mixtral", "mpt", @@ -68,38 +66,29 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "opt", "pegasus", "phi", - "internlm2", - "orion", "falcon", "falcon-40b", 
"persimmon", "biogpt", "gpt_neox_japanese", "xglm", - "aquila", - "aquila2", - "xverse", - "internlm", - "jais", - "decilm", "gemma", "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) if is_transformers_version(">=", "4.53.0"): @@ -114,11 +103,15 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") @@ -126,11 +119,15 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): - SUPPORTED_ARCHITECTURES += ("llama4", "qwen3", "qwen3_moe") + SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llama4",) if is_transformers_version(">=", "4.51.3"): SUPPORTED_ARCHITECTURES += ("glm4",) @@ -138,10 +135,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.52.1"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version("<", "4.54.0"): @@ -159,6 +157,28 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + "baichuan2", + "baichuan2-13b", + # remote modeling code failing with v5 + "aquila", + "xverse", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "marian", + # "zamba2", + ) GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { @@ -286,11 +306,13 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", 
str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): if "deepseek_v2" in supported_architectures: supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") + if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -300,6 +322,22 @@ def test_find_untested_architectures(self): if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "5"): + supported_architectures -= { + "phimoe", + "bitnet", + "dbrx", + "zamba2", + "marian", + "llama4", + "exaone4", + } + + # qwen3_5_text a part of qwen3_5 architecture and is tested in seq2seq group + if is_transformers_version(">=", str(Qwen3_5TextOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"qwen3_5_text"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures @@ -375,7 +413,17 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch in ["qwen", "arctic", "chatglm4", "gpt_oss_mxfp4"]: + if model_arch in [ + "qwen", + "arctic", + "chatglm4", + "gpt_oss_mxfp4", + "llama", + "lfm2", + "gemma3_text", + "llama4", + "exaone4", + ]: transformers_model.to(torch.float32) with torch.no_grad(): @@ -392,7 +440,8 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + inputs = "Today is a nice day and" if model_arch == "decilm" else "The quick brown fox jumps over the" + tokens = tokenizer([inputs, "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -468,7 +517,7 @@ def test_pipeline(self, model_arch): tokenizer._convert_tokens_to_ids = lambda x: 0 additional_args = {} - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_args["use_model_defaults"] = False set_seed(SEED) @@ -739,11 +788,11 @@ def test_beam_search(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch == "arctic" or "mxfp4" in model_arch: + if model_arch in ["arctic", "gemma3_text"] or "mxfp4" in model_arch: transformers_model.to(torch.float32) additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model - if model_arch in ["gemma2", "gemma3_text"]: + if model_arch in ["gemma2", "gemma3_text"] and is_transformers_version("<", "4.53.0"): 
patch_update_causal_mask(transformers_model, "4.43.0") transformers_model._supports_cache_class = True transformers_model.generation_config.cache_implementation = None @@ -774,7 +823,7 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_inputs["use_model_defaults"] = False for gen_config in gen_configs: @@ -823,7 +872,7 @@ def test_beam_search(self, model_arch): def test_load_with_different_dtype(self): set_seed(SEED) - model_id = MODEL_NAMES["llama"] + model_id = MODEL_NAMES["mistral"] pt_model = AutoModelForCausalLM.from_pretrained( model_id, ) @@ -846,7 +895,11 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") + # TODO (@echarlaix) transformers v5 support + @pytest.mark.skipif( + is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), + reason="Eagle3 requires transformers >= 4.54", + ) def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 8efc69f8ec..bc58c91796 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -38,7 +38,7 @@ OVPipelineForText2Video, ) from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import is_diffusers_version +from optimum.intel.utils.import_utils import is_diffusers_version, is_transformers_version from optimum.utils.testing_utils import require_diffusers @@ -80,7 +80,6 @@ class OVPipelineForText2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", "sana", ] @@ -93,6 +92,10 @@ class OVPipelineForText2ImageTest(unittest.TestCase): if is_diffusers_version(">=", "0.33.0"): SUPPORTED_ARCHITECTURES.extend(["sana-sprint"]) + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -499,9 +502,11 @@ class OVPipelineForImage2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", ] + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image TASK = "image-to-image" @@ -754,7 +759,11 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3", "flux", "flux-fill"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "flux", "flux-fill"] + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting TASK = "inpainting" diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index d1c373e2bc..42872d2685 100644 --- 
a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,13 +98,13 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM}) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"exaone4": OVModelForCausalLM, "lfm2": OVModelForCausalLM}) if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index edbc01e310..9690496089 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -128,24 +128,30 @@ class OVCLIExportTestCase(unittest.TestCase): [ ("text-generation", "lfm2"), ("text-generation-with-past", "lfm2"), + ] + ) + + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_eagle3"), ] ) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "zamba2"), ] ) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "exaone4"), ] ) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "bitnet"), @@ -421,7 +427,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -445,7 +451,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( @@ -486,7 +492,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -503,48 +509,52 @@ class OVCLIExportTestCase(unittest.TestCase): "prompt_encoder_mask_decoder": {"int8": 49}, }, ), - ( - "image-text-to-text", - "internvl_chat", - "f8e4m3", - "--dataset contextual --num-samples 1 --trust-remote-code", - { - "lm_model": 15, - "text_embeddings_model": 0, - "vision_embeddings_model": 17, - }, - { - "lm_model": {"f8e4m3": 15}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"f8e4m3": 11}, - }, - ), ] + if is_transformers_version("<", "5"): + SUPPORTED_QUANTIZATION_ARCHITECTURES.append( + ( + "image-text-to-text", + "internvl_chat", + "f8e4m3", + "--dataset contextual --num-samples 1 --trust-remote-code", + { + "lm_model": 15, + "text_embeddings_model": 0, + "vision_embeddings_model": 17, + }, + { + "lm_model": {"f8e4m3": 15}, + "text_embeddings_model": 
{"int8": 1}, + "vision_embeddings_model": {"f8e4m3": 11}, + }, + ), + ) + TRANSFORMERS_4BIT_CONFIGURATIONS = [ ( "text-generation-with-past", "opt125m", "int4 --sym --group-size 128", - {"model": {"int8": 4, "int4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 72}}, ), ( "text-generation-with-past", "opt125m", "int4 --group-size 64", - {"model": {"int8": 4, "int4": 144}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 144}}, ), ( "text-generation-with-past", "opt125m", "mxfp4", - {"model": {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "f4e2m1": 72, "f8e8m0": 72}}, ), ( "text-generation-with-past", "opt125m", "nf4", - {"model": {"int8": 4, "nf4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "nf4": 72}}, ), ( "text-generation-with-past", @@ -832,6 +842,8 @@ def test_filtered_architectures(cls): expected = {"qwen3_vl"} else: expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} @@ -1221,13 +1233,14 @@ def test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss_mxfp4", + # mxfp4 fixing saving broken since v5, fixed in https://github.com/huggingface/transformers/pull/43148, test can be added back for v5.3 + "gpt_oss_mxfp4" if is_transformers_version("<", "5") else "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 22, "int4": 4}}, + {"model": {"int8": 22, "int4": 4} if is_transformers_version("<", "5") else {"int8": 40, "int4": 0}}, {"model": 0}, ), ( diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 2d075e7874..584d798e88 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -45,7 +45,6 @@ class LLMPipelineTestCase(unittest.TestCase): "gpt_bigcode", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neox", @@ -53,27 +52,18 @@ class LLMPipelineTestCase(unittest.TestCase): "mistral", "mixtral", "phi", - "internlm2", - "orion", "falcon", "persimmon", "xglm", - "aquila", - "aquila2", - "internlm", - "jais", - "decilm", "gemma", "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", "granitemoe", ) @@ -81,12 +71,14 @@ class LLMPipelineTestCase(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "opt") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen",) - if is_transformers_version(">=", "4.49"): + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") @@ -94,7 +86,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm4",) if 
is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss",) @@ -103,6 +95,24 @@ class LLMPipelineTestCase(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + # remote modeling code failing with v5 + "aquila", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + ) + REMOTE_CODE_MODELS = ( "chatglm", "minicpm", @@ -202,9 +212,7 @@ def test_compare_outputs(self, model_arch): class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( - "llava", "llava_next", - "llava_next_video", # "minicpmv", # output is truncated for some reason "qwen2_vl", ) @@ -218,8 +226,10 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") REMOTE_CODE_MODELS = ( "minicpmv", @@ -251,9 +261,9 @@ def _get_model_class(self, model_arch): return AutoModelForImageTextToText elif model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration elif model_arch == "llava": from transformers import LlavaForConditionalGeneration @@ -457,8 +467,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54"): - self.skipTest("Eagle3 requires transformers >= 4.54") + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 32266ea54b..0c5011a908 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -34,6 +34,7 @@ from sentence_transformers import SentenceTransformer from transformers import ( AutoFeatureExtractor, + AutoImageProcessor, AutoModel, AutoModelForAudioClassification, AutoModelForAudioFrameClassification, @@ -53,7 +54,6 @@ pipeline, set_seed, ) -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TENSOR_ALIAS_TO_TYPE, TEST_IMAGE_URL @@ -221,7 +221,7 @@ def test_load_from_hub_and_save_visual_language_model(self): # anymore due to an internal bug in transformers model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + processor = 
AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" image = Image.open( requests.get( @@ -476,7 +476,7 @@ def test_load_from_hub_and_save_sam_model(self): self.assertEqual( loaded_model.prompt_encoder_mask_decoder.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT" ) - processor = get_preprocessor(self.OV_SAM_MODEL_ID) + processor = AutoProcessor.from_pretrained(self.OV_SAM_MODEL_ID) img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" input_points = [[[450, 600]]] raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") @@ -709,6 +709,8 @@ def test_load_model_from_hub(self): # verify could load both pytorch and openvino model (export argument should automatically infered) ov_exported_pipe = optimum_pipeline("text-generation", model_id, revision="pt", accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline("text-generation", model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) @@ -726,20 +728,21 @@ def test_load_model_from_hub(self): gc.collect() def test_seq2seq_load_from_hub(self): - model_id = "echarlaix/tiny-random-t5" + model_id = MODEL_NAMES["whisper"] + task = "automatic-speech-recognition" # verify could load both pytorch and openvino model (export argument should automatically infered) - ov_exported_pipe = optimum_pipeline("text2text-generation", model_id, accelerator="openvino") - ov_pipe = optimum_pipeline("text2text-generation", model_id, revision="ov", accelerator="openvino") + ov_exported_pipe = optimum_pipeline(task, model_id, accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline(task, model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) - if not ov_exported_pipe.model.decoder.stateful: - self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents) - self.assertTrue(OV_DECODER_WITH_PAST_NAME.replace(".xml", ".bin") in folder_contents) - ov_exported_pipe = optimum_pipeline("text2text-generation", tmpdirname, accelerator="openvino") + self.assertTrue(ov_exported_pipe.model._ov_model_paths["encoder"] in folder_contents) + self.assertTrue(ov_exported_pipe.model._ov_model_paths["decoder"] in folder_contents) + ov_exported_pipe = optimum_pipeline(task, tmpdirname, accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) del ov_exported_pipe @@ -754,14 +757,16 @@ class OVModelForSequenceClassificationIntegrationTest(unittest.TestCase): "convbert", "distilbert", "electra", - "flaubert", "ibert", "roberta", "roformer", "squeezebert", - "xlm", ) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -891,12 +896,12 @@ def test_pipeline(self, model_arch): pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" context = "My Name is Arthur and I live in Lyon." 
- outputs = pipe(question, context) + outputs = pipe(question=question, context=context) self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs["score"], 0.0) self.assertIsInstance(outputs["answer"], str) ov_pipe = optimum_pipeline("question-answering", model_id, accelerator="openvino") - ov_outputs = ov_pipe(question, context) + ov_outputs = ov_pipe(question=question, context=context) self.assertEqual(outputs["score"], ov_outputs["score"]) del model del ov_pipe @@ -1084,13 +1089,11 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "bert", "camembert", "convbert", - "data2vec-text", "deberta", "deberta-v2", "distilbert", "electra", "esm", - "flaubert", "ibert", "mobilebert", "mpnet", @@ -1099,7 +1102,6 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "roberta", "roformer", "squeezebert", - "xlm", "xlm-roberta", ) @@ -1107,6 +1109,10 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.51.0"): SUPPORTED_ARCHITECTURES += ("nystromformer",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("data2vec-text", "flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -1185,19 +1191,18 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**inputs) - for input_type in ["pt", "np"]: - inputs = preprocessor(images=image, return_tensors=input_type) - ov_outputs = ov_model(**inputs) - self.assertIn("logits", ov_outputs) - self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + inputs = preprocessor(images=image, return_tensors="pt") + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + # Compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) del transformers_model del ov_model gc.collect() @@ -1210,7 +1215,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) @@ -1596,14 +1601,14 @@ def _get_sample_image(self): image = Image.open(requests.get(url, stream=True).raw) return image - def test_load_from_hub_and_save_model(self): + def test_load_from_hub_and_save_model_openclip(self): loaded_model = OVModelOpenCLIPForZeroShotImageClassification.from_pretrained( self.OV_MODEL_ID_IR, device=OPENVINO_DEVICE ) tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a 
dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=loaded_model.config.text_config.context_length, @@ -1681,7 +1686,7 @@ def test_functions(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=model.config.text_config.context_length, @@ -1721,7 +1726,7 @@ def test_functions(self): self.assertTrue(torch.allclose(model_outputs.logits_per_image, res.logits_per_image, atol=1e-2)) model.reshape(1, -1) - reshaped_tokens = tokenizer.batch_encode_plus( + reshaped_tokens = tokenizer( ["a dog"], return_tensors="pt", max_length=model.config.text_config.context_length, @@ -1832,7 +1837,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.vision_encoder, OVSamVisionEncoder) self.assertIsInstance(ov_model.prompt_encoder_mask_decoder, OVSamPromptEncoder) @@ -1885,7 +1890,7 @@ def test_reshape(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertTrue(ov_model.is_dynamic) input_points = [[[450, 600]]] IMAGE = Image.open( @@ -1921,7 +1926,7 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForZeroShotImageClassification.from_pretrained( model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE ) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.config, PretrainedConfig) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 3dac24c69a..c2576db98b 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -17,18 +17,13 @@ from transformers import AutoTokenizer, pipeline from utils_tests import OPENVINO_DEVICE from optimum.intel import ( - OVModelForAudioClassification, OVModelForCausalLM, - OVModelForFeatureExtraction, - OVModelForImageClassification, OVModelForMaskedLM, OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, - OVModelForTokenClassification, - OVStableDiffusionPipeline, ) - +from optimum.intel.utils.import_utils import is_transformers_version # Make sure that common architectures are used in combination with common tasks MODEL_NAMES = { @@ -58,6 +53,9 @@ def test_pipeline(self, model_id): """ tokenizer = AutoTokenizer.from_pretrained(model_id) model_class_str = MODEL_NAMES[model_id] + if model_class_str == "OVModelForSeq2SeqLM" and is_transformers_version(">=", "5"): + self.skipTest("text2text-generation pipeline was deprecated in transformers v5") + model_class = eval(model_class_str) model = model_class.from_pretrained(model_id, device=OPENVINO_DEVICE) model.save_pretrained(f"{model_id}_ov") @@ -69,9 +67,13 @@ def test_pipeline(self, model_id): elif model_class_str == "OVModelForMaskedLM": input_text[0] = f"{input_text[0]} {tokenizer.mask_token}" - if model_class_str in TASKS: - task = TASKS[model_class_str] - pipe = pipeline(task, 
model=model, tokenizer=tokenizer) + task = TASKS[model_class_str] + pipe = pipeline(task, model=model, tokenizer=tokenizer) + + if task == "question-answering": + # positional arguments deprecated for question-answering pipeline since v5 + pipe(question=input_text[0], context=input_text[1]) + else: pipe(*input_text) gc.collect() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index bfc6ec976a..ec9d7b84f7 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -135,8 +135,8 @@ class OVQuantizerTest(unittest.TestCase): (OVModelForSequenceClassification, "bert", 32, 35), (OVModelForCausalLM, "gpt2", 31, 22), (OVSentenceTransformer, "sentence-transformers-bert", 12, 15), - (OVModelForFeatureExtraction, "blenderbot", 33, 35), - (OVModelForMaskedLM, "roberta", 32, 34), + (OVModelForFeatureExtraction, "blenderbot", 33, 35 if is_transformers_version("<", "5") else 36), + (OVModelForMaskedLM, "roberta", 32, 34 if is_transformers_version("<", "5") else 35), (OVModelForZeroShotImageClassification, "clip", 65, 65), ) SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET = [ @@ -269,7 +269,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -299,7 +299,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( @@ -342,14 +342,11 @@ class OVQuantizerTest(unittest.TestCase): ), {"encoder": 30, "decoder": 52, "decoder_with_past": 61} if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52, - }, + else {"encoder": 30, "decoder": 52}, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -596,7 +593,9 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForCausalLM, "gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 43),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ( + (OVModelForCausalLM, "opt125m", 62 if is_transformers_version("<", "5") else 64, 43), + ) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),) SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) @@ -1063,8 +1062,6 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), (OVModelForVisualCausalLM, "llava", False), - (OVModelForVisualCausalLM, "llava_next_video", False), - (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1077,13 +1074,21 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "cohere2", False)) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "exaone4", True)) if is_transformers_version(">=", "4.57.0"): 
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_vl", False)) SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "hunyuan_v1_dense", False)) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( + [ + (OVModelForVisualCausalLM, "llava_next_video", False), + (OVModelForVisualCausalLM, "minicpmv", True), + ] + ) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), @@ -1226,6 +1231,8 @@ def test_filtered_architectures(cls): expected.add("qwen3_vl") if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat", "exaone4"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} @@ -1802,31 +1809,35 @@ class OVPipelineQuantizationTest(unittest.TestCase): {"encoder": 14, "decoder": 22}, {"encoder": {"int8": 14}, "decoder": {"int8": 22}}, ), - ( - OVModelForVisualCausalLM, - "internvl_chat", - True, - dict( - quantization_configs={ - "lm_model": dict(bits=8, weight_only=True), - "vision_embeddings_model": dict(bits=8, weight_only=False), + ] + + if is_transformers_version("<", "5"): + PIPELINE_QUANTIZATION_SCOPE.append( + ( + OVModelForVisualCausalLM, + "internvl_chat", + True, + dict( + quantization_configs={ + "lm_model": dict(bits=8, weight_only=True), + "vision_embeddings_model": dict(bits=8, weight_only=False), + }, + dataset="contextual", + num_samples=1, + default_config=dict(bits=8, sym=True, weight_only=True), + ), + { + "lm_model": 0, + "text_embeddings_model": 0, + "vision_embeddings_model": 15, + }, + { + "lm_model": {"int8": 30}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 11}, }, - dataset="contextual", - num_samples=1, - default_config=dict(bits=8, sym=True, weight_only=True), ), - { - "lm_model": 0, - "text_embeddings_model": 0, - "vision_embeddings_model": 15, - }, - { - "lm_model": {"int8": 30}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 11}, - }, - ), - ] + ) if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.54.0"): PIPELINE_QUANTIZATION_SCOPE.extend( @@ -2024,7 +2035,7 @@ def preprocess_function(examples, tokenizer): # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir, device=OPENVINO_DEVICE) - tokens = tokenizer.encode_plus( + tokens = tokenizer( "This is a sample question", "This is a sample context", add_special_tokens=True, return_tensors="pt" ) model(**tokens, return_dict=True) @@ -2499,7 +2510,7 @@ def check_model_inference(ov_model, model_id, trust_remote_code): if isinstance(ov_model, OVModelForSpeechSeq2Seq): input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} ov_model.generate(input_features, generation_config=gen_config, **generate_kwrgs) else: diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index ac91b2f4ad..9ceab2d227 100644 --- a/tests/openvino/test_seq2seq.py +++ 
b/tests/openvino/test_seq2seq.py @@ -33,7 +33,6 @@ AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, AutoModelForTextToSpectrogram, - AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, GenerationConfig, @@ -42,7 +41,6 @@ set_seed, ) from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TEST_IMAGE_URL, Timer @@ -70,6 +68,18 @@ from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -135,7 +145,6 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): "longt5", "m2m_100", "mbart", - "mt5", "pegasus", "t5", ) @@ -144,10 +153,20 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "text2text-generation" GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2025.5.0")): + UNSUPPORTED_ARCHITECTURES = set() + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( + "<", "5" + ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + else: + UNSUPPORTED_ARCHITECTURES.add("marian") + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("mt5",) + else: + UNSUPPORTED_ARCHITECTURES.add("mt5") SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): @@ -214,6 +233,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since summarization/translation/text2text-generation pipelines are deprecated", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] @@ -336,7 +359,7 @@ def test_compare_to_transformers(self, model_arch): self._check_openvino_model_attributes(ov_model, use_cache=True, stateful=True) self._check_openvino_model_attributes(ov_model_stateless, use_cache=True, stateful=False) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) data = self._generate_random_audio_data() pt_features = processor.feature_extractor(data, return_tensors="pt") decoder_start_token_id = transformers_model.config.decoder_start_token_id @@ -361,7 +384,7 @@ def test_compare_to_transformers(self, model_arch): ) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} gen_config = GenerationConfig( @@ -391,11 +414,14 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow 
@slow + @pytest.mark.skipif( + is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames" + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=model, @@ -422,7 +448,7 @@ class OVModelForVision2SeqIntegrationTest(OVSeq2SeqTestMixin): UNSUPPORTED_ARCHITECTURES = {"got_ocr2", "pix2struct"} TASK = "image-to-text" OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -493,6 +519,10 @@ def test_compare_to_transformers(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since image-to-text pipelines is deprecated", + ) def test_pipeline(self, model_arch: str): set_seed(SEED) model_id = MODEL_NAMES[model_arch] @@ -521,31 +551,45 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "internvl_chat", "llava", "llava_next", "llava_next_mistral", - "llava_next_video", - "llava-qwen2", - "minicpmv", - "phi3_v", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] SUPPORT_AUDIO = [] + UNSUPPORTED_ARCHITECTURES = {"phi4_multimodal"} OVMODEL_CLASS = OVModelForVisualCausalLM TASK = "image-text-to-text" if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] + SUPPORTED_ARCHITECTURES += ["maira2"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2", "phi4mm"] + SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"] SUPPORT_VIDEO.append("qwen2_5_vl") - SUPPORT_AUDIO.append("phi4mm") - if is_transformers_version(">", "4.49"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] - if is_transformers_version(">=", "4.51"): + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["got_ocr2"] + + if is_transformers_version("<", "4.54.0"): + # remote code models differs after transformers v4.54 + SUPPORTED_ARCHITECTURES += ["phi4mm"] + SUPPORT_AUDIO.append("phi4mm") + + if is_transformers_version(">=", "4.50"): + SUPPORTED_ARCHITECTURES += ["gemma3"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["smolvlm"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): # SUPPORTED_ARCHITECTURES += ["llama4", "phi4_multimodal"] SUPPORTED_ARCHITECTURES += ["llama4"] @@ -555,10 +599,19 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["qwen3_vl"] SUPPORT_VIDEO += ["qwen3_vl"] - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 - SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} + 
SUPPORTED_ARCHITECTURES += ["llava-qwen2", "phi3_v"] + + if is_transformers_version("<", "5"): + # remote code models incompatible after transformers v5 + SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava_next_video",) + else: + UNSUPPORTED_ARCHITECTURES.update({"got_ocr2", "idefics3", "llama4", "llava_next_video", "smolvlm"}) REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( @@ -585,9 +638,9 @@ def get_transformer_model_class(self, model_arch): return AutoModelForImageTextToText if model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration if model_arch == "llava": from transformers import LlavaForConditionalGeneration @@ -725,9 +778,9 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) additional_inputs = {} - # gemma3 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # gemma3 does not support dynamic cache until v4.53, we cannot compare dynamic cache result vs hybrid cache, # align cache representation in torch model - if model_arch == "gemma3": + if model_arch == "gemma3" and is_transformers_version("<", "4.53.0"): patch_update_causal_mask( transformers_model if is_transformers_version("<", "4.52.0") else transformers_model.language_model, "4.43.0", @@ -815,7 +868,11 @@ def test_compare_to_transformers(self, model_arch): gc.collect() - @parameterized.expand(["llava", "llava_next", "llava_next_video", "llava_next_mistral"]) + @parameterized.expand( + ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] + if is_transformers_version("<", "5") + else ["llava", "llava_next", "llava_next_mistral"] + ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" model_id = MODEL_NAMES[model_arch] @@ -1061,7 +1118,7 @@ class OVModelForPix2StructIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = ["pix2struct"] TASK = "image-to-text" # is it fine as well with visual-question-answering? OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -1083,7 +1140,7 @@ def test_compare_to_transformers(self, model_arch): question = "Who am I?" transformers_model = self.AUTOMODEL_CLASS.from_pretrained(model_id) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) inputs = preprocessor(images=self.IMAGE, text=question, padding=True, return_tensors="pt") ov_outputs = ov_model(**inputs) @@ -1104,7 +1161,7 @@ def test_compare_to_transformers(self, model_arch): def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, export=True, device=OPENVINO_DEVICE) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" 
inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") @@ -1118,7 +1175,7 @@ def test_generate_utils(self, model_arch): def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["pix2struct"] - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") model_with_pkv = self.OVMODEL_CLASS.from_pretrained( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 230ec88e45..a17750c2cc 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -172,6 +172,7 @@ "qwen3": "optimum-intel-internal-testing/tiny-random-qwen3", "qwen3_moe": "optimum-intel-internal-testing/tiny-random-qwen3moe", "qwen3_vl": "optimum-intel-internal-testing/tiny-random-qwen3-vl", + "qwen3_5": "optimum-intel-internal-testing/tiny-random-qwen3.5", "rembert": "optimum-intel-internal-testing/tiny-random-rembert", "resnet": "optimum-intel-internal-testing/tiny-random-resnet", "roberta": "optimum-intel-internal-testing/tiny-random-roberta", @@ -213,7 +214,7 @@ "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", - "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", + "xlm-roberta": "optimum-intel-internal-testing/tiny-random-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", "xverse": "optimum-intel-internal-testing/tiny-random-xverse", "glm4": "optimum-intel-internal-testing/tiny-random-glm4", @@ -233,11 +234,11 @@ _ARCHITECTURES_TO_EXPECTED_INT8 = { "afmoe": {"model": 16}, - "bert": {"model": 68}, + "bert": {"model": 68 if is_transformers_version("<", "5") else 70}, "roberta": {"model": 68}, "albert": {"model": 84}, "vit": {"model": 64}, - "blenderbot": {"model": 70}, + "blenderbot": {"model": 70 if is_transformers_version("<", "5") else 72}, "cohere2": {"model": 30}, "gpt2": {"model": 44}, "granitemoehybrid": {"model": 118}, @@ -245,8 +246,8 @@ "distilbert": {"model": 66}, "t5": { "encoder": 64, - "decoder": 104, - "decoder_with_past": 84, + "decoder": 104 if is_transformers_version("<", "5") else 106, + "decoder_with_past": 84 if is_transformers_version("<", "5") else 86, }, "stable-diffusion": { "unet": 242, @@ -334,6 +335,13 @@ "vision_embeddings_merger_model": 32, "vision_embeddings_pos_model": 1, }, + "qwen3_5": { + "lm_model": 100, + "text_embeddings_model": 1, + "vision_embeddings_model": 1, + "vision_embeddings_merger_model": 32, + "vision_embeddings_pos_model": 1, + }, "sana": { "transformer": 58, "vae_decoder": 28, @@ -357,8 +365,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322}, - "falcon_mamba": {"model": 162}, + "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, + "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, @@ -367,7 +375,7 @@ }, "zamba2": {"model": 44}, "exaone4": {"model": 16}, - "lfm2": {"model": 52}, + "lfm2": {"model": 52 if is_transformers_version("<", "5") else 54}, "hunyuan_v1_dense": {"model": 32}, "qwen3_eagle3": {"model": 20}, }
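The test changes in this patch repeatedly apply one gating pattern: an architecture, a pipeline test, or an expected operation count is enabled only inside a transformers version window, with "< 5" as a temporary upper bound until the corresponding v5 fix lands (see the TODO comments referencing MAX_TRANSFORMERS_VERSION). A minimal sketch of the pattern, assuming only that is_transformers_version is importable from optimum.intel.utils.import_utils as in the tests above; the llama4 bounds mirror the ones used in test_decoder.py, while the numeric counts are placeholders rather than values from this patch:

from optimum.intel.utils.import_utils import is_transformers_version

# Base set of architectures exercised by a hypothetical test class.
SUPPORTED_ARCHITECTURES = ("gpt2", "llama")

# Keep an architecture only inside a [min, max) transformers window; the "< 5"
# upper bound mirrors the temporary exclusions introduced in this patch.
if is_transformers_version(">=", "4.51.0") and is_transformers_version("<", "5"):
    SUPPORTED_ARCHITECTURES += ("llama4",)

# Expected counts can also be made version-dependent with the same helper
# (placeholder numbers, for illustration only).
EXPECTED_INT8_MATMULS = {"model": 40 if is_transformers_version("<", "5") else 42}

Expressing the upper bound through a single per-architecture constant (for example the MAX_TRANSFORMERS_VERSION attribute the TODO comments point to) would make it easier to lift the "< 5" caps in one place once the v5 fixes land.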