diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7ffe158396..270db0b350 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -194,6 +194,7 @@ Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + Qwen3_5Patcher, Qwen3MoeModelPatcher, Qwen3VLLanguageModelPatcher, Qwen3VLVisionEmbMergerPatcher, @@ -4961,6 +4962,195 @@ def inputs(self) -> Dict[str, Dict[int, str]]: return common_inputs +class Qwen3_5DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + """ + Generates dummy cache_params inputs for Qwen3.5 hybrid GatedDeltaNet + Attention architectures. + Linear attention layers produce conv_states and recurrent_states (fixed size). + Full attention layers produce standard KV cache (variable size). + """ + + SUPPORTED_INPUT_NAMES = ("cache_params",) + + def __init__( + self, + task: str, + normalized_config, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + **kwargs, + ) + + config = normalized_config.config + self._model_config = config + # Derive attention layer indices from layer_types list + layer_types = config.layer_types + self.attention_layers_indices = set( + i for i, lt in enumerate(layer_types) if lt == "full_attention" + ) + self.num_hidden_layers = config.num_hidden_layers + self.num_linear_layers = self.num_hidden_layers - len(self.attention_layers_indices) + self.num_attention_layers = len(self.attention_layers_indices) + + # Linear attention (GatedDeltaNet) state dimensions + self.linear_num_key_heads = config.linear_num_key_heads + self.linear_key_head_dim = config.linear_key_head_dim + self.linear_value_head_dim = config.linear_value_head_dim + self.linear_num_value_heads = 
config.linear_num_value_heads + self.linear_conv_kernel_dim = config.linear_conv_kernel_dim + # conv_dim = key_dim * 2 + value_dim + self.conv_dim = ( + self.linear_num_key_heads * self.linear_key_head_dim * 2 + + self.linear_num_value_heads * self.linear_value_head_dim + ) + + # Full attention KV cache dimensions + self.num_key_value_heads = config.num_key_value_heads + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.sequence_length = 0 + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_values = [] + + # Linear attention layers: conv_states + recurrent_states + for i in range(self.num_linear_layers): + conv_state_shape = (self.batch_size, self.conv_dim, self.linear_conv_kernel_dim) + conv_state = self.random_float_tensor(conv_state_shape, framework=framework, dtype=float_dtype) + past_key_values.append(conv_state) + + recurrent_state_shape = ( + self.batch_size, + self.linear_num_key_heads, + self.linear_key_head_dim, + self.linear_value_head_dim, + ) + recurrent_state = self.random_float_tensor(recurrent_state_shape, framework=framework, dtype=float_dtype) + past_key_values.append(recurrent_state) + + # Full attention layers: key + value cache + for i in range(self.num_attention_layers): + kv_shape = (self.batch_size, self.num_key_value_heads, self.sequence_length, self.head_dim) + k = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + v = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + past_key_values.append(k) + past_key_values.append(v) + + return past_key_values + + +@register_in_tasks_manager( + "qwen3_5", + *[ + "text-generation", + "text-generation-with-past", + ], + library_name="transformers", +) +@register_in_tasks_manager( + "qwen3_5_text", + *[ + "text-generation", + "text-generation-with-past", + ], + library_name="transformers", +) +class 
Qwen3_5OpenVINOConfig(MambaOpenVINOConfig): + PAD_ATTENTION_MASK_TO_PAST = False + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + # No attention_mask input: patched_forward passes None to avoid hardcoded reshapes. + NO_ATTENTION_MASK = True + MIN_TRANSFORMERS_VERSION = "5.3.0" + _MODEL_PATCHER = Qwen3_5Patcher + + def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + return Qwen3_5Patcher(self, model, model_kwargs=model_kwargs) + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + cache_name_prefix = "cache_params.past" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + cache_name_prefix = "cache_params.present" + + config = self._normalized_config.config + layer_types = config.layer_types + num_hidden_layers = config.num_hidden_layers + + # Grouped order: all linear attention layers first, then all full attention layers. + # This must match the order in Qwen3_5DummyPastKeyValuesGenerator.generate() + # and Qwen3_5Patcher.patched_forward() cache unpacking/repacking. 
+ linear_layer_idx = 0 + for i in range(num_hidden_layers): + if layer_types[i] == "linear_attention": + inputs_or_outputs[f"{cache_name_prefix}.conv.{linear_layer_idx}"] = {0: "batch_size"} + inputs_or_outputs[f"{cache_name_prefix}.recurrent.{linear_layer_idx}"] = {0: "batch_size"} + linear_layer_idx += 1 + + attention_layer_idx = 0 + for i in range(num_hidden_layers): + if layer_types[i] == "full_attention": + inputs_or_outputs[f"{cache_name_prefix}.key.{attention_layer_idx}"] = { + 0: "batch_size", + 2: decoder_sequence_name, + } + inputs_or_outputs[f"{cache_name_prefix}.value.{attention_layer_idx}"] = { + 0: "batch_size", + 2: decoder_sequence_name, + } + attention_layer_idx += 1 + + def overwrite_shape_and_generate_input(self, dummy_input_gen, input_name, framework, input_shapes): + # Qwen3.5's GatedDeltaNet has separate prefill (seq_len > 1) and decode (seq_len == 1) paths. + # The stateful model must trace the decode path so that conv/recurrent cache inputs are consumed. + # Force seq_len=1 for input_ids when past states are present. + if self.use_past and self.use_past_in_inputs and input_name in ("input_ids", "position_ids"): + saved = dummy_input_gen.sequence_length + dummy_input_gen.sequence_length = 1 + result = dummy_input_gen.generate( + input_name, framework=framework, int_dtype=self.int_dtype, float_dtype=self.float_dtype + ) + dummy_input_gen.sequence_length = saved + return result + if self.use_past and self.use_past_in_inputs and input_name == "attention_mask": + # attention_mask must be LONGER than input_ids (length > 1) during tracing so that + # torch.jit.trace captures the padding_mask slicing branch in sdpa_mask(): + # if padding_mask.shape[-1] > kv_length: padding_mask = padding_mask[:, -kv_length:] + # This makes the graph correctly adapt to growing attention_mask at runtime. + # apply_mask_to_padding_states is patched to no-op by Qwen3_5Patcher to avoid + # the broadcast issue with hidden_states * attention_mask[:, :, None]. 
+ import torch + return torch.ones(dummy_input_gen.batch_size, 2, dtype=torch.int64) + return super().overwrite_shape_and_generate_input(dummy_input_gen, input_name, framework, input_shapes) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + # attention_mask is NOT included: patched_forward passes attention_mask=None + # to avoid hardcoded reshapes from torch.jit.trace's causal mask computation. + # The model creates a pure causal mask depending only on KV cache shape (dynamic). + # position_ids IS included: needed for correct RoPE in full_attention layers. + # Without it, cache_position (baked to [0]) would give every token position 0. + common_inputs = { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "position_ids": {0: "batch_size", 1: "sequence_length"}, + } + if self.use_past_in_inputs: + self.add_past_key_values(common_inputs, direction="inputs") + return common_inputs + + @register_in_tasks_manager("audio-spectrogram-transformer", *["feature-extraction", "audio-classification"]) class ASTOpenVINOConfig(ASTOnnxConfig): pass diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 72605106a1..c4d840be60 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -6919,10 +6919,211 @@ def segment_sum(input_tensor): # This patcher class serves the following purposes: -# 1. Packs the KV-cache, conv_state, and ssm_state tensors into a Zamba2HybridDynamicCache structure +# 1. Packs the conv_state, recurrent_state, and KV-cache tensors into a Qwen3_5DynamicCache structure # for subsequent invocation of the model's `forward` method. -# 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly -# during both the prefill and decoding steps. +# 2. Ensures GatedDeltaNet layers use torch fallback paths (not CUDA-only fast kernels) +# during export tracing. 
+# +# Qwen3.5 is a hybrid model: +# - Linear attention layers (GatedDeltaNet) with conv_states and recurrent_states (fixed-size) +# - Full attention layers with standard KV cache (variable-size) +# +# The flat cache_params list is ordered as (matching the dummy generator): +# [conv_0, rec_0, conv_1, rec_1, ..., conv_{L-1}, rec_{L-1}, +# key_0, val_0, key_1, val_1, ..., key_{A-1}, val_{A-1}] +# where L = number of linear attention layers, A = number of full attention layers. +class Qwen3_5Patcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5DynamicCache + + super().__init__(config, model, model_kwargs) + orig_model = model + + model_config = self.real_config._config + layer_types = model_config.layer_types + num_hidden_layers = model_config.num_hidden_layers + + # Compute absolute indices for each layer type + linear_layer_indices = [i for i in range(num_hidden_layers) if layer_types[i] == "linear_attention"] + attention_layer_indices = [i for i in range(num_hidden_layers) if layer_types[i] == "full_attention"] + num_linear = len(linear_layer_indices) + num_attention = len(attention_layer_indices) + + # Cache wrapper that reconstructs Qwen3_5DynamicCache from compact per-type lists. + # The model accesses cache fields by absolute layer index (0..num_hidden_layers-1), + # so we expand compact lists into full-length lists with None at unused positions. 
+ class Qwen3_5HybridCacheWrap(Qwen3_5DynamicCache): + def __init__(self, cfg, conv_states_compact, recurrent_states_compact, key_cache_compact, value_cache_compact): + # Initialize full-length lists of None + super().__init__(cfg) + # Place conv/recurrent tensors at linear layer positions + for compact_idx, abs_idx in enumerate(linear_layer_indices): + self.conv_states[abs_idx] = conv_states_compact[compact_idx] + self.recurrent_states[abs_idx] = recurrent_states_compact[compact_idx] + # Place key/value tensors at attention layer positions + for compact_idx, abs_idx in enumerate(attention_layer_indices): + self.key_cache[abs_idx] = key_cache_compact[compact_idx] + self.value_cache[abs_idx] = value_cache_compact[compact_idx] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # key_cache at attention positions is an actual tensor (possibly empty with + # seq_len=0 from the dummy generator), never None. Concatenation with an + # empty tensor (dim=2 size 0) produces the correct result. + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if self.key_cache[layer_idx] is None: + return 0 + return self.key_cache[layer_idx].shape[-2] + + # Patched forward that converts between flat tensor list and cache object. 
+ def patched_forward( + input_ids, + position_ids=None, + cache_params=None, + ): + use_cache = False + wrapped_cache_params = None + if cache_params is not None: + use_cache = True + + # Unpack flat list (grouped order: all linear pairs, then all attention pairs) + conv_states = [] + recurrent_states = [] + for idx in range(num_linear): + conv_states.append(cache_params[2 * idx]) + recurrent_states.append(cache_params[2 * idx + 1]) + + key_cache = [] + value_cache = [] + offset = 2 * num_linear + for idx in range(num_attention): + key_cache.append(cache_params[offset + 2 * idx]) + value_cache.append(cache_params[offset + 2 * idx + 1]) + + # Remember input dtype for output matching (stateful transform needs same dtypes) + input_dtype = conv_states[0].dtype if conv_states else key_cache[0].dtype + + # Cast cache tensors to model dtype for computation + model_dtype = next(orig_model.parameters()).dtype + if model_dtype != input_dtype: + conv_states = [s.to(model_dtype) for s in conv_states] + recurrent_states = [s.to(model_dtype) for s in recurrent_states] + key_cache = [s.to(model_dtype) for s in key_cache] + value_cache = [s.to(model_dtype) for s in value_cache] + + wrapped_cache_params = Qwen3_5HybridCacheWrap( + model_config, conv_states, recurrent_states, key_cache, value_cache + ) + + # Pass attention_mask=None to avoid tracing the mask-dependent indexing + # in create_causal_mask/sdpa_mask, which produces hardcoded reshapes + # from torch.jit.trace. Without attention_mask, the model creates a + # pure causal mask that depends only on KV cache shape (dynamic). + # Pass position_ids explicitly so the model uses correct RoPE positions + # instead of computing from cache state (which would be baked by trace). 
+ causal_lm_output = self.model_orig_forward( + input_ids=input_ids, + attention_mask=None, + position_ids=position_ids, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + outputs = { + "logits": causal_lm_output.logits, + } + + if use_cache: + past_key_values = causal_lm_output.past_key_values + # Repack into flat list (same grouped order), casting back to input dtype + present_key_values = [] + for abs_idx in linear_layer_indices: + present_key_values.append(past_key_values.conv_states[abs_idx].to(input_dtype)) + present_key_values.append(past_key_values.recurrent_states[abs_idx].to(input_dtype)) + + for abs_idx in attention_layer_indices: + present_key_values.append(past_key_values.key_cache[abs_idx].to(input_dtype)) + present_key_values.append(past_key_values.value_cache[abs_idx].to(input_dtype)) + + outputs["present_key_values"] = present_key_values + + return outputs + + self.patched_forward = patched_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = patched_forward + + def __enter__(self): + import transformers.models.qwen3_5.modeling_qwen3_5 as _qwen3_5_module + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5GatedDeltaNet, + torch_causal_conv1d_update, + torch_chunk_gated_delta_rule, + torch_recurrent_gated_delta_rule, + ) + + super().__enter__() + setattr(self._model, self.orig_forward_name, self.patched_forward) + + # Patch apply_mask_to_padding_states to no-op during export. + # This function broadcasts hidden_states * attention_mask[:,:,None], which would + # expand hidden_states from [B,1,D] to [B,mask_len,D] if attention_mask is longer + # than input_ids. It's only needed for padding during training, not inference. 
+ self._orig_apply_mask = _qwen3_5_module.apply_mask_to_padding_states + _qwen3_5_module.apply_mask_to_padding_states = lambda hidden_states, attention_mask: hidden_states + + # Patch each GatedDeltaNet layer to use torch fallback paths instead of + # CUDA-only fast kernels (causal-conv1d, flash-linear-attention). + for layer in self._model.model.layers: + if not (hasattr(layer, "linear_attn") and isinstance(layer.linear_attn, Qwen3_5GatedDeltaNet)): + continue + gdn = layer.linear_attn + # Save originals for restoration + gdn._orig_causal_conv1d_fn = gdn.causal_conv1d_fn + gdn._orig_causal_conv1d_update = gdn.causal_conv1d_update + gdn._orig_chunk_gated_delta_rule = gdn.chunk_gated_delta_rule + gdn._orig_recurrent_gated_delta_rule = gdn.recurrent_gated_delta_rule + # Force torch fallback paths + gdn.causal_conv1d_fn = None # triggers native conv1d path in prefill + gdn.causal_conv1d_update = torch_causal_conv1d_update + gdn.chunk_gated_delta_rule = torch_chunk_gated_delta_rule + gdn.recurrent_gated_delta_rule = torch_recurrent_gated_delta_rule + + def __exit__(self, exc_type, exc_value, traceback): + import transformers.models.qwen3_5.modeling_qwen3_5 as _qwen3_5_module + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5GatedDeltaNet + + super().__exit__(exc_type, exc_value, traceback) + setattr(self._model, self.orig_forward_name, self.model_orig_forward) + + # Restore apply_mask_to_padding_states + _qwen3_5_module.apply_mask_to_padding_states = self._orig_apply_mask + + # Restore original GatedDeltaNet methods + for layer in self._model.model.layers: + if not (hasattr(layer, "linear_attn") and isinstance(layer.linear_attn, Qwen3_5GatedDeltaNet)): + continue + gdn = layer.linear_attn + gdn.causal_conv1d_fn = gdn._orig_causal_conv1d_fn + gdn.causal_conv1d_update = gdn._orig_causal_conv1d_update + gdn.chunk_gated_delta_rule = gdn._orig_chunk_gated_delta_rule + gdn.recurrent_gated_delta_rule = gdn._orig_recurrent_gated_delta_rule + + class 
Zamba2ModelPatcher(ModelPatcher): def __init__( self, diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 3b8642d65a..f7f6e42438 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -284,14 +284,14 @@ def get_kv_ssm_tensor_names(ssm_prefix_names: list, kv_prefix_names: list, ov_te other_tensors.append(ov_tensor) return kv_names, ssm_names, other_tensors - ssm_prefix_input_names = ["cache_params.past.ssm", "cache_params.past.conv"] + ssm_prefix_input_names = ["cache_params.past.ssm", "cache_params.past.recurrent", "cache_params.past.conv"] kv_prefix_input_names = ["cache_params.past.key", "cache_params.past.value"] kv_input_names, ssm_input_names, not_cache_inputs = get_kv_ssm_tensor_names( ssm_prefix_input_names, kv_prefix_input_names, ov_model.inputs ) cache_inputs = kv_input_names + ssm_input_names - ssm_prefix_output_names = ["cache_params.present.ssm", "cache_params.present.conv"] + ssm_prefix_output_names = ["cache_params.present.ssm", "cache_params.present.recurrent", "cache_params.present.conv"] kv_prefix_output_names = ["cache_params.present.key", "cache_params.present.value"] kv_output_names, ssm_output_names, _ = get_kv_ssm_tensor_names( ssm_prefix_output_names, kv_prefix_output_names, ov_model.outputs @@ -307,10 +307,11 @@ def get_kv_ssm_tensor_names(ssm_prefix_names: list, kv_prefix_names: list, ov_te def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): if config.is_encoder_decoder and model_has_input_output_name(ov_model, "encoder_hidden_states"): - return patch_stateful_encoder_decoder(config, ov_model) - if config.model_type in SSM_MODELS: - return patch_stateful_hybrid_ssm(ov_model) - return patch_stateful_decoder(config, ov_model) + patch_stateful_encoder_decoder(config, ov_model) + elif config.model_type in SSM_MODELS: + patch_stateful_hybrid_ssm(ov_model) + else: + patch_stateful_decoder(config, ov_model) def patch_stateful_decoder(config: 
PretrainedConfig, ov_model: ov.Model): diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 3d9a854e39..a8f14fc79d 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -103,7 +103,13 @@ def _get_input_info( for i in range(len(ordered_input_names)): name = ordered_input_names[i] example = flatten_inputs[i] - type = get_element_type(example.cpu().numpy().dtype) + # NumPy doesn't support bfloat16; convert to float32 for dtype detection, then fix up + import torch + if example.dtype == torch.bfloat16: + from openvino import Type as OVType + type = OVType.bf16 + else: + type = get_element_type(example.cpu().numpy().dtype) shape = PartialShape(example.shape) if name in inputs: named_dims = inputs[name] @@ -305,7 +311,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_5", "qwen3_5_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others @@ -504,6 +510,8 @@ def set_simplified_chat_template(ov_tokenizer_model, processor_chat_template=Non "esm", "levit", "llama4", + "qwen3_5", + "qwen3_5_text", ) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 2002e268ac..5a28b82c57 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -172,6 +172,22 @@ class OVQuantizationMethod(str, Enum): "ratio": 1.0, "quant_method": OVQuantizationMethod.AWQ, }, + "Qwen/Qwen3.5-3B": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 1.0, + "quant_method": OVQuantizationMethod.AWQ, + }, + "Qwen/Qwen3.5-8B": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + 
"dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, "openlm-research/open_llama_3b": {"bits": 4, "sym": False, "group_size": 64, "all_layers": True}, "openlm-research/open_llama_3b_v2": { "bits": 4, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a97416cea1..108c13a947 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -25,7 +25,10 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import GenerationMixin -from transformers.utils import is_offline_mode +try: + from transformers.utils import is_offline_mode +except ImportError: + from transformers.utils.hub import is_offline_mode from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig @@ -264,7 +267,7 @@ def __init__( # some model configs may have issues with loading without parameters initialization try: misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): + except (KeyError, TypeError, AttributeError): misplaced_generation_parameters = {} if len(misplaced_generation_parameters) > 0: logger.warning( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3b95b5f276..f09b479422 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1082,6 +1082,7 @@ def __init__( max_batch_size: Optional[int] = None, conv_states: Optional[List[torch.Tensor]] = None, ssm_states: Optional[List[torch.Tensor]] = None, + recurrent_states: Optional[List[torch.Tensor]] = None, key_cache: Optional[List[torch.Tensor]] = None, value_cache: Optional[List[torch.Tensor]] = None, ): @@ -1093,9 +1094,27 @@ def __init__( self.mamba_d_conv = getattr(config, "mamba_d_conv", None) self.mamba_expand = 
getattr(config, "mamba_expand", None) self.mamba_d_state = getattr(config, "mamba_d_state", None) - self.intermediate_size = config.intermediate_size + self.intermediate_size = getattr(config, "intermediate_size", None) self.conv_kernel_size = getattr(config, "conv_kernel", getattr(config, "mamba_d_conv", None)) - if config.model_type == "granitemoehybrid": + if config.model_type in ("qwen3_5", "qwen3_5_text"): + text_cfg = getattr(config, "text_config", config) + layer_types = text_cfg.layer_types + self.num_key_value_heads = text_cfg.num_key_value_heads + self.head_dim = text_cfg.head_dim + self.num_mamba_layers = layer_types.count("linear_attention") + self.num_attn_layers = layer_types.count("full_attention") + # Store linear attention parameters for state initialization + self._linear_num_key_heads = text_cfg.linear_num_key_heads + self._linear_key_head_dim = text_cfg.linear_key_head_dim + self._linear_num_value_heads = text_cfg.linear_num_value_heads + self._linear_value_head_dim = text_cfg.linear_value_head_dim + self._linear_conv_kernel_dim = text_cfg.linear_conv_kernel_dim + # Not applicable to qwen3_5 + self.mamba_ngroups = None + self.n_mamba_heads = None + self.mamba_headdim = None + self.ssm_state_size = None + elif config.model_type == "granitemoehybrid": layer_types = getattr(config, "layer_types", None) self.num_key_value_heads = getattr(config, "num_key_value_heads", None) self.head_dim = int(self.hidden_size / self.num_attention_heads) @@ -1123,7 +1142,14 @@ def __init__( if self.conv_states is None: self.conv_states = [] for _ in range(self.num_mamba_layers): - if ( + if hasattr(self, "_linear_conv_kernel_dim"): + # Qwen3.5 linear attention conv state: + # d_inner = key_dim * 2 + value_dim + key_dim = self._linear_key_head_dim * self._linear_num_key_heads + value_dim = self._linear_value_head_dim * self._linear_num_value_heads + d_inner = key_dim * 2 + value_dim + conv_state_shape = (self.max_batch_size, d_inner, self._linear_conv_kernel_dim) + 
elif ( self.mamba_ngroups and self.mamba_d_state and self.mamba_d_conv @@ -1146,25 +1172,46 @@ def __init__( self.ssm_states = ssm_states if self.ssm_states is None: self.ssm_states: List[torch.Tensor] = [] - for _ in range(self.num_mamba_layers): - if self.n_mamba_heads and self.mamba_headdim: - # Mamba2 block - ssm_state_shape = ( + if not hasattr(self, "_linear_conv_kernel_dim"): + # SSM states only apply to Mamba-based models, not Qwen3.5 + for _ in range(self.num_mamba_layers): + if self.n_mamba_heads and self.mamba_headdim: + # Mamba2 block + ssm_state_shape = ( + self.max_batch_size, + self.n_mamba_heads, + self.mamba_headdim, + self.ssm_state_size, + ) + else: + # Mamba block + ssm_state_shape = (self.max_batch_size, self.intermediate_size, self.ssm_state_size) + + ssm_state: torch.Tensor = torch.zeros( + ssm_state_shape, + device=self.device, + dtype=dtype, + ) + self.ssm_states.append(ssm_state) + + # Recurrent states for Qwen3.5 linear attention layers (gated delta rule K^T V accumulation) + self.recurrent_states = recurrent_states + if self.recurrent_states is None: + self.recurrent_states: List[torch.Tensor] = [] + if hasattr(self, "_linear_conv_kernel_dim"): + for _ in range(self.num_mamba_layers): + recurrent_state_shape = ( self.max_batch_size, - self.n_mamba_heads, - self.mamba_headdim, - self.ssm_state_size, + self._linear_num_key_heads, + self._linear_key_head_dim, + self._linear_value_head_dim, ) - else: - # Mamba block - ssm_state_shape = (self.max_batch_size, self.intermediate_size, self.ssm_state_size) - - ssm_state: torch.Tensor = torch.zeros( - ssm_state_shape, - device=self.device, - dtype=dtype, - ) - self.ssm_states.append(ssm_state) + recurrent_state: torch.Tensor = torch.zeros( + recurrent_state_shape, + device=self.device, + dtype=dtype, + ) + self.recurrent_states.append(recurrent_state) self.key_cache = key_cache if self.key_cache is None: @@ -1245,6 +1292,7 @@ def __init__( self.key_cache_names = [] self.value_cache_names = [] 
self.ssm_cache_names = [] + self.recurrent_cache_names = [] self.conv_cache_names = [] super().__init__( @@ -1261,6 +1309,9 @@ def __init__( self.key_cache_input_names = sorted([key for key in self.input_names if "cache_params.past.key" in key]) self.value_cache_input_names = sorted([key for key in self.input_names if "cache_params.past.value" in key]) self.ssm_cache_input_names = sorted([key for key in self.input_names if "cache_params.past.ssm" in key]) + self.recurrent_cache_input_names = sorted( + [key for key in self.input_names if "cache_params.past.recurrent" in key] + ) self.conv_cache_input_names = sorted([key for key in self.input_names if "cache_params.past.conv" in key]) self.key_cache_output_names = sorted([key for key in self.output_names if "cache_params.present.key" in key]) @@ -1268,6 +1319,9 @@ def __init__( [key for key in self.output_names if "cache_params.present.value" in key] ) self.ssm_cache_output_names = sorted([key for key in self.output_names if "cache_params.present.ssm" in key]) + self.recurrent_cache_output_names = sorted( + [key for key in self.output_names if "cache_params.present.recurrent" in key] + ) self.conv_cache_output_names = sorted([key for key in self.output_names if "cache_params.present.conv" in key]) if hasattr(config, "conv_kernel") and config.conv_kernel is not None: @@ -1280,17 +1334,23 @@ def compile(self): super().compile() if is_first_time_compile and self.stateful: for state in self.request.query_state(): - if "cache_params.present.key" in state.name: - self.key_cache_names.append(state.name) - elif "cache_params.present.value" in state.name: - self.value_cache_names.append(state.name) - elif "cache_params.present.ssm" in state.name: - self.ssm_cache_names.append(state.name) - elif "cache_params.present.conv" in state.name: - self.conv_cache_names.append(state.name) + name = state.name + # Match both "past" and "present" prefixes since the variable_id + # may use either convention depending on the stateful 
conversion method + if ".key" in name and ("cache_params.present.key" in name or "cache_params.past.key" in name): + self.key_cache_names.append(name) + elif ".value" in name and ("cache_params.present.value" in name or "cache_params.past.value" in name): + self.value_cache_names.append(name) + elif "cache_params.present.ssm" in name or "cache_params.past.ssm" in name: + self.ssm_cache_names.append(name) + elif "cache_params.present.recurrent" in name or "cache_params.past.recurrent" in name: + self.recurrent_cache_names.append(name) + elif "cache_params.present.conv" in name or "cache_params.past.conv" in name: + self.conv_cache_names.append(name) self.key_cache_names = sorted(self.key_cache_names) self.value_cache_names = sorted(self.value_cache_names) self.ssm_cache_names = sorted(self.ssm_cache_names) + self.recurrent_cache_names = sorted(self.recurrent_cache_names) self.conv_cache_names = sorted(self.conv_cache_names) @staticmethod @@ -1310,20 +1370,21 @@ def prepare_inputs( ) -> Dict: if kwargs.get("past_key_values") is not None: raise ValueError("`past_key_values` input is not supported for `OVModelWithMambaForCausalLM`") - if kwargs.get("position_ids") is not None: - raise ValueError("`position_ids` input is not supported for `OVModelWithMambaForCausalLM`") inputs = {"input_ids": input_ids} if "cache_position" in self.input_names: if cache_position is None: # initialize it as for prefill stage - cache_position = torch.arange(0, self.config.conv_kernel) + cache_position = torch.arange(0, self.conv_kernel) inputs["cache_position"] = cache_position if "attention_mask" in self.input_names: if attention_mask is None: # during decoding stage it must be a tensor of ones attention_mask = torch.ones_like(input_ids, dtype=torch.int64) inputs["attention_mask"] = attention_mask + position_ids = kwargs.get("position_ids", None) + if "position_ids" in self.input_names and position_ids is not None: + inputs["position_ids"] = position_ids if self.stateful and cache_params 
is None: # this is prefill step, reset all states @@ -1335,6 +1396,7 @@ def prepare_inputs( cache_params = OVCacheWithMambaStates(self.config, input_ids.shape[0]) ssm_cache = cache_params.ssm_states + recurrent_cache = cache_params.recurrent_states conv_cache = cache_params.conv_states key_cache = cache_params.key_cache value_cache = cache_params.value_cache @@ -1342,6 +1404,7 @@ def prepare_inputs( inputs.update(zip(self.key_cache_input_names, key_cache)) inputs.update(zip(self.value_cache_input_names, value_cache)) inputs.update(zip(self.ssm_cache_input_names, ssm_cache)) + inputs.update(zip(self.recurrent_cache_input_names, recurrent_cache)) inputs.update(zip(self.conv_cache_input_names, conv_cache)) # prepare beam_idx input that is required for hybrid models with both KV cache and Mamba states @@ -1361,19 +1424,50 @@ def forward( **kwargs, ): self.compile() - inputs = self.prepare_inputs(input_ids, attention_mask, cache_params, use_cache, cache_position, **kwargs) - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + seq_len = input_ids.shape[1] if input_ids is not None else 1 + + # For models exported in decode mode (seq_len=1), process prefill + # tokens one at a time. This is needed for hybrid models like Qwen3.5 + # where the model graph contains hardcoded shapes for seq_len=1. + if seq_len > 1 and self.stateful: + # Reset state once at the start of prefill (NOT per-token). + # We avoid calling prepare_inputs in the loop since it would + # reset the OV state on every token (cache_params=None triggers reset). 
+ if cache_params is None: + if self.request is not None: + self.request.reset_state() + self._past_length = 0 + batch_size = input_ids.shape[0] + for t in range(seq_len): + token = input_ids[:, t : t + 1] + inputs = {"input_ids": token} + if "position_ids" in self.input_names: + inputs["position_ids"] = torch.tensor([[t]], dtype=torch.int64) + if "beam_idx" in self.input_names: + inputs["beam_idx"] = np.arange(batch_size, dtype=int) + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + else: + position_ids = torch.tensor([[self._past_length]], dtype=torch.int64) if hasattr(self, "_past_length") else None + inputs = self.prepare_inputs( + input_ids, attention_mask, cache_params, use_cache, cache_position, + position_ids=position_ids, **kwargs, + ) + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) ssm_states = None + recurrent_states = None conv_states = None key_cache = None value_cache = None if self.stateful: self._past_length += input_ids.shape[1] ssm_states = [None] * len(self.ssm_cache_names) + recurrent_states = [None] * len(self.recurrent_cache_names) conv_states = [None] * len(self.conv_cache_names) key_cache = [None] * len(self.key_cache_names) value_cache = [None] * len(self.value_cache_names) @@ -1381,6 +1475,9 @@ def forward( if "cache_params.past.ssm" in state.name: idx = int(state.name.rsplit(".", 1)[-1]) ssm_states[idx] = state.state.data + elif "cache_params.past.recurrent" in state.name: + idx = int(state.name.rsplit(".", 1)[-1]) + recurrent_states[idx] = state.state.data elif "cache_params.past.conv" in state.name: idx = int(state.name.rsplit(".", 1)[-1]) conv_states[idx] = state.state.data @@ -1392,6 +1489,9 @@ def forward( value_cache[idx] = state.state.data elif not self.stateful and use_cache: ssm_states = 
[self.request.get_tensor(key).data for key in self.ssm_cache_output_names] + recurrent_states = [ + self.request.get_tensor(key).data for key in self.recurrent_cache_output_names + ] conv_states = [self.request.get_tensor(key).data for key in self.conv_cache_output_names] key_cache = [self.request.get_tensor(key).data for key in self.key_cache_output_names] value_cache = [self.request.get_tensor(key).data for key in self.value_cache_output_names] @@ -1401,6 +1501,7 @@ def forward( batch_size=input_ids.shape[0], conv_states=conv_states, ssm_states=ssm_states, + recurrent_states=recurrent_states, key_cache=key_cache, value_cache=value_cache, ) @@ -1418,6 +1519,8 @@ def _update_model_kwargs_for_generation( model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens if "attention_mask" in model_kwargs: + config = getattr(self, "config", None) + model_type = getattr(config, "model_type", "") attention_mask = model_kwargs["attention_mask"] model_kwargs["attention_mask"] = torch.cat( [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 @@ -1449,8 +1552,8 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid"]: - # LFM2 and GraniteMoeHybrid (Granite-4.0) require the attention mask + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_5", "qwen3_5_text"]: + # LFM2, GraniteMoeHybrid (Granite-4.0), and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask # for the decoding step after the first token so use attention mask of ones. 
diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index 2e2ee2d63c..1918c95f0f 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -31,7 +31,10 @@ from transformers.file_utils import add_start_docstrings from transformers.modeling_outputs import ModelOutput from transformers.models.clip.modeling_clip import CLIPOutput -from transformers.utils import is_offline_mode +try: + from transformers.utils import is_offline_mode +except ImportError: + from transformers.utils.hub import is_offline_mode from optimum.exporters.tasks import TasksManager diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index e6e99ffd56..2afd2f6c95 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -27,12 +27,15 @@ AutoConfig, AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, GenerationConfig, Pix2StructForConditionalGeneration, PretrainedConfig, WhisperForConditionalGeneration, ) +try: + from transformers import AutoModelForVision2Seq +except ImportError: + AutoModelForVision2Seq = None from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 818eb41726..b22c5adc96 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -32,7 +32,19 @@ from openvino import Type as OVType from packaging.version import Version from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size +try: + from transformers.onnx.utils import ParameterFormat, 
compute_serialized_parameters_size +except (ImportError, ModuleNotFoundError): + # transformers 5.x removed transformers.onnx; inline the logic + import enum + + class ParameterFormat(enum.Enum): + Float = 4 + Float16 = 2 + Int8 = 1 + + def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int: + return num_parameters * dtype.value from optimum.intel.utils.import_utils import is_torch_version diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..bdc622ea8a 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -84,11 +84,9 @@ from openvino import get_version version = get_version() - # avoid invalid format + # avoid invalid format: strip dev/commit suffixes (e.g. "2026.0.0-17740-abc" -> "2026.0.0") if "-" in version: - ov_major_version, dev_info = version.split("-", 1) - commit_id = dev_info.split("-")[0] - version = f"{ov_major_version}-{commit_id}" + version = version.split("-")[0] _openvino_version = version except ImportError: _openvino_available = False @@ -425,7 +423,8 @@ def is_openvino_tokenizers_version(operation: str, version: str): except importlib_metadata.PackageNotFoundError: pass - tokenizers_version = tokenizers_version[: len("2025.0.0.0")] + # Take only the first 4 version components (e.g. 
"2026.10.0.0.dev0" -> "2026.10.0.0") +    tokenizers_version = ".".join(tokenizers_version.split(".")[:4]) return compare_versions(parse(tokenizers_version), operation, version) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..561b747cb7 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -23,7 +23,15 @@ from typing import Dict, List, Optional, Type, Union import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfApi, hf_hub_download +try: +    from huggingface_hub import HfFolder +except ImportError: +    from huggingface_hub import get_token as _get_token + class HfFolder: +    @staticmethod +    def get_token(): +        return _get_token() from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.hf_api import file_exists from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel diff --git a/setup.py b/setup.py index b86c176463..5561d3569b 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", - "transformers>=4.45,<4.58", + "optimum-onnx", + "transformers>=4.45", "setuptools", "nncf>=2.19.0", "openvino>=2025.4.0",