From 8ab9f2f0bdfb56edc8906280aa9c32d441327516 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Sun, 22 Mar 2026 16:20:39 -0700 Subject: [PATCH 1/8] Flux Encoder Type Resolution + CausalLM Unwrapping + PooledProjection Guard --- optimum/exporters/openvino/convert.py | 67 ++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 60d90f53e0..b6a84040cf 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1286,28 +1286,64 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): return models_for_export +def _resolve_flux_text_encoder_model_type(text_encoder, default_model_type: str, tokenizer=None) -> str: + config = getattr(text_encoder, "config", None) + model_type = str(getattr(config, "model_type", "") or "").lower() + architectures = [str(x) for x in (getattr(config, "architectures", []) or [])] + encoder_cls_name = text_encoder.__class__.__name__ + tokenizer_cls_name = tokenizer.__class__.__name__ if tokenizer is not None else "" + + looks_like_gemma = ( + model_type in {"gemma", "gemma2", "gemma3", "gemma3_text"} + or any("Gemma" in arch for arch in architectures) + or "Gemma" in encoder_cls_name + or "CausalLM" in encoder_cls_name + or "Gemma" in tokenizer_cls_name + ) + + if looks_like_gemma: + return "gemma2-text-encoder" + + return default_model_type + + def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export = {} # Text encoder text_encoder = getattr(pipeline, "text_encoder", None) if text_encoder is not None: + text_encoder_for_export = text_encoder + if "CausalLM" in text_encoder.__class__.__name__ and hasattr(text_encoder, "model"): + text_encoder_for_export = text_encoder.model + + text_encoder_model_type = _resolve_flux_text_encoder_model_type( + text_encoder, + "clip-text", + getattr(pipeline, "tokenizer", None), + ) + 
text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=text_encoder, + model=text_encoder_for_export, exporter=exporter, library_name="diffusers", task="feature-extraction", - model_type="clip-text", + model_type=text_encoder_model_type, ) text_encoder_export_config = text_encoder_config_constructor( - pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + text_encoder_for_export.config, int_dtype=int_dtype, float_dtype=float_dtype ) - models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + models_for_export["text_encoder"] = (text_encoder_for_export, text_encoder_export_config) transformer = pipeline.transformer transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) transformer.config.time_cond_proj_dim = None + + transformer_forward_inputs = inspect.signature(transformer.forward).parameters + if "pooled_projections" in transformer_forward_inputs and not hasattr(transformer.config, "pooled_projection_dim"): + transformer.config.pooled_projection_dim = transformer.config.joint_attention_dim + export_config_constructor = TasksManager.get_exporter_config_constructor( model=transformer, exporter=exporter, @@ -1321,7 +1357,7 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): transformer_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} models_for_export["transformer"] = (transformer, transformer_export_config) - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # VAE Encoder vae_encoder = copy.deepcopy(pipeline.vae) vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} vae_config_constructor = TasksManager.get_exporter_config_constructor( @@ -1337,7 +1373,7 @@ def 
get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): vae_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) vae_config_constructor = TasksManager.get_exporter_config_constructor( @@ -1355,24 +1391,33 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): text_encoder_2 = getattr(pipeline, "text_encoder_2", None) if text_encoder_2 is not None: + text_encoder_2_for_export = text_encoder_2 + if "CausalLM" in text_encoder_2.__class__.__name__ and hasattr(text_encoder_2, "model"): + text_encoder_2_for_export = text_encoder_2.model + + text_encoder_2_model_type = _resolve_flux_text_encoder_model_type( + text_encoder_2, + "t5-encoder-model", + getattr(pipeline, "tokenizer_2", None), + ) + export_config_constructor = TasksManager.get_exporter_config_constructor( - model=text_encoder_2, + model=text_encoder_2_for_export, exporter=exporter, library_name="diffusers", task="feature-extraction", - model_type="t5-encoder-model", + model_type=text_encoder_2_model_type, ) export_config = export_config_constructor( - text_encoder_2.config, + text_encoder_2_for_export.config, int_dtype=int_dtype, float_dtype=float_dtype, ) export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} - models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + models_for_export["text_encoder_2"] = (text_encoder_2_for_export, export_config) return models_for_export - def _get_encoder_decoder_stateful_models_for_export( model: "PreTrainedModel", task: str, From 80379e46919f984fcf9eeec599061f2a7c8b1119 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Sun, 22 Mar 2026 16:22:02 -0700 Subject: [PATCH 2/8] 
Flux Dummy ID dimensions + Conditional Pooled Projection Fixes --- optimum/exporters/openvino/model_configs.py | 80 +++++++++++++++++++-- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0624624a77..ab47af67ef 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2533,6 +2533,27 @@ def outputs(self) -> Dict[str, Dict[int, str]]: } +def _get_flux_ids_dim(config) -> int: + for attr_name in ("axes_dims_rope", "axes_dim", "axes_dims"): + value = getattr(config, attr_name, None) + if value is not None: + if isinstance(value, (list, tuple)): + return len(value) + if isinstance(value, int): + return value + + if hasattr(config, "get"): + for key in ("axes_dims_rope", "axes_dim", "axes_dims"): + value = config.get(key, None) + if value is not None: + if isinstance(value, (list, tuple)): + return len(value) + if isinstance(value, int): + return value + + return 3 + + class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "pixel_values", @@ -2551,12 +2572,12 @@ def __init__( num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"] // 4, height: int = DEFAULT_DUMMY_SHAPES["height"] // 4, - # Reduce img shape by 4 for FLUX to reduce memory usage on conversion **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) if getattr(normalized_config, "in_channels", None): self.num_channels = normalized_config.in_channels // 4 + self.ids_dim = _get_flux_ids_dim(normalized_config.config) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name in ["hidden_states", "sample"]: @@ -2567,9 +2588,9 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int img_ids_width = self.width // 2 return 
self.random_int_tensor( ( - [self.batch_size, img_ids_height * img_ids_width, 3] + [self.batch_size, img_ids_height * img_ids_width, self.ids_dim] if is_diffusers_version("<", "0.31.0") - else [img_ids_height * img_ids_width, 3] + else [img_ids_height * img_ids_width, self.ids_dim] ), min_value=0, max_value=min(img_ids_height, img_ids_width), @@ -2589,14 +2610,35 @@ class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): "txt_ids", ) + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + **kwargs, + ) + self.ids_dim = _get_flux_ids_dim(normalized_config.config) + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "txt_ids": import torch shape = ( - [self.batch_size, self.sequence_length, 3] + [self.batch_size, self.sequence_length, self.ids_dim] if is_diffusers_version("<", "0.31.0") - else [self.sequence_length, 3] + else [self.sequence_length, self.ids_dim] ) dtype = DTYPE_MAPPER.pt(float_dtype) return torch.full(shape, 0, dtype=dtype) @@ -2614,10 +2656,38 @@ class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig): ) _MODEL_PATCHER = FluxTransfromerModelPatcher + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + config = self._normalized_config.config + pooled_projection_dim = getattr(config, "pooled_projection_dim", None) + if pooled_projection_dim is None and hasattr(config, "get"): + 
pooled_projection_dim = config.get("pooled_projection_dim", None) + + self._use_pooled_projections = pooled_projection_dim is not None + + if self._use_pooled_projections: + self.DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestpsInputGenerator, + DummyFluxTransformerInputGenerator, + DummyFluxTextInputGenerator, + PooledProjectionsDummyInputGenerator, + ) + else: + self.DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestpsInputGenerator, + DummyFluxTransformerInputGenerator, + DummyFluxTextInputGenerator, + ) + @property def inputs(self): common_inputs = super().inputs common_inputs.pop("sample", None) + + if not getattr(self, "_use_pooled_projections", True): + common_inputs.pop("pooled_projections", None) + common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"} common_inputs["txt_ids"] = ( {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"} From 4b75ed2449d77ec23e531eea25f0860973362b16 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Sun, 22 Mar 2026 16:28:39 -0700 Subject: [PATCH 3/8] Added tests --- optimum/exporters/openvino/__init__.py | 2 +- tests/openvino/test_export.py | 63 +++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index 94ea4f103b..96b77e7731 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -15,7 +15,7 @@ import optimum.exporters.openvino.model_configs from .__main__ import main_export -from .convert import export, export_from_model, export_models, export_pytorch_via_onnx +from .convert import export, export_from_model, export_models, export_pytorch_via_onnx, _resolve_flux_text_encoder_model_type from .stateful import ensure_stateful_is_available, patch_stateful diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 9519cea1ec..476e43fc62 100644 --- 
a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -20,6 +20,7 @@ from parameterized import parameterized from sentence_transformers import SentenceTransformer, models from transformers import AutoConfig, AutoTokenizer, GenerationConfig +from transformers.utils import FrozenDict from utils_tests import ( MODEL_NAMES, OPENVINO_DEVICE, @@ -28,7 +29,8 @@ from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import BertOnnxConfig -from optimum.exporters.openvino import export_from_model, main_export +from optimum.exporters.openvino import export_from_model, main_export, _resolve_flux_text_encoder_model_type +from optimum.exporters.openvino.model_configs import _get_flux_ids_dim from optimum.exporters.tasks import TasksManager from optimum.intel import ( OVFluxPipeline, @@ -334,6 +336,65 @@ def test_compare_openvino_onnx_supported_architectures(self): if len(only_onnx) > 0: logger.warning(f"The following architectures export {only_onnx} is supported by ONNX but not OpenVINO") +class Flux2KleinSupportUnitTest(unittest.TestCase): + def test_get_flux_ids_dim_from_object_axes_dims_rope_list(self): + class Cfg: + axes_dims_rope = [16, 56, 56, 8] + + self.assertEqual(_get_flux_ids_dim(Cfg()), 4) + + def test_get_flux_ids_dim_from_frozendict_axes_dims_rope_list(self): + cfg = FrozenDict({"axes_dims_rope": [16, 56, 56, 8]}) + self.assertEqual(_get_flux_ids_dim(cfg), 4) + + def test_get_flux_ids_dim_default_fallback(self): + class Cfg: + pass + + self.assertEqual(_get_flux_ids_dim(Cfg()), 3) + + def test_resolve_flux_text_encoder_model_type_from_model_type(self): + class EncCfg: + model_type = "gemma2" + architectures = ["Gemma2ForCausalLM"] + + class Encoder: + config = EncCfg() + + model_type = _resolve_flux_text_encoder_model_type( + Encoder(), default_model_type="clip-text", tokenizer=None + ) + self.assertEqual(model_type, "gemma2-text-encoder") + + def 
test_resolve_flux_text_encoder_model_type_from_tokenizer_name(self): + class EncCfg: + model_type = "" + architectures = [] + + class Encoder: + config = EncCfg() + + GemmaTokenizerFast = type("GemmaTokenizerFast", (), {}) + tokenizer = GemmaTokenizerFast() + + model_type = _resolve_flux_text_encoder_model_type( + Encoder(), default_model_type="clip-text", tokenizer=tokenizer + ) + self.assertEqual(model_type, "gemma2-text-encoder") + + def test_resolve_flux_text_encoder_model_type_falls_back_to_default(self): + class EncCfg: + model_type = "clip_text_model" + architectures = ["CLIPTextModel"] + + class Encoder: + config = EncCfg() + + model_type = _resolve_flux_text_encoder_model_type( + Encoder(), default_model_type="clip-text", tokenizer=None + ) + self.assertEqual(model_type, "clip-text") + class CustomExportModelTest(unittest.TestCase): def test_custom_export_config_model(self): From 57f96b6c6aa391eb31a301603adc5a05533b3412 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Sun, 22 Mar 2026 16:35:22 -0700 Subject: [PATCH 4/8] Fixed Test Imports --- tests/openvino/test_export.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 476e43fc62..ddb27afc90 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -20,7 +20,6 @@ from parameterized import parameterized from sentence_transformers import SentenceTransformer, models from transformers import AutoConfig, AutoTokenizer, GenerationConfig -from transformers.utils import FrozenDict from utils_tests import ( MODEL_NAMES, OPENVINO_DEVICE, @@ -343,8 +342,8 @@ class Cfg: self.assertEqual(_get_flux_ids_dim(Cfg()), 4) - def test_get_flux_ids_dim_from_frozendict_axes_dims_rope_list(self): - cfg = FrozenDict({"axes_dims_rope": [16, 56, 56, 8]}) + def test_get_flux_ids_dim_from_dict_axes_dims_rope_list(self): + cfg = {"axes_dims_rope": [16, 56, 56, 8]} self.assertEqual(_get_flux_ids_dim(cfg), 4) def 
test_get_flux_ids_dim_default_fallback(self): From c4feca196fb30afc8a17b9d60e28e4dce6159462 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Sun, 22 Mar 2026 20:42:01 -0700 Subject: [PATCH 5/8] Fixes to enable Optimum Image Generation --- optimum/exporters/openvino/convert.py | 58 +++++++++--- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 92 +++++++++++++++++-- .../dummy_openvino_and_diffusers_objects.py | 11 +++ 5 files changed, 140 insertions(+), 24 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index b6a84040cf..a82d2ad35d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -532,21 +532,43 @@ def export_models( output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") output_path = output_dir / output_name output_path.parent.mkdir(parents=True, exist_ok=True) - outputs.append( - export( - model=submodel, - config=sub_export_config, - output=output_path, - opset=opset, - device=device, - input_shapes=input_shapes, - model_kwargs=model_kwargs, - ov_config=ov_config, - stateful=stateful[i] if isinstance(stateful, (list, tuple)) else stateful, - patch_16bit_model=patch_16bit_model, - library_name=library_name, + try: + outputs.append( + export( + model=submodel, + config=sub_export_config, + output=output_path, + opset=opset, + device=device, + input_shapes=input_shapes, + model_kwargs=model_kwargs, + ov_config=ov_config, + stateful=stateful[i] if isinstance(stateful, (list, tuple)) else stateful, + patch_16bit_model=patch_16bit_model, + library_name=library_name, + ) + ) + except Exception as e: + if "prim::TupleConstruct" not in str(e): + raise + + resolved_opset = opset or getattr(sub_export_config, "DEFAULT_ONNX_OPSET", 14) + logger.warning( + f"Falling back to ONNX export for submodel `{model_name}` due to PyTorch frontend limitation: {e}" + ) + 
outputs.append( + export_pytorch_via_onnx( + model=submodel, + config=sub_export_config, + opset=resolved_opset, + output=output_path, + device=device, + input_shapes=input_shapes, + model_kwargs=model_kwargs, + ov_config=ov_config, + library_name=library_name, + ) ) - ) outputs = list(map(list, zip(*outputs))) return outputs @@ -1375,6 +1397,14 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) + if hasattr(vae_decoder, "bn") and hasattr(vae_decoder.bn, "running_mean") and hasattr(vae_decoder.bn, "running_var"): + vae_decoder.register_to_config( + **{ + "bn_running_mean_data": vae_decoder.bn.running_mean.detach().cpu().tolist(), + "bn_running_var_data": vae_decoder.bn.running_var.detach().cpu().tolist(), + "bn_eps": float(getattr(vae_decoder.bn, "eps", 1e-5)), + } + ) vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_decoder, diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index dd110267ea..ff2ca376a6 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -190,6 +190,7 @@ "OVLatentConsistencyModelImg2ImgPipeline", "OVLTXPipeline", "OVFluxPipeline", + "OVFlux2KleinPipeline", "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", @@ -217,6 +218,7 @@ "OVLatentConsistencyModelImg2ImgPipeline", "OVLTXPipeline", "OVFluxPipeline", + "OVFlux2KleinPipeline", "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..538dd981c1 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -93,6 +93,7 @@ if is_diffusers_available(): from .modeling_diffusion import ( OVDiffusionPipeline, + OVFlux2KleinPipeline, OVFluxFillPipeline, OVFluxImg2ImgPipeline, OVFluxInpaintPipeline, diff --git 
a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index de5b7e1b39..5716208fcb 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -98,9 +98,14 @@ if is_diffusers_version(">=", "0.30.0"): from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline + try: + from diffusers import Flux2KleinPipeline + except Exception: + Flux2KleinPipeline = object else: StableDiffusion3InpaintPipeline = object FluxPipeline = object + Flux2KleinPipeline = object if is_diffusers_version(">=", "0.31.0"): @@ -799,13 +804,28 @@ def _reshape_transformer( elif inputs.get_any_name() == "pooled_projections": shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] elif inputs.get_any_name() == "img_ids": + ids_dim = inputs.get_partial_shape()[-1] + if hasattr(ids_dim, "is_dynamic") and ids_dim.is_dynamic: + ids_dim = 3 + else: + ids_dim = int(ids_dim.get_length()) + shapes[inputs] = ( - [batch_size, packed_height_width, 3] + [batch_size, packed_height_width, ids_dim] if is_diffusers_version("<", "0.31.0") - else [packed_height_width, 3] + else [packed_height_width, ids_dim] ) elif inputs.get_any_name() == "txt_ids": - shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3] + ids_dim = inputs.get_partial_shape()[-1] + if hasattr(ids_dim, "is_dynamic") and ids_dim.is_dynamic: + ids_dim = 3 + else: + ids_dim = int(ids_dim.get_length()) + + shapes[inputs] = ( + [batch_size, -1, ids_dim] if is_diffusers_version("<", "0.31.0") else [-1, ids_dim] + ) + elif inputs.get_any_name() in ["height", "width", "num_frames", "rope_interpolation_scale"]: shapes[inputs] = inputs.get_partial_shape() else: @@ -1163,6 +1183,7 @@ def forward( attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, output_hidden_states: Optional[bool] = None, return_dict: bool = False, + **kwargs, ): self.compile() model_inputs = {"input_ids": input_ids} @@ 
-1176,15 +1197,28 @@ def forward( model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out) if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name(): model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1]) + request_hidden_states = bool(output_hidden_states) or bool(kwargs.get("output_hidden_states", False)) + expected_hidden_states_count = getattr(self.config, "num_hidden_layers", None) + if expected_hidden_states_count is None: + expected_hidden_states_count = getattr(self.config, "n_layer", None) + if expected_hidden_states_count is None: + expected_hidden_states_count = 1 + else: + expected_hidden_states_count = int(expected_hidden_states_count) + 1 + if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) - if ( - self.hidden_states_output_names - and output_hidden_states - or getattr(self.config, "output_hidden_states", False) - ): + + if self.hidden_states_output_names: hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] - model_outputs["hidden_states"] = hidden_states + target_len = max(expected_hidden_states_count, len(hidden_states)) + if len(hidden_states) < target_len: + hidden_states.extend([hidden_states[-1]] * (target_len - len(hidden_states))) + + if request_hidden_states or getattr(self.config, "output_hidden_states", False): + model_outputs["hidden_states"] = hidden_states + elif request_hidden_states and "last_hidden_state" in model_outputs: + model_outputs["hidden_states"] = [model_outputs["last_hidden_state"]] * expected_hidden_states_count if return_dict: return model_outputs @@ -1277,8 +1311,27 @@ def forward( if pooled_projections is not None: model_inputs["pooled_projections"] = pooled_projections if img_ids is not None: + for inp in self.model.inputs: + if inp.get_any_name() == "img_ids": + 
expected_rank = inp.get_partial_shape().rank.get_length() + actual_rank = len(img_ids.shape) + if expected_rank == 2 and actual_rank == 3: + img_ids = img_ids[0] + elif expected_rank == 3 and actual_rank == 2: + img_ids = img_ids.unsqueeze(0) + break model_inputs["img_ids"] = img_ids + if txt_ids is not None: + for inp in self.model.inputs: + if inp.get_any_name() == "txt_ids": + expected_rank = inp.get_partial_shape().rank.get_length() + actual_rank = len(txt_ids.shape) + if expected_rank == 2 and actual_rank == 3: + txt_ids = txt_ids[0] + elif expected_rank == 3 and actual_rank == 2: + txt_ids = txt_ids.unsqueeze(0) + break model_inputs["txt_ids"] = txt_ids if guidance is not None: model_inputs["guidance"] = guidance @@ -1403,6 +1456,18 @@ def __init__(self, decoder: OVModelVaeDecoder, encoder: OVModelVaeEncoder): self.latents_mean = torch.tensor(self.decoder.config.latents_mean_data) if hasattr(self.decoder.config, "latents_std_data"): self.latents_std = torch.tensor(self.decoder.config.latents_std_data) + # Flux2Klein compatibility: pipeline expects self.vae.bn.running_mean/running_var/eps + self.bn = None + bn_mean = getattr(self.decoder.config, "bn_running_mean_data", None) + bn_var = getattr(self.decoder.config, "bn_running_var_data", None) + bn_eps = float(getattr(self.decoder.config, "bn_eps", 1e-5)) + + if bn_mean is not None and bn_var is not None: + bn_state = type("BNState", (), {})() + bn_state.running_mean = torch.tensor(bn_mean, dtype=torch.float32) + bn_state.running_var = torch.tensor(bn_var, dtype=torch.float32) + bn_state.eps = bn_eps + self.bn = bn_state @property def _component_names(self) -> List[str]: @@ -1627,6 +1692,12 @@ class OVFluxPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxPip auto_model_class = FluxPipeline +class OVFlux2KleinPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flux2KleinPipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = Flux2KleinPipeline + + 
class OVFluxImg2ImgPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxImg2ImgPipeline): main_input_name = "image" export_feature = "image-to-image" @@ -1728,9 +1799,10 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline if is_diffusers_version(">=", "0.30.0"): - SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline]) + SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline, OVFlux2KleinPipeline]) OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline + OV_TEXT2IMAGE_PIPELINES_MAPPING["flux2-klein"] = OVFlux2KleinPipeline if is_diffusers_version(">=", "0.31.0"): SUPPORTED_OV_PIPELINES.extend([OVFluxImg2ImgPipeline, OVFluxInpaintPipeline]) diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index ed38231e08..8e16bc47cc 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -213,6 +213,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) +class OVFlux2KleinPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + class OVFluxImg2ImgPipeline(metaclass=DummyObject): _backends = ["openvino", "diffusers"] From 588a045f9ce15e9e4f1f673001be0767dfbde71b Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Mon, 23 Mar 2026 08:43:14 -0700 Subject: [PATCH 6/8] Fixes to add all hidden_states for Encoder --- optimum/exporters/openvino/convert.py | 29 ++++++++++-- 
optimum/exporters/openvino/model_configs.py | 28 ++++++++++++ optimum/intel/openvino/modeling_diffusion.py | 48 +++++++++++--------- 3 files changed, 80 insertions(+), 25 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index a82d2ad35d..ff381f3554 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1319,13 +1319,20 @@ def _resolve_flux_text_encoder_model_type(text_encoder, default_model_type: str, model_type in {"gemma", "gemma2", "gemma3", "gemma3_text"} or any("Gemma" in arch for arch in architectures) or "Gemma" in encoder_cls_name - or "CausalLM" in encoder_cls_name or "Gemma" in tokenizer_cls_name ) - if looks_like_gemma: return "gemma2-text-encoder" + looks_like_qwen = ( + model_type in {"qwen", "qwen2", "qwen3"} + or any("Qwen" in arch for arch in architectures) + or "Qwen" in encoder_cls_name + or "Qwen" in tokenizer_cls_name + ) + if looks_like_qwen: + return "qwen3-text-encoder" + return default_model_type @@ -1345,10 +1352,18 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): getattr(pipeline, "tokenizer", None), ) + text_encoder_library_name = "diffusers" + if text_encoder_model_type in {"qwen3", "qwen2", "qwen"}: + text_encoder_library_name = "transformers" + + if hasattr(text_encoder_for_export, "config"): + text_encoder_for_export.config.output_hidden_states = True + text_encoder_for_export.config.return_dict = True + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( model=text_encoder_for_export, exporter=exporter, - library_name="diffusers", + library_name=text_encoder_library_name, task="feature-extraction", model_type=text_encoder_model_type, ) @@ -1379,8 +1394,14 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): transformer_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} models_for_export["transformer"] = (transformer, 
transformer_export_config) + vae_scaling_factor = None + if hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): + vae_scaling_factor = getattr(pipeline.vae.config, "scaling_factor", None) + # VAE Encoder vae_encoder = copy.deepcopy(pipeline.vae) + if vae_scaling_factor is not None: + vae_encoder.register_to_config(scaling_factor=float(vae_scaling_factor)) vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, @@ -1397,6 +1418,8 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) + if vae_scaling_factor is not None: + vae_decoder.register_to_config(scaling_factor=float(vae_scaling_factor)) if hasattr(vae_decoder, "bn") and hasattr(vae_decoder.bn, "running_mean") and hasattr(vae_decoder.bn, "running_var"): vae_decoder.register_to_config( **{ diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ab47af67ef..e13fe70cb8 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -390,6 +390,34 @@ def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = super().inputs return common_inputs +@register_in_tasks_manager("qwen3-text-encoder", *["feature-extraction"], library_name="diffusers") +class Qwen3TextEncoderOpenVINOConfig(Qwen3OpenVINOConfig): + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = {"last_hidden_state": {0: "batch_size", 1: "sequence_length"}} + + num_layers = getattr(self._normalized_config, "num_layers", None) + if num_layers is None: + num_layers = getattr(self._normalized_config, 
"num_hidden_layers", 0) + + for i in range(int(num_layers) + 1): + common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} + + return common_outputs + + @property + def values_override(self) -> Optional[Dict[str, Any]]: + values = super().values_override or {} + values.update({"output_hidden_states": True, "return_dict": True, "use_cache": False}) + return values + class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator): SUPPORTED_INPUT_NAMES = ( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 5716208fcb..ed1cb2d5e3 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -1178,13 +1178,13 @@ def __init__(self, model: openvino.Model, parent_pipeline: OVDiffusionPipeline, self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( - self, - input_ids: Union[np.ndarray, torch.Tensor], - attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, - output_hidden_states: Optional[bool] = None, - return_dict: bool = False, - **kwargs, - ): + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + **kwargs, +): self.compile() model_inputs = {"input_ids": input_ids} @@ -1195,30 +1195,34 @@ def forward( main_out = ov_outputs[0] model_outputs = {} model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out) + if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name(): model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1]) + request_hidden_states = bool(output_hidden_states) or bool(kwargs.get("output_hidden_states", False)) - expected_hidden_states_count = getattr(self.config, "num_hidden_layers", None) - if expected_hidden_states_count is None: - expected_hidden_states_count = getattr(self.config, 
"n_layer", None) - if expected_hidden_states_count is None: - expected_hidden_states_count = 1 - else: - expected_hidden_states_count = int(expected_hidden_states_count) + 1 if self.hidden_states_output_names and "last_hidden_state" not in model_outputs: model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]]) - if self.hidden_states_output_names: - hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] - target_len = max(expected_hidden_states_count, len(hidden_states)) - if len(hidden_states) < target_len: - hidden_states.extend([hidden_states[-1]] * (target_len - len(hidden_states))) + if request_hidden_states or getattr(self.config, "output_hidden_states", False): + hidden_states = [] - if request_hidden_states or getattr(self.config, "output_hidden_states", False): + if self.hidden_states_output_names: + hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names] + else: + for i, out in enumerate(self.model.outputs): + if i == 0: + continue + out_name = out.get_any_name() + if "pooler_output" in out_name: + continue + hidden_states.append(torch.from_numpy(ov_outputs[i])) + + if not hidden_states and "last_hidden_state" in model_outputs: + hidden_states = [model_outputs["last_hidden_state"]] + + if hidden_states: model_outputs["hidden_states"] = hidden_states - elif request_hidden_states and "last_hidden_state" in model_outputs: - model_outputs["hidden_states"] = [model_outputs["last_hidden_state"]] * expected_hidden_states_count if return_dict: return model_outputs From c791b7fd4d74257327603d994fa3773b3988cd4b Mon Sep 17 00:00:00 2001 From: Omkar Vivek Sabnis Date: Tue, 24 Mar 2026 07:59:30 -0700 Subject: [PATCH 7/8] Fix indentation error Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- optimum/intel/openvino/modeling_diffusion.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) 
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index ed1cb2d5e3..ef08ad348a 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -1178,13 +1178,13 @@ def __init__(self, model: openvino.Model, parent_pipeline: OVDiffusionPipeline, self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( - self, - input_ids: Union[np.ndarray, torch.Tensor], - attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, - output_hidden_states: Optional[bool] = None, - return_dict: bool = False, - **kwargs, -): + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + **kwargs, + ): self.compile() model_inputs = {"input_ids": input_ids} From d00fc606a05766ca1b8a43f6189983ee9bf398a8 Mon Sep 17 00:00:00 2001 From: "Sabnis, Omkar" Date: Tue, 24 Mar 2026 09:05:39 -0700 Subject: [PATCH 8/8] Addressed Copilot Review comments --- optimum/exporters/openvino/__init__.py | 2 +- optimum/exporters/openvino/convert.py | 3 --- optimum/intel/openvino/modeling_diffusion.py | 21 ++++++++++---------- tests/openvino/test_export.py | 3 ++- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index 96b77e7731..94ea4f103b 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -15,7 +15,7 @@ import optimum.exporters.openvino.model_configs from .__main__ import main_export -from .convert import export, export_from_model, export_models, export_pytorch_via_onnx, _resolve_flux_text_encoder_model_type +from .convert import export, export_from_model, export_models, export_pytorch_via_onnx from .stateful import ensure_stateful_is_available, patch_stateful diff --git 
a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index ff381f3554..4b8fc97e8b 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1353,9 +1353,6 @@ def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): ) text_encoder_library_name = "diffusers" - if text_encoder_model_type in {"qwen3", "qwen2", "qwen"}: - text_encoder_library_name = "transformers" - if hasattr(text_encoder_for_export, "config"): text_encoder_for_export.config.output_hidden_states = True text_encoder_for_export.config.return_dict = True diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index ef08ad348a..b22879cff2 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -98,15 +98,9 @@ if is_diffusers_version(">=", "0.30.0"): from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline - try: - from diffusers import Flux2KleinPipeline - except Exception: - Flux2KleinPipeline = object else: StableDiffusion3InpaintPipeline = object FluxPipeline = object - Flux2KleinPipeline = object - if is_diffusers_version(">=", "0.31.0"): from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline @@ -125,12 +119,16 @@ else: SanaSprintPipeline = object - if is_diffusers_version(">=", "0.35.0"): from diffusers.models.cache_utils import CacheMixin else: CacheMixin = object +if is_diffusers_version(">=", "0.37.0"): + from diffusers import Flux2KleinPipeline +else: + Flux2KleinPipeline = object + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" @@ -1803,10 +1801,10 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline if is_diffusers_version(">=", "0.30.0"): - 
SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline, OVFlux2KleinPipeline]) + SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline]) OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline - OV_TEXT2IMAGE_PIPELINES_MAPPING["flux2-klein"] = OVFlux2KleinPipeline + if is_diffusers_version(">=", "0.31.0"): SUPPORTED_OV_PIPELINES.extend([OVFluxImg2ImgPipeline, OVFluxInpaintPipeline]) @@ -1819,11 +1817,14 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru OV_TEXT2IMAGE_PIPELINES_MAPPING["sana"] = OVSanaPipeline SUPPORTED_OV_PIPELINES.append(OVSanaPipeline) - if is_diffusers_version(">=", "0.33.0"): SUPPORTED_OV_PIPELINES.append(OVSanaSprintPipeline) OV_TEXT2IMAGE_PIPELINES_MAPPING["sana-sprint"] = OVSanaSprintPipeline +if is_diffusers_version(">=", "0.37.0"): + SUPPORTED_OV_PIPELINES.append(OVFlux2KleinPipeline) + OV_TEXT2IMAGE_PIPELINES_MAPPING["flux2-klein"] = OVFlux2KleinPipeline + SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, OV_IMAGE2IMAGE_PIPELINES_MAPPING, diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index ddb27afc90..ad0d7f3623 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -28,7 +28,8 @@ from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import BertOnnxConfig -from optimum.exporters.openvino import export_from_model, main_export, _resolve_flux_text_encoder_model_type +from optimum.exporters.openvino import export_from_model, main_export +from optimum.exporters.openvino.convert import _resolve_flux_text_encoder_model_type from optimum.exporters.openvino.model_configs import _get_flux_ids_dim from optimum.exporters.tasks import TasksManager from optimum.intel import (