From d3abd870491ef3b4cc687b757142fb951d55607e Mon Sep 17 00:00:00 2001
From: "Kazantsev, Roman" <roman.kazantsev@intel.com>
Date: Mon, 30 Mar 2026 05:54:49 +0000
Subject: [PATCH 1/4] Support Kokoro TTS model

---
 docs/source/openvino/models.mdx               |   5 +-
 optimum/exporters/openvino/__main__.py        |   5 +
 optimum/exporters/openvino/convert.py         |  96 +++++++-
 optimum/exporters/openvino/model_configs.py   |  83 +++++++
 optimum/exporters/openvino/model_patcher.py   |  17 ++
 optimum/exporters/openvino/utils.py           |   8 +-
 .../intel/openvino/modeling_text2speech.py    | 209 +++++++++++++++++-
 optimum/intel/utils/import_utils.py           |   6 +
 optimum/intel/utils/modeling_utils.py         |  75 +++++++
 tests/openvino/test_export.py                 |  20 ++
 tests/openvino/utils_tests.py                 |   1 +
 11 files changed, 512 insertions(+), 13 deletions(-)

diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx
index c11505fa4d..9fd9f10410 100644
--- a/docs/source/openvino/models.mdx
+++ b/docs/source/openvino/models.mdx
@@ -180,4 +180,7 @@ Here is the list of the supported architectures :
 - All Transformer and CLIP-based models.
 
 ## [OpenCLIP](https://github.com/mlfoundations/open_clip)
-- All CLIP-based models
\ No newline at end of file
+- All CLIP-based models
+
+## [Kokoro](https://github.com/hexgrad/kokoro)
+- Kokoro-82M (text-to-speech)
\ No newline at end of file
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index eb763b45d4..1b862dce21 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -39,6 +39,7 @@
 )
 from optimum.intel.utils.modeling_utils import (
     _infer_library_from_model_name_or_path,
+    _KokoroForTextToSpeech,
     _OpenClipForZeroShotImageClassification,
 )
 
@@ -86,6 +87,8 @@ def infer_task(
     if task == "auto":
         if library_name == "open_clip":
             task = "zero-shot-image-classification"
+        elif library_name == "kokoro":
+            task = "text-to-audio"
         else:
             try:
                 task = TasksManager._infer_task_from_model_name_or_path(
@@ -471,6 +474,8 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
     try:
         if library_name == "open_clip":
             model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        elif library_name == "kokoro":
+            model = _KokoroForTextToSpeech.from_pretrained(model_name_or_path, cache_dir=cache_dir, token=token)
         else:
             # remote code models like phi3_v internvl2, minicpmv, internvl2, nanollava, maira2 should be loaded using AutoModelForCausalLM and not AutoModelForImageTextToText
             # TODO: use config.auto_map to load remote code models instead (for other models we can directly use config.architectures)
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 60d90f53e0..a35ee70f0d 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -552,6 +552,68 @@ def export_models(
     return outputs
 
 
+def _save_kokoro_config_and_assets(model, output: Path):
+    """Write config.json into `output` and export the repo's voices/*.pt embeddings as raw float32 .bin files."""
+    import json
+    import tempfile
+
+    import numpy as np
+    from huggingface_hub import hf_hub_download, list_repo_files
+
+    repo_id = getattr(model, "_kokoro_repo_id", None)
+
+    # Save config.json
+    config_dict = {}
+    for key in vars(model.config):
+        if not key.startswith("_"):
+            config_dict[key] = getattr(model.config, key)
+    config_path = output / "config.json"
+    with open(config_path, "w", encoding="utf-8") as f:
+        json.dump(config_dict, f, indent=2)
+
+    if repo_id is None:
+        return
+
+    if True:
+        return
+
+    # Export voice embeddings to .bin format
+    voices_dir = output / "voices"
+    voices_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        repo_files = list_repo_files(repo_id=repo_id)
+    except Exception:
+        logger.warning(f"Could not list files for {repo_id}. Skipping voice export.")
+        return
+
+    voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt"))
+    if not voice_pt_files:
+        return
+
+    import torch
+
+    logger.info(f"Found {len(voice_pt_files)} voice files. Exporting to {voices_dir} ...")
+    with tempfile.TemporaryDirectory(prefix="kokoro_voice_pt_") as tmp_dir:
+        for remote_path in voice_pt_files:
+            local_pt = hf_hub_download(repo_id=repo_id, filename=remote_path, local_dir=tmp_dir)
+            voice_name = Path(remote_path).stem
+            out_bin = voices_dir / f"{voice_name}.bin"
+
+            # The file comes from a remote repo: weights_only=True refuses arbitrary pickled objects.
+            voice_obj = torch.load(local_pt, map_location="cpu", weights_only=True)
+            if isinstance(voice_obj, dict):
+                # A default keeps a tensor-less dict on the "skip" path instead of raising StopIteration.
+                voice_obj = next((v for v in voice_obj.values() if torch.is_tensor(v)), None)
+            if not torch.is_tensor(voice_obj):
+                logger.warning(f"Unsupported voice format in {remote_path}, skipping.")
+                continue
+
+            voice_tensor = voice_obj.detach().cpu().to(torch.float32).contiguous()
+            np.asarray(voice_tensor.numpy(), dtype=np.float32).tofile(out_bin)
+            logger.info(f"Exported {remote_path} -> {out_bin}")
+
+
 def export_from_model(
     model: Union["PreTrainedModel", "ModelMixin", "DiffusionPipeline"],
     output: Union[str, Path],
@@ -576,7 +638,7 @@ def export_from_model(
         )
 
     library_name = _infer_library_from_model_or_model_class(model)
-    if library_name != "open_clip":
+    if library_name not in ("open_clip", "kokoro"):
         TasksManager.standardize_model_attributes(model)
 
     if hasattr(model.config, "export_model_type") and model.config.export_model_type is not None:
@@ -594,12 +656,15 @@ def export_from_model(
     if task is not None and task != "auto":
         task = TasksManager.map_from_synonym(task)
     else:
-        try:
-            task = TasksManager._infer_task_from_model_or_model_class(model=model)
-        except (ValueError, KeyError) as e:
-            raise RuntimeError(
-                f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
+        if library_name == "kokoro":
+            task = "text-to-audio"
+        else:
+            try:
+                task = TasksManager._infer_task_from_model_or_model_class(model=model)
+            except (ValueError, KeyError) as e:
+                raise RuntimeError(
+                    f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+                )
 
         if (
             not custom_architecture
@@ -661,6 +726,20 @@ def export_from_model(
             model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels
         )
 
+    if library_name == "kokoro":
+        custom_architecture = True
+        export_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=model, exporter="openvino", task=task, library_name="kokoro"
+        )
+        kokoro_export_config = export_config_constructor(model.config, task=task)
+        custom_export_configs = {"model": kokoro_export_config}
+
+        def _get_kokoro_submodels(model):
+            return {"model": model}
+
+        fn_get_submodels = _get_kokoro_submodels
+
+    #raise "Exception"
     if library_name == "diffusers":
         export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
         stateful_submodels = False
@@ -699,6 +778,9 @@ def export_from_model(
             if hasattr(preprocess, "save_pretrained"):
                 preprocess.save_pretrained(output)
 
+        files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
+    elif library_name == "kokoro":
+        _save_kokoro_config_and_assets(model, output)
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
     elif library_name != "diffusers":
         # some model configs may have issues with loading without parameters initialization
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 0624624a77..a37eb516c2 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -132,6 +132,7 @@
 from .model_patcher import (
     AfmoeModelPatcher,
     AquilaModelPatcher,
+    KokoroModelPatcher,
     ArcticModelPatcher,
     BaichuanModelPatcher,
     BigBirdPegasusModelPatcher,
@@ -238,6 +239,8 @@
 def init_model_configs():
     if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
         TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
+    if "kokoro" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
+        TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["kokoro"] = {}
     TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = (
         "transformers",
         "LlavaForConditionalGeneration",
@@ -5451,3 +5454,83 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
                 )
 
         return dummy_inputs
+
+
+class DummyKokoroInputGenerator(DummyInputGenerator):
+    """Generates dummy `input_ids`, `ref_s` and `speed` inputs used to export the Kokoro TTS model."""
+
+    SUPPORTED_INPUT_NAMES = ("input_ids", "ref_s", "speed")
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedConfig,
+        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
+        **kwargs,
+    ):
+        self.task = task
+        self.batch_size = 1
+        self.sequence_length = sequence_length
+        self.style_dim = getattr(normalized_config, "style_dim", 128)
+        self.vocab_size = getattr(normalized_config, "n_token", 178)  # token-id range; 178 was the hard-coded value
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "input_ids":
+            shape = [self.batch_size, self.sequence_length]
+            input_ids_value = self.random_int_tensor(
+                shape=shape, min_value=0, max_value=self.vocab_size, framework=framework, dtype=int_dtype
+            )
+            input_ids_value[:, 0] = 0  # NOTE(review): first/last ids pinned to 0 - presumably a boundary token; confirm
+            input_ids_value[:, -1] = 0
+            return input_ids_value
+        if input_name == "ref_s":
+            shape = [self.batch_size, self.style_dim * 2]
+            return self.random_float_tensor(
+                shape=shape, min_value=-1, max_value=1, framework=framework, dtype=float_dtype
+            )
+        if input_name == "speed":
+            return self.random_int_tensor(shape=[1], min_value=1, max_value=10, framework=framework, dtype=float_dtype)
+        raise ValueError(f"Unsupported input {input_name} for DummyKokoroInputGenerator")
+
+
+@register_in_tasks_manager(
+    "kokoro",
+    *["text-to-audio"],
+    library_name="kokoro",
+)
+class KokoroOpenVINOConfig(OnnxConfig):  # export config for model type "kokoro", task "text-to-audio"
+    DEFAULT_ONNX_OPSET = 14
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyKokoroInputGenerator,)
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig
+    _MODEL_PATCHER = KokoroModelPatcher  # redirects forward to forward_with_tokens while exporting
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "text-to-audio",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            # A tuple is (dim name, min, max); _get_input_info turns it into Dimension(min, max).
+            "input_ids": {1: ("sequence_length", 2, -1)},
+            "ref_s": {1: "style_dim"},  # the dummy ref_s input is style_dim * 2 wide
+            "speed": {},  # no dynamic axes declared
+        }
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        return {"waveform": {0: "batch_size", 1: "audio_length"}, "phonemes": {0: "batch_size", 1: "phoneme_length"}}
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 32dd2d6c6d..2cda2a8d74 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -8319,3 +8319,20 @@ def __exit__(self, exc_type, exc_value, traceback):
                 sparse_moe_block = decoder_layer.mlp
                 decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward
                 del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs
+
+
+class KokoroModelPatcher(ModelPatcher):
+    """
+    Patches the Kokoro TTS model for OpenVINO export by redirecting forward
+    to forward_with_tokens, which takes (input_ids, ref_s, speed) and returns
+    (audio_waveform, phonemes). The pre-patch forward is put back on exit.
+    """
+
+    def __enter__(self):
+        self._model._orig_forward = self._model.forward  # capture before the base patcher can replace forward
+        super().__enter__()
+        self._model.forward = self._model.forward_with_tokens
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._model.forward = self._model._orig_forward  # undo in reverse order of __enter__
+        super().__exit__(exc_type, exc_value, traceback)
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index af2f1edaba..51f3b96ac5 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -108,12 +108,18 @@ def _get_input_info(
         if name in inputs:
             named_dims = inputs[name]
             for idx, dim_name in named_dims.items():
+                orig_dim_name = dim_name
+                if isinstance(orig_dim_name, tuple):
+                    dim_name, min_value, max_value = dim_name
                 if dim_name in name_to_symbol:
                     symbol = name_to_symbol[dim_name]
                 else:
                     symbol = Symbol()
                     name_to_symbol[dim_name] = symbol
-                dim = Dimension(-1)
+                if isinstance(orig_dim_name, tuple):
+                    dim = Dimension(min_value, max_value)
+                else:
+                    dim = Dimension(-1)
                 dim.set_symbol(symbol)
                 shape[idx] = dim
         info = InputInfo(name=name, shape=shape, type=type, example=example)
diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py
index 749edef73c..e27a6662a6 100644
--- a/optimum/intel/openvino/modeling_text2speech.py
+++ b/optimum/intel/openvino/modeling_text2speech.py
@@ -160,6 +160,24 @@ class OVModelForTextToSpeechSeq2Seq(OVModelForSeq2SeqLM):
     auto_model_class = AutoModelForTextToSpectrogram
     export_feature = "text-to-audio"
 
+    @classmethod
+    def from_pretrained(cls, model_id, **kwargs):
+        # For Kokoro models, load config via PretrainedConfig since AutoConfig
+        # does not recognize the "kokoro" model_type.
+        if kwargs.get("config") is None:
+            try:
+                config = PretrainedConfig.from_pretrained(
+                    model_id,
+                    cache_dir=kwargs.get("cache_dir", HUGGINGFACE_HUB_CACHE),
+                    token=kwargs.get("token"),
+                    revision=kwargs.get("revision"),
+                )
+                if getattr(config, "model_type", None) == "kokoro":
+                    kwargs["config"] = config
+            except Exception as err:  # best-effort probe: fall back to the regular loading path, but leave a trace
+                logger.debug(f"Kokoro config probe failed for {model_id}; using default config loading: {err}")
+        return super().from_pretrained(model_id, **kwargs)
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -167,12 +185,14 @@ def _from_pretrained(
         config: "PretrainedConfig",
         **kwargs,
     ):
-        if "SpeechT5ForTextToSpeech" in config.architectures:
+        if getattr(config, "model_type", None) == "kokoro":
+            return _OVModelForKokoroTextToSpeech._from_pretrained(model_id, config, **kwargs)
+        elif getattr(config, "architectures", None) and "SpeechT5ForTextToSpeech" in config.architectures:
             return _OVModelForSpeechT5ForTextToSpeech._from_pretrained(model_id, config, **kwargs)
         else:
-            raise ValueError(f"{config.architectures} are not supported text-to-audio model using OpenVINO")
-
-            return super()._from_pretrained(model_id, config, **kwargs)
+            raise ValueError(
+                f"{getattr(config, 'architectures', None)} are not supported text-to-audio model using OpenVINO"
+            )
 
     def reshape(self, *args, **kwargs):
         logger.warning("Static shapes are not supported for this model.")
@@ -522,3 +542,184 @@ def generate(
                 )
                 outputs = (*outputs, cross_attentions)
         return outputs
+
+
+class _OVModelForKokoroTextToSpeech(OVBaseModel):
+    """
+    OpenVINO inference model for Kokoro TTS.
+
+    Kokoro is a single-model architecture with inputs (input_ids, ref_s, speed) and
+    outputs (waveform, phonemes). Voice embeddings are stored as .bin files in a voices/ subdirectory.
+    """
+
+    export_feature = "text-to-audio"
+    auto_model_class = AutoModelForTextToSpectrogram
+
+    def __init__(self, model: openvino.Model, config: PretrainedConfig = None, **kwargs):
+        # Kokoro model does not support dynamic shapes due to Squeeze op limitations,
+        # so we skip the automatic reshape to dynamic shapes.
+        kwargs.setdefault("dynamic_shapes", False)
+        super().__init__(model, config, **kwargs)
+        self._voices = {}  # cache: voice name -> embedding array, filled by _load_voice
+        self._voices_dir = None  # set by _from_pretrained when <model_save_dir>/voices exists
+
+    def _reshape(self, model, batch_size, sequence_length, height=None, width=None):
+        # Kokoro has inputs with different ranks (speed is 1D), so only reshape
+        # dimensions that exist in each input.
+        shapes = {}
+        for inp in model.inputs:
+            shape = inp.get_partial_shape()
+            if len(shape) >= 1:
+                shape[0] = batch_size
+            if len(shape) >= 2:
+                shape[1] = sequence_length  # NOTE(review): also applies to ref_s (style axis) - confirm intended
+            shapes[inp] = shape
+        model.reshape(shapes)
+        return model
+
+    @classmethod
+    def _from_pretrained(
+        cls,
+        model_id: Union[str, Path],
+        config: "PretrainedConfig",
+        token: Optional[Union[bool, str]] = None,
+        revision: Optional[str] = None,
+        force_download: bool = False,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
+        local_files_only: bool = False,
+        load_in_8bit: bool = False,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        trust_remote_code: bool = False,
+        **kwargs,
+    ):
+        model = super()._from_pretrained(
+            model_id,
+            config=config,
+            token=token,
+            revision=revision,
+            force_download=force_download,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+            load_in_8bit=load_in_8bit,
+            quantization_config=quantization_config,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+        # Locate voices directory
+        if model.model_save_dir is not None:
+            voices_dir = Path(model.model_save_dir) / "voices"
+            if voices_dir.is_dir():
+                model._voices_dir = voices_dir
+        return model
+
+    def _load_voice(self, voice_name: str) -> np.ndarray:
+        """Load a voice embedding by name, caching results."""
+        if voice_name in self._voices:
+            return self._voices[voice_name]
+
+        if self._voices_dir is None:
+            raise FileNotFoundError("No voices directory found in model directory.")
+
+        voice_path = self._voices_dir / f"{voice_name}.bin"
+        if not voice_path.exists():
+            raise FileNotFoundError(
+                f"Voice '{voice_name}' not found at {voice_path}. "
+                f"Available voices: {[f.stem for f in self._voices_dir.glob('*.bin')]}"
+            )
+
+        voice_data = np.fromfile(voice_path, dtype=np.float32)
+        self._voices[voice_name] = voice_data
+        return voice_data
+
+    @property
+    def available_voices(self) -> List[str]:
+        """Returns list of available voice names."""
+        if self._voices_dir is None or not self._voices_dir.is_dir():
+            return []
+        return sorted(f.stem for f in self._voices_dir.glob("*.bin"))
+
+    def forward(
+        self,
+        input_ids: Union[torch.Tensor, np.ndarray],
+        ref_s: Union[torch.Tensor, np.ndarray],
+        speed: Union[torch.Tensor, np.ndarray, float],
+        **kwargs,
+    ) -> ModelOutput:
+        """
+        Run inference on the Kokoro model.
+
+        Args:
+            input_ids: Token IDs of shape [batch_size, sequence_length].
+            ref_s: Voice style embedding of shape [batch_size, 2 * style_dim], matching the export dummy input.
+            speed: Speed factor, scalar or array.
+
+        Returns:
+            ModelOutput with `waveform` and `phonemes`.
+        """
+        self.compile()
+
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.numpy()
+        if isinstance(ref_s, torch.Tensor):
+            ref_s = ref_s.numpy()
+        if isinstance(speed, (int, float)):
+            speed = np.array([speed], dtype=np.float32)
+        elif isinstance(speed, torch.Tensor):
+            speed = speed.numpy()
+
+        inputs = {
+            "input_ids": input_ids,
+            "ref_s": ref_s,
+            "speed": speed,
+        }
+
+        outputs = self._inference(inputs)
+        waveform = torch.from_numpy(outputs[0])
+        phonemes = torch.from_numpy(outputs[1])
+        return ModelOutput(waveform=waveform, phonemes=phonemes)
+
+    def generate(
+        self,
+        input_ids: Union[torch.Tensor, np.ndarray],
+        voice: Optional[str] = None,
+        ref_s: Optional[Union[torch.Tensor, np.ndarray]] = None,
+        speed: float = 1.0,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        """
+        Generate audio waveform from token IDs.
+
+        Args:
+            input_ids: Token IDs of shape [batch_size, sequence_length].
+            voice: Name of a voice preset (e.g., "af_heart"). Ignored if ref_s is provided.
+            ref_s: Voice style embedding. If None, loaded from voice preset.
+            speed: Speed factor (default 1.0).
+
+        Returns:
+            Audio waveform tensor.
+        """
+        if ref_s is None:
+            if voice is None:
+                voice = "af_heart"
+            voice_data = self._load_voice(voice)
+            ref_s = voice_data.reshape(1, -1)
+
+        if isinstance(input_ids, torch.Tensor):
+            if input_ids.dim() == 1:
+                input_ids = input_ids.unsqueeze(0)
+        elif isinstance(input_ids, np.ndarray):
+            if input_ids.ndim == 1:
+                input_ids = input_ids.reshape(1, -1)
+
+        if isinstance(ref_s, np.ndarray) and ref_s.ndim == 1:
+            ref_s = ref_s.reshape(1, -1)
+
+        result = self.forward(input_ids=input_ids, ref_s=ref_s, speed=speed)
+        return result.waveform
+
+    def reshape(self, *args, **kwargs):
+        logger.warning("Static shapes are not supported for Kokoro model.")
+        return self
+
+    def can_generate(self) -> bool:
+        return True
diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py
index 3ad9877a82..91ce1a2038 100644
--- a/optimum/intel/utils/import_utils.py
+++ b/optimum/intel/utils/import_utils.py
@@ -119,6 +119,8 @@
         pass
 
 
+_kokoro_available = importlib.util.find_spec("kokoro") is not None
+
 _safetensors_version = "N/A"
 _safetensors_available = importlib.util.find_spec("safetensors") is not None
 if _safetensors_available:
@@ -308,6 +310,10 @@ def is_open_clip_available():
     return _open_clip_available
 
 
+def is_kokoro_available():
+    return _kokoro_available  # computed once at import time with importlib.util.find_spec("kokoro")
+
+
 def is_safetensors_available():
     return _safetensors_available
 
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py
index cab9e5efa3..31c3168979 100644
--- a/optimum/intel/utils/modeling_utils.py
+++ b/optimum/intel/utils/modeling_utils.py
@@ -245,6 +245,30 @@ def bind_cores_for_best_perf():
     logger.info(f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}")
 
 
+def _is_kokoro_model(
+    model_name_or_path: Union[str, Path],
+    all_files: list,
+    cache_dir: str = HUGGINGFACE_HUB_CACHE,
+    token: Optional[Union[bool, str]] = None,
+) -> bool:
+    """Detect Kokoro TTS models by checking for the 'istftnet' and 'plbert' keys in config.json."""
+    if "config.json" not in all_files:
+        return False
+    try:
+        config_path = Path(model_name_or_path)
+        if config_path.is_dir():
+            config_file = config_path / "config.json"
+        else:
+            config_file = hf_hub_download(
+                repo_id=str(model_name_or_path), filename="config.json", cache_dir=cache_dir, token=token
+            )
+        with open(config_file, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return "istftnet" in config and "plbert" in config
+    except Exception:  # best-effort probe: any download or parse failure is reported as "not Kokoro"
+        return False
+
+
 def _infer_library_from_model_name_or_path(
     model_name_or_path: Union[str, Path],
     subfolder: str = "",
@@ -257,6 +281,8 @@ def _infer_library_from_model_name_or_path(
     )
     if "open_clip_config.json" in all_files or "open_clip_pytorch_model.bin" in all_files:
         library_name = "open_clip"
+    elif _is_kokoro_model(model_name_or_path, all_files, cache_dir=cache_dir, token=token):
+        library_name = "kokoro"
     else:
         library_name = TasksManager._infer_library_from_model_name_or_path(
             model_name_or_path=model_name_or_path, cache_dir=cache_dir
@@ -273,6 +299,8 @@ def _infer_library_from_model_or_model_class(
         return library_name
     if model.__module__.startswith("open_clip"):
         library_name = "open_clip"
+    elif model.__module__.startswith("kokoro") or getattr(model, "_kokoro_model", False):
+        library_name = "kokoro"
     elif model.__module__.startswith("optimum"):
         # for wrapped models like timm in optimum.intel.openvino.modeling_timm
         library_name = TasksManager._infer_library_from_model_or_model_class(model=model.model)
@@ -493,3 +521,50 @@ def from_pretrained(
                 setattr(model.config, "export_model_type", "clip")
 
             return model
+
+
+class _KokoroForTextToSpeech:
+    """Wrapper for loading Kokoro TTS model with a config conforming to optimum-intel expectations."""
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path: Union[str, Path],
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
+        token: Optional[Union[bool, str]] = None,
+        **kwargs,
+    ):
+        try:
+            from kokoro import KPipeline
+        except ImportError as err:
+            raise ImportError(
+                "To load a Kokoro TTS model, the `kokoro` package is required. "
+                "Please install it with `pip install kokoro`."
+            ) from err
+
+        pipeline = KPipeline(lang_code="a", repo_id=str(model_name_or_path))
+        model = pipeline.model
+        model._kokoro_model = True
+        model._kokoro_repo_id = str(model_name_or_path)
+
+        # Load config.json and create a PretrainedConfig-like object
+        config_path = Path(model_name_or_path)
+        if config_path.is_dir():
+            config_file = config_path / "config.json"
+        else:
+            config_file = hf_hub_download(
+                repo_id=str(model_name_or_path), filename="config.json", cache_dir=cache_dir, token=token
+            )
+
+        with open(config_file, "r", encoding="utf-8") as f:
+            config_dict = json.load(f)
+
+        config = PretrainedConfig()
+        for key, value in config_dict.items():
+            setattr(config, key, value)
+        # Assigned after the copy so a same-named key in config.json cannot override the exporter's fields.
+        config.model_type = "kokoro"
+        config.export_model_type = "kokoro"
+        model.config = config
+
+        return model
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 9519cea1ec..b6187a674b 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import importlib.util
 import unittest
 from pathlib import Path
 
@@ -393,3 +394,22 @@ def test_export_custom_model(self):
         ov_outputs = ov_model(**tokens)
         self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4))
         self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4))
+
+
+@unittest.skipUnless(
+    importlib.util.find_spec("kokoro") is not None,
+    "kokoro package is not installed",
+)
+class KokoroExportModelTest(unittest.TestCase):
+    def test_kokoro_export(self):
+        # Export once into a scratch directory, then check every artifact the export must produce.
+        model_id = MODEL_NAMES["kokoro"]
+        with TemporaryDirectory() as tmpdirname:
+            output_path = Path(tmpdirname)
+            main_export(
+                model_name_or_path=model_id,
+                output=output_path,
+                task="text-to-audio",
+            )
+            for artifact in ("openvino_model.xml", "openvino_model.bin", "config.json"):
+                self.assertTrue((output_path / artifact).exists())
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index fe6d584d2f..40e189ae3e 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -114,6 +114,7 @@
     "internlm2": "optimum-intel-internal-testing/tiny-random-internlm2",
     "internvl_chat": "optimum-intel-internal-testing/tiny-random-internvl2",
     "jais": "optimum-intel-internal-testing/tiny-random-jais",
+    "kokoro": "hexgrad/Kokoro-82M",
     "levit": "optimum-intel-internal-testing/tiny-random-LevitModel",
     "lfm2": "optimum-intel-internal-testing/tiny-random-lfm2",
     "longt5": "optimum-intel-internal-testing/tiny-random-longt5",

From 605dd93eb3784be4cbbf3083aef351346c8d810e Mon Sep 17 00:00:00 2001
From: "Kazantsev, Roman" <roman.kazantsev@intel.com>
Date: Mon, 30 Mar 2026 06:40:10 +0000
Subject: [PATCH 2/4] Save misaki data files

---
 optimum/exporters/openvino/convert.py | 28 +++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index a35ee70f0d..596782819e 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -574,9 +574,6 @@ def _save_kokoro_config_and_assets(model, output: Path):
     if repo_id is None:
         return
 
-    if True:
-        return
-
     # Export voice embeddings to .bin format
     voices_dir = output / "voices"
     voices_dir.mkdir(parents=True, exist_ok=True)
@@ -587,6 +584,30 @@ def _save_kokoro_config_and_assets(model, output: Path):
         logger.warning(f"Could not list files for {repo_id}. Skipping voice export.")
         return
 
+    # Save misaki data files from GitHub to data dir of output directory
+    try:
+        import urllib.request
+
+        MISAKI_DATA_URL = "https://raw.githubusercontent.com/hexgrad/misaki/main/misaki/data"
+        MISAKI_DATA_FILES = [
+            "gb_gold.json", "gb_silver.json",
+            "us_gold.json", "us_silver.json",
+            "vi_acronyms.json", "vi_symbols.json", "vi_teencode.json",
+            "ja_words.txt",
+        ]
+        data_out = output / "data"
+        data_out.mkdir(parents=True, exist_ok=True)
+        for fname in MISAKI_DATA_FILES:
+            url = f"{MISAKI_DATA_URL}/{fname}"
+            dest = data_out / fname
+            try:
+                urllib.request.urlretrieve(url, dest)
+                logger.info(f"Downloaded misaki data file: {fname}")
+            except Exception as e:
+                logger.warning(f"Failed to download {fname} from {url}: {e}")
+    except Exception as e:
+        logger.warning(f"Could not download misaki data files: {e}")
+
     voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt"))
     if not voice_pt_files:
         return
@@ -739,7 +760,6 @@ def _get_kokoro_submodels(model):
 
         fn_get_submodels = _get_kokoro_submodels
 
-    #raise "Exception"
     if library_name == "diffusers":
         export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
         stateful_submodels = False

From 73189825f3a5ecfe5fa5d4c2811949a710b81218 Mon Sep 17 00:00:00 2001
From: "Kazantsev, Roman" <roman.kazantsev@intel.com>
Date: Fri, 3 Apr 2026 11:45:31 +0000
Subject: [PATCH 3/4] Fix inference

---
 optimum/intel/openvino/modeling_text2speech.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py
index e27a6662a6..5754001f98 100644
--- a/optimum/intel/openvino/modeling_text2speech.py
+++ b/optimum/intel/openvino/modeling_text2speech.py
@@ -172,12 +172,22 @@ def from_pretrained(cls, model_id, **kwargs):
                     token=kwargs.get("token"),
                     revision=kwargs.get("revision"),
                 )
+                # Detect Kokoro models that lack model_type by checking for
+                # characteristic config keys (same heuristic used by CLI export).
+                if not getattr(config, "model_type", None):
+                    if hasattr(config, "istftnet") and hasattr(config, "plbert"):
+                        config.model_type = "kokoro"
+                        config.export_model_type = "kokoro"
                 if getattr(config, "model_type", None) == "kokoro":
                     kwargs["config"] = config
-            except Exception:
-                pass
+            except Exception as e:
+                logger.warning(f"Could not pre-load config for Kokoro detection: {e}")
         return super().from_pretrained(model_id, **kwargs)
 
+    @classmethod
+    def _export(cls, model_id, config, use_cache=False, **kwargs):  # pass-through; defaults use_cache to False
+        return super()._export(model_id, config, use_cache=use_cache, **kwargs)
+
     @classmethod
     def _from_pretrained(
         cls,

From 4b42a04f9f4f4b928b2dca1267264c4ee1b0f0bb Mon Sep 17 00:00:00 2001
From: "Kazantsev, Roman" <roman.kazantsev@intel.com>
Date: Fri, 3 Apr 2026 12:15:25 +0000
Subject: [PATCH 4/4] Add preprocess_input

---
 .../intel/openvino/modeling_text2speech.py    | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py
index 5754001f98..f21c5b5283 100644
--- a/optimum/intel/openvino/modeling_text2speech.py
+++ b/optimum/intel/openvino/modeling_text2speech.py
@@ -208,6 +208,22 @@ def reshape(self, *args, **kwargs):
         logger.warning("Static shapes are not supported for this model.")
         return self
 
+    def preprocess_input(self, text: str, **kwargs) -> dict:
+        """
+        Turn raw text into model inputs; this base version only raises `NotImplementedError`.
+
+        Args:
+            text: Text to be synthesized.
+            **kwargs: Options specific to the concrete model (e.g., voice, speed, lang_code for Kokoro).
+
+        Returns:
+            In overriding subclasses, a dictionary of inputs that can be passed to `generate()`.
+        """
+        class_name = self.__class__.__name__
+        message = f"{class_name} does not implement `preprocess_input`. "
+        message += "Use the appropriate model-specific subclass."
+        raise NotImplementedError(message)
+
 
 class _OVModelForSpeechT5ForTextToSpeech(OVModelForTextToSpeechSeq2Seq):
     """
@@ -733,3 +749,62 @@ def reshape(self, *args, **kwargs):
 
     def can_generate(self) -> bool:
         return True
+
+    def preprocess_input(
+        self, text: str, voice: str = "af_heart", speed: float = 1.0, lang_code: str = "a", **kwargs
+    ) -> dict:
+        """
+        Preprocess a text string into model inputs for Kokoro TTS.
+
+        Uses the ``kokoro`` and ``misaki`` packages for grapheme-to-phoneme
+        conversion and phoneme tokenization.
+
+        Args:
+            text: The input text to synthesize.
+            voice: Name of a voice preset (e.g., ``"af_heart"``).
+            speed: Speed factor (default 1.0); returned unchanged.
+            lang_code: Language code for G2P (default ``"a"`` for American English).
+            **kwargs: Accepted for interface compatibility; ignored here.
+
+        Returns:
+            Dictionary with ``input_ids``, ``ref_s``, and ``speed`` ready for
+            ``generate()`` or ``forward()``.
+
+        Raises:
+            ImportError: If the ``kokoro`` package cannot be imported.
+            ValueError: If the config has no ``vocab``, G2P yields no phonemes, or the
+                phoneme string is longer than the voice pack supports.
+        """
+        try:
+            from kokoro import KPipeline
+        except ImportError as err:
+            raise ImportError(
+                "The `kokoro` and `misaki` packages are required for text preprocessing. "
+                "Install them with: pip install kokoro misaki[en]"
+            ) from err
+
+        vocab = getattr(self.config, "vocab", None)
+        if vocab is None:
+            raise ValueError("Model config does not contain 'vocab'. Cannot tokenize phonemes.")
+
+        pipeline = KPipeline(lang_code=lang_code, model=False)
+
+        # G2P: prefer token-derived phonemes; when g2p returns no token list, use its phoneme string.
+        raw_phonemes, tokens = pipeline.g2p(text)
+        phonemes = KPipeline.tokens_to_ps(tokens) if tokens is not None else raw_phonemes
+        if not phonemes:
+            raise ValueError(f"G2P produced no phonemes for input text: {text!r}")
+
+        # The voice pack is indexed by phoneme count, so validate the length before the lookup.
+        voice_pack = pipeline.load_voice(voice)
+        if len(phonemes) > len(voice_pack):
+            raise ValueError(
+                f"Input produced {len(phonemes)} phonemes but voice {voice!r} supports at most "
+                f"{len(voice_pack)}. Split the text into shorter chunks."
+            )
+
+        # Tokenize: drop phonemes missing from the vocab, then wrap the IDs with 0 on both ends.
+        token_ids = [i for i in map(vocab.get, phonemes) if i is not None]
+        input_ids = torch.LongTensor([[0, *token_ids, 0]])
+
+        return {"input_ids": input_ids, "ref_s": voice_pack[len(phonemes) - 1], "speed": speed}