Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,7 @@ Here is the list of the supported architectures :
- All Transformer and CLIP-based models.

## [OpenCLIP](https://github.com/mlfoundations/open_clip)
- All CLIP-based models
- All CLIP-based models

## [Kokoro](https://github.com/hexgrad/kokoro)
- Kokoro-82M (text-to-speech)
5 changes: 5 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
)
from optimum.intel.utils.modeling_utils import (
_infer_library_from_model_name_or_path,
_KokoroForTextToSpeech,
_OpenClipForZeroShotImageClassification,
)

Expand Down Expand Up @@ -86,6 +87,8 @@ def infer_task(
if task == "auto":
if library_name == "open_clip":
task = "zero-shot-image-classification"
elif library_name == "kokoro":
task = "text-to-audio"
else:
try:
task = TasksManager._infer_task_from_model_name_or_path(
Expand Down Expand Up @@ -471,6 +474,8 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
try:
if library_name == "open_clip":
model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
elif library_name == "kokoro":
model = _KokoroForTextToSpeech.from_pretrained(model_name_or_path, cache_dir=cache_dir, token=token)
else:
# remote code models like phi3_v internvl2, minicpmv, internvl2, nanollava, maira2 should be loaded using AutoModelForCausalLM and not AutoModelForImageTextToText
# TODO: use config.auto_map to load remote code models instead (for other models we can directly use config.architectures)
Expand Down
116 changes: 109 additions & 7 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,89 @@ def export_models(
return outputs


def _save_kokoro_config_and_assets(model, output: Path):
    """Save the Kokoro model config.json and export auxiliary assets.

    Writes, next to the converted model in ``output``:
      * ``config.json`` built from the public attributes of ``model.config``,
      * ``data/`` with misaki G2P data files downloaded from GitHub (best effort),
      * ``voices/`` with each ``voices/*.pt`` style tensor from the source HF
        repo re-serialized as a raw float32 ``.bin`` file.

    All network-dependent steps are best effort: failures are logged as
    warnings and never abort the export.

    Args:
        model: the loaded Kokoro model; expected to carry ``_kokoro_repo_id``
            (set at load time) pointing at the source Hugging Face repo.
        output: directory the OpenVINO model is being exported to.
    """
    import json
    import tempfile
    import urllib.request

    import torch
    from huggingface_hub import hf_hub_download, list_repo_files

    repo_id = getattr(model, "_kokoro_repo_id", None)

    # Save config.json with every public attribute of the config.
    config_dict = {key: getattr(model.config, key) for key in vars(model.config) if not key.startswith("_")}
    config_path = output / "config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config_dict, f, indent=2)

    # Without the source repo id we cannot locate the voice files.
    if repo_id is None:
        return

    # Export voice embeddings to .bin format
    voices_dir = output / "voices"
    voices_dir.mkdir(parents=True, exist_ok=True)

    try:
        repo_files = list_repo_files(repo_id=repo_id)
    except Exception:
        logger.warning(f"Could not list files for {repo_id}. Skipping voice export.")
        return

    # Save misaki data files from GitHub to data dir of output directory
    try:
        MISAKI_DATA_URL = "https://raw.githubusercontent.com/hexgrad/misaki/main/misaki/data"
        MISAKI_DATA_FILES = [
            "gb_gold.json", "gb_silver.json",
            "us_gold.json", "us_silver.json",
            "vi_acronyms.json", "vi_symbols.json", "vi_teencode.json",
            "ja_words.txt",
        ]
        data_out = output / "data"
        data_out.mkdir(parents=True, exist_ok=True)
        for fname in MISAKI_DATA_FILES:
            url = f"{MISAKI_DATA_URL}/{fname}"
            dest = data_out / fname
            try:
                urllib.request.urlretrieve(url, dest)
                logger.info(f"Downloaded misaki data file: {fname}")
            except Exception as e:
                logger.warning(f"Failed to download {fname} from {url}: {e}")
    except Exception as e:
        logger.warning(f"Could not download misaki data files: {e}")

    voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt"))
    if not voice_pt_files:
        return

    logger.info(f"Found {len(voice_pt_files)} voice files. Exporting to {voices_dir} ...")
    with tempfile.TemporaryDirectory(prefix="kokoro_voice_pt_") as tmp_dir:
        for remote_path in voice_pt_files:
            # One broken voice file should not abort the remaining exports.
            try:
                local_pt = hf_hub_download(repo_id=repo_id, filename=remote_path, local_dir=tmp_dir)
                voice_obj = torch.load(local_pt, map_location="cpu")
            except Exception as e:
                logger.warning(f"Failed to download or load {remote_path}: {e}")
                continue

            if torch.is_tensor(voice_obj):
                voice_tensor = voice_obj
            elif isinstance(voice_obj, dict):
                # Some voice files wrap the tensor in a dict; take the first tensor value.
                voice_tensor = next((v for v in voice_obj.values() if torch.is_tensor(v)), None)
                if voice_tensor is None:
                    logger.warning(f"Unsupported voice format in {remote_path}, skipping.")
                    continue
            else:
                logger.warning(f"Unsupported voice format in {remote_path}, skipping.")
                continue

            # Raw little-endian float32 dump, loadable with np.fromfile / ov.Tensor.
            out_bin = voices_dir / f"{Path(remote_path).stem}.bin"
            voice_tensor.detach().cpu().to(torch.float32).contiguous().numpy().tofile(out_bin)
            logger.info(f"Exported {remote_path} -> {out_bin}")


def export_from_model(
model: Union["PreTrainedModel", "ModelMixin", "DiffusionPipeline"],
output: Union[str, Path],
Expand All @@ -576,7 +659,7 @@ def export_from_model(
)

library_name = _infer_library_from_model_or_model_class(model)
if library_name != "open_clip":
if library_name not in ("open_clip", "kokoro"):
TasksManager.standardize_model_attributes(model)

if hasattr(model.config, "export_model_type") and model.config.export_model_type is not None:
Expand All @@ -594,12 +677,15 @@ def export_from_model(
if task is not None and task != "auto":
task = TasksManager.map_from_synonym(task)
else:
try:
task = TasksManager._infer_task_from_model_or_model_class(model=model)
except (ValueError, KeyError) as e:
raise RuntimeError(
f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
if library_name == "kokoro":
task = "text-to-audio"
else:
try:
task = TasksManager._infer_task_from_model_or_model_class(model=model)
except (ValueError, KeyError) as e:
raise RuntimeError(
f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

if (
not custom_architecture
Expand Down Expand Up @@ -661,6 +747,19 @@ def export_from_model(
model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels
)

if library_name == "kokoro":
custom_architecture = True
export_config_constructor = TasksManager.get_exporter_config_constructor(
model=model, exporter="openvino", task=task, library_name="kokoro"
)
kokoro_export_config = export_config_constructor(model.config, task=task)
custom_export_configs = {"model": kokoro_export_config}

def _get_kokoro_submodels(model):
return {"model": model}

fn_get_submodels = _get_kokoro_submodels

if library_name == "diffusers":
export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
stateful_submodels = False
Expand Down Expand Up @@ -699,6 +798,9 @@ def export_from_model(
if hasattr(preprocess, "save_pretrained"):
preprocess.save_pretrained(output)

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name == "kokoro":
_save_kokoro_config_and_assets(model, output)
files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
# some model configs may have issues with loading without parameters initialization
Expand Down
83 changes: 83 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
from .model_patcher import (
AfmoeModelPatcher,
AquilaModelPatcher,
KokoroModelPatcher,
ArcticModelPatcher,
BaichuanModelPatcher,
BigBirdPegasusModelPatcher,
Expand Down Expand Up @@ -238,6 +239,8 @@
def init_model_configs():
if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
if "kokoro" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["kokoro"] = {}
TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = (
"transformers",
"LlavaForConditionalGeneration",
Expand Down Expand Up @@ -5451,3 +5454,83 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
)

return dummy_inputs


class DummyKokoroInputGenerator(DummyInputGenerator):
    """Produces dummy tracing inputs for the Kokoro TTS model."""

    SUPPORTED_INPUT_NAMES = ("input_ids", "ref_s", "speed")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedConfig,
        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
        **kwargs,
    ):
        self.task = task
        # Kokoro inference is single-utterance, so the dummy batch is fixed to 1.
        self.batch_size = 1
        self.sequence_length = sequence_length
        # Style vector width; falls back to Kokoro-82M's value when absent.
        self.style_dim = getattr(normalized_config, "style_dim", 128)

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        if input_name == "input_ids":
            # NOTE(review): 178 presumably matches the Kokoro phoneme vocab upper bound — confirm.
            tokens = self.random_int_tensor(
                shape=[self.batch_size, self.sequence_length],
                min_value=0,
                max_value=178,
                framework=framework,
                dtype=int_dtype,
            )
            # First and last tokens are forced to 0 (padding at both ends).
            tokens[:, 0] = 0
            tokens[:, -1] = 0
            return tokens
        if input_name == "ref_s":
            # Reference style vector: two concatenated style_dim-sized halves.
            return self.random_float_tensor(
                shape=[self.batch_size, self.style_dim * 2],
                min_value=-1,
                max_value=1,
                framework=framework,
                dtype=float_dtype,
            )
        if input_name == "speed":
            # Integer-valued speed factor, materialized with the float dtype.
            return self.random_int_tensor(
                shape=[1], min_value=1, max_value=10, framework=framework, dtype=float_dtype
            )
        raise ValueError(f"Unsupported input {input_name} for DummyKokoroInputGenerator")


@register_in_tasks_manager(
    "kokoro",
    *["text-to-audio"],
    library_name="kokoro",
)
class KokoroOpenVINOConfig(OnnxConfig):
    """OpenVINO export config for the Kokoro TTS model (text-to-audio task)."""

    DEFAULT_ONNX_OPSET = 14
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyKokoroInputGenerator,)
    NORMALIZED_CONFIG_CLASS = NormalizedConfig
    # Swaps model.forward for forward_with_tokens during tracing.
    _MODEL_PATCHER = KokoroModelPatcher

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "text-to-audio",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        preprocessors: Optional[List[Any]] = None,
    ):
        # Only pins the Kokoro-specific defaults; all work is done by OnnxConfig.
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # A (name, min, max) tuple marks a bounded dynamic dimension; -1 as max
        # means unbounded. This tuple form is handled by _get_input_info in
        # exporters/openvino/utils.py, which builds a Dimension(min, max).
        return {
            "input_ids": {1: ("sequence_length", 2, -1)},
            "ref_s": {1: "style_dim"},
            "speed": {},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # waveform: generated audio samples; phonemes: token sequence used for synthesis.
        return {
            "waveform": {0: "batch_size", 1: "audio_length"},
            "phonemes": {0: "batch_size", 1: "phoneme_length"},
        }
17 changes: 17 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8319,3 +8319,20 @@ def __exit__(self, exc_type, exc_value, traceback):
sparse_moe_block = decoder_layer.mlp
decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward
del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs


class KokoroModelPatcher(ModelPatcher):
    """
    Patches the Kokoro TTS model for OpenVINO export by redirecting forward
    to forward_with_tokens, which takes (input_ids, ref_s, speed) and returns
    (audio_waveform, phonemes).
    """

    def __enter__(self):
        super().__enter__()
        # Keep a handle to the original forward so __exit__ can restore it.
        self._model._orig_forward = self._model.forward
        self._model.forward = self._model.forward_with_tokens

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        # Restore the original forward even if tracing raised.
        self._model.forward = self._model._orig_forward
8 changes: 7 additions & 1 deletion optimum/exporters/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,18 @@ def _get_input_info(
if name in inputs:
named_dims = inputs[name]
for idx, dim_name in named_dims.items():
orig_dim_name = dim_name
if isinstance(orig_dim_name, tuple):
dim_name, min_value, max_value = dim_name
if dim_name in name_to_symbol:
symbol = name_to_symbol[dim_name]
else:
symbol = Symbol()
name_to_symbol[dim_name] = symbol
dim = Dimension(-1)
if isinstance(orig_dim_name, tuple):
dim = Dimension(min_value, max_value)
else:
dim = Dimension(-1)
dim.set_symbol(symbol)
shape[idx] = dim
info = InputInfo(name=name, shape=shape, type=type, example=example)
Expand Down
Loading