From d3abd870491ef3b4cc687b757142fb951d55607e Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Mon, 30 Mar 2026 05:54:49 +0000 Subject: [PATCH 1/4] Support Kokoro TTS model --- docs/source/openvino/models.mdx | 5 +- optimum/exporters/openvino/__main__.py | 5 + optimum/exporters/openvino/convert.py | 96 +++++++- optimum/exporters/openvino/model_configs.py | 83 +++++++ optimum/exporters/openvino/model_patcher.py | 17 ++ optimum/exporters/openvino/utils.py | 8 +- .../intel/openvino/modeling_text2speech.py | 209 +++++++++++++++++- optimum/intel/utils/import_utils.py | 6 + optimum/intel/utils/modeling_utils.py | 75 +++++++ tests/openvino/test_export.py | 20 ++ tests/openvino/utils_tests.py | 1 + 11 files changed, 512 insertions(+), 13 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index c11505fa4d..9fd9f10410 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -180,4 +180,7 @@ Here is the list of the supported architectures : - All Transformer and CLIP-based models. ## [OpenCLIP](https://github.com/mlfoundations/open_clip) -- All CLIP-based models \ No newline at end of file +- All CLIP-based models + +## [Kokoro](https://github.com/hexgrad/kokoro) +- Kokoro-82M (text-to-speech) \ No newline at end of file diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index eb763b45d4..1b862dce21 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -39,6 +39,7 @@ ) from optimum.intel.utils.modeling_utils import ( _infer_library_from_model_name_or_path, + _KokoroForTextToSpeech, _OpenClipForZeroShotImageClassification, ) @@ -86,6 +87,8 @@ def infer_task( if task == "auto": if library_name == "open_clip": task = "zero-shot-image-classification" + elif library_name == "kokoro": + task = "text-to-audio" else: try: task = TasksManager._infer_task_from_model_name_or_path( @@ -471,6 +474,8 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs): try: if library_name == "open_clip": model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir) + elif library_name == "kokoro": + model = _KokoroForTextToSpeech.from_pretrained(model_name_or_path, cache_dir=cache_dir, token=token) else: # remote code models like phi3_v internvl2, minicpmv, internvl2, nanollava, maira2 should be loaded using AutoModelForCausalLM and not AutoModelForImageTextToText # TODO: use config.auto_map to load remote code models instead (for other models we can directly use config.architectures) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 60d90f53e0..a35ee70f0d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -552,6 +552,68 @@ def export_models( return outputs +def _save_kokoro_config_and_assets(model, output: Path): + """Save Kokoro model config.json and export voice embeddings.""" + import json + import tempfile + + import numpy as np + from huggingface_hub import hf_hub_download, list_repo_files + + repo_id = getattr(model, "_kokoro_repo_id", None) + + # Save config.json + config_dict = {} + for key in vars(model.config): + if not key.startswith("_"): + config_dict[key] = getattr(model.config, key) + config_path = output / "config.json" + with open(config_path, "w", encoding="utf-8") as f: + json.dump(config_dict, f, indent=2) + + if repo_id is None: + return + + if True: + return + + # Export voice embeddings to .bin format + voices_dir = output / "voices" + voices_dir.mkdir(parents=True, exist_ok=True) + + try: + repo_files = list_repo_files(repo_id=repo_id) + except Exception: + logger.warning(f"Could not list files for {repo_id}. Skipping voice export.") + return + + voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt")) + if not voice_pt_files: + return + + logger.info(f"Found {len(voice_pt_files)} voice files. Exporting to {voices_dir} ...") + with tempfile.TemporaryDirectory(prefix="kokoro_voice_pt_") as tmp_dir: + for remote_path in voice_pt_files: + local_pt = hf_hub_download(repo_id=repo_id, filename=remote_path, local_dir=tmp_dir) + voice_name = Path(remote_path).stem + out_bin = voices_dir / f"{voice_name}.bin" + + import torch + + voice_obj = torch.load(local_pt, map_location="cpu") + if torch.is_tensor(voice_obj): + voice_tensor = voice_obj + elif isinstance(voice_obj, dict): + voice_tensor = next(v for v in voice_obj.values() if torch.is_tensor(v)) + else: + logger.warning(f"Unsupported voice format in {remote_path}, skipping.") + continue + + voice_tensor = voice_tensor.detach().cpu().to(torch.float32).contiguous() + np.asarray(voice_tensor.numpy(), dtype=np.float32).tofile(out_bin) + logger.info(f"Exported {remote_path} -> {out_bin}") + + def export_from_model( model: Union["PreTrainedModel", "ModelMixin", "DiffusionPipeline"], output: Union[str, Path], @@ -576,7 +638,7 @@ def export_from_model( ) library_name = _infer_library_from_model_or_model_class(model) - if library_name != "open_clip": + if library_name not in ("open_clip", "kokoro"): TasksManager.standardize_model_attributes(model) if hasattr(model.config, "export_model_type") and model.config.export_model_type is not None: @@ -594,12 +656,15 @@ def export_from_model( if task is not None and task != "auto": task = TasksManager.map_from_synonym(task) else: - try: - task = TasksManager._infer_task_from_model_or_model_class(model=model) - except (ValueError, KeyError) as e: - raise RuntimeError( - f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) + if library_name == "kokoro": + task = "text-to-audio" + else: + try: + task = TasksManager._infer_task_from_model_or_model_class(model=model) + except (ValueError, KeyError) as e: + raise RuntimeError( + f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + ) if ( not custom_architecture @@ -661,6 +726,20 @@ def export_from_model( model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels ) + if library_name == "kokoro": + custom_architecture = True + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=model, exporter="openvino", task=task, library_name="kokoro" + ) + kokoro_export_config = export_config_constructor(model.config, task=task) + custom_export_configs = {"model": kokoro_export_config} + + def _get_kokoro_submodels(model): + return {"model": model} + + fn_get_submodels = _get_kokoro_submodels + + #raise "Exception" if library_name == "diffusers": export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") stateful_submodels = False @@ -699,6 +778,9 @@ def export_from_model( if hasattr(preprocess, "save_pretrained"): preprocess.save_pretrained(output) + files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] + elif library_name == "kokoro": + _save_kokoro_config_and_assets(model, output) files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": # some model configs may have issues with loading without parameters initialization diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0624624a77..a37eb516c2 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -132,6 +132,7 @@ from .model_patcher import ( AfmoeModelPatcher, AquilaModelPatcher, + KokoroModelPatcher, ArcticModelPatcher, BaichuanModelPatcher, BigBirdPegasusModelPatcher, @@ -238,6 +239,8 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} + if "kokoro" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: + TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["kokoro"] = {} TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( "transformers", "LlavaForConditionalGeneration", @@ -5451,3 +5454,83 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): ) return dummy_inputs + + +class DummyKokoroInputGenerator(DummyInputGenerator): + """Generates dummy inputs for the Kokoro TTS model.""" + + SUPPORTED_INPUT_NAMES = ("input_ids", "ref_s", "speed") + + def __init__( + self, + task: str, + normalized_config: NormalizedConfig, + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + self.task = task + self.batch_size = 1 + self.sequence_length = sequence_length + self.style_dim = getattr(normalized_config, "style_dim", 128) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "input_ids": + shape = [self.batch_size, self.sequence_length] + input_ids_value = self.random_int_tensor(shape=shape, min_value=0, max_value=178, framework=framework, dtype=int_dtype) + input_ids_value[:, 0] = 0 + input_ids_value[:, -1] = 0 + return input_ids_value + elif input_name == "ref_s": + shape = [self.batch_size, self.style_dim * 2] + return self.random_float_tensor( + shape=shape, min_value=-1, max_value=1, framework=framework, dtype=float_dtype + ) + elif input_name == "speed": + return self.random_int_tensor( + shape=[1], min_value=1, max_value=10, framework=framework, dtype=float_dtype + ) + else: + raise ValueError(f"Unsupported input {input_name} for DummyKokoroInputGenerator") + + +@register_in_tasks_manager( + "kokoro", + *["text-to-audio"], + library_name="kokoro", +) +class KokoroOpenVINOConfig(OnnxConfig): + DEFAULT_ONNX_OPSET = 14 + DUMMY_INPUT_GENERATOR_CLASSES = (DummyKokoroInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedConfig + _MODEL_PATCHER = KokoroModelPatcher + + def __init__( + self, + config: "PretrainedConfig", + task: str = "text-to-audio", + int_dtype: str = "int64", + float_dtype: str = "fp32", + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {1: ("sequence_length", 2, -1)}, + "ref_s": {1: "style_dim"}, + "speed": {}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "waveform": {0: "batch_size", 1: "audio_length"}, + "phonemes": {0: "batch_size", 1: "phoneme_length"}, + } diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 32dd2d6c6d..2cda2a8d74 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8319,3 +8319,20 @@ def __exit__(self, exc_type, exc_value, traceback): sparse_moe_block = decoder_layer.mlp decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs + + +class KokoroModelPatcher(ModelPatcher): + """ + Patches the Kokoro TTS model for OpenVINO export by redirecting forward + to forward_with_tokens, which takes (input_ids, ref_s, speed) and returns + (audio_waveform, phonemes). + """ + + def __enter__(self): + super().__enter__() + self._model._orig_forward = self._model.forward + self._model.forward = self._model.forward_with_tokens + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index af2f1edaba..51f3b96ac5 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -108,12 +108,18 @@ def _get_input_info( if name in inputs: named_dims = inputs[name] for idx, dim_name in named_dims.items(): + orig_dim_name = dim_name + if isinstance(orig_dim_name, tuple): + dim_name, min_value, max_value = dim_name if dim_name in name_to_symbol: symbol = name_to_symbol[dim_name] else: symbol = Symbol() name_to_symbol[dim_name] = symbol - dim = Dimension(-1) + if isinstance(orig_dim_name, tuple): + dim = Dimension(min_value, max_value) + else: + dim = Dimension(-1) dim.set_symbol(symbol) shape[idx] = dim info = InputInfo(name=name, shape=shape, type=type, example=example) diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py index 749edef73c..e27a6662a6 100644 --- a/optimum/intel/openvino/modeling_text2speech.py +++ b/optimum/intel/openvino/modeling_text2speech.py @@ -160,6 +160,24 @@ class OVModelForTextToSpeechSeq2Seq(OVModelForSeq2SeqLM): auto_model_class = AutoModelForTextToSpectrogram export_feature = "text-to-audio" + @classmethod + def from_pretrained(cls, model_id, **kwargs): + # For Kokoro models, load config via PretrainedConfig since AutoConfig + # does not recognize the "kokoro" model_type. + if kwargs.get("config") is None: + try: + config = PretrainedConfig.from_pretrained( + model_id, + cache_dir=kwargs.get("cache_dir", HUGGINGFACE_HUB_CACHE), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + ) + if getattr(config, "model_type", None) == "kokoro": + kwargs["config"] = config + except Exception: + pass + return super().from_pretrained(model_id, **kwargs) + @classmethod def _from_pretrained( cls, @@ -167,12 +185,14 @@ def _from_pretrained( config: "PretrainedConfig", **kwargs, ): - if "SpeechT5ForTextToSpeech" in config.architectures: + if getattr(config, "model_type", None) == "kokoro": + return _OVModelForKokoroTextToSpeech._from_pretrained(model_id, config, **kwargs) + elif getattr(config, "architectures", None) and "SpeechT5ForTextToSpeech" in config.architectures: return _OVModelForSpeechT5ForTextToSpeech._from_pretrained(model_id, config, **kwargs) else: - raise ValueError(f"{config.architectures} are not supported text-to-audio model using OpenVINO") - - return super()._from_pretrained(model_id, config, **kwargs) + raise ValueError( + f"{getattr(config, 'architectures', None)} are not supported text-to-audio model using OpenVINO" + ) def reshape(self, *args, **kwargs): logger.warning("Static shapes are not supported for this model.") @@ -522,3 +542,184 @@ def generate( ) outputs = (*outputs, cross_attentions) return outputs + + +class _OVModelForKokoroTextToSpeech(OVBaseModel): + """ + OpenVINO inference model for Kokoro TTS. + + Kokoro is a single-model architecture with inputs (input_ids, ref_s, speed) and + outputs (waveform, phonemes). Voice embeddings are stored as .bin files in a voices/ subdirectory. + """ + + export_feature = "text-to-audio" + auto_model_class = AutoModelForTextToSpectrogram + + def __init__(self, model: openvino.Model, config: PretrainedConfig = None, **kwargs): + # Kokoro model does not support dynamic shapes due to Squeeze op limitations, + # so we skip the automatic reshape to dynamic shapes. + kwargs.setdefault("dynamic_shapes", False) + super().__init__(model, config, **kwargs) + self._voices = {} + self._voices_dir = None + + def _reshape(self, model, batch_size, sequence_length, height=None, width=None): + # Kokoro has inputs with different ranks (speed is 1D), so only reshape + # dimensions that exist in each input. + shapes = {} + for inp in model.inputs: + shape = inp.get_partial_shape() + if len(shape) >= 1: + shape[0] = batch_size + if len(shape) >= 2: + shape[1] = sequence_length + shapes[inp] = shape + model.reshape(shapes) + return model + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: "PretrainedConfig", + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + local_files_only: bool = False, + load_in_8bit: bool = False, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + trust_remote_code: bool = False, + **kwargs, + ): + model = super()._from_pretrained( + model_id, + config=config, + token=token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + local_files_only=local_files_only, + load_in_8bit=load_in_8bit, + quantization_config=quantization_config, + trust_remote_code=trust_remote_code, + **kwargs, + ) + # Locate voices directory + if model.model_save_dir is not None: + voices_dir = Path(model.model_save_dir) / "voices" + if voices_dir.is_dir(): + model._voices_dir = voices_dir + return model + + def _load_voice(self, voice_name: str) -> np.ndarray: + """Load a voice embedding by name, caching results.""" + if voice_name in self._voices: + return self._voices[voice_name] + + if self._voices_dir is None: + raise FileNotFoundError("No voices directory found in model directory.") + + voice_path = self._voices_dir / f"{voice_name}.bin" + if not voice_path.exists(): + raise FileNotFoundError( + f"Voice '{voice_name}' not found at {voice_path}. " + f"Available voices: {[f.stem for f in self._voices_dir.glob('*.bin')]}" + ) + + voice_data = np.fromfile(voice_path, dtype=np.float32) + self._voices[voice_name] = voice_data + return voice_data + + @property + def available_voices(self) -> List[str]: + """Returns list of available voice names.""" + if self._voices_dir is None or not self._voices_dir.is_dir(): + return [] + return sorted(f.stem for f in self._voices_dir.glob("*.bin")) + + def forward( + self, + input_ids: Union[torch.Tensor, np.ndarray], + ref_s: Union[torch.Tensor, np.ndarray], + speed: Union[torch.Tensor, np.ndarray, float], + **kwargs, + ) -> ModelOutput: + """ + Run inference on the Kokoro model. + + Args: + input_ids: Token IDs of shape [batch_size, sequence_length]. + ref_s: Voice style embedding of shape [batch_size, style_dim]. + speed: Speed factor, scalar or array. + + Returns: + ModelOutput with `waveform` and `phonemes`. + """ + self.compile() + + if isinstance(input_ids, torch.Tensor): + input_ids = input_ids.numpy() + if isinstance(ref_s, torch.Tensor): + ref_s = ref_s.numpy() + if isinstance(speed, (int, float)): + speed = np.array([speed], dtype=np.float32) + elif isinstance(speed, torch.Tensor): + speed = speed.numpy() + + inputs = { + "input_ids": input_ids, + "ref_s": ref_s, + "speed": speed, + } + + outputs = self._inference(inputs) + waveform = torch.from_numpy(outputs[0]) + phonemes = torch.from_numpy(outputs[1]) + return ModelOutput(waveform=waveform, phonemes=phonemes) + + def generate( + self, + input_ids: Union[torch.Tensor, np.ndarray], + voice: Optional[str] = None, + ref_s: Optional[Union[torch.Tensor, np.ndarray]] = None, + speed: float = 1.0, + **kwargs, + ) -> torch.FloatTensor: + """ + Generate audio waveform from token IDs. + + Args: + input_ids: Token IDs of shape [batch_size, sequence_length]. + voice: Name of a voice preset (e.g., "af_heart"). Ignored if ref_s is provided. + ref_s: Voice style embedding. If None, loaded from voice preset. + speed: Speed factor (default 1.0). + + Returns: + Audio waveform tensor. + """ + if ref_s is None: + if voice is None: + voice = "af_heart" + voice_data = self._load_voice(voice) + ref_s = voice_data.reshape(1, -1) + + if isinstance(input_ids, torch.Tensor): + if input_ids.dim() == 1: + input_ids = input_ids.unsqueeze(0) + elif isinstance(input_ids, np.ndarray): + if input_ids.ndim == 1: + input_ids = input_ids.reshape(1, -1) + + if isinstance(ref_s, np.ndarray) and ref_s.ndim == 1: + ref_s = ref_s.reshape(1, -1) + + result = self.forward(input_ids=input_ids, ref_s=ref_s, speed=speed) + return result.waveform + + def reshape(self, *args, **kwargs): + logger.warning("Static shapes are not supported for Kokoro model.") + return self + + def can_generate(self) -> bool: + return True diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..91ce1a2038 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -119,6 +119,8 @@ pass +_kokoro_available = importlib.util.find_spec("kokoro") is not None + _safetensors_version = "N/A" _safetensors_available = importlib.util.find_spec("safetensors") is not None if _safetensors_available: @@ -308,6 +310,10 @@ def is_open_clip_available(): return _open_clip_available +def is_kokoro_available(): + return _kokoro_available + + def is_safetensors_available(): return _safetensors_available diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..31c3168979 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -245,6 +245,30 @@ def bind_cores_for_best_perf(): logger.info(f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}") +def _is_kokoro_model( + model_name_or_path: Union[str, Path], + all_files: list, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, +) -> bool: + """Detect Kokoro TTS models by checking for 'istftnet' key in config.json.""" + if "config.json" not in all_files: + return False + try: + config_path = Path(model_name_or_path) + if config_path.is_dir(): + config_file = config_path / "config.json" + else: + config_file = hf_hub_download( + repo_id=str(model_name_or_path), filename="config.json", cache_dir=cache_dir, token=token + ) + with open(config_file, "r", encoding="utf-8") as f: + config = json.load(f) + return "istftnet" in config and "plbert" in config + except Exception: + return False + + def _infer_library_from_model_name_or_path( model_name_or_path: Union[str, Path], subfolder: str = "", @@ -257,6 +281,8 @@ def _infer_library_from_model_name_or_path( ) if "open_clip_config.json" in all_files or "open_clip_pytorch_model.bin" in all_files: library_name = "open_clip" + elif _is_kokoro_model(model_name_or_path, all_files, cache_dir=cache_dir, token=token): + library_name = "kokoro" else: library_name = TasksManager._infer_library_from_model_name_or_path( model_name_or_path=model_name_or_path, cache_dir=cache_dir @@ -273,6 +299,8 @@ def _infer_library_from_model_or_model_class( return library_name if model.__module__.startswith("open_clip"): library_name = "open_clip" + elif model.__module__.startswith("kokoro") or getattr(model, "_kokoro_model", False): + library_name = "kokoro" elif model.__module__.startswith("optimum"): # for wrapped models like timm in optimum.intel.openvino.modeling_timm library_name = TasksManager._infer_library_from_model_or_model_class(model=model.model) @@ -493,3 +521,50 @@ def from_pretrained( setattr(model.config, "export_model_type", "clip") return model + + +class _KokoroForTextToSpeech: + """Wrapper for loading Kokoro TTS model with a config conforming to optimum-intel expectations.""" + + @classmethod + def from_pretrained( + cls, + model_name_or_path: Union[str, Path], + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token: Optional[Union[bool, str]] = None, + **kwargs, + ): + try: + from kokoro import KPipeline + except ImportError: + raise ImportError( + "To load a Kokoro TTS model, the `kokoro` package is required. " + "Please install it with `pip install kokoro`." + ) + + pipeline = KPipeline(lang_code="a", repo_id=str(model_name_or_path)) + model = pipeline.model + model._kokoro_model = True + model._kokoro_repo_id = str(model_name_or_path) + + # Load config.json and create a PretrainedConfig-like object + config_path = Path(model_name_or_path) + if config_path.is_dir(): + config_file = config_path / "config.json" + else: + config_file = hf_hub_download( + repo_id=str(model_name_or_path), filename="config.json", cache_dir=cache_dir, token=token + ) + + with open(config_file, "r", encoding="utf-8") as f: + config_dict = json.load(f) + + config = PretrainedConfig() + config.model_type = "kokoro" + config.export_model_type = "kokoro" + for key, value in config_dict.items(): + setattr(config, key, value) + + model.config = config + + return model diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 9519cea1ec..b6187a674b 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -13,6 +13,7 @@ # limitations under the License. +import importlib.util import unittest from pathlib import Path @@ -393,3 +394,22 @@ def test_export_custom_model(self): ov_outputs = ov_model(**tokens) self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4)) self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4)) + + +@unittest.skipUnless( + importlib.util.find_spec("kokoro") is not None, + "kokoro package is not installed", +) +class KokoroExportModelTest(unittest.TestCase): + def test_kokoro_export(self): + model_id = MODEL_NAMES["kokoro"] + with TemporaryDirectory() as tmpdirname: + main_export( + model_name_or_path=model_id, + output=Path(tmpdirname), + task="text-to-audio", + ) + output_path = Path(tmpdirname) + self.assertTrue((output_path / "openvino_model.xml").exists()) + self.assertTrue((output_path / "openvino_model.bin").exists()) + self.assertTrue((output_path / "config.json").exists()) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index fe6d584d2f..40e189ae3e 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -114,6 +114,7 @@ "internlm2": "optimum-intel-internal-testing/tiny-random-internlm2", "internvl_chat": "optimum-intel-internal-testing/tiny-random-internvl2", "jais": "optimum-intel-internal-testing/tiny-random-jais", + "kokoro": "hexgrad/Kokoro-82M", "levit": "optimum-intel-internal-testing/tiny-random-LevitModel", "lfm2": "optimum-intel-internal-testing/tiny-random-lfm2", "longt5": "optimum-intel-internal-testing/tiny-random-longt5", From 605dd93eb3784be4cbbf3083aef351346c8d810e Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Mon, 30 Mar 2026 06:40:10 +0000 Subject: [PATCH 2/4] Save misaki data files --- optimum/exporters/openvino/convert.py | 28 +++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index a35ee70f0d..596782819e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -574,9 +574,6 @@ def _save_kokoro_config_and_assets(model, output: Path): if repo_id is None: return - if True: - return - # Export voice embeddings to .bin format voices_dir = output / "voices" voices_dir.mkdir(parents=True, exist_ok=True) @@ -587,6 +584,30 @@ def _save_kokoro_config_and_assets(model, output: Path): logger.warning(f"Could not list files for {repo_id}. Skipping voice export.") return + # Save misaki data files from GitHub to data dir of output directory + try: + import urllib.request + + MISAKI_DATA_URL = "https://raw.githubusercontent.com/hexgrad/misaki/main/misaki/data" + MISAKI_DATA_FILES = [ + "gb_gold.json", "gb_silver.json", + "us_gold.json", "us_silver.json", + "vi_acronyms.json", "vi_symbols.json", "vi_teencode.json", + "ja_words.txt", + ] + data_out = output / "data" + data_out.mkdir(parents=True, exist_ok=True) + for fname in MISAKI_DATA_FILES: + url = f"{MISAKI_DATA_URL}/{fname}" + dest = data_out / fname + try: + urllib.request.urlretrieve(url, dest) + logger.info(f"Downloaded misaki data file: {fname}") + except Exception as e: + logger.warning(f"Failed to download {fname} from {url}: {e}") + except Exception as e: + logger.warning(f"Could not download misaki data files: {e}") + voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt")) if not voice_pt_files: return @@ -739,7 +760,6 @@ def _get_kokoro_submodels(model): fn_get_submodels = _get_kokoro_submodels - #raise "Exception" if library_name == "diffusers": export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") stateful_submodels = False From 73189825f3a5ecfe5fa5d4c2811949a710b81218 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Fri, 3 Apr 2026 11:45:31 +0000 Subject: [PATCH 3/4] Fix inference --- optimum/intel/openvino/modeling_text2speech.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py index e27a6662a6..5754001f98 100644 --- a/optimum/intel/openvino/modeling_text2speech.py +++ b/optimum/intel/openvino/modeling_text2speech.py @@ -172,12 +172,22 @@ def from_pretrained(cls, model_id, **kwargs): token=kwargs.get("token"), revision=kwargs.get("revision"), ) + # Detect Kokoro models that lack model_type by checking for + # characteristic config keys (same heuristic used by CLI export). + if not getattr(config, "model_type", None): + if hasattr(config, "istftnet") and hasattr(config, "plbert"): + config.model_type = "kokoro" + config.export_model_type = "kokoro" if getattr(config, "model_type", None) == "kokoro": kwargs["config"] = config - except Exception: - pass + except Exception as e: + logger.warning(f"Could not pre-load config for Kokoro detection: {e}") return super().from_pretrained(model_id, **kwargs) + @classmethod + def _export(cls, model_id, config, use_cache=False, **kwargs): + return super()._export(model_id, config, use_cache=use_cache, **kwargs) + @classmethod def _from_pretrained( cls, From 4b42a04f9f4f4b928b2dca1267264c4ee1b0f0bb Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Fri, 3 Apr 2026 12:15:25 +0000 Subject: [PATCH 4/4] Add preprocess_input --- .../intel/openvino/modeling_text2speech.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/optimum/intel/openvino/modeling_text2speech.py b/optimum/intel/openvino/modeling_text2speech.py index 5754001f98..f21c5b5283 100644 --- a/optimum/intel/openvino/modeling_text2speech.py +++ b/optimum/intel/openvino/modeling_text2speech.py @@ -208,6 +208,22 @@ def reshape(self, *args, **kwargs): logger.warning("Static shapes are not supported for this model.") return self + def preprocess_input(self, text: str, **kwargs) -> dict: + """ + Preprocess a text string into model inputs (input_ids and other required tensors). + + Args: + text: The input text to synthesize. + **kwargs: Model-specific arguments (e.g., voice, speed, lang_code for Kokoro). + + Returns: + Dictionary with model inputs ready for `generate()`. + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not implement `preprocess_input`. " + "Use the appropriate model-specific subclass." + ) + class _OVModelForSpeechT5ForTextToSpeech(OVModelForTextToSpeechSeq2Seq): """ @@ -733,3 +749,62 @@ def reshape(self, *args, **kwargs): def can_generate(self) -> bool: return True + + def preprocess_input( + self, + text: str, + voice: str = "af_heart", + speed: float = 1.0, + lang_code: str = "a", + **kwargs, + ) -> dict: + """ + Preprocess a text string into model inputs for Kokoro TTS. + + Uses the ``kokoro`` and ``misaki`` packages for grapheme-to-phoneme + conversion and phoneme tokenization. + + Args: + text: The input text to synthesize. + voice: Name of a voice preset (e.g., ``"af_heart"``). + speed: Speed factor (default 1.0). + lang_code: Language code for G2P (default ``"a"`` for American English). + + Returns: + Dictionary with ``input_ids``, ``ref_s``, and ``speed`` ready for + ``generate()`` or ``forward()``. + """ + try: + from kokoro import KPipeline + except ImportError: + raise ImportError( + "The `kokoro` and `misaki` packages are required for text preprocessing. " + "Install them with: pip install kokoro misaki[en]" + ) + + vocab = getattr(self.config, "vocab", None) + if vocab is None: + raise ValueError("Model config does not contain 'vocab'. Cannot tokenize phonemes.") + + pipeline = KPipeline(lang_code=lang_code, model=False) + + # G2P: text -> phoneme tokens -> phoneme string + _, tokens = pipeline.g2p(text) + phonemes = KPipeline.tokens_to_ps(tokens) + if not phonemes: + raise ValueError(f"G2P produced no phonemes for input text: {text!r}") + + # Tokenize: phoneme string -> token IDs (with BOS/EOS) + input_ids = [vocab.get(p) for p in phonemes] + input_ids = [i for i in input_ids if i is not None] + input_ids = torch.LongTensor([[0, *input_ids, 0]]) + + # Load voice embedding indexed by phoneme length + voice_pack = pipeline.load_voice(voice) + ref_s = voice_pack[len(phonemes) - 1] + + return { + "input_ids": input_ids, + "ref_s": ref_s, + "speed": speed, + }