Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,7 @@ Here is the list of the supported architectures :
- All Transformer and CLIP-based models.

## [OpenCLIP](https://github.com/mlfoundations/open_clip)
- All CLIP-based models
- All CLIP-based models

## [Kokoro](https://github.com/hexgrad/kokoro)
- Kokoro-82M (text-to-speech)
5 changes: 5 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
)
from optimum.intel.utils.modeling_utils import (
_infer_library_from_model_name_or_path,
_KokoroForTextToSpeech,
_OpenClipForZeroShotImageClassification,
)

Expand Down Expand Up @@ -86,6 +87,8 @@ def infer_task(
if task == "auto":
if library_name == "open_clip":
task = "zero-shot-image-classification"
elif library_name == "kokoro":
task = "text-to-audio"
else:
try:
task = TasksManager._infer_task_from_model_name_or_path(
Expand Down Expand Up @@ -471,6 +474,8 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
try:
if library_name == "open_clip":
model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
elif library_name == "kokoro":
model = _KokoroForTextToSpeech.from_pretrained(model_name_or_path, cache_dir=cache_dir, token=token)
else:
# remote code models like phi3_v internvl2, minicpmv, internvl2, nanollava, maira2 should be loaded using AutoModelForCausalLM and not AutoModelForImageTextToText
# TODO: use config.auto_map to load remote code models instead (for other models we can directly use config.architectures)
Expand Down
116 changes: 109 additions & 7 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,89 @@ def export_models(
return outputs


def _save_kokoro_config_and_assets(model, output: Path):
    """Save the Kokoro model config.json and export auxiliary assets.

    Writes, next to the converted model in ``output``:
      * ``config.json`` built from the public attributes of ``model.config``,
      * ``data/`` with misaki G2P data files downloaded from GitHub (best effort),
      * ``voices/`` with each ``voices/*.pt`` style tensor from the source HF
        repo re-serialized as a raw float32 ``.bin`` file.

    All network-dependent steps are best effort: failures are logged as
    warnings and never abort the export.

    Args:
        model: the loaded Kokoro model; expected to carry ``_kokoro_repo_id``
            (set at load time) pointing at the source Hugging Face repo.
        output: directory the OpenVINO model is being exported to.
    """
    import json
    import tempfile
    import urllib.request

    import torch
    from huggingface_hub import hf_hub_download, list_repo_files

    repo_id = getattr(model, "_kokoro_repo_id", None)

    # Save config.json with every public attribute of the config.
    config_dict = {key: getattr(model.config, key) for key in vars(model.config) if not key.startswith("_")}
    config_path = output / "config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config_dict, f, indent=2)

    # Without the source repo id we cannot locate the voice files.
    if repo_id is None:
        return

    # Export voice embeddings to .bin format
    voices_dir = output / "voices"
    voices_dir.mkdir(parents=True, exist_ok=True)

    try:
        repo_files = list_repo_files(repo_id=repo_id)
    except Exception:
        logger.warning(f"Could not list files for {repo_id}. Skipping voice export.")
        return

    # Save misaki data files from GitHub to data dir of output directory
    try:
        MISAKI_DATA_URL = "https://raw.githubusercontent.com/hexgrad/misaki/main/misaki/data"
        MISAKI_DATA_FILES = [
            "gb_gold.json", "gb_silver.json",
            "us_gold.json", "us_silver.json",
            "vi_acronyms.json", "vi_symbols.json", "vi_teencode.json",
            "ja_words.txt",
        ]
        data_out = output / "data"
        data_out.mkdir(parents=True, exist_ok=True)
        for fname in MISAKI_DATA_FILES:
            url = f"{MISAKI_DATA_URL}/{fname}"
            dest = data_out / fname
            try:
                urllib.request.urlretrieve(url, dest)
                logger.info(f"Downloaded misaki data file: {fname}")
            except Exception as e:
                logger.warning(f"Failed to download {fname} from {url}: {e}")
    except Exception as e:
        logger.warning(f"Could not download misaki data files: {e}")

    voice_pt_files = sorted(path for path in repo_files if path.startswith("voices/") and path.endswith(".pt"))
    if not voice_pt_files:
        return

    logger.info(f"Found {len(voice_pt_files)} voice files. Exporting to {voices_dir} ...")
    with tempfile.TemporaryDirectory(prefix="kokoro_voice_pt_") as tmp_dir:
        for remote_path in voice_pt_files:
            # One broken voice file should not abort the remaining exports.
            try:
                local_pt = hf_hub_download(repo_id=repo_id, filename=remote_path, local_dir=tmp_dir)
                voice_obj = torch.load(local_pt, map_location="cpu")
            except Exception as e:
                logger.warning(f"Failed to download or load {remote_path}: {e}")
                continue

            if torch.is_tensor(voice_obj):
                voice_tensor = voice_obj
            elif isinstance(voice_obj, dict):
                # Some voice files wrap the tensor in a dict; take the first tensor value.
                voice_tensor = next((v for v in voice_obj.values() if torch.is_tensor(v)), None)
                if voice_tensor is None:
                    logger.warning(f"Unsupported voice format in {remote_path}, skipping.")
                    continue
            else:
                logger.warning(f"Unsupported voice format in {remote_path}, skipping.")
                continue

            # Raw little-endian float32 dump, loadable with np.fromfile / ov.Tensor.
            out_bin = voices_dir / f"{Path(remote_path).stem}.bin"
            voice_tensor.detach().cpu().to(torch.float32).contiguous().numpy().tofile(out_bin)
            logger.info(f"Exported {remote_path} -> {out_bin}")


def export_from_model(
model: Union["PreTrainedModel", "ModelMixin", "DiffusionPipeline"],
output: Union[str, Path],
Expand All @@ -576,7 +659,7 @@ def export_from_model(
)

library_name = _infer_library_from_model_or_model_class(model)
if library_name != "open_clip":
if library_name not in ("open_clip", "kokoro"):
TasksManager.standardize_model_attributes(model)

if hasattr(model.config, "export_model_type") and model.config.export_model_type is not None:
Expand All @@ -594,12 +677,15 @@ def export_from_model(
if task is not None and task != "auto":
task = TasksManager.map_from_synonym(task)
else:
try:
task = TasksManager._infer_task_from_model_or_model_class(model=model)
except (ValueError, KeyError) as e:
raise RuntimeError(
f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
if library_name == "kokoro":
task = "text-to-audio"
else:
try:
task = TasksManager._infer_task_from_model_or_model_class(model=model)
except (ValueError, KeyError) as e:
raise RuntimeError(
f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

if (
not custom_architecture
Expand Down Expand Up @@ -661,6 +747,19 @@ def export_from_model(
model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels
)

if library_name == "kokoro":
custom_architecture = True
export_config_constructor = TasksManager.get_exporter_config_constructor(
model=model, exporter="openvino", task=task, library_name="kokoro"
)
kokoro_export_config = export_config_constructor(model.config, task=task)
custom_export_configs = {"model": kokoro_export_config}

def _get_kokoro_submodels(model):
return {"model": model}

fn_get_submodels = _get_kokoro_submodels

if library_name == "diffusers":
export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
stateful_submodels = False
Expand Down Expand Up @@ -699,6 +798,9 @@ def export_from_model(
if hasattr(preprocess, "save_pretrained"):
preprocess.save_pretrained(output)

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name == "kokoro":
_save_kokoro_config_and_assets(model, output)
files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
# some model configs may have issues with loading without parameters initialization
Expand Down
83 changes: 83 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
from .model_patcher import (
AfmoeModelPatcher,
AquilaModelPatcher,
KokoroModelPatcher,
ArcticModelPatcher,
BaichuanModelPatcher,
BigBirdPegasusModelPatcher,
Expand Down Expand Up @@ -238,6 +239,8 @@
def init_model_configs():
if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {}
if "kokoro" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES:
TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["kokoro"] = {}
TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = (
"transformers",
"LlavaForConditionalGeneration",
Expand Down Expand Up @@ -5451,3 +5454,83 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
)

return dummy_inputs


class DummyKokoroInputGenerator(DummyInputGenerator):
    """Produces dummy tracing inputs for the Kokoro TTS model."""

    SUPPORTED_INPUT_NAMES = ("input_ids", "ref_s", "speed")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedConfig,
        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
        **kwargs,
    ):
        self.task = task
        # Kokoro inference is single-utterance, so the dummy batch is fixed to 1.
        self.batch_size = 1
        self.sequence_length = sequence_length
        # Style vector width; falls back to Kokoro-82M's value when absent.
        self.style_dim = getattr(normalized_config, "style_dim", 128)

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        if input_name == "input_ids":
            # NOTE(review): 178 presumably matches the Kokoro phoneme vocab upper bound — confirm.
            tokens = self.random_int_tensor(
                shape=[self.batch_size, self.sequence_length],
                min_value=0,
                max_value=178,
                framework=framework,
                dtype=int_dtype,
            )
            # First and last tokens are forced to 0 (padding at both ends).
            tokens[:, 0] = 0
            tokens[:, -1] = 0
            return tokens
        if input_name == "ref_s":
            # Reference style vector: two concatenated style_dim-sized halves.
            return self.random_float_tensor(
                shape=[self.batch_size, self.style_dim * 2],
                min_value=-1,
                max_value=1,
                framework=framework,
                dtype=float_dtype,
            )
        if input_name == "speed":
            # Integer-valued speed factor, materialized with the float dtype.
            return self.random_int_tensor(
                shape=[1], min_value=1, max_value=10, framework=framework, dtype=float_dtype
            )
        raise ValueError(f"Unsupported input {input_name} for DummyKokoroInputGenerator")


@register_in_tasks_manager(
    "kokoro",
    *["text-to-audio"],
    library_name="kokoro",
)
class KokoroOpenVINOConfig(OnnxConfig):
    """OpenVINO export config for the Kokoro TTS model (text-to-audio task)."""

    DEFAULT_ONNX_OPSET = 14
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyKokoroInputGenerator,)
    NORMALIZED_CONFIG_CLASS = NormalizedConfig
    # Swaps model.forward for forward_with_tokens during tracing.
    _MODEL_PATCHER = KokoroModelPatcher

    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "text-to-audio",
        int_dtype: str = "int64",
        float_dtype: str = "fp32",
        preprocessors: Optional[List[Any]] = None,
    ):
        # Only pins the Kokoro-specific defaults; all work is done by OnnxConfig.
        super().__init__(
            config=config,
            task=task,
            int_dtype=int_dtype,
            float_dtype=float_dtype,
            preprocessors=preprocessors,
        )

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # A (name, min, max) tuple marks a bounded dynamic dimension; -1 as max
        # means unbounded. This tuple form is handled by _get_input_info in
        # exporters/openvino/utils.py, which builds a Dimension(min, max).
        return {
            "input_ids": {1: ("sequence_length", 2, -1)},
            "ref_s": {1: "style_dim"},
            "speed": {},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # waveform: generated audio samples; phonemes: token sequence used for synthesis.
        return {
            "waveform": {0: "batch_size", 1: "audio_length"},
            "phonemes": {0: "batch_size", 1: "phoneme_length"},
        }
17 changes: 17 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8319,3 +8319,20 @@ def __exit__(self, exc_type, exc_value, traceback):
sparse_moe_block = decoder_layer.mlp
decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward
del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs


class KokoroModelPatcher(ModelPatcher):
    """
    Patches the Kokoro TTS model for OpenVINO export by redirecting forward
    to forward_with_tokens, which takes (input_ids, ref_s, speed) and returns
    (audio_waveform, phonemes).
    """

    def __enter__(self):
        super().__enter__()
        # Keep a handle to the original forward so __exit__ can restore it.
        self._model._orig_forward = self._model.forward
        self._model.forward = self._model.forward_with_tokens

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        # Restore the original forward even if tracing raised.
        self._model.forward = self._model._orig_forward
8 changes: 7 additions & 1 deletion optimum/exporters/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,18 @@ def _get_input_info(
if name in inputs:
named_dims = inputs[name]
for idx, dim_name in named_dims.items():
orig_dim_name = dim_name
if isinstance(orig_dim_name, tuple):
dim_name, min_value, max_value = dim_name
if dim_name in name_to_symbol:
symbol = name_to_symbol[dim_name]
else:
symbol = Symbol()
name_to_symbol[dim_name] = symbol
dim = Dimension(-1)
if isinstance(orig_dim_name, tuple):
dim = Dimension(min_value, max_value)
else:
dim = Dimension(-1)
dim.set_symbol(symbol)
shape[idx] = dim
info = InputInfo(name=name, shape=shape, type=type, example=example)
Expand Down
Loading