# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Standalone Paraformer Export Script for OpenVINO

This module provides a standalone export function for Paraformer ASR models
to OpenVINO format, independent of the main optimum-intel export pipeline.

Usage:
    python -m optimum.exporters.openvino.export_paraformer \
        --model /path/to/paraformer/model \
        --output /path/to/output \
        --int8  # optional, for INT8 weight compression

Or programmatically:
    from optimum.exporters.openvino.export_paraformer import export_paraformer
    export_paraformer(model_path, output_path, compress_int8=True)
"""

import argparse
import logging
import shutil
from pathlib import Path
from typing import Union

import torch

logger = logging.getLogger(__name__)

# Auxiliary files shipped next to a FunASR Paraformer checkpoint that the
# runtime pipeline still needs at inference time (CMVN stats, config, tokens).
PARAFORMER_PARAM_FILES = ("am.mvn", "config.yaml", "configuration.json", "seg_dict", "tokens.json")


def export_paraformer(
    model_name_or_path: Union[str, Path],
    output: Union[str, Path],
    device: str = "cpu",
    compress_int8: bool = False,
    compress_fp16: bool = True,
):
    """
    Export a Paraformer ASR model to OpenVINO format.

    This is a standalone export function that doesn't require modifications
    to the main optimum-intel export pipeline.

    Args:
        model_name_or_path: Path to the Paraformer model directory
        output: Output directory for the exported model
        device: Device to use for export (default: "cpu")
        compress_int8: Apply INT8 symmetric weight compression (default: False)
        compress_fp16: Store FP32 constants as FP16 (default: True, recommended for GPU)

    Returns:
        Tuple of ``(model, kwargs)`` as produced by ``build_model`` — the
        original docstring claimed ``None``, but callers receive the built
        PyTorch model and its construction kwargs for further inspection.

    Example:
        >>> from optimum.exporters.openvino.export_paraformer import export_paraformer
        >>> export_paraformer(
        ...     "/path/to/paraformer/model",
        ...     "/path/to/output",
        ...     compress_int8=True
        ... )
    """
    import openvino as ov

    # Lazy import: keeps this module importable without FunASR installed.
    from optimum.exporters.openvino.modeling_paraformer import build_model, export

    model_path = str(model_name_or_path)
    output_path = Path(output)

    logger.info("Exporting Paraformer model from %s", model_path)
    logger.info("Output directory: %s", output_path)

    # Build the PyTorch model, then trace it to TorchScript before conversion.
    model, kwargs = build_model(model=model_path, device=device)
    model_dir, model_jit_scripts = export(model, kwargs, type="torchscript", quantize=False, device=device)

    # Dynamic shapes: speech features (batch, frames, feat_dim) + int32 lengths.
    ovm = ov.convert_model(model_jit_scripts, input=[([-1, -1, -1], torch.float32), ([-1], torch.int32)])

    target_dir = output_path / "ov_models"
    target_dir.mkdir(parents=True, exist_ok=True)
    output_model_path = target_dir / "openvino_model.xml"

    if compress_int8:
        try:
            from nncf import CompressWeightsMode, compress_weights

            logger.info("Applying INT8 weight compression (symmetric)...")
            # INT8_SYM: no zero-point bias ops -> significantly faster on GPU
            ovm = compress_weights(ovm, mode=CompressWeightsMode.INT8_SYM)
            logger.info("Weight compression complete.")
        except ImportError:
            logger.warning("NNCF not available. Skipping INT8 compression. Install with: pip install nncf")

    # compress_to_fp16=True stores remaining FP32 constants as FP16, avoiding a
    # second FP32->FP16 conversion pass on GPU at runtime. Fix: use
    # ov.save_model for both branches — ov.serialize is deprecated.
    ov.save_model(ovm, str(output_model_path), compress_to_fp16=compress_fp16)
    if compress_fp16:
        logger.info("Model saved with FP16 compression to %s", output_model_path)
    else:
        logger.info("Model saved to %s", output_model_path)

    # Copy the auxiliary parameter files the inference pipeline expects.
    for file_name in PARAFORMER_PARAM_FILES:
        source_file = Path(model_dir) / file_name
        if source_file.exists():
            shutil.copy2(source_file, target_dir / file_name)
            logger.debug("Copied %s", file_name)

    logger.info("Export complete. Model saved to %s", target_dir)

    return model, kwargs


def main():
    """Command-line interface for Paraformer export."""
    parser = argparse.ArgumentParser(
        description="Export Paraformer ASR model to OpenVINO format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Basic export
    python -m optimum.exporters.openvino.export_paraformer \\
        --model /path/to/paraformer \\
        --output /path/to/output

    # Export with INT8 compression
    python -m optimum.exporters.openvino.export_paraformer \\
        --model /path/to/paraformer \\
        --output /path/to/output \\
        --int8
        """
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        required=True,
        help="Path to the Paraformer model directory"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        required=True,
        help="Output directory for the exported model"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="Device to use for export (default: cpu)"
    )
    parser.add_argument(
        "--int8",
        action="store_true",
        help="Apply INT8 symmetric weight compression"
    )
    parser.add_argument(
        "--no-fp16",
        action="store_true",
        help="Disable FP16 compression for constants"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Run export
    export_paraformer(
        model_name_or_path=args.model,
        output=args.output,
        device=args.device,
        compress_int8=args.int8,
        compress_fp16=not args.no_fp16,
    )


if __name__ == "__main__":
    main()
# Hook paraformer support into main_export for non-standard library support.
# Necessary because 'paraformer' is a FunASR library, not a transformers one.
try:
    from . import paraformer_plugin  # noqa: F401
except ImportError:
    pass  # Paraformer dependencies not available


class ParaformerDummyAudioInputGenerator(DummyInputGenerator):
    """
    Generates dummy audio inputs for Paraformer model export tracing.
    """

    SUPPORTED_INPUT_NAMES = ("speech", "speech_lengths")

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """
        Generates dummy audio features and lengths for Paraformer.

        Args:
            input_name: Name of the input ("speech" or "speech_lengths")
            framework: Framework to use (default: "pt")
            int_dtype: Integer dtype
            float_dtype: Float dtype

        Returns:
            Dummy tensor for the specified input

        Raises:
            ValueError: if `input_name` is not one of SUPPORTED_INPUT_NAMES.
        """
        if input_name == "speech":
            # Paraformer expects speech features: (batch_size, feature_length, feature_dim)
            # Typical feature_dim is 560 (80 mel features * 7 LFR stacking)
            batch_size = self.batch_size
            feature_length = 30  # Example length
            feature_dim = 560
            return self.random_float_tensor(
                shape=(batch_size, feature_length, feature_dim),
                min_value=-1.0,
                max_value=1.0,
                framework=framework,
                dtype=float_dtype,
            )
        if input_name == "speech_lengths":
            # Realistic per-utterance lengths; Paraformer uses int32 lengths.
            return self.random_int_tensor(
                shape=(self.batch_size,),
                max_value=30,
                min_value=6,
                framework=framework,
                dtype="int32",
            )
        # Fix: the original fell through and silently returned None for
        # unsupported names, which surfaces later as an opaque failure.
        raise ValueError(f"Unsupported input name for ParaformerDummyAudioInputGenerator: {input_name}")


class ParaformerModelPatcher(ModelPatcher):
    """
    Model patcher for Paraformer ASR models.
    Applies necessary modifications for export to OpenVINO format.
    """

    def __enter__(self):
        super().__enter__()

        # Lazy import so patching degrades gracefully without FunASR deps.
        try:
            from .modeling_paraformer import export_rebuild_model
        except ImportError:
            logger.warning("Could not import export_rebuild_model from modeling_paraformer")
            return self

        # Fix: the original did `self._config.values.get("max_seq_len", 512)`,
        # which raises AttributeError when _config is a config object (no
        # `.values` attribute) or TypeError when `.values` is the dict method.
        # NOTE(review): exact _config shape not visible here — handled defensively.
        values = getattr(self._config, "values", None)
        if isinstance(values, dict):
            max_seq_len = values.get("max_seq_len", 512)
        else:
            max_seq_len = getattr(self._config, "max_seq_len", 512)

        export_rebuild_model(self._model, max_seq_len=max_seq_len, device="cpu", type="onnx")

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)


@register_in_tasks_manager(
    "paraformer",
    *["automatic-speech-recognition"],
    library_name="transformers",
)
class ParaformerOpenVINOConfig(OnnxConfig):
    """
    OpenVINO export configuration for Paraformer ASR models.
    """

    DEFAULT_ONNX_OPSET = 14
    DUMMY_INPUT_GENERATOR_CLASSES = (ParaformerDummyAudioInputGenerator,)
    _MODEL_PATCHER = ParaformerModelPatcher

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Model inputs with dynamic batch/time axes."""
        return {
            "speech": {0: "batch_size", 1: "feats_length"},
            "speech_lengths": {0: "batch_size"},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Model outputs with dynamic batch/time axes."""
        return {
            "logits": {0: "batch_size", 1: "logits_length"},
            "token_num": {0: "batch_size"},
        }
# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/utils/repeat.py#L14
class MultiSequential(torch.nn.Sequential):
    """Multi-input multi-output torch.nn.Sequential with stochastic layer drop."""

    def __init__(self, *args, layer_drop_rate=0.0):
        """Initialize MultiSequential.

        Args:
            layer_drop_rate (float): Probability of dropping each layer in training.
        """
        super(MultiSequential, self).__init__(*args)
        self.layer_drop_rate = layer_drop_rate

    def forward(self, *args):
        """Pass *args through every layer, randomly skipping layers in training mode."""
        # One uniform draw per layer, taken up front. NOTE: this consumes RNG
        # state even in eval mode, matching the reference implementation.
        drop_draws = torch.empty(len(self)).uniform_()
        for layer_idx, layer in enumerate(self):
            if (not self.training) or (drop_draws[layer_idx] >= self.layer_drop_rate):
                args = layer(*args)
        return args


def repeat(N, fn, layer_drop_rate=0.0):
    """Stack N modules produced by fn(layer_index) into one MultiSequential.

    Args:
        N (int): Number of layers.
        fn (Callable): Factory mapping a layer index to a module.
        layer_drop_rate (float): Probability of dropping each layer.

    Returns:
        MultiSequential: The repeated model instance.
    """
    layers = [fn(layer_idx) for layer_idx in range(N)]
    return MultiSequential(*layers, layer_drop_rate=layer_drop_rate)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/positionwise_feed_forward.py#L14
class PositionwiseFeedForward(torch.nn.Module):
    """Position-wise feed-forward: Linear -> activation -> dropout -> Linear.

    Args:
        idim (int): Input dimension.
        hidden_units (int): Number of hidden units.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.w_2 = torch.nn.Linear(hidden_units, idim)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = activation

    def forward(self, x):
        """Project up, apply activation + dropout, project back down."""
        hidden = self.dropout(self.activation(self.w_1(x)))
        return self.w_2(hidden)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L416
class StreamSinusoidalPositionEncoder(torch.nn.Module):
    """Placeholder streaming position encoder (stateless in this copy)."""

    def __init__(self, d_model=80, dropout_rate=0.1):
        # Arguments accepted for interface compatibility only.
        super().__init__()


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L383
class SinusoidalPositionEncoder(torch.nn.Module):
    """Sinusoidal absolute position encoding computed on the fly from input shape."""

    def __init__(self, d_model=80, dropout_rate=0.1):
        # Arguments accepted for interface compatibility only; the table is
        # rebuilt from the input's timesteps/feature dim on every forward.
        super().__init__()

    def encode(
        self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
    ):
        """Return the sin/cos encoding table for the given positions and depth."""
        batch_size = positions.size(0)
        positions = positions.type(dtype)
        device = positions.device
        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
            depth / 2 - 1
        )
        inv_timescales = torch.exp(
            torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
        )
        inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
        scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
            inv_timescales, [1, 1, -1]
        )
        # First half of the feature dim is sin, second half is cos.
        encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
        return encoding.type(dtype)

    def forward(self, x):
        """Add the position encoding to x of shape (batch, time, dim)."""
        batch_size, timesteps, input_dim = x.size()
        # Positions are 1-based in this implementation.
        position_ids = torch.arange(1, timesteps + 1, dtype=torch.int32, device=x.device)[None, :]
        table = self.encode(position_ids, input_dim, x.dtype).to(x.device)
        return x + table


def _pre_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    """load_state_dict pre-hook for backward compatibility.

    Checkpoints up to v0.5.2 saved `pe`; it is recomputed now, so strip it
    from incoming state dicts to avoid unexpected-key errors.
    """
    stale_key = prefix + "pe"
    if stale_key in state_dict:
        state_dict.pop(stale_key)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L36
class PositionalEncoding(torch.nn.Module):
    """Precomputed sinusoidal positional encoding table.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
        reverse (bool): Whether to reverse the input position (legacy
            relative-position variant only).
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct a PositionalEncoding object and prebuild the table."""
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.reverse = reverse
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
        self._register_load_state_dict_pre_hook(_pre_hook)

    def extend_pe(self, x):
        """Grow (or dtype/device-migrate) the cached table to cover x.size(1)."""
        if self.pe is not None:
            if self.pe.size(1) >= x.size(1):
                # Table is large enough; just align dtype/device if needed.
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        table = torch.zeros(x.size(1), self.d_model)
        if self.reverse:
            position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
        else:
            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        table = table.unsqueeze(0)
        self.pe = table.to(device=x.device, dtype=x.dtype)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/layer_norm.py#L13
class LayerNorm(torch.nn.LayerNorm):
    """LayerNorm with eps=1e-12 and a recorded normalization dimension.

    Args:
        nout (int): Output dim size.
        dim (int): Dimension to be normalized.
    """

    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object."""
        super(LayerNorm, self).__init__(nout, eps=1e-12)
        self.dim = dim
class BaseTransformerDecoder(nn.Module):
    """Base class of Transformer decoder modules.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        dropout_rate: dropout rate
        positional_dropout_rate: dropout rate applied inside position encoding
        input_layer: input layer type ("embed" or "linear")
        use_output_layer: whether to add a final vocab projection
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to apply layer_norm before the first block
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
    ):
        super().__init__()
        attention_dim = encoder_output_size

        # Token embedding front-end: either a lookup table or a linear stack.
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        self.output_layer = torch.nn.Linear(attention_dim, vocab_size) if use_output_layer else None

        # Must be set by subclasses.
        self.decoders = None


class sequence_mask(nn.Module):
    """Build a (batch, max_seq_len) mask: 1 where position < length, else 0.

    NOTE(review): the constructor arguments are accepted for interface
    compatibility but are not used by forward — verify against callers.
    """

    def __init__(self, max_seq_len=512, flip=True):
        super().__init__()

    def forward(self, lengths, max_seq_len=None, dtype=torch.float32, device=None):
        if max_seq_len is None:
            max_seq_len = lengths.max()
        positions = torch.arange(0, max_seq_len, 1, dtype=torch.int32, device=lengths.device)
        expanded_lengths = torch.unsqueeze(lengths, dim=-1).to(torch.int32)
        mask = (positions < expanded_lengths).type(dtype)
        return mask.to(device) if device is not None else mask


# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/multihead_att.py#L67
def preprocess_for_attn(x, mask, cache, pad_fn, kernel_size):
    """Mask, transpose to (b, d, t), then pad (first call) or prepend cache.

    Returns the prepared tensor and the updated cache (the trailing
    kernel_size-1 frames, kept for the next streaming step).
    """
    x = x * mask
    x = x.transpose(1, 2)
    x = pad_fn(x) if cache is None else torch.cat((cache, x), dim=2)
    cache = x[:, :, -(kernel_size - 1):]
    return x, cache
# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L140 (Apache 2.0)
class MultiHeadedAttentionSANM(nn.Module):
    """SANM multi-head attention: scaled dot-product attention combined with
    an FSMN memory block (depthwise Conv1d over the value stream).

    Args:
        n_head (int): The number of heads.
        in_feat (int): Input feature size fed to the fused q/k/v projection.
        n_feat (int): Attention feature size (must be divisible by n_head).
        dropout_rate (float): Dropout rate for attention weights and FSMN output.
        kernel_size (int): FSMN depthwise convolution kernel size.
        sanm_shfit (int): Extra left shift for the FSMN padding window.
        lora_list / lora_rank / lora_alpha / lora_dropout: optional LoRA
            adaptation of the projections (requires `loralib`).
    """

    def __init__(
        self,
        n_head,
        in_feat,
        n_feat,
        dropout_rate,
        kernel_size,
        sanm_shfit=0,
        lora_list=None,
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.1,
    ):
        """Construct a MultiHeadedAttentionSANM object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        if lora_list is not None:
            if not _LORA_AVAILABLE:
                raise ImportError(
                    "LoRA layers require the 'loralib' package. "
                    "Please install it with: pip install loralib"
                )
            if "o" in lora_list:
                self.linear_out = lora.Linear(
                    n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout
                )
            else:
                self.linear_out = nn.Linear(n_feat, n_feat)
            lora_qkv_list = ["q" in lora_list, "k" in lora_list, "v" in lora_list]
            if lora_qkv_list == [False, False, False]:
                self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
            else:
                self.linear_q_k_v = lora.MergedLinear(
                    in_feat,
                    n_feat * 3,
                    r=lora_rank,
                    lora_alpha=lora_alpha,
                    lora_dropout=lora_dropout,
                    enable_lora=lora_qkv_list,
                )
        else:
            self.linear_out = nn.Linear(n_feat, n_feat)
            self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
        # Fix: the original had a dead local `attn = None`; keep the attribute
        # for parity with the other attention classes in this file.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_rate)

        # Depthwise conv implementing the FSMN memory over the value stream.
        self.fsmn_block = nn.Conv1d(
            n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
        )
        # Asymmetric padding: sanm_shfit moves the window further into the past.
        left_padding = (kernel_size - 1) // 2
        if sanm_shfit > 0:
            left_padding = left_padding + sanm_shfit
        right_padding = kernel_size - 1 - left_padding
        self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)

    def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
        """Apply the FSMN memory block to the (masked) value stream."""
        b, t, d = inputs.size()
        if mask is not None:
            mask = torch.reshape(mask, (b, -1, 1))
            if mask_shfit_chunk is not None:
                mask = mask * mask_shfit_chunk
            inputs = inputs * mask

        x = inputs.transpose(1, 2)
        x = self.pad_fn(x)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)
        x += inputs  # residual connection
        x = self.dropout(x)
        if mask is not None:
            x = x * mask
        return x

    def forward_qkv(self, x):
        """Fused q/k/v projection, split and reshaped to per-head layout.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, size).

        Returns:
            torch.Tensor: Query tensor (#batch, n_head, time, d_k).
            torch.Tensor: Key tensor (#batch, n_head, time, d_k).
            torch.Tensor: Value tensor (#batch, n_head, time, d_k).
            torch.Tensor: Unsplit value tensor (#batch, time, n_feat) for FSMN.
        """
        b, t, d = x.size()
        q_k_v = self.linear_q_k_v(x)
        q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
        q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time1, d_k)
        k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time2, d_k)
        v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time2, d_k)
        return q_h, k_h, v_h, v

    def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).

        Returns:
            torch.Tensor: Value weighted by the attention score (#batch, time1, d_model).
        """
        n_batch = value.size(0)
        if mask is not None:
            if mask_att_chunk_encoder is not None:
                mask = mask * mask_att_chunk_encoder

            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)

            min_value = -float("inf")
            scores = scores.masked_fill(mask, min_value)
            # Re-zero masked positions so fully-masked rows contribute nothing.
            attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)  # (batch, head, time1, time2)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
        """Compute scaled dot product attention plus the FSMN memory.

        Args:
            x (torch.Tensor): Input tensor (#batch, time1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
        """
        q_h, k_h, v_h, v = self.forward_qkv(x)
        fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
        return att_outs + fsmn_memory

    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
        """Streaming (chunked) attention with a rolling key/value cache.

        Args:
            x (torch.Tensor): Input tensor (#batch, time1, size).
            cache (dict | None): {"k": ..., "v": ...} carried between chunks.
            chunk_size: chunk layout; chunk_size[2] is the right look-ahead
                stripped from the cache. NOTE(review): semantics inferred from
                indexing only — confirm against FunASR docs.
            look_back (int): number of past chunks to keep; -1 keeps all.

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            dict: Updated cache.
        """
        q_h, k_h, v_h, v = self.forward_qkv(x)
        # Precedence preserved from the original:
        # (chunk_size is not None and look_back > 0) or look_back == -1
        if (chunk_size is not None and look_back > 0) or look_back == -1:
            if cache is not None:
                k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
                v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
                k_h = torch.cat((cache["k"], k_h), dim=2)
                v_h = torch.cat((cache["v"], v_h), dim=2)

                cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
                cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
                if look_back != -1:
                    cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]):, :]
                    cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]):, :]
            else:
                cache = {
                    "k": k_h[:, :, : -(chunk_size[2]), :],
                    "v": v_h[:, :, : -(chunk_size[2]), :],
                }
        fsmn_memory = self.forward_fsmn(v, None)
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        att_outs = self.forward_attention(v_h, scores, None)
        return att_outs + fsmn_memory, cache
self.linear_q_k_v(x) + q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) + q_h = self.transpose_for_scores(q) + k_h = self.transpose_for_scores(k) + v_h = self.transpose_for_scores(v) + return q_h, k_h, v_h, v + + def forward_fsmn(self, inputs, mask): + # b, t, d = inputs.size() + # mask = torch.reshape(mask, (b, -1, 1)) + inputs = inputs * mask + x = inputs.transpose(1, 2) + x = self.pad_fn(x) + x = self.fsmn_block(x) + x = x.transpose(1, 2) + x = x + inputs + x = x * mask + return x + + def forward_attention(self, value, scores, mask): + scores = scores + mask + + attn = torch.softmax(scores, dim=-1) + context_layer = torch.matmul(attn, value) # (batch, head, time1, d_k) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + return self.linear_out(context_layer) # (batch, time1, d_model) + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L471 (Apache 2.0) +class MultiHeadedAttentionSANMDecoder(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shfit=0): + """Construct an MultiHeadedAttention object.""" + super().__init__() + + self.dropout = nn.Dropout(p=dropout_rate) + + self.fsmn_block = nn.Conv1d( + n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False + ) + # padding + # padding + left_padding = (kernel_size - 1) // 2 + if sanm_shfit > 0: + left_padding = left_padding + sanm_shfit + right_padding = kernel_size - 1 - left_padding + self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) + self.kernel_size = kernel_size + + def forward(self, inputs, mask, cache=None, mask_shfit_chunk=None): + """ + :param x: (#batch, time1, size). + :param mask: Mask tensor (#batch, 1, time) + :return: + """ + # print("in fsmn, inputs", inputs.size()) + b, t, d = inputs.size() + # logging.info( + # "mask: {}".format(mask.size())) + if mask is not None: + mask = torch.reshape(mask, (b, -1, 1)) + # logging.info("in fsmn, mask: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + if mask_shfit_chunk is not None: + # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shfit_chunk.size(), mask_shfit_chunk[0:100:50, :, :])) + mask = mask * mask_shfit_chunk + # logging.info("in fsmn, mask_after_fsmn: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + # print("in fsmn, mask", mask.size()) + # print("in fsmn, inputs", inputs.size()) + inputs = inputs * mask + + x = inputs.transpose(1, 2) + b, d, t = x.size() + if cache is None: + # print("in fsmn, cache is None, x", x.size()) + + x = self.pad_fn(x) + if not self.training: + cache = x + else: + # print("in fsmn, cache is not None, x", x.size()) + # x = torch.cat((x, cache), dim=2)[:, :, :-1] + # if t < self.kernel_size: + # x = self.pad_fn(x) + x = torch.cat((cache[:, :, 1:], x), dim=2) + x = x[:, :, -(self.kernel_size + t - 1) :] + # print("in fsmn, cache is not None, x_cat", x.size()) + cache = x + x = self.fsmn_block(x) + x = x.transpose(1, 2) + # print("in fsmn, 
fsmn_out", x.size())
        # Streaming path: with a cache the padded/conv output can be longer in
        # time than the current chunk, so the residual operands disagree.
        # NOTE(review): inputs[:, -1, :] drops the time dimension (result is
        # 2-D) and relies on broadcasting in the residual add below — confirm
        # against upstream FunASR that this is the intended shape handling.
        if x.size(1) != inputs.size(1):
            inputs = inputs[:, -1, :]

        # FSMN memory block: residual add, dropout, then re-apply padding mask.
        x = x + inputs
        x = self.dropout(x)
        if mask is not None:
            x = x * mask
        return x, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L550 (Apache 2.0)
class MultiHeadedAttentionSANMDecoderExport(nn.Module):
    """Export-friendly wrapper around ``MultiHeadedAttentionSANMDecoder``.

    Shares the trained depthwise-conv FSMN block and padding of the eager
    module, but replaces its data-dependent control flow with a traceable
    ``forward`` (padding + cache handling delegated to ``preprocess_for_attn``)
    so the layer can be exported to TorchScript/ONNX/OpenVINO.
    """

    def __init__(self, model):
        super().__init__()
        # Reuse the trained submodules — no new parameters are created.
        self.fsmn_block = model.fsmn_block
        self.pad_fn = model.pad_fn
        self.kernel_size = model.kernel_size
        # Kept for API parity with the eager module; never populated here.
        self.attn = None

    def forward(self, inputs, mask, cache=None):
        # inputs: (batch, time, feat); mask gates padded positions; cache is
        # the rolling FSMN context window returned for the next call.
        x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn, self.kernel_size)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)

        # Residual connection followed by masking (no dropout in export mode).
        x = x + inputs
        x = x * mask
        return x, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L568 (Apache 2.0)
class MultiHeadedAttentionCrossAtt(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
+ + """ + + def __init__( + self, + n_head, + n_feat, + dropout_rate, + lora_list=None, + lora_rank=8, + lora_alpha=16, + lora_dropout=0.1, + encoder_output_size=None, + ): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + if lora_list is not None: + if "q" in lora_list: + self.linear_q = lora.Linear( + n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout + ) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + lora_kv_list = ["k" in lora_list, "v" in lora_list] + if lora_kv_list == [False, False]: + self.linear_k_v = nn.Linear( + n_feat if encoder_output_size is None else encoder_output_size, n_feat * 2 + ) + else: + self.linear_k_v = lora.MergedLinear( + n_feat if encoder_output_size is None else encoder_output_size, + n_feat * 2, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + enable_lora=lora_kv_list, + ) + if "o" in lora_list: + self.linear_out = lora.Linear( + n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout + ) + else: + self.linear_out = nn.Linear(n_feat, n_feat) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k_v = nn.Linear( + n_feat if encoder_output_size is None else encoder_output_size, n_feat * 2 + ) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, x, memory): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). 
+ + """ + + # print("in forward_qkv, x", x.size()) + b = x.size(0) + q = self.linear_q(x) + q_h = torch.reshape(q, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + k_v = self.linear_k_v(memory) + k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) + k_h = torch.reshape(k, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + + return q_h, k_h, v_h + + def forward_attention(self, value, scores, mask, ret_attn=False): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = -float( + "inf" + ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + # logging.info( + # "scores: {}, mask_size: {}".format(scores.size(), mask.size())) + scores = scores.masked_fill(mask, min_value) + attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + if ret_attn: + return self.linear_out(x), attn # (batch, time1, d_model) + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, memory, memory_mask, ret_attn=False): + """Compute scaled dot product attention. 
+ + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, memory_mask, ret_attn=ret_attn) + + def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + if chunk_size is not None and look_back > 0: + if cache is not None: + k_h = torch.cat((cache["k"], k_h), dim=2) + v_h = torch.cat((cache["v"], v_h), dim=2) + cache["k"] = k_h[:, :, -(look_back * chunk_size[1]) :, :] + cache["v"] = v_h[:, :, -(look_back * chunk_size[1]) :, :] + else: + cache_tmp = { + "k": k_h[:, :, -(look_back * chunk_size[1]) :, :], + "v": v_h[:, :, -(look_back * chunk_size[1]) :, :], + } + cache = cache_tmp + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, None), cache + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L751 (Apache 2.0) +class MultiHeadedAttentionCrossAttExport(nn.Module): + def __init__(self, model): + super().__init__() + self.d_k = model.d_k + self.h = model.h + self.linear_q = model.linear_q + self.linear_k_v = model.linear_k_v + self.linear_out = model.linear_out + self.attn = None + self.all_head_size = self.h * self.d_k + + def forward(self, x, memory, memory_mask, ret_attn=False): + q, k, v = self.forward_qkv(x, memory) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, memory_mask, ret_attn) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.h, self.d_k) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward_qkv(self, x, memory): + q = self.linear_q(x) + + k_v = self.linear_k_v(memory) + k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) + q = self.transpose_for_scores(q) + k = self.transpose_for_scores(k) + v = self.transpose_for_scores(v) + return q, k, v + + def forward_attention(self, value, scores, mask, ret_attn): + scores = scores + mask.to(scores.device) + + attn = torch.softmax(scores, dim=-1) + context_layer = torch.matmul(attn, value) # (batch, head, time1, d_k) + + 
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + if ret_attn: + return self.linear_out(context_layer), attn + return self.linear_out(context_layer) # (batch, time1, d_model) + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L44 (MIT License) +class EncoderLayerSANM(nn.Module): + def __init__( + self, + in_size, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayerSANM, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(in_size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.in_size = in_size + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + self.dropout_rate = dropout_rate + + def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.concat_after: + x_concat = torch.cat( + ( + x, + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ), + ), + dim=-1, + ) + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = stoch_layer_coeff * self.concat_linear(x_concat) + else: + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + else: + x = stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). 
+ + """ + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.in_size == self.size: + attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + x = residual + attn + else: + x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.feed_forward(x) + if not self.normalize_before: + x = self.norm2(x) + + return x, cache + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L188 (MIT License) +class SANMEncoder(nn.Module): + """ + Author: Zhifu Gao, Shiliang Zhang, Ming Lei, Ian McLoughlin + San-m: Memory equipped self-attention for end-to-end speech recognition + https://arxiv.org/abs/2006.01713 + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: Optional[str] = "conv2d", + pos_enc_class=SinusoidalPositionEncoder, + normalize_before: bool = True, + concat_after: bool = False, + positionwise_layer_type: str = "linear", + positionwise_conv_kernel_size: int = 1, + padding_idx: int = -1, + interctc_layer_idx: List[int] = [], + interctc_use_conditioning: bool = False, + kernel_size: int = 11, + sanm_shfit: int = 0, + lora_list: List[str] = None, + lora_rank: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.1, + selfattention_layer_type: str = "sanm", + tf2torch_tensor_name_prefix_torch: str = "encoder", + tf2torch_tensor_name_prefix_tf: str = "seq2seq/encoder", + ): + super().__init__() + self._output_size = output_size + # input_layer is now force to set to "pe" + self.embed = SinusoidalPositionEncoder() + self.normalize_before = normalize_before + + # positionwise_layer_type is now 
force to set to "linear" + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + + # selfattention_layer_type is now force to set to "sanm" + encoder_selfattn_layer = MultiHeadedAttentionSANM + encoder_selfattn_layer_args0 = ( + attention_heads, + input_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ) + + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ) + + self.encoders0 = repeat( + 1, + lambda lnum: EncoderLayerSANM( + input_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args0), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + self.encoders = repeat( + num_blocks - 1, + lambda lnum: EncoderLayerSANM( + output_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + self.after_norm = LayerNorm(output_size) + + self.interctc_layer_idx = interctc_layer_idx + + self.interctc_use_conditioning = interctc_use_conditioning + self.conditioning_layer = None + self.dropout = nn.Dropout(dropout_rate) + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + + def output_size(self) -> int: + return self._output_size + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L487 (MIT License) +class EncoderLayerSANMExport(nn.Module): + def __init__( + self, + model, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = model.self_attn + self.feed_forward = model.feed_forward + 
self.norm1 = model.norm1 + self.norm2 = model.norm2 + self.in_size = model.in_size + self.size = model.size + + def forward(self, x, mask): + + residual = x + x = self.norm1(x) + x = self.self_attn(x, mask) + if self.in_size == self.size: + x = x + residual + residual = x + x = self.norm2(x) + x = self.feed_forward(x) + x = x + residual + + return x, mask + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L518 (MIT License) +class SANMEncoderExport(nn.Module): + def __init__( + self, + model, + max_seq_len=512, + feats_dim=560, + model_name="encoder", + onnx: bool = True, + ctc_linear: nn.Module = None, + ): + super().__init__() + self.embed = model.embed + if isinstance(self.embed, StreamSinusoidalPositionEncoder): + self.embed = None + self.model = model + self.feats_dim = feats_dim + self._output_size = model._output_size + + self.make_pad_mask = sequence_mask(max_seq_len, flip=False) + + # from export_model_hf.sanm.attention import MultiHeadedAttentionSANMExport + + if hasattr(model, "encoders0"): + for i, d in enumerate(self.model.encoders0): + if isinstance(d.self_attn, MultiHeadedAttentionSANM): + d.self_attn = MultiHeadedAttentionSANMExport(d.self_attn) + self.model.encoders0[i] = EncoderLayerSANMExport(d) + + for i, d in enumerate(self.model.encoders): + if isinstance(d.self_attn, MultiHeadedAttentionSANM): + d.self_attn = MultiHeadedAttentionSANMExport(d.self_attn) + self.model.encoders[i] = EncoderLayerSANMExport(d) + + self.model_name = model_name + self.num_heads = model.encoders[0].self_attn.h + self.hidden_size = model.encoders[0].self_attn.linear_out.out_features + + self.ctc_linear = ctc_linear + + def prepare_mask(self, mask): + mask_3d_btd = mask[:, :, None] + if len(mask.shape) == 2: + mask_4d_bhlt = 1 - mask[:, None, None, :] + elif len(mask.shape) == 3: + mask_4d_bhlt = 1 - mask[:, None, :] + mask_4d_bhlt = mask_4d_bhlt * -10000.0 + + return mask_3d_btd, mask_4d_bhlt + + def 
forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False): + if not online: + speech = speech * self._output_size**0.5 + batch_size, seq_len, feat_dim = speech.shape + # Create range [0, 1, 2, ..., seq_len-1] that's shape-dependent, not value-dependent + arange = torch.arange(seq_len, dtype=torch.int32, device=speech.device).unsqueeze(0).expand(batch_size, -1) + lengths_expanded = speech_lengths.unsqueeze(1).to(torch.int32) + # Mask where position < length (convert bool to float for prepare_mask) + mask = (arange < lengths_expanded).to(torch.float32) + mask = self.prepare_mask(mask) + if self.embed is None: + xs_pad = speech + else: + xs_pad = self.embed(speech) + + encoder_outs = self.model.encoders0(xs_pad, mask) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + encoder_outs = self.model.encoders(xs_pad, mask) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + xs_pad = self.model.after_norm(xs_pad) + + if self.ctc_linear is not None: + xs_pad = self.ctc_linear(xs_pad) + xs_pad = F.softmax(xs_pad, dim=2) + + return xs_pad, speech_lengths + +#Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/positionwise_feed_forward.py#L12 +class PositionwiseFeedForwardDecoderSANM(torch.nn.Module): + """Positionwise feed forward layer. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. 

    """

    def __init__(self, idim, hidden_units, dropout_rate, adim=None, activation=torch.nn.ReLU()):
        """Construct an PositionwiseFeedForward object."""
        # NOTE(review): `activation=torch.nn.ReLU()` is a mutable default
        # argument shared by every instance built without an explicit
        # activation; nn.ReLU is stateless/parameter-free so this is harmless
        # in practice, but it is kept as-is to match upstream FunASR.
        super(PositionwiseFeedForwardDecoderSANM, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        # Projection back down; `adim` lets the output dim differ from the
        # input dim. bias=False matches the upstream SANM decoder FFN.
        self.w_2 = torch.nn.Linear(hidden_units, idim if adim is None else adim, bias=False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = activation
        # LayerNorm applied in the hidden space, between dropout and w_2.
        self.norm = LayerNorm(hidden_units)

    def forward(self, x):
        """Forward function."""
        # w_1 -> activation -> dropout -> layer norm -> w_2.
        return self.w_2(self.norm(self.dropout(self.activation(self.w_1(x)))))

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L26 (MIT License)
class DecoderLayerSANM(torch.nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e.
x -> x + att(x) + + + """ + + def __init__( + self, + size, + self_attn, + src_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + ): + """Construct an DecoderLayer object.""" + super(DecoderLayerSANM, self).__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size) + if self_attn is not None: + self.norm2 = LayerNorm(size) + if src_attn is not None: + self.norm3 = LayerNorm(size) + self.dropout = torch.nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear1 = torch.nn.Linear(size + size, size) + self.concat_linear2 = torch.nn.Linear(size + size, size) + self.reserve_attn = False + self.attn_mat = [] + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L225 (MIT License) +class ParaformerSANMDecoder(BaseTransformerDecoder): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition + https://arxiv.org/abs/2006.01713 + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + wo_input_layer: bool = False, + pos_enc_class=PositionalEncoding, + normalize_before: bool = True, + concat_after: bool = False, + att_layer_num: int = 6, + kernel_size: int = 21, + sanm_shfit: int = 0, + lora_list: List[str] = None, + lora_rank: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.1, + chunk_multiply_factor: tuple = (1,), + tf2torch_tensor_name_prefix_torch: str = "decoder", 
+ tf2torch_tensor_name_prefix_tf: str = "seq2seq/decoder", + ): + super().__init__( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + input_layer=input_layer, + use_output_layer=use_output_layer, + pos_enc_class=pos_enc_class, + normalize_before=normalize_before, + ) + + attention_dim = encoder_output_size + + # wo_input_layer is now force to set to False + # input_layer is now force to set to "embed" + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), + ) + + self.normalize_before = normalize_before + + # self.normalize_before is now force to set to True + self.after_norm = LayerNorm(attention_dim) + # use_output_layer is now force to set to True + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + + self.att_layer_num = att_layer_num + self.num_blocks = num_blocks + + self.decoders = repeat( + att_layer_num, + lambda lnum: DecoderLayerSANM( + attention_dim, + MultiHeadedAttentionSANMDecoder( + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + ), + MultiHeadedAttentionCrossAtt( + attention_heads, + attention_dim, + src_attention_dropout_rate, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ), + PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + # num_blocks - att_layer_num return 0 + self.decoders2 = None + + self.decoders3 = repeat( + 1, + lambda lnum: DecoderLayerSANM( + attention_dim, + None, + None, + PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ), + ) + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + self.chunk_multiply_factor = chunk_multiply_factor + +# Copied from 
https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L592 (MIT License)
class DecoderLayerSANMExport(torch.nn.Module):
    """Export-friendly decoder layer wrapping an eager ``DecoderLayerSANM``.

    Assumes pre-norm ordering (normalize_before=True, no concat_after) and
    keeps only the submodules needed for a traceable forward pass.
    """

    def __init__(self, model):
        super().__init__()
        # Share trained submodules of the eager layer; self_attn / src_attn
        # may be None for the final FFN-only layers (decoders3).
        self.self_attn = model.self_attn
        self.src_attn = model.src_attn
        self.feed_forward = model.feed_forward
        self.norm1 = model.norm1
        # norm2/norm3 only exist when the corresponding attention exists.
        self.norm2 = model.norm2 if hasattr(model, "norm2") else None
        self.norm3 = model.norm3 if hasattr(model, "norm3") else None
        self.size = model.size

    def forward(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):

        # Pre-norm FFN first (SANM decoder ordering), then FSMN self-attn,
        # then cross-attention — each guarded so FFN-only layers still work.
        residual = tgt
        tgt = self.norm1(tgt)
        tgt = self.feed_forward(tgt)

        x = tgt
        if self.self_attn is not None:
            tgt = self.norm2(tgt)
            x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
            x = residual + x
        # NOTE(review): when self_attn is None the FFN output carries no
        # residual add — this mirrors upstream FunASR; confirm intentional.

        if self.src_attn is not None:
            residual = x
            x = self.norm3(x)
            x = residual + self.src_attn(x, memory, memory_mask)

        # Returns a 5-tuple so stacked layers can be chained via repeat().
        return x, tgt_mask, memory, memory_mask, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L641 (MIT License)
class ParaformerSANMDecoderExport(torch.nn.Module):
    """Traceable Paraformer decoder: rewrites eager sublayers in place."""

    def __init__(self, model, max_seq_len=512, model_name="decoder", onnx: bool = True, **kwargs):
        super().__init__()

        self.model = model

        self.make_pad_mask = sequence_mask(max_seq_len, flip=False)

        # Swap every eager attention/layer for its export counterpart.
        # NOTE: this mutates `model` in place — the eager module is no longer
        # usable for training/inference after wrapping.
        for i, d in enumerate(self.model.decoders):
            if isinstance(d.self_attn, MultiHeadedAttentionSANMDecoder):
                d.self_attn = MultiHeadedAttentionSANMDecoderExport(d.self_attn)
            if isinstance(d.src_attn, MultiHeadedAttentionCrossAtt):
                d.src_attn = MultiHeadedAttentionCrossAttExport(d.src_attn)
            self.model.decoders[i] = DecoderLayerSANMExport(d)

        # decoders2 is None for this configuration (num_blocks == att_layer_num),
        # but handle the general upstream layout anyway.
        if self.model.decoders2 is not None:
            for i, d in enumerate(self.model.decoders2):
                if isinstance(d.self_attn, MultiHeadedAttentionSANMDecoder):
                    d.self_attn = MultiHeadedAttentionSANMDecoderExport(d.self_attn)
                self.model.decoders2[i] = DecoderLayerSANMExport(d)

for i, d in enumerate(self.model.decoders3): + self.model.decoders3[i] = DecoderLayerSANMExport(d) + + self.output_layer = model.output_layer + self.after_norm = model.after_norm + self.model_name = model_name + + def prepare_mask(self, mask): + mask_3d_btd = mask[:, :, None] + if len(mask.shape) == 2: + mask_4d_bhlt = 1 - mask[:, None, None, :] + elif len(mask.shape) == 3: + mask_4d_bhlt = 1 - mask[:, None, :] + mask_4d_bhlt = mask_4d_bhlt * -10000.0 + + return mask_3d_btd, mask_4d_bhlt + + def forward( + self, + hs_pad: torch.Tensor, + hlens: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + return_hidden: bool = False, + return_both: bool = False, + ): + + tgt = ys_in_pad + batch_size = tgt.shape[0] + tgt_seq_len = tgt.shape[1] + arange_tgt = torch.arange(tgt_seq_len, dtype=torch.int32, device=tgt.device).unsqueeze(0).expand(batch_size, -1) + tgt_mask = (arange_tgt < ys_in_lens.unsqueeze(1).to(torch.int32)).to(torch.float32) + tgt_mask, _ = self.prepare_mask(tgt_mask) + # tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] + + memory = hs_pad + mem_seq_len = memory.shape[1] + arange_mem = torch.arange(mem_seq_len, dtype=torch.int32, device=memory.device).unsqueeze(0).expand(batch_size, -1) + memory_mask = (arange_mem < hlens.unsqueeze(1).to(torch.int32)).to(torch.float32) + _, memory_mask = self.prepare_mask(memory_mask) + # memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] + + x = tgt + x, tgt_mask, memory, memory_mask, _ = self.model.decoders(x, tgt_mask, memory, memory_mask) + if self.model.decoders2 is not None: + x, tgt_mask, memory, memory_mask, _ = self.model.decoders2( + x, tgt_mask, memory, memory_mask + ) + x, tgt_mask, memory, memory_mask, _ = self.model.decoders3(x, tgt_mask, memory, memory_mask) + hidden = self.after_norm(x) + # x = self.output_layer(x) + + if self.output_layer is not None and return_hidden is False: + x = self.output_layer(hidden) + return x, ys_in_lens + if 
return_both: + x = self.output_layer(hidden) + return x, hidden, ys_in_lens + return hidden, ys_in_lens + +# Modified from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/export_meta.py#L11 (MIT License) +def export_rebuild_model(model, **kwargs): + model.device = kwargs.get("device") + is_onnx = kwargs.get("type", "onnx") == "onnx" + model.encoder = SANMEncoderExport(model.encoder, onnx=is_onnx) + model.predictor = CifPredictorV2Export(model.predictor, onnx=is_onnx) + model.decoder = ParaformerSANMDecoderExport(model.decoder, onnx=is_onnx) + model.make_pad_mask = sequence_mask(kwargs["max_seq_len"], flip=False) + model.forward = types.MethodType(export_forward, model) + model.export_dummy_inputs = types.MethodType(export_dummy_inputs, model) + model.export_input_names = types.MethodType(export_input_names, model) + model.export_output_names = types.MethodType(export_output_names, model) + model.export_dynamic_axes = types.MethodType(export_dynamic_axes, model) + model.export_name = types.MethodType(export_name, model) + + # model.export_name = "model" + return model + + +def export_forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, +): + # a. 
To device
    batch = {"speech": speech, "speech_lengths": speech_lengths}
    # batch = to_device(batch, device=self.device)

    # Encode features, then let the CIF predictor estimate the number of
    # output tokens and the acoustic embeddings for the decoder.
    enc, enc_len = self.encoder(**batch)
    mask = self.make_pad_mask(enc_len)[:, None, :]
    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(enc, mask)
    # Token counts must be integral indices downstream.
    pre_token_length = pre_token_length.floor().type(torch.int32)

    decoder_out, _ = self.decoder(enc, enc_len, pre_acoustic_embeds, pre_token_length)
    decoder_out = torch.log_softmax(decoder_out, dim=-1)
    # sample_ids = decoder_out.argmax(dim=-1)

    # Log-probabilities over the vocabulary plus the per-utterance token count.
    return decoder_out, pre_token_length


def export_dummy_inputs(self):
    """Return tracer inputs: a (2, 30, 560) feature batch with lengths (6, 30)."""
    speech = torch.randn(2, 30, 560)
    speech_lengths = torch.tensor([6, 30], dtype=torch.int32)
    return (speech, speech_lengths)


def export_input_names(self):
    """Input tensor names for the exported graph."""
    return ["speech", "speech_lengths"]


def export_output_names(self):
    """Output tensor names for the exported graph."""
    return ["logits", "token_num"]


def export_dynamic_axes(self):
    """Dynamic-axis spec: batch and time dimensions are variable."""
    return {
        "speech": {0: "batch_size", 1: "feats_length"},
        "speech_lengths": {
            0: "batch_size",
        },
        "logits": {0: "batch_size", 1: "logits_length"},
        "token_num": {0: "batch_size"}
    }


def export_name(
    self,
):
    """Base filename for the exported model artifacts."""
    return "model"

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/cif_predictor.py#L173 (MIT License)
class CifPredictorV2(torch.nn.Module):
    """CIF (Continuous Integrate-and-Fire) predictor, version 2.

    A 1-D conv + sigmoid head produces per-frame firing weights (alphas);
    integrating them determines token boundaries and acoustic embeddings.
    """

    def __init__(
        self,
        idim,
        l_order,
        r_order,
        threshold=1.0,
        dropout=0.1,
        smooth_factor=1.0,
        noise_threshold=0,
        tail_threshold=0.0,
        tf2torch_tensor_name_prefix_torch="predictor",
        tf2torch_tensor_name_prefix_tf="seq2seq/cif",
        tail_mask=True,
    ):
        super().__init__()

        # Asymmetric context window of l_order past / r_order future frames.
        self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
        self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1)
        # Scalar weight (alpha) per frame.
        self.cif_output = torch.nn.Linear(idim, 1)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.threshold = threshold
        self.smooth_factor = smooth_factor
        self.noise_threshold = noise_threshold
self.tail_threshold = tail_threshold + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + self.tail_mask = tail_mask + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/cif_predictor.py#L431 (MIT License) +class CifPredictorV2Export(torch.nn.Module): + def __init__(self, model, **kwargs): + super().__init__() + + self.pad = model.pad + self.cif_conv1d = model.cif_conv1d + self.cif_output = model.cif_output + self.threshold = model.threshold + self.smooth_factor = model.smooth_factor + self.noise_threshold = model.noise_threshold + self.tail_threshold = model.tail_threshold + + def forward( + self, + hidden: torch.Tensor, + mask: torch.Tensor, + ): + alphas, token_num = self.forward_cnn(hidden, mask) + mask = mask.transpose(-1, -2).float() + mask = mask.squeeze(-1) + hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, mask=mask) + acoustic_embeds, cif_peak = cif_v1_export(hidden, alphas, self.threshold) + + return acoustic_embeds, token_num, alphas, cif_peak + + def forward_cnn( + self, + hidden: torch.Tensor, + mask: torch.Tensor, + ): + h = hidden + context = h.transpose(1, 2) + queries = self.pad(context) + output = torch.relu(self.cif_conv1d(queries)) + output = output.transpose(1, 2) + + output = self.cif_output(output) + alphas = torch.sigmoid(output) + alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold) + mask = mask.transpose(-1, -2).float() + alphas = alphas * mask + alphas = alphas.squeeze(-1) + token_num = alphas.sum(-1) + + return alphas, token_num + + def tail_process_fn(self, hidden, alphas, token_num=None, mask=None): + b, t, d = hidden.size() + tail_threshold = self.tail_threshold + + zeros_t = torch.zeros((b, 1), dtype=torch.float32, device=alphas.device) + ones_t = torch.ones_like(zeros_t) + + mask_1 = torch.cat([mask, zeros_t], dim=1) + mask_2 = 
@torch.jit.script
def cif_v1_export(hidden, alphas, threshold: float):
    """Vectorized continuous integrate-and-fire (CIF).

    A token "fires" wherever the running (cumulative) sum of *alphas* crosses an
    integer boundary; each emitted frame is the alpha-weighted sum of the hidden
    states accumulated since the previous fire, computed here via prefix-sum
    differences instead of a per-step loop so the function is traceable.

    Returns (frame_fires, fires): frame_fires is (batch, max_token_len, hidden)
    with rows beyond each utterance's token count left at zero; fires carries the
    per-frame fire indicator plus the fractional remainder of the running sum.
    """
    device = hidden.device
    dtype = hidden.dtype
    batch_size, len_time, hidden_size = hidden.size()
    threshold = torch.tensor([threshold], dtype=alphas.dtype).to(alphas.device)

    frames = torch.zeros(batch_size, len_time, hidden_size, dtype=dtype, device=device)
    fires = torch.zeros(batch_size, len_time, dtype=dtype, device=device)

    # cumsum in float64: float32 cumsum precision degradation causes wrong
    # integer-boundary detection on long sequences.
    prefix_sum = torch.cumsum(alphas, dim=1, dtype=torch.float64).to(torch.float32)
    prefix_sum_floor = torch.floor(prefix_sum)
    dislocation_prefix_sum = torch.roll(prefix_sum, 1, dims=1)
    dislocation_prefix_sum_floor = torch.floor(dislocation_prefix_sum)

    # Position 0 has no predecessor; roll wrapped the last element around.
    dislocation_prefix_sum_floor[:, 0] = 0
    dislocation_diff = prefix_sum_floor - dislocation_prefix_sum_floor

    # A frame fires where the floored running sum increased.
    fire_idxs = dislocation_diff > 0
    fires[fire_idxs] = 1
    fires = fires + prefix_sum - prefix_sum_floor

    # Weighted prefix sums of hidden states, sampled at fire positions.
    prefix_sum_hidden = torch.cumsum(alphas.unsqueeze(-1).repeat((1, 1, hidden_size)) * hidden, dim=1)
    frames = prefix_sum_hidden[fire_idxs]
    shift_frames = torch.roll(frames, 1, dims=0)

    # Per-batch fire counts and the flat offset of each utterance's first fire;
    # the first fire of each utterance must not subtract the previous utterance's sum.
    batch_len = fire_idxs.sum(1)
    batch_idxs = torch.cumsum(batch_len, dim=0)
    shift_batch_idxs = torch.roll(batch_idxs, 1, dims=0)
    shift_batch_idxs[0] = 0
    shift_frames[shift_batch_idxs] = 0

    # Carry the fractional remainder of each fire into the next token.
    remains = fires - torch.floor(fires)
    remain_frames = remains[fire_idxs].unsqueeze(-1).repeat((1, hidden_size)) * hidden[fire_idxs]

    shift_remain_frames = torch.roll(remain_frames, 1, dims=0)
    shift_remain_frames[shift_batch_idxs] = 0

    frames = frames - shift_frames + shift_remain_frames - remain_frames

    # Token-capacity of the output: max floored alpha mass over the batch.
    max_label_len = alphas.sum(dim=-1)
    max_label_len = torch.floor(max_label_len).max().to(dtype=torch.int32)

    # Scatter emitted frames into a padded (batch, max_label_len, hidden) tensor.
    frame_fires = torch.zeros(batch_size, max_label_len, hidden_size, dtype=dtype, device=device)
    indices = torch.arange(max_label_len, dtype=torch.int32, device=device).expand(batch_size, -1)
    frame_fires_idxs = indices < batch_len.unsqueeze(1).to(torch.int32)
    frame_fires[frame_fires_idxs] = frames
    return frame_fires, fires
"", + # sym_blank: str = "", + # extract_feats_in_collect_stats: bool = True, + # predictor=None, + predictor_weight: float = 0.0, + predictor_bias: int = 0, + sampling_ratio: float = 0.2, + share_embedding: bool = False, + # preencoder: Optional[AbsPreEncoder] = None, + # postencoder: Optional[AbsPostEncoder] = None, + use_1st_decoder_loss: bool = False, + **kwargs, + ): + + super().__init__() + # Filter out streaming-specific parameters not supported by SANMEncoder + sanm_encoder_params = { + 'input_size', 'output_size', 'attention_heads', 'linear_units', + 'num_blocks', 'dropout_rate', 'positional_dropout_rate', + 'attention_dropout_rate', 'input_layer', 'pos_enc_class', + 'normalize_before', 'concat_after', 'positionwise_layer_type', + 'positionwise_conv_kernel_size', 'padding_idx', 'interctc_layer_idx', + 'interctc_use_conditioning', 'kernel_size', 'sanm_shfit', + 'lora_list', 'lora_rank', 'lora_alpha', 'lora_dropout', + 'selfattention_layer_type', 'tf2torch_tensor_name_prefix_torch', + 'tf2torch_tensor_name_prefix_tf' + } + filtered_encoder_conf = {k: v for k, v in encoder_conf.items() if k in sanm_encoder_params} + encoder = SANMEncoder(input_size=input_size, **filtered_encoder_conf) + encoder_output_size = encoder.output_size() + + if decoder is not None: + decoder = ParaformerSANMDecoder( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + **decoder_conf, + ) + + if predictor is not None: + predictor = CifPredictorV2(**predictor_conf) + + self.encoder = encoder + self.decoder = decoder + self.predictor = predictor + + def export(self, **kwargs): + + if "max_seq_len" not in kwargs: + kwargs["max_seq_len"] = 512 + models = export_rebuild_model(model=self, **kwargs) + return models + +def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg={}): + + if isinstance(file_path_metas, dict): + if isinstance(cfg, list): + cfg.append({}) + + for k, v in file_path_metas.items(): + if isinstance(v, str): + p = os.path.join(model_or_path, 
def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg=None):
    """
    Resolve the relative paths in *file_path_metas* against *model_or_path*.

    Walks the (possibly nested) metadata structure and records in *cfg* the
    absolute path of every entry that exists on disk; entries whose file is
    missing are skipped. *cfg* is mutated in place and also returned.

    Args:
        model_or_path: Root directory the relative paths are joined to.
        file_path_metas: Nested dict/list structure of relative path strings.
        cfg: Accumulator (dict, or list of dicts). Defaults to a fresh dict.

    Fix: the default for ``cfg`` was a shared mutable ``{}``, so results from
    earlier calls leaked into later ones; replaced with the None-sentinel idiom.
    """
    if cfg is None:
        cfg = {}

    if isinstance(file_path_metas, dict):
        if isinstance(cfg, list):
            # When collecting into a list, each dict level appends a new record.
            cfg.append({})

        for k, v in file_path_metas.items():
            if isinstance(v, str):
                p = os.path.join(model_or_path, v)
                if os.path.exists(p):
                    if isinstance(cfg, dict):
                        cfg[k] = p
                    elif isinstance(cfg, list):
                        cfg[-1][k] = p

            elif isinstance(v, dict):
                if isinstance(cfg, dict):
                    if k not in cfg:
                        cfg[k] = {}
                    add_file_root_path(model_or_path, v, cfg[k])

            elif isinstance(v, (list, tuple)):
                # Hoisted out of the loop: the key check is loop-invariant.
                if k not in cfg:
                    cfg[k] = []
                for vv in v:
                    if isinstance(vv, str):
                        p = os.path.join(model_or_path, vv)
                        if os.path.exists(p):
                            if isinstance(cfg[k], dict):
                                cfg[k] = p
                            elif isinstance(cfg[k], list):
                                cfg[k].append(p)
                    elif isinstance(vv, dict):
                        add_file_root_path(model_or_path, vv, cfg[k])

    return cfg
def download_from_hf(**kwargs):
    """Resolve the model directory (downloading from the HF Hub if needed) and
    fold the on-disk configuration into *kwargs*.

    Reads either ``configuration.json`` (+ ``file_path_metas``) or
    ``config.yaml`` from the model directory, merges it with the caller's
    kwargs via OmegaConf, and fills in ``init_param``/tokenizer/cmvn paths.
    Returns a plain dict.
    """
    model_or_path = kwargs.get("model")
    model_revision = kwargs.get("model_revision", "master")
    if not os.path.exists(model_or_path) and "model_path" not in kwargs:
        try:
            model_or_path = get_or_download_model_dir_hf(
                model_or_path,
                model_revision,
                is_training=kwargs.get("is_training"),
                check_latest=kwargs.get("check_latest", True),
            )
        except Exception as e:
            # Best-effort: fall through with the original identifier on failure.
            print(f"Download: {model_or_path} failed!: {e}")

    kwargs["model_path"] = model_or_path if "model_path" not in kwargs else kwargs["model_path"]

    if os.path.exists(os.path.join(model_or_path, "configuration.json")):
        # ModelScope-style layout: configuration.json points at the real config.
        with open(os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8") as f:
            conf_json = json.load(f)
        cfg = {}
        if "file_path_metas" in conf_json:
            add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
        cfg.update(kwargs)
        if "config" in cfg:
            config = OmegaConf.load(cfg["config"])
            kwargs = OmegaConf.merge(config, cfg)
            kwargs["model"] = config["model"]
    elif os.path.exists(os.path.join(model_or_path, "config.yaml")):
        # FunASR layout: config.yaml + model.pt + tokens.json (+ seg_dict, am.mvn).
        config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
        kwargs = OmegaConf.merge(config, kwargs)
        init_param = os.path.join(model_or_path, "model.pt")
        if "init_param" not in kwargs or not os.path.exists(kwargs["init_param"]):
            kwargs["init_param"] = init_param
            # NOTE(review): asserting here means a config.yaml without model.pt
            # is fatal — confirm this is intended.
            assert os.path.exists(kwargs["init_param"]), "init_param does not exist"
        if os.path.exists(os.path.join(model_or_path, "tokens.json")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.json")
        if os.path.exists(os.path.join(model_or_path, "seg_dict")):
            kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(model_or_path, "seg_dict")
        kwargs["model"] = config["model"]
        if os.path.exists(os.path.join(model_or_path, "am.mvn")):
            kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
    if isinstance(kwargs, DictConfig):
        # OmegaConf.merge returns a DictConfig; downstream code expects a dict.
        kwargs = OmegaConf.to_container(kwargs, resolve=True)

    return kwargs
kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn") + if isinstance(kwargs, DictConfig): + kwargs = OmegaConf.to_container(kwargs, resolve=True) + + return kwargs + +def deep_update(original, update): + for key, value in update.items(): + if isinstance(value, dict) and key in original: + if len(value) == 0: + original[key] = value + deep_update(original[key], value) + else: + original[key] = value + +def load_pretrained_model( + path: str, + model: torch.nn.Module, + ignore_init_mismatch: bool = True, + map_location: str = "cpu", + oss_bucket=None, + scope_map=[], + excludes=None, + **kwargs, +): + """Load a model state and set it to the model. + + Args: + init_param: ::: + + Examples: + + """ + + obj = model + dst_state = obj.state_dict() + ori_state = torch.load(path, map_location=map_location) + + src_state = copy.deepcopy(ori_state) + src_state = src_state["state_dict"] if "state_dict" in src_state else src_state + src_state = src_state["model_state_dict"] if "model_state_dict" in src_state else src_state + src_state = src_state["model"] if "model" in src_state else src_state + + if isinstance(scope_map, str): + scope_map = scope_map.split(",") + scope_map += ["module.", "None"] + logging.info(f"scope_map: {scope_map}") + + for k in dst_state.keys(): + excludes_flag = False + if excludes is not None: + for k_ex in excludes: + if k.startswith(k_ex): + logging.info(f"key: {k} matching: {k_ex}, excluded") + excludes_flag = True + break + if excludes_flag: + continue + + k_src = k + + if scope_map is not None: + src_prefix = "" + dst_prefix = "" + for i in range(0, len(scope_map), 2): + src_prefix = scope_map[i] if scope_map[i].lower() != "none" else "" + dst_prefix = scope_map[i + 1] if scope_map[i + 1].lower() != "none" else "" + + if dst_prefix == "" and (src_prefix + k) in src_state.keys(): + k_src = src_prefix + k + if not k_src.startswith("module."): + logging.info(f"init param, map: {k} from {k_src} in ckpt") + elif ( + 
def load_pretrained_model(
    path: str,
    model: torch.nn.Module,
    ignore_init_mismatch: bool = True,
    map_location: str = "cpu",
    oss_bucket=None,
    scope_map=[],
    excludes=None,
    **kwargs,
):
    """Load a checkpoint from *path* into *model*.

    Unwraps common checkpoint layouts ("state_dict" / "model_state_dict" /
    "model"), optionally remaps parameter names via *scope_map* (flat list of
    src-prefix/dst-prefix pairs; "none" means empty prefix), skips keys whose
    prefix is in *excludes*, and — when *ignore_init_mismatch* is set — keeps
    the model's own tensor for any shape mismatch instead of failing.

    Fix: ``scope_map += ["module.", "None"]`` previously mutated both the
    mutable ``[]`` default (growing it on every call) and any caller-supplied
    list; the list is now copied before being extended.

    Args:
        path: checkpoint file readable by ``torch.load``.
        model: target module; its state dict is updated and re-loaded strictly.
    """
    obj = model
    dst_state = obj.state_dict()
    ori_state = torch.load(path, map_location=map_location)

    # Unwrap the usual nesting conventions to reach the raw parameter dict.
    src_state = copy.deepcopy(ori_state)
    src_state = src_state["state_dict"] if "state_dict" in src_state else src_state
    src_state = src_state["model_state_dict"] if "model_state_dict" in src_state else src_state
    src_state = src_state["model"] if "model" in src_state else src_state

    if isinstance(scope_map, str):
        scope_map = scope_map.split(",")
    else:
        # Copy so the default/caller list is never mutated by the += below.
        scope_map = list(scope_map)
    scope_map += ["module.", "None"]
    logging.info(f"scope_map: {scope_map}")

    for k in dst_state.keys():
        # Skip any destination key matching an excluded prefix.
        excludes_flag = False
        if excludes is not None:
            for k_ex in excludes:
                if k.startswith(k_ex):
                    logging.info(f"key: {k} matching: {k_ex}, excluded")
                    excludes_flag = True
                    break
        if excludes_flag:
            continue

        k_src = k

        if scope_map is not None:
            src_prefix = ""
            dst_prefix = ""
            # scope_map is a flat [src0, dst0, src1, dst1, ...] pair list.
            for i in range(0, len(scope_map), 2):
                src_prefix = scope_map[i] if scope_map[i].lower() != "none" else ""
                dst_prefix = scope_map[i + 1] if scope_map[i + 1].lower() != "none" else ""

                if dst_prefix == "" and (src_prefix + k) in src_state.keys():
                    k_src = src_prefix + k
                    if not k_src.startswith("module."):
                        logging.info(f"init param, map: {k} from {k_src} in ckpt")
                elif (
                    k.startswith(dst_prefix)
                    and k.replace(dst_prefix, src_prefix, 1) in src_state.keys()
                ):
                    k_src = k.replace(dst_prefix, src_prefix, 1)
                    if not k_src.startswith("module."):
                        logging.info(f"init param, map: {k} from {k_src} in ckpt")

        if k_src in src_state.keys():
            if ignore_init_mismatch and dst_state[k].shape != src_state[k_src].shape:
                logging.info(
                    f"ignore_init_mismatch:{ignore_init_mismatch}, dst: {k, dst_state[k].shape}, src: {k_src, src_state[k_src].shape}"
                )
            else:
                dst_state[k] = src_state[k_src]
        else:
            print(f"Warning, miss key in ckpt: {k}, {path}")

    obj.load_state_dict(dst_state, strict=True)
def build_model(**kwargs):
    """Build a Paraformer model from a model id/directory.

    Downloads/resolves the model directory, merges its config into *kwargs*,
    constructs :class:`Paraformer`, loads ``init_param`` weights, and applies
    dtype/device placement. Returns ``(model, kwargs)``.
    """
    assert "model" in kwargs
    kwargs = download_model(**kwargs)
    torch.set_num_threads(kwargs.get("ncpu", 4))

    # build tokenizer
    # Tokenizer construction was removed; the vocab size is hard-coded to the
    # token count of the downloaded tokens.json instead of being derived from it.
    kwargs["vocab_size"] = 8404

    # build model: model_conf from the config, overridden by caller kwargs.
    model_conf = {}
    deep_update(model_conf, kwargs.get("model_conf", {}))
    deep_update(model_conf, kwargs)
    model = Paraformer(**model_conf)

    # init_param: load pretrained weights if the checkpoint exists.
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        if os.path.exists(init_param):
            logging.info(f"Loading pretrained params from {init_param}")
            load_pretrained_model(
                model=model,
                path=init_param,
                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
                oss_bucket=kwargs.get("oss_bucket", None),
                scope_map=kwargs.get("scope_map", []),
                excludes=kwargs.get("excludes", None),
            )
        else:
            print(f"error, init_param does not exist!: {init_param}")

    # fp16 / bf16 casting, then device placement.
    if kwargs.get("fp16", False):
        model.to(torch.float16)
    elif kwargs.get("bf16", False):
        model.to(torch.bfloat16)
    model.to(kwargs["device"])

    return model, kwargs
+ """ + model_type = "paraformer" + + def __init__( + self, + vocab_size: int = 8404, + encoder_dim: int = 512, + attention_heads: int = 4, + encoder_layers: int = 50, + decoder_layers: int = 16, + max_seq_len: int = 512, + frontend_conf: Optional[Dict] = None, + **kwargs + ): + if _TRANSFORMERS_AVAILABLE: + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.encoder_dim = encoder_dim + self.attention_heads = attention_heads + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.max_seq_len = max_seq_len + self.frontend_conf = frontend_conf or {} + + @classmethod + def from_funasr_config(cls, config_path: Union[str, Path]) -> "ParaformerConfig": + """Load configuration from FunASR config.yaml file.""" + try: + if _OMEGACONF_AVAILABLE: + config = OmegaConf.load(config_path) + + return cls( + vocab_size=config.get("vocab_size", 8404), + encoder_dim=config.get("encoder_conf", {}).get("output_size", 512), + attention_heads=config.get("encoder_conf", {}).get("attention_heads", 4), + encoder_layers=config.get("encoder_conf", {}).get("num_blocks", 50), + decoder_layers=config.get("decoder_conf", {}).get("num_blocks", 16), + max_seq_len=config.get("max_seq_len", 512), + frontend_conf=dict(config.get("frontend_conf", {})), + ) + except Exception as e: + logging.warning(f"Could not load FunASR config: {e}, using defaults") + return cls() + + +class ParaformerForASR(PreTrainedModel): + """ + Transformers-compatible wrapper for Paraformer ASR models. + + This class wraps FunASR Paraformer models to make them compatible with + the optimum-intel export pipeline. 
+ """ + if _TRANSFORMERS_AVAILABLE: + config_class = ParaformerConfig + base_model_prefix = "paraformer" + main_input_name = "speech" + + def __init__(self, config: ParaformerConfig, funasr_model=None): + if _TRANSFORMERS_AVAILABLE: + super().__init__(config) + self.config = config + self.funasr_model = funasr_model + self._jit_model = None + self._model_path = None + self._model_kwargs = {} + + @classmethod + def from_pretrained( + cls, + model_name_or_path: Union[str, Path], + *model_args, + cache_dir: Optional[str] = None, + **kwargs + ) -> "ParaformerForASR": + """ + Load a Paraformer model from a FunASR model directory or HuggingFace Hub. + """ + from huggingface_hub import snapshot_download + + model_path = Path(model_name_or_path) + + # Download from HuggingFace Hub if not a local path + if not model_path.exists(): + logging.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}") + model_path = Path(snapshot_download( + repo_id=str(model_name_or_path), + cache_dir=cache_dir, + token=kwargs.get("token"), + revision=kwargs.get("revision", "main"), + )) + + # Load config + config_yaml_path = model_path / "config.yaml" + if config_yaml_path.exists(): + config = ParaformerConfig.from_funasr_config(config_yaml_path) + else: + config = ParaformerConfig() + + # Load the FunASR model + device = kwargs.get("device", "cpu") + funasr_model, model_kwargs = build_model(model=str(model_path), device=device) + + instance = cls(config, funasr_model=funasr_model) + instance._model_path = model_path + instance._model_kwargs = model_kwargs + + return instance + + def get_jit_model(self) -> torch.jit.ScriptModule: + """Get or create the TorchScript model for export.""" + if self._jit_model is None: + _, self._jit_model = export( + self.funasr_model, + self._model_kwargs, + type="torchscript", + quantize=False, + device=str(self._model_kwargs.get("device", "cpu")) + ) + return self._jit_model + + def forward(self, speech: torch.Tensor, speech_lengths: 
def _load_paraformer_model(
    model_name_or_path: str,
    subfolder: str = "",
    revision: str = "main",
    cache_dir: str = None,
    token: Optional[str] = None,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Load a Paraformer model for export (TasksManager compatible loader).

    ``subfolder`` and ``trust_remote_code`` are accepted to match the loader
    signature TasksManager expects, but are not used by this loader.
    """
    return ParaformerForASR.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
        token=token,
        revision=revision,
        **kwargs,
    )
class ParaformerConfig(PretrainedConfig):
    """
    Configuration class for Paraformer ASR models.

    This provides a transformers-compatible configuration for FunASR Paraformer models.
    NOTE(review): near-duplicate of ParaformerConfig in export_paraformer.py;
    consider sharing a single definition.
    """
    model_type = "paraformer"

    def __init__(
        self,
        vocab_size: int = 8404,
        encoder_dim: int = 512,
        attention_heads: int = 4,
        encoder_layers: int = 50,
        decoder_layers: int = 16,
        frontend_conf: Optional[Dict] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim
        self.attention_heads = attention_heads
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.frontend_conf = frontend_conf or {}

    @classmethod
    def from_funasr_config(cls, config_path: Union[str, Path]) -> "ParaformerConfig":
        """Load configuration from FunASR config.yaml file, defaulting on any failure."""
        try:
            # Lazy import: omegaconf is only required for this code path.
            from omegaconf import OmegaConf
            config = OmegaConf.load(config_path)

            return cls(
                vocab_size=config.get("vocab_size", 8404),
                encoder_dim=config.get("encoder_conf", {}).get("output_size", 512),
                attention_heads=config.get("encoder_conf", {}).get("attention_heads", 4),
                encoder_layers=config.get("encoder_conf", {}).get("num_blocks", 50),
                decoder_layers=config.get("decoder_conf", {}).get("num_blocks", 16),
                frontend_conf=dict(config.get("frontend_conf", {})),
            )
        except Exception as e:
            logger.warning(f"Could not load FunASR config: {e}, using defaults")
            return cls()
class ParaformerForASR(PreTrainedModel):
    """
    Transformers-compatible wrapper for Paraformer ASR models.

    This class wraps FunASR Paraformer models to make them compatible with
    the optimum-intel export pipeline.
    """
    config_class = ParaformerConfig
    base_model_prefix = "paraformer"
    main_input_name = "speech"

    def __init__(self, config: ParaformerConfig, funasr_model=None):
        super().__init__(config)
        self.funasr_model = funasr_model
        self._jit_model = None  # lazily traced TorchScript module

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: Union[str, Path],
        *model_args,
        cache_dir: Optional[str] = None,
        **kwargs
    ) -> "ParaformerForASR":
        """
        Load a Paraformer model from a FunASR model directory or HuggingFace Hub.
        """
        from huggingface_hub import snapshot_download

        model_path = Path(model_name_or_path)

        # Download from HuggingFace Hub if not a local path
        if not model_path.exists():
            logger.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}")
            model_path = Path(snapshot_download(
                repo_id=str(model_name_or_path),
                cache_dir=cache_dir,
                token=kwargs.get("token"),
                revision=kwargs.get("revision", "main"),
            ))

        # Load config (defaults when config.yaml is absent).
        config_yaml_path = model_path / "config.yaml"
        if config_yaml_path.exists():
            config = ParaformerConfig.from_funasr_config(config_yaml_path)
        else:
            config = ParaformerConfig()

        # Load the FunASR model
        from optimum.exporters.openvino.modeling_paraformer import build_model

        device = kwargs.get("device", "cpu")
        funasr_model, model_kwargs = build_model(model=str(model_path), device=device)

        instance = cls(config, funasr_model=funasr_model)
        # NOTE(review): these attributes are created outside __init__; an
        # instance built directly (not via from_pretrained) will not have them.
        instance._model_path = model_path
        instance._model_kwargs = model_kwargs

        return instance

    def get_jit_model(self) -> torch.jit.ScriptModule:
        """Get or create the TorchScript model for export (traced once, then cached)."""
        if self._jit_model is None:
            from optimum.exporters.openvino.modeling_paraformer import export

            _, self._jit_model = export(
                self.funasr_model,
                self._model_kwargs,
                type="torchscript",
                quantize=False,
                device=str(self._model_kwargs.get("device", "cpu"))
            )
        return self._jit_model

    def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor):
        """Forward pass through the wrapped FunASR model."""
        if self.funasr_model is not None:
            return self.funasr_model(speech, speech_lengths)
        raise ValueError("FunASR model not loaded")
class ParaformerOnnxConfig(OnnxConfig):
    """
    ONNX/OpenVINO export configuration for Paraformer models.

    Describes the traced graph's I/O: LFR speech features plus their lengths in,
    log-softmax logits plus per-utterance token counts out.
    """
    NORMALIZED_CONFIG_CLASS = ParaformerConfig
    DEFAULT_ONNX_OPSET = 14

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        return {
            "speech": {0: "batch_size", 1: "sequence_length", 2: "feature_dim"},
            "speech_lengths": {0: "batch_size"},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Fix: the exported Paraformer graph has two outputs — the wrapped
        # forward returns (logits, token_num) — but "token_num" was missing here.
        return {
            "logits": {0: "batch_size", 1: "sequence_length"},
            "token_num": {0: "batch_size"},
        }

    def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict[str, Any]:
        """Generate dummy inputs for export."""
        batch_size = 1
        sequence_length = 1000  # ~10 seconds of audio at 16kHz with 10ms frame shift
        feature_dim = 560  # LFR features (80 mel * 7 frames)

        return {
            "speech": torch.randn(batch_size, sequence_length, feature_dim),
            "speech_lengths": torch.tensor([sequence_length], dtype=torch.int32),
        }
= HfFileSystem(token=kwargs.get("token")) + try: + repo_files = fs.ls(f"{model_name_or_path}", detail=False) + repo_files = [f.split("/")[-1] for f in repo_files] + + if "config.yaml" in repo_files and "tokens.json" in repo_files: + return True + if "am.mvn" in repo_files: + return True + except Exception: + pass + + return False + except Exception: + return False + + +def export_paraformer_to_openvino( + model_name_or_path: str, + output: Union[str, Path], + weight_format: str = "fp16", + cache_dir: str = None, + token: Optional[str] = None, + ov_config: Any = None, + **kwargs +) -> None: + """ + Export a Paraformer model to OpenVINO format. + + This function handles the complete export pipeline for FunASR Paraformer models, + including full INT8 quantization with calibration data when requested. + """ + import os + import openvino as ov + import shutil + import numpy as np + from optimum.exporters.openvino.modeling_paraformer import build_model, export + from huggingface_hub import snapshot_download + + model_path = Path(model_name_or_path) + output_path = Path(output) + + # Download from HuggingFace Hub if not a local path + if not model_path.exists(): + logger.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}") + model_path = Path(snapshot_download( + repo_id=str(model_name_or_path), + cache_dir=cache_dir, + token=token, + )) + + logger.info(f"Loading Paraformer model from {model_path}") + + # Build the FunASR model + device = kwargs.get("device", "cpu") + model, model_kwargs = build_model(model=str(model_path), device=device) + + # Export to TorchScript + logger.info("Converting to TorchScript...") + model_dir, jit_model = export(model, model_kwargs, type="torchscript", quantize=False, device=device) + + # Convert to OpenVINO + logger.info("Converting to OpenVINO format...") + ovm = ov.convert_model(jit_model, input=[([-1, -1, -1], torch.float32), ([-1], torch.int32)]) + + # Create output directory with ov_models subdirectory 
(matching optimum-intel structure) + ov_models_path = output_path / "ov_models" + ov_models_path.mkdir(parents=True, exist_ok=True) + output_model_path = ov_models_path / "openvino_model.xml" + + # Check if full INT8 quantization is requested (via ov_config with quantization_config) + apply_full_quant = False + dataset_name = None + num_samples = 50 + sym = False + + if ov_config is not None: + q_config = getattr(ov_config, "quantization_config", None) + if q_config is not None: + # Import configuration classes + try: + from optimum.intel.openvino.configuration import OVQuantizationConfig, OVWeightQuantizationConfig + + # Handle OVQuantizationConfig (from --quant-mode int8) + if isinstance(q_config, OVQuantizationConfig): + dtype = getattr(q_config, 'dtype', None) + dataset_name = getattr(q_config, 'dataset', None) + + if dtype == 'int8' and dataset_name is not None: + apply_full_quant = True + num_samples = getattr(q_config, 'num_samples', 50) or 50 + sym = getattr(q_config, 'sym', False) + logger.info(f"Full INT8 quantization requested with dataset={dataset_name}") + + # Handle OVWeightQuantizationConfig (from --weight-format int8) + elif isinstance(q_config, OVWeightQuantizationConfig): + apply_full_quant = False + weight_format = "int8" + + # Handle dict config (fallback) + elif isinstance(q_config, dict): + if q_config.get('dtype') == 'int8' and 'dataset' in q_config: + apply_full_quant = True + dataset_name = q_config.get('dataset') + num_samples = q_config.get('num_samples', 50) or 50 + sym = q_config.get('sym', False) + logger.info(f"Full INT8 quantization requested with dataset={dataset_name}") + except ImportError as e: + logger.warning(f"Could not import configuration classes: {e}") + + if apply_full_quant: + logger.info("Applying full INT8 quantization (weights + activations) for Paraformer...") + import nncf + import librosa + + # Helper function to extract paraformer features + def extract_paraformer_features(audio_path): + """Extract LFR features 
from audio for paraformer.""" + audio, sr = librosa.load(audio_path, sr=16000) + mel_spec = librosa.feature.melspectrogram( + y=audio, sr=sr, n_fft=512, hop_length=160, + win_length=400, n_mels=80, fmin=0, fmax=8000, power=2.0 + ) + log_mel = np.log(np.maximum(mel_spec, 1e-10)).T + log_mel = (log_mel - np.mean(log_mel, axis=0)) / (np.std(log_mel, axis=0) + 1e-10) + T = log_mel.shape[0] + pad_len = (6 - (T % 6)) % 6 + if pad_len > 0: + log_mel = np.pad(log_mel, ((0, pad_len), (0, 0)), mode='edge') + T_lfr = log_mel.shape[0] // 6 + lfr_features = [] + for i in range(T_lfr): + frames = [log_mel[min(i * 6 + j, log_mel.shape[0] - 1)] for j in range(7)] + lfr_features.append(np.concatenate(frames)) + return np.array(lfr_features, dtype=np.float32) + + # Generate calibration dataset + calibration_samples = [] + + if dataset_name and ('aishell' in dataset_name.lower()): + # Use AISHELL-style calibration with example audio + example_audio = os.path.join(model_dir, "example", "asr_example.wav") + + if not os.path.exists(example_audio): + raise ValueError( + f"AISHELL calibration requires example audio at {example_audio}. " + "File not found. Please ensure the model was downloaded correctly." + ) + + logger.info(f"Generating {num_samples} calibration samples from AISHELL audio...") + base_features = extract_paraformer_features(example_audio) + + # Generate diverse calibration samples with noise augmentation + np.random.seed(42) + for i in range(num_samples): + noise = np.random.randn(*base_features.shape).astype(np.float32) * (0.01 + i * 0.0004) + features = base_features + noise + + speech = features[np.newaxis, :].astype(np.float32) + speech_lengths = np.array([features.shape[0]], dtype=np.int32) + # Use 'speech.1' as the model input name (from OV conversion) + calibration_samples.append({'speech.1': speech, 'speech_lengths': speech_lengths}) + else: + raise ValueError( + f"Unknown dataset '{dataset_name}' for paraformer quantization. 
" + "Please use 'aishell-1' for AISHELL-style calibration." + ) + + # Create NNCF calibration dataset + calibration_dataset = nncf.Dataset(calibration_samples) + + # Set quantization preset based on sym flag + preset = nncf.QuantizationPreset.PERFORMANCE if sym else nncf.QuantizationPreset.MIXED + + # Get smooth_quant_alpha if available + smooth_quant_alpha = None + if ov_config is not None: + q_config = getattr(ov_config, "quantization_config", None) + if q_config is not None: + try: + from optimum.intel.openvino.configuration import OVQuantizationConfig + if isinstance(q_config, OVQuantizationConfig): + smooth_quant_alpha = getattr(q_config, 'smooth_quant_alpha', None) + except ImportError: + pass + + logger.info(f"Applying nncf.quantize() for full INT8 quantization...") + + # Build kwargs for nncf.quantize with per-tensor quantization for dynamic shape support + from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters, QuantizationParameters + + quant_kwargs = { + 'subset_size': num_samples, + 'model_type': nncf.ModelType.TRANSFORMER, + 'preset': preset, + 'advanced_parameters': AdvancedQuantizationParameters( + # Use per-tensor quantization for activations to avoid shape-specific constants + activations_quantization_params=QuantizationParameters(per_channel=False), + ) + } + + # Add smooth_quant_alpha if set + if smooth_quant_alpha is not None and smooth_quant_alpha != -1: + from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters + quant_kwargs['advanced_parameters'] = AdvancedQuantizationParameters( + activations_quantization_params=QuantizationParameters(per_channel=False), + smooth_quant_alphas=AdvancedSmoothQuantParameters( + matmul=smooth_quant_alpha, + convolution=smooth_quant_alpha + ) + ) + + ovm = nncf.quantize(ovm, calibration_dataset, **quant_kwargs) + logger.info("Full INT8 quantization complete.") + + # Save with FP16 compression + ov.save_model(ovm, str(output_model_path), compress_to_fp16=True) + + 
elif weight_format.lower() == "int8": + # Weight-only INT8 compression (from --weight-format int8) + logger.info("Applying INT8 weight compression...") + try: + import nncf + ovm = nncf.compress_weights(ovm, mode=nncf.CompressWeightsMode.INT8_SYM) + except ImportError: + logger.warning("NNCF not available, saving without INT8 compression") + + ov.save_model(ovm, str(output_model_path), compress_to_fp16=True) + else: + # No quantization - just serialize the model + logger.info(f"Saving model to {output_model_path}") + ov.serialize(ovm, str(output_model_path)) + + # Copy auxiliary files to ov_models directory + for aux_file in ["tokens.json", "config.yaml", "configuration.json", "am.mvn", "seg_dict"]: + src = model_path / aux_file + if src.exists(): + shutil.copy(src, ov_models_path / aux_file) + + logger.info(f"Paraformer model exported successfully to {output_path}") + + +def _load_paraformer_model( + model_name_or_path: str, + subfolder: str = "", + revision: str = "main", + cache_dir: str = None, + token: Optional[str] = None, + trust_remote_code: bool = False, + **kwargs, +): + """Load a Paraformer model for export.""" + return ParaformerForASR.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + token=token, + revision=revision, + **kwargs, + ) + + +def register_paraformer_with_tasks_manager(): + """ + Register Paraformer support with TasksManager. + + This function adds the necessary mappings for Paraformer to work + with the standard optimum-intel export pipeline. 
+ """ + # Register paraformer library with supported model types + if "paraformer" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: + TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["paraformer"] = { + "paraformer": { + "automatic-speech-recognition": ("ParaformerForASR",), + } + } + + # Register model loader for paraformer library + if "paraformer" not in TasksManager._LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP: + TasksManager._LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP["paraformer"] = { + "automatic-speech-recognition": _load_paraformer_model, + } + + logger.debug("Registered Paraformer support with TasksManager") + + +def patch_main_quantize(): + """ + Patch the _main_quantize function to skip Paraformer models + (since quantization is already handled in main_export). + """ + try: + from optimum.exporters.openvino import __main__ as ov_main + + original_main_quantize = ov_main._main_quantize + + @wraps(original_main_quantize) + def patched_main_quantize( + model_name_or_path: str, + **kwargs + ): + # Debug logging + logger.info(f"patched_main_quantize called for model: {model_name_or_path}") + + # Check if this is a Paraformer model + cache_dir = kwargs.get("cache_dir") + is_paraformer = _is_paraformer_model(model_name_or_path, cache_dir=cache_dir) + logger.info(f"Is Paraformer model: {is_paraformer}") + + if is_paraformer: + logger.info("Skipping _main_quantize for Paraformer (already quantized in main_export)") + # For Paraformer, quantization is already done in main_export, so just return + return + + # Not a Paraformer model, use original quantization + return original_main_quantize(model_name_or_path, **kwargs) + + # Apply the patch + ov_main._main_quantize = patched_main_quantize + logger.debug("Patched _main_quantize to skip Paraformer models") + + except Exception as e: + logger.warning(f"Could not patch _main_quantize for Paraformer support: {e}") + + +def patch_main_export(): + """ + Patch the main_export function to handle Paraformer models automatically. 
+ + This allows `optimum-cli export openvino --model funasr/paraformer-zh ...` to work + without modifying __main__.py directly. + """ + try: + from optimum.exporters.openvino import __main__ as ov_main + + original_main_export = ov_main.main_export + + @wraps(original_main_export) + def patched_main_export( + model_name_or_path: str, + output: Union[str, Path], + task: str = "auto", + **kwargs + ): + # Check if this is a Paraformer model + if _is_paraformer_model(model_name_or_path, cache_dir=kwargs.get("cache_dir")): + logger.info("Detected Paraformer model (FunASR). Using specialized export.") + + # Get ov_config for quantization settings + ov_config = kwargs.get("ov_config") + + # Determine weight format from kwargs + weight_format = kwargs.get("weight_format", "fp16") + if weight_format is None: + weight_format = "fp16" + + # Check ov_config for quantization settings to determine weight_format + if ov_config is not None: + quant_config = getattr(ov_config, "quantization_config", None) + if quant_config is not None: + if hasattr(quant_config, 'bits') and quant_config.bits == 8: + weight_format = "int8" + elif hasattr(quant_config, 'weight_dtype') and 'int8' in str(quant_config.weight_dtype).lower(): + weight_format = "int8" + + export_paraformer_to_openvino( + model_name_or_path=model_name_or_path, + output=output, + weight_format=weight_format, + cache_dir=kwargs.get("cache_dir"), + token=kwargs.get("token"), + device=kwargs.get("device", "cpu"), + ov_config=ov_config, + ) + return + + # Not a Paraformer model, use original export + return original_main_export(model_name_or_path, output, task, **kwargs) + + # Apply the patch + ov_main.main_export = patched_main_export + logger.debug("Patched main_export to support Paraformer models") + + except Exception as e: + logger.warning(f"Could not patch main_export for Paraformer support: {e}") + + +# Auto-register when this module is imported +register_paraformer_with_tasks_manager() +patch_main_export() 
+patch_main_quantize() + diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index dd110267ea..07ca64db81 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -104,6 +104,7 @@ "OVModelForSpeechSeq2Seq", "OVModelForTextToSpeechSeq2Seq", "OVModelForVision2Seq", + "OVParaformerForSpeechSeq2Seq", "OVModelForVisualCausalLM", "OVModelForSequenceClassification", "OVModelForTokenClassification", @@ -132,6 +133,7 @@ "OVModelForSpeechSeq2Seq", "OVModelForTextToSpeechSeq2Seq", "OVModelForVision2Seq", + "OVParaformerForSpeechSeq2Seq", "OVModelForVisualCausalLM", "OVModelForSequenceClassification", "OVModelForTokenClassification", @@ -416,6 +418,7 @@ OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, OVModelForTokenClassification, + OVParaformerForSpeechSeq2Seq, OVModelForVision2Seq, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..6c2926e5e3 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -86,6 +86,7 @@ ) from .modeling_sam import OVSamModel from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq, OVModelForVision2Seq +from .modeling_speech2text import OVParaformerForSpeechSeq2Seq from .modeling_text2speech import OVModelForTextToSpeechSeq2Seq from .modeling_visual_language import OVModelForVisualCausalLM diff --git a/optimum/intel/openvino/modeling_speech2text.py b/optimum/intel/openvino/modeling_speech2text.py new file mode 100644 index 0000000000..89c057d6c2 --- /dev/null +++ b/optimum/intel/openvino/modeling_speech2text.py @@ -0,0 +1,850 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
OpenVINO Paraformer Speech-to-Text Model Implementation
Following the pattern from optimum-intel's modeling_text2speech.py
"""

import logging
import os
from dataclasses import dataclass
from pathlib import Path
from tempfile import gettempdir
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import openvino
from openvino import CompiledModel, Core, Model
import torch
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from transformers import AutoConfig, PretrainedConfig
from transformers.utils import ModelOutput

from .utils import OV_DECODER_NAME, OV_ENCODER_NAME, OV_XML_FILE_NAME, OV_TO_PT_TYPE

logger = logging.getLogger(__name__)

# Single shared OpenVINO Core used by all model parts in this module.
core = Core()

# Additional model file name for Paraformer predictor
OV_PREDICTOR_NAME = "openvino_predictor_model.xml"


@dataclass
class ParaformerModelOutput(ModelOutput):
    """
    Output type of ParaformerModel.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, vocab_size)`):
            Predicted logits for each token.
        token_num (`torch.LongTensor` of shape `(batch_size,)`):
            Number of predicted tokens for each sequence.
        token_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded token IDs (if `decode=True`).
    """
    logits: torch.FloatTensor = None
    token_num: torch.LongTensor = None
    token_ids: torch.LongTensor = None


class OVParaformerModelPart:
    """
    Base class for OpenVINO Paraformer model components.
    Following the OVModelPart pattern from optimum-intel.

    Holds a (possibly compiled) OV model plus discovered input/output name and
    dtype maps; compilation is lazy unless the parent is in compile-only mode.
    """
    _model_name = "model"

    def __init__(
        self,
        model: Union[Model, CompiledModel],
        parent_model: "OVParaformerForSpeechSeq2Seq",
        ov_config: Optional[Dict[str, str]] = None,
        model_name: str = None,
    ):
        self.model = model
        self.parent_model = parent_model
        self._model_name = model_name or self._model_name

        self._compile_only = getattr(parent_model, '_compile_only', False)
        # NOTE(review): a caller-supplied ov_config is stored un-copied here and
        # compile() may add CACHE_DIR to it — confirm shared-dict mutation is intended.
        self.ov_config = ov_config or getattr(parent_model, 'ov_config', {}).copy()

        # Initialize request
        if self._compile_only and isinstance(model, CompiledModel):
            self.request = model.create_infer_request()
        else:
            self.request = None

        # Extract input/output metadata. A CompiledModel exposes its graph via
        # get_runtime_model(); a plain Model can be inspected directly.
        model_for_meta = model.get_runtime_model() if isinstance(model, CompiledModel) else model

        self.input_names: Dict[str, int] = {}
        self.input_dtypes: Dict[str, str] = {}
        for idx, inp in enumerate(model_for_meta.inputs):
            try:
                names = inp.get_names()
                # Prefer a "clean" tensor name without path separators; fall back
                # to the first name or a positional placeholder.
                name = next((n for n in names if "/" not in n), list(names)[0] if names else f"input_{idx}")
            except Exception:
                name = f"input_{idx}"
            self.input_names[name] = idx
            self.input_dtypes[name] = inp.get_element_type().get_type_name()

        self.output_names: Dict[str, int] = {}
        self.output_dtypes: Dict[str, str] = {}
        for idx, out in enumerate(model_for_meta.outputs):
            try:
                names = out.get_names()
                name = next((n for n in names if "/" not in n), list(names)[0] if names else f"output_{idx}")
            except Exception:
                name = f"output_{idx}"
            self.output_names[name] = idx
            self.output_dtypes[name] = out.get_element_type().get_type_name()

    @property
    def _device(self) -> str:
        # Device string is owned by the parent wrapper.
        return self.parent_model._device

    @property
    def device(self) -> torch.device:
        # Torch-facing tensors always live on CPU regardless of the OV device.
        return torch.device("cpu")

    @property
    def dtype(self) -> Optional[torch.dtype]:
        # First floating-point dtype found among inputs, then outputs; None if none.
        for dtype in self.input_dtypes.values():
            torch_dtype = OV_TO_PT_TYPE.get(dtype)
            if torch_dtype is not None and torch_dtype.is_floating_point:
                return torch_dtype
        for dtype in self.output_dtypes.values():
            torch_dtype = OV_TO_PT_TYPE.get(dtype)
            if torch_dtype is not None and torch_dtype.is_floating_point:
                return torch_dtype
        return None

    def compile(self):
        """Compile the model for inference."""
        if self._compile_only and isinstance(self.model, CompiledModel):
            if self.request is None:
                self.request = self.model.create_infer_request()
            return

        if self.request is None:
            # Set cache directory for GPU (skipped for temp dirs and explicit CACHE_DIR)
            model_dir = getattr(self.parent_model, 'model_save_dir', None)
            if (
                model_dir is not None
                and "CACHE_DIR" not in self.ov_config
                and not str(model_dir).startswith(gettempdir())
                and "gpu" in self._device.lower()
            ):
                self.ov_config["CACHE_DIR"] = os.path.join(str(model_dir), self._model_name, "model_cache")

            logger.info(f"Compiling {self._model_name} to {self._device}...")
            compiled_model = core.compile_model(self.model, self._device, self.ov_config)
            self.request = compiled_model.create_infer_request()
            logger.info(f"✅ {self._model_name} compiled successfully")

    def clear_requests(self):
        """Clear inference request to free resources."""
        if self._compile_only:
            raise ValueError("`clear_requests()` is not supported in `compile_only` mode")
        self.request = None

    def _prepare_input(self, tensor: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
        """Convert input to numpy array."""
        if isinstance(tensor, torch.Tensor):
            return tensor.cpu().numpy()
        return tensor

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        # Subclasses implement the component-specific inference.
        raise NotImplementedError


class OVParaformerEncoder(OVParaformerModelPart):
    """
    Paraformer Encoder component for OpenVINO inference.

    Processes input speech features and produces encoder hidden states.
    """
    _model_name = "encoder"

    def forward(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass through the encoder.
+ + Args: + speech: Input speech features [batch, time, features] + speech_lengths: Valid lengths for each sequence [batch] + + Returns: + encoder_out: Encoded hidden states [batch, time, hidden] + encoder_out_lens: Output lengths [batch] + """ + self.compile() + + inputs = { + "speech": self._prepare_input(speech), + "speech_lengths": self._prepare_input(speech_lengths), + } + + self.request.infer(inputs) + + encoder_out = torch.from_numpy(self.request.get_output_tensor(0).data.copy()) + encoder_out_lens = torch.from_numpy(self.request.get_output_tensor(1).data.copy()) + + return encoder_out, encoder_out_lens + + +class OVParaformerPredictor(OVParaformerModelPart): + """ + Paraformer CIF Predictor component for OpenVINO inference. + + Predicts acoustic embeddings and token counts from encoder output. + """ + _model_name = "predictor" + + def forward( + self, + encoder_out: Union[torch.Tensor, np.ndarray], + encoder_out_lens: Union[torch.Tensor, np.ndarray], + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Forward pass through the CIF predictor. 
+ + Args: + encoder_out: Encoder output [batch, time, hidden] + encoder_out_lens: Encoder output lengths [batch] + + Returns: + acoustic_embeds: Predicted acoustic embeddings [batch, token_num, hidden] + token_num: Number of predicted tokens [batch] + alphas: CIF weights [batch, time] (optional) + peak_index: Peak indices [batch, token_num] (optional) + """ + self.compile() + + # Create attention mask [batch, 1, max_len] + if isinstance(encoder_out, torch.Tensor): + batch_size, max_len = encoder_out.shape[0], encoder_out.shape[1] + arange = torch.arange(max_len, dtype=torch.int32).unsqueeze(0).expand(batch_size, -1) + mask = (arange < encoder_out_lens.unsqueeze(1).to(torch.int32)).to(torch.float32) + mask = mask.unsqueeze(1) + else: + batch_size, max_len = encoder_out.shape[0], encoder_out.shape[1] + arange = np.arange(max_len, dtype=np.int32)[np.newaxis, :].repeat(batch_size, axis=0) + mask = (arange < encoder_out_lens[:, np.newaxis]).astype(np.float32) + mask = mask[:, np.newaxis, :] + + # Map encoder_out and mask to actual OV input names using discovered input_names + # to avoid mismatch with TorchScript arg names + input_names_list = list(self.input_names.keys()) + inputs = {} + if len(input_names_list) > 0: + inputs[input_names_list[0]] = self._prepare_input(encoder_out) + if len(input_names_list) > 1: + inputs[input_names_list[1]] = self._prepare_input(mask) + + self.request.infer(inputs) + + acoustic_embeds = torch.from_numpy(self.request.get_output_tensor(0).data.copy()) + token_num = torch.from_numpy(self.request.get_output_tensor(1).data.copy()) + + alphas = None + peak_index = None + if len(self.output_names) > 2: + alphas = torch.from_numpy(self.request.get_output_tensor(2).data.copy()) + if len(self.output_names) > 3: + peak_index = torch.from_numpy(self.request.get_output_tensor(3).data.copy()) + + return acoustic_embeds, token_num, alphas, peak_index + + +class OVParaformerDecoder(OVParaformerModelPart): + """ + Paraformer Decoder component for 
OpenVINO inference.

    Produces output logits from encoder output and acoustic embeddings.
    """
    _model_name = "decoder"

    def forward(
        self,
        encoder_out: Union[torch.Tensor, np.ndarray],
        encoder_out_lens: Union[torch.Tensor, np.ndarray],
        acoustic_embeds: Union[torch.Tensor, np.ndarray],
        token_num: Union[torch.Tensor, np.ndarray],
    ) -> torch.Tensor:
        """
        Forward pass through the decoder.

        Args:
            encoder_out: Encoder output [batch, time, hidden]
            encoder_out_lens: Encoder output lengths [batch]
            acoustic_embeds: Acoustic embeddings from predictor [batch, token_num, hidden]
            token_num: Number of tokens [batch]

        Returns:
            logits: Output logits [batch, token_num, vocab_size]
        """
        self.compile()

        # NOTE(review): hard-coded input names — confirm they match the exported
        # decoder's tensor names (cf. the predictor's positional mapping).
        inputs = {
            "encoder_out": self._prepare_input(encoder_out),
            "encoder_out_lens": self._prepare_input(encoder_out_lens),
            "acoustic_embeds": self._prepare_input(acoustic_embeds),
            "token_num": self._prepare_input(token_num),
        }

        self.request.infer(inputs)

        logits = torch.from_numpy(self.request.get_output_tensor(0).data.copy())

        return logits


class OVParaformerForSpeechSeq2Seq:
    """
    OpenVINO Paraformer model for automatic speech recognition.

    This class provides a unified interface for loading and running inference
    on Paraformer models exported to OpenVINO IR format. It supports both
    single-file models and multi-component (encoder/predictor/decoder) models.

    Following the pattern from optimum-intel's OVModelForTextToSpeechSeq2Seq.

    Args:
        model_path: Path to the model directory containing OpenVINO IR files
        device: Target device for inference (CPU, GPU, AUTO, etc.)
        ov_config: OpenVINO runtime configuration dictionary
        compile_only: If True, skip model loading and compile directly from files

    Example:
        ```python
        model = OVParaformerForSpeechSeq2Seq.from_pretrained(
            "/path/to/paraformer-zh/ov_models",
            device="GPU",
        )

        # Run inference
        output = model(speech_features, speech_lengths)
        token_ids = output.token_ids
        ```
    """

    auto_model_class = None
    export_feature = "automatic-speech-recognition"
    main_input_name = "speech"

    def __init__(
        self,
        model: Optional[Model] = None,
        encoder: Optional[Model] = None,
        predictor: Optional[Model] = None,
        decoder: Optional[Model] = None,
        config: Optional[PretrainedConfig] = None,
        device: str = "CPU",
        ov_config: Optional[Dict[str, str]] = None,
        model_save_dir: Optional[Union[str, Path]] = None,
        compile_only: bool = False,
        compile: bool = True,
        **kwargs,
    ):
        self.config = config
        self.model_save_dir = Path(model_save_dir) if model_save_dir else None
        self._device = device.upper()
        self.ov_config = ov_config.copy() if ov_config else {}
        self._compile_only = compile_only
        self.preprocessors = kwargs.get("preprocessors", [])
        self.generation_config = kwargs.get("generation_config", None)

        # Determine if we have a single model or separate components
        self._single_model = model is not None

        if self._single_model:
            # Single combined model
            self.model = model
            self._model_component = OVParaformerModelPart(
                model, self, ov_config=self.ov_config, model_name="model"
            )
            self.encoder = None
            self.predictor = None
            self.decoder = None

            # Extract I/O metadata from the single model
            self.input_names = self._model_component.input_names.copy()
            self.output_names = self._model_component.output_names.copy()
        else:
            # Separate components
            self.model = None
            self._model_component = None
            self.encoder = OVParaformerEncoder(encoder, self, model_name="encoder") if encoder else None
            self.predictor = OVParaformerPredictor(predictor, self, model_name="predictor") if predictor else None
            self.decoder = OVParaformerDecoder(decoder, self, model_name="decoder") if decoder else None

            # Combine I/O names: pipeline inputs come from the encoder,
            # pipeline outputs from the decoder.
            self.input_names = {}
            self.output_names = {}
            if self.encoder:
                self.input_names.update(self.encoder.input_names)
            if self.decoder:
                self.output_names.update(self.decoder.output_names)

        if compile and not compile_only:
            self.compile()

    @classmethod
    def from_pretrained(
        cls,
        model_id: Union[str, Path],
        device: str = "CPU",
        ov_config: Optional[Dict[str, str]] = None,
        token: Optional[Union[bool, str]] = None,
        revision: Optional[str] = None,
        force_download: bool = False,
        cache_dir: str = HUGGINGFACE_HUB_CACHE,
        local_files_only: bool = False,
        compile_only: bool = False,
        compile: bool = True,
        **kwargs,
    ) -> "OVParaformerForSpeechSeq2Seq":
        """
        Load a Paraformer model from a local directory or Hugging Face Hub.

        Args:
            model_id: Local path or Hugging Face Hub model ID
            device: Target device (CPU, GPU, AUTO)
            ov_config: OpenVINO configuration dictionary
            token: Hugging Face authentication token
            revision: Model revision to use
            force_download: Force re-download from Hub
            cache_dir: Directory to cache downloaded models
            local_files_only: Only use local files, no Hub download
            compile_only: Load as compiled model directly
            compile: Whether to compile models after loading

        Returns:
            OVParaformerForSpeechSeq2Seq instance

        Raises:
            FileNotFoundError: if neither a single-model IR nor component IRs exist.
        """
        # NOTE(review): only local paths are handled here — the Hub-related
        # arguments (token/revision/force_download/...) are currently unused.
        model_path = Path(model_id)

        # Try to load config
        config = None
        config_paths = [
            model_path / "config.json",
            model_path / "config.yaml",
        ]
        for cfg_path in config_paths:
            if cfg_path.exists():
                try:
                    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
                    break
                except Exception:
                    pass

        # Check for single model file
        single_model_path = model_path / OV_XML_FILE_NAME
        if single_model_path.exists():
            logger.info(f"Loading single Paraformer model from {single_model_path}")
            # In compile_only mode _load_model compiles immediately for `device`.
            model = cls._load_model(single_model_path, device if compile_only else None, ov_config)
            return cls(
                model=model,
                config=config,
                device=device,
                ov_config=ov_config,
                model_save_dir=model_path,
                compile_only=compile_only,
                compile=compile,
                **kwargs,
            )

        # Check for separate component files
        encoder_path = model_path / OV_ENCODER_NAME
        predictor_path = model_path / OV_PREDICTOR_NAME
        decoder_path = model_path / OV_DECODER_NAME

        if encoder_path.exists() and decoder_path.exists():
            logger.info(f"Loading Paraformer components from {model_path}")

            encoder = cls._load_model(encoder_path, device if compile_only else None, ov_config)
            decoder = cls._load_model(decoder_path, device if compile_only else None, ov_config)
            # Predictor is optional; the pipeline degrades gracefully without it.
            predictor = None
            if predictor_path.exists():
                predictor = cls._load_model(predictor_path, device if compile_only else None, ov_config)

            return cls(
                encoder=encoder,
                predictor=predictor,
                decoder=decoder,
                config=config,
                device=device,
                ov_config=ov_config,
                model_save_dir=model_path,
                compile_only=compile_only,
                compile=compile,
                **kwargs,
            )

        raise FileNotFoundError(
            f"Could not find Paraformer model files in {model_path}. "
            f"Expected either '{OV_XML_FILE_NAME}' or component files like '{OV_ENCODER_NAME}'."
        )

    @staticmethod
    def _load_model(
        path: Path,
        device: Optional[str] = None,
        ov_config: Optional[Dict[str, str]] = None,
    ) -> Union[Model, CompiledModel]:
        """Load an OpenVINO model from file; compile immediately if `device` is given."""
        logger.info(f"Loading model from {path}")
        model = core.read_model(path)

        if device is not None:
            # Compile directly (compile_only mode)
            return core.compile_model(model, device, ov_config or {})

        return model

    @property
    def device(self) -> torch.device:
        """Return torch device (always CPU for compatibility)."""
        return torch.device("cpu")

    @property
    def dtype(self) -> torch.dtype:
        """Return model dtype (falls back to float32 when undeterminable)."""
        if self._model_component:
            return self._model_component.dtype
        if self.encoder:
            return self.encoder.dtype
        return torch.float32

    @property
    def _component_names(self) -> List[str]:
        """Return list of loaded component names."""
        if self._single_model:
            return ["model"]
        names = []
        if self.encoder: names.append("encoder")
        if self.predictor: names.append("predictor")
        if self.decoder: names.append("decoder")
        return names

    @property
    def components(self) -> Dict[str, OVParaformerModelPart]:
        """Return dictionary of model components."""
        if self._single_model:
            return {"model": self._model_component}
        comps = {}
        if self.encoder: comps["encoder"] = self.encoder
        if self.predictor: comps["predictor"] = self.predictor
        if self.decoder: comps["decoder"] = self.decoder
        return comps

    def to(self, device: str) -> "OVParaformerForSpeechSeq2Seq":
        """
        Move model to specified device.

        Args:
            device: Target device (CPU, GPU, AUTO)

        Returns:
            self for method chaining
        """
        if self._compile_only:
            raise ValueError("`to()` is not supported in `compile_only` mode")

        if isinstance(device, str):
            self._device = device.upper()
            # Drop existing requests so the next compile() targets the new device.
            self.clear_requests()

        return self

    def compile(self):
        """Compile all model components for inference."""
        for component in self.components.values():
            component.compile()

    def clear_requests(self):
        """Clear all inference requests."""
        for component in self.components.values():
            component.clear_requests()

    def __call__(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
        **kwargs,
    ) -> ParaformerModelOutput:
        """
        Run inference on speech input.

        Args:
            speech: Input speech features [batch, time, features]
            speech_lengths: Valid lengths for each sequence [batch]
            decode: Whether to decode logits to token IDs

        Returns:
            ParaformerModelOutput containing logits, token_num, and optionally token_ids
        """
        return self.forward(speech, speech_lengths, decode=decode, **kwargs)

    def forward(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
        **kwargs,
    ) -> ParaformerModelOutput:
        """
        Forward pass through the model.

        Args:
            speech: Input speech features [batch, time, features]
            speech_lengths: Valid lengths for each sequence [batch]
            decode: Whether to decode logits to token IDs

        Returns:
            ParaformerModelOutput containing logits, token_num, and optionally token_ids
        """
        # Dispatch on model layout chosen at construction time.
        if self._single_model:
            return self._forward_single_model(speech, speech_lengths, decode=decode)
        else:
            return self._forward_components(speech, speech_lengths, decode=decode)

    def _forward_single_model(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
    ) -> ParaformerModelOutput:
        """Forward pass for single combined model."""
        self._model_component.compile()

        # Find speech input name (might be 'speech' or 'speech.1')
        speech_input_name = None
        for name in self.input_names:
            if 'speech' in name.lower() and 'length' not in name.lower():
                speech_input_name = name
                break

        if speech_input_name is None:
            # Fall back to first input
            speech_input_name = list(self.input_names.keys())[0]

        # Prepare inputs
        speech_np = speech.cpu().numpy() if isinstance(speech, torch.Tensor) else speech
        lengths_np = speech_lengths.cpu().numpy() if isinstance(speech_lengths, torch.Tensor) else speech_lengths

        inputs = {
            speech_input_name: speech_np,
            "speech_lengths": lengths_np,
        }

        # Run inference
        self._model_component.request.infer(inputs)

        # Get outputs (copy so results survive the next infer call)
        logits = torch.from_numpy(self._model_component.request.get_output_tensor(0).data.copy())
        token_num = None
        if len(self.output_names) > 1:
            token_num = torch.from_numpy(self._model_component.request.get_output_tensor(1).data.copy())

        # Decode if requested
        token_ids = None
        if decode:
            token_ids = self.decode(logits, token_num)

        return ParaformerModelOutput(
            logits=logits,
            token_num=token_num,
            token_ids=token_ids,
        )

    def _forward_components(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor,
np.ndarray], + decode: bool = True, + ) -> ParaformerModelOutput: + """Forward pass for separate component models.""" + # 1. Encoder + encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths) + + # 2. Predictor (if available) + if self.predictor is not None: + acoustic_embeds, token_num, alphas, peak_index = self.predictor( + encoder_out, encoder_out_lens + ) + else: + # Without predictor, pass encoder output directly + acoustic_embeds = encoder_out + token_num = encoder_out_lens + + # 3. Decoder + logits = self.decoder(encoder_out, encoder_out_lens, acoustic_embeds, token_num) + + # Decode if requested + token_ids = None + if decode: + token_ids = self.decode(logits, token_num) + + return ParaformerModelOutput( + logits=logits, + token_num=token_num, + token_ids=token_ids, + ) + + def decode( + self, + logits: torch.Tensor, + token_num: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Decode logits to token IDs using greedy decoding. + + Args: + logits: Output logits [batch, seq_len, vocab_size] + token_num: Optional token numbers for each batch item [batch] + + Returns: + token_ids: Predicted token IDs [batch, seq_len] + """ + token_ids = torch.argmax(logits, dim=-1) + + # Mask out padding if token_num is provided + if token_num is not None: + batch_size = token_ids.shape[0] + max_len = token_ids.shape[1] + for i in range(batch_size): + num = int(token_num[i].item()) if torch.is_tensor(token_num[i]) else int(token_num[i]) + if num < max_len: + token_ids[i, num:] = 0 + + return token_ids + + def generate( + self, + speech: Union[torch.Tensor, np.ndarray], + speech_lengths: Union[torch.Tensor, np.ndarray], + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate token IDs from speech input. + + This is an alias for forward() with decode=True for API compatibility. 
+ + Args: + speech: Input speech features [batch, time, features] + speech_lengths: Valid lengths for each sequence [batch] + + Returns: + token_ids: Predicted token IDs [batch, seq_len] + token_num: Number of valid tokens per sequence [batch] + """ + output = self.forward(speech, speech_lengths, decode=True, **kwargs) + return output.token_ids, output.token_num + + def save_pretrained( + self, + save_directory: Union[str, Path], + ): + """ + Save model to directory. + + Args: + save_directory: Directory to save model files + """ + save_path = Path(save_directory) + save_path.mkdir(parents=True, exist_ok=True) + + if self._single_model: + model_path = save_path / OV_XML_FILE_NAME + openvino.save_model(self.model, str(model_path)) + logger.info(f"Saved model to {model_path}") + else: + if self.encoder: + encoder_path = save_path / OV_ENCODER_NAME + openvino.save_model(self.encoder.model, str(encoder_path)) + logger.info(f"Saved encoder to {encoder_path}") + if self.predictor: + predictor_path = save_path / OV_PREDICTOR_NAME + openvino.save_model(self.predictor.model, str(predictor_path)) + logger.info(f"Saved predictor to {predictor_path}") + if self.decoder: + decoder_path = save_path / OV_DECODER_NAME + openvino.save_model(self.decoder.model, str(decoder_path)) + logger.info(f"Saved decoder to {decoder_path}") + + # Save config if available + if self.config is not None: + self.config.save_pretrained(save_path) + + +# Alias for backwards compatibility +OVModelForSpeech2Seq = OVParaformerForSpeechSeq2Seq +load_paraformer_model = OVParaformerForSpeechSeq2Seq.from_pretrained + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Paraformer OpenVINO Inference") + parser.add_argument("--model", required=True, help="Path to OpenVINO model directory") + parser.add_argument("--device", default="CPU", help="Device (CPU/GPU/AUTO)") + parser.add_argument("--input", help="Path to input speech .npy file") + 
parser.add_argument("--lengths", help="Path to lengths .npy file") + + args = parser.parse_args() + + # Enable logging + logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") + + # Load model + print(f"Loading model from {args.model}") + model = OVParaformerForSpeechSeq2Seq.from_pretrained(args.model, device=args.device) + print(f"✅ Model loaded on {args.device}") + print(f" Components: {model._component_names}") + print(f" Input names: {list(model.input_names.keys())}") + print(f" Output names: {list(model.output_names.keys())}") + + # Load or create input + if args.input and args.lengths: + speech = torch.from_numpy(np.load(args.input)) + speech_lengths = torch.from_numpy(np.load(args.lengths)) + print(f"Loaded input: speech {speech.shape}, lengths {speech_lengths.shape}") + else: + # Create dummy input + speech = torch.randn(1, 100, 560) + speech_lengths = torch.tensor([100], dtype=torch.int32) + print("Using dummy input: speech [1, 100, 560]") + + # Run inference + print("\nRunning inference...") + output = model(speech, speech_lengths) + + print(f"\n✅ Inference completed!") + print(f" Logits shape: {output.logits.shape}") + print(f" Token numbers: {output.token_num}") + if output.token_ids is not None: + num = int(output.token_num[0]) if output.token_num is not None else 10 + print(f" Token IDs (first {num}): {output.token_ids[0, :num].tolist()}") diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 818eb41726..2b23f78d36 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -204,6 +204,16 @@ "split": "validation", "streaming": True, "revision": "refs/convert/parquet", + }, + "AISHELL-1": { + "id": "speechcolab/aishell", + "split": "validation", + "streaming": True, + }, + "aishell-1": { + "id": "speechcolab/aishell", + "split": "validation", + "streaming": True, } } diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py 
index 7ea4102ec7..7ee9d0d8af 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -80,6 +80,7 @@ OVModelForVision2Seq, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, + OVParaformerForSpeechSeq2Seq, ) from ..openvino.modeling_base import OVBaseModel @@ -87,7 +88,7 @@ "audio-classification": (OVModelForAudioClassification,), "audio-frame-classification": (OVModelForAudioFrameClassification,), "audio-xvector": (OVModelForAudioXVector,), - "automatic-speech-recognition": (OVModelForCTC, OVModelForSpeechSeq2Seq), + "automatic-speech-recognition": (OVModelForCTC, OVModelForSpeechSeq2Seq, OVParaformerForSpeechSeq2Seq), "feature-extraction": (OVModelForFeatureExtraction,), "fill-mask": (OVModelForMaskedLM,), "image-classification": (OVModelForImageClassification,), @@ -128,6 +129,12 @@ def get_openvino_model_class( config = AutoConfig.from_pretrained(model_id, **hub_kwargs) if any(arch.endswith("ForCTC") for arch in config.architectures): ov_model_class = OV_TASKS_MAPPING[task][0] + # Check for Paraformer models - detected by model_type or architecture + elif ( + getattr(config, "model_type", "").lower() == "paraformer" + or any("Paraformer" in arch for arch in getattr(config, "architectures", [])) + ): + ov_model_class = OV_TASKS_MAPPING[task][2] # OVParaformerForSpeechSeq2Seq else: ov_model_class = OV_TASKS_MAPPING[task][1] else: diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..ba67e4df4a 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -257,6 +257,9 @@ def _infer_library_from_model_name_or_path( ) if "open_clip_config.json" in all_files or "open_clip_pytorch_model.bin" in all_files: library_name = "open_clip" + elif "am.mvn" in all_files and "config.yaml" in all_files and "tokens.json" in all_files: + # Paraformer models have these characteristic files + library_name = "paraformer" else: 
library_name = TasksManager._infer_library_from_model_name_or_path( model_name_or_path=model_name_or_path, cache_dir=cache_dir diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 9519cea1ec..3f2bb9df0c 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -46,6 +46,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, + OVParaformerForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, @@ -79,6 +80,7 @@ class ExportModelTest(unittest.TestCase): "roberta": OVModelForTokenClassification, "wav2vec2": OVModelForAudioClassification, "whisper": OVModelForSpeechSeq2Seq, + "paraformer": OVParaformerForSpeechSeq2Seq, "blenderbot": OVModelForFeatureExtraction, "stable-diffusion": OVStableDiffusionPipeline, "stable-diffusion-xl": OVStableDiffusionXLPipeline, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 8e860ba743..89a3c949b2 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -54,6 +54,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, + OVParaformerForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, @@ -113,6 +114,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-to-video", "ltx-video"), ("feature-extraction", "sam"), ("text-to-audio", "speecht5"), + ("automatic-speech-recognition", "paraformer"), ("zero-shot-image-classification", "clip"), ] diff --git a/tests/openvino/test_paraformer.py b/tests/openvino/test_paraformer.py new file mode 100644 index 0000000000..97c00a600e --- /dev/null +++ b/tests/openvino/test_paraformer.py @@ -0,0 +1,315 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+from tempfile import TemporaryDirectory
+
+import numpy as np
+import torch
+from transformers import set_seed
+
+from optimum.intel import OVParaformerForSpeechSeq2Seq
+
+
+# Note: This test requires a Paraformer OpenVINO model to be available.
+# For CI/CD, this should point to a model on Hugging Face Hub once available.
+PARAFORMER_MODEL_PATH = os.environ.get(
+    "PARAFORMER_TEST_MODEL",
+    None  # Set to model path when available on HF Hub
+)
+
+OPENVINO_DEVICE = os.environ.get("OPENVINO_DEVICE", "CPU")
+SEED = 42
+
+
+class OVParaformerForSpeechSeq2SeqTest(unittest.TestCase):
+    """
+    Test suite for OVParaformerForSpeechSeq2Seq model.
+
+    This tests the OpenVINO inference implementation for Paraformer ASR models.
+ """ + + def _generate_random_speech_features(self, batch_size=1, num_frames=100, feature_dim=560): + """Generate random speech features for testing.""" + np.random.seed(SEED) + speech = np.random.randn(batch_size, num_frames, feature_dim).astype(np.float32) + speech_lengths = np.array([num_frames] * batch_size, dtype=np.int32) + return speech, speech_lengths + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_load_model_from_pretrained(self): + """Test loading model from pretrained path.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Check model properties + self.assertIsNotNone(model) + self.assertEqual(model._device, OPENVINO_DEVICE) + self.assertIsNotNone(model.input_names) + self.assertIsNotNone(model.output_names) + self.assertEqual(model.export_feature, "automatic-speech-recognition") + self.assertEqual(model.main_input_name, "speech") + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_model_inference(self): + """Test basic inference functionality.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Generate random input + speech, speech_lengths = self._generate_random_speech_features(batch_size=1, num_frames=100) + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run inference + output = model(speech_tensor, lengths_tensor) + + # Check output structure + self.assertIsNotNone(output.logits) + self.assertIsNotNone(output.token_num) + self.assertIsNotNone(output.token_ids) + + # Check shapes + batch_size, seq_len, vocab_size = output.logits.shape + self.assertEqual(batch_size, 1) + self.assertGreater(seq_len, 0) + self.assertGreater(vocab_size, 0) + + # Check token_ids shape matches + self.assertEqual(output.token_ids.shape[0], batch_size) + 
self.assertEqual(output.token_ids.shape[1], seq_len) + + # Check token_num is within bounds + self.assertGreater(output.token_num[0].item(), 0) + self.assertLessEqual(output.token_num[0].item(), seq_len) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_batch_inference(self): + """Test batch inference with variable lengths.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Generate batch with different lengths + batch_size = 3 + max_frames = 120 + speech = np.random.randn(batch_size, max_frames, 560).astype(np.float32) + speech_lengths = np.array([120, 100, 80], dtype=np.int32) + + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run batch inference + output = model(speech_tensor, lengths_tensor) + + # Check batch dimension + self.assertEqual(output.logits.shape[0], batch_size) + self.assertEqual(output.token_ids.shape[0], batch_size) + self.assertEqual(len(output.token_num), batch_size) + + # Check all sequences have tokens + for i in range(batch_size): + self.assertGreater(output.token_num[i].item(), 0) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_numpy_input(self): + """Test inference with numpy arrays as input.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Use numpy arrays directly + speech, speech_lengths = self._generate_random_speech_features() + + # Run inference with numpy input + output = model(speech, speech_lengths) + + # Should work the same as torch tensors + self.assertIsNotNone(output.logits) + self.assertIsNotNone(output.token_ids) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_generate_api(self): + """Test the generate() API.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + 
PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + speech, speech_lengths = self._generate_random_speech_features() + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Use generate() method + token_ids, token_num = model.generate(speech_tensor, lengths_tensor) + + # Check outputs + self.assertIsInstance(token_ids, torch.Tensor) + self.assertIsInstance(token_num, torch.Tensor) + self.assertEqual(token_ids.shape[0], 1) # batch size + self.assertGreater(token_num[0].item(), 0) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_device_switching(self): + """Test switching between CPU and GPU.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device="CPU" + ) + + self.assertEqual(model._device, "CPU") + + speech, speech_lengths = self._generate_random_speech_features() + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run on CPU + output_cpu = model(speech_tensor, lengths_tensor) + self.assertIsNotNone(output_cpu.logits) + + # Switch to GPU (if available) + try: + model.to("GPU") + self.assertEqual(model._device, "GPU") + + # Run on GPU + output_gpu = model(speech_tensor, lengths_tensor) + self.assertIsNotNone(output_gpu.logits) + + # Results should be similar (not exactly equal due to precision differences) + self.assertEqual(output_cpu.logits.shape, output_gpu.logits.shape) + except Exception as e: + # GPU might not be available in test environment + self.skipTest(f"GPU not available: {e}") + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_save_and_load(self): + """Test saving and loading model.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + with TemporaryDirectory() as tmp_dir: + # Save model + model.save_pretrained(tmp_dir) + + # Check files were created + 
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "openvino_model.xml"))) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "openvino_model.bin"))) + + # Load saved model + loaded_model = OVParaformerForSpeechSeq2Seq.from_pretrained( + tmp_dir, + device=OPENVINO_DEVICE + ) + + # Test loaded model works + speech, speech_lengths = self._generate_random_speech_features() + output = loaded_model(torch.from_numpy(speech), torch.from_numpy(speech_lengths)) + self.assertIsNotNone(output.logits) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_decode_without_token_num(self): + """Test decode method without token_num (should not mask).""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Create fake logits + batch_size, seq_len, vocab_size = 1, 10, 100 + fake_logits = torch.randn(batch_size, seq_len, vocab_size) + + # Decode without token_num + token_ids = model.decode(fake_logits, token_num=None) + + # Should return argmax of logits + expected = torch.argmax(fake_logits, dim=-1) + self.assertTrue(torch.equal(token_ids, expected)) + + # Decode with token_num (should mask padding) + token_num = torch.tensor([5]) + token_ids_masked = model.decode(fake_logits, token_num=token_num) + + # First 5 should be same, rest should be 0 + self.assertTrue(torch.equal(token_ids_masked[0, :5], expected[0, :5])) + self.assertTrue(torch.all(token_ids_masked[0, 5:] == 0)) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_model_properties(self): + """Test model properties and attributes.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Check component names + self.assertIn("model", model._component_names) + + # Check components dictionary + self.assertIsNotNone(model.components) + self.assertGreater(len(model.components), 0) + + # Check dtype + 
self.assertIsNotNone(model.dtype) + + # Check device property + self.assertEqual(model.device, torch.device("cpu")) + + def test_model_output_dataclass(self): + """Test ParaformerModelOutput dataclass.""" + from optimum.intel.openvino.modeling_speech2text import ParaformerModelOutput + + # Create output with all fields + logits = torch.randn(1, 10, 100) + token_num = torch.tensor([10]) + token_ids = torch.randint(0, 100, (1, 10)) + + output = ParaformerModelOutput( + logits=logits, + token_num=token_num, + token_ids=token_ids + ) + + # Check all fields are accessible + self.assertEqual(output.logits.shape, logits.shape) + self.assertEqual(output.token_num, token_num) + self.assertEqual(output.token_ids.shape, token_ids.shape) + + # Check optional field + output_no_ids = ParaformerModelOutput( + logits=logits, + token_num=token_num + ) + self.assertIsNone(output_no_ids.token_ids) + + +if __name__ == "__main__": + # For local testing with your models + if PARAFORMER_MODEL_PATH is None: + print("=" * 80) + print("WARNING: PARAFORMER_TEST_MODEL environment variable not set") + print("To run tests locally, set:") + print(" export PARAFORMER_TEST_MODEL=/path/to/paraformer-zh/ov_models") + print("=" * 80) + + unittest.main() diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index fe6d584d2f..e685a88c52 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -213,6 +213,7 @@ "wav2vec2-hf": "optimum-intel-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", + "paraformer": "funasr/paraformer-zh", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", @@ -244,6 +245,7 @@ "granitemoehybrid": {"model": 118}, "wav2vec2": {"model": 34}, "distilbert": 
{"model": 66}, + "paraformer": {"model": 268}, "t5": { "encoder": 64, "decoder": 104,